fix retrieve final technical questions v4

This commit is contained in:
team 1
2026-04-24 11:49:02 +02:00
parent 8a7cb89c5d
commit 66f09e83ca
4 changed files with 356 additions and 13 deletions

View File

@@ -54,6 +54,8 @@ final readonly class AgentRunner
$optimizedShopQuery = ''; $optimizedShopQuery = '';
$shopSearchQuery = ''; $shopSearchQuery = '';
$commerceIntent = CommerceIntentLite::NONE; $commerceIntent = CommerceIntentLite::NONE;
$knowledgeRetrievalPrompt = $prompt;
$usedFollowUpRetrievalContext = false;
$commerceHistoryContext = ''; $commerceHistoryContext = '';
$attemptedShopRepair = false; $attemptedShopRepair = false;
$usedShopRepair = false; $usedShopRepair = false;
@@ -77,14 +79,30 @@ final readonly class AgentRunner
$this->addSource($sources, $this->agentRunnerConfig->getExternalUrlSourceLabel()); $this->addSource($sources, $this->agentRunnerConfig->getExternalUrlSourceLabel());
} }
$commerceIntent = $this->detectCommerceIntent($prompt);
yield $this->systemMsg($this->agentRunnerConfig->getRetrieveKnowledgeMessage(), 'think'); yield $this->systemMsg($this->agentRunnerConfig->getRetrieveKnowledgeMessage(), 'think');
$knowledgeChunks = $this->retriever->retrieve($prompt); $knowledgeRetrievalPrompt = $this->buildKnowledgeRetrievalPrompt(
prompt: $prompt,
userId: $userId,
commerceIntent: $commerceIntent
);
$usedFollowUpRetrievalContext = $knowledgeRetrievalPrompt !== $prompt;
$knowledgeChunks = $this->retriever->retrieve($knowledgeRetrievalPrompt);
if ($knowledgeChunks !== []) { if ($knowledgeChunks !== []) {
$this->addSource($sources, $this->agentRunnerConfig->getRagKnowledgeSourceLabel()); $this->addSource($sources, $this->agentRunnerConfig->getRagKnowledgeSourceLabel());
} }
$commerceIntent = $this->detectCommerceIntent($prompt); if ($usedFollowUpRetrievalContext) {
$this->agentLogger->info('Knowledge retrieval used follow-up context', [
'userId' => $userId,
'prompt' => $prompt,
'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt,
'commerceIntent' => $commerceIntent,
]);
}
if ($this->isCommerceIntent($commerceIntent)) { if ($this->isCommerceIntent($commerceIntent)) {
yield $this->systemMsg($this->agentRunnerConfig->getOptimizeSearchMessage(), 'think'); yield $this->systemMsg($this->agentRunnerConfig->getOptimizeSearchMessage(), 'think');
@@ -171,6 +189,8 @@ final readonly class AgentRunner
'finalPrompt' => $finalPrompt, 'finalPrompt' => $finalPrompt,
'optimizedShopQuery' => $optimizedShopQuery, 'optimizedShopQuery' => $optimizedShopQuery,
'shopSearchQuery' => $shopSearchQuery, 'shopSearchQuery' => $shopSearchQuery,
'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt,
'usedFollowUpRetrievalContext' => $usedFollowUpRetrievalContext,
'primaryShopResultsCount' => count($primaryShopResults), 'primaryShopResultsCount' => count($primaryShopResults),
'shopResultsCount' => count($shopResults), 'shopResultsCount' => count($shopResults),
'attemptedShopRepair' => $attemptedShopRepair, 'attemptedShopRepair' => $attemptedShopRepair,
@@ -228,6 +248,8 @@ final readonly class AgentRunner
'usedShopRepair' => $usedShopRepair, 'usedShopRepair' => $usedShopRepair,
'shopRepairQueries' => $shopRepairQueries, 'shopRepairQueries' => $shopRepairQueries,
'knowledgeChunkCount' => count($knowledgeChunks), 'knowledgeChunkCount' => count($knowledgeChunks),
'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt,
'usedFollowUpRetrievalContext' => $usedFollowUpRetrievalContext,
'hasUrlContent' => $urlContent !== '', 'hasUrlContent' => $urlContent !== '',
'usedOptimizedShopQuery' => $optimizedShopQuery !== '', 'usedOptimizedShopQuery' => $optimizedShopQuery !== '',
'optimizedShopQuery' => $optimizedShopQuery, 'optimizedShopQuery' => $optimizedShopQuery,
@@ -258,6 +280,159 @@ final readonly class AgentRunner
|| $commerceIntent === CommerceIntentLite::ADVISORY_PRODUCT_SEARCH; || $commerceIntent === CommerceIntentLite::ADVISORY_PRODUCT_SEARCH;
} }
/**
 * Builds the text that is handed to the knowledge retriever.
 *
 * When the prompt qualifies as a follow-up (see
 * shouldUseFollowUpContextForKnowledgeRetrieval()), up to two of the user's
 * most recent questions are read from the conversation context and placed
 * in front of the prompt, one labelled line each. In every other case the
 * prompt is returned unchanged.
 *
 * @param string $prompt         Prompt of the current turn.
 * @param string $userId         Identifier used to load the conversation context.
 * @param string $commerceIntent Intent value forwarded to the follow-up check.
 *
 * @return string Either $prompt itself or a newline-separated block ending with the current prompt.
 */
private function buildKnowledgeRetrievalPrompt(
    string $prompt,
    string $userId,
    string $commerceIntent
): string {
    if ($this->shouldUseFollowUpContextForKnowledgeRetrieval($prompt, $commerceIntent) === false) {
        return $prompt;
    }

    // Load the context within a budget of 3000 and keep at most the last two questions.
    $recentQuestions = $this->extractRecentUserQuestions(
        $this->contextService->buildUserContextWithinBudget($userId, 3000),
        2
    );
    if ($recentQuestions === []) {
        return $prompt;
    }

    $segments = array_map(
        static fn(string $earlierQuestion): string => 'Vorherige Nutzerfrage: ' . $earlierQuestion,
        $recentQuestions
    );
    $segments[] = 'Aktuelle Folgefrage: ' . $prompt;

    return implode("\n", $segments);
}
/**
 * Decides whether knowledge retrieval should be enriched with previous user questions.
 *
 * Follow-up context is used only when all of the following hold:
 *  - the given intent is not a commerce intent,
 *  - the prompt is non-empty after normalisation,
 *  - the normalised prompt carries no explicit commercial keyword,
 *  - the normalised prompt contains a strong back-reference
 *    (see containsStrongFollowUpReference()).
 *
 * @param string $prompt         Raw prompt of the current turn.
 * @param string $commerceIntent Intent value as accepted by isCommerceIntent().
 *
 * @return bool True when previous questions should be prepended for retrieval.
 */
private function shouldUseFollowUpContextForKnowledgeRetrieval(string $prompt, string $commerceIntent): bool
{
    if ($this->isCommerceIntent($commerceIntent)) {
        return false;
    }

    $normalized = $this->normalizeFollowUpText($prompt);
    if ($normalized === '' || $this->containsExplicitCommercialFollowUpSignal($normalized)) {
        return false;
    }

    // A strong reference is mandatory for prompts of any length, so a
    // separate length gate cannot change the outcome; checking once also
    // avoids scanning the pattern list twice for long prompts.
    return $this->containsStrongFollowUpReference($normalized);
}
/**
 * Checks whether the normalised prompt contains wording that refers back to
 * something mentioned earlier (a value, a pronominal adverb such as "damit",
 * or a "which indicator / range / limit" question).
 *
 * @param string $normalized Prompt text, expected to be lower-cased already
 *                           because the patterns are lower-case and case-sensitive.
 *
 * @return bool True as soon as one pattern matches.
 */
private function containsStrongFollowUpReference(string $normalized): bool
{
    // "der/dieser/diesen/dem Wert"
    $valueReferences = [
        '/\bder\s+wert\b/u',
        '/\bdieser\s+wert\b/u',
        '/\bdiesen\s+wert\b/u',
        '/\bdem\s+wert\b/u',
    ];

    // "mit welchem ...", "womit", "damit", "dafür", "dazu", "daraus"
    $backPointers = [
        '/\bmit\s+welche(?:m|n|r)?\b/u',
        '/\bwomit\b/u',
        '/\bdamit\b/u',
        '/\bdafuer\b/u',
        '/\bdafür\b/u',
        '/\bdazu\b/u',
        '/\bdaraus\b/u',
    ];

    // Questions about an indicator, range, or limit value
    $detailQuestions = [
        '/\bwelche(?:r|s|m|n)?\s+indikator\b/u',
        '/\bwelche(?:r|s|m|n)?\s+indikatortyp\b/u',
        '/\bindikator\s+(?:dafuer|dafür|dazu|hierfuer|hierfür)\b/u',
        '/\bwelche(?:r|s|m|n)?\s+bereich\b/u',
        '/\bwelche(?:r|s|m|n)?\s+messbereich\b/u',
        '/\bwelche(?:r|s|m|n)?\s+grenzwert\b/u',
    ];

    foreach ([...$valueReferences, ...$backPointers, ...$detailQuestions] as $regex) {
        if (preg_match($regex, $normalized) !== 1) {
            continue;
        }

        return true;
    }

    return false;
}
/**
 * Tells whether the normalised prompt contains an explicit shop/commerce keyword.
 *
 * Signals are compared against whole words instead of raw substrings, so
 * technical wording such as "blinkt" (contains "link"), "diskussion"
 * (contains "sku") or "einlagern" (contains "lager") is not mistaken for a
 * commercial request.
 *
 * @param string $normalized Prompt text, expected to be lower-cased already
 *                           because the signals are lower-case and compared
 *                           case-sensitively.
 *
 * @return bool True when at least one word equals a commercial signal.
 */
private function containsExplicitCommercialFollowUpSignal(string $normalized): bool
{
    $commercialSignals = [
        'shop', 'preis', 'preise', 'kostet', 'kosten', 'kaufen', 'bestellen',
        'warenkorb', 'lieferzeit', 'verfuegbar', 'verfügbar', 'lager', 'url',
        'link', 'artikelnummer', 'sku', 'produktnummer',
    ];

    // Split on every run of characters that is neither a letter nor a digit,
    // so leftover punctuation cannot glue itself to a keyword.
    $words = preg_split('/[^\p{L}\p{N}]+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        // preg_split() failed (e.g. invalid UTF-8): degrade to plain space splitting.
        $words = explode(' ', $normalized);
    }

    return array_intersect($commercialSignals, $words) !== [];
}
/**
 * Extracts the most recent user questions from a textual conversation history.
 *
 * Every line that starts with "Question:" (case-insensitive) counts as one
 * user question. Each hit is cleaned through sanitizeHistoryQuestion();
 * questions that end up empty are discarded.
 *
 * @param string $history Conversation history text containing "Question: ..." lines.
 * @param int    $limit   Maximum number of questions to return; values <= 0 yield an empty list.
 *
 * @return string[] Up to $limit questions from the end of the history, in their original order.
 */
private function extractRecentUserQuestions(string $history, int $limit): array
{
    $history = trim($history);
    if ($history === '' || $limit <= 0) {
        return [];
    }

    // preg_match_all() returns the NUMBER of matches (or false on error).
    // Only "error" and "no match" may bail out here; comparing against 1
    // would discard every history that holds more than one question.
    $matchCount = preg_match_all('/^Question:\s*(.+)$/mi', $history, $matches);
    if ($matchCount === false || $matchCount === 0) {
        return [];
    }

    $questions = array_values(array_filter(
        array_map(
            fn(string $question): string => $this->sanitizeHistoryQuestion($question),
            $matches[1] ?? []
        ),
        static fn(string $question): bool => $question !== ''
    ));

    // A negative offset keeps the last $limit entries; an empty list stays empty.
    return array_slice($questions, -$limit);
}
/**
 * Cleans one question taken from the conversation history.
 *
 * Runs of whitespace are collapsed to a single space and the result is
 * trimmed. Text longer than 500 characters is cut to its first 497
 * characters (trailing whitespace removed) followed by "...".
 *
 * @param string $question Raw question text.
 *
 * @return string Cleaned question, or an empty string when nothing remains.
 */
private function sanitizeHistoryQuestion(string $question): string
{
    $collapsed = preg_replace('/\s+/u', ' ', $question);
    $cleaned = trim((string) $collapsed);

    // Empty and short-enough questions pass through untouched.
    if ($cleaned === '' || mb_strlen($cleaned, 'UTF-8') <= 500) {
        return $cleaned;
    }

    $head = mb_substr($cleaned, 0, 497, 'UTF-8');

    return rtrim($head) . '...';
}
/**
 * Normalises prompt text for the follow-up heuristics.
 *
 * The text is trimmed and lower-cased, "-", "/" and "_" become spaces, every
 * other run of characters that is neither letter, digit nor whitespace
 * becomes a space, and whitespace runs are collapsed. If a regex step
 * fails, the text from the previous step is kept.
 *
 * @param string $value Raw text.
 *
 * @return string Normalised text without leading or trailing whitespace.
 */
private function normalizeFollowUpText(string $value): string
{
    $lowered = mb_strtolower(trim($value), 'UTF-8');

    // Turn the common word joiners into separators first.
    $separated = strtr($lowered, '-/_', '   ');

    $lettersOnly = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $separated) ?? $separated;
    $collapsed = preg_replace('/\s+/u', ' ', $lettersOnly) ?? $lettersOnly;

    return trim($collapsed);
}
private function buildOptimizedShopQuery( private function buildOptimizedShopQuery(
string $prompt, string $prompt,
string $userId, string $userId,

View File

@@ -13,12 +13,12 @@ final class AgentRunnerConfig
public function getProductSearchKnowledgeChunkLimit(): int public function getProductSearchKnowledgeChunkLimit(): int
{ {
return 2; return 6;
} }
public function getAdvisoryProductSearchKnowledgeChunkLimit(): int public function getAdvisoryProductSearchKnowledgeChunkLimit(): int
{ {
return 3; return 9;
} }
public function getOptimizedShopQueryPrefixPattern(): string public function getOptimizedShopQueryPrefixPattern(): string

View File

@@ -88,8 +88,10 @@ final class PromptBuilderConfig
{ {
return [ return [
'The following messages are previous turns of this conversation.', 'The following messages are previous turns of this conversation.',
'Use them to resolve references, follow-up questions, and user intent.', 'Use them only to resolve references, follow-up questions, and user intent.',
'They must not override retrieved factual knowledge or live shop data.', 'Previous assistant answers are not a factual source for technical values, product compatibility, indicators, ranges, prices, or availability.',
'All factual claims must come from retrieved factual knowledge, user-provided URL content, or live shop data.',
'Conversation context must not override retrieved factual knowledge or live shop data.',
]; ];
} }
@@ -253,6 +255,7 @@ final class PromptBuilderConfig
'- Clearly separate explicit facts from inferences.', '- Clearly separate explicit facts from inferences.',
'- If a conclusion goes beyond the source wording, label it exactly as \'Inference:\'.', '- If a conclusion goes beyond the source wording, label it exactly as \'Inference:\'.',
'- If a sentence cannot be traced to the provided sources, do not write it.', '- If a sentence cannot be traced to the provided sources, do not write it.',
'- For follow-up questions, use the conversation only to resolve what the user refers to; do not copy technical facts from previous assistant answers unless the same fact is present in the current retrieved sources.',
'- Never mention external manufacturers, external brands, or external products unless they are explicitly present in the provided sources.', '- Never mention external manufacturers, external brands, or external products unless they are explicitly present in the provided sources.',
'- If the sources do not identify a suitable product, do not invent one.', '- If the sources do not identify a suitable product, do not invent one.',
]; ];

View File

@@ -211,7 +211,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
if ($exactDocumentMatch !== null) { if ($exactDocumentMatch !== null) {
$selectedChunkIds = $this->selectExactDocumentChunkIds( $selectedChunkIds = $this->selectExactDocumentChunkIds(
$exactDocumentMatch['rows'], $exactDocumentMatch['rows'],
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)) max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)),
$prompt
); );
if ($selectedChunkIds !== []) { if ($selectedChunkIds !== []) {
@@ -631,15 +632,127 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
} }
/** /**
* Selects a coherent chunk window from one exact document title match. * Selects a coherent chunk window from one exact document-title match.
* *
* For exact product questions we prefer a pure document slice over * A pure first-N slice is too weak for follow-up questions: the title may
* cross-document fusion to avoid mixing neighbouring product families. * identify the right document, while the current follow-up asks for a
* specific detail from a later chunk (for example an indicator, range,
* threshold, interface, relay, or error code).
*
* Therefore this method stays inside the matched document, but ranks its
* chunks by overlap with the effective retrieval query before sorting the
* final selection back into document order for prompt readability.
* *
* @param array<string,array<string,mixed>> $rows * @param array<string,array<string,mixed>> $rows
* @return string[] * @return string[]
*/ */
private function selectExactDocumentChunkIds(array $rows, int $limit): array private function selectExactDocumentChunkIds(array $rows, int $limit, string $prompt): array
{
$orderedRows = $this->sortRowsByChunkIndex($rows);
// Never return more than the caller's limit or the exact-document ceiling, whichever is smaller.
$max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
if ($orderedRows === [] || $max <= 0) {
return [];
}
$queryTokens = $this->buildExactDocumentSelectionTokens($prompt);
// Without usable query tokens there is nothing to rank by: take the leading chunks.
if ($queryTokens === []) {
return $this->firstChunkIdsFromRows($orderedRows, $max);
}
$scored = [];
foreach ($orderedRows as $order => $row) {
$chunkId = $row['chunk_id'] ?? null;
$text = trim((string)($row['text'] ?? ''));
// Rows without a non-empty string chunk id or without text cannot be selected.
if (!is_string($chunkId) || $chunkId === '' || $text === '') {
continue;
}
$haystack = $this->normalizeText($this->extractDocumentTitle($row) . ' ' . $text);
// Tokens of title + text become array keys so each query token is checked with isset().
$haystackTokens = array_fill_keys($this->tokenizeText($haystack), true);
$score = 0.0;
foreach ($queryTokens as $token) {
if (!isset($haystackTokens[$token])) {
continue;
}
// A matching token that contains a digit adds the highest weight.
if (preg_match('/\d/u', $token) === 1) {
$score += 6.0;
continue;
}
// A matching token from the detail vocabulary (isExactDetailToken) adds 5.0.
if ($this->isExactDetailToken($token)) {
$score += 5.0;
continue;
}
// Any other matching token adds 2.0.
$score += 2.0;
}
// Keep early chunks slightly competitive for overview facts,
// without letting them hide strongly matching detail chunks.
// The bonus is 1.0 at position 0 and shrinks by 0.05 per position, reaching 0.0 at position 20.
$score += max(0.0, 1.0 - ($order * 0.05));
$scored[] = [
'id' => $chunkId,
'score' => $score,
'order' => $order,
'chunk_index' => is_int($row['chunk_index'] ?? null) ? (int)$row['chunk_index'] : null,
];
}
if ($scored === []) {
return [];
}
// Rank by score, highest first; equal scores keep their position order.
usort($scored, static function (array $a, array $b): int {
if ($a['score'] !== $b['score']) {
return $b['score'] <=> $a['score'];
}
return $a['order'] <=> $b['order'];
});
$selected = array_slice($scored, 0, $max);
// Re-sort the winners by chunk_index ascending; entries without an integer
// chunk_index go last, and ties fall back to position order.
usort($selected, static function (array $a, array $b): int {
$aIndex = $a['chunk_index'];
$bIndex = $b['chunk_index'];
if ($aIndex === null && $bIndex === null) {
return $a['order'] <=> $b['order'];
}
if ($aIndex === null) {
return 1;
}
if ($bIndex === null) {
return -1;
}
if ($aIndex !== $bIndex) {
return $aIndex <=> $bIndex;
}
return $a['order'] <=> $b['order'];
});
// Only the chunk ids are returned, in the final order.
return array_map(
static fn(array $row): string => (string)$row['id'],
$selected
);
}
/**
* @param array<string,array<string,mixed>> $rows
* @return array<int,array<string,mixed>>
*/
private function sortRowsByChunkIndex(array $rows): array
{ {
uasort($rows, static function (array $a, array $b): int { uasort($rows, static function (array $a, array $b): int {
$aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX; $aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX;
@@ -652,8 +765,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? '')); return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? ''));
}); });
return array_values($rows);
}
/**
* @param array<int,array<string,mixed>> $rows
* @return string[]
*/
private function firstChunkIdsFromRows(array $rows, int $limit): array
{
$selected = []; $selected = [];
$max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
foreach ($rows as $row) { foreach ($rows as $row) {
$chunkId = $row['chunk_id'] ?? null; $chunkId = $row['chunk_id'] ?? null;
@@ -665,7 +786,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$selected[] = $chunkId; $selected[] = $chunkId;
if (count($selected) >= $max) { if (count($selected) >= $limit) {
break; break;
} }
} }
@@ -673,6 +794,50 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return $selected; return $selected;
} }
/**
 * Derives the distinct query tokens used to rank chunks of an exactly
 * matched document.
 *
 * The prompt is normalised and tokenised; tokens classified as generic by
 * isGenericExactSelectionToken() are dropped. Of the rest, a token is kept
 * when it contains a digit or is at least three characters long.
 *
 * @return string[] Unique tokens in order of first appearance.
 */
private function buildExactDocumentSelectionTokens(string $prompt): array
{
    $candidates = $this->tokenizeText($this->normalizeText($prompt));

    $relevant = array_filter(
        $candidates,
        function (string $candidate): bool {
            if ($this->isGenericExactSelectionToken($candidate)) {
                return false;
            }

            return preg_match('/\d/u', $candidate) === 1
                || mb_strlen($candidate, 'UTF-8') >= 3;
        }
    );

    return array_values(array_unique($relevant));
}
/**
 * Tells whether a token belongs to the fixed detail vocabulary (indicator,
 * reagent, limit value, measuring range, hardness, resolution, interface,
 * relay, error code, value).
 *
 * The comparison is exact and case-sensitive.
 */
private function isExactDetailToken(string $token): bool
{
    $detailVocabulary = [
        'indikator' => true,
        'indikatortyp' => true,
        'reagenz' => true,
        'reagens' => true,
        'grenzwert' => true,
        'messbereich' => true,
        'bereich' => true,
        'wasserhaerte' => true,
        'wasserhärte' => true,
        'resthaerte' => true,
        'resthärte' => true,
        'haerte' => true,
        'härte' => true,
        'aufloesung' => true,
        'auflösung' => true,
        'schnittstelle' => true,
        'relais' => true,
        'fehlercode' => true,
        'code' => true,
        'wert' => true,
        'werte' => true,
    ];

    return isset($detailVocabulary[$token]);
}
/**
 * Tells whether a token is too generic to help rank chunks: the labels that
 * wrap previous and current questions, question words, auxiliary verbs,
 * articles, prepositions and conjunctions.
 *
 * The comparison is exact and case-sensitive.
 */
private function isGenericExactSelectionToken(string $token): bool
{
    // Words from the question labels and conversation vocabulary
    $labelWords = [
        'vorherige', 'vorheriger', 'nutzerfrage', 'aktuelle',
        'folgefrage', 'frage', 'antwort',
    ];

    // Question words and auxiliary verbs
    $questionWords = [
        'welche', 'welcher', 'welches',
        'welchem', 'welchen', 'wird', 'werden', 'wurde', 'kann', 'koennen',
        'können',
    ];

    // Articles, prepositions, conjunctions and similar filler
    $fillerWords = [
        'mit', 'der', 'die', 'das', 'den', 'dem', 'ein', 'eine',
        'einer', 'eines', 'ist', 'sind', 'was', 'wie', 'wo', 'zum', 'zur',
        'fuer', 'für', 'durch', 'von', 'vom', 'und', 'oder', 'auch',
    ];

    $genericLookup = array_flip([...$labelWords, ...$questionWords, ...$fillerWords]);

    return array_key_exists($token, $genericLookup);
}
/** /**
* Builds synthetic scores for exact-title fast-path selections. * Builds synthetic scores for exact-title fast-path selections.
* *