diff --git a/src/Agent/AgentRunner.php b/src/Agent/AgentRunner.php index 7207239..29be76a 100644 --- a/src/Agent/AgentRunner.php +++ b/src/Agent/AgentRunner.php @@ -54,6 +54,8 @@ final readonly class AgentRunner $optimizedShopQuery = ''; $shopSearchQuery = ''; $commerceIntent = CommerceIntentLite::NONE; + $knowledgeRetrievalPrompt = $prompt; + $usedFollowUpRetrievalContext = false; $commerceHistoryContext = ''; $attemptedShopRepair = false; $usedShopRepair = false; @@ -77,14 +79,30 @@ final readonly class AgentRunner $this->addSource($sources, $this->agentRunnerConfig->getExternalUrlSourceLabel()); } + $commerceIntent = $this->detectCommerceIntent($prompt); + yield $this->systemMsg($this->agentRunnerConfig->getRetrieveKnowledgeMessage(), 'think'); - $knowledgeChunks = $this->retriever->retrieve($prompt); + $knowledgeRetrievalPrompt = $this->buildKnowledgeRetrievalPrompt( + prompt: $prompt, + userId: $userId, + commerceIntent: $commerceIntent + ); + $usedFollowUpRetrievalContext = $knowledgeRetrievalPrompt !== $prompt; + + $knowledgeChunks = $this->retriever->retrieve($knowledgeRetrievalPrompt); if ($knowledgeChunks !== []) { $this->addSource($sources, $this->agentRunnerConfig->getRagKnowledgeSourceLabel()); } - $commerceIntent = $this->detectCommerceIntent($prompt); + if ($usedFollowUpRetrievalContext) { + $this->agentLogger->info('Knowledge retrieval used follow-up context', [ + 'userId' => $userId, + 'prompt' => $prompt, + 'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt, + 'commerceIntent' => $commerceIntent, + ]); + } if ($this->isCommerceIntent($commerceIntent)) { yield $this->systemMsg($this->agentRunnerConfig->getOptimizeSearchMessage(), 'think'); @@ -171,6 +189,8 @@ final readonly class AgentRunner 'finalPrompt' => $finalPrompt, 'optimizedShopQuery' => $optimizedShopQuery, 'shopSearchQuery' => $shopSearchQuery, + 'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt, + 'usedFollowUpRetrievalContext' => $usedFollowUpRetrievalContext, 
'primaryShopResultsCount' => count($primaryShopResults), 'shopResultsCount' => count($shopResults), 'attemptedShopRepair' => $attemptedShopRepair, @@ -228,6 +248,8 @@ final readonly class AgentRunner 'usedShopRepair' => $usedShopRepair, 'shopRepairQueries' => $shopRepairQueries, 'knowledgeChunkCount' => count($knowledgeChunks), + 'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt, + 'usedFollowUpRetrievalContext' => $usedFollowUpRetrievalContext, 'hasUrlContent' => $urlContent !== '', 'usedOptimizedShopQuery' => $optimizedShopQuery !== '', 'optimizedShopQuery' => $optimizedShopQuery, @@ -258,6 +280,159 @@ final readonly class AgentRunner || $commerceIntent === CommerceIntentLite::ADVISORY_PRODUCT_SEARCH; } + private function buildKnowledgeRetrievalPrompt( + string $prompt, + string $userId, + string $commerceIntent + ): string { + if (!$this->shouldUseFollowUpContextForKnowledgeRetrieval($prompt, $commerceIntent)) { + return $prompt; + } + + $history = $this->contextService->buildUserContextWithinBudget($userId, 3000); + $previousQuestions = $this->extractRecentUserQuestions($history, 2); + + if ($previousQuestions === []) { + return $prompt; + } + + $lines = []; + + foreach ($previousQuestions as $question) { + $lines[] = 'Vorherige Nutzerfrage: ' . $question; + } + + $lines[] = 'Aktuelle Folgefrage: ' . 
$prompt; + + return implode("\n", $lines); + } + + private function shouldUseFollowUpContextForKnowledgeRetrieval(string $prompt, string $commerceIntent): bool + { + if ($this->isCommerceIntent($commerceIntent)) { + return false; + } + + $normalized = $this->normalizeFollowUpText($prompt); + + if ($normalized === '') { + return false; + } + + if ($this->containsExplicitCommercialFollowUpSignal($normalized)) { + return false; + } + + if (mb_strlen($normalized, 'UTF-8') > 180 && !$this->containsStrongFollowUpReference($normalized)) { + return false; + } + + return $this->containsStrongFollowUpReference($normalized); + } + + private function containsStrongFollowUpReference(string $normalized): bool + { + $patterns = [ + '/\bder\s+wert\b/u', + '/\bdieser\s+wert\b/u', + '/\bdiesen\s+wert\b/u', + '/\bdem\s+wert\b/u', + '/\bmit\s+welche(?:m|n|r)?\b/u', + '/\bwomit\b/u', + '/\bdamit\b/u', + '/\bdafuer\b/u', + '/\bdafür\b/u', + '/\bdazu\b/u', + '/\bdaraus\b/u', + '/\bwelche(?:r|s|m|n)?\s+indikator\b/u', + '/\bwelche(?:r|s|m|n)?\s+indikatortyp\b/u', + '/\bindikator\s+(?:dafuer|dafür|dazu|hierfuer|hierfür)\b/u', + '/\bwelche(?:r|s|m|n)?\s+bereich\b/u', + '/\bwelche(?:r|s|m|n)?\s+messbereich\b/u', + '/\bwelche(?:r|s|m|n)?\s+grenzwert\b/u', + ]; + + foreach ($patterns as $pattern) { + if (preg_match($pattern, $normalized) === 1) { + return true; + } + } + + return false; + } + + private function containsExplicitCommercialFollowUpSignal(string $normalized): bool + { + $commercialSignals = [ + 'shop', 'preis', 'preise', 'kostet', 'kosten', 'kaufen', 'bestellen', + 'warenkorb', 'lieferzeit', 'verfuegbar', 'verfügbar', 'lager', 'url', + 'link', 'artikelnummer', 'sku', 'produktnummer', + ]; + + foreach ($commercialSignals as $signal) { + if (str_contains($normalized, $signal)) { + return true; + } + } + + return false; + } + + /** + * @return string[] + */ + private function extractRecentUserQuestions(string $history, int $limit): array + { + $history = trim($history); + + if 
($history === '' || $limit <= 0) { + return []; + } + + if ((int) preg_match_all('/^Question:\s*(.+)$/mi', $history, $matches) === 0) { /* preg_match_all() returns the match count (false on error); two or more questions are valid, only zero matches means "no history" */ + return []; + } + + $questions = array_values(array_filter( + array_map( + fn(string $question): string => $this->sanitizeHistoryQuestion($question), + $matches[1] ?? [] + ), + static fn(string $question): bool => $question !== '' + )); + + if ($questions === []) { + return []; + } + + return array_slice($questions, -$limit); + } + + private function sanitizeHistoryQuestion(string $question): string + { + $question = trim((string) preg_replace('/\s+/u', ' ', $question)); + + if ($question === '') { + return ''; + } + + if (mb_strlen($question, 'UTF-8') <= 500) { + return $question; + } + + return rtrim(mb_substr($question, 0, 497, 'UTF-8')) . '...'; + } + + private function normalizeFollowUpText(string $value): string + { + $value = mb_strtolower(trim($value), 'UTF-8'); + $value = str_replace(['-', '/', '_'], ' ', $value); + $value = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $value) ?? $value; + $value = preg_replace('/\s+/u', ' ', $value) ?? 
$value; + + return trim($value); + } + private function buildOptimizedShopQuery( string $prompt, string $userId, diff --git a/src/Config/AgentRunnerConfig.php b/src/Config/AgentRunnerConfig.php index ea9ec0e..0884d8e 100644 --- a/src/Config/AgentRunnerConfig.php +++ b/src/Config/AgentRunnerConfig.php @@ -13,12 +13,12 @@ final class AgentRunnerConfig public function getProductSearchKnowledgeChunkLimit(): int { - return 2; + return 6; } public function getAdvisoryProductSearchKnowledgeChunkLimit(): int { - return 3; + return 9; } public function getOptimizedShopQueryPrefixPattern(): string diff --git a/src/Config/PromptBuilderConfig.php b/src/Config/PromptBuilderConfig.php index baca8cb..a94fb66 100644 --- a/src/Config/PromptBuilderConfig.php +++ b/src/Config/PromptBuilderConfig.php @@ -88,8 +88,10 @@ final class PromptBuilderConfig { return [ 'The following messages are previous turns of this conversation.', - 'Use them to resolve references, follow-up questions, and user intent.', - 'They must not override retrieved factual knowledge or live shop data.', + 'Use them only to resolve references, follow-up questions, and user intent.', + 'Previous assistant answers are not a factual source for technical values, product compatibility, indicators, ranges, prices, or availability.', + 'All factual claims must come from retrieved factual knowledge, user-provided URL content, or live shop data.', + 'Conversation context must not override retrieved factual knowledge or live shop data.', ]; } @@ -253,6 +255,7 @@ final class PromptBuilderConfig '- Clearly separate explicit facts from inferences.', '- If a conclusion goes beyond the source wording, label it exactly as \'Inference:\'.', '- If a sentence cannot be traced to the provided sources, do not write it.', + '- For follow-up questions, use the conversation only to resolve what the user refers to; do not copy technical facts from previous assistant answers unless the same fact is present in the current retrieved 
sources.', '- Never mention external manufacturers, external brands, or external products unless they are explicitly present in the provided sources.', '- If the sources do not identify a suitable product, do not invent one.', ]; diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 4b0e4a6..35e3530 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -211,7 +211,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface if ($exactDocumentMatch !== null) { $selectedChunkIds = $this->selectExactDocumentChunkIds( $exactDocumentMatch['rows'], - max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)) + max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)), + $prompt ); if ($selectedChunkIds !== []) { @@ -631,15 +632,127 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } /** - * Selects a coherent chunk window from one exact document title match. + * Selects a coherent chunk window from one exact document-title match. * - * For exact product questions we prefer a pure document slice over - * cross-document fusion to avoid mixing neighbouring product families. + * A pure first-N slice is too weak for follow-up questions: the title may + * identify the right document, while the current follow-up asks for a + * specific detail from a later chunk (for example an indicator, range, + * threshold, interface, relay, or error code). + * + * Therefore this method stays inside the matched document, but ranks its + * chunks by overlap with the effective retrieval query before sorting the + * final selection back into document order for prompt readability. 
* * @param array> $rows * @return string[] */ - private function selectExactDocumentChunkIds(array $rows, int $limit): array + private function selectExactDocumentChunkIds(array $rows, int $limit, string $prompt): array + { + $orderedRows = $this->sortRowsByChunkIndex($rows); + $max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS); + + if ($orderedRows === [] || $max <= 0) { + return []; + } + + $queryTokens = $this->buildExactDocumentSelectionTokens($prompt); + + if ($queryTokens === []) { + return $this->firstChunkIdsFromRows($orderedRows, $max); + } + + $scored = []; + + foreach ($orderedRows as $order => $row) { + $chunkId = $row['chunk_id'] ?? null; + $text = trim((string)($row['text'] ?? '')); + + if (!is_string($chunkId) || $chunkId === '' || $text === '') { + continue; + } + + $haystack = $this->normalizeText($this->extractDocumentTitle($row) . ' ' . $text); + $haystackTokens = array_fill_keys($this->tokenizeText($haystack), true); + $score = 0.0; + + foreach ($queryTokens as $token) { + if (!isset($haystackTokens[$token])) { + continue; + } + + if (preg_match('/\d/u', $token) === 1) { + $score += 6.0; + continue; + } + + if ($this->isExactDetailToken($token)) { + $score += 5.0; + continue; + } + + $score += 2.0; + } + + // Keep early chunks slightly competitive for overview facts, + // without letting them hide strongly matching detail chunks. + $score += max(0.0, 1.0 - ($order * 0.05)); + + $scored[] = [ + 'id' => $chunkId, + 'score' => $score, + 'order' => $order, + 'chunk_index' => is_int($row['chunk_index'] ?? null) ? 
(int)$row['chunk_index'] : null, + ]; + } + + if ($scored === []) { + return []; + } + + usort($scored, static function (array $a, array $b): int { + if ($a['score'] !== $b['score']) { + return $b['score'] <=> $a['score']; + } + + return $a['order'] <=> $b['order']; + }); + + $selected = array_slice($scored, 0, $max); + + usort($selected, static function (array $a, array $b): int { + $aIndex = $a['chunk_index']; + $bIndex = $b['chunk_index']; + + if ($aIndex === null && $bIndex === null) { + return $a['order'] <=> $b['order']; + } + + if ($aIndex === null) { + return 1; + } + + if ($bIndex === null) { + return -1; + } + + if ($aIndex !== $bIndex) { + return $aIndex <=> $bIndex; + } + + return $a['order'] <=> $b['order']; + }); + + return array_map( + static fn(array $row): string => (string)$row['id'], + $selected + ); + } + + /** + * @param array> $rows + * @return array> + */ + private function sortRowsByChunkIndex(array $rows): array { uasort($rows, static function (array $a, array $b): int { $aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX; @@ -652,8 +765,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? '')); }); + return array_values($rows); + } + + /** + * @param array> $rows + * @return string[] + */ + private function firstChunkIdsFromRows(array $rows, int $limit): array + { $selected = []; - $max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS); foreach ($rows as $row) { $chunkId = $row['chunk_id'] ?? 
null; @@ -665,7 +786,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $selected[] = $chunkId; - if (count($selected) >= $max) { + if (count($selected) >= $limit) { break; } } @@ -673,6 +794,50 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $selected; } + /** + * @return string[] + */ + private function buildExactDocumentSelectionTokens(string $prompt): array + { + $tokens = $this->tokenizeText($this->normalizeText($prompt)); + $out = []; + + foreach ($tokens as $token) { + if ($this->isGenericExactSelectionToken($token)) { + continue; + } + + if (preg_match('/\d/u', $token) === 1 || mb_strlen($token, 'UTF-8') >= 3) { + $out[] = $token; + } + } + + return array_values(array_unique($out)); + } + + private function isExactDetailToken(string $token): bool + { + return in_array($token, [ + 'indikator', 'indikatortyp', 'reagenz', 'reagens', 'grenzwert', + 'messbereich', 'bereich', 'wasserhaerte', 'wasserhärte', + 'resthaerte', 'resthärte', 'haerte', 'härte', 'aufloesung', + 'auflösung', 'schnittstelle', 'relais', 'fehlercode', 'code', + 'wert', 'werte', + ], true); + } + + private function isGenericExactSelectionToken(string $token): bool + { + return in_array($token, [ + 'vorherige', 'vorheriger', 'nutzerfrage', 'aktuelle', + 'folgefrage', 'frage', 'antwort', 'welche', 'welcher', 'welches', + 'welchem', 'welchen', 'wird', 'werden', 'wurde', 'kann', 'koennen', + 'können', 'mit', 'der', 'die', 'das', 'den', 'dem', 'ein', 'eine', + 'einer', 'eines', 'ist', 'sind', 'was', 'wie', 'wo', 'zum', 'zur', + 'fuer', 'für', 'durch', 'von', 'vom', 'und', 'oder', 'auch', + ], true); + } + /** * Builds synthetic scores for exact-title fast-path selections. *