From 868f9a885752515e84f199c99688bf30031279c3 Mon Sep 17 00:00:00 2001
From: team 1
Date: Fri, 24 Apr 2026 12:02:34 +0200
Subject: [PATCH] fix retrieve final

---
NOTE(review): line breaks and indentation of this patch were reconstructed
from a whitespace-collapsed copy. Blank context lines, indentation depth and
the wrapping of multi-item array context lines are assumptions; verify them
against the target files before applying.

 src/Agent/AgentRunner.php                     | 121 ++++++++++++-
 src/Config/PromptBuilderConfig.php            |   3 +
 .../Retrieval/NdjsonHybridRetriever.php       | 163 +++++++++++++++++-
 3 files changed, 282 insertions(+), 5 deletions(-)

diff --git a/src/Agent/AgentRunner.php b/src/Agent/AgentRunner.php
index 29be76a..e0bdbaa 100644
--- a/src/Agent/AgentRunner.php
+++ b/src/Agent/AgentRunner.php
@@ -291,8 +291,9 @@ final readonly class AgentRunner
 
         $history = $this->contextService->buildUserContextWithinBudget($userId, 3000);
         $previousQuestions = $this->extractRecentUserQuestions($history, 2);
+        $referenceAnchors = $this->extractLatestAssistantReferenceAnchors($history);
 
-        if ($previousQuestions === []) {
+        if ($previousQuestions === [] && $referenceAnchors === []) {
             return $prompt;
         }
 
@@ -302,6 +303,11 @@ final readonly class AgentRunner
             $lines[] = 'Vorherige Nutzerfrage: ' . $question;
         }
 
+        if ($referenceAnchors !== []) {
+            $lines[] = 'Vorherige technische Referenzanker (nur zur Referenzauflösung, keine Faktenquelle): '
+                . implode(' ', $referenceAnchors);
+        }
+
         $lines[] = 'Aktuelle Folgefrage: ' . $prompt;
 
         return implode("\n", $lines);
@@ -408,6 +414,119 @@ final readonly class AgentRunner
         return array_slice($questions, -$limit);
     }
 
+    /**
+     * Extracts stable reference anchors from the latest assistant answer.
+     *
+     * These anchors are only used to resolve follow-up references such as
+     * "der Wert" or "welcher Indikator". They are not factual evidence for
+     * the final answer. To avoid propagating wrong earlier answers, only the
+     * first explicit Testomat model reference and the first explicit °dH value
+     * are kept. Indicator names, reagent codes, prices, URLs and product
+     * numbers are intentionally ignored here.
+     *
+     * @return string[]
+     */
+    private function extractLatestAssistantReferenceAnchors(string $history): array
+    {
+        $turn = $this->extractLatestHistoryTurn($history);
+
+        if ($turn === '') {
+            return [];
+        }
+
+        $answer = preg_replace('/^Question:\s*.*(?:\R|$)/u', '', $turn, 1) ?? '';
+        $answer = trim($answer);
+
+        if ($answer === '') {
+            return [];
+        }
+
+        $anchors = [];
+
+        $model = $this->extractFirstTestomatModelAnchor($answer);
+        if ($model !== '') {
+            $anchors[] = $model;
+        }
+
+        $hardnessValue = $this->extractFirstHardnessValueAnchor($answer);
+        if ($hardnessValue !== '') {
+            $anchors[] = $hardnessValue;
+        }
+
+        return array_values(array_unique($anchors));
+    }
+
+    /**
+     * Splits the history in front of every line that starts with "Question:"
+     * and returns the last non-empty, trimmed part ('' when there is none).
+     */
+    private function extractLatestHistoryTurn(string $history): string
+    {
+        $history = trim($history);
+
+        if ($history === '') {
+            return '';
+        }
+
+        $parts = preg_split('/(?=^Question:\s)/m', $history);
+
+        if ($parts === false || $parts === []) {
+            return '';
+        }
+
+        $turns = array_values(array_filter(
+            array_map(static fn(string $part): string => trim($part), $parts),
+            static fn(string $part): bool => $part !== ''
+        ));
+
+        if ($turns === []) {
+            return '';
+        }
+
+        return (string) end($turns);
+    }
+
+    /**
+     * Returns the first explicit Testomat model reference found in $text (for
+     * example "Testomat EVO TH"), or '' when there is no match. The match is
+     * passed through sanitizeHistoryQuestion(), whitespace runs are collapsed
+     * and the ® sign is removed.
+     */
+    private function extractFirstTestomatModelAnchor(string $text): string
+    {
+        // The two-to-six letter variant suffix is matched case-sensitively via
+        // (?-i:...): under the global /i flag it would also swallow the next
+        // ordinary word, e.g. "Testomat EVO misst" instead of "Testomat EVO".
+        $pattern = '/\bTestomat(?:®)?\s+'
+            . '(?:\d{3,4}|EVO(?:\s+(?-i:[A-Z]{2,6}))?|ECO(?:[-\s]?(?:PLUS|C))?|DUO(?:\s+\d{3,4})?|LAB(?:\s+(?-i:[A-Z]{2,6}))?)'
+            . '\b/iu';
+
+        if (preg_match($pattern, $text, $matches) !== 1) {
+            return '';
+        }
+
+        $value = $this->sanitizeHistoryQuestion((string) ($matches[0] ?? ''));
+        $value = preg_replace('/\s+/u', ' ', $value) ?? $value;
+
+        return trim(str_replace('®', '', $value));
+    }
+
+    /**
+     * Returns the first explicit hardness value in °dH found in $text (for
+     * example "0,05 °dH") with whitespace runs collapsed, or '' when there is
+     * no match.
+     */
+    private function extractFirstHardnessValueAnchor(string $text): string
+    {
+        if (preg_match('/\b\d+(?:[,.]\d+)?\s*°\s*dH\b/iu', $text, $matches) !== 1) {
+            return '';
+        }
+
+        $value = preg_replace('/\s+/u', ' ', (string) ($matches[0] ?? '')) ?? '';
+
+        return trim($value);
+    }
+
     private function sanitizeHistoryQuestion(string $question): string
     {
         $question = trim((string) preg_replace('/\s+/u', ' ', $question));
diff --git a/src/Config/PromptBuilderConfig.php b/src/Config/PromptBuilderConfig.php
index a94fb66..7e44631 100644
--- a/src/Config/PromptBuilderConfig.php
+++ b/src/Config/PromptBuilderConfig.php
@@ -310,6 +310,9 @@ final class PromptBuilderConfig
             '- If the source lists application areas, repeat only those areas and do not broaden them.',
             '- If the source names an indicator and threshold, reproduce that exactly without extrapolation.',
             '- For lowest, highest, smallest, largest, minimum, maximum, Grenzwert, Messbereich or Aufloesung questions, first identify the exact numeric extreme from the retrieved knowledge and answer that value directly.',
+            '- For lowest/highest/minimum/maximum questions, answer only the requested extreme unless the user explicitly asks for a comparison or alternatives.',
+            '- Do not add the runner-up product, second-lowest value, or adjacent range unless the user asks for it.',
+            '- For follow-up questions such as "which indicator measures that value", first resolve the referenced value/device, then use the retrieved source entry that explicitly connects value, device and indicator.',
             '- For numeric extreme questions, do not combine a value, device name, indicator name, range or product variant from different chunks unless the same retrieved entry explicitly connects them.',
             '- If several devices or indicators are present, keep each device-indicator-range assignment separate and do not transfer an indicator from one product to another.',
             '- If the source states only a threshold function, do not expand it into broader control logic.',
diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
index 35e3530..d2e6526 100644
--- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
+++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
@@ -655,12 +655,15 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
             return [];
         }
 
-        $queryTokens = $this->buildExactDocumentSelectionTokens($prompt);
+        $queryTokens = $this->expandExactSelectionTokenVariants(
+            $this->buildExactDocumentSelectionTokens($prompt)
+        );
 
         if ($queryTokens === []) {
             return $this->firstChunkIdsFromRows($orderedRows, $max);
         }
 
+        $detailFocus = $this->buildExactDocumentDetailFocus($prompt);
         $scored = [];
 
         foreach ($orderedRows as $order => $row) {
@@ -672,7 +675,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
             }
 
             $haystack = $this->normalizeText($this->extractDocumentTitle($row) . ' ' . $text);
-            $haystackTokens = array_fill_keys($this->tokenizeText($haystack), true);
+            $haystackTokens = array_fill_keys(
+                $this->expandExactSelectionTokenVariants($this->tokenizeText($haystack)),
+                true
+            );
             $score = 0.0;
 
             foreach ($queryTokens as $token) {
@@ -693,6 +699,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
                 $score += 2.0;
             }
 
+            $score += $this->scoreExactDocumentDetailFocus($detailFocus, $haystack, $text);
+
             // Keep early chunks slightly competitive for overview facts,
             // without letting them hide strongly matching detail chunks.
             $score += max(0.0, 1.0 - ($order * 0.05));
@@ -815,10 +823,155 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
         return array_values(array_unique($out));
     }
 
+    /**
+     * Expands every token into its variants via exactSelectionTokenVariants()
+     * and returns the de-duplicated list without empty strings.
+     *
+     * @param string[] $tokens
+     * @return string[]
+     */
+    private function expandExactSelectionTokenVariants(array $tokens): array
+    {
+        $out = [];
+
+        foreach ($tokens as $token) {
+            foreach ($this->exactSelectionTokenVariants($token) as $variant) {
+                $out[] = $variant;
+            }
+        }
+
+        return array_values(array_unique(array_filter(
+            $out,
+            static fn(string $token): bool => $token !== ''
+        )));
+    }
+
+    /**
+     * Returns the trimmed token together with its variants: stems produced by
+     * stripping a fixed list of suffixes from tokens of at least five
+     * characters (stems shorter than three characters are skipped) and fixed
+     * base terms for the indikator, grenzwert, messbereich and testomat
+     * prefixes. An empty token yields an empty list.
+     *
+     * @return string[]
+     */
+    private function exactSelectionTokenVariants(string $token): array
+    {
+        $token = trim($token);
+
+        if ($token === '') {
+            return [];
+        }
+
+        $variants = [$token];
+        $length = mb_strlen($token, 'UTF-8');
+
+        if ($length >= 5) {
+            foreach (['typen', 'innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'] as $suffix) {
+                if (!str_ends_with($token, $suffix)) {
+                    continue;
+                }
+
+                $stem = mb_substr($token, 0, $length - mb_strlen($suffix, 'UTF-8'), 'UTF-8');
+
+                if (mb_strlen($stem, 'UTF-8') >= 3) {
+                    $variants[] = $stem;
+                }
+            }
+        }
+
+        if (str_starts_with($token, 'indikator')) {
+            $variants[] = 'indikator';
+            $variants[] = 'indikatortyp';
+        }
+
+        if (str_starts_with($token, 'grenzwert')) {
+            $variants[] = 'grenzwert';
+        }
+
+        if (str_starts_with($token, 'messbereich')) {
+            $variants[] = 'messbereich';
+        }
+
+        if (str_starts_with($token, 'testomat')) {
+            $variants[] = 'testomat';
+        }
+
+        return array_values(array_unique($variants));
+    }
+
+    /**
+     * Determines whether the prompt asks for an indicator or reagent, based on
+     * its expanded tokens and on the phrases "mit welchem" and "womit".
+     *
+     * @return array{asks_indicator:bool}
+     */
+    private function buildExactDocumentDetailFocus(string $prompt): array
+    {
+        $normalized = $this->normalizeText($prompt);
+        $tokens = array_fill_keys(
+            $this->expandExactSelectionTokenVariants($this->tokenizeText($normalized)),
+            true
+        );
+
+        $asksIndicator = isset($tokens['indikator'])
+            || isset($tokens['indikatortyp'])
+            || isset($tokens['reagenz'])
+            || isset($tokens['reagens'])
+            || str_contains($normalized, 'mit welchem')
+            || str_contains($normalized, 'womit');
+
+        return [
+            'asks_indicator' => $asksIndicator,
+        ];
+    }
+
+    /**
+     * Gives detail chunks inside an already matched exact document a strong
+     * advantage for follow-up questions such as "which indicator measures that
+     * value". This remains scoped to the exact document, so it does not affect
+     * shop searches or broad product discovery.
+     *
+     * @param array{asks_indicator:bool} $detailFocus
+     */
+    private function scoreExactDocumentDetailFocus(array $detailFocus, string $normalizedHaystack, string $rawText): float
+    {
+        $score = 0.0;
+
+        if (!$detailFocus['asks_indicator']) {
+            return $score;
+        }
+
+        if (preg_match('/verf(?:ü|ue)gbare\s+indikatortypen|indikatortypen|indikatorvarianten/iu', $rawText) === 1) {
+            $score += 14.0;
+        }
+
+        if (preg_match('/\|\s*(?:typ|indikator)\s*\|\s*(?:grenzwert|messbereich|bereich)/iu', $rawText) === 1) {
+            $score += 10.0;
+        }
+
+        if (preg_match('/\|\s*[A-Z]{0,4}\s*\d{2,4}\s*[A-Z]?\s*\|\s*\d/iu', $rawText) === 1) {
+            $score += 8.0;
+        }
+
+        if (
+            str_contains($normalizedHaystack, 'indikator')
+            && (
+                str_contains($normalizedHaystack, 'grenzwert')
+                || str_contains($normalizedHaystack, 'messbereich')
+                || str_contains($normalizedHaystack, 'bereich')
+            )
+        ) {
+            $score += 5.0;
+        }
+
+        return $score;
+    }
+
     private function isExactDetailToken(string $token): bool
     {
         return in_array($token, [
-            'indikator', 'indikatortyp', 'reagenz', 'reagens', 'grenzwert',
+            'indikator', 'indikatoren', 'indikatortyp', 'indikatortypen', 'reagenz', 'reagens', 'grenzwert',
             'messbereich', 'bereich', 'wasserhaerte', 'wasserhärte', 'resthaerte',
             'resthärte', 'haerte', 'härte', 'aufloesung', 'auflösung',
             'schnittstelle', 'relais', 'fehlercode', 'code',
@@ -830,7 +983,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
     {
         return in_array($token, [
             'vorherige', 'vorheriger', 'nutzerfrage', 'aktuelle',
-            'folgefrage', 'frage', 'antwort', 'welche', 'welcher', 'welches',
+            'folgefrage', 'frage', 'antwort', 'technische', 'referenzanker',
+            'referenzaufloesung', 'referenzauflösung', 'faktenquelle', 'keine',
+            'welche', 'welcher', 'welches', 'welchem', 'welchen',
             'wird', 'werden', 'wurde', 'kann', 'koennen', 'können', 'mit',
             'der', 'die', 'das', 'den', 'dem', 'ein', 'eine', 'einer', 'eines',
             'ist', 'sind', 'was', 'wie', 'wo', 'zum', 'zur',