From 0fc34f4bc04f6214636e0206138f87449032c041 Mon Sep 17 00:00:00 2001 From: team 1 Date: Sat, 2 May 2026 19:17:59 +0200 Subject: [PATCH] patch 20 --- ...CH_20B_ROUTING_NORMALIZATION_FIX_README.md | 58 +++ ...PATCH_20_LLM_INPUT_NORMALIZATION_README.md | 173 ++++++++ config/retriex/agent.yaml | 107 +++++ config/retriex/commerce.yaml | 4 - config/retriex/intent.yaml | 13 +- src/Agent/AgentRunner.php | 404 +++++++++++++++++- src/Config/AgentRunnerConfig.php | 125 ++++++ src/Config/RetriexEffectiveConfigProvider.php | 79 ++++ 8 files changed, 940 insertions(+), 23 deletions(-) create mode 100644 RETRIEX_PATCH_20B_ROUTING_NORMALIZATION_FIX_README.md create mode 100644 RETRIEX_PATCH_20_LLM_INPUT_NORMALIZATION_README.md diff --git a/RETRIEX_PATCH_20B_ROUTING_NORMALIZATION_FIX_README.md b/RETRIEX_PATCH_20B_ROUTING_NORMALIZATION_FIX_README.md new file mode 100644 index 0000000..1b408e6 --- /dev/null +++ b/RETRIEX_PATCH_20B_ROUTING_NORMALIZATION_FIX_README.md @@ -0,0 +1,58 @@ +# RetrieX Patch 20b – Routing Normalization Fix + +## Ziel + +Patch 20b korrigiert den ersten LLM-Normalisierungsansatz aus p20. Die LLM-gestützte Eingabekorrektur bleibt erhalten, wird aber robuster und sauberer im Routing verwendet. + +## Problem + +Der Test `was kpstet der indikator` blieb nach p20 weiterhin RAG-only. Ursache: Wenn das LLM wegen strenger Guardrails unsicher war, konnte es den Originaltext unverändert zurückgeben. Dann erkannte die Commerce-/Shop-Intent-Logik das falsch geschriebene Routing-Wort nicht. + +Zusätzlich darf die normalisierte Routing-Frage nicht die eigentliche Nutzerfrage in History und Antwortkontext ersetzen. + +## Lösung + +- Originalprompt bleibt erhalten. +- Separater `routingPrompt` wird vor Intent-, RAG- und Shop-Routing verwendet. +- LLM-Normalisierung bleibt der erste Schritt. +- Danach generischer Fuzzy-Fallback gegen kanonische Routing-Terme aus YAML. +- Keine Tippfehlerlisten wie manuell gepflegte Vertipper. 
+- Fuzzy-Matching arbeitet nur gegen kanonische Signalwörter wie `kostet`, `suche`, `shop`, `messgerät`, `lösung`, `schwimmbad` usw. +- Mehrdeutige oder unsichere Korrekturen werden verworfen. +- Produktnamen, Zahlen und fachliche Ergänzungen werden nicht hinzugefügt. + +## Zusätzliche Intent-Korrektur + +Die Formulierung `ich suche eine preiswerte Lösung zur messung von pH & Chlor für mein schwimmbad` wird als beratende Produktsuche behandelt. Dafür wurde ein zusätzliches Advisory-Product-Selection-Pattern sowie Preiswert-/Günstig-Signale in YAML ergänzt. + +## Geänderte Dateien + +- `src/Agent/AgentRunner.php` +- `src/Config/AgentRunnerConfig.php` +- `src/Config/RetriexEffectiveConfigProvider.php` +- `config/retriex/agent.yaml` +- `config/retriex/intent.yaml` + +## Pflichtchecks + +```bash +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +bin/console mto:agent:config:audit-patterns --details +``` + +## Manuelle Regressionen + +1. `was kpstet der indikator` + - Erwartung: interne Routing-Normalisierung zu `was kostet der indikator` + - Shop-Suche wird angefragt + +2. `ich suche eine preiswerte Lösung zur messung von pH & Chlor für mein schwimmbad` + - Erwartung: beratende Shop-/Produktsuche wird ausgelöst + - Chlor-/pH-Messfähigkeit darf genannt werden + - Schwimmbad-Eignung nur vorsichtig formulieren, wenn nicht explizit belegt + +3. 
Stabile 1.5.1-Baseline + - `Was ist der niedrigste Grenzwert für die Wasserhärte, welcher mit einem Testomaten überwacht werden kann?` + - `mit welchem indikator wird der wert gemessen` diff --git a/RETRIEX_PATCH_20_LLM_INPUT_NORMALIZATION_README.md b/RETRIEX_PATCH_20_LLM_INPUT_NORMALIZATION_README.md new file mode 100644 index 0000000..415beda --- /dev/null +++ b/RETRIEX_PATCH_20_LLM_INPUT_NORMALIZATION_README.md @@ -0,0 +1,173 @@ +# RetrieX Patch 20 - LLM-assisted Input Normalization before Routing + +## Ziel + +Patch 20 ersetzt den p19-Symptom-Fix fuer einzelne Preis-Tippfehler (`kpstet`, `ksotet`) durch eine generische, LLM-gestuetzte Eingabe-Normalisierung vor der Intent-/Commerce-/Retrieval-Erkennung. + +Der Ausloeser war der Flow: + +1. `Was ist der niedrigste Grenzwert fuer die Wasserhaerte, welcher mit einem Testomaten ueberwacht werden kann?` +2. `mit welchem indikator wird der wert gemessen` +3. `was kpstet der indikator` + +p19 konnte diesen konkreten Tippfehler per YAML-Liste auffangen, skaliert aber nicht. p20 normalisiert die Nutzereingabe vorher, z. B. `was kpstet der indikator` -> `was kostet der indikator`, ohne Produktkontext fachlich aufzufuellen. + +## Architektur + +Der Normalisierungsschritt sitzt in `AgentRunner::run()` direkt nach `analyze_request` und vor: + +- URL-/Quellenpruefung +- CommerceIntentLite-Erkennung +- Knowledge-Retrieval-Prompt-Bau +- Shop-Query-Optimierung +- finalem PromptBuilder-Aufruf + +Das LLM ist ueber den bereits vorhandenen `final class OllamaClient` angebunden und nutzt analog zur Shop-Query-Optimierung: + +```php +foreach ($this->ollamaClient->stream($normalizationPrompt) as $token) { + ... +} +``` + +Die Originalfrage bleibt erhalten und wird weiterhin in die Conversation History geschrieben. Die normalisierte Frage wird nur als effektive Eingabe fuer Routing, Intent, Retrieval, Shop-Optimierung und Antwortgenerierung genutzt. + +## Guardrails + +Die Normalisierung darf nur offensichtliche Tippfehler korrigieren. 
Sie darf nicht fachlich interpretieren. + +Konkrete Schutzmechanismen: + +- YAML-konfigurierbarer Normalizer-Prompt in `config/retriex/agent.yaml` +- keine fachliche Kontextauflösung im Normalizer +- keine Produktnamen, Modellnummern, Messwerte, Artikelnummern oder Einsatzbereiche hinzufuegen +- vage Referenzen wie `der indikator` bleiben vage und werden erst spaeter ueber bestehende Kontextlogik aufgeloest +- URL-/Code-aehnliche Eingaben werden uebersprungen +- maximale Eingabe-/Ausgabelaenge +- maximale Laengenvergroesserung +- maximale Token-Zunahme +- neue Zahlen in der normalisierten Eingabe werden verworfen +- bei Fehlern, leerer Ausgabe oder unsicherem Ergebnis faellt RetrieX auf die Originalfrage zurueck + +## Geaenderte Dateien + +- `src/Agent/AgentRunner.php` +- `src/Config/AgentRunnerConfig.php` +- `src/Config/RetriexEffectiveConfigProvider.php` +- `config/retriex/agent.yaml` +- `config/retriex/intent.yaml` +- `config/retriex/commerce.yaml` + +## Entfernt aus p19-Symptomlisten + +Die expliziten Tippfehler `kpstet` und `ksotet` wurden entfernt aus: + +- `intent.yaml` strong/non-product/price/explicit-commerce Listen +- `commerce.yaml` stopword- und correction-Listen + +Damit ist p20 nicht mehr auf diese konkreten Fehlerlisten angewiesen. 
+ +## Erwartetes Verhalten + +Eingabe: + +```text +was kpstet der indikator +``` + +Interne Normalisierung: + +```text +was kostet der indikator +``` + +Danach sollte der bestehende Commerce-/Shop-Follow-up-Flow greifen: + +- Commerce Intent wird erkannt +- Shop-Suche wird angefragt +- referenzieller Kontext `Indikatortyp 300` kann durch bestehende Shop-Query-Context-Anchor-Logik ergaenzt werden + +## Lokal ausgefuehrte Pruefungen + +Im Container ausgefuehrt: + +```bash +php -l src/Agent/AgentRunner.php +php -l src/Config/AgentRunnerConfig.php +php -l src/Config/RetriexEffectiveConfigProvider.php +python3 - <<'PY' +import yaml +from pathlib import Path +for rel in ['config/retriex/agent.yaml','config/retriex/intent.yaml','config/retriex/commerce.yaml']: + with (Path('.') / rel).open() as f: + yaml.safe_load(f) +PY +php -r '$patterns=["/^(?:normalisiert|korrigiert|corrected|normalized)\\s*:\\s*/iu","/https?:\\/\\//iu","/\\bwww\\./iu","/```/u"]; foreach($patterns as $p){ if(@preg_match($p, "was kpstet der indikator")===false){ exit(1); } } echo "OK\n";' +grep -R "kpstet\|ksotet" -n config src || true +``` + +Ergebnis: + +- PHP-Syntax: OK +- YAML-Parse: OK +- Regex-Smoke-Test: OK +- `kpstet` / `ksotet`: nicht mehr in `config` oder `src` + +## Nicht lokal ausfuehrbar + +Die Symfony-/Composer-basierten Pflichtchecks konnten im Container nicht ausgefuehrt werden, weil im ZIP keine installierten Vendor-Dependencies enthalten sind. + +Bitte nach dem Einspielen ausfuehren: + +```bash +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +bin/console mto:agent:config:audit-patterns --details +``` + +## Empfohlene Regressionstests + +1. Stabiler v1.5.1-Flow: + +```text +Was ist der niedrigste Grenzwert fuer die Wasserhaerte, welcher mit einem Testomaten ueberwacht werden kann? +mit welchem indikator wird der wert gemessen +was kostet der indikator +``` + +2. 
Tippfehler-Flow: + +```text +Was ist der niedrigste Grenzwert fuer die Wasserhaerte, welcher mit einem Testomaten ueberwacht werden kann? +mit welchem indikator wird der wert gemessen +was kpstet der indikator +``` + +Erwartung fuer beide Preisfragen: + +- Shop-Suche wird angefragt +- Shop-Treffer werden genutzt +- keine Rueckkehr in RAG-only mit Testomat-2000-Indikatoren + +3. Guardrail-Test: + +```text +was kpstet der indikator 300 +``` + +Erwartung: + +- Normalisierung darf `300` erhalten +- keine neue Modellnummer / Artikelnummer hinzufuegen + +4. URL-Skip-Test: + +```text +pruefe https://example.com/test?x=kpstet +``` + +Erwartung: + +- Normalisierung wird uebersprungen +- URL bleibt unveraendert diff --git a/config/retriex/agent.yaml b/config/retriex/agent.yaml index 499c63b..57fd22e 100644 --- a/config/retriex/agent.yaml +++ b/config/retriex/agent.yaml @@ -8,6 +8,113 @@ parameters: optimized_shop_query_prefix_pattern: '/^(?:keywords?|suchquery|search\s*query|query)\s*:\s*/iu' optimized_shop_query_trim_characters: " \t\n\r\0\x0B\"'`" + input_normalization: + enabled: true + max_input_chars: 500 + max_output_chars: 700 + max_added_tokens: 2 + max_length_ratio_percent: 150 + heartbeat_message: 'Ich prüfe die Eingabe auf Tippfehler…' + output_prefix_pattern: '/^(?:normalisiert|korrigiert|corrected|normalized)\s*:\s*/iu' + skip_patterns: + - '/https?:\/\//iu' + - '/\bwww\./iu' + - '/```/u' + prompt: + intro: 'Normalize the following user input for RetrieX routing before intent detection.' + output_format_block: |- + Output format: + + current_user_input_label: 'USER INPUT' + rules: + - '- Output only the normalized user input.' + - '- Correct only obvious typing mistakes, transposed letters, missing umlauts, spacing, and punctuation that clearly preserve the same meaning.' + - '- Do not add product names, model numbers, article numbers, measurement values, parameters, brands, or application areas that are not already present in the input.' 
+ - '- Preserve product names, model numbers, article numbers, chemical symbols, units, pH, Redox, ORP, and measurement values exactly unless only letter casing is corrected.' + - '- Preserve the input language; do not translate German into English or English into German.' + - '- Preserve vague references such as "der indikator", "das gerät", "suche im shop", or "dazu" without resolving them from context.' + - '- If the input is already clear or you are uncertain, return the original input unchanged.' + - '- No introduction, no explanation, no quotation marks.' + + fuzzy_routing: + enabled: true + min_token_length: 4 + medium_token_length: 7 + long_token_length: 11 + max_distance_short: 1 + max_distance_medium: 2 + max_distance_long: 3 + min_similarity_percent: 72 + # Canonical routing terms only, not typo variants. + # The code fuzzy-matches user tokens against these terms when the LLM leaves + # an obvious routing typo unchanged. + terms: + - shop + - suche + - suchen + - such + - finde + - finden + - kostet + - kosten + - preis + - preise + - preiswert + - preiswerte + - günstig + - guenstig + - kaufen + - bestellen + - produkt + - produkte + - artikel + - sku + - online + - analysegerät + - analysegeraet + - messgerät + - messgeraet + - handmessgerät + - handmessgeraet + - pockettester + - analysator + - analyzer + - indikator + - indikatoren + - reagenz + - reagenzien + - verbrauchsmaterial + - zubehör + - zubehoer + - ersatzteil + - ersatzteile + - anschlusskabel + - kabel + - sensorkabel + - elektrode + - elektrodenkabel + - puffer + - kalibrierpuffer + - kalibrierlösung + - kalibrierloesung + - kalibrierung + - lösung + - loesung + - messen + - messung + - überwachen + - ueberwachen + - kontrollieren + - schwimmbad + - pool + - becken + - wasseranalyse + - geeignet + - passend + - empfehlung + - empfehlen + - empfiehl + follow_up_context: strong_reference_patterns: - '/\bder\s+wert\b/u' diff --git a/config/retriex/commerce.yaml 
b/config/retriex/commerce.yaml index 4a6dde9..478e0ca 100644 --- a/config/retriex/commerce.yaml +++ b/config/retriex/commerce.yaml @@ -110,8 +110,6 @@ parameters: - preise - preisen - kostet - - kpstet - - ksotet - kosten - ua - also @@ -144,8 +142,6 @@ parameters: indicatoren: indikatoren schwinnbad: schwimmbad schwimbad: schwimmbad - kpstet: kostet - ksotet: kostet search_token_canonical_map: indikatoren: indikator diff --git a/config/retriex/intent.yaml b/config/retriex/intent.yaml index 08109a3..52c7569 100644 --- a/config/retriex/intent.yaml +++ b/config/retriex/intent.yaml @@ -14,8 +14,6 @@ parameters: - sku - kaufen - kostet - - kpstet - - ksotet - suche - such - finde @@ -54,8 +52,6 @@ parameters: - online - kaufen - kostet - - kpstet - - ksotet - suche - such - finde @@ -79,6 +75,7 @@ parameters: - '/\bwelche(?:r|s|n|m)?\s+(?:testomat(?:en)?|pockettester|pocket\s+tester|analysegerät|analysegeraet|messgerät|messgeraet|analysator|analyzer)\b.*\b(?:kann|können|koennen|misst|messen|überwacht|ueberwacht|eignet|geeignet|passt|gut|empfehl)\b.*\b(?:messen|messung|überwach(?:en|ung)?|ueberwach(?:en|ung)?)\b/u' - '/\b(?:testomat(?:en)?|pockettester|pocket\s+tester|analysegerät|analysegeraet|messgerät|messgeraet|analysator|analyzer)\b.*\b(?:für|fuer)\b.*\b(?:messung|messen|überwachung|ueberwachung)\b/u' - '/\b(?:ich\s+)?(?:würde|wuerde|möchte|moechte|will|brauche|benötige|benoetige)\b.{0,80}\b(?:messen|messung|überwachen|ueberwachen|kontrollieren)\b/u' + - '/\b(?:ich\s+)?(?:suche|finde)\b.{0,120}\b(?:lösung|loesung|gerät|geraet|messgerät|messgeraet|handmessgerät|handmessgeraet|pockettester|analysegerät|analysegeraet|analysator|produkt|artikel)\b.{0,120}\b(?:messen|messung|überwachen|ueberwachen|kontrollieren|wasseranalyse|schwimmbad|pool|becken)\b/u' - '/\b(?:messen|messung|überwachen|ueberwachen|kontrollieren)\b.{0,80}\b(?:schwimmbad|pool|becken|wasseranalyse)\b/u' price_terms: - euro @@ -88,8 +85,10 @@ parameters: - preis - kosten - kostet - - kpstet - - ksotet + - 
preiswert + - preiswerte + - günstig + - guenstig color_terms: - schwarz - weiß @@ -144,8 +143,6 @@ parameters: - '/\bpreis\b/u' - '/\bkosten\b/u' - '/\bkostet\b/u' - - '/\bkpstet\b/u' - - '/\bksotet\b/u' - '/\bkaufen\b/u' - '/\bbestellen\b/u' - '/\bprodukt\b/u' diff --git a/src/Agent/AgentRunner.php b/src/Agent/AgentRunner.php index 2883733..08aa5c9 100644 --- a/src/Agent/AgentRunner.php +++ b/src/Agent/AgentRunner.php @@ -42,7 +42,9 @@ final readonly class AgentRunner public function run(string $prompt, string $userId, bool $forceFullContext = false, string $requestContextHint = ''): Generator { - $prompt = trim($prompt); + $originalPrompt = trim($prompt); + $prompt = $originalPrompt; + $routingPrompt = $prompt; if ($prompt === '') { yield $this->systemMsg($this->agentRunnerConfig->getEmptyPromptMessage(), 'err'); @@ -87,26 +89,37 @@ final readonly class AgentRunner ); yield $this->systemMsg($this->agentRunnerConfig->getAnalyzeRequestMessage(), 'think'); + + $normalizedPrompt = yield from $this->normalizePromptForRouting($prompt, $userId); + if ($normalizedPrompt !== $prompt) { + $this->agentLogger->info('Prompt normalized before routing', [ + 'userId' => $userId, + 'originalPrompt' => $prompt, + 'normalizedPrompt' => $normalizedPrompt, + ]); + $routingPrompt = $normalizedPrompt; + } + yield $this->systemMsg($this->agentRunnerConfig->getCheckInternetSourcesMessage(), 'think'); - $urlContent = $this->urlAnalyzer->extractContentFromPrompt($prompt); + $urlContent = $this->urlAnalyzer->extractContentFromPrompt($originalPrompt); if ($urlContent !== '') { $this->addSource($sources, $this->agentRunnerConfig->getExternalUrlSourceLabel()); } - $commerceIntent = $this->detectCommerceIntent($prompt); + $commerceIntent = $this->detectCommerceIntent($routingPrompt); yield $this->systemMsg($this->agentRunnerConfig->getRetrieveKnowledgeMessage(), 'think'); $knowledgeRetrievalPrompt = $this->buildKnowledgeRetrievalPrompt( - prompt: $prompt, + prompt: $routingPrompt, userId: 
$userId, commerceIntent: $commerceIntent ); - $usedFollowUpRetrievalContext = $knowledgeRetrievalPrompt !== $prompt; + $usedFollowUpRetrievalContext = $knowledgeRetrievalPrompt !== $routingPrompt; $knowledgeChunks = $this->retriever->retrieve($knowledgeRetrievalPrompt); - $knowledgeEvidenceState = $this->resolveKnowledgeEvidenceState($prompt, $knowledgeChunks, $urlContent); + $knowledgeEvidenceState = $this->resolveKnowledgeEvidenceState($routingPrompt, $knowledgeChunks, $urlContent); if ($knowledgeChunks !== []) { $this->addSource($sources, $this->agentRunnerConfig->getRagKnowledgeSourceLabel()); } @@ -127,6 +140,7 @@ final readonly class AgentRunner $this->agentLogger->info('Knowledge retrieval used follow-up context', [ 'userId' => $userId, 'prompt' => $prompt, + 'routingPrompt' => $routingPrompt, 'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt, 'commerceIntent' => $commerceIntent, ]); @@ -154,13 +168,13 @@ final readonly class AgentRunner } $optimizedShopQuery = yield from $this->buildOptimizedShopQuery( - $prompt, + $routingPrompt, $userId, $commerceHistoryContext ); $shopSearchQuery = $this->resolveShopSearchQuery( - prompt: $prompt, + prompt: $routingPrompt, optimizedShopQuery: $optimizedShopQuery, commerceHistoryContext: $commerceHistoryContext, userId: $userId @@ -171,6 +185,7 @@ final readonly class AgentRunner 'userId' => $userId, 'commerceIntent' => $commerceIntent, 'prompt' => $prompt, + 'routingPrompt' => $routingPrompt, 'optimizedShopQuery' => $optimizedShopQuery, 'hasCommerceHistoryContext' => $commerceHistoryContext !== '', 'commerceHistoryContextLength' => mb_strlen($commerceHistoryContext), @@ -199,7 +214,7 @@ final readonly class AgentRunner $this->contextService->appendHistory( $userId, - $prompt, + $originalPrompt, $this->plainTextFromHtml($noConcreteShopQueryMessage) ); @@ -484,7 +499,7 @@ final readonly class AgentRunner if ($historyResponse !== '') { $this->contextService->appendHistory( $userId, - $prompt, + $originalPrompt, 
$historyResponse ); } @@ -494,6 +509,10 @@ final readonly class AgentRunner 'outputLength' => mb_strlen($fullOutput), 'contextMode' => $forceFullContext ? 'full' : 'recent', 'commerceIntent' => $commerceIntent, + 'originalPrompt' => $originalPrompt, + 'effectivePrompt' => $prompt, + 'routingPrompt' => $routingPrompt, + 'promptWasNormalized' => $routingPrompt !== $originalPrompt, 'primaryShopResultsCount' => count($primaryShopResults), 'shopResultsCount' => count($shopResults), 'attemptedShopRepair' => $attemptedShopRepair, @@ -539,11 +558,374 @@ final readonly class AgentRunner )); if ($historyResponse !== '') { - $this->contextService->appendHistory($userId, $prompt, $historyResponse); + $this->contextService->appendHistory($userId, $originalPrompt, $historyResponse); } } } + /** + * @return Generator + */ + private function normalizePromptForRouting(string $prompt, string $userId): Generator + { + if (!$this->agentRunnerConfig->isInputNormalizationEnabled()) { + return $prompt; + } + + if ($this->shouldSkipInputNormalization($prompt)) { + return $prompt; + } + + $normalizationPrompt = trim($this->agentRunnerConfig->getInputNormalizationPrompt($prompt)); + if ($normalizationPrompt === '') { + return $prompt; + } + + $candidate = ''; + $lastHeartbeatAt = time(); + $this->thinkSuppressor->reset(); + + try { + foreach ($this->ollamaClient->stream($normalizationPrompt) as $token) { + if (!is_string($token)) { + continue; + } + + if (time() - $lastHeartbeatAt >= 2) { + yield $this->systemMsg($this->agentRunnerConfig->getInputNormalizationHeartbeatMessage(), 'think'); + $lastHeartbeatAt = time(); + } + + $cleanToken = $this->thinkSuppressor->filter($token); + if ($cleanToken === '') { + continue; + } + + $candidate .= $cleanToken; + } + } catch (Throwable $e) { + $this->agentLogger->warning('Prompt normalization failed, falling back to fuzzy routing-signal normalization', [ + 'userId' => $userId, + 'exception' => $e, + ]); + + return 
$this->applyFuzzyRoutingSignalNormalization($prompt, $prompt); + } + + $normalized = $this->sanitizeNormalizedPromptForRouting($candidate, $prompt); + + return $this->applyFuzzyRoutingSignalNormalization($normalized, $prompt); + } + + private function shouldSkipInputNormalization(string $prompt): bool + { + if (mb_strlen($prompt, 'UTF-8') > $this->agentRunnerConfig->getInputNormalizationMaxInputChars()) { + return true; + } + + foreach ($this->agentRunnerConfig->getInputNormalizationSkipPatterns() as $pattern) { + if (@preg_match($pattern, $prompt) === 1) { + return true; + } + } + + return false; + } + + private function sanitizeNormalizedPromptForRouting(string $candidate, string $originalPrompt): string + { + $candidate = trim($candidate); + if ($candidate === '') { + return $originalPrompt; + } + + $candidate = preg_split('/\R{2,}/u', $candidate, 2)[0] ?? $candidate; + $candidate = trim($candidate); + $candidate = preg_replace($this->agentRunnerConfig->getInputNormalizationOutputPrefixPattern(), '', $candidate) ?? $candidate; + $candidate = trim($candidate, $this->agentRunnerConfig->getOptimizedShopQueryTrimCharacters()); + $candidate = preg_replace('/\s+/u', ' ', $candidate) ?? 
$candidate; + $candidate = trim($candidate); + + if ($candidate === '') { + return $originalPrompt; + } + + if (mb_strlen($candidate, 'UTF-8') > $this->agentRunnerConfig->getInputNormalizationMaxOutputChars()) { + return $originalPrompt; + } + + if ($this->normalizeRoutingComparisonText($candidate) === $this->normalizeRoutingComparisonText($originalPrompt)) { + return $originalPrompt; + } + + if (!$this->isSafeNormalizedPromptCandidate($candidate, $originalPrompt)) { + return $originalPrompt; + } + + return $candidate; + } + + private function applyFuzzyRoutingSignalNormalization(string $candidate, string $originalPrompt): string + { + if (!$this->agentRunnerConfig->isInputNormalizationFuzzyRoutingEnabled()) { + return $candidate; + } + + $terms = $this->buildFuzzyRoutingTermIndex(); + if ($terms === []) { + return $candidate; + } + + $minLength = $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMinTokenLength(); + $changed = false; + + $normalized = preg_replace_callback( + '/(?resolveFuzzyRoutingTokenReplacement($token, $terms); + + if ($replacement === null || $replacement === $token) { + return $token; + } + + $changed = true; + + return $replacement; + }, + $candidate + ); + + if (!is_string($normalized) || !$changed) { + return $candidate; + } + + $normalized = preg_replace('/\s+/u', ' ', trim($normalized)) ?? 
trim($normalized); + if ($normalized === '' || $this->normalizeRoutingComparisonText($normalized) === $this->normalizeRoutingComparisonText($candidate)) { + return $candidate; + } + + if (!$this->isSafeNormalizedPromptCandidate($normalized, $originalPrompt)) { + return $candidate; + } + + return $normalized; + } + + /** + * @return array + */ + private function buildFuzzyRoutingTermIndex(): array + { + $terms = []; + + foreach ($this->agentRunnerConfig->getInputNormalizationFuzzyRoutingTerms() as $term) { + $term = trim($term); + if ($term === '') { + continue; + } + + $normalized = $this->normalizeFuzzyRoutingToken($term); + if ($normalized === '') { + continue; + } + + $terms[$normalized] ??= mb_strtolower($term, 'UTF-8'); + } + + return $terms; + } + + /** + * @param array $terms + */ + private function resolveFuzzyRoutingTokenReplacement(string $token, array $terms): ?string + { + $normalizedToken = $this->normalizeFuzzyRoutingToken($token); + if ($normalizedToken === '' || isset($terms[$normalizedToken])) { + return null; + } + + $bestTerm = null; + $bestDistance = PHP_INT_MAX; + $ambiguous = false; + $tokenLength = max(1, strlen($normalizedToken)); + + foreach ($terms as $normalizedTerm => $term) { + $termLength = strlen($normalizedTerm); + if (abs($tokenLength - $termLength) > $this->resolveFuzzyRoutingMaxDistance(max($tokenLength, $termLength))) { + continue; + } + + $distance = $this->calculateFuzzyRoutingDistance($normalizedToken, $normalizedTerm); + $maxLength = max($tokenLength, $termLength); + $maxDistance = $this->resolveFuzzyRoutingMaxDistance($maxLength); + if ($distance > $maxDistance) { + continue; + } + + $similarityPercent = (int) round((1 - ($distance / max(1, $maxLength))) * 100); + if ($similarityPercent < $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMinSimilarityPercent()) { + continue; + } + + if ($distance < $bestDistance) { + $bestDistance = $distance; + $bestTerm = $term; + $ambiguous = false; + continue; + } + + if 
($distance === $bestDistance && $term !== $bestTerm) { + $ambiguous = true; + } + } + + if ($bestTerm === null || $ambiguous) { + return null; + } + + return $bestTerm; + } + + private function calculateFuzzyRoutingDistance(string $left, string $right): int + { + $leftLength = strlen($left); + $rightLength = strlen($right); + + if ($leftLength === 0) { + return $rightLength; + } + + if ($rightLength === 0) { + return $leftLength; + } + + $distance = []; + for ($i = 0; $i <= $leftLength; $i++) { + $distance[$i] = [$i]; + } + + for ($j = 0; $j <= $rightLength; $j++) { + $distance[0][$j] = $j; + } + + for ($i = 1; $i <= $leftLength; $i++) { + for ($j = 1; $j <= $rightLength; $j++) { + $cost = $left[$i - 1] === $right[$j - 1] ? 0 : 1; + $distance[$i][$j] = min( + $distance[$i - 1][$j] + 1, + $distance[$i][$j - 1] + 1, + $distance[$i - 1][$j - 1] + $cost + ); + + if ( + $i > 1 + && $j > 1 + && $left[$i - 1] === $right[$j - 2] + && $left[$i - 2] === $right[$j - 1] + ) { + $distance[$i][$j] = min($distance[$i][$j], $distance[$i - 2][$j - 2] + 1); + } + } + } + + return $distance[$leftLength][$rightLength]; + } + + private function resolveFuzzyRoutingMaxDistance(int $tokenLength): int + { + if ($tokenLength >= $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingLongTokenLength()) { + return $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMaxDistanceLong(); + } + + if ($tokenLength >= $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMediumTokenLength()) { + return $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMaxDistanceMedium(); + } + + return $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMaxDistanceShort(); + } + + private function normalizeFuzzyRoutingToken(string $token): string + { + $token = mb_strtolower(trim($token), 'UTF-8'); + $token = strtr($token, [ + 'ä' => 'ae', + 'ö' => 'oe', + 'ü' => 'ue', + 'ß' => 'ss', + ]); + $token = preg_replace('/[^a-z0-9]+/u', '', $token) ?? 
$token; + + return trim($token); + } + + private function isSafeNormalizedPromptCandidate(string $candidate, string $originalPrompt): bool + { + $originalLength = max(1, mb_strlen($originalPrompt, 'UTF-8')); + $candidateLength = mb_strlen($candidate, 'UTF-8'); + $maxLength = (int) ceil($originalLength * ($this->agentRunnerConfig->getInputNormalizationMaxLengthRatioPercent() / 100)); + + if ($candidateLength > $maxLength) { + return false; + } + + $originalTokens = $this->tokenizeInputNormalizationGuardText($originalPrompt); + $candidateTokens = $this->tokenizeInputNormalizationGuardText($candidate); + $maxAddedTokens = $this->agentRunnerConfig->getInputNormalizationMaxAddedTokens(); + + if (count($candidateTokens) > count($originalTokens) + $maxAddedTokens) { + return false; + } + + $originalNumbers = $this->extractInputNormalizationNumbers($originalPrompt); + foreach ($this->extractInputNormalizationNumbers($candidate) as $number) { + if (!in_array($number, $originalNumbers, true)) { + return false; + } + } + + return true; + } + + private function normalizeRoutingComparisonText(string $value): string + { + $value = mb_strtolower(trim($value), 'UTF-8'); + $value = preg_replace('/\s+/u', ' ', $value) ?? $value; + + return trim($value); + } + + /** Splits guard text into lowercased number and word tokens for the added-token check. + * @return string[] Empty when nothing matches or PCRE fails; preg_match_all() returns the match count, so only false signals failure. + */ + private function tokenizeInputNormalizationGuardText(string $value): array + { + if (preg_match_all('/\d+(?:[,.]\d+)?|[\p{L}\p{N}]+/u', mb_strtolower($value, 'UTF-8'), $matches) === false) { + return []; + } + + return array_values(array_filter( + array_map(static fn(string $token): string => trim($token), $matches[0] ?? 
[]), + static fn(string $token): bool => $token !== '' + )); + } + + /** Collects the distinct numbers in the text, writing decimal commas as dots, for the new-number check. + * @return string[] Empty when the text has no number or PCRE fails; preg_match_all() returns the match count, so only false signals failure. + */ + private function extractInputNormalizationNumbers(string $value): array + { + if (preg_match_all('/\d+(?:[,.]\d+)?/u', $value, $matches) === false) { + return []; + } + + return array_values(array_unique(array_map( + static fn(string $number): string => str_replace(',', '.', $number), + $matches[0] ?? [] + ))); + } + private function detectCommerceIntent(string $prompt): string { $commerceMeta = $this->commerceIntentLite->detect($prompt); diff --git a/src/Config/AgentRunnerConfig.php b/src/Config/AgentRunnerConfig.php index 6196b6b..a593eed 100644 --- a/src/Config/AgentRunnerConfig.php +++ b/src/Config/AgentRunnerConfig.php @@ -80,6 +80,131 @@ final class AgentRunnerConfig return $this->getRequiredString('follow_up_context.reference_anchor.hardness_value_pattern'); } + public function isInputNormalizationEnabled(): bool + { + return $this->getRequiredBool('input_normalization.enabled'); + } + + public function getInputNormalizationMaxInputChars(): int + { + return $this->getRequiredInt('input_normalization.max_input_chars'); + } + + public function getInputNormalizationMaxOutputChars(): int + { + return $this->getRequiredInt('input_normalization.max_output_chars'); + } + + public function getInputNormalizationMaxAddedTokens(): int + { + return $this->getRequiredInt('input_normalization.max_added_tokens'); + } + + public function getInputNormalizationMaxLengthRatioPercent(): int + { + return $this->getRequiredInt('input_normalization.max_length_ratio_percent'); + } + + public function getInputNormalizationHeartbeatMessage(): string + { + return $this->getRequiredString('input_normalization.heartbeat_message'); + } + + public function 
getInputNormalizationOutputPrefixPattern(): string + { + return $this->getRequiredString('input_normalization.output_prefix_pattern'); + } + + /** + * @return string[] + */ + public function getInputNormalizationSkipPatterns(): array + { + 
return $this->getRequiredStringList('input_normalization.skip_patterns'); + } + + public function getInputNormalizationPrompt(string $prompt): string + { + return $this->implodePromptBlocks([ + $this->getInputNormalizationIntro(), + $this->buildRulesBlock($this->getInputNormalizationRules()), + $this->getInputNormalizationOutputFormatBlock(), + $this->getInputNormalizationCurrentUserInputLabel() . ':', + trim($prompt), + ]); + } + + /** + * @return string[] + */ + public function getInputNormalizationRules(): array + { + return $this->getRequiredStringList('input_normalization.prompt.rules'); + } + + public function getInputNormalizationIntro(): string + { + return $this->getRequiredString('input_normalization.prompt.intro'); + } + + public function getInputNormalizationOutputFormatBlock(): string + { + return $this->getRequiredString('input_normalization.prompt.output_format_block'); + } + + public function getInputNormalizationCurrentUserInputLabel(): string + { + return $this->getRequiredString('input_normalization.prompt.current_user_input_label'); + } + + public function isInputNormalizationFuzzyRoutingEnabled(): bool + { + return $this->getRequiredBool('input_normalization.fuzzy_routing.enabled'); + } + + public function getInputNormalizationFuzzyRoutingMinTokenLength(): int + { + return $this->getRequiredInt('input_normalization.fuzzy_routing.min_token_length'); + } + + public function getInputNormalizationFuzzyRoutingMediumTokenLength(): int + { + return $this->getRequiredInt('input_normalization.fuzzy_routing.medium_token_length'); + } + + public function getInputNormalizationFuzzyRoutingLongTokenLength(): int + { + return $this->getRequiredInt('input_normalization.fuzzy_routing.long_token_length'); + } + + public function getInputNormalizationFuzzyRoutingMaxDistanceShort(): int + { + return $this->getRequiredInt('input_normalization.fuzzy_routing.max_distance_short'); + } + + public function getInputNormalizationFuzzyRoutingMaxDistanceMedium(): int + { + 
return $this->getRequiredInt('input_normalization.fuzzy_routing.max_distance_medium'); + } + + /** Returns the required int at config key input_normalization.fuzzy_routing.max_distance_long (validated as greater than or equal to 0). Presumably the maximum edit distance accepted for long tokens - TODO confirm in AgentRunner. */ public function getInputNormalizationFuzzyRoutingMaxDistanceLong(): int + { + return $this->getRequiredInt('input_normalization.fuzzy_routing.max_distance_long'); + } + + /** Returns the required int at config key input_normalization.fuzzy_routing.min_similarity_percent; the validation in RetriexEffectiveConfigProvider requires a value between 1 and 100. Presumably a minimum similarity score a fuzzy match must reach - TODO confirm in AgentRunner. */ public function getInputNormalizationFuzzyRoutingMinSimilarityPercent(): int + { + return $this->getRequiredInt('input_normalization.fuzzy_routing.min_similarity_percent'); + } + + /** + * @return string[] + */ + public function getInputNormalizationFuzzyRoutingTerms(): array + { + return $this->getRequiredStringList('input_normalization.fuzzy_routing.terms'); + } + private function getRequiredInt(string $key): int { $value = $this->requiredValue($key); diff --git a/src/Config/RetriexEffectiveConfigProvider.php b/src/Config/RetriexEffectiveConfigProvider.php index 5d9a740..8384380 100644 --- a/src/Config/RetriexEffectiveConfigProvider.php +++ b/src/Config/RetriexEffectiveConfigProvider.php @@ -441,6 +441,33 @@ final readonly class RetriexEffectiveConfigProvider 'product_search_knowledge_chunk_limit' => $this->agentRunnerConfig->getProductSearchKnowledgeChunkLimit(), 'advisory_product_search_knowledge_chunk_limit' => $this->agentRunnerConfig->getAdvisoryProductSearchKnowledgeChunkLimit(), 'optimized_shop_query_prefix_pattern' => $this->agentRunnerConfig->getOptimizedShopQueryPrefixPattern(), + 'input_normalization' => [ + 'enabled' => $this->agentRunnerConfig->isInputNormalizationEnabled(), + 'max_input_chars' => $this->agentRunnerConfig->getInputNormalizationMaxInputChars(), + 'max_output_chars' => $this->agentRunnerConfig->getInputNormalizationMaxOutputChars(), + 'max_added_tokens' => $this->agentRunnerConfig->getInputNormalizationMaxAddedTokens(), + 'max_length_ratio_percent' => $this->agentRunnerConfig->getInputNormalizationMaxLengthRatioPercent(), + 'heartbeat_message' => $this->agentRunnerConfig->getInputNormalizationHeartbeatMessage(), + 'output_prefix_pattern' => 
$this->agentRunnerConfig->getInputNormalizationOutputPrefixPattern(), + 'skip_patterns' => $this->agentRunnerConfig->getInputNormalizationSkipPatterns(), + 'prompt' => [ + 'intro' => $this->agentRunnerConfig->getInputNormalizationIntro(), + 'rules' => $this->agentRunnerConfig->getInputNormalizationRules(), + 'output_format_block' => $this->agentRunnerConfig->getInputNormalizationOutputFormatBlock(), + 'current_user_input_label' => $this->agentRunnerConfig->getInputNormalizationCurrentUserInputLabel(), + ], + 'fuzzy_routing' => [ + 'enabled' => $this->agentRunnerConfig->isInputNormalizationFuzzyRoutingEnabled(), + 'min_token_length' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMinTokenLength(), + 'medium_token_length' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMediumTokenLength(), + 'long_token_length' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingLongTokenLength(), + 'max_distance_short' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMaxDistanceShort(), + 'max_distance_medium' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMaxDistanceMedium(), + 'max_distance_long' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMaxDistanceLong(), + 'min_similarity_percent' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingMinSimilarityPercent(), + 'terms' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingTerms(), + ], + ], 'messages' => [ 'empty_prompt' => $this->agentRunnerConfig->getEmptyPromptMessage(), 'analyze_request' => $this->agentRunnerConfig->getAnalyzeRequestMessage(), @@ -1027,6 +1054,58 @@ final readonly class RetriexEffectiveConfigProvider $this->validateStringListMap($agent['shop_query_optimizer'] ?? [], 'agent.shop_query_optimizer', $errors, $warnings); $this->validateRegexPattern($agent['optimized_shop_query_prefix_pattern'] ?? 
null, 'agent.optimized_shop_query_prefix_pattern', $errors); + $normalization = is_array($agent['input_normalization'] ?? null) ? $agent['input_normalization'] : []; + $normalizationPrompt = is_array($normalization['prompt'] ?? null) ? $normalization['prompt'] : []; + if (($this->asInt($normalization['max_input_chars'] ?? null) ?? 0) < 1) { + $errors[] = 'agent.input_normalization.max_input_chars must be greater than 0.'; + } + if (($this->asInt($normalization['max_output_chars'] ?? null) ?? 0) < 1) { + $errors[] = 'agent.input_normalization.max_output_chars must be greater than 0.'; + } + if (($this->asInt($normalization['max_added_tokens'] ?? null) ?? -1) < 0) { + $errors[] = 'agent.input_normalization.max_added_tokens must be greater than or equal to 0.'; + } + if (($this->asInt($normalization['max_length_ratio_percent'] ?? null) ?? 0) < 100) { + $errors[] = 'agent.input_normalization.max_length_ratio_percent must be at least 100.'; + } + $this->validateRegexPattern($normalization['output_prefix_pattern'] ?? null, 'agent.input_normalization.output_prefix_pattern', $errors); + $this->validateRegexPatternList($normalization['skip_patterns'] ?? [], 'agent.input_normalization.skip_patterns', $errors); + $this->validateStringList($this->toList($normalizationPrompt['rules'] ?? []), 'agent.input_normalization.prompt.rules', $errors, $warnings); + + $fuzzyRouting = is_array($normalization['fuzzy_routing'] ?? null) ? $normalization['fuzzy_routing'] : []; + if (($this->asInt($fuzzyRouting['min_token_length'] ?? null) ?? 0) < 1) { + $errors[] = 'agent.input_normalization.fuzzy_routing.min_token_length must be greater than 0.'; + } + if (($this->asInt($fuzzyRouting['medium_token_length'] ?? null) ?? 0) < 1) { + $errors[] = 'agent.input_normalization.fuzzy_routing.medium_token_length must be greater than 0.'; + } + if (($this->asInt($fuzzyRouting['long_token_length'] ?? null) ?? 
0) < 1) { + $errors[] = 'agent.input_normalization.fuzzy_routing.long_token_length must be greater than 0.'; + } + if (($this->asInt($fuzzyRouting['max_distance_short'] ?? null) ?? -1) < 0) { + $errors[] = 'agent.input_normalization.fuzzy_routing.max_distance_short must be greater than or equal to 0.'; + } + if (($this->asInt($fuzzyRouting['max_distance_medium'] ?? null) ?? -1) < 0) { + $errors[] = 'agent.input_normalization.fuzzy_routing.max_distance_medium must be greater than or equal to 0.'; + } + if (($this->asInt($fuzzyRouting['max_distance_long'] ?? null) ?? -1) < 0) { + $errors[] = 'agent.input_normalization.fuzzy_routing.max_distance_long must be greater than or equal to 0.'; + } + $minSimilarityPercent = $this->asInt($fuzzyRouting['min_similarity_percent'] ?? null) ?? 0; + if ($minSimilarityPercent < 1 || $minSimilarityPercent > 100) { + $errors[] = 'agent.input_normalization.fuzzy_routing.min_similarity_percent must be between 1 and 100.'; + } + $this->validateStringList($this->toList($fuzzyRouting['terms'] ?? []), 'agent.input_normalization.fuzzy_routing.terms', $errors, $warnings); + if (trim((string) ($normalizationPrompt['intro'] ?? '')) === '') { + $errors[] = 'agent.input_normalization.prompt.intro must not be empty.'; + } + if (trim((string) ($normalizationPrompt['output_format_block'] ?? '')) === '') { + $errors[] = 'agent.input_normalization.prompt.output_format_block must not be empty.'; + } + if (trim((string) ($normalizationPrompt['current_user_input_label'] ?? '')) === '') { + $errors[] = 'agent.input_normalization.prompt.current_user_input_label must not be empty.'; + } + $anchorEnrichment = $agent['shop_query_optimizer']['context_anchor_enrichment'] ?? []; if (is_array($anchorEnrichment)) { $this->validateStringList($this->toList($anchorEnrichment['trigger_terms'] ?? []), 'agent.shop_query_optimizer.context_anchor_enrichment.trigger_terms', $errors, $warnings);