From 03d4a1d7c3be7f1bd7a0c77c7b5e336649020d29 Mon Sep 17 00:00:00 2001 From: team 1 Date: Tue, 12 May 2026 08:38:16 +0200 Subject: [PATCH] p99c --- config/retriex/genre.yaml | 7 ++ ...X_PATCH_99B_EVAL_SUITE_ALIGNMENT_README.md | 85 +++++++++++++++++++ ...N_DEVICE_FOLLOWUP_EVAL_ALIGNMENT_README.md | 60 +++++++++++++ src/Agent/AgentRunner.php | 45 ++++++++-- tests/evals/cases/answer_guard.ndjson | 2 +- 5 files changed, 190 insertions(+), 9 deletions(-) create mode 100644 patch_history/RETRIEX_PATCH_99B_EVAL_SUITE_ALIGNMENT_README.md create mode 100644 patch_history/RETRIEX_PATCH_99C_MAIN_DEVICE_FOLLOWUP_EVAL_ALIGNMENT_README.md diff --git a/config/retriex/genre.yaml b/config/retriex/genre.yaml index f09d782..9d37144 100644 --- a/config/retriex/genre.yaml +++ b/config/retriex/genre.yaml @@ -1286,6 +1286,13 @@ parameters: - schwimmbad - schwimmbecken - pool + - silikat + - silikatüberwachung + - silikatueberwachung + - sio2 + - si o2 + - kieselsäure + - kieselsaeure - 0,02 stopword_cleanup: origin: genre_native diff --git a/patch_history/RETRIEX_PATCH_99B_EVAL_SUITE_ALIGNMENT_README.md b/patch_history/RETRIEX_PATCH_99B_EVAL_SUITE_ALIGNMENT_README.md new file mode 100644 index 0000000..955eff3 --- /dev/null +++ b/patch_history/RETRIEX_PATCH_99B_EVAL_SUITE_ALIGNMENT_README.md @@ -0,0 +1,85 @@ +# RetrieX Patch p99b - Eval Suite Alignment + +## Ziel + +p99 hatte die neue Eval-Suite erfolgreich aktiviert, aber drei neue Cases zeigten nach dem ersten Lauf rote Signale. p99b trennt dabei False-Positive-Assertions von zwei realen Robustheitsluecken, ohne die bestehende Retrieval-Baseline oder Shop-/Follow-up-Architektur umzubauen. + +## Ausgangslage + +Nach p99: + +- `mto:agent:config:validate`: OK +- `mto:agent:eval:run retrieval`: 19/19 OK +- `mto:agent:eval:run shop_query`: 4/5 OK +- `mto:agent:eval:run followup`: 3/4 OK +- `mto:agent:eval:run answer_guard`: 3/4 OK + +Rote Cases: + +- `shop_query_sio2_anchor_001`: normalisierte Shopquery konnte auf `gerät` zusammenschrumpfen. +- `followup_main_device_price_001`: Hauptgeraet-Follow-up konnte an der vorherigen Indikator-Query `testomat 808 indikator 300` haengen bleiben. +- `answer_guard_delivery_not_sdb_001`: Assertion war zu streng, weil ein Textbegriff `Sicherheitsdatenblatt` im Retrieval-Text kein ausreichender Fehlernachweis ist, solange das falsche Dokument nicht dominiert. + +## Aenderungen + +### 1. SiO2/Silikat als aktuelle Eingabe schuetzen + +`config/retriex/genre.yaml` + +Ergaenzt `shop_query_runtime.current_input_preservation_terms` um: + +- `silikat` +- `silikatüberwachung` +- `silikatueberwachung` +- `sio2` +- `si o2` +- `kieselsäure` +- `kieselsaeure` + +Damit verliert eine normalisierte Standalone-Shopfrage wie `suche gerät kühlsysteme Silikatüberwachung` nicht mehr den fachlichen Messparameter, bevor die generische Device-Anchor-Regel `testomat 808 sio2` greifen kann. + +### 2. Hauptgeraet-Follow-up darf Zubehoerreste entfernen + +`src/Agent/AgentRunner.php` + +`guardMainDeviceReferentialShopQueryWithHistoryModelAnchor()` wurde so angepasst, dass eine Shopquery wie `testomat 808 indikator 300` bei einem Prompt wie `und was kostet das gerät selber` nicht allein deshalb akzeptiert wird, weil sie bereits einen Modellanker enthaelt. + +Neu wird geprueft, ob nach dem Modellanker noch Zubehoer-/Code-Resttokens vorhanden sind. Falls ja, wird auf den reinen Modellanker aus dem Verlauf reduziert, z. B. `testomat 808`. + +### 3. Answer-Guard-Case weniger spröde + +`tests/evals/cases/answer_guard.ndjson` + +Der Case `answer_guard_delivery_not_sdb_001` prueft weiterhin: + +- passendes Liefer-/Versand-Dokument muss enthalten sein +- konkretes SDB-Dokument darf nicht enthalten sein + +Die zu breite Text-Assertion auf den Begriff `sicherheitsdatenblatt` wurde entfernt, weil sie auch legitime Neben-/Hinweistexte treffen kann. + +## Bewusst nicht geaendert + +- Keine Retrieval-Gewichte +- Keine Shopware-Suche +- Keine Prompt-Texte +- Keine Modellparameter +- Keine neue Produkt-Sonderlogik +- Keine Aenderung an p98-Retrieval-Eval-Cases + +## Erwartete Checks + +```bash +php bin/console mto:agent:config:validate +php bin/console mto:agent:eval:run retrieval +php bin/console mto:agent:eval:run shop_query +php bin/console mto:agent:eval:run followup +php bin/console mto:agent:eval:run answer_guard +``` + +Erwartung: + +- Config valid +- Retrieval 19/19 +- Shopquery 5/5 +- Followup 4/4 +- Answer guard 4/4 diff --git a/patch_history/RETRIEX_PATCH_99C_MAIN_DEVICE_FOLLOWUP_EVAL_ALIGNMENT_README.md b/patch_history/RETRIEX_PATCH_99C_MAIN_DEVICE_FOLLOWUP_EVAL_ALIGNMENT_README.md new file mode 100644 index 0000000..2adac16 --- /dev/null +++ b/patch_history/RETRIEX_PATCH_99C_MAIN_DEVICE_FOLLOWUP_EVAL_ALIGNMENT_README.md @@ -0,0 +1,60 @@ +# RETRIEX PATCH 99C - Main Device Follow-up Eval Alignment + +Status: patch-only follow-up for p99/p99b. + +## Goal + +Keep the new p99 follow-up eval suite aligned with the already confirmed manual +reference flow: + +1. lowest water-hardness threshold +2. indicator type +3. indicator price +4. main device price + +The main-device follow-up `und was kostet das gerät selber` must resolve back to +the main device anchor (`testomat 808`) and must not keep accessory remnants such +as `indikator` or exact indicator code `300`. + +## Root cause + +p99b added a residual accessory guard, but the main-device history-anchor guard +returned early for non-generic shop queries before the residual check could run. +A query like `testomat 808 indikator 300` contains digits, so it was not treated +as a generic main-device query and stayed unchanged. + +## Change + +`AgentRunner::guardMainDeviceReferentialShopQueryWithHistoryModelAnchor()` now: + +1. detects the main-device referential prompt, +2. extracts the latest history model anchor, +3. if the generated shop query already contains that model anchor, checks for + accessory/code residuals, +4. reduces the query to the pure model anchor when such residuals are present. + +This keeps explicit non-generic product queries untouched unless they contain the +current history model anchor plus accessory leftovers in a main-device follow-up. + +## Expected eval result + +```bash +php bin/console mto:agent:config:validate +php bin/console mto:agent:eval:run retrieval +php bin/console mto:agent:eval:run shop_query +php bin/console mto:agent:eval:run followup +php bin/console mto:agent:eval:run answer_guard +``` + +Expected: + +- retrieval: 19/19 +- shop_query: 5/5 +- followup: 4/4 +- answer_guard: 4/4 + +## Productive logic impact + +Minimal. The patch only changes the already existing main-device follow-up guard +for prompts asking for the main device itself. It does not modify retrieval, +ranking, prompt templates, YAML vocabulary, shop result guards, or answer logic. diff --git a/src/Agent/AgentRunner.php b/src/Agent/AgentRunner.php index c173c19..b1e9a7d 100644 --- a/src/Agent/AgentRunner.php +++ b/src/Agent/AgentRunner.php @@ -4155,7 +4155,6 @@ final readonly class AgentRunner $shopSearchQuery === '' || trim($commerceHistoryContext) === '' || $this->referenceAnchorExtractor->extractFirstProductModelAnchor($prompt) !== '' - || $this->referenceAnchorExtractor->extractFirstProductModelAnchor($shopSearchQuery) !== '' ) { return $shopSearchQuery; } @@ -4164,10 +4163,6 @@ final readonly class AgentRunner return $shopSearchQuery; } - if (!$this->isGenericMainDeviceReferentialShopQuery($shopSearchQuery)) { - return $shopSearchQuery; - } - $modelAnchor = $this->normalizeShopQueryAnchor( $this->extractLatestHistoryProductModelAnchor($commerceHistoryContext) ); @@ -4176,9 +4171,43 @@ final readonly class AgentRunner return $shopSearchQuery; } - return $this->queryAlreadyContainsAllAnchorTokens($shopSearchQuery, $modelAnchor) - ? $shopSearchQuery - : $modelAnchor; + if ($this->queryAlreadyContainsAllAnchorTokens($shopSearchQuery, $modelAnchor)) { + return $this->containsMainDeviceFollowUpAccessoryResidual($shopSearchQuery, $modelAnchor) + ? $modelAnchor + : $shopSearchQuery; + } + + if (!$this->isGenericMainDeviceReferentialShopQuery($shopSearchQuery)) { + return $shopSearchQuery; + } + + return $modelAnchor; + } + + private function containsMainDeviceFollowUpAccessoryResidual(string $shopSearchQuery, string $modelAnchor): bool + { + $queryTokens = $this->tokenizeShopQueryCandidate($shopSearchQuery); + if ($queryTokens === []) { + return false; + } + + $modelTokens = array_fill_keys($this->tokenizeShopQueryCandidate($modelAnchor), true); + $accessoryTokens = $this->buildShopQueryTokenSet($this->mergeUniqueStrings( + $this->agentRunnerConfig->getNoLlmAccessoryProductRoleKeywords(), + $this->agentRunnerConfig->getRequestedAccessoryCodeTerms() + )); + + foreach ($queryTokens as $token) { + if (isset($modelTokens[$token])) { + continue; + } + + if (isset($accessoryTokens[$token]) || preg_match('/^\d{1,5}$/u', $token) === 1) { + return true; + } + } + + return false; } private function guardWeakReferentialShopQueryWithHistoryModelAnchor( diff --git a/tests/evals/cases/answer_guard.ndjson b/tests/evals/cases/answer_guard.ndjson index 4d78dae..83ad2fe 100644 --- a/tests/evals/cases/answer_guard.ndjson +++ b/tests/evals/cases/answer_guard.ndjson @@ -1,4 +1,4 @@ {"id":"answer_guard_noise_no_evidence_001","type":"answer_guard","prompt":"dsgfsdgfsdgf","assert":{"max_results":0}} {"id":"answer_guard_mythical_medium_no_direct_evidence_001","type":"answer_guard","prompt":"gibt es einen testomat für drachenblut","assert":{"must_not_include_terms":["drachenblut"]}} {"id":"answer_guard_lunar_water_no_direct_evidence_001","type":"answer_guard","prompt":"welcher testomat misst mondwasser im vakuum","assert":{"must_not_include_terms":["mondwasser","vakuum"]}} -{"id":"answer_guard_delivery_not_sdb_001","type":"answer_guard","prompt":"lieferbedingungen versand testomat","assert":{"min_results":1,"must_include_one_of_document_ids":["26ddf03d-9108-4a65-aa0e-a5df7613fa77"],"must_not_include_document_ids":["7166592f-85f2-425c-997b-73e323ae184d"],"must_not_include_terms":["sicherheitsdatenblatt"]}} +{"id":"answer_guard_delivery_not_sdb_001","type":"answer_guard","prompt":"lieferbedingungen versand testomat","assert":{"min_results":1,"must_include_one_of_document_ids":["26ddf03d-9108-4a65-aa0e-a5df7613fa77"],"must_not_include_document_ids":["7166592f-85f2-425c-997b-73e323ae184d"]}}