From c00cb3a9b979f5ae93fad1f4aec9d504220c3820 Mon Sep 17 00:00:00 2001 From: team 1 Date: Mon, 4 May 2026 08:38:53 +0200 Subject: [PATCH] p28 --- DEVELOPER_POLICIES.md | 15 ++ RETRIEX_LANGUAGE_CLEANUP_GUIDE.md | 77 ++++++++++ ..._FALLBACK_CLEANUP_PROFILE_HOTFIX_README.md | 13 ++ ...K_CLEANUP_PROFILE_VALIDATION_FIX_README.md | 13 ++ ...CONTEXT_FALLBACK_CLEANUP_PROFILE_README.md | 14 ++ ...H_26_LANGUAGE_CLEANUP_YAML_DEDUP_README.md | 25 ++++ ...NGUAGE_CLEANUP_REGRESSION_GUARDS_README.md | 32 +++++ ...8_LANGUAGE_CLEANUP_DOCUMENTATION_README.md | 22 +++ config/retriex/agent.yaml | 48 +------ config/retriex/commerce.yaml | 40 +----- config/retriex/governance.yaml | 35 +++++ config/retriex/language.yaml | 5 +- src/Agent/AgentRunner.php | 44 +++++- src/Config/AgentRunnerConfig.php | 11 ++ src/Config/GovernanceConfig.php | 42 ++++++ src/Config/RetriexEffectiveConfigProvider.php | 134 +++++++++++++++++- 16 files changed, 482 insertions(+), 88 deletions(-) create mode 100644 RETRIEX_LANGUAGE_CLEANUP_GUIDE.md create mode 100644 RETRIEX_PATCH_25B_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_HOTFIX_README.md create mode 100644 RETRIEX_PATCH_25C_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_VALIDATION_FIX_README.md create mode 100644 RETRIEX_PATCH_25_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_README.md create mode 100644 RETRIEX_PATCH_26_LANGUAGE_CLEANUP_YAML_DEDUP_README.md create mode 100644 RETRIEX_PATCH_27_LANGUAGE_CLEANUP_REGRESSION_GUARDS_README.md create mode 100644 RETRIEX_PATCH_28_LANGUAGE_CLEANUP_DOCUMENTATION_README.md diff --git a/DEVELOPER_POLICIES.md b/DEVELOPER_POLICIES.md index a0dbc12..f8ef696 100644 --- a/DEVELOPER_POLICIES.md +++ b/DEVELOPER_POLICIES.md @@ -123,3 +123,18 @@ Use this checklist for every relevant PR: - [ ] `mto:agent:regression:test` is OK. - [ ] The protected functional flows were manually checked if the touched area can affect them. - [ ] README or patch README documents the reason for any intentionally accepted technical fallback. +## 9. 
Language cleanup ownership + +Generic language cleanup must use `config/retriex/language.yaml` and its cleanup profiles. + +Rules: + +- add generic German stopwords to `stopword_groups`, not to domain YAML files +- add user wording such as `ich suche`, `zeige mir` or `habt ihr` to `phrase_groups` +- add table/list/overview wording to `meta_term_groups` +- keep commerce intent, product-role, measurement and routing terms in their owning domain YAML +- never remove protected terms such as `nicht`, `kein`, `testomat`, `indikator`, `ph`, `rx`, `th`, `tc` or `0,02` through generic cleanup +- prefer `cleanup_profile: ...` references over copied token lists + +See `RETRIEX_LANGUAGE_CLEANUP_GUIDE.md` for the detailed ownership rules. + diff --git a/RETRIEX_LANGUAGE_CLEANUP_GUIDE.md b/RETRIEX_LANGUAGE_CLEANUP_GUIDE.md new file mode 100644 index 0000000..1776abf --- /dev/null +++ b/RETRIEX_LANGUAGE_CLEANUP_GUIDE.md @@ -0,0 +1,77 @@ +# RetrieX Language Cleanup Guide + +Status: binding for RetrieX 1.5.3+ cleanup-profile work. + +This guide defines where language, interaction, commerce and domain tokens belong. Its goal is to keep YAML maintenance simple and avoid duplicated keyword lists. + +## 1. Central language cleanup lives in `language.yaml` + +Use `config/retriex/language.yaml` for generic language noise only. + +Allowed here: + +- German function words: `der`, `die`, `das`, `ein`, `eine`, `mit`, `und`, `oder`, `ist`, `sind`, `kann` +- conversation filler words: `bitte`, `mal`, `gerne`, `noch`, `dazu`, `also` +- user instruction phrases: `ich suche`, `suche nach`, `zeige mir`, `gib mir`, `habt ihr`, `gibt es` +- presentation/meta terms: `tabelle`, `liste`, `übersicht`, `tabellarisch`, `auflistung` +- protected terms that must not be removed generically + +Do not add product families, measurement parameters, intent terms or shop semantics here. + +## 2. 
Use cleanup profiles instead of copying lists + +Domain configs should reference a cleanup profile whenever they need generic language cleanup. + +Current profiles: + +- `commerce_query`: cleanup for shop/search query text +- `rag_evidence`: cleanup for evidence/answer-consistency checks +- `shop_context_fallback`: cleanup for history-based shop context fallback + +Preferred pattern: + +```yaml +cleanup_profile: commerce_query +``` + +Avoid adding the same generic words again to `commerce.yaml`, `agent.yaml`, `retrieval.yaml` or `intent.yaml`. + +## 3. Keep domain semantics in domain configs + +These belong outside `language.yaml`: + +- commerce intent terms: `shop`, `produkt`, `artikel`, `preis`, `kosten`, `kaufen`, `bestellen` +- measurement/domain terms: `wasserhärte`, `chlor`, `redox`, `leitfähigkeit`, `ph`, `rx`, `th`, `tc` +- product-role terms: device, accessory, reagent, spare part and document-role vocabulary +- routing and answer behavior rules +- prompt-specific role or grounding rules + +## 4. Protected terms are mandatory guardrails + +Never remove these generically unless a later patch explicitly changes the guardrail: + +- negations: `nicht`, `kein`, `keine` +- core product/domain anchors: `testomat`, `indikator`, `indikatortyp` +- short model/parameter tokens: `ph`, `rx`, `th`, `tc` +- important numeric anchors: `0,02` + +When in doubt, add terms to `protected_terms` rather than removing them through a broad stopword group. + +## 5. Change process + +Before adding a new token list: + +1. Ask whether it is generic language noise. +2. If yes, add it to `language.yaml` under the correct group/profile. +3. If no, keep it in the owning domain YAML. +4. Do not introduce PHP-only token lists. +5. Run the required checks. 
+ +Required checks: + +```bash +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +bin/console mto:agent:config:audit-patterns --details +``` diff --git a/RETRIEX_PATCH_25B_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_HOTFIX_README.md b/RETRIEX_PATCH_25B_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_HOTFIX_README.md new file mode 100644 index 0000000..5945100 --- /dev/null +++ b/RETRIEX_PATCH_25B_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_HOTFIX_README.md @@ -0,0 +1,13 @@ +# RetrieX Patch p25b - Shop Context Fallback Cleanup Profile Hotfix + +## Goal +Fix p25 validation failure when `agent.shop_prompt.meta_query_guard.cleanup_profile` is missing in partially updated environments. + +## Scope +- Keeps `cleanup_profile: shop_context_fallback` in `config/retriex/agent.yaml`. +- Adds a migration-safe fallback in `AgentRunnerConfig::getShopQueryContextFallbackCleanupProfile()`. +- Keeps the actual value YAML-owned; the fallback only prevents failed intermediate states. + +## Safety +No legacy lists are removed. +No Commerce, Retrieval scoring, Prompt wording, or shop matching logic is changed. diff --git a/RETRIEX_PATCH_25C_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_VALIDATION_FIX_README.md b/RETRIEX_PATCH_25C_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_VALIDATION_FIX_README.md new file mode 100644 index 0000000..e063180 --- /dev/null +++ b/RETRIEX_PATCH_25C_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_VALIDATION_FIX_README.md @@ -0,0 +1,13 @@ +# RetrieX Patch p25c - Shop Context Fallback Cleanup Profile Validation Fix + +## Goal +Fix the p25/p25b validation failure for `agent.shop_prompt.meta_query_guard.cleanup_profile`. + +## Scope +- Keeps the YAML-owned value `shop_context_fallback` in `config/retriex/agent.yaml`. +- Keeps the runtime wiring from p25. 
+- Makes validation migration-safe: if the YAML value is missing in an already partially updated environment, validation uses the configured fallback from `AgentRunnerConfig` instead of failing with an empty-string error. + +## Safety +No legacy lists are removed. +No Commerce, Retrieval scoring, Prompt wording, or shop matching logic is changed. diff --git a/RETRIEX_PATCH_25_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_README.md b/RETRIEX_PATCH_25_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_README.md new file mode 100644 index 0000000..292733d --- /dev/null +++ b/RETRIEX_PATCH_25_SHOP_CONTEXT_FALLBACK_CLEANUP_PROFILE_README.md @@ -0,0 +1,14 @@ +# RetrieX Patch p25 - Shop Context Fallback Cleanup Profile + +## Goal +Wire the Shop context fallback/meta guard to the central `shop_context_fallback` cleanup profile from `language.yaml`. + +## Scope +- Adds `shop_prompt.meta_query_guard.cleanup_profile: shop_context_fallback` to `config/retriex/agent.yaml`. +- Merges central profile stopwords, user-instruction phrases and presentation meta terms with the existing legacy meta/fallback lists. +- Keeps all legacy lists in place. +- Adds config validation for the referenced cleanup profile. + +## Safety +No legacy lists are removed in this patch. +No Commerce, Retrieval scoring, Prompt wording, or shop matching logic is changed. diff --git a/RETRIEX_PATCH_26_LANGUAGE_CLEANUP_YAML_DEDUP_README.md b/RETRIEX_PATCH_26_LANGUAGE_CLEANUP_YAML_DEDUP_README.md new file mode 100644 index 0000000..1f5bb07 --- /dev/null +++ b/RETRIEX_PATCH_26_LANGUAGE_CLEANUP_YAML_DEDUP_README.md @@ -0,0 +1,25 @@ +# RetrieX Patch 26 - Language Cleanup YAML Dedup + +Purpose: simplify the maintenance layer after p21-p25. + +Changes: + +- Removes duplicate generic language noise from legacy Commerce/Agent lists when the terms are already supplied by `language.yaml` cleanup profiles. +- Keeps all domain-specific, commerce-specific and regression-sensitive tokens in their original domain configs. 
+- Adds comments that mark remaining lists as legacy/domain override lists. +- Does not change profile names, service wiring, routing or retrieval logic. + +Guardrails: + +- Do not move protected terms into generic removal logic. +- Keep intent/domain terms in their domain YAML files. +- Only remove a legacy term when it is covered by an active cleanup profile or clearly duplicated in the same list. + +Required checks: + +```bash +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +bin/console mto:agent:config:audit-patterns --details +``` diff --git a/RETRIEX_PATCH_27_LANGUAGE_CLEANUP_REGRESSION_GUARDS_README.md b/RETRIEX_PATCH_27_LANGUAGE_CLEANUP_REGRESSION_GUARDS_README.md new file mode 100644 index 0000000..7b689db --- /dev/null +++ b/RETRIEX_PATCH_27_LANGUAGE_CLEANUP_REGRESSION_GUARDS_README.md @@ -0,0 +1,32 @@ +# RetrieX Patch 27 - Language Cleanup Regression Guards + +## Ziel + +Absicherung der neuen Language-Cleanup-Profile aus p21-p26, ohne weitere produktive Runtime-Logik umzubauen. + +## Inhalt + +- erweitert `governance.yaml` um prüfbare Language-Cleanup-Guardrails +- prüft required Cleanup-Profile: + - `commerce_query` + - `rag_evidence` + - `shop_context_fallback` +- prüft repräsentative Pflichtbegriffe je Profil +- schützt zentrale Begriffe vor generischem Entfernen, u. a. `nicht`, `kein`, `keine`, `testomat`, `indikator`, `indikatortyp`, `ph`, `rx`, `th`, `tc`, `0,02` +- erweitert Config-Dump/Validation um Cleanup-Profile +- erweitert `mto:agent:regression:test` um Language-Cleanup-Checks + +## Einspielen + +```bash +unzip -o p27.zip -d /path/to/retriex +cd /path/to/retriex +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +bin/console mto:agent:config:audit-patterns --details +``` + +## Hinweis + +Dieser Patch löscht keine Listen und ändert keine fachliche Runtime-Entscheidung. 
Er macht die neue Cleanup-Schicht regressionssicherer. diff --git a/RETRIEX_PATCH_28_LANGUAGE_CLEANUP_DOCUMENTATION_README.md b/RETRIEX_PATCH_28_LANGUAGE_CLEANUP_DOCUMENTATION_README.md new file mode 100644 index 0000000..f5f1acf --- /dev/null +++ b/RETRIEX_PATCH_28_LANGUAGE_CLEANUP_DOCUMENTATION_README.md @@ -0,0 +1,22 @@ +# RetrieX Patch 28 - Language Cleanup Documentation + +Purpose: document the new Language Cleanup Profile ownership rules for RetrieX 1.5.3+. + +Changes: + +- adds `RETRIEX_LANGUAGE_CLEANUP_GUIDE.md` +- extends `DEVELOPER_POLICIES.md` with language cleanup ownership rules +- no runtime code changes +- no YAML value changes +- no list deletion + +Install: + +```bash +unzip -o p28.zip -d /path/to/retriex +cd /path/to/retriex +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +bin/console mto:agent:config:audit-patterns --details +``` diff --git a/config/retriex/agent.yaml b/config/retriex/agent.yaml index 56c4ac3..e022523 100644 --- a/config/retriex/agent.yaml +++ b/config/retriex/agent.yaml @@ -208,6 +208,8 @@ parameters: rag_evidence_guard: cleanup_profile: rag_evidence + # Legacy/domain override list. Generic German stopwords are provided by + # language cleanup profile `rag_evidence`. Keep RAG/product-role terms here. 
stop_terms: - suche - suchen @@ -215,24 +217,8 @@ parameters: - finden - zeige - einen - - eine - einem - - einer - - der - - die - - das - - den - - dem - - des - - für - - fuer - - mit - ohne - - und - - oder - - kann - - können - - koennen - messen - messung - tester @@ -466,19 +452,16 @@ parameters: - '/\b(?:indikator(?:typ)?|indicator(?:\s+type)?|reagenz(?:satz|typ)?|reagent(?:\s+set|\s+type)?|typ|type)\s+[A-Za-zÄÖÜäöüß]{0,8}\s*\d{1,5}(?:\s*[A-ZÄÖÜ]{1,4})?(?:\s*%)?\b/iu' meta_query_guard: enabled: true + cleanup_profile: shop_context_fallback context_fallback_enabled: true context_fallback_question_limit: 12 context_fallback_history_budget_chars: 20000 context_fallback_use_full_history: true context_fallback_max_terms: 6 + # Legacy/domain override list. Generic stopwords, user-instruction + # phrases and presentation terms are provided by profile + # `shop_context_fallback`. Keep shop/price/domain terms here. context_fallback_filter_terms: - - mit - - tabelle - - tabellarisch - - übersicht - - uebersicht - - liste - - auflistung - preis - preise - preisen @@ -492,8 +475,6 @@ parameters: - welches - welchem - welchen - - ist - - sind - gut - geeignet - was @@ -515,24 +496,15 @@ parameters: - nehmen - zur - zum - - für - - fuer - messen - gemessen meta_only_terms: - shop - - tabelle - - tabellarisch - - übersicht - - uebersicht - - liste - - auflistung - preis - preise - preisen - kosten - kostet - - mit - shopsuche - shop-suche - suche @@ -542,22 +514,14 @@ parameters: - find - zeige - zeig - - bitte - - mal - im - in - nach - danach - - dazu - damit - dafür - dafuer - hierzu - - den - - die - - das - - der - - dem language_preservation: enabled: true language_markers: diff --git a/config/retriex/commerce.yaml b/config/retriex/commerce.yaml index f430498..662003c 100644 --- a/config/retriex/commerce.yaml +++ b/config/retriex/commerce.yaml @@ -21,14 +21,11 @@ parameters: - horiba - neomeris + # Legacy/domain override list. 
General user-instruction phrases are provided by + # language cleanup profile `commerce_query`. Keep only commerce-specific or + # historically sensitive phrases here. phrases_to_remove: - - ich suche - suche - - habt ihr - - gibt es - - gebe mir - - gib mir - - zeige mir - welches gerät - welche gerät - welches modell @@ -42,8 +39,6 @@ parameters: - welcher - welches - welchen - - sind - - ist - geeignet - geeigent - verfügbarkeit @@ -53,27 +48,17 @@ parameters: - kurze - ich + # Legacy/domain override list. Generic German stopwords and conversation + # filler terms are provided by language cleanup profile `commerce_query`. filter_search_tokens: - - auch - - noch - - nochmal - zusätzlich - - dazu - - davon - stattdessen - - bitte - preiswerte - - gern - lösung - - eine - größer - würde - - gerne - welchem - - kann - - mit - mein - - größer - zeige - zeig - such @@ -81,33 +66,20 @@ parameters: - finde - find - mir - - mal - von - im - in - - für - - fuer - welche - welcher - welches - welchen - - sind - zur - - ist - geeignet - geeigent - verfügbarkeit - verfuegbarkeit - prüfe - pruefe - - den - - die - - das - - der - - dem - - des - - und - - oder - sowie - seine - seinen @@ -129,8 +101,6 @@ parameters: - kostet - kosten - ua - - ein - - also - gut - gute - guten diff --git a/config/retriex/governance.yaml b/config/retriex/governance.yaml index ca92e22..c36c102 100644 --- a/config/retriex/governance.yaml +++ b/config/retriex/governance.yaml @@ -68,10 +68,45 @@ parameters: protected_stopword_terms: - nicht - kein + - keine - welche - testomat - indikator + - indikatortyp + - ph + - rx + - th + - tc - '0,02' + required_cleanup_profiles: + - commerce_query + - rag_evidence + - shop_context_fallback + required_profile_terms: + commerce_query: + stopwords: + - der + - mit + - bitte + phrases: + - ich suche + - suche im shop + rag_evidence: + stopwords: + - der + - mit + - bitte + shop_context_fallback: + stopwords: + - der + - mit + - bitte + phrases: + - zeige mir + 
- suche im shop + meta_terms: + - tabelle + - übersicht core_pattern_audit: source_roots: - src diff --git a/config/retriex/language.yaml b/config/retriex/language.yaml index 75a937b..80918a2 100644 --- a/config/retriex/language.yaml +++ b/config/retriex/language.yaml @@ -53,12 +53,13 @@ parameters: # Central language cleanup structure for RetrieX 1.5.3+. # Legacy key `words` above remains the runtime-compatible default list. - # New cleanup profiles are introduced additively and are not yet wired into - # Commerce/Agent runtime logic in this patch. + # Cleanup profiles are the preferred home for generic language noise. + # Domain configs should only keep domain-specific overrides. protected_terms: - nicht - kein - keine + - welche - testomat - indikator - indikatortyp diff --git a/src/Agent/AgentRunner.php b/src/Agent/AgentRunner.php index b99b22a..2ade359 100644 --- a/src/Agent/AgentRunner.php +++ b/src/Agent/AgentRunner.php @@ -1910,10 +1910,7 @@ final readonly class AgentRunner $filterTerms = []; - foreach (array_merge( - $this->agentRunnerConfig->getShopQueryMetaOnlyTerms(), - $this->agentRunnerConfig->getShopQueryContextFallbackFilterTerms() - ) as $term) { + foreach ($this->getShopQueryContextFallbackFilterTerms() as $term) { foreach ($this->tokenizeShopQueryCandidate($term) as $token) { $filterTerms[$token] = true; } @@ -1941,6 +1938,43 @@ final readonly class AgentRunner return implode(' ', $out); } + /** @return string[] */ + private function getShopQueryContextFallbackFilterTerms(): array + { + $profileName = $this->agentRunnerConfig->getShopQueryContextFallbackCleanupProfile(); + + return $this->mergeUniqueStrings( + $this->mergeUniqueStrings( + $this->languageCleanupConfig->getStopWordsForProfile($profileName), + $this->languageCleanupConfig->getPhrasesForProfile($profileName) + ), + $this->mergeUniqueStrings( + $this->languageCleanupConfig->getMetaTermsForProfile($profileName), + $this->mergeUniqueStrings( + 
$this->agentRunnerConfig->getShopQueryMetaOnlyTerms(), + $this->agentRunnerConfig->getShopQueryContextFallbackFilterTerms() + ) + ) + ); + } + + /** @return string[] */ + private function getShopQueryMetaGuardTerms(): array + { + $profileName = $this->agentRunnerConfig->getShopQueryContextFallbackCleanupProfile(); + + return $this->mergeUniqueStrings( + $this->mergeUniqueStrings( + $this->languageCleanupConfig->getStopWordsForProfile($profileName), + $this->languageCleanupConfig->getPhrasesForProfile($profileName) + ), + $this->mergeUniqueStrings( + $this->languageCleanupConfig->getMetaTermsForProfile($profileName), + $this->agentRunnerConfig->getShopQueryMetaOnlyTerms() + ) + ); + } + /** * @return string[] */ @@ -1972,7 +2006,7 @@ final readonly class AgentRunner } $metaTerms = []; - foreach ($this->agentRunnerConfig->getShopQueryMetaOnlyTerms() as $term) { + foreach ($this->getShopQueryMetaGuardTerms() as $term) { foreach ($this->tokenizeMetaGuardText($term) as $token) { $metaTerms[$token] = true; } diff --git a/src/Config/AgentRunnerConfig.php b/src/Config/AgentRunnerConfig.php index 084e722..2a1f6b1 100644 --- a/src/Config/AgentRunnerConfig.php +++ b/src/Config/AgentRunnerConfig.php @@ -808,6 +808,17 @@ final class AgentRunnerConfig return $this->getRequiredBool('shop_prompt.meta_query_guard.enabled'); } + public function getShopQueryContextFallbackCleanupProfile(): string + { + $value = $this->optionalValue('shop_prompt.meta_query_guard.cleanup_profile'); + + if (is_string($value) && trim($value) !== '') { + return trim($value); + } + + return 'shop_context_fallback'; + } + /** * @return string[] */ diff --git a/src/Config/GovernanceConfig.php b/src/Config/GovernanceConfig.php index 7b7c6e8..5a6d3fe 100644 --- a/src/Config/GovernanceConfig.php +++ b/src/Config/GovernanceConfig.php @@ -132,6 +132,48 @@ final class GovernanceConfig return $this->requiredStringList('language.protected_stopword_terms'); } + /** @return string[] */ + public function 
getLanguageRequiredCleanupProfiles(): array + { + return $this->requiredStringList('language.required_cleanup_profiles'); + } + + /** @return array */ + public function getLanguageRequiredProfileTerms(): array + { + $value = $this->requiredValue('language.required_profile_terms'); + if (!is_array($value)) { + throw $this->invalid('language.required_profile_terms', 'must be a map of cleanup profile term lists'); + } + + $out = []; + foreach ($value as $profileName => $profileTerms) { + if (!is_string($profileName) || trim($profileName) === '' || !is_array($profileTerms)) { + throw $this->invalid('language.required_profile_terms', 'must be keyed by non-empty cleanup profile names'); + } + + $normalizedProfileName = trim($profileName); + $out[$normalizedProfileName] = [ + 'stopwords' => $this->normalizeStringList($profileTerms['stopwords'] ?? []), + 'phrases' => $this->normalizeStringList($profileTerms['phrases'] ?? []), + 'meta_terms' => $this->normalizeStringList($profileTerms['meta_terms'] ?? []), + ]; + + if ($out[$normalizedProfileName]['stopwords'] === [] + && $out[$normalizedProfileName]['phrases'] === [] + && $out[$normalizedProfileName]['meta_terms'] === [] + ) { + throw $this->invalid('language.required_profile_terms.' . $normalizedProfileName, 'must contain at least one required term'); + } + } + + if ($out === []) { + throw $this->invalid('language.required_profile_terms', 'must contain at least one cleanup profile'); + } + + return $out; + } + /** @return string[] */ public function getCorePatternAuditSourceRoots(): array { diff --git a/src/Config/RetriexEffectiveConfigProvider.php b/src/Config/RetriexEffectiveConfigProvider.php index 2641323..bdea903 100644 --- a/src/Config/RetriexEffectiveConfigProvider.php +++ b/src/Config/RetriexEffectiveConfigProvider.php @@ -117,6 +117,77 @@ final readonly class RetriexEffectiveConfigProvider $warnings[] = 'Config validation warning: ' . 
$warning; } + try { + $cleanupProfileNames = $this->languageCleanupConfig->getCleanupProfileNames(); + foreach ($this->governanceConfig->getLanguageRequiredCleanupProfiles() as $profileName) { + $key = 'language_cleanup_profile_' . $this->guardrailCheckKey($profileName); + $checks[$key] = in_array($profileName, $cleanupProfileNames, true); + if (!$checks[$key]) { + $errors[] = 'Missing required language cleanup profile: ' . $profileName . '.'; + continue; + } + + $this->languageCleanupConfig->getCleanupProfile($profileName); + } + + $legacyStopwords = $this->stopWordsConfig->getStopWords(); + foreach ($this->governanceConfig->getLanguageProtectedStopwordTerms() as $protectedTerm) { + $key = 'language_protected_term_' . $this->guardrailCheckKey($protectedTerm); + $checks[$key . '_registered'] = $this->languageCleanupConfig->isProtectedTerm($protectedTerm); + if (!$checks[$key . '_registered']) { + $errors[] = 'Missing protected language cleanup term: ' . $protectedTerm . '.'; + } + + $checks[$key . '_not_legacy_stopword'] = !in_array($protectedTerm, $legacyStopwords, true); + if (!$checks[$key . '_not_legacy_stopword']) { + $errors[] = 'Protected language cleanup term is still a legacy stopword: ' . $protectedTerm . '.'; + } + + foreach ($cleanupProfileNames as $profileName) { + $profile = $this->languageCleanupConfig->getCleanupProfile($profileName); + foreach (['stopwords', 'phrases', 'meta_terms'] as $bucket) { + $bucketKey = $key . '_not_in_' . $this->guardrailCheckKey($profileName . '_' . $bucket); + $checks[$bucketKey] = !in_array($protectedTerm, $profile[$bucket] ?? 
[], true); + if (!$checks[$bucketKey]) { + $errors[] = sprintf('Protected language cleanup term %s is present in %s.%s.', $protectedTerm, $profileName, $bucket); + } + } + } + } + + foreach ($this->governanceConfig->getLanguageRequiredProfileTerms() as $profileName => $requiredTerms) { + $profile = $this->languageCleanupConfig->getCleanupProfile($profileName); + foreach ($requiredTerms as $bucket => $terms) { + foreach ($terms as $term) { + $key = 'language_cleanup_profile_' . $this->guardrailCheckKey($profileName . '_' . $bucket . '_' . $term); + $checks[$key] = in_array($term, $profile[$bucket] ?? [], true); + if (!$checks[$key]) { + $errors[] = sprintf('Missing language cleanup profile term: %s.%s must contain %s.', $profileName, $bucket, $term); + } + } + } + } + + $checks['commerce_query_cleanup_profile_wired'] = $this->commerceQueryParserConfig->getCleanupProfile() === 'commerce_query'; + if (!$checks['commerce_query_cleanup_profile_wired']) { + $errors[] = 'Commerce query parser is not wired to cleanup profile commerce_query.'; + } + + $checks['rag_evidence_cleanup_profile_wired'] = $this->agentRunnerConfig->getRagEvidenceCleanupProfile() === 'rag_evidence'; + if (!$checks['rag_evidence_cleanup_profile_wired']) { + $errors[] = 'RAG evidence guard is not wired to cleanup profile rag_evidence.'; + } + + $checks['shop_context_fallback_cleanup_profile_wired'] = $this->agentRunnerConfig->getShopQueryContextFallbackCleanupProfile() === 'shop_context_fallback'; + if (!$checks['shop_context_fallback_cleanup_profile_wired']) { + $errors[] = 'Shop context fallback is not wired to cleanup profile shop_context_fallback.'; + } + + } catch (\InvalidArgumentException $e) { + $checks['language_cleanup_profile_config_valid'] = false; + $errors[] = 'Language cleanup profile guardrails failed: ' . 
$e->getMessage(); + } + $importantShortModelTokens = $this->retrieverConfig->importantShortModelTokens(); foreach ($this->governanceConfig->getRegressionProtectedShortModelTokens() as $token) { $key = 'important_short_model_token_' . $this->guardrailCheckKey($token); @@ -543,6 +614,7 @@ final readonly class RetriexEffectiveConfigProvider ], 'meta_query_guard' => [ 'enabled' => $this->agentRunnerConfig->isShopQueryMetaGuardEnabled(), + 'cleanup_profile' => $this->agentRunnerConfig->getShopQueryContextFallbackCleanupProfile(), 'context_fallback_use_full_history' => $this->agentRunnerConfig->shouldUseFullHistoryForShopQueryContextFallback(), 'meta_only_terms' => $this->agentRunnerConfig->getShopQueryMetaOnlyTerms(), 'context_fallback_enabled' => $this->agentRunnerConfig->isShopQueryContextFallbackEnabled(), @@ -799,7 +871,17 @@ final readonly class RetriexEffectiveConfigProvider private function languageConfig(): array { - return ['stopwords' => $this->stopWordsConfig->getStopWords()]; + $profiles = []; + foreach ($this->languageCleanupConfig->getCleanupProfileNames() as $profileName) { + $profiles[$profileName] = $this->languageCleanupConfig->getCleanupProfile($profileName); + } + + return [ + 'stopwords' => $this->stopWordsConfig->getStopWords(), + 'protected_terms' => $this->languageCleanupConfig->getProtectedTerms(), + 'cleanup_profile_names' => $this->languageCleanupConfig->getCleanupProfileNames(), + 'cleanup_profiles' => $profiles, + ]; } /** @return array */ @@ -861,6 +943,8 @@ final readonly class RetriexEffectiveConfigProvider $this->governanceConfig->getRegressionShopQueryContextFallbackFilterTerms(); $this->governanceConfig->getVocabularyProtectedShortModelTokens(); $this->governanceConfig->getLanguageProtectedStopwordTerms(); + $this->governanceConfig->getLanguageRequiredCleanupProfiles(); + $this->governanceConfig->getLanguageRequiredProfileTerms(); $this->governanceConfig->getCorePatternAuditSourceRoots(); 
$this->governanceConfig->getCorePatternAuditExcludedPathPrefixes(); $this->governanceConfig->getCorePatternAuditExcludedPathPatterns(); @@ -1090,6 +1174,18 @@ final readonly class RetriexEffectiveConfigProvider $this->validateStringList($this->toList($ragEvidence['aggregate_evidence_terms'] ?? []), 'agent.rag_evidence_guard.aggregate_evidence_terms', $errors, $warnings); $this->validateRegexPatternList($ragEvidence['aggregate_answer_evidence_patterns'] ?? [], 'agent.rag_evidence_guard.aggregate_answer_evidence_patterns', $errors); + $shopPrompt = is_array($agent['shop_prompt'] ?? null) ? $agent['shop_prompt'] : []; + $metaQueryGuard = is_array($shopPrompt['meta_query_guard'] ?? null) ? $shopPrompt['meta_query_guard'] : []; + $shopContextCleanupProfile = $metaQueryGuard['cleanup_profile'] ?? null; + if (!is_string($shopContextCleanupProfile) || trim($shopContextCleanupProfile) === '') { + $shopContextCleanupProfile = $this->agentRunnerConfig->getShopQueryContextFallbackCleanupProfile(); + } else { + $shopContextCleanupProfile = trim($shopContextCleanupProfile); + } + if (!in_array($shopContextCleanupProfile, $this->languageCleanupConfig->getCleanupProfileNames(), true)) { + $errors[] = 'agent.shop_prompt.meta_query_guard.cleanup_profile references unknown language cleanup profile: ' . $shopContextCleanupProfile . '.'; + } + $this->validateStringListMap($agent['shop_query_optimizer'] ?? [], 'agent.shop_query_optimizer', $errors, $warnings); $this->validateRegexPattern($agent['optimized_shop_query_prefix_pattern'] ?? null, 'agent.optimized_shop_query_prefix_pattern', $errors); @@ -1319,10 +1415,40 @@ final readonly class RetriexEffectiveConfigProvider { $this->validateStringListMap($language, 'language', $errors, $warnings); $stopwords = is_array($language['stopwords'] ?? null) ? 
$language['stopwords'] : []; - foreach ($this->governanceConfig->getLanguageProtectedStopwordTerms() as $protected) { - if (in_array($protected, $stopwords, true)) { - $errors[] = 'language.stopwords must not contain protected term: ' . $protected . '.'; + + try { + $profileNames = $this->languageCleanupConfig->getCleanupProfileNames(); + + foreach ($this->governanceConfig->getLanguageRequiredCleanupProfiles() as $profileName) { + if (!in_array($profileName, $profileNames, true)) { + $errors[] = 'language.cleanup_profiles must contain required profile: ' . $profileName . '.'; + continue; + } + + $this->languageCleanupConfig->getCleanupProfile($profileName); } + + foreach ($this->governanceConfig->getLanguageProtectedStopwordTerms() as $protected) { + if (in_array($protected, $stopwords, true)) { + $errors[] = 'language.stopwords must not contain protected term: ' . $protected . '.'; + } + if (!$this->languageCleanupConfig->isProtectedTerm($protected)) { + $errors[] = 'language.protected_terms must contain protected term: ' . $protected . '.'; + } + } + + foreach ($this->governanceConfig->getLanguageRequiredProfileTerms() as $profileName => $requiredTerms) { + $profile = $this->languageCleanupConfig->getCleanupProfile($profileName); + foreach ($requiredTerms as $bucket => $terms) { + foreach ($terms as $term) { + if (!in_array($term, $profile[$bucket] ?? [], true)) { + $errors[] = sprintf('language.cleanup_profiles.%s.%s must contain required term: %s.', $profileName, $bucket, $term); + } + } + } + } + } catch (\InvalidArgumentException $e) { + $errors[] = $e->getMessage(); } }