diff --git a/RETRIEX_PATCH_13_PATTERN_EXTERNALIZATION_README.md b/RETRIEX_PATCH_13_PATTERN_EXTERNALIZATION_README.md new file mode 100644 index 0000000..ba4fa42 --- /dev/null +++ b/RETRIEX_PATCH_13_PATTERN_EXTERNALIZATION_README.md @@ -0,0 +1,99 @@ +# RetrieX Patch 13 – Agent/Commerce Pattern Externalization + +## Ziel + +Patch 13 setzt die Developer Policies aus v1.5.1 weiter um und entfernt weitere fachliche Patterns, +Signal-Listen und Produkt-/Fokuslisten aus PHP-Core-Klassen. + +Die Änderung ist bewusst klein gehalten und soll das Laufzeitverhalten nicht fachlich verändern: +Die bisherigen Werte wurden 1:1 nach YAML verschoben und die PHP-Klassen lesen sie nur noch aus der Konfiguration. + +## Geänderte Bereiche + +### AgentRunner + +Dateien: + +- `src/Agent/AgentRunner.php` +- `src/Config/AgentRunnerConfig.php` +- `config/retriex/agent.yaml` + +Externalisiert wurden: + +- starke Follow-up-Referenzpatterns, z. B. „der Wert“, „womit“, „welcher Indikator“ +- explizite Commerce-Follow-up-Signale, z. B. „shop“, „preis“, „kostet“, „sku“ +- History-/Follow-up-Strukturpatterns für `Question:`-Parsing +- Referenzanker-Patterns für Testomat-Modellanker und Härtewerte in °dH + +### CommerceQueryParser + +Dateien: + +- `src/Commerce/CommerceQueryParser.php` +- `src/Config/CommerceQueryParserConfig.php` +- `config/retriex/commerce.yaml` + +Externalisiert wurden: + +- Search-Control-Tokens wie `shop`, `store`, `produkt`, `artikel`, `kaufen`, `bestellen`, `online` + +### CommerceReferenceResolver + +Dateien: + +- `src/Commerce/CommerceReferenceResolver.php` +- `src/Config/CommerceReferenceResolverConfig.php` +- `config/services.yaml` +- `config/retriex/commerce.yaml` + +Externalisiert wurden: + +- Conversation-Produktpatterns für Testomat-Modelle +- Fokus-Term-Patterns für Zubehör-/Reagenz-/Ersatzteilbegriffe + +## Bewusst nicht geändert + +Nicht alle technischen Normalisierungsregexe im `AgentRunner` wurden in diesem Patch angefasst. 
+Bewusst unverändert bleiben zunächst rein technische Parser-/Sanitizer-Patterns wie Whitespace-Normalisierung, +Tokenisierung oder HTML-/Output-Formatierung. + +Diese können später in einem separaten, kleineren Follow-up-Patch geprüft werden, falls der Audit weiterhin zu viel +Pattern-Logik im Core meldet. + +## Prüfungen + +In der Patch-Arbeitsumgebung wurde geprüft: + +```bash +php -l src/Config/AgentRunnerConfig.php +php -l src/Agent/AgentRunner.php +php -l src/Config/CommerceQueryParserConfig.php +php -l src/Commerce/CommerceQueryParser.php +php -l src/Config/CommerceReferenceResolverConfig.php +php -l src/Commerce/CommerceReferenceResolver.php +``` + +Alle geprüften PHP-Dateien waren syntaktisch valide. + +Zusätzlich wurden die geänderten YAML-Dateien parserseitig geprüft: + +- `config/retriex/agent.yaml` +- `config/retriex/commerce.yaml` +- `config/services.yaml` + +## Nach dem Einspielen lokal ausführen + +```bash +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +``` + +Falls die Umgebung `vendor/` nicht enthält, vorher wie üblich Abhängigkeiten installieren. + +## Erwartetes Ergebnis + +- Keine fachliche Verhaltensänderung. +- Keine neue Strict-Validation. +- Weniger fachliche Pattern-/Keyword-Logik im PHP-Core. +- Weitere Annäherung an die Developer Policies aus v1.5.1. diff --git a/RETRIEX_PATCH_14_REMAINING_CORE_PATTERN_EXTERNALIZATION_README.md b/RETRIEX_PATCH_14_REMAINING_CORE_PATTERN_EXTERNALIZATION_README.md new file mode 100644 index 0000000..87b7aef --- /dev/null +++ b/RETRIEX_PATCH_14_REMAINING_CORE_PATTERN_EXTERNALIZATION_README.md @@ -0,0 +1,50 @@ +# RetrieX Patch 14 - Remaining Runtime Core Pattern Externalization + +## Goal + +Patch 14 continues the YAML-only governance work after v1.5.1 and p13b. It externalizes remaining runtime-relevant domain patterns from PHP core code into YAML without changing scoring semantics or enabling strict validation. 
+ +## Scope + +Changed runtime areas: + +- `src/Knowledge/Retrieval/NdjsonHybridRetriever.php` + - Catalog-list shortcut patterns moved to `config/retriex/retrieval.yaml`. + - Exact-document token variant prefixes moved to YAML. + - Indicator/detail follow-up tokens, phrases and table-detection patterns moved to YAML. + - Exact detail and generic exact-selection token lists moved to YAML. + +- `src/Agent/PromptBuilder.php` + - Direct main-device request patterns moved to `config/retriex/prompt.yaml`. + +- `src/Intent/CommerceIntentLite.php` + - Non-product commerce signals moved to `config/retriex/intent.yaml`. + - Technical factual knowledge guard label and patterns moved to YAML. + +## Intentionally not changed + +- No strict YAML validation is enabled. +- No governance/regression hardening terms from `RetriexEffectiveConfigProvider` are moved in this patch. That should remain a separate follow-up patch. +- No scoring weights, top-k limits, retrieval thresholds or prompt rules are changed. +- Technical regex such as number detection, whitespace handling, job id validation and markdown parsing is intentionally left in PHP. + +## Required checks after applying + +Run locally in the full project with dependencies installed: + +```bash +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +``` + +Recommended smoke checks: + +- Lowest hardness threshold question still resolves to `0,02 °dH / Testomat 808`. +- Follow-up indicator question still resolves to `Indikatortyp 300`. +- Accessory price follow-up still resolves to the indicator product, not the device. +- Product-selection shop fallback still works. + +## Next step + +The next isolated patch should externalize governance/regression guardrails from `RetriexEffectiveConfigProvider` into YAML, likely as p15. 
diff --git a/RETRIEX_PATCH_15B_GOVERNANCE_VALIDATION_WARNING_FIX_README.md b/RETRIEX_PATCH_15B_GOVERNANCE_VALIDATION_WARNING_FIX_README.md new file mode 100644 index 0000000..aa53367 --- /dev/null +++ b/RETRIEX_PATCH_15B_GOVERNANCE_VALIDATION_WARNING_FIX_README.md @@ -0,0 +1,27 @@ +# RetrieX Patch 15b - Governance Validation Warning Fix + +## Ziel + +Patch 15b behebt einen Folgefehler aus Patch 15: Die Regression-Baseline konnte durch eine falsch-positive Config-Validation-Warnung scheitern. + +## Ursache + +`retriex.retrieval.inventory` ist ein backwards-kompatibler Diagnoseparameter. Einige Retrieval-Listen sind inzwischen vocabulary-backed und werden ueber die aktive Config-Fassade aufgeloest. Der direkte Vergleich des rohen Inventory-Parameters mit der aktiven Retriever-Config kann deshalb bei vocabulary-backed Keys abweichen, ohne dass eine echte Fehlkonfiguration vorliegt. + +Zusätzlich wird die Governance-Konfiguration nicht mehr mit der generischen String-List-Map-Validierung geprueft, weil `governance.yaml` absichtlich auch strukturierte Guardrail-Gruppen enthaelt. Die eigentliche Validierung erfolgt ueber die typisierten Getter von `GovernanceConfig`. + +## Geaendert + +- `src/Config/RetriexEffectiveConfigProvider.php` + - ueberspringt den Inventory-Direktvergleich fuer vocabulary-backed Retrieval-Keys + - validiert Governance-Konfiguration ueber `GovernanceConfig` statt generischer String-List-Map-Pruefung + +## Erwartete Checks + +```bash +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +``` + +Alle drei Checks muessen gruen sein. 
diff --git a/RETRIEX_PATCH_15C_GOVERNANCE_SHAPE_AND_INVENTORY_FIX_README.md b/RETRIEX_PATCH_15C_GOVERNANCE_SHAPE_AND_INVENTORY_FIX_README.md new file mode 100644 index 0000000..49d17dc --- /dev/null +++ b/RETRIEX_PATCH_15C_GOVERNANCE_SHAPE_AND_INVENTORY_FIX_README.md @@ -0,0 +1,29 @@ +# RetrieX Patch 15c - Governance Shape and Inventory Fix + +## Ziel + +Patch 15c behebt zwei Restprobleme aus Patch 15/15b. + +## Ursache + +1. `governance.regression_baseline.protected_retrieval_device_word_groups` war temporaer als Liste strukturierter Objekte modelliert. Bestehende generische Config-Validierungen erwarten bei Listen jedoch skalare Werte. +2. Der Retrieval-Inventory-Vergleich konnte fuer vocabulary-backed Retrieval-Keys weiterhin eine falsch-positive Warnung ausgeben, wenn der rohe Inventory-Parameter von der aktiven Retriever-Config abweicht. + +## Geaendert + +- `config/retriex/governance.yaml` + - `protected_retrieval_device_word_groups` ist jetzt eine Map von Gruppenname zu Stringliste. +- `src/Config/GovernanceConfig.php` + - liest die neue Map-Struktur und toleriert die temporaere p15/p15b-Listenstruktur defensiv. +- `src/Config/RetriexEffectiveConfigProvider.php` + - ueberspringt den Inventory-Direktvergleich fuer alle bekannten vocabulary-backed Retrieval-Keys explizit. + +## Erwartete Checks + +```bash +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +``` + +Alle drei Checks muessen gruen sein. 
diff --git a/RETRIEX_PATCH_15_GOVERNANCE_GUARDRAIL_CONFIG_EXTERNALIZATION_README.md b/RETRIEX_PATCH_15_GOVERNANCE_GUARDRAIL_CONFIG_EXTERNALIZATION_README.md new file mode 100644 index 0000000..bf1aac2 --- /dev/null +++ b/RETRIEX_PATCH_15_GOVERNANCE_GUARDRAIL_CONFIG_EXTERNALIZATION_README.md @@ -0,0 +1,44 @@ +# RetrieX Patch 15 - Governance Guardrail Config Externalization + +## Ziel + +Patch 15 verschiebt die verbliebenen fachlichen Regression-/Governance-Guardrail-Werte aus `RetriexEffectiveConfigProvider.php` in YAML. Damit bleiben die Checks aus `mto:agent:config:validate` und `mto:agent:regression:test` erhalten, ohne neue PHP-only Defaults oder harte fachliche Listen im Core zu pflegen. + +## Geaenderte Bereiche + +- Neue Datei: `config/retriex/governance.yaml` + - enthaelt Regression-Baseline-Guardrails + - enthaelt Vocabulary-Guardrails + - enthaelt Language-/Stopword-Guardrails +- Neue Config-Fassade: `src/Config/GovernanceConfig.php` +- `src/Config/RetriexEffectiveConfigProvider.php` + - nutzt `GovernanceConfig` fuer Regression-Baseline-Checks + - nutzt `GovernanceConfig` fuer Commerce-Query-Messwertvalidierung + - nutzt `GovernanceConfig` fuer Vocabulary-/Stopword-Guardrails + - dump-effective enthaelt nun den Abschnitt `governance` +- `config/services.yaml` + - importiert `config/retriex/governance.yaml` + - registriert `GovernanceConfig` +- `src/Config/ConfigSourceAuditProvider.php` + - kennt `GovernanceConfig` als YAML-backed Config-Klasse + +## Bewusst nicht enthalten + +- keine Strict YAML Validation +- kein neuer Pattern-Audit-Command +- keine Runtime-Verhaltensaenderung +- keine Aenderung an Retrieval-, Commerce- oder Prompt-Scoring + +## Erwartete Checks nach dem Einspielen + +```bash +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +``` + +Alle drei Checks muessen weiterhin gruen sein. + +## Hinweis + +Die fachlichen Guardrail-Werte bleiben bewusst erhalten. 
Der Patch aendert nur den Ort der Pflege: YAML ist ab jetzt Source of Truth, PHP liest diese Werte nur noch aus der Config-Fassade. diff --git a/config/retriex/governance.yaml b/config/retriex/governance.yaml new file mode 100644 index 0000000..95c4cdf --- /dev/null +++ b/config/retriex/governance.yaml @@ -0,0 +1,65 @@ +# Governance and regression guardrail configuration. +# These values are intentionally YAML-owned so developer-policy checks do not +# reintroduce domain-specific guardrail terms as PHP-only defaults. +parameters: + retriex.governance.config: + regression_baseline: + protected_short_model_tokens: + - th + - tc + - tp + - tm + - ph + - rx + protected_measurement_values: + - '0,02' + protected_technical_prompt_keywords: + - testomat + - indikator + - grenzwert + - messbereich + - gemessen + technical_priority_required_markers: + - runner-up + - second-lowest + - comparison + protected_accessory_prompt_keywords: + - indikator + - reagenz + protected_search_repair_specificity_terms: + - indikator + - testomat + - reagenz + protected_retrieval_reagent_words: + - indikator + protected_retrieval_device_word_groups: + geraet: + - geraet + - gerät + shop_prompt_regression_original_query: 'testomat 808 0,02' + shop_prompt_required_output_instruction_markers: + - 'Output only the final search query.' 
+ - 'Output format:' + shop_query_meta_guard_terms: + - shop + - suche + shop_query_context_fallback_filter_terms: + - welchem + - kann + - messen + vocabulary: + protected_short_model_tokens: + - th + - tc + - tp + - tm + - ph + - rx + language: + protected_stopword_terms: + - nicht + - kein + - welche + - testomat + - indikator + - '0,02' diff --git a/config/retriex/intent.yaml b/config/retriex/intent.yaml index 3ac1c1b..8dd210c 100644 --- a/config/retriex/intent.yaml +++ b/config/retriex/intent.yaml @@ -40,6 +40,17 @@ parameters: - zubehör - zubehoer - ersatzteil + non_product_commerce_signals: + - shop + - alle + - kunde + - online + - kaufen + - kostet + - suche + - such + - finde + - finden advisory_signals: - passt - eignet @@ -132,6 +143,32 @@ parameters: - '/\bzubehör\b/u' - '/\bzubehoer\b/u' - '/\bersatzteil(?:e)?\b/u' + technical_factual_knowledge: + signal_label: technical_factual_knowledge_query + question_marker_patterns: + - '/\bwas\s+ist\b/u' + - '/\bwelche?r?s?\b/u' + - '/\bwie\s+(hoch|niedrig|klein|gross|groß)\b/u' + - '/\bniedrigste[rsn]?\b/u' + - '/\bkleinste[rsn]?\b/u' + - '/\bhöchste[rsn]?\b/u' + - '/\bhoechste[rsn]?\b/u' + fact_patterns: + - '/\bgrenzwert(?:e|en|es)?\b/u' + - '/\bmessbereich(?:e|en|s)?\b/u' + - '/\bwasserhärte\b/u' + - '/\bwasserhaerte\b/u' + - '/\bresthärte\b/u' + - '/\bresthaerte\b/u' + - '/\bgesamthärte\b/u' + - '/\bgesamthaerte\b/u' + - '/\bauflösung\b/u' + - '/\baufloesung\b/u' + - '/\bindikator(?:en|s)?\b/u' + - '/\btestomat(?:en|s)?\b/u' + - '/\büberwach(?:t|en|ung)\b/u' + - '/\bueberwach(?:t|en|ung)\b/u' + - '/\bmess(?:en|ung|bar|wert)\b/u' patterns: sku_like: '/\b\d{4,10}\b/u' price_value_template: '/\b\d+(?:[.,]\d+)?\s*(?:{price_pattern})\b/u' diff --git a/config/retriex/prompt.yaml b/config/retriex/prompt.yaml index 1c96a57..c17d024 100644 --- a/config/retriex/prompt.yaml +++ b/config/retriex/prompt.yaml @@ -209,6 +209,11 @@ parameters: - ph-indikatoren - ph indikatoren + direct_main_device_request_patterns: + - 
'/\b(welcher|welches|welche)\s+[^?.!,;]{0,40}(testomat|messgerät|messgeraet|analysegerät|analysegeraet|gerät|geraet|analysator)\b/u' + - '/\b(suche|finde|empfiehl|empfehle)\s+[^?.!,;]{0,40}(testomat|messgerät|messgeraet|analysegerät|analysegeraet|gerät|geraet|analysator)\b/u' + - '/\b(testomat|messgerät|messgeraet|analysegerät|analysegeraet|gerät|geraet|analysator)\s+[^?.!,;]{0,40}(messen|misst|überwachen|ueberwachen|kann|für|fuer)\b/u' + measurement_evidence_guard: intro_rules: - '- This block is generated from the current user question and is stricter than broad product-selection wording.' diff --git a/config/retriex/retrieval.yaml b/config/retriex/retrieval.yaml index f397240..ce6394f 100644 --- a/config/retriex/retrieval.yaml +++ b/config/retriex/retrieval.yaml @@ -26,6 +26,120 @@ parameters: focused_product_min_score: 10.0 focused_product_min_gap: 4.0 focused_product_max_chunks: 4 + catalog_list_shortcut_patterns: + - '/\balle\b/u' + - '/\bliste\b/u' + - '/\bauflistung\b/u' + - '/\buebersicht\b/u' + - '/\bübersicht\b/u' + - '/\bsortiment\b/u' + - '/\bwelche\b.*\b(gibt|verfügbar|verfuegbar|existieren)\b/u' + - '/\bzeige\b.*\b(produkte|geraete|geräte|modelle|artikel)\b/u' + - '/\bwas\b.*\b(gibt es|verfügbar|verfuegbar)\b/u' + exact_selection_token_variant_prefixes: + indikator: + - indikator + - indikatortyp + grenzwert: + - grenzwert + messbereich: + - messbereich + testomat: + - testomat + exact_selection_indicator_question_tokens: + - indikator + - indikatortyp + - reagenz + - reagens + exact_selection_indicator_question_phrases: + - mit welchem + - womit + exact_selection_indicator_table_heading_patterns: + - '/verf(?:ü|ue)gbare\s+indikatortypen|indikatortypen|indikatorvarianten/iu' + exact_selection_indicator_table_header_patterns: + - '/\|\s*(?:typ|indikator)\s*\|\s*(?:grenzwert|messbereich|bereich)/iu' + exact_selection_indicator_table_row_patterns: + - '/\|\s*[A-Z]{0,4}\s*\d{2,4}\s*[A-Z]?\s*\|\s*\d/iu' + 
exact_selection_indicator_table_required_primary_terms: + - indikator + exact_selection_indicator_table_required_context_terms: + - grenzwert + - messbereich + - bereich + exact_detail_tokens: + - indikator + - indikatoren + - indikatortyp + - indikatortypen + - reagenz + - reagens + - grenzwert + - messbereich + - bereich + - wasserhaerte + - wasserhärte + - resthaerte + - resthärte + - haerte + - härte + - aufloesung + - auflösung + - schnittstelle + - relais + - fehlercode + - code + - wert + - werte + generic_exact_selection_tokens: + - vorherige + - vorheriger + - nutzerfrage + - aktuelle + - folgefrage + - frage + - antwort + - technische + - referenzanker + - referenzaufloesung + - referenzauflösung + - faktenquelle + - keine + - welche + - welcher + - welches + - welchem + - welchen + - wird + - werden + - wurde + - kann + - koennen + - können + - mit + - der + - die + - das + - den + - dem + - ein + - eine + - einer + - eines + - ist + - sind + - was + - wie + - wo + - zum + - zur + - fuer + - für + - durch + - von + - vom + - und + - oder + - auch generic_product_tokens: - produkt - produkte diff --git a/config/services.yaml b/config/services.yaml index ef8fc4e..2b8c7d9 100644 --- a/config/services.yaml +++ b/config/services.yaml @@ -12,6 +12,7 @@ imports: - { resource: 'retriex/search_repair.yaml' } - { resource: 'retriex/vocabulary.yaml' } - { resource: 'retriex/intent.yaml' } + - { resource: 'retriex/governance.yaml' } # ------------------------------------------------------------ # Parameters @@ -147,6 +148,10 @@ services: arguments: $config: '%retriex.query_enrichment.config%' + App\Config\GovernanceConfig: + arguments: + $config: '%retriex.governance.config%' + App\Config\ShopServiceConfig: arguments: $config: '%retriex.shop_matching.config%' diff --git a/src/Agent/PromptBuilder.php b/src/Agent/PromptBuilder.php index 5550084..5768172 100644 --- a/src/Agent/PromptBuilder.php +++ b/src/Agent/PromptBuilder.php @@ -960,13 +960,7 @@ final readonly class 
PromptBuilder private function hasDirectMainDeviceRequest(string $normalizedPrompt): bool { - $patterns = [ - '/\b(welcher|welches|welche)\s+[^?.!,;]{0,40}(testomat|messgerät|messgeraet|analysegerät|analysegeraet|gerät|geraet|analysator)\b/u', - '/\b(suche|finde|empfiehl|empfehle)\s+[^?.!,;]{0,40}(testomat|messgerät|messgeraet|analysegerät|analysegeraet|gerät|geraet|analysator)\b/u', - '/\b(testomat|messgerät|messgeraet|analysegerät|analysegeraet|gerät|geraet|analysator)\s+[^?.!,;]{0,40}(messen|misst|überwachen|ueberwachen|kann|für|fuer)\b/u', - ]; - - foreach ($patterns as $pattern) { + foreach ($this->config->getDirectMainDeviceRequestPatterns() as $pattern) { if (preg_match($pattern, $normalizedPrompt) === 1) { return true; } diff --git a/src/Config/CommerceIntentConfig.php b/src/Config/CommerceIntentConfig.php index a222fd6..54a7d23 100644 --- a/src/Config/CommerceIntentConfig.php +++ b/src/Config/CommerceIntentConfig.php @@ -31,6 +31,29 @@ final class CommerceIntentConfig return $this->requiredStringList('advisory_product_selection_patterns'); } + /** @return string[] */ + public function getNonProductCommerceSignals(): array + { + return $this->requiredStringList('non_product_commerce_signals'); + } + + public function getTechnicalFactualKnowledgeSignalLabel(): string + { + return $this->requiredString('technical_factual_knowledge.signal_label'); + } + + /** @return string[] */ + public function getTechnicalFactualKnowledgeQuestionMarkerPatterns(): array + { + return $this->requiredStringList('technical_factual_knowledge.question_marker_patterns'); + } + + /** @return string[] */ + public function getTechnicalFactualKnowledgeFactPatterns(): array + { + return $this->requiredStringList('technical_factual_knowledge.fact_patterns'); + } + /** @return string[] */ public function getPriceTerms(): array { diff --git a/src/Config/ConfigSourceAuditProvider.php b/src/Config/ConfigSourceAuditProvider.php index 45e2fa9..facc77e 100644 --- 
a/src/Config/ConfigSourceAuditProvider.php +++ b/src/Config/ConfigSourceAuditProvider.php @@ -16,6 +16,7 @@ final readonly class ConfigSourceAuditProvider 'CatalogIntentConfig' => 'retriex.intent.catalog.config', 'DomainVocabularyConfig' => 'retriex.vocabulary.config', 'IntentLightConfig' => 'retriex.intent.light.config', + 'GovernanceConfig' => 'retriex.governance.config', 'NdjsonHybridRetrieverConfig' => 'retriex.retrieval.config', 'PromptBuilderConfig' => 'retriex.prompt.config', 'QueryEnricherConfig' => 'retriex.query_enrichment.config', diff --git a/src/Config/GovernanceConfig.php b/src/Config/GovernanceConfig.php new file mode 100644 index 0000000..e97fbf8 --- /dev/null +++ b/src/Config/GovernanceConfig.php @@ -0,0 +1,216 @@ + $config + */ + public function __construct(private readonly array $config = []) + { + } + + /** @return array */ + public function toArray(): array + { + return $this->config; + } + + /** @return string[] */ + public function getRegressionProtectedShortModelTokens(): array + { + return $this->requiredStringList('regression_baseline.protected_short_model_tokens'); + } + + /** @return string[] */ + public function getRegressionProtectedMeasurementValues(): array + { + return $this->requiredStringList('regression_baseline.protected_measurement_values'); + } + + /** @return string[] */ + public function getRegressionProtectedTechnicalPromptKeywords(): array + { + return $this->requiredStringList('regression_baseline.protected_technical_prompt_keywords'); + } + + /** @return string[] */ + public function getRegressionTechnicalPriorityRequiredMarkers(): array + { + return $this->requiredStringList('regression_baseline.technical_priority_required_markers'); + } + + /** @return string[] */ + public function getRegressionProtectedAccessoryPromptKeywords(): array + { + return $this->requiredStringList('regression_baseline.protected_accessory_prompt_keywords'); + } + + /** @return string[] */ + public function 
getRegressionProtectedSearchRepairSpecificityTerms(): array + { + return $this->requiredStringList('regression_baseline.protected_search_repair_specificity_terms'); + } + + /** @return string[] */ + public function getRegressionProtectedRetrievalReagentWords(): array + { + return $this->requiredStringList('regression_baseline.protected_retrieval_reagent_words'); + } + + /** @return array */ + public function getRegressionProtectedRetrievalDeviceWordGroups(): array + { + $value = $this->requiredValue('regression_baseline.protected_retrieval_device_word_groups'); + if (!is_array($value)) { + throw $this->invalid('regression_baseline.protected_retrieval_device_word_groups', 'must be a map of string lists'); + } + + $out = []; + foreach ($value as $key => $item) { + if (is_string($key) && is_array($item)) { + $normalizedKey = trim($key); + $terms = $this->normalizeStringList($item); + if ($normalizedKey !== '' && $terms !== []) { + $out[$normalizedKey] = $terms; + } + continue; + } + + // Backwards-compatible reader for the temporary p15/p15b list-of-groups shape. + if (is_array($item)) { + $groupKey = isset($item['key']) && is_scalar($item['key']) ? trim((string) $item['key']) : ''; + $terms = $this->normalizeStringList($item['terms'] ?? 
[]); + if ($groupKey !== '' && $terms !== []) { + $out[$groupKey] = $terms; + } + } + } + + if ($out === []) { + throw $this->invalid('regression_baseline.protected_retrieval_device_word_groups', 'must contain at least one valid group'); + } + + return $out; + } + + public function getRegressionShopPromptOriginalQuery(): string + { + return $this->requiredString('regression_baseline.shop_prompt_regression_original_query'); + } + + /** @return string[] */ + public function getRegressionShopPromptRequiredOutputInstructionMarkers(): array + { + return $this->requiredStringList('regression_baseline.shop_prompt_required_output_instruction_markers'); + } + + /** @return string[] */ + public function getRegressionShopQueryMetaGuardTerms(): array + { + return $this->requiredStringList('regression_baseline.shop_query_meta_guard_terms'); + } + + /** @return string[] */ + public function getRegressionShopQueryContextFallbackFilterTerms(): array + { + return $this->requiredStringList('regression_baseline.shop_query_context_fallback_filter_terms'); + } + + /** @return string[] */ + public function getVocabularyProtectedShortModelTokens(): array + { + return $this->requiredStringList('vocabulary.protected_short_model_tokens'); + } + + /** @return string[] */ + public function getLanguageProtectedStopwordTerms(): array + { + return $this->requiredStringList('language.protected_stopword_terms'); + } + + private function requiredString(string $path): string + { + $value = $this->requiredValue($path); + if (!is_scalar($value)) { + throw $this->invalid($path, 'must be a scalar string'); + } + + $value = trim((string) $value); + if ($value === '') { + throw $this->invalid($path, 'must not be empty'); + } + + return $value; + } + + /** @return string[] */ + private function requiredStringList(string $path): array + { + return $this->nonEmptyStringList($path, $this->requiredValue($path)); + } + + /** @return string[] */ + private function nonEmptyStringList(string $path, mixed $value): 
array + { + if (!is_array($value)) { + throw $this->invalid($path, 'must be a string list'); + } + + $out = $this->normalizeStringList($value); + if ($out === []) { + throw $this->invalid($path, 'must contain at least one value'); + } + + return $out; + } + + /** @return string[] */ + private function normalizeStringList(mixed $value): array + { + if (!is_array($value)) { + return []; + } + + $out = []; + foreach ($value as $item) { + if (!is_scalar($item)) { + continue; + } + + $item = trim((string) $item); + if ($item !== '' && !in_array($item, $out, true)) { + $out[] = $item; + } + } + + return $out; + } + + private function requiredValue(string $path): mixed + { + $value = $this->config; + foreach (explode('.', $path) as $segment) { + if (!is_array($value) || !array_key_exists($segment, $value)) { + throw $this->missing($path); + } + + $value = $value[$segment]; + } + + return $value; + } + + private function missing(string $path): \InvalidArgumentException + { + return new \InvalidArgumentException(sprintf('RetrieX governance config "%s" is missing.', $path)); + } + + private function invalid(string $path, string $reason): \InvalidArgumentException + { + return new \InvalidArgumentException(sprintf('RetrieX governance config "%s" %s.', $path, $reason)); + } +} diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index e72d1b5..9615232 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -136,6 +136,72 @@ final class NdjsonHybridRetrieverConfig return $this->requiredInt('focused_product_max_chunks', 1); } + /** @return string[] */ + public function catalogListShortcutPatterns(): array + { + return $this->requiredStringList('catalog_list_shortcut_patterns'); + } + + /** @return array */ + public function exactSelectionTokenVariantPrefixes(): array + { + return $this->requiredStringListMap('exact_selection_token_variant_prefixes'); + } + + /** @return string[] 
*/ + public function exactSelectionIndicatorQuestionTokens(): array + { + return $this->requiredStringList('exact_selection_indicator_question_tokens'); + } + + /** @return string[] */ + public function exactSelectionIndicatorQuestionPhrases(): array + { + return $this->requiredStringList('exact_selection_indicator_question_phrases'); + } + + /** @return string[] */ + public function exactSelectionIndicatorTableHeadingPatterns(): array + { + return $this->requiredStringList('exact_selection_indicator_table_heading_patterns'); + } + + /** @return string[] */ + public function exactSelectionIndicatorTableHeaderPatterns(): array + { + return $this->requiredStringList('exact_selection_indicator_table_header_patterns'); + } + + /** @return string[] */ + public function exactSelectionIndicatorTableRowPatterns(): array + { + return $this->requiredStringList('exact_selection_indicator_table_row_patterns'); + } + + /** @return string[] */ + public function exactSelectionIndicatorTableRequiredPrimaryTerms(): array + { + return $this->requiredStringList('exact_selection_indicator_table_required_primary_terms'); + } + + /** @return string[] */ + public function exactSelectionIndicatorTableRequiredContextTerms(): array + { + return $this->requiredStringList('exact_selection_indicator_table_required_context_terms'); + } + + /** @return string[] */ + public function exactDetailTokens(): array + { + return $this->requiredStringList('exact_detail_tokens'); + } + + /** @return string[] */ + public function genericExactSelectionTokens(): array + { + return $this->requiredStringList('generic_exact_selection_tokens'); + } + /** @return string[] */ public function genericProductTokens(): array { @@ -240,6 +306,17 @@ final class NdjsonHybridRetrieverConfig 'focused_product_min_score' => $this->focusedProductMinScore(), 'focused_product_min_gap' => $this->focusedProductMinGap(), 'focused_product_max_chunks' => $this->focusedProductMaxChunks(), + 'catalog_list_shortcut_patterns' => 
$this->catalogListShortcutPatterns(), + 'exact_selection_token_variant_prefixes' => $this->exactSelectionTokenVariantPrefixes(), + 'exact_selection_indicator_question_tokens' => $this->exactSelectionIndicatorQuestionTokens(), + 'exact_selection_indicator_question_phrases' => $this->exactSelectionIndicatorQuestionPhrases(), + 'exact_selection_indicator_table_heading_patterns' => $this->exactSelectionIndicatorTableHeadingPatterns(), + 'exact_selection_indicator_table_header_patterns' => $this->exactSelectionIndicatorTableHeaderPatterns(), + 'exact_selection_indicator_table_row_patterns' => $this->exactSelectionIndicatorTableRowPatterns(), + 'exact_selection_indicator_table_required_primary_terms' => $this->exactSelectionIndicatorTableRequiredPrimaryTerms(), + 'exact_selection_indicator_table_required_context_terms' => $this->exactSelectionIndicatorTableRequiredContextTerms(), + 'exact_detail_tokens' => $this->exactDetailTokens(), + 'generic_exact_selection_tokens' => $this->genericExactSelectionTokens(), 'generic_product_tokens' => $this->genericProductTokens(), 'important_short_model_tokens' => $this->importantShortModelTokens(), 'family_descriptor_tokens' => $this->familyDescriptorTokens(), @@ -324,6 +401,47 @@ final class NdjsonHybridRetrieverConfig return $out; } + /** + * @return array + */ + private function requiredStringListMap(string $key): array + { + $value = $this->requiredValue($key); + + if (!is_array($value)) { + throw $this->invalid($key, 'must be a map of string lists'); + } + + $out = []; + foreach ($value as $mapKey => $items) { + if (!is_string($mapKey) || trim($mapKey) === '' || !is_array($items)) { + continue; + } + + $cleanItems = []; + foreach ($items as $item) { + if (!is_scalar($item)) { + continue; + } + + $item = trim((string) $item); + if ($item !== '' && !in_array($item, $cleanItems, true)) { + $cleanItems[] = $item; + } + } + + if ($cleanItems !== []) { + $out[trim($mapKey)] = $cleanItems; + } + } + + if ($out === []) { + throw 
$this->invalid($key, 'must contain at least one non-empty map entry'); + } + + return $out; + } + private function requiredValue(string $key): mixed { if (!array_key_exists($key, $this->config)) { diff --git a/src/Config/PromptBuilderConfig.php b/src/Config/PromptBuilderConfig.php index 5cc7da2..cb896a4 100644 --- a/src/Config/PromptBuilderConfig.php +++ b/src/Config/PromptBuilderConfig.php @@ -544,6 +544,14 @@ final class PromptBuilderConfig return $this->getRequiredStringList('role_guard.accessory_product_keywords'); } + /** + * @return string[] + */ + public function getDirectMainDeviceRequestPatterns(): array + { + return $this->getRequiredStringList('role_guard.direct_main_device_request_patterns'); + } + /** * @return string[] */ diff --git a/src/Config/RetriexEffectiveConfigProvider.php b/src/Config/RetriexEffectiveConfigProvider.php index 0923775..8be6785 100644 --- a/src/Config/RetriexEffectiveConfigProvider.php +++ b/src/Config/RetriexEffectiveConfigProvider.php @@ -26,6 +26,7 @@ final readonly class RetriexEffectiveConfigProvider private ShopServiceConfig $shopServiceConfig, private StopWordsConfig $stopWordsConfig, private QueryEnricherConfig $queryEnricherConfig, + private GovernanceConfig $governanceConfig, private CatalogIntentConfig $catalogIntentConfig, private ContextServiceConfig $contextServiceConfig, ) { @@ -51,6 +52,7 @@ final readonly class RetriexEffectiveConfigProvider 'search_repair' => $this->searchRepairEffectiveConfig(), 'intent' => $this->intentConfig(), 'vocabulary' => $this->domainVocabularyConfig->toArray(), + 'governance' => $this->governanceConfig->toArray(), 'language' => $this->languageConfig(), 'query_enrichment' => $this->queryEnrichmentConfig(), 'catalog_intent' => $this->catalogIntentConfig(), @@ -80,6 +82,7 @@ final readonly class RetriexEffectiveConfigProvider $this->validateSearchRepair($config['search_repair'], $errors, $warnings); $this->validateIntent($config['intent'], $errors, $warnings); 
$this->validateVocabulary($config['vocabulary'], $errors, $warnings); + $this->validateGovernance($config['governance'], $errors, $warnings); $this->validateLanguage($config['language'], $errors, $warnings); $this->validateQueryEnrichment($config['query_enrichment'], $errors, $warnings); @@ -114,8 +117,8 @@ final readonly class RetriexEffectiveConfigProvider } $importantShortModelTokens = $this->retrieverConfig->importantShortModelTokens(); - foreach (['th', 'tc', 'tp', 'tm', 'ph', 'rx'] as $token) { - $key = 'important_short_model_token_' . $token; + foreach ($this->governanceConfig->getRegressionProtectedShortModelTokens() as $token) { + $key = 'important_short_model_token_' . $this->guardrailCheckKey($token); $checks[$key] = in_array($token, $importantShortModelTokens, true); if (!$checks[$key]) { $errors[] = 'Missing protected short model token: ' . $token; @@ -123,20 +126,23 @@ final readonly class RetriexEffectiveConfigProvider } $measurementPattern = $this->commerceQueryParserConfig->getMeasurementValueTokenPattern(); - $checks['measurement_value_0_02_matches'] = @preg_match($measurementPattern, '0,02') === 1; - if (!$checks['measurement_value_0_02_matches']) { - $errors[] = 'Commerce query parser no longer recognizes 0,02 as a measurement value.'; - } - $filterTokens = $this->commerceQueryParserConfig->getFilterSearchTokens(); - $checks['measurement_value_0_02_not_filtered'] = !in_array('0,02', $filterTokens, true); - if (!$checks['measurement_value_0_02_not_filtered']) { - $errors[] = 'Commerce query parser filters the protected token 0,02.'; + foreach ($this->governanceConfig->getRegressionProtectedMeasurementValues() as $measurementValue) { + $keySuffix = $this->guardrailCheckKey($measurementValue); + $checks['measurement_value_' . $keySuffix . '_matches'] = @preg_match($measurementPattern, $measurementValue) === 1; + if (!$checks['measurement_value_' . $keySuffix . 
'_matches']) { + $errors[] = 'Commerce query parser no longer recognizes protected measurement value: ' . $measurementValue . '.'; + } + + $checks['measurement_value_' . $keySuffix . '_not_filtered'] = !in_array($measurementValue, $filterTokens, true); + if (!$checks['measurement_value_' . $keySuffix . '_not_filtered']) { + $errors[] = 'Commerce query parser filters protected measurement value: ' . $measurementValue . '.'; + } } $technicalKeywords = $this->promptConfig->getTechnicalProductKeywords(); - foreach (['testomat', 'indikator', 'grenzwert', 'messbereich', 'gemessen'] as $term) { - $key = 'technical_keyword_' . $term; + foreach ($this->governanceConfig->getRegressionProtectedTechnicalPromptKeywords() as $term) { + $key = 'technical_keyword_' . $this->guardrailCheckKey($term); $checks[$key] = in_array($term, $technicalKeywords, true); if (!$checks[$key]) { $errors[] = 'Missing technical prompt keyword: ' . $term; @@ -144,19 +150,20 @@ final readonly class RetriexEffectiveConfigProvider } $technicalPriorityRules = implode("\n", $this->promptConfig->getOutputPriorityTechnicalRules()); $checks['technical_priority_rules_present'] = trim($technicalPriorityRules) !== ''; - $checks['technical_priority_prevents_runner_up'] = str_contains($technicalPriorityRules, 'runner-up') - || str_contains($technicalPriorityRules, 'second-lowest') - || str_contains($technicalPriorityRules, 'comparison'); + $checks['technical_priority_required_markers_present'] = $this->containsAnyConfiguredMarker( + $technicalPriorityRules, + $this->governanceConfig->getRegressionTechnicalPriorityRequiredMarkers() + ); if (!$checks['technical_priority_rules_present']) { $errors[] = 'Missing technical output priority rules.'; } - if (!$checks['technical_priority_prevents_runner_up']) { - $errors[] = 'Technical output priority no longer guards against runner-up/comparison expansion.'; + if (!$checks['technical_priority_required_markers_present']) { + $errors[] = 'Technical output priority no longer 
contains a required governance marker.'; } $accessoryKeywords = $this->promptConfig->getAccessoryRequestKeywords(); - foreach (['indikator', 'reagenz'] as $term) { - $key = 'accessory_keyword_' . $term; + foreach ($this->governanceConfig->getRegressionProtectedAccessoryPromptKeywords() as $term) { + $key = 'accessory_keyword_' . $this->guardrailCheckKey($term); $checks[$key] = in_array($term, $accessoryKeywords, true); if (!$checks[$key]) { $errors[] = 'Missing accessory prompt keyword: ' . $term; @@ -164,8 +171,8 @@ final readonly class RetriexEffectiveConfigProvider } $searchRepairTerms = $this->searchRepairConfig->getSpecificityBoostTerms(); - foreach (['indikator', 'testomat', 'reagenz'] as $term) { - $key = 'search_repair_specificity_' . $term; + foreach ($this->governanceConfig->getRegressionProtectedSearchRepairSpecificityTerms() as $term) { + $key = 'search_repair_specificity_' . $this->guardrailCheckKey($term); $checks[$key] = in_array($term, $searchRepairTerms, true); if (!$checks[$key]) { $errors[] = 'Missing search repair specificity term: ' . $term; @@ -173,30 +180,46 @@ final readonly class RetriexEffectiveConfigProvider } $reagentWords = $this->retrieverConfig->looksLikeReagentWords(); - $deviceWords = $this->retrieverConfig->looksLikeDeviceWords(); - $checks['retrieval_reagent_word_indikator'] = in_array('indikator', $reagentWords, true); - $checks['retrieval_device_word_geraet'] = in_array('geraet', $deviceWords, true) || in_array('gerät', $deviceWords, true); - if (!$checks['retrieval_reagent_word_indikator']) { - $errors[] = 'Missing retrieval reagent word: indikator.'; - } - if (!$checks['retrieval_device_word_geraet']) { - $errors[] = 'Missing retrieval device word: geraet/geraet equivalent.'; + foreach ($this->governanceConfig->getRegressionProtectedRetrievalReagentWords() as $term) { + $key = 'retrieval_reagent_word_' . 
$this->guardrailCheckKey($term); + $checks[$key] = in_array($term, $reagentWords, true); + if (!$checks[$key]) { + $errors[] = 'Missing retrieval reagent word: ' . $term . '.'; + } } - $shopPrompt = $this->agentRunnerConfig->getShopPrompt('testomat 808 0,02', ''); - $checks['shop_prompt_contains_output_instruction'] = str_contains($shopPrompt, 'Output only the final search query.') - || str_contains($shopPrompt, 'Output format:'); - $checks['shop_prompt_contains_original_query'] = str_contains($shopPrompt, 'testomat 808 0,02'); + $deviceWords = $this->retrieverConfig->looksLikeDeviceWords(); + foreach ($this->governanceConfig->getRegressionProtectedRetrievalDeviceWordGroups() as $groupKey => $terms) { + $key = 'retrieval_device_word_' . $this->guardrailCheckKey((string) $groupKey); + $checks[$key] = false; + foreach ($terms as $term) { + if (in_array($term, $deviceWords, true)) { + $checks[$key] = true; + break; + } + } + if (!$checks[$key]) { + $errors[] = 'Missing retrieval device word group: ' . (string) $groupKey . 
'.'; + } + } + + $shopPromptOriginalQuery = $this->governanceConfig->getRegressionShopPromptOriginalQuery(); + $shopPrompt = $this->agentRunnerConfig->getShopPrompt($shopPromptOriginalQuery, ''); + $checks['shop_prompt_contains_output_instruction'] = $this->containsAnyConfiguredMarker( + $shopPrompt, + $this->governanceConfig->getRegressionShopPromptRequiredOutputInstructionMarkers() + ); + $checks['shop_prompt_contains_original_query'] = str_contains($shopPrompt, $shopPromptOriginalQuery); if (!$checks['shop_prompt_contains_output_instruction']) { - $errors[] = 'Shop query optimizer prompt no longer contains the expected output instruction.'; + $errors[] = 'Shop query optimizer prompt no longer contains a required output instruction marker.'; } if (!$checks['shop_prompt_contains_original_query']) { - $errors[] = 'Shop query optimizer prompt no longer contains the original query.'; + $errors[] = 'Shop query optimizer prompt no longer contains the configured original query.'; } $metaOnlyTerms = $this->agentRunnerConfig->getShopQueryMetaOnlyTerms(); - foreach (['shop', 'suche'] as $term) { - $key = 'shop_query_meta_guard_term_' . $term; + foreach ($this->governanceConfig->getRegressionShopQueryMetaGuardTerms() as $term) { + $key = 'shop_query_meta_guard_term_' . $this->guardrailCheckKey($term); $checks[$key] = in_array($term, $metaOnlyTerms, true); if (!$checks[$key]) { $errors[] = 'Missing shop query meta guard term: ' . $term; @@ -208,8 +231,8 @@ final readonly class RetriexEffectiveConfigProvider } $contextFallbackFilterTerms = $this->agentRunnerConfig->getShopQueryContextFallbackFilterTerms(); - foreach (['welchem', 'kann', 'messen'] as $term) { - $key = 'shop_query_context_fallback_filter_' . $term; + foreach ($this->governanceConfig->getRegressionShopQueryContextFallbackFilterTerms() as $term) { + $key = 'shop_query_context_fallback_filter_' . 
$this->guardrailCheckKey($term); $checks[$key] = in_array($term, $contextFallbackFilterTerms, true); if (!$checks[$key]) { $errors[] = 'Missing shop query context fallback filter term: ' . $term; @@ -244,7 +267,26 @@ final readonly class RetriexEffectiveConfigProvider 'warnings' => $warnings, ]; } + /** @param string[] $markers */ + private function containsAnyConfiguredMarker(string $haystack, array $markers): bool + { + foreach ($markers as $marker) { + if ($marker !== '' && str_contains($haystack, $marker)) { + return true; + } + } + return false; + } + + private function guardrailCheckKey(string $term): string + { + $key = mb_strtolower($term, 'UTF-8'); + $key = preg_replace('/[^\p{L}\p{N}]+/u', '_', $key) ?? $key; + $key = trim($key, '_'); + + return $key !== '' ? $key : 'value'; + } /** @return array */ private function runtimeConfig(): array { @@ -740,6 +782,37 @@ final readonly class RetriexEffectiveConfigProvider ]; } + /** + * @param array $governance + * @param list $errors + * @param list $warnings + */ + private function validateGovernance(array $governance, array &$errors, array &$warnings): void + { + if ($governance === []) { + $errors[] = 'governance config must not be empty.'; + return; + } + + try { + $this->governanceConfig->getRegressionProtectedShortModelTokens(); + $this->governanceConfig->getRegressionProtectedMeasurementValues(); + $this->governanceConfig->getRegressionProtectedTechnicalPromptKeywords(); + $this->governanceConfig->getRegressionTechnicalPriorityRequiredMarkers(); + $this->governanceConfig->getRegressionProtectedAccessoryPromptKeywords(); + $this->governanceConfig->getRegressionProtectedSearchRepairSpecificityTerms(); + $this->governanceConfig->getRegressionProtectedRetrievalReagentWords(); + $this->governanceConfig->getRegressionProtectedRetrievalDeviceWordGroups(); + $this->governanceConfig->getRegressionShopPromptOriginalQuery(); + $this->governanceConfig->getRegressionShopPromptRequiredOutputInstructionMarkers(); + 
$this->governanceConfig->getRegressionShopQueryMetaGuardTerms(); + $this->governanceConfig->getRegressionShopQueryContextFallbackFilterTerms(); + $this->governanceConfig->getVocabularyProtectedShortModelTokens(); + $this->governanceConfig->getLanguageProtectedStopwordTerms(); + } catch (\InvalidArgumentException $e) { + $errors[] = $e->getMessage(); + } + } /** * @param array $runtime * @param list $errors @@ -847,6 +920,11 @@ final readonly class RetriexEffectiveConfigProvider $inventory = $retrieval['inventory_parameter'] ?? []; if (is_array($inventory)) { foreach ($inventory as $key => $value) { + $key = (string) $key; + if (!$this->shouldCompareRetrievalInventoryKey($key, $retrieval)) { + continue; + } + if (array_key_exists($key, $retrieval) && $retrieval[$key] != $value) { $warnings[] = 'retrieval.inventory.' . $key . ' differs from active retriever config.'; } @@ -854,6 +932,41 @@ final readonly class RetriexEffectiveConfigProvider } } + /** + * Retrieval vocabulary lists can be resolved from dedicated vocabulary views. + * The backwards-compatible inventory parameter may still contain raw legacy + * list values for those keys, so comparing it against the active retriever + * facade would produce false-positive validation warnings. + * + * @param array $retrieval + */ + private function shouldCompareRetrievalInventoryKey(string $key, array $retrieval): bool + { + if (in_array($key, $this->retrievalVocabularyBackedInventoryKeys(), true)) { + return false; + } + + $vocabulary = $retrieval['vocabulary'] ?? 
[]; + + return !is_array($vocabulary) || !array_key_exists($key, $vocabulary); + } + + /** @return string[] */ + private function retrievalVocabularyBackedInventoryKeys(): array + { + return [ + 'generic_product_tokens', + 'important_short_model_tokens', + 'family_descriptor_tokens', + 'looks_like_reagent_tokens', + 'looks_like_safety_docs', + 'looks_like_reagent_words', + 'looks_like_document_words', + 'looks_like_safety_words', + 'looks_like_device_words', + ]; + } + /** * @param array $prompt * @param list $errors @@ -964,13 +1077,15 @@ final readonly class RetriexEffectiveConfigProvider } $measurementPattern = $patterns['measurement_value_token'] ?? null; - if (is_string($measurementPattern) && @preg_match($measurementPattern, '0,02') !== 1) { - $errors[] = 'commerce_query.patterns.measurement_value_token must match 0,02.'; - } - $filterTokens = $commerceQuery['filter_search_tokens'] ?? []; - if (is_array($filterTokens) && in_array('0,02', $filterTokens, true)) { - $errors[] = 'commerce_query.filter_search_tokens must not remove protected decimal token 0,02.'; + foreach ($this->governanceConfig->getRegressionProtectedMeasurementValues() as $measurementValue) { + if (is_string($measurementPattern) && @preg_match($measurementPattern, $measurementValue) !== 1) { + $errors[] = 'commerce_query.patterns.measurement_value_token must match protected measurement value: ' . $measurementValue . '.'; + } + + if (is_array($filterTokens) && in_array($measurementValue, $filterTokens, true)) { + $errors[] = 'commerce_query.filter_search_tokens must not remove protected measurement value: ' . $measurementValue . '.'; + } } } @@ -1033,7 +1148,7 @@ final readonly class RetriexEffectiveConfigProvider if (is_array($retrievalViews)) { $shortModel = $retrievalViews['important_short_model_tokens']['add'] ?? 
[]; if (is_array($shortModel)) { - foreach (['th', 'tc', 'tp', 'tm', 'ph', 'rx'] as $token) { + foreach ($this->governanceConfig->getVocabularyProtectedShortModelTokens() as $token) { if (!in_array($token, $shortModel, true)) { $warnings[] = 'vocabulary.views.retrieval.important_short_model_tokens should contain protected token ' . $token . '.'; } @@ -1051,7 +1166,7 @@ final readonly class RetriexEffectiveConfigProvider { $this->validateStringListMap($language, 'language', $errors, $warnings); $stopwords = is_array($language['stopwords'] ?? null) ? $language['stopwords'] : []; - foreach (['nicht', 'kein', 'welche', 'testomat', 'indikator', '0,02'] as $protected) { + foreach ($this->governanceConfig->getLanguageProtectedStopwordTerms() as $protected) { if (in_array($protected, $stopwords, true)) { $errors[] = 'language.stopwords must not contain protected term: ' . $protected . '.'; } diff --git a/src/Intent/CommerceIntentLite.php b/src/Intent/CommerceIntentLite.php index c76fa98..61c98fa 100644 --- a/src/Intent/CommerceIntentLite.php +++ b/src/Intent/CommerceIntentLite.php @@ -50,7 +50,7 @@ final class CommerceIntentLite return $this->buildDetectionResult( intent: self::NONE, score: 0, - signals: ['technical_factual_knowledge_query'] + signals: [$this->config->getTechnicalFactualKnowledgeSignalLabel()] ); } @@ -167,18 +167,7 @@ final class CommerceIntentLite private function isNonProductCommerceSignal(string $signal): bool { - return in_array($signal, [ - 'shop', - 'alle', - 'kunde', - 'online', - 'kaufen', - 'kostet', - 'suche', - 'such', - 'finde', - 'finden', - ], true); + return in_array($signal, $this->config->getNonProductCommerceSignals(), true); } /** @@ -191,37 +180,19 @@ final class CommerceIntentLite */ private function isTechnicalFactualKnowledgeQuery(string $prompt): bool { - $hasQuestionMarker = $this->matchesAnyPattern($prompt, [ - '/\bwas\s+ist\b/u', - '/\bwelche?r?s?\b/u', - '/\bwie\s+(hoch|niedrig|klein|gross|groß)\b/u', - 
'/\bniedrigste[rsn]?\b/u', - '/\bkleinste[rsn]?\b/u', - '/\bhöchste[rsn]?\b/u', - '/\bhoechste[rsn]?\b/u', - ]); + $hasQuestionMarker = $this->matchesAnyPattern( + $prompt, + $this->config->getTechnicalFactualKnowledgeQuestionMarkerPatterns() + ); if (!$hasQuestionMarker) { return false; } - return $this->matchesAnyPattern($prompt, [ - '/\bgrenzwert(?:e|en|es)?\b/u', - '/\bmessbereich(?:e|en|s)?\b/u', - '/\bwasserhärte\b/u', - '/\bwasserhaerte\b/u', - '/\bresthärte\b/u', - '/\bresthaerte\b/u', - '/\bgesamthärte\b/u', - '/\bgesamthaerte\b/u', - '/\bauflösung\b/u', - '/\baufloesung\b/u', - '/\bindikator(?:en|s)?\b/u', - '/\btestomat(?:en|s)?\b/u', - '/\büberwach(?:t|en|ung)\b/u', - '/\bueberwach(?:t|en|ung)\b/u', - '/\bmess(?:en|ung|bar|wert)\b/u', - ]); + return $this->matchesAnyPattern( + $prompt, + $this->config->getTechnicalFactualKnowledgeFactPatterns() + ); } /** diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index ea2e1ce..7288af7 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -474,25 +474,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return false; } - $patterns = [ - '/\balle\b/u', - '/\bliste\b/u', - '/\bauflistung\b/u', - '/\buebersicht\b/u', - '/\bübersicht\b/u', - '/\bsortiment\b/u', - '/\bwelche\b.*\b(gibt|verfügbar|verfuegbar|existieren)\b/u', - '/\bzeige\b.*\b(produkte|geraete|geräte|modelle|artikel)\b/u', - '/\bwas\b.*\b(gibt es|verfügbar|verfuegbar)\b/u', - ]; - - foreach ($patterns as $pattern) { - if (preg_match($pattern, $normalized) === 1) { - return true; - } - } - - return false; + return $this->matchesAnyPattern($normalized, $this->retrieverConfig->catalogListShortcutPatterns()); } /** @@ -872,21 +854,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } } - if (str_starts_with($token, 'indikator')) { - $variants[] = 'indikator'; - $variants[] = 
'indikatortyp'; - } + foreach ($this->retrieverConfig->exactSelectionTokenVariantPrefixes() as $prefix => $configuredVariants) { + if (!str_starts_with($token, $prefix)) { + continue; + } - if (str_starts_with($token, 'grenzwert')) { - $variants[] = 'grenzwert'; - } - - if (str_starts_with($token, 'messbereich')) { - $variants[] = 'messbereich'; - } - - if (str_starts_with($token, 'testomat')) { - $variants[] = 'testomat'; + foreach ($configuredVariants as $variant) { + $variants[] = $variant; + } } return array_values(array_unique($variants)); @@ -903,12 +878,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface true ); - $asksIndicator = isset($tokens['indikator']) - || isset($tokens['indikatortyp']) - || isset($tokens['reagenz']) - || isset($tokens['reagens']) - || str_contains($normalized, 'mit welchem') - || str_contains($normalized, 'womit'); + $asksIndicator = $this->containsAnyConfiguredToken( + $tokens, + $this->retrieverConfig->exactSelectionIndicatorQuestionTokens() + ) || $this->containsAnyConfiguredPhrase( + $normalized, + $this->retrieverConfig->exactSelectionIndicatorQuestionPhrases() + ); return [ 'asks_indicator' => $asksIndicator, @@ -931,24 +907,26 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $score; } - if (preg_match('/verf(?:ü|ue)gbare\s+indikatortypen|indikatortypen|indikatorvarianten/iu', $rawText) === 1) { + if ($this->matchesAnyPattern($rawText, $this->retrieverConfig->exactSelectionIndicatorTableHeadingPatterns())) { $score += 14.0; } - if (preg_match('/\|\s*(?:typ|indikator)\s*\|\s*(?:grenzwert|messbereich|bereich)/iu', $rawText) === 1) { + if ($this->matchesAnyPattern($rawText, $this->retrieverConfig->exactSelectionIndicatorTableHeaderPatterns())) { $score += 10.0; } - if (preg_match('/\|\s*[A-Z]{0,4}\s*\d{2,4}\s*[A-Z]?\s*\|\s*\d/iu', $rawText) === 1) { + if ($this->matchesAnyPattern($rawText, $this->retrieverConfig->exactSelectionIndicatorTableRowPatterns())) { $score += 
8.0; } if ( - str_contains($normalizedHaystack, 'indikator') - && ( - str_contains($normalizedHaystack, 'grenzwert') - || str_contains($normalizedHaystack, 'messbereich') - || str_contains($normalizedHaystack, 'bereich') + $this->containsAnyConfiguredPhrase( + $normalizedHaystack, + $this->retrieverConfig->exactSelectionIndicatorTableRequiredPrimaryTerms() + ) + && $this->containsAnyConfiguredPhrase( + $normalizedHaystack, + $this->retrieverConfig->exactSelectionIndicatorTableRequiredContextTerms() ) ) { $score += 5.0; @@ -959,27 +937,55 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface private function isExactDetailToken(string $token): bool { - return in_array($token, [ - 'indikator', 'indikatoren', 'indikatortyp', 'indikatortypen', 'reagenz', 'reagens', 'grenzwert', - 'messbereich', 'bereich', 'wasserhaerte', 'wasserhärte', - 'resthaerte', 'resthärte', 'haerte', 'härte', 'aufloesung', - 'auflösung', 'schnittstelle', 'relais', 'fehlercode', 'code', - 'wert', 'werte', - ], true); + return in_array($token, $this->retrieverConfig->exactDetailTokens(), true); } private function isGenericExactSelectionToken(string $token): bool { - return in_array($token, [ - 'vorherige', 'vorheriger', 'nutzerfrage', 'aktuelle', - 'folgefrage', 'frage', 'antwort', 'technische', 'referenzanker', - 'referenzaufloesung', 'referenzauflösung', 'faktenquelle', 'keine', - 'welche', 'welcher', 'welches', - 'welchem', 'welchen', 'wird', 'werden', 'wurde', 'kann', 'koennen', - 'können', 'mit', 'der', 'die', 'das', 'den', 'dem', 'ein', 'eine', - 'einer', 'eines', 'ist', 'sind', 'was', 'wie', 'wo', 'zum', 'zur', - 'fuer', 'für', 'durch', 'von', 'vom', 'und', 'oder', 'auch', - ], true); + return in_array($token, $this->retrieverConfig->genericExactSelectionTokens(), true); + } + + /** + * @param string[] $patterns + */ + private function matchesAnyPattern(string $value, array $patterns): bool + { + foreach ($patterns as $pattern) { + if (preg_match($pattern, $value) === 
1) { + return true; + } + } + + return false; + } + + /** + * @param array $tokens + * @param string[] $needles + */ + private function containsAnyConfiguredToken(array $tokens, array $needles): bool + { + foreach ($needles as $needle) { + if (isset($tokens[$needle])) { + return true; + } + } + + return false; + } + + /** + * @param string[] $phrases + */ + private function containsAnyConfiguredPhrase(string $haystack, array $phrases): bool + { + foreach ($phrases as $phrase) { + if ($phrase !== '' && str_contains($haystack, $phrase)) { + return true; + } + } + + return false; } /**