diff --git a/RETRIEX_PATCH_4_QUERY_SEARCHREPAIR_YAML_ONLY_README.md b/RETRIEX_PATCH_4_QUERY_SEARCHREPAIR_YAML_ONLY_README.md new file mode 100644 index 0000000..af09793 --- /dev/null +++ b/RETRIEX_PATCH_4_QUERY_SEARCHREPAIR_YAML_ONLY_README.md @@ -0,0 +1,96 @@ +# RetrieX Patch 4: Query Enrichment and Search Repair YAML-only + +This patch continues the YAML-as-source-of-truth migration. + +## Scope + +Changed areas only: + +- `src/Config/QueryEnricherConfig.php` +- `src/Config/SearchRepairConfig.php` +- `config/retriex/query_enrichment.yaml` +- `config/retriex/search_repair.yaml` +- `config/retriex/vocabulary.yaml` +- `config/services.yaml` + +No retrieval scoring, PromptBuilder, ShopService matching, AgentRunner, SSE, or answer-generation behavior was intentionally changed. + +## QueryEnricherConfig + +Removed the PHP fallback mapping `DEFAULT_ENRICH_QUERY_LIST`. + +The existing values are now required in: + +```yaml +config/retriex/query_enrichment.yaml +parameters: + retriex.query_enrichment.config: + max_expansions: 4 + rules: ... +``` + +Missing, invalid, or empty YAML now fails closed via `InvalidArgumentException` instead of silently falling back to PHP defaults. + +## SearchRepairConfig + +Removed PHP fallback constants and hardcoded fallback values for: + +- generic candidate tokens +- accessory candidate terms +- accessory/bundle terms +- specificity boost terms +- requested accessory-code behavior +- model-candidate patterns +- search-repair scoring values +- normalization patterns +- trim character set +- product-key separator +- top-product log limit + +The active search-repair settings now live in: + +```yaml +config/retriex/search_repair.yaml +``` + +Vocabulary-backed search-repair term views remain in: + +```yaml +config/retriex/vocabulary.yaml +parameters: + retriex.vocabulary.config: + views: + search_repair: ... 
+``` + +`SearchRepairConfig` now resolves term lists from explicit per-service YAML keys first, otherwise from the configured vocabulary view. If neither exists, it fails closed. + +## Validation performed in the artifact workspace + +PHP syntax checks passed for: + +```bash +php -l src/Config/QueryEnricherConfig.php +php -l src/Config/SearchRepairConfig.php +php -l src/Config/RetriexEffectiveConfigProvider.php +``` + +Full Symfony commands were not executed in this ZIP workspace because `vendor/` is not included. + +Recommended runtime checks after applying the patch: + +```bash +php bin/console mto:agent:config:validate +php bin/console mto:agent:config:dump-effective --json +php bin/console mto:agent:config:audit-source --details +php bin/console mto:agent:regression:test +``` + +Regression focus: + +- Testomat 808 / 0,02 °dH +- Indikatortyp 300 follow-up +- Indikator price follow-up +- `welcher pockettester ist für Redox messung gut` → `suche im shop` +- product-selection shop fallback for free chlorine +- SSE completion watchdog behavior diff --git a/RETRIEX_PATCH_4_TEMPLATE_VALIDATION_HOTFIX_README.md b/RETRIEX_PATCH_4_TEMPLATE_VALIDATION_HOTFIX_README.md new file mode 100644 index 0000000..d414037 --- /dev/null +++ b/RETRIEX_PATCH_4_TEMPLATE_VALIDATION_HOTFIX_README.md @@ -0,0 +1,28 @@ +# RetrieX Patch 4 Hotfix: SearchRepair pattern-template validation + +This hotfix fixes an overly strict placeholder validation in `SearchRepairConfig`. + +## Problem + +Patch 4 moved SearchRepair patterns to YAML. The new validator rejected every rendered template that still contained a `{` character. This also rejected valid regex quantifiers such as `\d{1,5}`. + +Symfony error example: + +```text +RetrieX search repair pattern template "patterns.accessory_candidate_template" contains unresolved placeholders. +``` + +## Fix + +The validator now only treats named placeholders like `{terms}` or `{model}` as unresolved. Regex quantifiers such as `{1,5}` remain valid. 
+ +No retrieval, prompt, shop, scoring, or SearchRepair behavior is changed. + +## After applying + +```bash +php bin/console cache:clear +php bin/console mto:agent:config:validate +php bin/console mto:agent:config:audit-source --details +php bin/console mto:agent:regression:test +``` diff --git a/RETRIEX_PATCH_5_COMMERCE_QUERY_YAML_ONLY_README.md b/RETRIEX_PATCH_5_COMMERCE_QUERY_YAML_ONLY_README.md new file mode 100644 index 0000000..9cfa2dd --- /dev/null +++ b/RETRIEX_PATCH_5_COMMERCE_QUERY_YAML_ONLY_README.md @@ -0,0 +1,72 @@ +# RetrieX Patch 5: CommerceQueryParser YAML-only + +Basis: aktualisierte `rag-inprogress.zip` nach Patch 4. + +## Ziel + +`CommerceQueryParserConfig` darf keine fachlichen PHP-Defaults mehr enthalten. Alle Commerce-Query-Parser-Werte liegen jetzt in YAML unter: + +```yaml +retriex.commerce_query.config +``` + +## Geaenderte Dateien + +- `config/retriex/commerce.yaml` +- `config/retriex/vocabulary.yaml` +- `config/services.yaml` +- `src/Config/CommerceQueryParserConfig.php` +- `src/Config/RetriexEffectiveConfigProvider.php` + +## Inhalt + +Nach YAML verschoben wurden: + +- bekannte Marken +- zu entfernende Commerce-Phrasen +- Filter-/Stop-Tokens fuer Shop-Suchtexte +- Suchtoken-Korrekturen +- Canonical Maps +- semantische Shop-Suchtokens +- Normalisierung fuer `€` -> ` euro ` +- Trim-Zeichen fuer Query-Texte +- Limits fuer Tokenlaengen und Suchtextaufbau +- Regexe fuer History-Kontext, Preise, Modell-/Zubehoer-Erkennung, Messwerte +- Regex-Templates fuer exakte Token-Entfernung und Brand/Model-Erkennung + +## Wichtig + +- `CommerceQueryParserConfig` enthaelt keine `private const` Defaults mehr. +- Fehlende Pflichtwerte fallen nicht mehr still auf PHP zurueck, sondern erzeugen eine `InvalidArgumentException`. +- Die alten `commerce_query`-Vocabulary-Views/Maps wurden aus `vocabulary.yaml` entfernt, damit nicht zwei YAML-Stellen scheinbar denselben Parser steuern. 
+- `config/services.yaml` injiziert nur noch `%retriex.commerce_query.config%`; die alte Vocabulary-Abhaengigkeit im Parser-Config-Service ist entfernt. + +## Nicht geaendert + +- Retrieval-Scoring +- PromptBuilder +- AgentRunner +- Shop-Matching +- SSE/Job-Logik +- CommerceIntentConfig + +## Lokale Pruefung nach Einspielen + +```bash +php bin/console cache:clear +php bin/console mto:agent:config:validate +php bin/console mto:agent:config:audit-source --details +php bin/console mto:agent:regression:test +``` + +Besonders pruefen: + +- Testomat 808 / 0,02 Grad dH +- Folgefrage Indikatortyp 300 +- Preisfrage zum Indikator +- `welcher pockettester ist fuer Redox messung gut` -> `suche im shop` +- `mit welchem testomat kann ich freies chlor messen` + +## Hinweis + +Im ZIP-Arbeitsverzeichnis konnte ich Symfony-Kommandos nicht ausfuehren, weil `vendor/` nicht enthalten ist. PHP-Syntax der geaenderten PHP-Dateien wurde mit `php -l` geprueft. diff --git a/config/retriex/commerce.yaml b/config/retriex/commerce.yaml index 0b09b94..dd6bc25 100644 --- a/config/retriex/commerce.yaml +++ b/config/retriex/commerce.yaml @@ -11,6 +11,215 @@ parameters: retriex.commerce.search_repair.max_queries: 2 retriex.commerce.search_repair.min_primary_results_without_repair: 2 + # Commerce query parser configuration. + # YAML is the only operative source of truth; PHP must not contain parser defaults. 
+ retriex.commerce_query.config: + known_brands: + - heyl + - horiba + - neomeris + + phrases_to_remove: + - ich suche + - suche + - habt ihr + - gibt es + - gebe mir + - gib mir + - zeige mir + - welches gerät + - welche gerät + - welches modell + - welches ist besser + - welches ist am besten + - alternative + - alternativen + - unter anderem + - u a + - welche + - welcher + - welches + - welchen + - sind + - ist + - geeignet + - geeigent + - verfügbarkeit + - verfuegbarkeit + + filter_search_tokens: + - auch + - noch + - nochmal + - zusätzlich + - dazu + - davon + - stattdessen + - bitte + - gern + - gerne + - zeige + - zeig + - such + - suche + - finde + - find + - mir + - mal + - von + - im + - in + - für + - fuer + - welche + - welcher + - welches + - welchen + - sind + - ist + - geeignet + - geeigent + - verfügbarkeit + - verfuegbarkeit + - prüfe + - pruefe + - den + - die + - das + - der + - dem + - des + - und + - oder + - sowie + - seine + - seinen + - seiner + - seinem + - seines + - siene + - sienen + - siener + - sienem + - sienes + - gebe + - gib + - nenne + - nenn + - preis + - preise + - preisen + - kostet + - kosten + - ua + - also + - gut + - gute + - guten + - guter + - gutes + - passen + - passend + + search_token_corrections: + siene: seine + sienen: seinen + siener: seiner + sienem: seinem + sienes: seines + indicatoren: indikatoren + + search_token_canonical_map: + indikatoren: indikator + indicators: indikator + indicator: indikator + reagenzien: reagenz + reagents: reagenz + reagent: reagenz + produkte: produkt + + semantic_shop_search_tokens: + - indikator + - indicator + - reagenz + - reagent + - zubehör + - zubehor + - ersatzteil + - verbrauchsmaterial + - chemie + - indikatorchemie + - reagenzchemie + - kit + - set + - filter + - pumpe + - pumpenkopf + - motorblock + - lösung + - loesung + - solution + - teststreifen + - gerät + - geraet + - messgerät + - messgeraet + - analysegerät + - analysegeraet + - analysator + - monitor + - 
controller + - system + + normalization: + search: ['€'] + replace: [' euro '] + + text: + trim_characters: + - space + - tab + - lf + - cr + - nul + - vertical_tab + - '-' + - '.' + - ',' + + limits: + min_search_token_length: 1 + min_direct_product_token_length: 1 + direct_product_max_tokens: 4 + model_context_token_window: 4 + min_meaningful_alpha_token_length: 2 + max_shop_search_tokens: 6 + + patterns: + history_context: 'chat|auch|noch|nochmal|zusätzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt' + history_context_value_template: '/\b({fragment})\b/u' + prompt_sanitize: '/[^\p{L}\p{N}\s.,\-]/u' + whitespace_collapse: '/\s+/u' + whitespace_split: '/\s+/u' + history_question: '/^Question:\s*(.+)$/m' + price_between: '/\bzwischen\s+(\d+(?:[.,]\d+)?)\s+und\s+(\d+(?:[.,]\d+)?)\s+euro\b/u' + price_max: '/\b(?:unter|bis|max(?:imal)?)\s+(\d+(?:[.,]\d+)?)\s+euro\b/u' + price_min: '/\b(?:ab|mindestens|min)\s+(\d+(?:[.,]\d+)?)\s+euro\b/u' + price_removal_between: '/\bzwischen\s+\d+(?:[.,]\d+)?\s+und\s+\d+(?:[.,]\d+)?\s*euro\b/u' + price_removal_minmax: '/\b(?:unter|bis|max(?:imal)?|ab|mindestens|min)\s+\d+(?:[.,]\d+)?\s*euro\b/u' + price_removal_intent_template: '/\b(?:{price_pattern})\b/u' + direct_product_digit: '/\d/u' + model_like: '/\b[a-zäöüß][a-zäöüß®\-]*(?:\s+[a-zäöüß][a-zäöüß®\-]*){0,2}\s+\d{2,5}[a-z0-9\-]*\b/u' + accessory_like: '/\b(?:indikator|indicator|reagenz|reagent|kit|set)\s+\d{1,5}[a-z0-9\-]*\b/u' + contains_digit: '/\d/u' + model_number_token: '/^(?:\d{2,5}[a-z0-9\-]*|[a-z]{1,6}\d{1,5}[a-z0-9\-]*)$/u' + model_context_token: '/^[\p{L}][\p{L}0-9®\-]{2,}$/u' + model_suffix_token: '/^[a-z]{1,4}\d{0,3}$/u' + instruction_or_presentation_token: 
'/^(?:zeig(?:e)?|such(?:e)?|find(?:e)?|gib|gebe|nenn(?:e)?|liefer(?:e)?|erstelle?|mach(?:e)?|brauch(?:e)?|will|möchte|moechte|hätte|haette|kannst|bitte|mal|alle|alles|komplett|vollständig|vollstaendig|gesamt|ganze|ganzen|liste|listung|auflistung|tabelle|tabellarisch|übersicht|uebersicht|anzeigen?|ausgeben?|darstellen?|antwort(?:e)?|erklär(?:e)?|erklaer(?:e)?|info|infos|informationen|dazu|hierzu|damit|davon|an|als|mit|ohne|inkl|inklusive|also|gut|gute|guten|guter|gutes|passend|passen)$/u' + measurement_value_token: '/^\d+[.,]\d+$/u' + exact_token_removal_template: '/\b{token}\b/u' + brand_part_of_model_template: '/\b{brand}\s+\d{2,5}[a-z0-9\-]*\b/u' + # Shop matching and presentation configuration. # Defaults are intentionally identical to the previous PHP values. retriex.shop_matching.config: diff --git a/config/retriex/query_enrichment.yaml b/config/retriex/query_enrichment.yaml index 4641952..2367d0b 100644 --- a/config/retriex/query_enrichment.yaml +++ b/config/retriex/query_enrichment.yaml @@ -1,5 +1,5 @@ # Query enrichment vocabulary. -# Defaults are intentionally identical to the previous PHP mapping. +# YAML is the source of truth; QueryEnricherConfig intentionally contains no PHP defaults. parameters: retriex.query_enrichment.config: max_expansions: 4 diff --git a/config/retriex/search_repair.yaml b/config/retriex/search_repair.yaml new file mode 100644 index 0000000..bdf2fb8 --- /dev/null +++ b/config/retriex/search_repair.yaml @@ -0,0 +1,66 @@ +# Shop search-repair configuration. +# YAML is the source of truth; SearchRepairConfig intentionally contains no PHP defaults. 
+parameters: + retriex.search_repair.config: + strict_requested_accessory_code_repair: true + prefer_prompt_anchored_model_for_requested_accessory_code: true + + requested_accessory_code_fallback_query_templates: + - '{term} {code}' + requested_accessory_code_fallback_terms: + - indikatortyp + - indikator + - indicator + - reagenz + - reagent + requested_accessory_code_context_prefix_terms: + - indikatortyp + - indikator + - indicator + - reagenz + - reagent + requested_accessory_code_proximity_window: 1600 + + specific_model_candidate_patterns: + - '/\b([A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß®\-]*(?:\s+[A-Za-zÄÖÜäöüß0-9][A-Za-zÄÖÜäöüß0-9®\-]*){0,3}\s+\d{2,5}(?:\s+[A-ZÄÖÜ]{1,8})?)\b/u' + model_candidate_exclude_terms: + - indikatortyp + - indikator + - indicator + - reagenz + - reagent + - verfügbarkeit + - verfuegbarkeit + - shop + + limits: + top_product_log_limit: 3 + + # Character codes preserve the previous PHP trim charlist: + # space, tab, LF, CR, NUL, vertical tab, double quote, single quote, + # backtick, dot, comma, semicolon, colon, hyphen. 
+ sanitize_trim_character_codes: [32, 9, 10, 13, 0, 11, 34, 39, 96, 46, 44, 59, 58, 45] + product_key_separator: '|' + + scores: + candidate_digit: 4 + candidate_word_count_cap: 4 + specificity_boost: 3 + primary_query_overlap_threshold: 0.9 + prompt_match_weight: 3 + primary_query_match_weight: 2 + repair_signal_match_weight: 4 + primary_result_order_bonus: 1 + token_intersection_score: 2 + numeric_token_match_score: 4 + + patterns: + model_candidate: '/\b([A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß®\-]*(?:\s+[A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß®\-]*){0,2}\s+\d{2,5}[A-Za-z0-9\-]*)\b/u' + accessory_candidate_template: '/\b((?:{terms})\s+\d{1,5}[A-Za-z0-9\-]*)\b/iu' + requested_accessory_code: '/\b(?:indikator(?:typ)?|indicator(?:\s*type)?|reagenz|reagent)\s*([A-Za-z]{0,3}\s*\d{1,5}[A-Za-z0-9\-]*)\b/iu' + accessory_or_bundle_template: '/\b({terms})\b/iu' + model_like: '/\b[A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß®\-]*(?:\s+[A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß®\-]*){0,2}\s+\d{2,5}[A-Za-z0-9\-]*\b/u' + specificity_boost_template: '/\b(?:{terms})\b/iu' + contains_digit: '/\d/u' + whitespace_collapse: '/\s+/u' + tokenize_cleanup: '/[^\p{L}\p{N}\s\-]+/u' diff --git a/config/retriex/vocabulary.yaml b/config/retriex/vocabulary.yaml index ac43b4e..515cb4c 100644 --- a/config/retriex/vocabulary.yaml +++ b/config/retriex/vocabulary.yaml @@ -1,37 +1,6 @@ # Central domain vocabulary for RetrieX. # Views preserve the previous 1.4.2-tuned ordering exactly; per-service configs may still override them. 
parameters: - retriex.commerce_query.config: {} - retriex.search_repair.config: - strict_requested_accessory_code_repair: true - prefer_prompt_anchored_model_for_requested_accessory_code: true - requested_accessory_code_pattern: '/\b(?:indikator(?:typ)?|indicator(?:\s*type)?|reagenz|reagent)\s*([A-Za-z]{0,3}\s*\d{1,5}[A-Za-z0-9\-]*)\b/iu' - requested_accessory_code_fallback_query_templates: - - '{term} {code}' - requested_accessory_code_fallback_terms: - - indikatortyp - - indikator - - indicator - - reagenz - - reagent - requested_accessory_code_context_prefix_terms: - - indikatortyp - - indikator - - indicator - - reagenz - - reagent - requested_accessory_code_proximity_window: 1600 - specific_model_candidate_patterns: - - '/\b([A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß®\-]*(?:\s+[A-Za-zÄÖÜäöüß0-9][A-Za-zÄÖÜäöüß0-9®\-]*){0,3}\s+\d{2,5}(?:\s+[A-ZÄÖÜ]{1,8})?)\b/u' - model_candidate_exclude_terms: - - indikatortyp - - indikator - - indicator - - reagenz - - reagent - - verfügbarkeit - - verfuegbarkeit - - shop retriex.vocabulary.config: classes: device: @@ -231,147 +200,6 @@ parameters: - filter - pumpenkopf - motorblock - commerce_query: - known_brands: - add: - - heyl - - horiba - - neomeris - phrases_to_remove: - add: - - ich suche - - suche - - habt ihr - - gibt es - - gebe mir - - gib mir - - zeige mir - - welches gerät - - welche gerät - - welches modell - - welches ist besser - - welches ist am besten - - alternative - - alternativen - - unter anderem - - u a - - welche - - welcher - - welches - - welchen - - sind - - ist - - geeignet - - geeigent - - verfügbarkeit - - verfuegbarkeit - filter_search_tokens: - add: - - auch - - noch - - nochmal - - zusätzlich - - dazu - - davon - - stattdessen - - bitte - - gern - - gerne - - zeige - - zeig - - such - - suche - - finde - - find - - mir - - mal - - von - - im - - in - - für - - fuer - - welche - - welcher - - welches - - welchen - - sind - - ist - - geeignet - - geeigent - - verfügbarkeit - - verfuegbarkeit - - prüfe - - 
pruefe - - den - - die - - das - - der - - dem - - des - - und - - oder - - sowie - - seine - - seinen - - seiner - - seinem - - seines - - siene - - sienen - - siener - - sienem - - sienes - - gebe - - gib - - nenne - - nenn - - preis - - preise - - preisen - - kostet - - kosten - - ua - - also - - gut - - gute - - guten - - guter - - gutes - - passen - - passend - semantic_shop_search_tokens: - add: - - indikator - - indicator - - reagenz - - reagent - - zubehör - - zubehor - - ersatzteil - - verbrauchsmaterial - - chemie - - indikatorchemie - - reagenzchemie - - kit - - set - - filter - - pumpe - - pumpenkopf - - motorblock - - lösung - - loesung - - solution - - teststreifen - - gerät - - geraet - - messgerät - - messgeraet - - analysegerät - - analysegeraet - - analysator - - monitor - - controller - - system retrieval: generic_product_tokens: add: @@ -660,19 +488,3 @@ parameters: - service set - serviceset - service-set - commerce_query: - search_token_corrections: - siene: seine - sienen: seinen - siener: seiner - sienem: seinem - sienes: seines - indicatoren: indikatoren - search_token_canonical: - indikatoren: indikator - indicators: indikator - indicator: indikator - reagenzien: reagenz - reagents: reagenz - reagent: reagenz - produkte: produkt diff --git a/config/services.yaml b/config/services.yaml index 83c51e4..71b0ab8 100644 --- a/config/services.yaml +++ b/config/services.yaml @@ -9,6 +9,7 @@ imports: - { resource: 'retriex/retrieval.yaml' } - { resource: 'retriex/language.yaml' } - { resource: 'retriex/query_enrichment.yaml' } + - { resource: 'retriex/search_repair.yaml' } - { resource: 'retriex/vocabulary.yaml' } - { resource: 'retriex/intent.yaml' } @@ -129,7 +130,6 @@ services: App\Config\PromptBuilderConfig: arguments: $config: '%retriex.prompt.config%' - $vocabulary: '@App\Config\DomainVocabularyConfig' App\Config\AgentRunnerConfig: arguments: @@ -138,7 +138,6 @@ services: App\Config\NdjsonHybridRetrieverConfig: arguments: $config: 
'%retriex.retrieval.config%' - $vocabulary: '@App\Config\DomainVocabularyConfig' App\Config\StopWordsConfig: arguments: @@ -151,7 +150,6 @@ services: App\Config\ShopServiceConfig: arguments: $config: '%retriex.shop_matching.config%' - $vocabulary: '@App\Config\DomainVocabularyConfig' App\Infrastructure\OllamaClient: arguments: @@ -199,7 +197,6 @@ services: App\Config\CommerceQueryParserConfig: arguments: $config: '%retriex.commerce_query.config%' - $vocabulary: '@App\Config\DomainVocabularyConfig' App\Commerce\CommerceQueryParser: ~ @@ -209,7 +206,6 @@ services: $maxRepairQueries: '%retriex.commerce.search_repair.max_queries%' $minPrimaryResultsWithoutRepair: '%retriex.commerce.search_repair.min_primary_results_without_repair%' $config: '%retriex.search_repair.config%' - $vocabulary: '@App\Config\DomainVocabularyConfig' App\Commerce\SearchRepairService: ~ diff --git a/src/Config/CommerceQueryParserConfig.php b/src/Config/CommerceQueryParserConfig.php index 9ab976c..7a99566 100644 --- a/src/Config/CommerceQueryParserConfig.php +++ b/src/Config/CommerceQueryParserConfig.php @@ -4,243 +4,60 @@ declare(strict_types=1); namespace App\Config; +use InvalidArgumentException; + final class CommerceQueryParserConfig { - private const KNOWN_BRANDS = [ - 'heyl', - 'horiba', - 'neomeris', - ]; - - private const PHRASES_TO_REMOVE = [ - 'ich suche', - 'suche', - 'habt ihr', - 'gibt es', - 'gebe mir', - 'gib mir', - 'zeige mir', - 'welches gerät', - 'welche gerät', - 'welches modell', - 'welches ist besser', - 'welches ist am besten', - 'alternative', - 'alternativen', - 'unter anderem', - 'u a', - 'welche', - 'welcher', - 'welches', - 'welchen', - 'sind', - 'ist', - 'geeignet', - 'geeigent', - 'verfügbarkeit', - 'verfuegbarkeit', - ]; - - private const FILTER_SEARCH_TOKENS = [ - 'auch', - 'noch', - 'nochmal', - 'zusätzlich', - 'dazu', - 'davon', - 'stattdessen', - 'bitte', - 'gern', - 'gerne', - 'zeige', - 'zeig', - 'such', - 'suche', - 'finde', - 'find', - 'mir', - 'mal', - 
'von', - 'im', - 'in', - 'für', - 'fuer', - 'welche', - 'welcher', - 'welches', - 'welchen', - 'sind', - 'ist', - 'geeignet', - 'geeigent', - 'verfügbarkeit', - 'verfuegbarkeit', - 'prüfe', - 'pruefe', - 'den', - 'die', - 'das', - 'der', - 'dem', - 'des', - 'und', - 'oder', - 'sowie', - 'seine', - 'seinen', - 'seiner', - 'seinem', - 'seines', - 'siene', - 'sienen', - 'siener', - 'sienem', - 'sienes', - 'gebe', - 'gib', - 'nenne', - 'nenn', - 'preis', - 'preise', - 'preisen', - 'kostet', - 'kosten', - 'ua', - 'also', - 'gut', - 'gute', - 'guten', - 'guter', - 'gutes', - 'passen', - 'passend', - ]; - - private const SEARCH_TOKEN_CORRECTIONS = [ - 'siene' => 'seine', - 'sienen' => 'seinen', - 'siener' => 'seiner', - 'sienem' => 'seinem', - 'sienes' => 'seines', - 'indicatoren' => 'indikatoren', - ]; - - private const SEARCH_TOKEN_CANONICAL_MAP = [ - 'indikatoren' => 'indikator', - 'indicators' => 'indikator', - 'indicator' => 'indikator', - 'reagenzien' => 'reagenz', - 'reagents' => 'reagenz', - 'reagent' => 'reagenz', - 'produkte' => 'produkt', - ]; - - private const SEMANTIC_SHOP_SEARCH_TOKENS = [ - 'indikator', - 'indicator', - 'reagenz', - 'reagent', - 'zubehör', - 'zubehor', - 'ersatzteil', - 'verbrauchsmaterial', - 'chemie', - 'indikatorchemie', - 'reagenzchemie', - 'kit', - 'set', - 'filter', - 'pumpe', - 'pumpenkopf', - 'motorblock', - 'lösung', - 'loesung', - 'solution', - 'teststreifen', - 'gerät', - 'geraet', - 'messgerät', - 'messgeraet', - 'analysegerät', - 'analysegeraet', - 'analysator', - 'monitor', - 'controller', - 'system', - ]; - /** * @param array $config */ public function __construct( private readonly array $config = [], - private readonly ?DomainVocabularyConfig $vocabulary = null, ) { } - /** - * @return string[] - */ + /** @return string[] */ public function getKnownBrands(): array { - return $this->stringList( - 'known_brands', - $this->vocabularyView('commerce_query.known_brands', self::KNOWN_BRANDS) - ); + return 
$this->stringList('known_brands'); } - /** - * @return string[] - */ + /** @return string[] */ public function getPhrasesToRemove(): array { - return $this->stringList( - 'phrases_to_remove', - $this->vocabularyView('commerce_query.phrases_to_remove', self::PHRASES_TO_REMOVE) - ); + return $this->stringList('phrases_to_remove'); } public function getHistoryContextPattern(): string { - return 'chat|auch|noch|nochmal|zusätzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt'; + return $this->string('patterns.history_context'); } public function getHistoryContextValuePattern(): string { - return '/\b(' . $this->getHistoryContextPattern() . ')\b/u'; + return $this->renderTemplate( + $this->string('patterns.history_context_value_template'), + ['fragment' => $this->getHistoryContextPattern()], + 'patterns.history_context_value_template' + ); } - /** - * @return string[] - */ + /** @return string[] */ public function getFilterSearchTokens(): array { - return $this->stringList( - 'filter_search_tokens', - $this->vocabularyView('commerce_query.filter_search_tokens', self::FILTER_SEARCH_TOKENS) - ); + return $this->stringList('filter_search_tokens'); } - /** - * @return array - */ + /** @return array */ public function getSearchTokenCorrections(): array { - return $this->stringMap( - 'search_token_corrections', - $this->vocabularyStringMap('commerce_query.search_token_corrections', self::SEARCH_TOKEN_CORRECTIONS) - ); + return $this->stringMap('search_token_corrections'); } - /** - * @return array - */ + /** @return array */ public function getSearchTokenCanonicalMap(): array { - return $this->stringMap( - 'search_token_canonical_map', - $this->vocabularyStringMap('commerce_query.search_token_canonical', self::SEARCH_TOKEN_CANONICAL_MAP) - ); + return $this->stringMap('search_token_canonical_map'); } /** @@ -253,181 +70,190 @@ final class CommerceQueryParserConfig return 
$this->getFilterSearchTokens(); } - /** - * @return string[] - */ + /** @return string[] */ public function getNormalizationSearch(): array { - return ['€']; + return $this->stringList('normalization.search', true); } - /** - * @return string[] - */ + /** @return string[] */ public function getNormalizationReplace(): array { - return [' euro ']; + return $this->stringList('normalization.replace', true); } public function getPromptSanitizePattern(): string { - return '/[^\p{L}\p{N}\s.,\-]/u'; + return $this->string('patterns.prompt_sanitize'); } public function getWhitespaceCollapsePattern(): string { - return '/\s+/u'; + return $this->string('patterns.whitespace_collapse'); } public function getWhitespaceSplitPattern(): string { - return '/\s+/u'; + return $this->string('patterns.whitespace_split'); } public function getSearchTextTrimCharacters(): string { - return " \t\n\r\0\x0B-.,"; + $characters = ''; + foreach ($this->stringList('text.trim_characters') as $item) { + $characters .= match ($item) { + 'space' => ' ', + 'tab' => "\t", + 'lf' => "\n", + 'cr' => "\r", + 'nul' => "\0", + 'vertical_tab' => "\x0B", + default => $item, + }; + } + + return $characters; } public function getMinSearchTokenLength(): int { - return 1; + return $this->int('limits.min_search_token_length'); } public function getMinDirectProductTokenLength(): int { - return 1; + return $this->int('limits.min_direct_product_token_length'); } public function getHistoryQuestionPattern(): string { - return '/^Question:\s*(.+)$/m'; + return $this->string('patterns.history_question'); } public function getPriceBetweenPattern(): string { - return '/\bzwischen\s+(\d+(?:[.,]\d+)?)\s+und\s+(\d+(?:[.,]\d+)?)\s+euro\b/u'; + return $this->string('patterns.price_between'); } public function getPriceMaxPattern(): string { - return '/\b(?:unter|bis|max(?:imal)?)\s+(\d+(?:[.,]\d+)?)\s+euro\b/u'; + return $this->string('patterns.price_max'); } public function getPriceMinPattern(): string { - return 
'/\b(?:ab|mindestens|min)\s+(\d+(?:[.,]\d+)?)\s+euro\b/u'; + return $this->string('patterns.price_min'); } - /** - * @return string[] - */ + /** @return string[] */ public function getPriceRemovalPatterns(CommerceIntentConfig $intentConfig): array { return [ - '/\bzwischen\s+\d+(?:[.,]\d+)?\s+und\s+\d+(?:[.,]\d+)?\s*euro\b/u', - '/\b(?:unter|bis|max(?:imal)?|ab|mindestens|min)\s+\d+(?:[.,]\d+)?\s*euro\b/u', - '/\b(?:' . $intentConfig->getPricePattern() . ')\b/u', + $this->string('patterns.price_removal_between'), + $this->string('patterns.price_removal_minmax'), + $this->renderTemplate( + $this->string('patterns.price_removal_intent_template'), + ['price_pattern' => $intentConfig->getPricePattern()], + 'patterns.price_removal_intent_template' + ), ]; } public function getDirectProductDigitPattern(): string { - return '/\d/u'; + return $this->string('patterns.direct_product_digit'); } public function getDirectProductMaxTokens(): int { - return 4; + return $this->int('limits.direct_product_max_tokens'); } public function getModelLikePattern(): string { - return '/\b[a-zäöüß][a-zäöüß®\-]*(?:\s+[a-zäöüß][a-zäöüß®\-]*){0,2}\s+\d{2,5}[a-z0-9\-]*\b/u'; + return $this->string('patterns.model_like'); } public function getAccessoryLikePattern(): string { - return '/\b(?:indikator|indicator|reagenz|reagent|kit|set)\s+\d{1,5}[a-z0-9\-]*\b/u'; + return $this->string('patterns.accessory_like'); } public function getContainsDigitPattern(): string { - return '/\d/u'; + return $this->string('patterns.contains_digit'); } public function getModelNumberTokenPattern(): string { - return '/^(?:\d{2,5}[a-z0-9\-]*|[a-z]{1,6}\d{1,5}[a-z0-9\-]*)$/u'; + return $this->string('patterns.model_number_token'); } public function getModelContextTokenPattern(): string { - return '/^[\p{L}][\p{L}0-9®\-]{2,}$/u'; + return $this->string('patterns.model_context_token'); } public function getModelSuffixTokenPattern(): string { - return '/^[a-z]{1,4}\d{0,3}$/u'; + return 
$this->string('patterns.model_suffix_token'); } public function getModelContextTokenWindow(): int { - return 4; + return $this->int('limits.model_context_token_window'); } public function getMinMeaningfulAlphaTokenLength(): int { - return 2; + return $this->int('limits.min_meaningful_alpha_token_length'); } public function getMaxShopSearchTokens(): int { - return 6; + return $this->int('limits.max_shop_search_tokens'); } public function getInstructionOrPresentationTokenPattern(): string { - return '/^(?:zeig(?:e)?|such(?:e)?|find(?:e)?|gib|gebe|nenn(?:e)?|liefer(?:e)?|erstelle?|mach(?:e)?|brauch(?:e)?|will|möchte|moechte|hätte|haette|kannst|bitte|mal|alle|alles|komplett|vollständig|vollstaendig|gesamt|ganze|ganzen|liste|listung|auflistung|tabelle|tabellarisch|übersicht|uebersicht|anzeigen?|ausgeben?|darstellen?|antwort(?:e)?|erklär(?:e)?|erklaer(?:e)?|info|infos|informationen|dazu|hierzu|damit|davon|an|als|mit|ohne|inkl|inklusive|also|gut|gute|guten|guter|gutes|passend|passen)$/u'; - } - public function getMeasurementValueTokenPattern(): string - { - return '/^\d+[.,]\d+$/u'; + return $this->string('patterns.instruction_or_presentation_token'); } - /** - * Product/category tokens that are useful for Store API search even when they are not next to a model number. - * This is intentionally a semantic allowlist, not a spelling-error blocklist. 
- * - * @return string[] - */ + public function getMeasurementValueTokenPattern(): string + { + return $this->string('patterns.measurement_value_token'); + } + + /** @return string[] */ public function getSemanticShopSearchTokens(): array { - return $this->stringList( - 'semantic_shop_search_tokens', - $this->vocabularyView('commerce_query.semantic_shop_search_tokens', self::SEMANTIC_SHOP_SEARCH_TOKENS) + return $this->stringList('semantic_shop_search_tokens'); + } + + public function buildExactTokenRemovalPattern(string $token): string + { + return $this->renderTemplate( + $this->string('patterns.exact_token_removal_template'), + ['token' => preg_quote($token, '/')], + 'patterns.exact_token_removal_template' ); } - - /** @return string[] */ - private function vocabularyView(string $path, array $fallback): array + public function buildBrandPartOfModelPattern(string $brand): string { - return $this->vocabulary?->view($path, $fallback) ?? $fallback; - } - - /** @return array */ - private function vocabularyStringMap(string $path, array $fallback): array - { - return $this->vocabulary?->stringMap($path, $fallback) ?? 
$fallback; + return $this->renderTemplate( + $this->string('patterns.brand_part_of_model_template'), + ['brand' => preg_quote($brand, '/')], + 'patterns.brand_part_of_model_template' + ); } /** @return string[] */ - private function stringList(string $path, array $default): array + private function stringList(string $path, bool $preserveWhitespace = false): array { - $value = $this->value($path, $default); + $value = $this->value($path); if (!is_array($value)) { - return $default; + throw $this->invalid($path, 'must be a list of non-empty strings'); } $out = []; @@ -436,23 +262,31 @@ final class CommerceQueryParserConfig continue; } - $item = trim((string) $item); - if ($item === '' || in_array($item, $out, true)) { + $item = (string) $item; + if (!$preserveWhitespace) { + $item = trim($item); + } + + if (trim($item) === '' || in_array($item, $out, true)) { continue; } $out[] = $item; } - return $out !== [] ? $out : $default; + if ($out === []) { + throw $this->invalid($path, 'must contain at least one non-empty string'); + } + + return $out; } /** @return array */ - private function stringMap(string $path, array $default): array + private function stringMap(string $path): array { - $value = $this->value($path, $default); + $value = $this->value($path); if (!is_array($value)) { - return $default; + throw $this->invalid($path, 'must be a map of non-empty strings'); } $out = []; @@ -468,15 +302,44 @@ final class CommerceQueryParserConfig } } - return $out !== [] ? 
$out : $default; + if ($out === []) { + throw $this->invalid($path, 'must contain at least one non-empty mapping'); + } + + return $out; } - private function value(string $path, mixed $default): mixed + private function string(string $path): string + { + $value = $this->value($path); + if (!is_scalar($value)) { + throw $this->invalid($path, 'must be a non-empty string'); + } + + $value = (string) $value; + if ($value === '') { + throw $this->invalid($path, 'must be a non-empty string'); + } + + return $value; + } + + private function int(string $path): int + { + $value = $this->value($path); + if (!is_int($value)) { + throw $this->invalid($path, 'must be an integer'); + } + + return $value; + } + + private function value(string $path): mixed { $current = $this->config; foreach (explode('.', $path) as $segment) { if (!is_array($current) || !array_key_exists($segment, $current)) { - return $default; + throw $this->missing($path); } $current = $current[$segment]; @@ -485,13 +348,29 @@ final class CommerceQueryParserConfig return $current; } - public function buildExactTokenRemovalPattern(string $token): string + /** + * @param array $replacements + */ + private function renderTemplate(string $template, array $replacements, string $path): string { - return '/\b' . preg_quote($token, '/') . '\b/u'; + foreach ($replacements as $placeholder => $value) { + $template = str_replace('{' . $placeholder . '}', $value, $template); + } + + if (preg_match('/\{[A-Za-z_][A-Za-z0-9_]*\}/', $template) === 1) { + throw $this->invalid($path, 'contains unresolved placeholders'); + } + + return $template; } - public function buildBrandPartOfModelPattern(string $brand): string + private function missing(string $path): InvalidArgumentException { - return '/\b' . preg_quote($brand, '/') . 
'\s+\d{2,5}[a-z0-9\-]*\b/u'; + return new InvalidArgumentException(sprintf('RetrieX commerce query config "%s" is missing.', $path)); } -} \ No newline at end of file + + private function invalid(string $path, string $reason): InvalidArgumentException + { + return new InvalidArgumentException(sprintf('RetrieX commerce query config "%s" %s.', $path, $reason)); + } +} diff --git a/src/Config/QueryEnricherConfig.php b/src/Config/QueryEnricherConfig.php index 009ef46..c6a17f2 100644 --- a/src/Config/QueryEnricherConfig.php +++ b/src/Config/QueryEnricherConfig.php @@ -4,31 +4,18 @@ declare(strict_types=1); namespace App\Config; +/** + * YAML-backed query-enrichment configuration. + * + * This class intentionally has no PHP fallback values. Missing or invalid + * configuration must be fixed in config/retriex/query_enrichment.yaml. + */ final readonly class QueryEnricherConfig { - /** - * Backwards-compatible fallback vocabulary. - * Active values are loaded from retriex.query_enrichment.config when present. - * - * @var array - */ - private const DEFAULT_ENRICH_QUERY_LIST = [ - 'Wasserhärte' => 'Resthärte', - 'Gerät' => 'Modell', - 'Indikator' => 'Chemie', - 'Seminar' => 'Webinar', - 'Schulung' => 'Seminar', - 'Indikatoren' => 'Indikator', - 'Wasserhärte-Grenzwert' => 'Resthärte', - 'Resthärte-Grenzwert' => 'Wasserhärte', - 'Grenzwert' => 'Überwachungsbereich', - 'store' => 'shop', - ]; - /** * @param array $config */ - public function __construct(private array $config = []) + public function __construct(private array $config) { } @@ -52,11 +39,7 @@ final readonly class QueryEnricherConfig public function getEnrichQueryList(): array { $normalized = []; - $rules = $this->config['rules'] ?? 
self::DEFAULT_ENRICH_QUERY_LIST; - - if (!is_array($rules)) { - $rules = self::DEFAULT_ENRICH_QUERY_LIST; - } + $rules = $this->requiredArray('rules'); foreach ($rules as $key => $value) { if (is_array($value)) { @@ -76,18 +59,16 @@ final readonly class QueryEnricherConfig } } + if ($normalized === []) { + throw new \InvalidArgumentException('RetrieX query enrichment config key "rules" must contain at least one valid enrichment rule.'); + } + return $normalized; } public function getMaxExpansions(): int { - $value = $this->config['max_expansions'] ?? 4; - - if (!is_numeric($value)) { - return 4; - } - - return max(0, (int) $value); + return $this->requiredNonNegativeInt('max_expansions'); } /** @@ -160,6 +141,49 @@ final readonly class QueryEnricherConfig return true; } + /** @return array */ + private function requiredArray(string $key): array + { + if (!array_key_exists($key, $this->config)) { + throw new \InvalidArgumentException(sprintf('Missing required RetrieX query enrichment config key "%s".', $key)); + } + + $value = $this->config[$key]; + + if (!is_array($value)) { + throw new \InvalidArgumentException(sprintf('RetrieX query enrichment config key "%s" must be an array.', $key)); + } + + if ($value === []) { + throw new \InvalidArgumentException(sprintf('RetrieX query enrichment config key "%s" must not be empty.', $key)); + } + + return $value; + } + + private function requiredNonNegativeInt(string $key): int + { + if (!array_key_exists($key, $this->config)) { + throw new \InvalidArgumentException(sprintf('Missing required RetrieX query enrichment config key "%s".', $key)); + } + + $value = $this->config[$key]; + + if (is_int($value)) { + $intValue = $value; + } elseif (is_string($value) && preg_match('/^-?\d+$/', trim($value)) === 1) { + $intValue = (int) trim($value); + } else { + throw new \InvalidArgumentException(sprintf('RetrieX query enrichment config key "%s" must be an integer.', $key)); + } + + if ($intValue < 0) { + throw new 
\InvalidArgumentException(sprintf('RetrieX query enrichment config key "%s" must be greater than or equal to 0.', $key)); + } + + return $intValue; + } + private function normalizePhrase(string $value): string { $value = trim($value); diff --git a/src/Config/RetriexEffectiveConfigProvider.php b/src/Config/RetriexEffectiveConfigProvider.php index 99601eb..8160c1c 100644 --- a/src/Config/RetriexEffectiveConfigProvider.php +++ b/src/Config/RetriexEffectiveConfigProvider.php @@ -522,6 +522,13 @@ final readonly class RetriexEffectiveConfigProvider 'search_token_corrections' => $this->commerceQueryParserConfig->getSearchTokenCorrections(), 'search_token_canonical_map' => $this->commerceQueryParserConfig->getSearchTokenCanonicalMap(), 'semantic_shop_search_tokens' => $this->commerceQueryParserConfig->getSemanticShopSearchTokens(), + 'normalization' => [ + 'search' => $this->commerceQueryParserConfig->getNormalizationSearch(), + 'replace' => $this->commerceQueryParserConfig->getNormalizationReplace(), + ], + 'text' => [ + 'trim_characters_length' => strlen($this->commerceQueryParserConfig->getSearchTextTrimCharacters()), + ], 'limits' => [ 'min_search_token_length' => $this->commerceQueryParserConfig->getMinSearchTokenLength(), 'min_direct_product_token_length' => $this->commerceQueryParserConfig->getMinDirectProductTokenLength(), diff --git a/src/Config/SearchRepairConfig.php b/src/Config/SearchRepairConfig.php index 5057bdb..a245ed2 100644 --- a/src/Config/SearchRepairConfig.php +++ b/src/Config/SearchRepairConfig.php @@ -4,69 +4,24 @@ declare(strict_types=1); namespace App\Config; +/** + * YAML-backed shop search-repair configuration. + * + * This class intentionally has no PHP fallback values. Missing or invalid + * configuration must be fixed in config/retriex/search_repair.yaml and + * config/retriex/vocabulary.yaml. 
+ */ final class SearchRepairConfig { - private const GENERIC_CANDIDATE_TOKENS = [ - 'wasser', - 'messgerät', - 'messgeraet', - 'produkt', - 'geräte', - 'geraete', - 'gerät', - 'geraet', - 'resthärte', - 'resthaerte', - 'preis', - 'infos', - 'wissen', - ]; - - private const ACCESSORY_CANDIDATE_TERMS = [ - 'indikator', - 'indicator', - 'reagenz', - 'reagent', - 'kit', - 'set', - ]; - - private const ACCESSORY_OR_BUNDLE_TERMS = [ - 'passend', - 'passende', - 'zubehor', - 'zubehör', - 'dazu', - 'zusatz', - 'erganzung', - 'ergänzung', - 'indikator', - 'reagenz', - 'kit', - 'set', - 'auch\s+das', - 'mit\s+preis\s+und\s+allen\s+infos', - ]; - - private const SPECIFICITY_BOOST_TERMS = [ - 'indikator', - 'indicator', - 'testomat', - 'tritromat', - 'titromat', - 'reagenz', - 'reagent', - ]; - /** * @param array $config */ public function __construct( - private readonly bool $enabled = true, - private readonly int $maxRepairQueries = 3, - private readonly int $minPrimaryResultsWithoutRepair = 2, - private readonly array $config = [], - private readonly ?DomainVocabularyConfig $vocabulary = null, + private readonly bool $enabled, + private readonly int $maxRepairQueries, + private readonly int $minPrimaryResultsWithoutRepair, + private readonly array $config, + private readonly DomainVocabularyConfig $vocabulary, ) { } @@ -87,257 +42,259 @@ final class SearchRepairConfig public function shouldRestrictRequestedAccessoryCodeRepair(): bool { - return $this->bool('strict_requested_accessory_code_repair', true); + return $this->requiredBool('strict_requested_accessory_code_repair'); } public function shouldPreferPromptAnchoredModelForRequestedAccessoryCode(): bool { - return $this->bool('prefer_prompt_anchored_model_for_requested_accessory_code', true); + return $this->requiredBool('prefer_prompt_anchored_model_for_requested_accessory_code'); } /** @return string[] */ public function getRequestedAccessoryCodeFallbackQueryTemplates(): array { - return $this->stringList( - 
'requested_accessory_code_fallback_query_templates', - ['{term} {code}'] - ); + return $this->requiredStringList('requested_accessory_code_fallback_query_templates'); } /** @return string[] */ public function getRequestedAccessoryCodeFallbackTerms(): array { - return $this->stringList( - 'requested_accessory_code_fallback_terms', - $this->getAccessoryCandidateTerms() - ); + return $this->requiredStringList('requested_accessory_code_fallback_terms'); } /** @return string[] */ public function getRequestedAccessoryCodeContextPrefixTerms(): array { - return $this->stringList( - 'requested_accessory_code_context_prefix_terms', - $this->getAccessoryCandidateTerms() - ); + return $this->requiredStringList('requested_accessory_code_context_prefix_terms'); } public function getRequestedAccessoryCodeProximityWindow(): int { - return $this->int('requested_accessory_code_proximity_window', 1600); + return $this->requiredPositiveInt('requested_accessory_code_proximity_window'); } /** @return string[] */ public function getSpecificModelCandidatePatterns(): array { - return $this->stringList( - 'specific_model_candidate_patterns', - [$this->getModelLikePattern()] - ); + return $this->requiredStringList('specific_model_candidate_patterns'); } /** @return string[] */ public function getModelCandidateExcludeTerms(): array { - return $this->stringList( - 'model_candidate_exclude_terms', - array_merge($this->getAccessoryCandidateTerms(), ['verfuegbarkeit', 'verfügbarkeit', 'shop']) - ); + return $this->requiredStringList('model_candidate_exclude_terms'); } public function getTopProductLogLimit(): int { - return 3; + return $this->requiredNonNegativeInt('limits.top_product_log_limit'); } public function getModelCandidatePattern(): string { - return '/\b([A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß®\-]*(?:\s+[A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß®\-]*){0,2}\s+\d{2,5}[A-Za-z0-9\-]*)\b/u'; + return $this->requiredString('patterns.model_candidate'); } public function getAccessoryCandidatePattern(): string { - 
return '/\b((?:' . implode('|', $this->getAccessoryCandidateTerms()) . ')\s+\d{1,5}[A-Za-z0-9\-]*)\b/iu'; + return $this->renderPatternTemplate( + 'patterns.accessory_candidate_template', + ['terms' => $this->patternAlternation($this->getAccessoryCandidateTerms())] + ); } public function getRequestedAccessoryCodePattern(): string { - $fallbackTerms = array_map( - static fn(string $term): string => preg_quote($term, '/'), - $this->getRequestedAccessoryCodeContextPrefixTerms() - ); - $fallbackTerms = array_filter($fallbackTerms, static fn(string $term): bool => $term !== ''); - - $fallbackPattern = $fallbackTerms === [] - ? '/\b([A-Za-z]{0,3}\s*\d{1,5}[A-Za-z0-9\-]*)\b/iu' - : '/\b(?:' . implode('|', $fallbackTerms) . ')\s*([A-Za-z]{0,3}\s*\d{1,5}[A-Za-z0-9\-]*)\b/iu'; - - return $this->string('requested_accessory_code_pattern', $fallbackPattern); + return $this->requiredString('patterns.requested_accessory_code'); } public function getAccessoryOrBundlePattern(): string { - return '/\b(' . implode('|', $this->getAccessoryOrBundleTerms()) . ')\b/iu'; + return $this->renderPatternTemplate( + 'patterns.accessory_or_bundle_template', + ['terms' => $this->patternAlternation($this->getAccessoryOrBundleTerms())] + ); } public function getModelLikePattern(): string { - return '/\b[A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß®\-]*(?:\s+[A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß®\-]*){0,2}\s+\d{2,5}[A-Za-z0-9\-]*\b/u'; + return $this->requiredString('patterns.model_like'); } public function getSpecificityBoostPattern(): string { - return '/\b(?:' . implode('|', $this->getSpecificityBoostTerms()) . 
')\b/iu'; + return $this->renderPatternTemplate( + 'patterns.specificity_boost_template', + ['terms' => $this->patternAlternation($this->getSpecificityBoostTerms())] + ); } /** @return string[] */ public function getGenericCandidateTokens(): array { - return $this->stringList( + return $this->configOrVocabularyStringList( 'generic_candidate_tokens', - $this->vocabularyView('search_repair.generic_candidate_tokens', self::GENERIC_CANDIDATE_TOKENS) + 'search_repair.generic_candidate_tokens' ); } public function getSanitizeTrimCharacters(): string { - return " \t\n\r\0\x0B\"'`.,;:-"; + return implode('', array_map( + static fn (int $code): string => chr($code), + $this->requiredCharacterCodes('sanitize_trim_character_codes') + )); } public function getContainsDigitPattern(): string { - return '/\d/u'; + return $this->requiredString('patterns.contains_digit'); } public function getWhitespaceCollapsePattern(): string { - return '/\s+/u'; + return $this->requiredString('patterns.whitespace_collapse'); } public function getTokenizeCleanupPattern(): string { - return '/[^\p{L}\p{N}\s\-]+/u'; + return $this->requiredString('patterns.tokenize_cleanup'); } public function getProductKeySeparator(): string { - return '|'; + return $this->requiredString('product_key_separator'); } public function getCandidateDigitScore(): int { - return 4; + return $this->requiredInt('scores.candidate_digit'); } public function getCandidateWordCountCap(): int { - return 4; + return $this->requiredPositiveInt('scores.candidate_word_count_cap'); } public function getSpecificityBoostScore(): int { - return 3; + return $this->requiredInt('scores.specificity_boost'); } public function getPrimaryQueryOverlapThreshold(): float { - return 0.9; + return $this->requiredFloat('scores.primary_query_overlap_threshold'); } public function getPromptMatchWeight(): int { - return 3; + return $this->requiredInt('scores.prompt_match_weight'); } public function getPrimaryQueryMatchWeight(): int { - return 2; + 
return $this->requiredInt('scores.primary_query_match_weight'); } public function getRepairSignalMatchWeight(): int { - return 4; + return $this->requiredInt('scores.repair_signal_match_weight'); } public function getPrimaryResultOrderBonus(): int { - return 1; + return $this->requiredInt('scores.primary_result_order_bonus'); } public function getTokenIntersectionScore(): int { - return 2; + return $this->requiredInt('scores.token_intersection_score'); } public function getNumericTokenMatchScore(): int { - return 4; + return $this->requiredInt('scores.numeric_token_match_score'); } /** @return string[] */ public function getAccessoryCandidateTerms(): array { - return $this->stringList( + return $this->configOrVocabularyStringList( 'accessory_candidate_terms', - $this->vocabularyView('search_repair.accessory_candidate_terms', self::ACCESSORY_CANDIDATE_TERMS) + 'search_repair.accessory_candidate_terms' ); } /** @return string[] */ public function getAccessoryOrBundleTerms(): array { - return $this->stringList( + return $this->configOrVocabularyStringList( 'accessory_or_bundle_terms', - $this->vocabularyView('search_repair.accessory_or_bundle_terms', self::ACCESSORY_OR_BUNDLE_TERMS) + 'search_repair.accessory_or_bundle_terms' ); } /** @return string[] */ public function getSpecificityBoostTerms(): array { - return $this->stringList( + return $this->configOrVocabularyStringList( 'specificity_boost_terms', - $this->vocabularyView('search_repair.specificity_boost_terms', self::SPECIFICITY_BOOST_TERMS) + 'search_repair.specificity_boost_terms' ); } /** @return string[] */ - private function vocabularyView(string $path, array $fallback): array + private function configOrVocabularyStringList(string $configKey, string $vocabularyPath): array { - return $this->vocabulary?->view($path, $fallback) ?? $fallback; - } - - private function string(string $key, string $default): string - { - $value = $this->config[$key] ?? 
$default; - - if (!is_scalar($value)) { - return $default; + if (array_key_exists($configKey, $this->config)) { + return $this->requiredStringList($configKey); } - $value = trim((string) $value); - - return $value !== '' ? $value : $default; - } - - private function int(string $key, int $default): int - { - $value = $this->config[$key] ?? $default; - - if (is_int($value)) { - return $value; + $items = $this->vocabulary->view($vocabularyPath, []); + if ($items === []) { + throw new \InvalidArgumentException(sprintf( + 'Missing required RetrieX search repair vocabulary view "%s".', + $vocabularyPath + )); } - if (is_numeric($value)) { - return (int) $value; - } - - return $default; + return $items; } - private function bool(string $key, bool $default): bool + /** @param array $variables */ + private function renderPatternTemplate(string $path, array $variables): string { - $value = $this->config[$key] ?? $default; + $template = $this->requiredString($path); + + foreach ($variables as $key => $value) { + $template = str_replace('{' . $key . 
'}', $value, $template); + } + + if (preg_match('/\{[A-Za-z_][A-Za-z0-9_]*\}/', $template) === 1) { + throw new \InvalidArgumentException(sprintf('RetrieX search repair pattern template "%s" contains unresolved placeholders.', $path)); + } + + return $template; + } + + /** @param string[] $terms */ + private function patternAlternation(array $terms): string + { + $terms = array_values(array_filter( + array_map(static fn (string $term): string => trim($term), $terms), + static fn (string $term): bool => $term !== '' + )); + + if ($terms === []) { + throw new \InvalidArgumentException('RetrieX search repair pattern alternation requires at least one term.'); + } + + return implode('|', $terms); + } + + private function requiredBool(string $path): bool + { + $value = $this->requiredValue($path); if (is_bool($value)) { return $value; @@ -357,15 +314,80 @@ final class SearchRepairConfig } } - return $default; + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" must be boolean.', $path)); + } + + private function requiredString(string $path): string + { + $value = $this->requiredValue($path); + + if (!is_scalar($value)) { + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" must be a string.', $path)); + } + + $value = trim((string) $value); + if ($value === '') { + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" must not be empty.', $path)); + } + + return $value; + } + + private function requiredInt(string $path): int + { + $value = $this->requiredValue($path); + + if (is_int($value)) { + return $value; + } + + if (is_string($value) && preg_match('/^-?\d+$/', trim($value)) === 1) { + return (int) trim($value); + } + + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" must be an integer.', $path)); + } + + private function requiredNonNegativeInt(string $path): int + { + $value = $this->requiredInt($path); + + if ($value < 0) { + throw new 
\InvalidArgumentException(sprintf('RetrieX search repair config key "%s" must be greater than or equal to 0.', $path)); + } + + return $value; + } + + private function requiredPositiveInt(string $path): int + { + $value = $this->requiredInt($path); + + if ($value <= 0) { + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" must be greater than 0.', $path)); + } + + return $value; + } + + private function requiredFloat(string $path): float + { + $value = $this->requiredValue($path); + + if (is_int($value) || is_float($value) || (is_string($value) && is_numeric(trim($value)))) { + return (float) $value; + } + + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" must be numeric.', $path)); } /** @return string[] */ - private function stringList(string $key, array $default): array + private function requiredStringList(string $path): array { - $value = $this->config[$key] ?? $default; + $value = $this->requiredValue($path); + if (!is_array($value)) { - return $default; + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" must be a list.', $path)); } $out = []; @@ -382,6 +404,57 @@ final class SearchRepairConfig $out[] = $item; } - return $out !== [] ? 
$out : $default; + if ($out === []) { + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" must not be empty.', $path)); + } + + return $out; + } + + /** @return int[] */ + private function requiredCharacterCodes(string $path): array + { + $value = $this->requiredValue($path); + + if (!is_array($value)) { + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" must be a list of character codes.', $path)); + } + + $codes = []; + foreach ($value as $item) { + if (is_int($item)) { + $code = $item; + } elseif (is_string($item) && preg_match('/^\d+$/', trim($item)) === 1) { + $code = (int) trim($item); + } else { + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" contains a non-integer character code.', $path)); + } + + if ($code < 0 || $code > 255) { + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" contains an invalid character code.', $path)); + } + + $codes[] = $code; + } + + if ($codes === []) { + throw new \InvalidArgumentException(sprintf('RetrieX search repair config key "%s" must not be empty.', $path)); + } + + return $codes; + } + + private function requiredValue(string $path): mixed + { + $current = $this->config; + foreach (explode('.', $path) as $segment) { + if (!is_array($current) || !array_key_exists($segment, $current)) { + throw new \InvalidArgumentException(sprintf('Missing required RetrieX search repair config key "%s".', $path)); + } + + $current = $current[$segment]; + } + + return $current; } }