diff --git a/RETRIEX_VOCABULARY_FIX_README.md b/RETRIEX_VOCABULARY_FIX_README.md new file mode 100644 index 0000000..7250fdb --- /dev/null +++ b/RETRIEX_VOCABULARY_FIX_README.md @@ -0,0 +1,29 @@ +# RetrieX Vocabulary Centralization Fix + +This patch centralizes the growing recognition word lists without changing their tuned content. + +## Main changes + +- Added `config/retriex/vocabulary.yaml`. +- Added `App\Config\DomainVocabularyConfig`. +- Wired the vocabulary facade into: + - `ShopServiceConfig` + - `NdjsonHybridRetrieverConfig` + - `PromptBuilderConfig` + - `CommerceQueryParserConfig` +- Moved the active Shop and Retrieval vocabulary defaults out of `commerce.yaml` and `retrieval.yaml` into `vocabulary.yaml`. +- Kept all old per-service config keys as explicit overrides. +- Removed direct `NdjsonHybridRetrieverConfig::...` constant usage inside `NdjsonHybridRetriever` so effective config getters are used consistently. + +## Stability note + +The vocabulary views preserve the previous order and content of the tuned lists. +No new semantic terms were added to the critical retrieval and shop matching views. + +Required regression baseline: + +- `Was ist der niedrigste Grenzwert für die Wasserhärte, welcher mit einem Testomaten überwacht werden kann?` + - expected: `0,02 °dH (Testomat 808)` +- `mit welchem indikator wird der wert gemessen` + - expected: `Indikatortyp 300` +- Store query with `0,02` must preserve the decimal value and must not turn it into `02`. 
diff --git a/config/retriex/commerce.yaml b/config/retriex/commerce.yaml index 4b986d5..2b6713b 100644 --- a/config/retriex/commerce.yaml +++ b/config/retriex/commerce.yaml @@ -16,166 +16,8 @@ parameters: retriex.shop_matching.config: top_product_log_limit: 3 - device_query_keywords: - - analysegerät - - analysegeraet - - analysegeräte - - analysegeraete - - messgerät - - messgeraet - - messgeräte - - messgeraete - - analysator - - analysatoren - - analyzer - - gerät - - geraet - - geräte - - geraete - - monitor - - monitore - - controller - - gerät für - - geraet fuer - - geräte für - - geraete fuer - - system - - systeme - - anlage - - anlagen - - accessory_query_keywords: - - zubehör - - zubehor - - reagenz - - reagenzien - - reagent - - indikator - - indikatoren - - indicator - - kit - - set - - ersatz - - ersatzteil - - ersatzteile - - verbrauchsmaterial - - consumable - - dazu - - passend - - passende - - passendes - - nachfüll - - nachfuell - - refill - - filter - - pumpenkopf - - motorblock - - service set - - serviceset - - service-set - - accessory_product_keywords: - - reagenz - - reagenzien - - reagent - - indikator - - indikatoren - - indicator - - kit - - set - - verbrauchsmaterial - - consumable - - zubehör - - zubehor - - ersatz - - ersatzteil - - ersatzteile - - nachfüll - - nachfuell - - refill - - lösung - - loesung - - solution - - teststreifen - - test strip - - filter - - pumpenkopf - - motorblock - - service set - - serviceset - - service-set - - device_product_keywords: - - analysegerät - - analysegeraet - - analysegeräte - - analysegeraete - - messgerät - - messgeraet - - messgeräte - - messgeraete - - analysator - - analysatoren - - analyzer - - monitor - - monitore - - controller - - online-analysator - - online analysator - - online-analysegerät - - online analysegeraet - - online-analysegeräte - - online analysegeraete - - online analyzer - - online monitor - - system - - systeme - - anlage - - anlagen - - gerät - - geraet - - geräte - 
- geraete - - device_focus_keywords: - - geräte - - geraete - - gerät - - geraet - - analysegerät - - analysegeraet - - messgerät - - messgeraet - - analysator - - controller - - monitor - - accessory_focus_keywords: - - indikator - - indikatoren - - reagenz - - reagenzien - - zubehör - - zubehor - - ersatzteil - - ersatzteile - - verbrauchsmaterial - - service set - - serviceset - - filter - - pumpenkopf - - motorblock - - accessory_focus_variant_map: - indikator: [indikator, indikatoren] - indikatoren: [indikator, indikatoren] - reagenz: [reagenz, reagenzien] - reagenzien: [reagenz, reagenzien] - ersatzteil: [ersatzteil, ersatzteile] - ersatzteile: [ersatzteil, ersatzteile] - service set: [service set, serviceset, service-set] - serviceset: [service set, serviceset, service-set] - service-set: [service set, serviceset, service-set] + # Vocabulary-backed lists live in config/retriex/vocabulary.yaml. + # The old per-key entries may still be added here to override a specific view. scores: exact_product_number_phrase: 160 diff --git a/config/retriex/retrieval.yaml b/config/retriex/retrieval.yaml index 5fd88d6..35c56c6 100644 --- a/config/retriex/retrieval.yaml +++ b/config/retriex/retrieval.yaml @@ -27,141 +27,8 @@ parameters: focused_product_min_gap: 4.0 focused_product_max_chunks: 4 - generic_product_tokens: - - produkt - - produkte - - produktkarte - - titel - - geraet - - gerät - - messgeraet - - messgerät - - wasser - - haerte - - härte - - resthaerte - - resthärte - - analyse - - analysator - - automat - - online - - messung - - messen - - preis - - preise - - kosten - - info - - infos - - passend - - richtige - - richtiges - - geeignet - - geeignete - - welche - - welcher - - welches - - brauche - - suche - - important_short_model_tokens: [th, tc, tp, tm, ph, rx] - - family_descriptor_tokens: - - evo - - eco - - self - - clean - - mini - - pro - - plus - - basic - - lab - - inline - - compact - - panel - - sc - - looks_like_reagent_tokens: - - indikator - - 
reagenz - - reagens - - laborchemikalie - - chemikalie - - sicherheitsdatenblatt - - sdb - - msds - - ufi - - gebinde - - flasche - - ersatzteil - - zubehoer - - zubehör - - service set - - filtereinsatz - - kerzenfilter - - druckregler - - looks_like_safety_docs: - - sicherheitsdatenblatt - - sdb - - msds - - gefahrenbewertung - - gefahrenpiktogramm - - signalwort - - lagerung - - transport - - clp - - kennzeichnung - - h290 - - pbt - - vpvb - - looks_like_reagent_words: - - indikator - - reagenz - - reagens - - chemie - - chemikalie - - sdb - - sicherheitsdatenblatt - - msds - - flasche - - gebinde - - looks_like_document_words: - - datenblatt - - dokument - - pdf - - handbuch - - manual - - beschreibung - - sdb - - sicherheitsdatenblatt - - msds - - looks_like_safety_words: - - gefahr - - gefahrgut - - clp - - h290 - - sicherheit - - kennzeichnung - - transport - - lagerung - - piktogramm - - looks_like_device_words: - - geraet - - gerät - - messgeraet - - messgerät - - analysator - - automat - - messung - - messen - - ueberwachung - - überwachung - - online - - monitor + # Vocabulary-backed retrieval token lists live in config/retriex/vocabulary.yaml. + # The old per-key entries may still be added here to override a specific view. # Backwards-compatible name for existing config diagnostics. retriex.retrieval.inventory: '%retriex.retrieval.config%' diff --git a/config/retriex/vocabulary.yaml b/config/retriex/vocabulary.yaml new file mode 100644 index 0000000..f1b094b --- /dev/null +++ b/config/retriex/vocabulary.yaml @@ -0,0 +1,597 @@ +# Central domain vocabulary for RetrieX. +# Views preserve the previous 1.4.2-tuned ordering exactly; per-service configs may still override them. 
+parameters: + retriex.commerce_query.config: {} + retriex.vocabulary.config: + classes: + device: + - analysegerät + - analysegeraet + - analysegeräte + - analysegeraete + - messgerät + - messgeraet + - messgeräte + - messgeraete + - analysator + - analysatoren + - analyzer + - gerät + - geraet + - geräte + - geraete + - monitor + - monitore + - controller + - system + - systeme + - anlage + - anlagen + accessory: + - zubehör + - zubehor + - reagenz + - reagenzien + - reagent + - indikator + - indikatoren + - indicator + - kit + - set + - ersatz + - ersatzteil + - ersatzteile + - verbrauchsmaterial + - consumable + - filter + - pumpenkopf + - motorblock + - service set + - serviceset + - service-set + views: + shop: + device_query: + add: + - analysegerät + - analysegeraet + - analysegeräte + - analysegeraete + - messgerät + - messgeraet + - messgeräte + - messgeraete + - analysator + - analysatoren + - analyzer + - gerät + - geraet + - geräte + - geraete + - monitor + - monitore + - controller + - gerät für + - geraet fuer + - geräte für + - geraete fuer + - system + - systeme + - anlage + - anlagen + accessory_query: + add: + - zubehör + - zubehor + - reagenz + - reagenzien + - reagent + - indikator + - indikatoren + - indicator + - kit + - set + - ersatz + - ersatzteil + - ersatzteile + - verbrauchsmaterial + - consumable + - dazu + - passend + - passende + - passendes + - nachfüll + - nachfuell + - refill + - filter + - pumpenkopf + - motorblock + - service set + - serviceset + - service-set + accessory_product: + add: + - reagenz + - reagenzien + - reagent + - indikator + - indikatoren + - indicator + - kit + - set + - verbrauchsmaterial + - consumable + - zubehör + - zubehor + - ersatz + - ersatzteil + - ersatzteile + - nachfüll + - nachfuell + - refill + - lösung + - loesung + - solution + - teststreifen + - test strip + - filter + - pumpenkopf + - motorblock + - service set + - serviceset + - service-set + device_product: + add: + - analysegerät + - 
analysegeraet + - analysegeräte + - analysegeraete + - messgerät + - messgeraet + - messgeräte + - messgeraete + - analysator + - analysatoren + - analyzer + - monitor + - monitore + - controller + - online-analysator + - online analysator + - online-analysegerät + - online analysegeraet + - online-analysegeräte + - online analysegeraete + - online analyzer + - online monitor + - system + - systeme + - anlage + - anlagen + - gerät + - geraet + - geräte + - geraete + device_focus: + add: + - geräte + - geraete + - gerät + - geraet + - analysegerät + - analysegeraet + - messgerät + - messgeraet + - analysator + - controller + - monitor + accessory_focus: + add: + - indikator + - indikatoren + - reagenz + - reagenzien + - zubehör + - zubehor + - ersatzteil + - ersatzteile + - verbrauchsmaterial + - service set + - serviceset + - filter + - pumpenkopf + - motorblock + commerce_query: + known_brands: + add: + - heyl + - horiba + - neomeris + phrases_to_remove: + add: + - ich suche + - suche + - habt ihr + - gibt es + - gebe mir + - gib mir + - zeige mir + - welches gerät + - welche gerät + - welches modell + - welches ist besser + - welches ist am besten + - alternative + - alternativen + - unter anderem + - u a + - welche + - welcher + - welches + - welchen + - sind + - ist + - geeignet + - geeigent + - verfügbarkeit + - verfuegbarkeit + filter_search_tokens: + add: + - auch + - noch + - nochmal + - zusätzlich + - dazu + - davon + - stattdessen + - bitte + - gern + - gerne + - zeige + - zeig + - such + - suche + - finde + - find + - mir + - mal + - von + - im + - in + - für + - fuer + - welche + - welcher + - welches + - welchen + - sind + - ist + - geeignet + - geeigent + - verfügbarkeit + - verfuegbarkeit + - prüfe + - pruefe + - den + - die + - das + - der + - dem + - des + - und + - oder + - sowie + - seine + - seinen + - seiner + - seinem + - seines + - siene + - sienen + - siener + - sienem + - sienes + - gebe + - gib + - nenne + - nenn + - preis + - preise + - 
preisen + - kostet + - kosten + - ua + - also + - gut + - gute + - guten + - guter + - gutes + - passen + - passend + semantic_shop_search_tokens: + add: + - indikator + - indicator + - reagenz + - reagent + - zubehör + - zubehor + - ersatzteil + - verbrauchsmaterial + - chemie + - indikatorchemie + - reagenzchemie + - kit + - set + - filter + - pumpe + - pumpenkopf + - motorblock + - lösung + - loesung + - solution + - teststreifen + - gerät + - geraet + - messgerät + - messgeraet + - analysegerät + - analysegeraet + - analysator + - monitor + - controller + - system + retrieval: + generic_product_tokens: + add: + - produkt + - produkte + - produktkarte + - titel + - geraet + - gerät + - messgeraet + - messgerät + - wasser + - haerte + - härte + - resthaerte + - resthärte + - analyse + - analysator + - automat + - online + - messung + - messen + - preis + - preise + - kosten + - info + - infos + - passend + - richtige + - richtiges + - geeignet + - geeignete + - welche + - welcher + - welches + - brauche + - suche + important_short_model_tokens: + add: + - th + - tc + - tp + - tm + - ph + - rx + family_descriptor_tokens: + add: + - evo + - eco + - self + - clean + - mini + - pro + - plus + - basic + - lab + - inline + - compact + - panel + - sc + looks_like_reagent_tokens: + add: + - indikator + - reagenz + - reagens + - laborchemikalie + - chemikalie + - sicherheitsdatenblatt + - sdb + - msds + - ufi + - gebinde + - flasche + - ersatzteil + - zubehoer + - zubehör + - service set + - filtereinsatz + - kerzenfilter + - druckregler + looks_like_safety_docs: + add: + - sicherheitsdatenblatt + - sdb + - msds + - gefahrenbewertung + - gefahrenpiktogramm + - signalwort + - lagerung + - transport + - clp + - kennzeichnung + - h290 + - pbt + - vpvb + looks_like_reagent_words: + add: + - indikator + - reagenz + - reagens + - chemie + - chemikalie + - sdb + - sicherheitsdatenblatt + - msds + - flasche + - gebinde + looks_like_document_words: + add: + - datenblatt + - 
dokument + - pdf + - handbuch + - manual + - beschreibung + - sdb + - sicherheitsdatenblatt + - msds + looks_like_safety_words: + add: + - gefahr + - gefahrgut + - clp + - h290 + - sicherheit + - kennzeichnung + - transport + - lagerung + - piktogramm + looks_like_device_words: + add: + - geraet + - gerät + - messgeraet + - messgerät + - analysator + - automat + - messung + - messen + - ueberwachung + - überwachung + - online + - monitor + prompt: + technical_product_keywords: + add: + - technisch + - technical + - produkt + - product + - gerät + - device + - modell + - model + - messprinzip + - measurement principle + - schnittstelle + - interface + - relais + - relay + - indikator + - indicator + - grenzwert + - threshold + - messbereich + - measurement range + - minimaler + - minimum + - resthärte + - resthaerte + - °dh + - dh + - spannung + - voltage + - strom + - current + - druck + - pressure + - temperatur + - temperature + - schutzart + - ip + - fehlercode + - error code + - wasserhärte + - hardness + - testomat + - chlor + - chlormessung + accessory_request_keywords: + add: + - passend + - passende + - passendes + - zubehör + - zubehor + - dazu + - indikator + - reagenz + - kit + - set + - zusatz + - ergänzung + - ergaenzung + maps: + shop: + accessory_focus_variants: + indikator: + - indikator + - indikatoren + indikatoren: + - indikator + - indikatoren + reagenz: + - reagenz + - reagenzien + reagenzien: + - reagenz + - reagenzien + ersatzteil: + - ersatzteil + - ersatzteile + ersatzteile: + - ersatzteil + - ersatzteile + service set: + - service set + - serviceset + - service-set + serviceset: + - service set + - serviceset + - service-set + service-set: + - service set + - serviceset + - service-set + commerce_query: + search_token_corrections: + siene: seine + sienen: seinen + siener: seiner + sienem: seinem + sienes: seines + indicatoren: indikatoren + search_token_canonical: + indikatoren: indikator + indicators: indikator + indicator: indikator + 
reagenzien: reagenz + reagents: reagenz + reagent: reagenz + produkte: produkt diff --git a/config/services.yaml b/config/services.yaml index 76a659b..11616d0 100644 --- a/config/services.yaml +++ b/config/services.yaml @@ -9,6 +9,7 @@ imports: - { resource: 'retriex/retrieval.yaml' } - { resource: 'retriex/language.yaml' } - { resource: 'retriex/query_enrichment.yaml' } + - { resource: 'retriex/vocabulary.yaml' } # ------------------------------------------------------------ # Parameters @@ -112,9 +113,14 @@ services: $retrievalMaxChunks: '%retriex.model.default_retrieval_max_chunks%' $retrievalVectorTopK: '%retriex.model.default_retrieval_vector_top_k%' + App\Config\DomainVocabularyConfig: + arguments: + $config: '%retriex.vocabulary.config%' + App\Config\PromptBuilderConfig: arguments: $config: '%retriex.prompt.config%' + $vocabulary: '@App\Config\DomainVocabularyConfig' App\Config\AgentRunnerConfig: arguments: @@ -123,7 +129,7 @@ services: App\Config\NdjsonHybridRetrieverConfig: arguments: $config: '%retriex.retrieval.config%' - $vocabulary: '%retriex.retrieval.config%' + $vocabulary: '@App\Config\DomainVocabularyConfig' App\Config\StopWordsConfig: arguments: @@ -136,6 +142,7 @@ services: App\Config\ShopServiceConfig: arguments: $config: '%retriex.shop_matching.config%' + $vocabulary: '@App\Config\DomainVocabularyConfig' App\Infrastructure\OllamaClient: arguments: @@ -176,6 +183,11 @@ services: App\Intent\CommerceIntentLite: ~ + App\Config\CommerceQueryParserConfig: + arguments: + $config: '%retriex.commerce_query.config%' + $vocabulary: '@App\Config\DomainVocabularyConfig' + App\Commerce\CommerceQueryParser: ~ App\Config\SearchRepairConfig: diff --git a/src/Config/CommerceQueryParserConfig.php b/src/Config/CommerceQueryParserConfig.php index 3e611c9..9ab976c 100644 --- a/src/Config/CommerceQueryParserConfig.php +++ b/src/Config/CommerceQueryParserConfig.php @@ -6,24 +6,13 @@ namespace App\Config; final class CommerceQueryParserConfig { - /** - * @return 
string[] - */ - public function getKnownBrands(): array - { - return [ + private const KNOWN_BRANDS = [ 'heyl', 'horiba', 'neomeris', ]; - } - /** - * @return string[] - */ - public function getPhrasesToRemove(): array - { - return [ + private const PHRASES_TO_REMOVE = [ 'ich suche', 'suche', 'habt ihr', @@ -51,24 +40,8 @@ final class CommerceQueryParserConfig 'verfügbarkeit', 'verfuegbarkeit', ]; - } - public function getHistoryContextPattern(): string - { - return 'chat|auch|noch|nochmal|zusätzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt'; - } - - public function getHistoryContextValuePattern(): string - { - return '/\b(' . $this->getHistoryContextPattern() . ')\b/u'; - } - - /** - * @return string[] - */ - public function getFilterSearchTokens(): array - { - return [ + private const FILTER_SEARCH_TOKENS = [ 'auch', 'noch', 'nochmal', @@ -142,14 +115,8 @@ final class CommerceQueryParserConfig 'passen', 'passend', ]; - } - /** - * @return array - */ - public function getSearchTokenCorrections(): array - { - return [ + private const SEARCH_TOKEN_CORRECTIONS = [ 'siene' => 'seine', 'sienen' => 'seinen', 'siener' => 'seiner', @@ -157,14 +124,8 @@ final class CommerceQueryParserConfig 'sienes' => 'seines', 'indicatoren' => 'indikatoren', ]; - } - /** - * @return array - */ - public function getSearchTokenCanonicalMap(): array - { - return [ + private const SEARCH_TOKEN_CANONICAL_MAP = [ 'indikatoren' => 'indikator', 'indicators' => 'indikator', 'indicator' => 'indikator', @@ -173,6 +134,113 @@ final class CommerceQueryParserConfig 'reagent' => 'reagenz', 'produkte' => 'produkt', ]; + + private const SEMANTIC_SHOP_SEARCH_TOKENS = [ + 'indikator', + 'indicator', + 'reagenz', + 'reagent', + 'zubehör', + 'zubehor', + 'ersatzteil', + 'verbrauchsmaterial', + 'chemie', + 'indikatorchemie', + 'reagenzchemie', + 'kit', + 'set', + 'filter', + 'pumpe', + 'pumpenkopf', + 
'motorblock', + 'lösung', + 'loesung', + 'solution', + 'teststreifen', + 'gerät', + 'geraet', + 'messgerät', + 'messgeraet', + 'analysegerät', + 'analysegeraet', + 'analysator', + 'monitor', + 'controller', + 'system', + ]; + + /** + * @param array $config + */ + public function __construct( + private readonly array $config = [], + private readonly ?DomainVocabularyConfig $vocabulary = null, + ) { + } + + /** + * @return string[] + */ + public function getKnownBrands(): array + { + return $this->stringList( + 'known_brands', + $this->vocabularyView('commerce_query.known_brands', self::KNOWN_BRANDS) + ); + } + + /** + * @return string[] + */ + public function getPhrasesToRemove(): array + { + return $this->stringList( + 'phrases_to_remove', + $this->vocabularyView('commerce_query.phrases_to_remove', self::PHRASES_TO_REMOVE) + ); + } + + public function getHistoryContextPattern(): string + { + return 'chat|auch|noch|nochmal|zusätzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt'; + } + + public function getHistoryContextValuePattern(): string + { + return '/\b(' . $this->getHistoryContextPattern() . 
')\b/u'; + } + + /** + * @return string[] + */ + public function getFilterSearchTokens(): array + { + return $this->stringList( + 'filter_search_tokens', + $this->vocabularyView('commerce_query.filter_search_tokens', self::FILTER_SEARCH_TOKENS) + ); + } + + /** + * @return array + */ + public function getSearchTokenCorrections(): array + { + return $this->stringMap( + 'search_token_corrections', + $this->vocabularyStringMap('commerce_query.search_token_corrections', self::SEARCH_TOKEN_CORRECTIONS) + ); + } + + /** + * @return array + */ + public function getSearchTokenCanonicalMap(): array + { + return $this->stringMap( + 'search_token_canonical_map', + $this->vocabularyStringMap('commerce_query.search_token_canonical', self::SEARCH_TOKEN_CANONICAL_MAP) + ); } /** @@ -335,39 +403,86 @@ final class CommerceQueryParserConfig */ public function getSemanticShopSearchTokens(): array { - return [ - 'indikator', - 'indicator', - 'reagenz', - 'reagent', - 'zubehör', - 'zubehor', - 'ersatzteil', - 'verbrauchsmaterial', - 'chemie', - 'indikatorchemie', - 'reagenzchemie', - 'kit', - 'set', - 'filter', - 'pumpe', - 'pumpenkopf', - 'motorblock', - 'lösung', - 'loesung', - 'solution', - 'teststreifen', - 'gerät', - 'geraet', - 'messgerät', - 'messgeraet', - 'analysegerät', - 'analysegeraet', - 'analysator', - 'monitor', - 'controller', - 'system', - ]; + return $this->stringList( + 'semantic_shop_search_tokens', + $this->vocabularyView('commerce_query.semantic_shop_search_tokens', self::SEMANTIC_SHOP_SEARCH_TOKENS) + ); + } + + + /** @return string[] */ + private function vocabularyView(string $path, array $fallback): array + { + return $this->vocabulary?->view($path, $fallback) ?? $fallback; + } + + /** @return array */ + private function vocabularyStringMap(string $path, array $fallback): array + { + return $this->vocabulary?->stringMap($path, $fallback) ?? 
$fallback; + } + + /** @return string[] */ + private function stringList(string $path, array $default): array + { + $value = $this->value($path, $default); + if (!is_array($value)) { + return $default; + } + + $out = []; + foreach ($value as $item) { + if (!is_scalar($item)) { + continue; + } + + $item = trim((string) $item); + if ($item === '' || in_array($item, $out, true)) { + continue; + } + + $out[] = $item; + } + + return $out !== [] ? $out : $default; + } + + /** @return array */ + private function stringMap(string $path, array $default): array + { + $value = $this->value($path, $default); + if (!is_array($value)) { + return $default; + } + + $out = []; + foreach ($value as $key => $item) { + if (!is_scalar($key) || !is_scalar($item)) { + continue; + } + + $cleanKey = trim((string) $key); + $cleanValue = trim((string) $item); + if ($cleanKey !== '' && $cleanValue !== '') { + $out[$cleanKey] = $cleanValue; + } + } + + return $out !== [] ? $out : $default; + } + + private function value(string $path, mixed $default): mixed + { + $current = $this->config; + foreach (explode('.', $path) as $segment) { + if (!is_array($current) || !array_key_exists($segment, $current)) { + return $default; + } + + $current = $current[$segment]; + } + + return $current; } public function buildExactTokenRemovalPattern(string $token): string diff --git a/src/Config/DomainVocabularyConfig.php b/src/Config/DomainVocabularyConfig.php new file mode 100644 index 0000000..2b9b4f8 --- /dev/null +++ b/src/Config/DomainVocabularyConfig.php @@ -0,0 +1,196 @@ +value('views.' . $path, null); + if (!is_array($definition)) { + return $this->uniqueStringList($fallback); + } + + $terms = []; + foreach ($this->stringListFromValue($definition['include'] ?? []) as $className) { + foreach ($this->domainClass($className) as $term) { + $terms[] = $term; + } + } + foreach ($this->stringListFromValue($definition['add'] ?? 
[]) as $term) { + $terms[] = $term; + } + + $terms = $this->uniqueStringList($terms); + return $terms !== [] ? $terms : $this->uniqueStringList($fallback); + } + + /** @return string[] */ + public function domainClass(string $name): array + { + return $this->stringList('classes.' . $name, []); + } + + /** @return array */ + public function map(string $path, array $fallback = []): array + { + $value = $this->value('maps.' . $path, null); + if (!is_array($value)) { + return $this->uniqueStringListMap($fallback); + } + + $out = []; + foreach ($value as $key => $items) { + if (!is_scalar($key)) { + continue; + } + $cleanKey = trim((string) $key); + $cleanItems = $this->stringListFromValue($items); + if ($cleanKey !== '' && $cleanItems !== []) { + $out[$cleanKey] = $cleanItems; + } + } + + return $out !== [] ? $out : $this->uniqueStringListMap($fallback); + } + + /** @return array */ + public function stringMap(string $path, array $fallback = []): array + { + $value = $this->value('maps.' . $path, null); + if (!is_array($value)) { + return $this->uniqueStringMap($fallback); + } + + $out = []; + foreach ($value as $key => $mappedValue) { + if (!is_scalar($key)) { + continue; + } + + $cleanKey = trim((string) $key); + if ($cleanKey === '') { + continue; + } + + if (is_array($mappedValue)) { + $items = $this->stringListFromValue($mappedValue); + $mappedValue = $items[0] ?? ''; + } + + if (!is_scalar($mappedValue)) { + continue; + } + + $cleanValue = trim((string) $mappedValue); + if ($cleanValue !== '') { + $out[$cleanKey] = $cleanValue; + } + } + + return $out !== [] ? $out : $this->uniqueStringMap($fallback); + } + + /** @return array */ + public function toArray(): array + { + return $this->config; + } + + /** @return string[] */ + private function stringList(string $path, array $fallback): array + { + $value = $this->value($path, null); + $items = $this->stringListFromValue($value); + return $items !== [] ? 
$items : $this->uniqueStringList($fallback); + } + + /** @return string[] */ + private function stringListFromValue(mixed $value): array + { + if (!is_array($value)) { + return []; + } + + return $this->uniqueStringList($value); + } + + /** @return string[] */ + private function uniqueStringList(array $items): array + { + $out = []; + foreach ($items as $item) { + if (!is_scalar($item)) { + continue; + } + + $item = trim((string) $item); + if ($item === '' || in_array($item, $out, true)) { + continue; + } + + $out[] = $item; + } + + return $out; + } + + /** @return array */ + private function uniqueStringListMap(array $map): array + { + $out = []; + foreach ($map as $key => $items) { + if (!is_scalar($key)) { + continue; + } + + $cleanKey = trim((string) $key); + $cleanItems = $this->uniqueStringList(is_array($items) ? $items : []); + if ($cleanKey !== '' && $cleanItems !== []) { + $out[$cleanKey] = $cleanItems; + } + } + + return $out; + } + + /** @return array */ + private function uniqueStringMap(array $map): array + { + $out = []; + foreach ($map as $key => $value) { + if (!is_scalar($key) || !is_scalar($value)) { + continue; + } + + $cleanKey = trim((string) $key); + $cleanValue = trim((string) $value); + if ($cleanKey !== '' && $cleanValue !== '') { + $out[$cleanKey] = $cleanValue; + } + } + + return $out; + } + + private function value(string $path, mixed $fallback): mixed + { + $current = $this->config; + foreach (explode('.', $path) as $segment) { + if (!is_array($current) || !array_key_exists($segment, $current)) { + return $fallback; + } + + $current = $current[$segment]; + } + + return $current; + } +} diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index 501731c..ce85e6d 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -176,11 +176,10 @@ final class NdjsonHybridRetrieverConfig /** * @param array $config - * @param array $vocabulary Kept for 
backwards-compatible service wiring. */ public function __construct( private array $config = [], - private array $vocabulary = [], + private readonly ?DomainVocabularyConfig $vocabulary = null, ) { } @@ -307,55 +306,55 @@ final class NdjsonHybridRetrieverConfig /** @return string[] */ public function genericProductTokens(): array { - return $this->stringList('generic_product_tokens', self::GENERIC_PRODUCT_TOKEN); + return $this->stringList('generic_product_tokens', $this->vocabularyView('retrieval.generic_product_tokens', self::GENERIC_PRODUCT_TOKEN)); } /** @return string[] */ public function importantShortModelTokens(): array { - return $this->stringList('important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN); + return $this->stringList('important_short_model_tokens', $this->vocabularyView('retrieval.important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN)); } /** @return string[] */ public function familyDescriptorTokens(): array { - return $this->stringList('family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN); + return $this->stringList('family_descriptor_tokens', $this->vocabularyView('retrieval.family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN)); } /** @return string[] */ public function looksLikeReagentTokens(): array { - return $this->stringList('looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS); + return $this->stringList('looks_like_reagent_tokens', $this->vocabularyView('retrieval.looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS)); } /** @return string[] */ public function looksLikeSafetyDocs(): array { - return $this->stringList('looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS); + return $this->stringList('looks_like_safety_docs', $this->vocabularyView('retrieval.looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS)); } /** @return string[] */ public function looksLikeReagentWords(): array { - return $this->stringList('looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS); + return 
$this->stringList('looks_like_reagent_words', $this->vocabularyView('retrieval.looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS)); } /** @return string[] */ public function looksLikeDocumentWords(): array { - return $this->stringList('looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS); + return $this->stringList('looks_like_document_words', $this->vocabularyView('retrieval.looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS)); } /** @return string[] */ public function looksLikeSafetyWords(): array { - return $this->stringList('looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS); + return $this->stringList('looks_like_safety_words', $this->vocabularyView('retrieval.looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS)); } /** @return string[] */ public function looksLikeDeviceWords(): array { - return $this->stringList('looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS); + return $this->stringList('looks_like_device_words', $this->vocabularyView('retrieval.looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS)); } /** * Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps.
@@ -492,10 +491,12 @@ final class NdjsonHybridRetrieverConfig return $this->config[$key]; } - if (array_key_exists($key, $this->vocabulary)) { - return $this->vocabulary[$key]; - } - return $default; } + + /** @return string[] */ + private function vocabularyView(string $path, array $fallback): array + { + return $this->vocabulary?->view($path, $fallback) ?? $fallback; + } } diff --git a/src/Config/PromptBuilderConfig.php b/src/Config/PromptBuilderConfig.php index 1de65a0..58dbc93 100644 --- a/src/Config/PromptBuilderConfig.php +++ b/src/Config/PromptBuilderConfig.php @@ -6,11 +6,74 @@ namespace App\Config; final class PromptBuilderConfig { + private const TECHNICAL_PRODUCT_KEYWORDS = [ + 'technisch', + 'technical', + 'produkt', + 'product', + 'gerät', + 'device', + 'modell', + 'model', + 'messprinzip', + 'measurement principle', + 'schnittstelle', + 'interface', + 'relais', + 'relay', + 'indikator', + 'indicator', + 'grenzwert', + 'threshold', + 'messbereich', + 'measurement range', + 'minimaler', + 'minimum', + 'resthärte', + 'resthaerte', + '°dh', + 'dh', + 'spannung', + 'voltage', + 'strom', + 'current', + 'druck', + 'pressure', + 'temperatur', + 'temperature', + 'schutzart', + 'ip', + 'fehlercode', + 'error code', + 'wasserhärte', + 'hardness', + 'testomat', + 'chlor', + 'chlormessung', + ]; + + private const ACCESSORY_REQUEST_KEYWORDS = [ + 'passend', + 'passende', + 'passendes', + 'zubehör', + 'zubehor', + 'dazu', + 'indikator', + 'reagenz', + 'kit', + 'set', + 'zusatz', + 'ergänzung', + 'ergaenzung', + ]; + /** * @param array $config */ public function __construct( private readonly array $config = [], + private readonly ?DomainVocabularyConfig $vocabulary = null, ) { } @@ -88,6 +151,42 @@ final class PromptBuilderConfig return is_numeric($value) ?
(float) $value : $default; } + /** + * @return string[] + */ + private function getStringList(string $path, array $default): array + { + $value = $this->getValue($path, $default); + + if (!is_array($value)) { + return $default; + } + + $out = []; + foreach ($value as $item) { + if (!is_scalar($item)) { + continue; + } + + $item = trim((string) $item); + if ($item === '' || in_array($item, $out, true)) { + continue; + } + + $out[] = $item; + } + + return $out !== [] ? $out : $default; + } + + /** + * @return string[] + */ + private function vocabularyView(string $path, array $fallback): array + { + return $this->vocabulary?->view($path, $fallback) ?? $fallback; + } + private function getValue(string $path, mixed $default): mixed { $current = $this->config; @@ -445,51 +544,10 @@ final class PromptBuilderConfig */ public function getTechnicalProductKeywords(): array { - return [ - 'technisch', - 'technical', - 'produkt', - 'product', - 'gerät', - 'device', - 'modell', - 'model', - 'messprinzip', - 'measurement principle', - 'schnittstelle', - 'interface', - 'relais', - 'relay', - 'indikator', - 'indicator', - 'grenzwert', - 'threshold', - 'messbereich', - 'measurement range', - 'minimaler', - 'minimum', - 'resthärte', - 'resthaerte', - '°dh', - 'dh', - 'spannung', - 'voltage', - 'strom', - 'current', - 'druck', - 'pressure', - 'temperatur', - 'temperature', - 'schutzart', - 'ip', - 'fehlercode', - 'error code', - 'wasserhärte', - 'hardness', - 'testomat', - 'chlor', - 'chlormessung', - ]; + return $this->getStringList( + 'technical_product_keywords', + $this->vocabularyView('prompt.technical_product_keywords', self::TECHNICAL_PRODUCT_KEYWORDS) + ); } /** @@ -497,21 +555,10 @@ final class PromptBuilderConfig */ public function getAccessoryRequestKeywords(): array { - return [ - 'passend', - 'passende', - 'passendes', - 'zubehör', - 'zubehor', - 'dazu', - 'indikator', - 'reagenz', - 'kit', - 'set', - 'zusatz', - 'ergänzung', - 'ergaenzung', - ]; + return 
$this->getStringList( + 'accessory_request_keywords', + $this->vocabularyView('prompt.accessory_request_keywords', self::ACCESSORY_REQUEST_KEYWORDS) + ); } public function getTechnicalProductModelPattern(): string diff --git a/src/Config/ShopServiceConfig.php b/src/Config/ShopServiceConfig.php index 6e2797f..1c08993 100644 --- a/src/Config/ShopServiceConfig.php +++ b/src/Config/ShopServiceConfig.php @@ -68,8 +68,10 @@ final class ShopServiceConfig /** * @param array $config */ - public function __construct(private array $config = []) - { + public function __construct( + private array $config = [], + private readonly ?DomainVocabularyConfig $vocabulary = null, + ) { } public function getTopProductLogLimit(): int @@ -80,43 +82,43 @@ final class ShopServiceConfig /** @return string[] */ public function getDeviceFocusKeywords(): array { - return $this->stringList('device_focus_keywords', self::DEVICE_FOCUS_KEYWORDS); + return $this->stringList('device_focus_keywords', $this->vocabularyView('shop.device_focus', self::DEVICE_FOCUS_KEYWORDS)); } /** @return string[] */ public function getAccessoryFocusKeywords(): array { - return $this->stringList('accessory_focus_keywords', self::ACCESSORY_FOCUS_KEYWORDS); + return $this->stringList('accessory_focus_keywords', $this->vocabularyView('shop.accessory_focus', self::ACCESSORY_FOCUS_KEYWORDS)); } /** @return array */ public function getAccessoryFocusVariantMap(): array { - return $this->stringListMap('accessory_focus_variant_map', self::ACCESSORY_FOCUS_VARIANT_MAP); + return $this->stringListMap('accessory_focus_variant_map', $this->vocabularyMap('shop.accessory_focus_variants', self::ACCESSORY_FOCUS_VARIANT_MAP)); } /** @return string[] */ public function getDeviceQueryKeywords(): array { - return $this->stringList('device_query_keywords', self::DEVICE_QUERY_KEYWORDS); + return $this->stringList('device_query_keywords', $this->vocabularyView('shop.device_query', self::DEVICE_QUERY_KEYWORDS)); } /** @return string[] */ public 
function getAccessoryQueryKeywords(): array { - return $this->stringList('accessory_query_keywords', self::ACCESSORY_QUERY_KEYWORDS); + return $this->stringList('accessory_query_keywords', $this->vocabularyView('shop.accessory_query', self::ACCESSORY_QUERY_KEYWORDS)); } /** @return string[] */ public function getAccessoryProductKeywords(): array { - return $this->stringList('accessory_product_keywords', self::ACCESSORY_PRODUCT_KEYWORDS); + return $this->stringList('accessory_product_keywords', $this->vocabularyView('shop.accessory_product', self::ACCESSORY_PRODUCT_KEYWORDS)); } /** @return string[] */ public function getDeviceProductKeywords(): array { - return $this->stringList('device_product_keywords', self::DEVICE_PRODUCT_KEYWORDS); + return $this->stringList('device_product_keywords', $this->vocabularyView('shop.device_product', self::DEVICE_PRODUCT_KEYWORDS)); } public function getExactProductNumberPhraseScore(): int @@ -368,6 +370,18 @@ final class ShopServiceConfig * @param string[]|null $emptySafeDefault * @return string[] */ + /** @return string[] */ + private function vocabularyView(string $path, array $fallback): array + { + return $this->vocabulary?->view($path, $fallback) ?? $fallback; + } + + /** @return array */ + private function vocabularyMap(string $path, array $fallback): array + { + return $this->vocabulary?->map($path, $fallback) ?? 
$fallback; + } + private function stringList(string $path, array $default, bool $allowEmptyStrings = false, ?array $emptySafeDefault = null): array { $value = $this->value($path, $default); diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 60f9a45..ea2e1ce 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -1125,7 +1125,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $candidates = []; $seenDocs = []; - foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) { + foreach (array_slice($chunkIds, 0, $this->retrieverConfig->focusedProductWindow()) as $rank => $chunkId) { $row = $rows[$chunkId] ?? null; if (!is_array($row)) { continue; @@ -1171,7 +1171,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $bestScore = (float)$best['score']; $gap = $bestScore - $runnerUpScore; - if ($bestScore < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_SCORE || $gap < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_GAP) { + if ($bestScore < $this->retrieverConfig->focusedProductMinScore() || $gap < $this->retrieverConfig->focusedProductMinGap()) { return null; } @@ -1199,10 +1199,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $normalized = $this->normalizeText($prompt); $tokens = $this->tokenizeText($normalized); - $reagentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_WORDS; - $documentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DOCUMENT_WORDS; - $safetyWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_WORDS; - $deviceWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DEVICE_WORDS; + $reagentWords = $this->retrieverConfig->looksLikeReagentWords(); + $documentWords = $this->retrieverConfig->looksLikeDocumentWords(); + $safetyWords = $this->retrieverConfig->looksLikeSafetyWords(); + 
$deviceWords = $this->retrieverConfig->looksLikeDeviceWords(); $asksReagent = $this->containsAnyToken($tokens, $reagentWords); $asksDocument = $this->containsAnyToken($tokens, $documentWords); @@ -1343,7 +1343,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $documentId, $chunkIds, $rows, - min($limit, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MAX_CHUNKS) + min($limit, $this->retrieverConfig->focusedProductMaxChunks()) ); } @@ -1358,7 +1358,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface { $docWindow = []; - foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::DOMINANT_DOC_WINDOW) as $chunkId) { + foreach (array_slice($chunkIds, 0, $this->retrieverConfig->dominantDocWindow()) as $chunkId) { if (!isset($rows[$chunkId]['text'])) { continue; } @@ -1388,7 +1388,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $dominantCount = (int)($counts[$dominantDocId] ?? 0); - if ($dominantCount >= NdjsonHybridRetrieverConfig::DOMINANT_DOC_MIN_HITS) { + if ($dominantCount >= $this->retrieverConfig->dominantDocMinHits()) { return $dominantDocId; } @@ -1450,7 +1450,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return []; } - $maxFromDoc = min($limit, NdjsonHybridRetrieverConfig::DOMINANT_DOC_MAX_CHUNKS); + $maxFromDoc = min($limit, $this->retrieverConfig->dominantDocMaxChunks()); if ($anchorChunkIndex !== null) { usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int { @@ -1550,13 +1550,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) { + if (($docCounter[$docId] ?? 0) >= $this->retrieverConfig->maxChunksPerDoc()) { continue; } if (is_int($chunkIndex)) { foreach ($docChunkPositions[$docId] ?? 
[] as $prevIdx) { - if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) { + if (abs($prevIdx - $chunkIndex) < $this->retrieverConfig->minChunkDistance()) { continue 2; } } @@ -1609,13 +1609,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) { + if (($docCounter[$docId] ?? 0) >= $this->retrieverConfig->maxChunksPerDoc()) { continue; } if (is_int($chunkIndex)) { foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) { - if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) { + if (abs($prevIdx - $chunkIndex) < $this->retrieverConfig->minChunkDistance()) { continue 2; } } @@ -1715,7 +1715,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface */ private function isGenericProductToken(string $token): bool { - static $generic = NdjsonHybridRetrieverConfig::GENERIC_PRODUCT_TOKEN; + $generic = $this->retrieverConfig->genericProductTokens(); + return isset(array_fill_keys($generic, true)[$token]); } @@ -1724,7 +1725,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface */ private function isImportantShortModelToken(string $token): bool { - static $allowed = NdjsonHybridRetrieverConfig::IMPORTANT_SHORT_MODEL_TOKEN; + $allowed = $this->retrieverConfig->importantShortModelTokens(); return in_array($token, $allowed, true); } @@ -1734,7 +1735,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface */ private function isFamilyDescriptorToken(string $token): bool { - static $familyDescriptors = NdjsonHybridRetrieverConfig::FAMILY_DESCRIPTOR_TOKEN; + $familyDescriptors = $this->retrieverConfig->familyDescriptorTokens(); return in_array($token, $familyDescriptors, true) || $this->isImportantShortModelToken($token) @@ -1752,7 +1753,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return false; } - 
$needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_TOKENS; + $needles = $this->retrieverConfig->looksLikeReagentTokens(); foreach ($needles as $needle) { if (str_contains($haystack, $needle)) { @@ -1774,7 +1775,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return false; } - $needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_DOCS; + $needles = $this->retrieverConfig->looksLikeSafetyDocs(); foreach ($needles as $needle) { if (str_contains($haystack, $needle)) {