central config part 1
This commit is contained in:
29
RETRIEX_VOCABULARY_FIX_README.md
Normal file
29
RETRIEX_VOCABULARY_FIX_README.md
Normal file
@@ -0,0 +1,29 @@
|
||||
# RetrieX Vocabulary Centralization Fix
|
||||
|
||||
This patch centralizes the growing recognition word lists without changing their tuned content.
|
||||
|
||||
## Main changes
|
||||
|
||||
- Added `config/retriex/vocabulary.yaml`.
|
||||
- Added `App\Config\DomainVocabularyConfig`.
|
||||
- Wired the vocabulary facade into:
|
||||
- `ShopServiceConfig`
|
||||
- `NdjsonHybridRetrieverConfig`
|
||||
- `PromptBuilderConfig`
|
||||
- `CommerceQueryParserConfig`
|
||||
- Moved the active Shop and Retrieval vocabulary defaults out of `commerce.yaml` and `retrieval.yaml` into `vocabulary.yaml`.
|
||||
- Kept all old per-service config keys as explicit overrides.
|
||||
- Removed direct `NdjsonHybridRetrieverConfig::...` constant usage inside `NdjsonHybridRetriever` so effective config getters are used consistently.
|
||||
|
||||
## Stability note
|
||||
|
||||
The vocabulary views preserve the previous order and content of the tuned lists.
|
||||
No new semantic terms were added to the critical retrieval and shop matching views.
|
||||
|
||||
Required regression baseline:
|
||||
|
||||
- `Was ist der niedrigste Grenzwert für die Wasserhärte, welcher mit einem Testomaten überwacht werden kann?`
|
||||
- expected: `0,02 °dH (Testomat 808)`
|
||||
- `mit welchem indikator wird der wert gemessen`
|
||||
- expected: `Indikatortyp 300`
|
||||
- Store query with `0,02` must preserve the decimal value and must not turn it into `02`.
|
||||
@@ -16,166 +16,8 @@ parameters:
|
||||
retriex.shop_matching.config:
|
||||
top_product_log_limit: 3
|
||||
|
||||
device_query_keywords:
|
||||
- analysegerät
|
||||
- analysegeraet
|
||||
- analysegeräte
|
||||
- analysegeraete
|
||||
- messgerät
|
||||
- messgeraet
|
||||
- messgeräte
|
||||
- messgeraete
|
||||
- analysator
|
||||
- analysatoren
|
||||
- analyzer
|
||||
- gerät
|
||||
- geraet
|
||||
- geräte
|
||||
- geraete
|
||||
- monitor
|
||||
- monitore
|
||||
- controller
|
||||
- gerät für
|
||||
- geraet fuer
|
||||
- geräte für
|
||||
- geraete fuer
|
||||
- system
|
||||
- systeme
|
||||
- anlage
|
||||
- anlagen
|
||||
|
||||
accessory_query_keywords:
|
||||
- zubehör
|
||||
- zubehor
|
||||
- reagenz
|
||||
- reagenzien
|
||||
- reagent
|
||||
- indikator
|
||||
- indikatoren
|
||||
- indicator
|
||||
- kit
|
||||
- set
|
||||
- ersatz
|
||||
- ersatzteil
|
||||
- ersatzteile
|
||||
- verbrauchsmaterial
|
||||
- consumable
|
||||
- dazu
|
||||
- passend
|
||||
- passende
|
||||
- passendes
|
||||
- nachfüll
|
||||
- nachfuell
|
||||
- refill
|
||||
- filter
|
||||
- pumpenkopf
|
||||
- motorblock
|
||||
- service set
|
||||
- serviceset
|
||||
- service-set
|
||||
|
||||
accessory_product_keywords:
|
||||
- reagenz
|
||||
- reagenzien
|
||||
- reagent
|
||||
- indikator
|
||||
- indikatoren
|
||||
- indicator
|
||||
- kit
|
||||
- set
|
||||
- verbrauchsmaterial
|
||||
- consumable
|
||||
- zubehör
|
||||
- zubehor
|
||||
- ersatz
|
||||
- ersatzteil
|
||||
- ersatzteile
|
||||
- nachfüll
|
||||
- nachfuell
|
||||
- refill
|
||||
- lösung
|
||||
- loesung
|
||||
- solution
|
||||
- teststreifen
|
||||
- test strip
|
||||
- filter
|
||||
- pumpenkopf
|
||||
- motorblock
|
||||
- service set
|
||||
- serviceset
|
||||
- service-set
|
||||
|
||||
device_product_keywords:
|
||||
- analysegerät
|
||||
- analysegeraet
|
||||
- analysegeräte
|
||||
- analysegeraete
|
||||
- messgerät
|
||||
- messgeraet
|
||||
- messgeräte
|
||||
- messgeraete
|
||||
- analysator
|
||||
- analysatoren
|
||||
- analyzer
|
||||
- monitor
|
||||
- monitore
|
||||
- controller
|
||||
- online-analysator
|
||||
- online analysator
|
||||
- online-analysegerät
|
||||
- online analysegeraet
|
||||
- online-analysegeräte
|
||||
- online analysegeraete
|
||||
- online analyzer
|
||||
- online monitor
|
||||
- system
|
||||
- systeme
|
||||
- anlage
|
||||
- anlagen
|
||||
- gerät
|
||||
- geraet
|
||||
- geräte
|
||||
- geraete
|
||||
|
||||
device_focus_keywords:
|
||||
- geräte
|
||||
- geraete
|
||||
- gerät
|
||||
- geraet
|
||||
- analysegerät
|
||||
- analysegeraet
|
||||
- messgerät
|
||||
- messgeraet
|
||||
- analysator
|
||||
- controller
|
||||
- monitor
|
||||
|
||||
accessory_focus_keywords:
|
||||
- indikator
|
||||
- indikatoren
|
||||
- reagenz
|
||||
- reagenzien
|
||||
- zubehör
|
||||
- zubehor
|
||||
- ersatzteil
|
||||
- ersatzteile
|
||||
- verbrauchsmaterial
|
||||
- service set
|
||||
- serviceset
|
||||
- filter
|
||||
- pumpenkopf
|
||||
- motorblock
|
||||
|
||||
accessory_focus_variant_map:
|
||||
indikator: [indikator, indikatoren]
|
||||
indikatoren: [indikator, indikatoren]
|
||||
reagenz: [reagenz, reagenzien]
|
||||
reagenzien: [reagenz, reagenzien]
|
||||
ersatzteil: [ersatzteil, ersatzteile]
|
||||
ersatzteile: [ersatzteil, ersatzteile]
|
||||
service set: [service set, serviceset, service-set]
|
||||
serviceset: [service set, serviceset, service-set]
|
||||
service-set: [service set, serviceset, service-set]
|
||||
# Vocabulary-backed lists live in config/retriex/vocabulary.yaml.
|
||||
# The old per-key entries may still be added here to override a specific view.
|
||||
|
||||
scores:
|
||||
exact_product_number_phrase: 160
|
||||
|
||||
@@ -27,141 +27,8 @@ parameters:
|
||||
focused_product_min_gap: 4.0
|
||||
focused_product_max_chunks: 4
|
||||
|
||||
generic_product_tokens:
|
||||
- produkt
|
||||
- produkte
|
||||
- produktkarte
|
||||
- titel
|
||||
- geraet
|
||||
- gerät
|
||||
- messgeraet
|
||||
- messgerät
|
||||
- wasser
|
||||
- haerte
|
||||
- härte
|
||||
- resthaerte
|
||||
- resthärte
|
||||
- analyse
|
||||
- analysator
|
||||
- automat
|
||||
- online
|
||||
- messung
|
||||
- messen
|
||||
- preis
|
||||
- preise
|
||||
- kosten
|
||||
- info
|
||||
- infos
|
||||
- passend
|
||||
- richtige
|
||||
- richtiges
|
||||
- geeignet
|
||||
- geeignete
|
||||
- welche
|
||||
- welcher
|
||||
- welches
|
||||
- brauche
|
||||
- suche
|
||||
|
||||
important_short_model_tokens: [th, tc, tp, tm, ph, rx]
|
||||
|
||||
family_descriptor_tokens:
|
||||
- evo
|
||||
- eco
|
||||
- self
|
||||
- clean
|
||||
- mini
|
||||
- pro
|
||||
- plus
|
||||
- basic
|
||||
- lab
|
||||
- inline
|
||||
- compact
|
||||
- panel
|
||||
- sc
|
||||
|
||||
looks_like_reagent_tokens:
|
||||
- indikator
|
||||
- reagenz
|
||||
- reagens
|
||||
- laborchemikalie
|
||||
- chemikalie
|
||||
- sicherheitsdatenblatt
|
||||
- sdb
|
||||
- msds
|
||||
- ufi
|
||||
- gebinde
|
||||
- flasche
|
||||
- ersatzteil
|
||||
- zubehoer
|
||||
- zubehör
|
||||
- service set
|
||||
- filtereinsatz
|
||||
- kerzenfilter
|
||||
- druckregler
|
||||
|
||||
looks_like_safety_docs:
|
||||
- sicherheitsdatenblatt
|
||||
- sdb
|
||||
- msds
|
||||
- gefahrenbewertung
|
||||
- gefahrenpiktogramm
|
||||
- signalwort
|
||||
- lagerung
|
||||
- transport
|
||||
- clp
|
||||
- kennzeichnung
|
||||
- h290
|
||||
- pbt
|
||||
- vpvb
|
||||
|
||||
looks_like_reagent_words:
|
||||
- indikator
|
||||
- reagenz
|
||||
- reagens
|
||||
- chemie
|
||||
- chemikalie
|
||||
- sdb
|
||||
- sicherheitsdatenblatt
|
||||
- msds
|
||||
- flasche
|
||||
- gebinde
|
||||
|
||||
looks_like_document_words:
|
||||
- datenblatt
|
||||
- dokument
|
||||
- pdf
|
||||
- handbuch
|
||||
- manual
|
||||
- beschreibung
|
||||
- sdb
|
||||
- sicherheitsdatenblatt
|
||||
- msds
|
||||
|
||||
looks_like_safety_words:
|
||||
- gefahr
|
||||
- gefahrgut
|
||||
- clp
|
||||
- h290
|
||||
- sicherheit
|
||||
- kennzeichnung
|
||||
- transport
|
||||
- lagerung
|
||||
- piktogramm
|
||||
|
||||
looks_like_device_words:
|
||||
- geraet
|
||||
- gerät
|
||||
- messgeraet
|
||||
- messgerät
|
||||
- analysator
|
||||
- automat
|
||||
- messung
|
||||
- messen
|
||||
- ueberwachung
|
||||
- überwachung
|
||||
- online
|
||||
- monitor
|
||||
# Vocabulary-backed retrieval token lists live in config/retriex/vocabulary.yaml.
|
||||
# The old per-key entries may still be added here to override a specific view.
|
||||
|
||||
# Backwards-compatible name for existing config diagnostics.
|
||||
retriex.retrieval.inventory: '%retriex.retrieval.config%'
|
||||
|
||||
597
config/retriex/vocabulary.yaml
Normal file
597
config/retriex/vocabulary.yaml
Normal file
@@ -0,0 +1,597 @@
|
||||
# Central domain vocabulary for RetrieX.
|
||||
# Views preserve the previous 1.4.2-tuned ordering exactly; per-service configs may still override them.
|
||||
parameters:
|
||||
retriex.commerce_query.config: {}
|
||||
retriex.vocabulary.config:
|
||||
classes:
|
||||
device:
|
||||
- analysegerät
|
||||
- analysegeraet
|
||||
- analysegeräte
|
||||
- analysegeraete
|
||||
- messgerät
|
||||
- messgeraet
|
||||
- messgeräte
|
||||
- messgeraete
|
||||
- analysator
|
||||
- analysatoren
|
||||
- analyzer
|
||||
- gerät
|
||||
- geraet
|
||||
- geräte
|
||||
- geraete
|
||||
- monitor
|
||||
- monitore
|
||||
- controller
|
||||
- system
|
||||
- systeme
|
||||
- anlage
|
||||
- anlagen
|
||||
accessory:
|
||||
- zubehör
|
||||
- zubehor
|
||||
- reagenz
|
||||
- reagenzien
|
||||
- reagent
|
||||
- indikator
|
||||
- indikatoren
|
||||
- indicator
|
||||
- kit
|
||||
- set
|
||||
- ersatz
|
||||
- ersatzteil
|
||||
- ersatzteile
|
||||
- verbrauchsmaterial
|
||||
- consumable
|
||||
- filter
|
||||
- pumpenkopf
|
||||
- motorblock
|
||||
- service set
|
||||
- serviceset
|
||||
- service-set
|
||||
views:
|
||||
shop:
|
||||
device_query:
|
||||
add:
|
||||
- analysegerät
|
||||
- analysegeraet
|
||||
- analysegeräte
|
||||
- analysegeraete
|
||||
- messgerät
|
||||
- messgeraet
|
||||
- messgeräte
|
||||
- messgeraete
|
||||
- analysator
|
||||
- analysatoren
|
||||
- analyzer
|
||||
- gerät
|
||||
- geraet
|
||||
- geräte
|
||||
- geraete
|
||||
- monitor
|
||||
- monitore
|
||||
- controller
|
||||
- gerät für
|
||||
- geraet fuer
|
||||
- geräte für
|
||||
- geraete fuer
|
||||
- system
|
||||
- systeme
|
||||
- anlage
|
||||
- anlagen
|
||||
accessory_query:
|
||||
add:
|
||||
- zubehör
|
||||
- zubehor
|
||||
- reagenz
|
||||
- reagenzien
|
||||
- reagent
|
||||
- indikator
|
||||
- indikatoren
|
||||
- indicator
|
||||
- kit
|
||||
- set
|
||||
- ersatz
|
||||
- ersatzteil
|
||||
- ersatzteile
|
||||
- verbrauchsmaterial
|
||||
- consumable
|
||||
- dazu
|
||||
- passend
|
||||
- passende
|
||||
- passendes
|
||||
- nachfüll
|
||||
- nachfuell
|
||||
- refill
|
||||
- filter
|
||||
- pumpenkopf
|
||||
- motorblock
|
||||
- service set
|
||||
- serviceset
|
||||
- service-set
|
||||
accessory_product:
|
||||
add:
|
||||
- reagenz
|
||||
- reagenzien
|
||||
- reagent
|
||||
- indikator
|
||||
- indikatoren
|
||||
- indicator
|
||||
- kit
|
||||
- set
|
||||
- verbrauchsmaterial
|
||||
- consumable
|
||||
- zubehör
|
||||
- zubehor
|
||||
- ersatz
|
||||
- ersatzteil
|
||||
- ersatzteile
|
||||
- nachfüll
|
||||
- nachfuell
|
||||
- refill
|
||||
- lösung
|
||||
- loesung
|
||||
- solution
|
||||
- teststreifen
|
||||
- test strip
|
||||
- filter
|
||||
- pumpenkopf
|
||||
- motorblock
|
||||
- service set
|
||||
- serviceset
|
||||
- service-set
|
||||
device_product:
|
||||
add:
|
||||
- analysegerät
|
||||
- analysegeraet
|
||||
- analysegeräte
|
||||
- analysegeraete
|
||||
- messgerät
|
||||
- messgeraet
|
||||
- messgeräte
|
||||
- messgeraete
|
||||
- analysator
|
||||
- analysatoren
|
||||
- analyzer
|
||||
- monitor
|
||||
- monitore
|
||||
- controller
|
||||
- online-analysator
|
||||
- online analysator
|
||||
- online-analysegerät
|
||||
- online analysegeraet
|
||||
- online-analysegeräte
|
||||
- online analysegeraete
|
||||
- online analyzer
|
||||
- online monitor
|
||||
- system
|
||||
- systeme
|
||||
- anlage
|
||||
- anlagen
|
||||
- gerät
|
||||
- geraet
|
||||
- geräte
|
||||
- geraete
|
||||
device_focus:
|
||||
add:
|
||||
- geräte
|
||||
- geraete
|
||||
- gerät
|
||||
- geraet
|
||||
- analysegerät
|
||||
- analysegeraet
|
||||
- messgerät
|
||||
- messgeraet
|
||||
- analysator
|
||||
- controller
|
||||
- monitor
|
||||
accessory_focus:
|
||||
add:
|
||||
- indikator
|
||||
- indikatoren
|
||||
- reagenz
|
||||
- reagenzien
|
||||
- zubehör
|
||||
- zubehor
|
||||
- ersatzteil
|
||||
- ersatzteile
|
||||
- verbrauchsmaterial
|
||||
- service set
|
||||
- serviceset
|
||||
- filter
|
||||
- pumpenkopf
|
||||
- motorblock
|
||||
commerce_query:
|
||||
known_brands:
|
||||
add:
|
||||
- heyl
|
||||
- horiba
|
||||
- neomeris
|
||||
phrases_to_remove:
|
||||
add:
|
||||
- ich suche
|
||||
- suche
|
||||
- habt ihr
|
||||
- gibt es
|
||||
- gebe mir
|
||||
- gib mir
|
||||
- zeige mir
|
||||
- welches gerät
|
||||
- welche gerät
|
||||
- welches modell
|
||||
- welches ist besser
|
||||
- welches ist am besten
|
||||
- alternative
|
||||
- alternativen
|
||||
- unter anderem
|
||||
- u a
|
||||
- welche
|
||||
- welcher
|
||||
- welches
|
||||
- welchen
|
||||
- sind
|
||||
- ist
|
||||
- geeignet
|
||||
- geeigent
|
||||
- verfügbarkeit
|
||||
- verfuegbarkeit
|
||||
filter_search_tokens:
|
||||
add:
|
||||
- auch
|
||||
- noch
|
||||
- nochmal
|
||||
- zusätzlich
|
||||
- dazu
|
||||
- davon
|
||||
- stattdessen
|
||||
- bitte
|
||||
- gern
|
||||
- gerne
|
||||
- zeige
|
||||
- zeig
|
||||
- such
|
||||
- suche
|
||||
- finde
|
||||
- find
|
||||
- mir
|
||||
- mal
|
||||
- von
|
||||
- im
|
||||
- in
|
||||
- für
|
||||
- fuer
|
||||
- welche
|
||||
- welcher
|
||||
- welches
|
||||
- welchen
|
||||
- sind
|
||||
- ist
|
||||
- geeignet
|
||||
- geeigent
|
||||
- verfügbarkeit
|
||||
- verfuegbarkeit
|
||||
- prüfe
|
||||
- pruefe
|
||||
- den
|
||||
- die
|
||||
- das
|
||||
- der
|
||||
- dem
|
||||
- des
|
||||
- und
|
||||
- oder
|
||||
- sowie
|
||||
- seine
|
||||
- seinen
|
||||
- seiner
|
||||
- seinem
|
||||
- seines
|
||||
- siene
|
||||
- sienen
|
||||
- siener
|
||||
- sienem
|
||||
- sienes
|
||||
- gebe
|
||||
- gib
|
||||
- nenne
|
||||
- nenn
|
||||
- preis
|
||||
- preise
|
||||
- preisen
|
||||
- kostet
|
||||
- kosten
|
||||
- ua
|
||||
- also
|
||||
- gut
|
||||
- gute
|
||||
- guten
|
||||
- guter
|
||||
- gutes
|
||||
- passen
|
||||
- passend
|
||||
semantic_shop_search_tokens:
|
||||
add:
|
||||
- indikator
|
||||
- indicator
|
||||
- reagenz
|
||||
- reagent
|
||||
- zubehör
|
||||
- zubehor
|
||||
- ersatzteil
|
||||
- verbrauchsmaterial
|
||||
- chemie
|
||||
- indikatorchemie
|
||||
- reagenzchemie
|
||||
- kit
|
||||
- set
|
||||
- filter
|
||||
- pumpe
|
||||
- pumpenkopf
|
||||
- motorblock
|
||||
- lösung
|
||||
- loesung
|
||||
- solution
|
||||
- teststreifen
|
||||
- gerät
|
||||
- geraet
|
||||
- messgerät
|
||||
- messgeraet
|
||||
- analysegerät
|
||||
- analysegeraet
|
||||
- analysator
|
||||
- monitor
|
||||
- controller
|
||||
- system
|
||||
retrieval:
|
||||
generic_product_tokens:
|
||||
add:
|
||||
- produkt
|
||||
- produkte
|
||||
- produktkarte
|
||||
- titel
|
||||
- geraet
|
||||
- gerät
|
||||
- messgeraet
|
||||
- messgerät
|
||||
- wasser
|
||||
- haerte
|
||||
- härte
|
||||
- resthaerte
|
||||
- resthärte
|
||||
- analyse
|
||||
- analysator
|
||||
- automat
|
||||
- online
|
||||
- messung
|
||||
- messen
|
||||
- preis
|
||||
- preise
|
||||
- kosten
|
||||
- info
|
||||
- infos
|
||||
- passend
|
||||
- richtige
|
||||
- richtiges
|
||||
- geeignet
|
||||
- geeignete
|
||||
- welche
|
||||
- welcher
|
||||
- welches
|
||||
- brauche
|
||||
- suche
|
||||
important_short_model_tokens:
|
||||
add:
|
||||
- th
|
||||
- tc
|
||||
- tp
|
||||
- tm
|
||||
- ph
|
||||
- rx
|
||||
family_descriptor_tokens:
|
||||
add:
|
||||
- evo
|
||||
- eco
|
||||
- self
|
||||
- clean
|
||||
- mini
|
||||
- pro
|
||||
- plus
|
||||
- basic
|
||||
- lab
|
||||
- inline
|
||||
- compact
|
||||
- panel
|
||||
- sc
|
||||
looks_like_reagent_tokens:
|
||||
add:
|
||||
- indikator
|
||||
- reagenz
|
||||
- reagens
|
||||
- laborchemikalie
|
||||
- chemikalie
|
||||
- sicherheitsdatenblatt
|
||||
- sdb
|
||||
- msds
|
||||
- ufi
|
||||
- gebinde
|
||||
- flasche
|
||||
- ersatzteil
|
||||
- zubehoer
|
||||
- zubehör
|
||||
- service set
|
||||
- filtereinsatz
|
||||
- kerzenfilter
|
||||
- druckregler
|
||||
looks_like_safety_docs:
|
||||
add:
|
||||
- sicherheitsdatenblatt
|
||||
- sdb
|
||||
- msds
|
||||
- gefahrenbewertung
|
||||
- gefahrenpiktogramm
|
||||
- signalwort
|
||||
- lagerung
|
||||
- transport
|
||||
- clp
|
||||
- kennzeichnung
|
||||
- h290
|
||||
- pbt
|
||||
- vpvb
|
||||
looks_like_reagent_words:
|
||||
add:
|
||||
- indikator
|
||||
- reagenz
|
||||
- reagens
|
||||
- chemie
|
||||
- chemikalie
|
||||
- sdb
|
||||
- sicherheitsdatenblatt
|
||||
- msds
|
||||
- flasche
|
||||
- gebinde
|
||||
looks_like_document_words:
|
||||
add:
|
||||
- datenblatt
|
||||
- dokument
|
||||
- pdf
|
||||
- handbuch
|
||||
- manual
|
||||
- beschreibung
|
||||
- sdb
|
||||
- sicherheitsdatenblatt
|
||||
- msds
|
||||
looks_like_safety_words:
|
||||
add:
|
||||
- gefahr
|
||||
- gefahrgut
|
||||
- clp
|
||||
- h290
|
||||
- sicherheit
|
||||
- kennzeichnung
|
||||
- transport
|
||||
- lagerung
|
||||
- piktogramm
|
||||
looks_like_device_words:
|
||||
add:
|
||||
- geraet
|
||||
- gerät
|
||||
- messgeraet
|
||||
- messgerät
|
||||
- analysator
|
||||
- automat
|
||||
- messung
|
||||
- messen
|
||||
- ueberwachung
|
||||
- überwachung
|
||||
- online
|
||||
- monitor
|
||||
prompt:
|
||||
technical_product_keywords:
|
||||
add:
|
||||
- technisch
|
||||
- technical
|
||||
- produkt
|
||||
- product
|
||||
- gerät
|
||||
- device
|
||||
- modell
|
||||
- model
|
||||
- messprinzip
|
||||
- measurement principle
|
||||
- schnittstelle
|
||||
- interface
|
||||
- relais
|
||||
- relay
|
||||
- indikator
|
||||
- indicator
|
||||
- grenzwert
|
||||
- threshold
|
||||
- messbereich
|
||||
- measurement range
|
||||
- minimaler
|
||||
- minimum
|
||||
- resthärte
|
||||
- resthaerte
|
||||
- °dh
|
||||
- dh
|
||||
- spannung
|
||||
- voltage
|
||||
- strom
|
||||
- current
|
||||
- druck
|
||||
- pressure
|
||||
- temperatur
|
||||
- temperature
|
||||
- schutzart
|
||||
- ip
|
||||
- fehlercode
|
||||
- error code
|
||||
- wasserhärte
|
||||
- hardness
|
||||
- testomat
|
||||
- chlor
|
||||
- chlormessung
|
||||
accessory_request_keywords:
|
||||
add:
|
||||
- passend
|
||||
- passende
|
||||
- passendes
|
||||
- zubehör
|
||||
- zubehor
|
||||
- dazu
|
||||
- indikator
|
||||
- reagenz
|
||||
- kit
|
||||
- set
|
||||
- zusatz
|
||||
- ergänzung
|
||||
- ergaenzung
|
||||
maps:
|
||||
shop:
|
||||
accessory_focus_variants:
|
||||
indikator:
|
||||
- indikator
|
||||
- indikatoren
|
||||
indikatoren:
|
||||
- indikator
|
||||
- indikatoren
|
||||
reagenz:
|
||||
- reagenz
|
||||
- reagenzien
|
||||
reagenzien:
|
||||
- reagenz
|
||||
- reagenzien
|
||||
ersatzteil:
|
||||
- ersatzteil
|
||||
- ersatzteile
|
||||
ersatzteile:
|
||||
- ersatzteil
|
||||
- ersatzteile
|
||||
service set:
|
||||
- service set
|
||||
- serviceset
|
||||
- service-set
|
||||
serviceset:
|
||||
- service set
|
||||
- serviceset
|
||||
- service-set
|
||||
service-set:
|
||||
- service set
|
||||
- serviceset
|
||||
- service-set
|
||||
commerce_query:
|
||||
search_token_corrections:
|
||||
siene: seine
|
||||
sienen: seinen
|
||||
siener: seiner
|
||||
sienem: seinem
|
||||
sienes: seines
|
||||
indicatoren: indikatoren
|
||||
search_token_canonical:
|
||||
indikatoren: indikator
|
||||
indicators: indikator
|
||||
indicator: indikator
|
||||
reagenzien: reagenz
|
||||
reagents: reagenz
|
||||
reagent: reagenz
|
||||
produkte: produkt
|
||||
@@ -9,6 +9,7 @@ imports:
|
||||
- { resource: 'retriex/retrieval.yaml' }
|
||||
- { resource: 'retriex/language.yaml' }
|
||||
- { resource: 'retriex/query_enrichment.yaml' }
|
||||
- { resource: 'retriex/vocabulary.yaml' }
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Parameters
|
||||
@@ -112,9 +113,14 @@ services:
|
||||
$retrievalMaxChunks: '%retriex.model.default_retrieval_max_chunks%'
|
||||
$retrievalVectorTopK: '%retriex.model.default_retrieval_vector_top_k%'
|
||||
|
||||
App\Config\DomainVocabularyConfig:
|
||||
arguments:
|
||||
$config: '%retriex.vocabulary.config%'
|
||||
|
||||
App\Config\PromptBuilderConfig:
|
||||
arguments:
|
||||
$config: '%retriex.prompt.config%'
|
||||
$vocabulary: '@App\Config\DomainVocabularyConfig'
|
||||
|
||||
App\Config\AgentRunnerConfig:
|
||||
arguments:
|
||||
@@ -123,7 +129,7 @@ services:
|
||||
App\Config\NdjsonHybridRetrieverConfig:
|
||||
arguments:
|
||||
$config: '%retriex.retrieval.config%'
|
||||
$vocabulary: '%retriex.retrieval.config%'
|
||||
$vocabulary: '@App\Config\DomainVocabularyConfig'
|
||||
|
||||
App\Config\StopWordsConfig:
|
||||
arguments:
|
||||
@@ -136,6 +142,7 @@ services:
|
||||
App\Config\ShopServiceConfig:
|
||||
arguments:
|
||||
$config: '%retriex.shop_matching.config%'
|
||||
$vocabulary: '@App\Config\DomainVocabularyConfig'
|
||||
|
||||
App\Infrastructure\OllamaClient:
|
||||
arguments:
|
||||
@@ -176,6 +183,11 @@ services:
|
||||
|
||||
App\Intent\CommerceIntentLite: ~
|
||||
|
||||
App\Config\CommerceQueryParserConfig:
|
||||
arguments:
|
||||
$config: '%retriex.commerce_query.config%'
|
||||
$vocabulary: '@App\Config\DomainVocabularyConfig'
|
||||
|
||||
App\Commerce\CommerceQueryParser: ~
|
||||
|
||||
App\Config\SearchRepairConfig:
|
||||
|
||||
@@ -6,24 +6,13 @@ namespace App\Config;
|
||||
|
||||
final class CommerceQueryParserConfig
|
||||
{
|
||||
/**
|
||||
* @return string[]
|
||||
*/
|
||||
public function getKnownBrands(): array
|
||||
{
|
||||
return [
|
||||
private const KNOWN_BRANDS = [
|
||||
'heyl',
|
||||
'horiba',
|
||||
'neomeris',
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* @return string[]
|
||||
*/
|
||||
public function getPhrasesToRemove(): array
|
||||
{
|
||||
return [
|
||||
private const PHRASES_TO_REMOVE = [
|
||||
'ich suche',
|
||||
'suche',
|
||||
'habt ihr',
|
||||
@@ -51,24 +40,8 @@ final class CommerceQueryParserConfig
|
||||
'verfügbarkeit',
|
||||
'verfuegbarkeit',
|
||||
];
|
||||
}
|
||||
|
||||
public function getHistoryContextPattern(): string
|
||||
{
|
||||
return 'chat|auch|noch|nochmal|zusätzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt';
|
||||
}
|
||||
|
||||
public function getHistoryContextValuePattern(): string
|
||||
{
|
||||
return '/\b(' . $this->getHistoryContextPattern() . ')\b/u';
|
||||
}
|
||||
|
||||
/**
|
||||
* @return string[]
|
||||
*/
|
||||
public function getFilterSearchTokens(): array
|
||||
{
|
||||
return [
|
||||
private const FILTER_SEARCH_TOKENS = [
|
||||
'auch',
|
||||
'noch',
|
||||
'nochmal',
|
||||
@@ -142,14 +115,8 @@ final class CommerceQueryParserConfig
|
||||
'passen',
|
||||
'passend',
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array<string, string>
|
||||
*/
|
||||
public function getSearchTokenCorrections(): array
|
||||
{
|
||||
return [
|
||||
private const SEARCH_TOKEN_CORRECTIONS = [
|
||||
'siene' => 'seine',
|
||||
'sienen' => 'seinen',
|
||||
'siener' => 'seiner',
|
||||
@@ -157,14 +124,8 @@ final class CommerceQueryParserConfig
|
||||
'sienes' => 'seines',
|
||||
'indicatoren' => 'indikatoren',
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array<string, string>
|
||||
*/
|
||||
public function getSearchTokenCanonicalMap(): array
|
||||
{
|
||||
return [
|
||||
private const SEARCH_TOKEN_CANONICAL_MAP = [
|
||||
'indikatoren' => 'indikator',
|
||||
'indicators' => 'indikator',
|
||||
'indicator' => 'indikator',
|
||||
@@ -173,6 +134,113 @@ final class CommerceQueryParserConfig
|
||||
'reagent' => 'reagenz',
|
||||
'produkte' => 'produkt',
|
||||
];
|
||||
|
||||
private const SEMANTIC_SHOP_SEARCH_TOKENS = [
|
||||
'indikator',
|
||||
'indicator',
|
||||
'reagenz',
|
||||
'reagent',
|
||||
'zubehör',
|
||||
'zubehor',
|
||||
'ersatzteil',
|
||||
'verbrauchsmaterial',
|
||||
'chemie',
|
||||
'indikatorchemie',
|
||||
'reagenzchemie',
|
||||
'kit',
|
||||
'set',
|
||||
'filter',
|
||||
'pumpe',
|
||||
'pumpenkopf',
|
||||
'motorblock',
|
||||
'lösung',
|
||||
'loesung',
|
||||
'solution',
|
||||
'teststreifen',
|
||||
'gerät',
|
||||
'geraet',
|
||||
'messgerät',
|
||||
'messgeraet',
|
||||
'analysegerät',
|
||||
'analysegeraet',
|
||||
'analysator',
|
||||
'monitor',
|
||||
'controller',
|
||||
'system',
|
||||
];
|
||||
|
||||
/**
|
||||
* @param array<string, mixed> $config
|
||||
*/
|
||||
public function __construct(
|
||||
private readonly array $config = [],
|
||||
private readonly ?DomainVocabularyConfig $vocabulary = null,
|
||||
) {
|
||||
}
|
||||
|
||||
/**
 * Returns the list of known brand names.
 *
 * The default comes from the `commerce_query.known_brands` vocabulary view
 * (which itself falls back to the built-in constant); a `known_brands` entry
 * in this service's own config takes precedence over that default.
 *
 * @return string[]
 */
public function getKnownBrands(): array
{
    $vocabularyDefault = $this->vocabularyView('commerce_query.known_brands', self::KNOWN_BRANDS);

    return $this->stringList('known_brands', $vocabularyDefault);
}
|
||||
|
||||
/**
 * Returns the phrases listed for removal.
 *
 * The default comes from the `commerce_query.phrases_to_remove` vocabulary view
 * (falling back to the built-in constant); a `phrases_to_remove` entry in this
 * service's own config takes precedence over that default.
 *
 * @return string[]
 */
public function getPhrasesToRemove(): array
{
    $vocabularyDefault = $this->vocabularyView('commerce_query.phrases_to_remove', self::PHRASES_TO_REMOVE);

    return $this->stringList('phrases_to_remove', $vocabularyDefault);
}
|
||||
|
||||
/**
 * Returns the regex alternation (without delimiters or anchors) used by
 * getHistoryContextValuePattern().
 */
public function getHistoryContextPattern(): string
{
    $alternation = 'chat|auch|noch|nochmal|zusätzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt';

    return $alternation;
}
|
||||
|
||||
/**
 * Wraps the history-context alternation in a capturing group bounded by `\b`
 * on both sides and returns it as a complete, Unicode-aware (`/u`) pattern.
 */
public function getHistoryContextValuePattern(): string
{
    return sprintf('/\b(%s)\b/u', $this->getHistoryContextPattern());
}
|
||||
|
||||
/**
 * Returns the filter search tokens.
 *
 * The default comes from the `commerce_query.filter_search_tokens` vocabulary
 * view (falling back to the built-in constant); a `filter_search_tokens` entry
 * in this service's own config takes precedence over that default.
 *
 * @return string[]
 */
public function getFilterSearchTokens(): array
{
    $vocabularyDefault = $this->vocabularyView('commerce_query.filter_search_tokens', self::FILTER_SEARCH_TOKENS);

    return $this->stringList('filter_search_tokens', $vocabularyDefault);
}
|
||||
|
||||
/**
 * Returns the token correction map (misspelled token => corrected token).
 *
 * The default comes from the `commerce_query.search_token_corrections`
 * vocabulary map (falling back to the built-in constant); a
 * `search_token_corrections` entry in this service's own config takes
 * precedence over that default.
 *
 * @return array<string, string>
 */
public function getSearchTokenCorrections(): array
{
    $vocabularyDefault = $this->vocabularyStringMap(
        'commerce_query.search_token_corrections',
        self::SEARCH_TOKEN_CORRECTIONS
    );

    return $this->stringMap('search_token_corrections', $vocabularyDefault);
}
|
||||
|
||||
/**
 * Returns the canonicalisation map (token variant => canonical token).
 *
 * Note the two different key names: the vocabulary map is looked up as
 * `commerce_query.search_token_canonical`, while the per-service override key
 * is `search_token_canonical_map`.
 *
 * @return array<string, string>
 */
public function getSearchTokenCanonicalMap(): array
{
    $vocabularyDefault = $this->vocabularyStringMap(
        'commerce_query.search_token_canonical',
        self::SEARCH_TOKEN_CANONICAL_MAP
    );

    return $this->stringMap('search_token_canonical_map', $vocabularyDefault);
}
|
||||
|
||||
/**
|
||||
@@ -335,39 +403,86 @@ final class CommerceQueryParserConfig
|
||||
*/
|
||||
public function getSemanticShopSearchTokens(): array
{
    // The token list is no longer hard-coded here: an inline `return [...]`
    // ahead of this statement made the vocabulary/config lookup unreachable.
    // The built-in defaults live in SEMANTIC_SHOP_SEARCH_TOKENS and are used
    // only when neither the `commerce_query.semantic_shop_search_tokens`
    // vocabulary view nor a `semantic_shop_search_tokens` config override
    // yields a non-empty list.
    $vocabularyDefault = $this->vocabularyView(
        'commerce_query.semantic_shop_search_tokens',
        self::SEMANTIC_SHOP_SEARCH_TOKENS
    );

    return $this->stringList('semantic_shop_search_tokens', $vocabularyDefault);
}
|
||||
|
||||
|
||||
/**
 * Resolves a vocabulary view, or returns the fallback unchanged when no
 * vocabulary service was injected.
 *
 * @param string[] $fallback
 *
 * @return string[]
 */
private function vocabularyView(string $path, array $fallback): array
{
    if ($this->vocabulary === null) {
        return $fallback;
    }

    return $this->vocabulary->view($path, $fallback);
}
|
||||
|
||||
/**
 * Resolves a vocabulary string map, or returns the fallback unchanged when no
 * vocabulary service was injected.
 *
 * @param array<string, string> $fallback
 *
 * @return array<string, string>
 */
private function vocabularyStringMap(string $path, array $fallback): array
{
    if ($this->vocabulary === null) {
        return $fallback;
    }

    return $this->vocabulary->stringMap($path, $fallback);
}
|
||||
|
||||
/**
 * Reads a list override from this service's config and normalises it.
 *
 * Non-scalar entries are dropped, the rest are cast to string and trimmed,
 * empty strings are removed and duplicates collapse onto their first
 * occurrence. If the config value is not an array, or nothing survives the
 * clean-up, the given default is returned as-is.
 *
 * @param string[] $default
 *
 * @return string[]
 */
private function stringList(string $path, array $default): array
{
    $raw = $this->value($path, $default);
    if (!is_array($raw)) {
        return $default;
    }

    $trimmed = array_map(
        static fn (mixed $entry): string => trim((string) $entry),
        array_filter($raw, is_scalar(...)),
    );
    $nonEmpty = array_filter($trimmed, static fn (string $entry): bool => $entry !== '');

    // array_unique() keeps the first occurrence, so the original order survives.
    $unique = array_values(array_unique($nonEmpty));

    return $unique === [] ? $default : $unique;
}
|
||||
|
||||
/**
 * Reads a key => value override from this service's config and normalises it.
 *
 * Keys and values are cast to string and trimmed; pairs with a non-scalar
 * value or with an empty key/value are skipped. If the config value is not an
 * array, or no pair survives, the given default is returned as-is.
 *
 * @param array<string, string> $default
 *
 * @return array<string, string>
 */
private function stringMap(string $path, array $default): array
{
    $raw = $this->value($path, $default);
    if (!is_array($raw)) {
        return $default;
    }

    $pairs = [];
    foreach ($raw as $rawKey => $rawItem) {
        // Treat anything non-scalar as empty so a single check rejects it.
        $keyText = is_scalar($rawKey) ? trim((string) $rawKey) : '';
        $itemText = is_scalar($rawItem) ? trim((string) $rawItem) : '';

        if ($keyText === '' || $itemText === '') {
            continue;
        }

        $pairs[$keyText] = $itemText;
    }

    return $pairs === [] ? $default : $pairs;
}
|
||||
|
||||
/**
 * Looks up a dot-separated path in this service's config array.
 *
 * Returns the default as soon as a segment is missing or the current node is
 * not an array. A key that exists with a null value counts as present.
 */
private function value(string $path, mixed $default): mixed
{
    $node = $this->config;
    $segments = explode('.', $path);

    for ($i = 0, $count = count($segments); $i < $count; $i++) {
        $key = $segments[$i];
        $found = is_array($node) && array_key_exists($key, $node);
        if (!$found) {
            return $default;
        }

        $node = $node[$key];
    }

    return $node;
}
|
||||
|
||||
public function buildExactTokenRemovalPattern(string $token): string
|
||||
|
||||
196
src/Config/DomainVocabularyConfig.php
Normal file
196
src/Config/DomainVocabularyConfig.php
Normal file
@@ -0,0 +1,196 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Config;
|
||||
|
||||
final class DomainVocabularyConfig
|
||||
{
|
||||
public function __construct(private readonly array $config = [])
|
||||
{
|
||||
}
|
||||
|
||||
/**
 * Resolves the term list for a view stored under `views.<path>`.
 *
 * A view definition may name vocabulary classes under `include` and literal
 * terms under `add`. Class terms come first (in include order), then the
 * added terms, and the result is de-duplicated. The de-duplicated fallback is
 * returned when the view is not defined as an array or resolves to nothing.
 *
 * @return string[]
 */
public function view(string $path, array $fallback = []): array
{
    $definition = $this->value('views.' . $path, null);

    if (is_array($definition)) {
        $collected = [];
        foreach ($this->stringListFromValue($definition['include'] ?? []) as $className) {
            $collected = [...$collected, ...$this->domainClass($className)];
        }
        $collected = [...$collected, ...$this->stringListFromValue($definition['add'] ?? [])];

        $resolved = $this->uniqueStringList($collected);
        if ($resolved !== []) {
            return $resolved;
        }
    }

    return $this->uniqueStringList($fallback);
}
|
||||
|
||||
/**
 * Returns the terms stored under `classes.<name>`, or an empty list when that
 * class is missing or empty.
 *
 * @return string[]
 */
public function domainClass(string $name): array
{
    $configPath = 'classes.' . $name;

    return $this->stringList($configPath, []);
}
|
||||
|
||||
/**
 * Resolves a key => string-list map stored under `maps.<path>`.
 *
 * Keys are trimmed; entries with an empty key or an empty cleaned list are
 * skipped. The normalised fallback is returned when the configured value is
 * not an array or no entry survives.
 *
 * @return array<string, string[]>
 */
public function map(string $path, array $fallback = []): array
{
    $configured = $this->value('maps.' . $path, null);

    $result = [];
    if (is_array($configured)) {
        foreach ($configured as $rawKey => $rawItems) {
            $name = is_scalar($rawKey) ? trim((string) $rawKey) : '';
            if ($name === '') {
                continue;
            }

            $variants = $this->stringListFromValue($rawItems);
            if ($variants === []) {
                continue;
            }

            $result[$name] = $variants;
        }
    }

    return $result === [] ? $this->uniqueStringListMap($fallback) : $result;
}
|
||||
|
||||
/**
 * Resolves a key => string map stored under `maps.<path>`.
 *
 * Keys and values are trimmed. When a value is itself a list, its first
 * cleaned element is used. Entries with an empty key, a non-scalar value or an
 * empty value are skipped. The normalised fallback is returned when the
 * configured value is not an array or no entry survives.
 *
 * @return array<string, string>
 */
public function stringMap(string $path, array $fallback = []): array
{
    $configured = $this->value('maps.' . $path, null);

    $result = [];
    if (is_array($configured)) {
        foreach ($configured as $rawKey => $target) {
            $name = is_scalar($rawKey) ? trim((string) $rawKey) : '';
            if ($name === '') {
                continue;
            }

            // A list-valued entry collapses to its first usable term.
            if (is_array($target)) {
                $target = $this->stringListFromValue($target)[0] ?? '';
            }

            $text = is_scalar($target) ? trim((string) $target) : '';
            if ($text !== '') {
                $result[$name] = $text;
            }
        }
    }

    return $result === [] ? $this->uniqueStringMap($fallback) : $result;
}
|
||||
|
||||
/** @return array<string, mixed> */
|
||||
public function toArray(): array
|
||||
{
|
||||
return $this->config;
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
private function stringList(string $path, array $fallback): array
|
||||
{
|
||||
$value = $this->value($path, null);
|
||||
$items = $this->stringListFromValue($value);
|
||||
return $items !== [] ? $items : $this->uniqueStringList($fallback);
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
private function stringListFromValue(mixed $value): array
|
||||
{
|
||||
if (!is_array($value)) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return $this->uniqueStringList($value);
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
private function uniqueStringList(array $items): array
|
||||
{
|
||||
$out = [];
|
||||
foreach ($items as $item) {
|
||||
if (!is_scalar($item)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$item = trim((string) $item);
|
||||
if ($item === '' || in_array($item, $out, true)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$out[] = $item;
|
||||
}
|
||||
|
||||
return $out;
|
||||
}
|
||||
|
||||
/** @return array<string, string[]> */
|
||||
private function uniqueStringListMap(array $map): array
|
||||
{
|
||||
$out = [];
|
||||
foreach ($map as $key => $items) {
|
||||
if (!is_scalar($key)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$cleanKey = trim((string) $key);
|
||||
$cleanItems = $this->uniqueStringList(is_array($items) ? $items : []);
|
||||
if ($cleanKey !== '' && $cleanItems !== []) {
|
||||
$out[$cleanKey] = $cleanItems;
|
||||
}
|
||||
}
|
||||
|
||||
return $out;
|
||||
}
|
||||
|
||||
/** @return array<string, string> */
|
||||
private function uniqueStringMap(array $map): array
|
||||
{
|
||||
$out = [];
|
||||
foreach ($map as $key => $value) {
|
||||
if (!is_scalar($key) || !is_scalar($value)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$cleanKey = trim((string) $key);
|
||||
$cleanValue = trim((string) $value);
|
||||
if ($cleanKey !== '' && $cleanValue !== '') {
|
||||
$out[$cleanKey] = $cleanValue;
|
||||
}
|
||||
}
|
||||
|
||||
return $out;
|
||||
}
|
||||
|
||||
private function value(string $path, mixed $fallback): mixed
|
||||
{
|
||||
$current = $this->config;
|
||||
foreach (explode('.', $path) as $segment) {
|
||||
if (!is_array($current) || !array_key_exists($segment, $current)) {
|
||||
return $fallback;
|
||||
}
|
||||
|
||||
$current = $current[$segment];
|
||||
}
|
||||
|
||||
return $current;
|
||||
}
|
||||
}
|
||||
@@ -176,11 +176,10 @@ final class NdjsonHybridRetrieverConfig
|
||||
|
||||
/**
|
||||
* @param array<string, mixed> $config
|
||||
* @param array<string, mixed> $vocabulary Kept for backwards-compatible service wiring.
|
||||
*/
|
||||
public function __construct(
|
||||
private array $config = [],
|
||||
private array $vocabulary = [],
|
||||
private readonly ?DomainVocabularyConfig $vocabulary = null,
|
||||
) {
|
||||
}
|
||||
|
||||
@@ -307,55 +306,55 @@ final class NdjsonHybridRetrieverConfig
|
||||
/** @return string[] */
|
||||
public function genericProductTokens(): array
|
||||
{
|
||||
return $this->stringList('generic_product_tokens', self::GENERIC_PRODUCT_TOKEN);
|
||||
return $this->stringList('generic_product_tokens', $this->vocabularyView('retrieval.generic_product_tokens', self::GENERIC_PRODUCT_TOKEN));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function importantShortModelTokens(): array
|
||||
{
|
||||
return $this->stringList('important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN);
|
||||
return $this->stringList('important_short_model_tokens', $this->vocabularyView('retrieval.important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function familyDescriptorTokens(): array
|
||||
{
|
||||
return $this->stringList('family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN);
|
||||
return $this->stringList('family_descriptor_tokens', $this->vocabularyView('retrieval.family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function looksLikeReagentTokens(): array
|
||||
{
|
||||
return $this->stringList('looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS);
|
||||
return $this->stringList('looks_like_reagent_tokens', $this->vocabularyView('retrieval.looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function looksLikeSafetyDocs(): array
|
||||
{
|
||||
return $this->stringList('looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS);
|
||||
return $this->stringList('looks_like_safety_docs', $this->vocabularyView('retrieval.looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function looksLikeReagentWords(): array
|
||||
{
|
||||
return $this->stringList('looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS);
|
||||
return $this->stringList('looks_like_reagent_words', $this->vocabularyView('retrieval.looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function looksLikeDocumentWords(): array
|
||||
{
|
||||
return $this->stringList('looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS);
|
||||
return $this->stringList('looks_like_document_words', $this->vocabularyView('retrieval.looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function looksLikeSafetyWords(): array
|
||||
{
|
||||
return $this->stringList('looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS);
|
||||
return $this->stringList('looks_like_safety_words', $this->vocabularyView('retrieval.looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function looksLikeDeviceWords(): array
|
||||
{
|
||||
return $this->stringList('looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS);
|
||||
return $this->stringList('looks_like_device_words', $this->vocabularyView('retrieval.looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS));
|
||||
}
|
||||
/**
|
||||
* Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps.
|
||||
@@ -459,6 +458,12 @@ final class NdjsonHybridRetrieverConfig
|
||||
* @param string[] $default
|
||||
* @return string[]
|
||||
*/
|
||||
/**
 * Looks up a view in the central vocabulary; without an injected vocabulary
 * the given fallback list is returned unchanged.
 *
 * @param string   $path     Dot path of the view, e.g. `retrieval.generic_product_tokens`.
 * @param string[] $fallback Terms to use when no vocabulary is available.
 *
 * @return string[]
 */
private function vocabularyView(string $path, array $fallback): array
{
    if ($this->vocabulary === null) {
        return $fallback;
    }

    return $this->vocabulary->view($path, $fallback);
}
|
||||
|
||||
private function stringList(string $key, array $default): array
|
||||
{
|
||||
$value = $this->raw($key, $default);
|
||||
@@ -492,10 +497,6 @@ final class NdjsonHybridRetrieverConfig
|
||||
return $this->config[$key];
|
||||
}
|
||||
|
||||
if (array_key_exists($key, $this->vocabulary)) {
|
||||
return $this->vocabulary[$key];
|
||||
}
|
||||
|
||||
return $default;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,11 +6,74 @@ namespace App\Config;
|
||||
|
||||
final class PromptBuilderConfig
|
||||
{
|
||||
private const TECHNICAL_PRODUCT_KEYWORDS = [
|
||||
'technisch',
|
||||
'technical',
|
||||
'produkt',
|
||||
'product',
|
||||
'gerät',
|
||||
'device',
|
||||
'modell',
|
||||
'model',
|
||||
'messprinzip',
|
||||
'measurement principle',
|
||||
'schnittstelle',
|
||||
'interface',
|
||||
'relais',
|
||||
'relay',
|
||||
'indikator',
|
||||
'indicator',
|
||||
'grenzwert',
|
||||
'threshold',
|
||||
'messbereich',
|
||||
'measurement range',
|
||||
'minimaler',
|
||||
'minimum',
|
||||
'resthärte',
|
||||
'resthaerte',
|
||||
'°dh',
|
||||
'dh',
|
||||
'spannung',
|
||||
'voltage',
|
||||
'strom',
|
||||
'current',
|
||||
'druck',
|
||||
'pressure',
|
||||
'temperatur',
|
||||
'temperature',
|
||||
'schutzart',
|
||||
'ip',
|
||||
'fehlercode',
|
||||
'error code',
|
||||
'wasserhärte',
|
||||
'hardness',
|
||||
'testomat',
|
||||
'chlor',
|
||||
'chlormessung',
|
||||
];
|
||||
|
||||
private const ACCESSORY_REQUEST_KEYWORDS = [
|
||||
'passend',
|
||||
'passende',
|
||||
'passendes',
|
||||
'zubehör',
|
||||
'zubehor',
|
||||
'dazu',
|
||||
'indikator',
|
||||
'reagenz',
|
||||
'kit',
|
||||
'set',
|
||||
'zusatz',
|
||||
'ergänzung',
|
||||
'ergaenzung',
|
||||
];
|
||||
|
||||
/**
|
||||
* @param array<string, mixed> $config
|
||||
*/
|
||||
public function __construct(
|
||||
private readonly array $config = [],
|
||||
private readonly ?DomainVocabularyConfig $vocabulary = null,
|
||||
) {
|
||||
}
|
||||
|
||||
@@ -88,6 +151,42 @@ final class PromptBuilderConfig
|
||||
return is_numeric($value) ? (float) $value : $default;
|
||||
}
|
||||
|
||||
/**
 * Reads a list of strings from the configuration.
 *
 * Scalar entries are cast to string and trimmed; empty entries, non-scalar
 * entries and repeats are skipped. $default is returned as given when the
 * configured value is not an array or nothing survives the cleaning.
 *
 * @return string[]
 */
private function getStringList(string $path, array $default): array
{
    $raw = $this->getValue($path, $default);
    if (!is_array($raw)) {
        return $default;
    }

    $cleaned = [];
    foreach ($raw as $entry) {
        $candidate = is_scalar($entry) ? trim((string) $entry) : '';
        if ($candidate !== '' && !in_array($candidate, $cleaned, true)) {
            $cleaned[] = $candidate;
        }
    }

    return $cleaned === [] ? $default : $cleaned;
}
|
||||
|
||||
/**
 * Looks up a view in the central vocabulary; without an injected vocabulary
 * the given fallback list is returned unchanged.
 *
 * @param string   $path     Dot path of the view, e.g. `prompt.technical_product_keywords`.
 * @param string[] $fallback Terms to use when no vocabulary is available.
 *
 * @return string[]
 */
private function vocabularyView(string $path, array $fallback): array
{
    if ($this->vocabulary === null) {
        return $fallback;
    }

    return $this->vocabulary->view($path, $fallback);
}
|
||||
|
||||
private function getValue(string $path, mixed $default): mixed
|
||||
{
|
||||
$current = $this->config;
|
||||
@@ -445,51 +544,10 @@ final class PromptBuilderConfig
|
||||
*/
|
||||
public function getTechnicalProductKeywords(): array
|
||||
{
|
||||
return [
|
||||
'technisch',
|
||||
'technical',
|
||||
'produkt',
|
||||
'product',
|
||||
'gerät',
|
||||
'device',
|
||||
'modell',
|
||||
'model',
|
||||
'messprinzip',
|
||||
'measurement principle',
|
||||
'schnittstelle',
|
||||
'interface',
|
||||
'relais',
|
||||
'relay',
|
||||
'indikator',
|
||||
'indicator',
|
||||
'grenzwert',
|
||||
'threshold',
|
||||
'messbereich',
|
||||
'measurement range',
|
||||
'minimaler',
|
||||
'minimum',
|
||||
'resthärte',
|
||||
'resthaerte',
|
||||
'°dh',
|
||||
'dh',
|
||||
'spannung',
|
||||
'voltage',
|
||||
'strom',
|
||||
'current',
|
||||
'druck',
|
||||
'pressure',
|
||||
'temperatur',
|
||||
'temperature',
|
||||
'schutzart',
|
||||
'ip',
|
||||
'fehlercode',
|
||||
'error code',
|
||||
'wasserhärte',
|
||||
'hardness',
|
||||
'testomat',
|
||||
'chlor',
|
||||
'chlormessung',
|
||||
];
|
||||
return $this->getStringList(
|
||||
'technical_product_keywords',
|
||||
$this->vocabularyView('prompt.technical_product_keywords', self::TECHNICAL_PRODUCT_KEYWORDS)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -497,21 +555,10 @@ final class PromptBuilderConfig
|
||||
*/
|
||||
public function getAccessoryRequestKeywords(): array
|
||||
{
|
||||
return [
|
||||
'passend',
|
||||
'passende',
|
||||
'passendes',
|
||||
'zubehör',
|
||||
'zubehor',
|
||||
'dazu',
|
||||
'indikator',
|
||||
'reagenz',
|
||||
'kit',
|
||||
'set',
|
||||
'zusatz',
|
||||
'ergänzung',
|
||||
'ergaenzung',
|
||||
];
|
||||
return $this->getStringList(
|
||||
'accessory_request_keywords',
|
||||
$this->vocabularyView('prompt.accessory_request_keywords', self::ACCESSORY_REQUEST_KEYWORDS)
|
||||
);
|
||||
}
|
||||
|
||||
public function getTechnicalProductModelPattern(): string
|
||||
|
||||
@@ -68,8 +68,10 @@ final class ShopServiceConfig
|
||||
/**
|
||||
* @param array<string, mixed> $config
|
||||
*/
|
||||
public function __construct(private array $config = [])
|
||||
{
|
||||
public function __construct(
|
||||
private array $config = [],
|
||||
private readonly ?DomainVocabularyConfig $vocabulary = null,
|
||||
) {
|
||||
}
|
||||
|
||||
public function getTopProductLogLimit(): int
|
||||
@@ -80,43 +82,43 @@ final class ShopServiceConfig
|
||||
/** @return string[] */
|
||||
public function getDeviceFocusKeywords(): array
|
||||
{
|
||||
return $this->stringList('device_focus_keywords', self::DEVICE_FOCUS_KEYWORDS);
|
||||
return $this->stringList('device_focus_keywords', $this->vocabularyView('shop.device_focus', self::DEVICE_FOCUS_KEYWORDS));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function getAccessoryFocusKeywords(): array
|
||||
{
|
||||
return $this->stringList('accessory_focus_keywords', self::ACCESSORY_FOCUS_KEYWORDS);
|
||||
return $this->stringList('accessory_focus_keywords', $this->vocabularyView('shop.accessory_focus', self::ACCESSORY_FOCUS_KEYWORDS));
|
||||
}
|
||||
|
||||
/** @return array<string, string[]> */
|
||||
public function getAccessoryFocusVariantMap(): array
|
||||
{
|
||||
return $this->stringListMap('accessory_focus_variant_map', self::ACCESSORY_FOCUS_VARIANT_MAP);
|
||||
return $this->stringListMap('accessory_focus_variant_map', $this->vocabularyMap('shop.accessory_focus_variants', self::ACCESSORY_FOCUS_VARIANT_MAP));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function getDeviceQueryKeywords(): array
|
||||
{
|
||||
return $this->stringList('device_query_keywords', self::DEVICE_QUERY_KEYWORDS);
|
||||
return $this->stringList('device_query_keywords', $this->vocabularyView('shop.device_query', self::DEVICE_QUERY_KEYWORDS));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function getAccessoryQueryKeywords(): array
|
||||
{
|
||||
return $this->stringList('accessory_query_keywords', self::ACCESSORY_QUERY_KEYWORDS);
|
||||
return $this->stringList('accessory_query_keywords', $this->vocabularyView('shop.accessory_query', self::ACCESSORY_QUERY_KEYWORDS));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function getAccessoryProductKeywords(): array
|
||||
{
|
||||
return $this->stringList('accessory_product_keywords', self::ACCESSORY_PRODUCT_KEYWORDS);
|
||||
return $this->stringList('accessory_product_keywords', $this->vocabularyView('shop.accessory_product', self::ACCESSORY_PRODUCT_KEYWORDS));
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function getDeviceProductKeywords(): array
|
||||
{
|
||||
return $this->stringList('device_product_keywords', self::DEVICE_PRODUCT_KEYWORDS);
|
||||
return $this->stringList('device_product_keywords', $this->vocabularyView('shop.device_product', self::DEVICE_PRODUCT_KEYWORDS));
|
||||
}
|
||||
|
||||
public function getExactProductNumberPhraseScore(): int
|
||||
@@ -368,6 +370,18 @@ final class ShopServiceConfig
|
||||
* @param string[]|null $emptySafeDefault
|
||||
* @return string[]
|
||||
*/
|
||||
/**
 * Looks up a view in the central vocabulary; without an injected vocabulary
 * the given fallback list is returned unchanged.
 *
 * @param string   $path     Dot path of the view, e.g. `shop.device_focus`.
 * @param string[] $fallback Terms to use when no vocabulary is available.
 *
 * @return string[]
 */
private function vocabularyView(string $path, array $fallback): array
{
    if ($this->vocabulary === null) {
        return $fallback;
    }

    return $this->vocabulary->view($path, $fallback);
}
|
||||
|
||||
/**
 * Looks up a key => term-list map in the central vocabulary; without an
 * injected vocabulary the given fallback map is returned unchanged.
 *
 * @param string                  $path     Dot path of the map, e.g. `shop.accessory_focus_variants`.
 * @param array<string, string[]> $fallback Map to use when no vocabulary is available.
 *
 * @return array<string, string[]>
 */
private function vocabularyMap(string $path, array $fallback): array
{
    if ($this->vocabulary === null) {
        return $fallback;
    }

    return $this->vocabulary->map($path, $fallback);
}
|
||||
|
||||
private function stringList(string $path, array $default, bool $allowEmptyStrings = false, ?array $emptySafeDefault = null): array
|
||||
{
|
||||
$value = $this->value($path, $default);
|
||||
|
||||
@@ -1125,7 +1125,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$candidates = [];
|
||||
$seenDocs = [];
|
||||
|
||||
foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) {
|
||||
foreach (array_slice($chunkIds, 0, $this->retrieverConfig->focusedProductWindow()) as $rank => $chunkId) {
|
||||
$row = $rows[$chunkId] ?? null;
|
||||
if (!is_array($row)) {
|
||||
continue;
|
||||
@@ -1171,7 +1171,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$bestScore = (float)$best['score'];
|
||||
$gap = $bestScore - $runnerUpScore;
|
||||
|
||||
if ($bestScore < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_SCORE || $gap < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_GAP) {
|
||||
if ($bestScore < $this->retrieverConfig->focusedProductMinScore() || $gap < $this->retrieverConfig->focusedProductMinGap()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -1199,10 +1199,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$normalized = $this->normalizeText($prompt);
|
||||
$tokens = $this->tokenizeText($normalized);
|
||||
|
||||
$reagentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_WORDS;
|
||||
$documentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DOCUMENT_WORDS;
|
||||
$safetyWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_WORDS;
|
||||
$deviceWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DEVICE_WORDS;
|
||||
$reagentWords = $this->retrieverConfig->looksLikeReagentWords();
|
||||
$documentWords = $this->retrieverConfig->looksLikeDocumentWords();
|
||||
$safetyWords = $this->retrieverConfig->looksLikeSafetyWords();
|
||||
$deviceWords = $this->retrieverConfig->looksLikeDeviceWords();
|
||||
|
||||
$asksReagent = $this->containsAnyToken($tokens, $reagentWords);
|
||||
$asksDocument = $this->containsAnyToken($tokens, $documentWords);
|
||||
@@ -1343,7 +1343,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$documentId,
|
||||
$chunkIds,
|
||||
$rows,
|
||||
min($limit, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MAX_CHUNKS)
|
||||
min($limit, $this->retrieverConfig->focusedProductMaxChunks())
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1358,7 +1358,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
$docWindow = [];
|
||||
|
||||
foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::DOMINANT_DOC_WINDOW) as $chunkId) {
|
||||
foreach (array_slice($chunkIds, 0, $this->retrieverConfig->dominantDocWindow()) as $chunkId) {
|
||||
if (!isset($rows[$chunkId]['text'])) {
|
||||
continue;
|
||||
}
|
||||
@@ -1388,7 +1388,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$dominantCount = (int)($counts[$dominantDocId] ?? 0);
|
||||
|
||||
if ($dominantCount >= NdjsonHybridRetrieverConfig::DOMINANT_DOC_MIN_HITS) {
|
||||
if ($dominantCount >= $this->retrieverConfig->dominantDocMinHits()) {
|
||||
return $dominantDocId;
|
||||
}
|
||||
|
||||
@@ -1450,7 +1450,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return [];
|
||||
}
|
||||
|
||||
$maxFromDoc = min($limit, NdjsonHybridRetrieverConfig::DOMINANT_DOC_MAX_CHUNKS);
|
||||
$maxFromDoc = min($limit, $this->retrieverConfig->dominantDocMaxChunks());
|
||||
|
||||
if ($anchorChunkIndex !== null) {
|
||||
usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int {
|
||||
@@ -1550,13 +1550,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
continue;
|
||||
}
|
||||
|
||||
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
|
||||
if (($docCounter[$docId] ?? 0) >= $this->retrieverConfig->maxChunksPerDoc()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (is_int($chunkIndex)) {
|
||||
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
|
||||
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
|
||||
if (abs($prevIdx - $chunkIndex) < $this->retrieverConfig->minChunkDistance()) {
|
||||
continue 2;
|
||||
}
|
||||
}
|
||||
@@ -1609,13 +1609,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
continue;
|
||||
}
|
||||
|
||||
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
|
||||
if (($docCounter[$docId] ?? 0) >= $this->retrieverConfig->maxChunksPerDoc()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (is_int($chunkIndex)) {
|
||||
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
|
||||
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
|
||||
if (abs($prevIdx - $chunkIndex) < $this->retrieverConfig->minChunkDistance()) {
|
||||
continue 2;
|
||||
}
|
||||
}
|
||||
@@ -1715,7 +1715,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
*/
|
||||
private function isGenericProductToken(string $token): bool
|
||||
{
|
||||
static $generic = NdjsonHybridRetrieverConfig::GENERIC_PRODUCT_TOKEN;
|
||||
$generic = $this->retrieverConfig->genericProductTokens();
|
||||
|
||||
return isset(array_fill_keys($generic, true)[$token]);
|
||||
}
|
||||
|
||||
@@ -1724,7 +1725,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
*/
|
||||
private function isImportantShortModelToken(string $token): bool
|
||||
{
|
||||
static $allowed = NdjsonHybridRetrieverConfig::IMPORTANT_SHORT_MODEL_TOKEN;
|
||||
$allowed = $this->retrieverConfig->importantShortModelTokens();
|
||||
|
||||
return in_array($token, $allowed, true);
|
||||
}
|
||||
@@ -1734,7 +1735,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
*/
|
||||
private function isFamilyDescriptorToken(string $token): bool
|
||||
{
|
||||
static $familyDescriptors = NdjsonHybridRetrieverConfig::FAMILY_DESCRIPTOR_TOKEN;
|
||||
$familyDescriptors = $this->retrieverConfig->familyDescriptorTokens();
|
||||
|
||||
return in_array($token, $familyDescriptors, true)
|
||||
|| $this->isImportantShortModelToken($token)
|
||||
@@ -1752,7 +1753,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return false;
|
||||
}
|
||||
|
||||
$needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_TOKENS;
|
||||
$needles = $this->retrieverConfig->looksLikeReagentTokens();
|
||||
|
||||
foreach ($needles as $needle) {
|
||||
if (str_contains($haystack, $needle)) {
|
||||
@@ -1774,7 +1775,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return false;
|
||||
}
|
||||
|
||||
$needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_DOCS;
|
||||
$needles = $this->retrieverConfig->looksLikeSafetyDocs();
|
||||
|
||||
foreach ($needles as $needle) {
|
||||
if (str_contains($haystack, $needle)) {
|
||||
|
||||
Reference in New Issue
Block a user