central config part 1

This commit is contained in:
team2
2026-04-25 23:39:41 +02:00
parent 2797834a5f
commit f42022e5f7
11 changed files with 1197 additions and 476 deletions

View File

@@ -0,0 +1,29 @@
# RetrieX Vocabulary Centralization Fix
This patch centralizes the growing recognition word lists without changing their tuned content.
## Main changes
- Added `config/retriex/vocabulary.yaml`.
- Added `App\Config\DomainVocabularyConfig`.
- Wired the vocabulary facade into:
- `ShopServiceConfig`
- `NdjsonHybridRetrieverConfig`
- `PromptBuilderConfig`
- `CommerceQueryParserConfig`
- Moved the active Shop and Retrieval vocabulary defaults out of `commerce.yaml` and `retrieval.yaml` into `vocabulary.yaml`.
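- Kept all old per-service config keys as explicit overrides: when a service config still defines one of these keys, its value takes precedence over the corresponding central vocabulary view.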
- Removed direct `NdjsonHybridRetrieverConfig::...` constant usage inside `NdjsonHybridRetriever` so effective config getters are used consistently.
## Stability note
The vocabulary views preserve the previous order and content of the tuned lists.
No new semantic terms were added to the critical retrieval and shop matching views.
Required regression baseline:
- `Was ist der niedrigste Grenzwert für die Wasserhärte, welcher mit einem Testomaten überwacht werden kann?`
- expected: `0,02 °dH (Testomat 808)`
- `mit welchem indikator wird der wert gemessen`
- expected: `Indikatortyp 300`
- Store query with `0,02` must preserve the decimal value and must not turn it into `02`.

View File

@@ -16,166 +16,8 @@ parameters:
retriex.shop_matching.config: retriex.shop_matching.config:
top_product_log_limit: 3 top_product_log_limit: 3
device_query_keywords: # Vocabulary-backed lists live in config/retriex/vocabulary.yaml.
- analysegerät # The old per-key entries may still be added here to override a specific view.
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- gerät
- geraet
- geräte
- geraete
- monitor
- monitore
- controller
- gerät für
- geraet fuer
- geräte für
- geraete fuer
- system
- systeme
- anlage
- anlagen
accessory_query_keywords:
- zubehör
- zubehor
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- ersatz
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- consumable
- dazu
- passend
- passende
- passendes
- nachfüll
- nachfuell
- refill
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
accessory_product_keywords:
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- verbrauchsmaterial
- consumable
- zubehör
- zubehor
- ersatz
- ersatzteil
- ersatzteile
- nachfüll
- nachfuell
- refill
- lösung
- loesung
- solution
- teststreifen
- test strip
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
device_product_keywords:
- analysegerät
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- monitor
- monitore
- controller
- online-analysator
- online analysator
- online-analysegerät
- online analysegeraet
- online-analysegeräte
- online analysegeraete
- online analyzer
- online monitor
- system
- systeme
- anlage
- anlagen
- gerät
- geraet
- geräte
- geraete
device_focus_keywords:
- geräte
- geraete
- gerät
- geraet
- analysegerät
- analysegeraet
- messgerät
- messgeraet
- analysator
- controller
- monitor
accessory_focus_keywords:
- indikator
- indikatoren
- reagenz
- reagenzien
- zubehör
- zubehor
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- service set
- serviceset
- filter
- pumpenkopf
- motorblock
accessory_focus_variant_map:
indikator: [indikator, indikatoren]
indikatoren: [indikator, indikatoren]
reagenz: [reagenz, reagenzien]
reagenzien: [reagenz, reagenzien]
ersatzteil: [ersatzteil, ersatzteile]
ersatzteile: [ersatzteil, ersatzteile]
service set: [service set, serviceset, service-set]
serviceset: [service set, serviceset, service-set]
service-set: [service set, serviceset, service-set]
scores: scores:
exact_product_number_phrase: 160 exact_product_number_phrase: 160

View File

@@ -27,141 +27,8 @@ parameters:
focused_product_min_gap: 4.0 focused_product_min_gap: 4.0
focused_product_max_chunks: 4 focused_product_max_chunks: 4
generic_product_tokens: # Vocabulary-backed retrieval token lists live in config/retriex/vocabulary.yaml.
- produkt # The old per-key entries may still be added here to override a specific view.
- produkte
- produktkarte
- titel
- geraet
- gerät
- messgeraet
- messgerät
- wasser
- haerte
- härte
- resthaerte
- resthärte
- analyse
- analysator
- automat
- online
- messung
- messen
- preis
- preise
- kosten
- info
- infos
- passend
- richtige
- richtiges
- geeignet
- geeignete
- welche
- welcher
- welches
- brauche
- suche
important_short_model_tokens: [th, tc, tp, tm, ph, rx]
family_descriptor_tokens:
- evo
- eco
- self
- clean
- mini
- pro
- plus
- basic
- lab
- inline
- compact
- panel
- sc
looks_like_reagent_tokens:
- indikator
- reagenz
- reagens
- laborchemikalie
- chemikalie
- sicherheitsdatenblatt
- sdb
- msds
- ufi
- gebinde
- flasche
- ersatzteil
- zubehoer
- zubehör
- service set
- filtereinsatz
- kerzenfilter
- druckregler
looks_like_safety_docs:
- sicherheitsdatenblatt
- sdb
- msds
- gefahrenbewertung
- gefahrenpiktogramm
- signalwort
- lagerung
- transport
- clp
- kennzeichnung
- h290
- pbt
- vpvb
looks_like_reagent_words:
- indikator
- reagenz
- reagens
- chemie
- chemikalie
- sdb
- sicherheitsdatenblatt
- msds
- flasche
- gebinde
looks_like_document_words:
- datenblatt
- dokument
- pdf
- handbuch
- manual
- beschreibung
- sdb
- sicherheitsdatenblatt
- msds
looks_like_safety_words:
- gefahr
- gefahrgut
- clp
- h290
- sicherheit
- kennzeichnung
- transport
- lagerung
- piktogramm
looks_like_device_words:
- geraet
- gerät
- messgeraet
- messgerät
- analysator
- automat
- messung
- messen
- ueberwachung
- überwachung
- online
- monitor
# Backwards-compatible name for existing config diagnostics. # Backwards-compatible name for existing config diagnostics.
retriex.retrieval.inventory: '%retriex.retrieval.config%' retriex.retrieval.inventory: '%retriex.retrieval.config%'

View File

@@ -0,0 +1,597 @@
# Central domain vocabulary for RetrieX.
# Views preserve the previous 1.4.2-tuned ordering exactly; per-service configs may still override them.
parameters:
retriex.commerce_query.config: {}
retriex.vocabulary.config:
classes:
device:
- analysegerät
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- gerät
- geraet
- geräte
- geraete
- monitor
- monitore
- controller
- system
- systeme
- anlage
- anlagen
accessory:
- zubehör
- zubehor
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- ersatz
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- consumable
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
views:
shop:
device_query:
add:
- analysegerät
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- gerät
- geraet
- geräte
- geraete
- monitor
- monitore
- controller
- gerät für
- geraet fuer
- geräte für
- geraete fuer
- system
- systeme
- anlage
- anlagen
accessory_query:
add:
- zubehör
- zubehor
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- ersatz
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- consumable
- dazu
- passend
- passende
- passendes
- nachfüll
- nachfuell
- refill
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
accessory_product:
add:
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- verbrauchsmaterial
- consumable
- zubehör
- zubehor
- ersatz
- ersatzteil
- ersatzteile
- nachfüll
- nachfuell
- refill
- lösung
- loesung
- solution
- teststreifen
- test strip
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
device_product:
add:
- analysegerät
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- monitor
- monitore
- controller
- online-analysator
- online analysator
- online-analysegerät
- online analysegeraet
- online-analysegeräte
- online analysegeraete
- online analyzer
- online monitor
- system
- systeme
- anlage
- anlagen
- gerät
- geraet
- geräte
- geraete
device_focus:
add:
- geräte
- geraete
- gerät
- geraet
- analysegerät
- analysegeraet
- messgerät
- messgeraet
- analysator
- controller
- monitor
accessory_focus:
add:
- indikator
- indikatoren
- reagenz
- reagenzien
- zubehör
- zubehor
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- service set
- serviceset
- filter
- pumpenkopf
- motorblock
commerce_query:
known_brands:
add:
- heyl
- horiba
- neomeris
phrases_to_remove:
add:
- ich suche
- suche
- habt ihr
- gibt es
- gebe mir
- gib mir
- zeige mir
- welches gerät
- welche gerät
- welches modell
- welches ist besser
- welches ist am besten
- alternative
- alternativen
- unter anderem
- u a
- welche
- welcher
- welches
- welchen
- sind
- ist
- geeignet
- geeigent
- verfügbarkeit
- verfuegbarkeit
filter_search_tokens:
add:
- auch
- noch
- nochmal
- zusätzlich
- dazu
- davon
- stattdessen
- bitte
- gern
- gerne
- zeige
- zeig
- such
- suche
- finde
- find
- mir
- mal
- von
- im
- in
- für
- fuer
- welche
- welcher
- welches
- welchen
- sind
- ist
- geeignet
- geeigent
- verfügbarkeit
- verfuegbarkeit
- prüfe
- pruefe
- den
- die
- das
- der
- dem
- des
- und
- oder
- sowie
- seine
- seinen
- seiner
- seinem
- seines
- siene
- sienen
- siener
- sienem
- sienes
- gebe
- gib
- nenne
- nenn
- preis
- preise
- preisen
- kostet
- kosten
- ua
- also
- gut
- gute
- guten
- guter
- gutes
- passen
- passend
semantic_shop_search_tokens:
add:
- indikator
- indicator
- reagenz
- reagent
- zubehör
- zubehor
- ersatzteil
- verbrauchsmaterial
- chemie
- indikatorchemie
- reagenzchemie
- kit
- set
- filter
- pumpe
- pumpenkopf
- motorblock
- lösung
- loesung
- solution
- teststreifen
- gerät
- geraet
- messgerät
- messgeraet
- analysegerät
- analysegeraet
- analysator
- monitor
- controller
- system
retrieval:
generic_product_tokens:
add:
- produkt
- produkte
- produktkarte
- titel
- geraet
- gerät
- messgeraet
- messgerät
- wasser
- haerte
- härte
- resthaerte
- resthärte
- analyse
- analysator
- automat
- online
- messung
- messen
- preis
- preise
- kosten
- info
- infos
- passend
- richtige
- richtiges
- geeignet
- geeignete
- welche
- welcher
- welches
- brauche
- suche
important_short_model_tokens:
add:
- th
- tc
- tp
- tm
- ph
- rx
family_descriptor_tokens:
add:
- evo
- eco
- self
- clean
- mini
- pro
- plus
- basic
- lab
- inline
- compact
- panel
- sc
looks_like_reagent_tokens:
add:
- indikator
- reagenz
- reagens
- laborchemikalie
- chemikalie
- sicherheitsdatenblatt
- sdb
- msds
- ufi
- gebinde
- flasche
- ersatzteil
- zubehoer
- zubehör
- service set
- filtereinsatz
- kerzenfilter
- druckregler
looks_like_safety_docs:
add:
- sicherheitsdatenblatt
- sdb
- msds
- gefahrenbewertung
- gefahrenpiktogramm
- signalwort
- lagerung
- transport
- clp
- kennzeichnung
- h290
- pbt
- vpvb
looks_like_reagent_words:
add:
- indikator
- reagenz
- reagens
- chemie
- chemikalie
- sdb
- sicherheitsdatenblatt
- msds
- flasche
- gebinde
looks_like_document_words:
add:
- datenblatt
- dokument
- pdf
- handbuch
- manual
- beschreibung
- sdb
- sicherheitsdatenblatt
- msds
looks_like_safety_words:
add:
- gefahr
- gefahrgut
- clp
- h290
- sicherheit
- kennzeichnung
- transport
- lagerung
- piktogramm
looks_like_device_words:
add:
- geraet
- gerät
- messgeraet
- messgerät
- analysator
- automat
- messung
- messen
- ueberwachung
- überwachung
- online
- monitor
prompt:
technical_product_keywords:
add:
- technisch
- technical
- produkt
- product
- gerät
- device
- modell
- model
- messprinzip
- measurement principle
- schnittstelle
- interface
- relais
- relay
- indikator
- indicator
- grenzwert
- threshold
- messbereich
- measurement range
- minimaler
- minimum
- resthärte
- resthaerte
- °dh
- dh
- spannung
- voltage
- strom
- current
- druck
- pressure
- temperatur
- temperature
- schutzart
- ip
- fehlercode
- error code
- wasserhärte
- hardness
- testomat
- chlor
- chlormessung
accessory_request_keywords:
add:
- passend
- passende
- passendes
- zubehör
- zubehor
- dazu
- indikator
- reagenz
- kit
- set
- zusatz
- ergänzung
- ergaenzung
maps:
shop:
accessory_focus_variants:
indikator:
- indikator
- indikatoren
indikatoren:
- indikator
- indikatoren
reagenz:
- reagenz
- reagenzien
reagenzien:
- reagenz
- reagenzien
ersatzteil:
- ersatzteil
- ersatzteile
ersatzteile:
- ersatzteil
- ersatzteile
service set:
- service set
- serviceset
- service-set
serviceset:
- service set
- serviceset
- service-set
service-set:
- service set
- serviceset
- service-set
commerce_query:
search_token_corrections:
siene: seine
sienen: seinen
siener: seiner
sienem: seinem
sienes: seines
indicatoren: indikatoren
search_token_canonical:
indikatoren: indikator
indicators: indikator
indicator: indikator
reagenzien: reagenz
reagents: reagenz
reagent: reagenz
produkte: produkt

View File

@@ -9,6 +9,7 @@ imports:
- { resource: 'retriex/retrieval.yaml' } - { resource: 'retriex/retrieval.yaml' }
- { resource: 'retriex/language.yaml' } - { resource: 'retriex/language.yaml' }
- { resource: 'retriex/query_enrichment.yaml' } - { resource: 'retriex/query_enrichment.yaml' }
- { resource: 'retriex/vocabulary.yaml' }
# ------------------------------------------------------------ # ------------------------------------------------------------
# Parameters # Parameters
@@ -112,9 +113,14 @@ services:
$retrievalMaxChunks: '%retriex.model.default_retrieval_max_chunks%' $retrievalMaxChunks: '%retriex.model.default_retrieval_max_chunks%'
$retrievalVectorTopK: '%retriex.model.default_retrieval_vector_top_k%' $retrievalVectorTopK: '%retriex.model.default_retrieval_vector_top_k%'
App\Config\DomainVocabularyConfig:
arguments:
$config: '%retriex.vocabulary.config%'
App\Config\PromptBuilderConfig: App\Config\PromptBuilderConfig:
arguments: arguments:
$config: '%retriex.prompt.config%' $config: '%retriex.prompt.config%'
$vocabulary: '@App\Config\DomainVocabularyConfig'
App\Config\AgentRunnerConfig: App\Config\AgentRunnerConfig:
arguments: arguments:
@@ -123,7 +129,7 @@ services:
App\Config\NdjsonHybridRetrieverConfig: App\Config\NdjsonHybridRetrieverConfig:
arguments: arguments:
$config: '%retriex.retrieval.config%' $config: '%retriex.retrieval.config%'
$vocabulary: '%retriex.retrieval.config%' $vocabulary: '@App\Config\DomainVocabularyConfig'
App\Config\StopWordsConfig: App\Config\StopWordsConfig:
arguments: arguments:
@@ -136,6 +142,7 @@ services:
App\Config\ShopServiceConfig: App\Config\ShopServiceConfig:
arguments: arguments:
$config: '%retriex.shop_matching.config%' $config: '%retriex.shop_matching.config%'
$vocabulary: '@App\Config\DomainVocabularyConfig'
App\Infrastructure\OllamaClient: App\Infrastructure\OllamaClient:
arguments: arguments:
@@ -176,6 +183,11 @@ services:
App\Intent\CommerceIntentLite: ~ App\Intent\CommerceIntentLite: ~
App\Config\CommerceQueryParserConfig:
arguments:
$config: '%retriex.commerce_query.config%'
$vocabulary: '@App\Config\DomainVocabularyConfig'
App\Commerce\CommerceQueryParser: ~ App\Commerce\CommerceQueryParser: ~
App\Config\SearchRepairConfig: App\Config\SearchRepairConfig:

View File

@@ -6,24 +6,13 @@ namespace App\Config;
final class CommerceQueryParserConfig final class CommerceQueryParserConfig
{ {
/** private const KNOWN_BRANDS = [
* @return string[]
*/
public function getKnownBrands(): array
{
return [
'heyl', 'heyl',
'horiba', 'horiba',
'neomeris', 'neomeris',
]; ];
}
/** private const PHRASES_TO_REMOVE = [
* @return string[]
*/
public function getPhrasesToRemove(): array
{
return [
'ich suche', 'ich suche',
'suche', 'suche',
'habt ihr', 'habt ihr',
@@ -51,24 +40,8 @@ final class CommerceQueryParserConfig
'verfügbarkeit', 'verfügbarkeit',
'verfuegbarkeit', 'verfuegbarkeit',
]; ];
}
public function getHistoryContextPattern(): string private const FILTER_SEARCH_TOKENS = [
{
return 'chat|auch|noch|nochmal|zusätzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt';
}
public function getHistoryContextValuePattern(): string
{
return '/\b(' . $this->getHistoryContextPattern() . ')\b/u';
}
/**
* @return string[]
*/
public function getFilterSearchTokens(): array
{
return [
'auch', 'auch',
'noch', 'noch',
'nochmal', 'nochmal',
@@ -142,14 +115,8 @@ final class CommerceQueryParserConfig
'passen', 'passen',
'passend', 'passend',
]; ];
}
/** private const SEARCH_TOKEN_CORRECTIONS = [
* @return array<string, string>
*/
public function getSearchTokenCorrections(): array
{
return [
'siene' => 'seine', 'siene' => 'seine',
'sienen' => 'seinen', 'sienen' => 'seinen',
'siener' => 'seiner', 'siener' => 'seiner',
@@ -157,14 +124,8 @@ final class CommerceQueryParserConfig
'sienes' => 'seines', 'sienes' => 'seines',
'indicatoren' => 'indikatoren', 'indicatoren' => 'indikatoren',
]; ];
}
/** private const SEARCH_TOKEN_CANONICAL_MAP = [
* @return array<string, string>
*/
public function getSearchTokenCanonicalMap(): array
{
return [
'indikatoren' => 'indikator', 'indikatoren' => 'indikator',
'indicators' => 'indikator', 'indicators' => 'indikator',
'indicator' => 'indikator', 'indicator' => 'indikator',
@@ -173,6 +134,113 @@ final class CommerceQueryParserConfig
'reagent' => 'reagenz', 'reagent' => 'reagenz',
'produkte' => 'produkt', 'produkte' => 'produkt',
]; ];
/**
* Built-in term list handed to the vocabulary lookup as the fallback for the
* "commerce_query.semantic_shop_search_tokens" view in getSemanticShopSearchTokens().
* The entries and their order mirror that view in config/retriex/vocabulary.yaml.
*/
private const SEMANTIC_SHOP_SEARCH_TOKENS = [
'indikator',
'indicator',
'reagenz',
'reagent',
'zubehör',
'zubehor',
'ersatzteil',
'verbrauchsmaterial',
'chemie',
'indikatorchemie',
'reagenzchemie',
'kit',
'set',
'filter',
'pumpe',
'pumpenkopf',
'motorblock',
'lösung',
'loesung',
'solution',
'teststreifen',
'gerät',
'geraet',
'messgerät',
'messgeraet',
'analysegerät',
'analysegeraet',
'analysator',
'monitor',
'controller',
'system',
];
/**
* @param array<string, mixed> $config Per-service settings; a key found here is used in
*                                     preference to the vocabulary-backed default.
* @param DomainVocabularyConfig|null $vocabulary Central vocabulary facade; when null the
*                                                class constants are used as defaults.
*/
public function __construct(
private readonly array $config = [],
private readonly ?DomainVocabularyConfig $vocabulary = null,
) {
}
/**
 * Effective list of known brand names.
 *
 * The "known_brands" key of the service config wins when present; otherwise the
 * "commerce_query.known_brands" vocabulary view (or the built-in constant) is returned.
 *
 * @return string[]
 */
public function getKnownBrands(): array
{
    $vocabularyDefault = $this->vocabularyView('commerce_query.known_brands', self::KNOWN_BRANDS);

    return $this->stringList('known_brands', $vocabularyDefault);
}
/**
 * Effective list of phrases to remove.
 *
 * The "phrases_to_remove" key of the service config wins when present; otherwise the
 * "commerce_query.phrases_to_remove" vocabulary view (or the built-in constant) is returned.
 *
 * @return string[]
 */
public function getPhrasesToRemove(): array
{
    $vocabularyDefault = $this->vocabularyView('commerce_query.phrases_to_remove', self::PHRASES_TO_REMOVE);

    return $this->stringList('phrases_to_remove', $vocabularyDefault);
}
/**
* Returns a regex alternation fragment (no delimiters, no surrounding group) listing the
* words and phrases that getHistoryContextValuePattern() wraps into a full pattern.
* NOTE(review): judging by the name these mark a reference to earlier chat context - confirm
* against the parser that consumes the pattern.
*/
public function getHistoryContextPattern(): string
{
return 'chat|auch|noch|nochmal|zusätzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt';
}
/**
 * Wraps the history-context alternation in a capturing group bounded by word
 * boundaries and returns it as a complete, Unicode-aware (/u) PCRE pattern.
 */
public function getHistoryContextValuePattern(): string
{
    $alternation = $this->getHistoryContextPattern();

    return sprintf('/\b(%s)\b/u', $alternation);
}
/**
 * Effective list of filter search tokens.
 *
 * The "filter_search_tokens" key of the service config wins when present; otherwise the
 * "commerce_query.filter_search_tokens" vocabulary view (or the built-in constant) is returned.
 *
 * @return string[]
 */
public function getFilterSearchTokens(): array
{
    $vocabularyDefault = $this->vocabularyView('commerce_query.filter_search_tokens', self::FILTER_SEARCH_TOKENS);

    return $this->stringList('filter_search_tokens', $vocabularyDefault);
}
/**
 * Effective token => replacement map of search token corrections.
 *
 * The "search_token_corrections" key of the service config wins when present; otherwise the
 * "commerce_query.search_token_corrections" vocabulary map (or the built-in constant) is returned.
 *
 * @return array<string, string>
 */
public function getSearchTokenCorrections(): array
{
    $vocabularyDefault = $this->vocabularyStringMap(
        'commerce_query.search_token_corrections',
        self::SEARCH_TOKEN_CORRECTIONS
    );

    return $this->stringMap('search_token_corrections', $vocabularyDefault);
}
/**
* @return array<string, string>
*/
public function getSearchTokenCanonicalMap(): array
{
return $this->stringMap(
'search_token_canonical_map',
$this->vocabularyStringMap('commerce_query.search_token_canonical', self::SEARCH_TOKEN_CANONICAL_MAP)
);
} }
/** /**
@@ -335,39 +403,86 @@ final class CommerceQueryParserConfig
*/ */
public function getSemanticShopSearchTokens(): array public function getSemanticShopSearchTokens(): array
{ {
return [ return $this->stringList(
'indikator', 'semantic_shop_search_tokens',
'indicator', $this->vocabularyView('commerce_query.semantic_shop_search_tokens', self::SEMANTIC_SHOP_SEARCH_TOKENS)
'reagenz', );
'reagent', }
'zubehör',
'zubehor',
'ersatzteil', /** @return string[] */
'verbrauchsmaterial', private function vocabularyView(string $path, array $fallback): array
'chemie', {
'indikatorchemie', return $this->vocabulary?->view($path, $fallback) ?? $fallback;
'reagenzchemie', }
'kit',
'set', /** @return array<string, string> */
'filter', private function vocabularyStringMap(string $path, array $fallback): array
'pumpe', {
'pumpenkopf', return $this->vocabulary?->stringMap($path, $fallback) ?? $fallback;
'motorblock', }
'lösung',
'loesung', /** @return string[] */
'solution', private function stringList(string $path, array $default): array
'teststreifen', {
'gerät', $value = $this->value($path, $default);
'geraet', if (!is_array($value)) {
'messgerät', return $default;
'messgeraet', }
'analysegerät',
'analysegeraet', $out = [];
'analysator', foreach ($value as $item) {
'monitor', if (!is_scalar($item)) {
'controller', continue;
'system', }
];
$item = trim((string) $item);
if ($item === '' || in_array($item, $out, true)) {
continue;
}
$out[] = $item;
}
return $out !== [] ? $out : $default;
}
/** @return array<string, string> */
private function stringMap(string $path, array $default): array
{
$value = $this->value($path, $default);
if (!is_array($value)) {
return $default;
}
$out = [];
foreach ($value as $key => $item) {
if (!is_scalar($key) || !is_scalar($item)) {
continue;
}
$cleanKey = trim((string) $key);
$cleanValue = trim((string) $item);
if ($cleanKey !== '' && $cleanValue !== '') {
$out[$cleanKey] = $cleanValue;
}
}
return $out !== [] ? $out : $default;
}
private function value(string $path, mixed $default): mixed
{
$current = $this->config;
foreach (explode('.', $path) as $segment) {
if (!is_array($current) || !array_key_exists($segment, $current)) {
return $default;
}
$current = $current[$segment];
}
return $current;
} }
public function buildExactTokenRemovalPattern(string $token): string public function buildExactTokenRemovalPattern(string $token): string

View File

@@ -0,0 +1,196 @@
<?php
declare(strict_types=1);
namespace App\Config;
/**
 * Read-only facade over the central RetrieX domain vocabulary.
 *
 * The wrapped array is read under three top-level keys:
 *  - "classes": named term lists that a view can pull in through "include";
 *  - "views":   term lists addressed by a dotted path such as "shop.device_query";
 *  - "maps":    keyed lookups, either key => term list or key => single term.
 *
 * Every accessor trims entries, skips non-scalar and blank ones, keeps the first-seen
 * order, and returns the (cleaned) caller-supplied fallback whenever the configured
 * value is missing or ends up empty.
 */
final class DomainVocabularyConfig
{
    /**
     * @param array<string, mixed> $config Raw vocabulary tree as loaded from configuration.
     */
    public function __construct(private readonly array $config = [])
    {
    }

    /**
     * Resolves the term list of a view.
     *
     * A view definition is either
     *  - a map with optional "include" (names of entries under "classes", expanded first)
     *    and optional "add" (literal terms, appended afterwards), or
     *  - a plain, non-empty list of terms, treated as shorthand for an "add"-only view.
     *
     * Duplicates are dropped, keeping the first occurrence.
     *
     * @param string   $path     Dotted path below "views", e.g. "retrieval.generic_product_tokens".
     * @param string[] $fallback Returned (cleaned) when the view is missing or yields no terms.
     *
     * @return string[]
     */
    public function view(string $path, array $fallback = []): array
    {
        $definition = $this->value('views.' . $path, null);
        if (!is_array($definition)) {
            return $this->uniqueStringList($fallback);
        }

        if ($definition !== [] && array_is_list($definition)) {
            // List shorthand: without this branch a view written as a bare list has neither
            // "include" nor "add" and would be silently replaced by the fallback.
            $terms = $this->uniqueStringList($definition);
        } else {
            $terms = [];
            foreach ($this->stringListFromValue($definition['include'] ?? []) as $className) {
                $terms = [...$terms, ...$this->domainClass($className)];
            }
            $terms = $this->uniqueStringList(
                [...$terms, ...$this->stringListFromValue($definition['add'] ?? [])]
            );
        }

        return $terms !== [] ? $terms : $this->uniqueStringList($fallback);
    }

    /**
     * Returns the terms of a named class, or an empty list when the class is unknown or empty.
     *
     * @param string $name Key below "classes".
     *
     * @return string[]
     */
    public function domainClass(string $name): array
    {
        return $this->stringList('classes.' . $name, []);
    }

    /**
     * Resolves a key => term-list map.
     *
     * Entries with a blank key or without any usable term are dropped.
     *
     * @param string                  $path     Dotted path below "maps".
     * @param array<string, string[]> $fallback Returned (cleaned) when the map is missing or empty.
     *
     * @return array<string, string[]>
     */
    public function map(string $path, array $fallback = []): array
    {
        $value = $this->value('maps.' . $path, null);
        if (!is_array($value)) {
            return $this->uniqueStringListMap($fallback);
        }

        $out = $this->uniqueStringListMap($value);

        return $out !== [] ? $out : $this->uniqueStringListMap($fallback);
    }

    /**
     * Resolves a key => single-term map.
     *
     * A configured value that is itself a list collapses to its first usable term.
     * Entries with a blank key or blank value are dropped.
     *
     * @param string                $path     Dotted path below "maps".
     * @param array<string, string> $fallback Returned (cleaned) when the map is missing or empty.
     *
     * @return array<string, string>
     */
    public function stringMap(string $path, array $fallback = []): array
    {
        $value = $this->value('maps.' . $path, null);
        if (!is_array($value)) {
            return $this->uniqueStringMap($fallback);
        }

        $out = [];
        foreach ($value as $key => $mappedValue) {
            // Array keys are always int|string, so the cast is safe without a type check.
            $cleanKey = trim((string) $key);
            if ($cleanKey === '') {
                continue;
            }

            if (is_array($mappedValue)) {
                $mappedValue = $this->stringListFromValue($mappedValue)[0] ?? '';
            }

            if (!is_scalar($mappedValue)) {
                continue;
            }

            $cleanValue = trim((string) $mappedValue);
            if ($cleanValue !== '') {
                $out[$cleanKey] = $cleanValue;
            }
        }

        return $out !== [] ? $out : $this->uniqueStringMap($fallback);
    }

    /**
     * Returns the raw, unprocessed configuration array.
     *
     * @return array<string, mixed>
     */
    public function toArray(): array
    {
        return $this->config;
    }

    /**
     * Reads a term list at an absolute dotted path, falling back when it is missing or empty.
     *
     * @param string[] $fallback
     *
     * @return string[]
     */
    private function stringList(string $path, array $fallback): array
    {
        $items = $this->stringListFromValue($this->value($path, null));

        return $items !== [] ? $items : $this->uniqueStringList($fallback);
    }

    /**
     * Cleans an arbitrary value into a term list; anything that is not an array yields [].
     *
     * @return string[]
     */
    private function stringListFromValue(mixed $value): array
    {
        return is_array($value) ? $this->uniqueStringList($value) : [];
    }

    /**
     * Casts scalar items to trimmed strings, dropping non-scalars, blanks and repeats.
     *
     * Order of first occurrence is preserved. Membership is tracked in a keyed set so the
     * pass stays linear even for long lists.
     *
     * @param array<array-key, mixed> $items
     *
     * @return string[]
     */
    private function uniqueStringList(array $items): array
    {
        $out = [];
        $seen = [];
        foreach ($items as $item) {
            if (!is_scalar($item)) {
                continue;
            }

            $term = trim((string) $item);
            if ($term === '' || isset($seen[$term])) {
                continue;
            }

            $seen[$term] = true;
            $out[] = $term;
        }

        return $out;
    }

    /**
     * Cleans a key => list map: keys are trimmed, values become cleaned term lists, and
     * entries with a blank key or an empty list are dropped.
     *
     * @param array<array-key, mixed> $map
     *
     * @return array<string, string[]>
     */
    private function uniqueStringListMap(array $map): array
    {
        $out = [];
        foreach ($map as $key => $items) {
            $cleanKey = trim((string) $key);
            $cleanItems = $this->stringListFromValue($items);
            if ($cleanKey !== '' && $cleanItems !== []) {
                $out[$cleanKey] = $cleanItems;
            }
        }

        return $out;
    }

    /**
     * Cleans a key => scalar map: keys and values are trimmed, and entries with a
     * non-scalar value, a blank key or a blank value are dropped.
     *
     * @param array<array-key, mixed> $map
     *
     * @return array<string, string>
     */
    private function uniqueStringMap(array $map): array
    {
        $out = [];
        foreach ($map as $key => $value) {
            if (!is_scalar($value)) {
                continue;
            }

            $cleanKey = trim((string) $key);
            $cleanValue = trim((string) $value);
            if ($cleanKey !== '' && $cleanValue !== '') {
                $out[$cleanKey] = $cleanValue;
            }
        }

        return $out;
    }

    /**
     * Walks the configuration along a dot-separated path.
     *
     * Returns $fallback as soon as a segment is missing or the current node is not an array.
     * Keys that themselves contain a dot cannot be addressed this way.
     */
    private function value(string $path, mixed $fallback): mixed
    {
        $current = $this->config;
        foreach (explode('.', $path) as $segment) {
            if (!is_array($current) || !array_key_exists($segment, $current)) {
                return $fallback;
            }

            $current = $current[$segment];
        }

        return $current;
    }
}

View File

@@ -176,11 +176,10 @@ final class NdjsonHybridRetrieverConfig
/** /**
* @param array<string, mixed> $config * @param array<string, mixed> $config
* @param array<string, mixed> $vocabulary Kept for backwards-compatible service wiring.
*/ */
public function __construct( public function __construct(
private array $config = [], private array $config = [],
private array $vocabulary = [], private readonly ?DomainVocabularyConfig $vocabulary = null,
) { ) {
} }
@@ -307,55 +306,55 @@ final class NdjsonHybridRetrieverConfig
/** @return string[] */ /** @return string[] */
public function genericProductTokens(): array public function genericProductTokens(): array
{ {
return $this->stringList('generic_product_tokens', self::GENERIC_PRODUCT_TOKEN); return $this->stringList('generic_product_tokens', $this->vocabularyView('retrieval.generic_product_tokens', self::GENERIC_PRODUCT_TOKEN));
} }
/** @return string[] */ /** @return string[] */
public function importantShortModelTokens(): array public function importantShortModelTokens(): array
{ {
return $this->stringList('important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN); return $this->stringList('important_short_model_tokens', $this->vocabularyView('retrieval.important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN));
} }
/** @return string[] */ /** @return string[] */
public function familyDescriptorTokens(): array public function familyDescriptorTokens(): array
{ {
return $this->stringList('family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN); return $this->stringList('family_descriptor_tokens', $this->vocabularyView('retrieval.family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN));
} }
/** @return string[] */ /** @return string[] */
public function looksLikeReagentTokens(): array public function looksLikeReagentTokens(): array
{ {
return $this->stringList('looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS); return $this->stringList('looks_like_reagent_tokens', $this->vocabularyView('retrieval.looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS));
} }
/** @return string[] */ /** @return string[] */
public function looksLikeSafetyDocs(): array public function looksLikeSafetyDocs(): array
{ {
return $this->stringList('looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS); return $this->stringList('looks_like_safety_docs', $this->vocabularyView('retrieval.looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS));
} }
/** @return string[] */ /** @return string[] */
public function looksLikeReagentWords(): array public function looksLikeReagentWords(): array
{ {
return $this->stringList('looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS); return $this->stringList('looks_like_reagent_words', $this->vocabularyView('retrieval.looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS));
} }
/** @return string[] */ /** @return string[] */
public function looksLikeDocumentWords(): array public function looksLikeDocumentWords(): array
{ {
return $this->stringList('looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS); return $this->stringList('looks_like_document_words', $this->vocabularyView('retrieval.looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS));
} }
/** @return string[] */ /** @return string[] */
public function looksLikeSafetyWords(): array public function looksLikeSafetyWords(): array
{ {
return $this->stringList('looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS); return $this->stringList('looks_like_safety_words', $this->vocabularyView('retrieval.looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS));
} }
/** @return string[] */ /** @return string[] */
public function looksLikeDeviceWords(): array public function looksLikeDeviceWords(): array
{ {
return $this->stringList('looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS); return $this->stringList('looks_like_device_words', $this->vocabularyView('retrieval.looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS));
} }
/** /**
* Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps. * Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps.
@@ -459,6 +458,12 @@ final class NdjsonHybridRetrieverConfig
* @param string[] $default * @param string[] $default
* @return string[] * @return string[]
*/ */
/**
 * Looks up a view in the central vocabulary, or hands back $fallback unchanged
 * when no vocabulary service was injected.
 *
 * @param string[] $fallback
 *
 * @return string[]
 */
private function vocabularyView(string $path, array $fallback): array
{
    if ($this->vocabulary === null) {
        return $fallback;
    }

    return $this->vocabulary->view($path, $fallback);
}
private function stringList(string $key, array $default): array private function stringList(string $key, array $default): array
{ {
$value = $this->raw($key, $default); $value = $this->raw($key, $default);
@@ -492,10 +497,6 @@ final class NdjsonHybridRetrieverConfig
return $this->config[$key]; return $this->config[$key];
} }
if (array_key_exists($key, $this->vocabulary)) {
return $this->vocabulary[$key];
}
return $default; return $default;
} }
} }

View File

@@ -6,11 +6,74 @@ namespace App\Config;
final class PromptBuilderConfig final class PromptBuilderConfig
{ {
/**
* Built-in keyword list handed to the vocabulary lookup as the fallback for the
* "prompt.technical_product_keywords" view in getTechnicalProductKeywords().
* The entries and their order mirror that view in config/retriex/vocabulary.yaml.
*/
private const TECHNICAL_PRODUCT_KEYWORDS = [
'technisch',
'technical',
'produkt',
'product',
'gerät',
'device',
'modell',
'model',
'messprinzip',
'measurement principle',
'schnittstelle',
'interface',
'relais',
'relay',
'indikator',
'indicator',
'grenzwert',
'threshold',
'messbereich',
'measurement range',
'minimaler',
'minimum',
'resthärte',
'resthaerte',
'°dh',
'dh',
'spannung',
'voltage',
'strom',
'current',
'druck',
'pressure',
'temperatur',
'temperature',
'schutzart',
'ip',
'fehlercode',
'error code',
'wasserhärte',
'hardness',
'testomat',
'chlor',
'chlormessung',
];
/**
* Built-in accessory-request keyword list. The entries and their order mirror the
* "prompt.accessory_request_keywords" view in config/retriex/vocabulary.yaml.
* NOTE(review): presumably used as the fallback for that view - the consuming getter is
* not visible in this hunk, confirm.
*/
private const ACCESSORY_REQUEST_KEYWORDS = [
'passend',
'passende',
'passendes',
'zubehör',
'zubehor',
'dazu',
'indikator',
'reagenz',
'kit',
'set',
'zusatz',
'ergänzung',
'ergaenzung',
];
/** /**
* @param array<string, mixed> $config * @param array<string, mixed> $config
*/ */
public function __construct( public function __construct(
private readonly array $config = [], private readonly array $config = [],
private readonly ?DomainVocabularyConfig $vocabulary = null,
) { ) {
} }
@@ -88,6 +151,42 @@ final class PromptBuilderConfig
return is_numeric($value) ? (float) $value : $default; return is_numeric($value) ? (float) $value : $default;
} }
/**
 * Reads a config value and normalizes it into a list of unique, non-empty strings.
 *
 * Non-scalar entries are dropped, the remaining ones are cast to string and
 * trimmed, empty strings are discarded, and duplicates are removed while the
 * first occurrence keeps its position. If the configured value is not an
 * array, or nothing survives normalization, $default is returned unchanged.
 *
 * @param string   $path    config path handed to getValue()
 * @param string[] $default list used when no usable configured list exists
 * @return string[]
 */
private function getStringList(string $path, array $default): array
{
    $configured = $this->getValue($path, $default);
    if (!is_array($configured)) {
        return $default;
    }

    // Keep scalars only, then stringify and trim each entry.
    $trimmed = array_map(
        static fn (mixed $entry): string => trim((string) $entry),
        array_filter($configured, is_scalar(...)),
    );

    // Drop blanks, then de-duplicate by exact string value (first occurrence wins).
    $nonEmpty = array_filter($trimmed, static fn (string $entry): bool => $entry !== '');
    $unique = array_values(array_unique($nonEmpty, SORT_STRING));

    return $unique === [] ? $default : $unique;
}
/**
 * Fetches a vocabulary view for the given path, or the fallback list.
 *
 * The fallback is used both when no vocabulary object was injected and when
 * the vocabulary's view() call returns null.
 *
 * @param string   $path     vocabulary view path, e.g. 'prompt.technical_product_keywords'
 * @param string[] $fallback list to use when the view cannot be resolved
 * @return string[]
 */
private function vocabularyView(string $path, array $fallback): array
{
    $vocabulary = $this->vocabulary;

    return $vocabulary === null
        ? $fallback
        : ($vocabulary->view($path, $fallback) ?? $fallback);
}
private function getValue(string $path, mixed $default): mixed private function getValue(string $path, mixed $default): mixed
{ {
$current = $this->config; $current = $this->config;
@@ -445,51 +544,10 @@ final class PromptBuilderConfig
*/ */
public function getTechnicalProductKeywords(): array public function getTechnicalProductKeywords(): array
{ {
return [ return $this->getStringList(
'technisch', 'technical_product_keywords',
'technical', $this->vocabularyView('prompt.technical_product_keywords', self::TECHNICAL_PRODUCT_KEYWORDS)
'produkt', );
'product',
'gerät',
'device',
'modell',
'model',
'messprinzip',
'measurement principle',
'schnittstelle',
'interface',
'relais',
'relay',
'indikator',
'indicator',
'grenzwert',
'threshold',
'messbereich',
'measurement range',
'minimaler',
'minimum',
'resthärte',
'resthaerte',
'°dh',
'dh',
'spannung',
'voltage',
'strom',
'current',
'druck',
'pressure',
'temperatur',
'temperature',
'schutzart',
'ip',
'fehlercode',
'error code',
'wasserhärte',
'hardness',
'testomat',
'chlor',
'chlormessung',
];
} }
/** /**
@@ -497,21 +555,10 @@ final class PromptBuilderConfig
*/ */
public function getAccessoryRequestKeywords(): array public function getAccessoryRequestKeywords(): array
{ {
return [ return $this->getStringList(
'passend', 'accessory_request_keywords',
'passende', $this->vocabularyView('prompt.accessory_request_keywords', self::ACCESSORY_REQUEST_KEYWORDS)
'passendes', );
'zubehör',
'zubehor',
'dazu',
'indikator',
'reagenz',
'kit',
'set',
'zusatz',
'ergänzung',
'ergaenzung',
];
} }
public function getTechnicalProductModelPattern(): string public function getTechnicalProductModelPattern(): string

View File

@@ -68,8 +68,10 @@ final class ShopServiceConfig
/** /**
* @param array<string, mixed> $config * @param array<string, mixed> $config
*/ */
public function __construct(private array $config = []) public function __construct(
{ private array $config = [],
private readonly ?DomainVocabularyConfig $vocabulary = null,
) {
} }
public function getTopProductLogLimit(): int public function getTopProductLogLimit(): int
@@ -80,43 +82,43 @@ final class ShopServiceConfig
/** @return string[] */ /** @return string[] */
public function getDeviceFocusKeywords(): array public function getDeviceFocusKeywords(): array
{ {
return $this->stringList('device_focus_keywords', self::DEVICE_FOCUS_KEYWORDS); return $this->stringList('device_focus_keywords', $this->vocabularyView('shop.device_focus', self::DEVICE_FOCUS_KEYWORDS));
} }
/** @return string[] */ /** @return string[] */
public function getAccessoryFocusKeywords(): array public function getAccessoryFocusKeywords(): array
{ {
return $this->stringList('accessory_focus_keywords', self::ACCESSORY_FOCUS_KEYWORDS); return $this->stringList('accessory_focus_keywords', $this->vocabularyView('shop.accessory_focus', self::ACCESSORY_FOCUS_KEYWORDS));
} }
/** @return array<string, string[]> */ /** @return array<string, string[]> */
public function getAccessoryFocusVariantMap(): array public function getAccessoryFocusVariantMap(): array
{ {
return $this->stringListMap('accessory_focus_variant_map', self::ACCESSORY_FOCUS_VARIANT_MAP); return $this->stringListMap('accessory_focus_variant_map', $this->vocabularyMap('shop.accessory_focus_variants', self::ACCESSORY_FOCUS_VARIANT_MAP));
} }
/** @return string[] */ /** @return string[] */
public function getDeviceQueryKeywords(): array public function getDeviceQueryKeywords(): array
{ {
return $this->stringList('device_query_keywords', self::DEVICE_QUERY_KEYWORDS); return $this->stringList('device_query_keywords', $this->vocabularyView('shop.device_query', self::DEVICE_QUERY_KEYWORDS));
} }
/** @return string[] */ /** @return string[] */
public function getAccessoryQueryKeywords(): array public function getAccessoryQueryKeywords(): array
{ {
return $this->stringList('accessory_query_keywords', self::ACCESSORY_QUERY_KEYWORDS); return $this->stringList('accessory_query_keywords', $this->vocabularyView('shop.accessory_query', self::ACCESSORY_QUERY_KEYWORDS));
} }
/** @return string[] */ /** @return string[] */
public function getAccessoryProductKeywords(): array public function getAccessoryProductKeywords(): array
{ {
return $this->stringList('accessory_product_keywords', self::ACCESSORY_PRODUCT_KEYWORDS); return $this->stringList('accessory_product_keywords', $this->vocabularyView('shop.accessory_product', self::ACCESSORY_PRODUCT_KEYWORDS));
} }
/** @return string[] */ /** @return string[] */
public function getDeviceProductKeywords(): array public function getDeviceProductKeywords(): array
{ {
return $this->stringList('device_product_keywords', self::DEVICE_PRODUCT_KEYWORDS); return $this->stringList('device_product_keywords', $this->vocabularyView('shop.device_product', self::DEVICE_PRODUCT_KEYWORDS));
} }
public function getExactProductNumberPhraseScore(): int public function getExactProductNumberPhraseScore(): int
@@ -368,6 +370,18 @@ final class ShopServiceConfig
* @param string[]|null $emptySafeDefault * @param string[]|null $emptySafeDefault
* @return string[] * @return string[]
*/ */
/**
 * Resolves a flat vocabulary view by path, degrading to the supplied fallback list.
 *
 * The fallback is returned when no vocabulary object was injected, or when
 * the vocabulary's view() call yields null.
 *
 * NOTE(review): this helper is declared directly beneath the docblock that
 * documents stringList() (the block mentioning $emptySafeDefault), which
 * detaches that docblock from stringList() — consider moving this helper.
 *
 * @param string   $path     vocabulary view path, e.g. 'shop.device_focus'
 * @param string[] $fallback list to use when the view cannot be resolved
 * @return string[]
 */
private function vocabularyView(string $path, array $fallback): array
{
    $vocabulary = $this->vocabulary;
    if ($vocabulary === null) {
        return $fallback;
    }

    return $vocabulary->view($path, $fallback) ?? $fallback;
}
/**
 * Resolves a keyed vocabulary map by path, degrading to the supplied fallback map.
 *
 * The fallback is returned when no vocabulary object was injected, or when
 * the vocabulary's map() call yields null.
 *
 * @param string                  $path     vocabulary map path, e.g. 'shop.accessory_focus_variants'
 * @param array<string, string[]> $fallback map to use when the lookup cannot be resolved
 * @return array<string, string[]>
 */
private function vocabularyMap(string $path, array $fallback): array
{
    if ($this->vocabulary === null) {
        return $fallback;
    }

    $resolved = $this->vocabulary->map($path, $fallback);

    return $resolved ?? $fallback;
}
private function stringList(string $path, array $default, bool $allowEmptyStrings = false, ?array $emptySafeDefault = null): array private function stringList(string $path, array $default, bool $allowEmptyStrings = false, ?array $emptySafeDefault = null): array
{ {
$value = $this->value($path, $default); $value = $this->value($path, $default);

View File

@@ -1125,7 +1125,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$candidates = []; $candidates = [];
$seenDocs = []; $seenDocs = [];
foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) { foreach (array_slice($chunkIds, 0, $this->retrieverConfig->focusedProductWindow()) as $rank => $chunkId) {
$row = $rows[$chunkId] ?? null; $row = $rows[$chunkId] ?? null;
if (!is_array($row)) { if (!is_array($row)) {
continue; continue;
@@ -1171,7 +1171,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$bestScore = (float)$best['score']; $bestScore = (float)$best['score'];
$gap = $bestScore - $runnerUpScore; $gap = $bestScore - $runnerUpScore;
if ($bestScore < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_SCORE || $gap < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_GAP) { if ($bestScore < $this->retrieverConfig->focusedProductMinScore() || $gap < $this->retrieverConfig->focusedProductMinGap()) {
return null; return null;
} }
@@ -1199,10 +1199,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$normalized = $this->normalizeText($prompt); $normalized = $this->normalizeText($prompt);
$tokens = $this->tokenizeText($normalized); $tokens = $this->tokenizeText($normalized);
$reagentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_WORDS; $reagentWords = $this->retrieverConfig->looksLikeReagentWords();
$documentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DOCUMENT_WORDS; $documentWords = $this->retrieverConfig->looksLikeDocumentWords();
$safetyWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_WORDS; $safetyWords = $this->retrieverConfig->looksLikeSafetyWords();
$deviceWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DEVICE_WORDS; $deviceWords = $this->retrieverConfig->looksLikeDeviceWords();
$asksReagent = $this->containsAnyToken($tokens, $reagentWords); $asksReagent = $this->containsAnyToken($tokens, $reagentWords);
$asksDocument = $this->containsAnyToken($tokens, $documentWords); $asksDocument = $this->containsAnyToken($tokens, $documentWords);
@@ -1343,7 +1343,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$documentId, $documentId,
$chunkIds, $chunkIds,
$rows, $rows,
min($limit, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MAX_CHUNKS) min($limit, $this->retrieverConfig->focusedProductMaxChunks())
); );
} }
@@ -1358,7 +1358,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
{ {
$docWindow = []; $docWindow = [];
foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::DOMINANT_DOC_WINDOW) as $chunkId) { foreach (array_slice($chunkIds, 0, $this->retrieverConfig->dominantDocWindow()) as $chunkId) {
if (!isset($rows[$chunkId]['text'])) { if (!isset($rows[$chunkId]['text'])) {
continue; continue;
} }
@@ -1388,7 +1388,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$dominantCount = (int)($counts[$dominantDocId] ?? 0); $dominantCount = (int)($counts[$dominantDocId] ?? 0);
if ($dominantCount >= NdjsonHybridRetrieverConfig::DOMINANT_DOC_MIN_HITS) { if ($dominantCount >= $this->retrieverConfig->dominantDocMinHits()) {
return $dominantDocId; return $dominantDocId;
} }
@@ -1450,7 +1450,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return []; return [];
} }
$maxFromDoc = min($limit, NdjsonHybridRetrieverConfig::DOMINANT_DOC_MAX_CHUNKS); $maxFromDoc = min($limit, $this->retrieverConfig->dominantDocMaxChunks());
if ($anchorChunkIndex !== null) { if ($anchorChunkIndex !== null) {
usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int { usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int {
@@ -1550,13 +1550,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
continue; continue;
} }
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) { if (($docCounter[$docId] ?? 0) >= $this->retrieverConfig->maxChunksPerDoc()) {
continue; continue;
} }
if (is_int($chunkIndex)) { if (is_int($chunkIndex)) {
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) { foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) { if (abs($prevIdx - $chunkIndex) < $this->retrieverConfig->minChunkDistance()) {
continue 2; continue 2;
} }
} }
@@ -1609,13 +1609,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
continue; continue;
} }
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) { if (($docCounter[$docId] ?? 0) >= $this->retrieverConfig->maxChunksPerDoc()) {
continue; continue;
} }
if (is_int($chunkIndex)) { if (is_int($chunkIndex)) {
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) { foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) { if (abs($prevIdx - $chunkIndex) < $this->retrieverConfig->minChunkDistance()) {
continue 2; continue 2;
} }
} }
@@ -1715,7 +1715,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
*/ */
private function isGenericProductToken(string $token): bool private function isGenericProductToken(string $token): bool
{ {
static $generic = NdjsonHybridRetrieverConfig::GENERIC_PRODUCT_TOKEN; $generic = $this->retrieverConfig->genericProductTokens();
return isset(array_fill_keys($generic, true)[$token]); return isset(array_fill_keys($generic, true)[$token]);
} }
@@ -1724,7 +1725,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
*/ */
private function isImportantShortModelToken(string $token): bool private function isImportantShortModelToken(string $token): bool
{ {
static $allowed = NdjsonHybridRetrieverConfig::IMPORTANT_SHORT_MODEL_TOKEN; $allowed = $this->retrieverConfig->importantShortModelTokens();
return in_array($token, $allowed, true); return in_array($token, $allowed, true);
} }
@@ -1734,7 +1735,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
*/ */
private function isFamilyDescriptorToken(string $token): bool private function isFamilyDescriptorToken(string $token): bool
{ {
static $familyDescriptors = NdjsonHybridRetrieverConfig::FAMILY_DESCRIPTOR_TOKEN; $familyDescriptors = $this->retrieverConfig->familyDescriptorTokens();
return in_array($token, $familyDescriptors, true) return in_array($token, $familyDescriptors, true)
|| $this->isImportantShortModelToken($token) || $this->isImportantShortModelToken($token)
@@ -1752,7 +1753,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return false; return false;
} }
$needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_TOKENS; $needles = $this->retrieverConfig->looksLikeReagentTokens();
foreach ($needles as $needle) { foreach ($needles as $needle) {
if (str_contains($haystack, $needle)) { if (str_contains($haystack, $needle)) {
@@ -1774,7 +1775,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return false; return false;
} }
$needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_DOCS; $needles = $this->retrieverConfig->looksLikeSafetyDocs();
foreach ($needles as $needle) { foreach ($needles as $needle) {
if (str_contains($haystack, $needle)) { if (str_contains($haystack, $needle)) {