central config part 1

This commit is contained in:
team2
2026-04-25 23:39:41 +02:00
parent 2797834a5f
commit f42022e5f7
11 changed files with 1197 additions and 476 deletions

View File

@@ -0,0 +1,29 @@
# RetrieX Vocabulary Centralization Fix
This patch centralizes the growing recognition word lists without changing their tuned content.
## Main changes
- Added `config/retriex/vocabulary.yaml`.
- Added `App\Config\DomainVocabularyConfig`.
- Wired the vocabulary facade into:
- `ShopServiceConfig`
- `NdjsonHybridRetrieverConfig`
- `PromptBuilderConfig`
- `CommerceQueryParserConfig`
- Moved the active Shop and Retrieval vocabulary defaults out of `commerce.yaml` and `retrieval.yaml` into `vocabulary.yaml`.
- Kept all old per-service config keys as explicit overrides.
- Removed direct `NdjsonHybridRetrieverConfig::...` constant usage inside `NdjsonHybridRetriever` so effective config getters are used consistently.
## Stability note
The vocabulary views preserve the previous order and content of the tuned lists.
No new semantic terms were added to the critical retrieval and shop matching views.
Required regression baseline:
- `Was ist der niedrigste Grenzwert für die Wasserhärte, welcher mit einem Testomaten überwacht werden kann?`
- expected: `0,02 °dH (Testomat 808)`
- `mit welchem indikator wird der wert gemessen`
- expected: `Indikatortyp 300`
- A store query containing `0,02` must keep the decimal value intact and must not reduce it to `02`.

View File

@@ -16,166 +16,8 @@ parameters:
retriex.shop_matching.config:
top_product_log_limit: 3
device_query_keywords:
- analysegerät
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- gerät
- geraet
- geräte
- geraete
- monitor
- monitore
- controller
- gerät für
- geraet fuer
- geräte für
- geraete fuer
- system
- systeme
- anlage
- anlagen
accessory_query_keywords:
- zubehör
- zubehor
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- ersatz
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- consumable
- dazu
- passend
- passende
- passendes
- nachfüll
- nachfuell
- refill
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
accessory_product_keywords:
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- verbrauchsmaterial
- consumable
- zubehör
- zubehor
- ersatz
- ersatzteil
- ersatzteile
- nachfüll
- nachfuell
- refill
- lösung
- loesung
- solution
- teststreifen
- test strip
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
device_product_keywords:
- analysegerät
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- monitor
- monitore
- controller
- online-analysator
- online analysator
- online-analysegerät
- online analysegeraet
- online-analysegeräte
- online analysegeraete
- online analyzer
- online monitor
- system
- systeme
- anlage
- anlagen
- gerät
- geraet
- geräte
- geraete
device_focus_keywords:
- geräte
- geraete
- gerät
- geraet
- analysegerät
- analysegeraet
- messgerät
- messgeraet
- analysator
- controller
- monitor
accessory_focus_keywords:
- indikator
- indikatoren
- reagenz
- reagenzien
- zubehör
- zubehor
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- service set
- serviceset
- filter
- pumpenkopf
- motorblock
accessory_focus_variant_map:
indikator: [indikator, indikatoren]
indikatoren: [indikator, indikatoren]
reagenz: [reagenz, reagenzien]
reagenzien: [reagenz, reagenzien]
ersatzteil: [ersatzteil, ersatzteile]
ersatzteile: [ersatzteil, ersatzteile]
service set: [service set, serviceset, service-set]
serviceset: [service set, serviceset, service-set]
service-set: [service set, serviceset, service-set]
# Vocabulary-backed lists live in config/retriex/vocabulary.yaml.
# The old per-key entries may still be added here to override a specific view.
scores:
exact_product_number_phrase: 160

View File

@@ -27,141 +27,8 @@ parameters:
focused_product_min_gap: 4.0
focused_product_max_chunks: 4
generic_product_tokens:
- produkt
- produkte
- produktkarte
- titel
- geraet
- gerät
- messgeraet
- messgerät
- wasser
- haerte
- härte
- resthaerte
- resthärte
- analyse
- analysator
- automat
- online
- messung
- messen
- preis
- preise
- kosten
- info
- infos
- passend
- richtige
- richtiges
- geeignet
- geeignete
- welche
- welcher
- welches
- brauche
- suche
important_short_model_tokens: [th, tc, tp, tm, ph, rx]
family_descriptor_tokens:
- evo
- eco
- self
- clean
- mini
- pro
- plus
- basic
- lab
- inline
- compact
- panel
- sc
looks_like_reagent_tokens:
- indikator
- reagenz
- reagens
- laborchemikalie
- chemikalie
- sicherheitsdatenblatt
- sdb
- msds
- ufi
- gebinde
- flasche
- ersatzteil
- zubehoer
- zubehör
- service set
- filtereinsatz
- kerzenfilter
- druckregler
looks_like_safety_docs:
- sicherheitsdatenblatt
- sdb
- msds
- gefahrenbewertung
- gefahrenpiktogramm
- signalwort
- lagerung
- transport
- clp
- kennzeichnung
- h290
- pbt
- vpvb
looks_like_reagent_words:
- indikator
- reagenz
- reagens
- chemie
- chemikalie
- sdb
- sicherheitsdatenblatt
- msds
- flasche
- gebinde
looks_like_document_words:
- datenblatt
- dokument
- pdf
- handbuch
- manual
- beschreibung
- sdb
- sicherheitsdatenblatt
- msds
looks_like_safety_words:
- gefahr
- gefahrgut
- clp
- h290
- sicherheit
- kennzeichnung
- transport
- lagerung
- piktogramm
looks_like_device_words:
- geraet
- gerät
- messgeraet
- messgerät
- analysator
- automat
- messung
- messen
- ueberwachung
- überwachung
- online
- monitor
# Vocabulary-backed retrieval token lists live in config/retriex/vocabulary.yaml.
# The old per-key entries may still be added here to override a specific view.
# Backwards-compatible name for existing config diagnostics.
retriex.retrieval.inventory: '%retriex.retrieval.config%'

View File

@@ -0,0 +1,597 @@
# Central domain vocabulary for RetrieX.
# Views preserve the previous 1.4.2-tuned ordering exactly; per-service configs may still override them.
parameters:
retriex.commerce_query.config: {}
retriex.vocabulary.config:
classes:
device:
- analysegerät
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- gerät
- geraet
- geräte
- geraete
- monitor
- monitore
- controller
- system
- systeme
- anlage
- anlagen
accessory:
- zubehör
- zubehor
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- ersatz
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- consumable
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
views:
shop:
device_query:
add:
- analysegerät
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- gerät
- geraet
- geräte
- geraete
- monitor
- monitore
- controller
- gerät für
- geraet fuer
- geräte für
- geraete fuer
- system
- systeme
- anlage
- anlagen
accessory_query:
add:
- zubehör
- zubehor
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- ersatz
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- consumable
- dazu
- passend
- passende
- passendes
- nachfüll
- nachfuell
- refill
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
accessory_product:
add:
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- verbrauchsmaterial
- consumable
- zubehör
- zubehor
- ersatz
- ersatzteil
- ersatzteile
- nachfüll
- nachfuell
- refill
- lösung
- loesung
- solution
- teststreifen
- test strip
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
device_product:
add:
- analysegerät
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- monitor
- monitore
- controller
- online-analysator
- online analysator
- online-analysegerät
- online analysegeraet
- online-analysegeräte
- online analysegeraete
- online analyzer
- online monitor
- system
- systeme
- anlage
- anlagen
- gerät
- geraet
- geräte
- geraete
device_focus:
add:
- geräte
- geraete
- gerät
- geraet
- analysegerät
- analysegeraet
- messgerät
- messgeraet
- analysator
- controller
- monitor
accessory_focus:
add:
- indikator
- indikatoren
- reagenz
- reagenzien
- zubehör
- zubehor
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- service set
- serviceset
- filter
- pumpenkopf
- motorblock
commerce_query:
known_brands:
add:
- heyl
- horiba
- neomeris
phrases_to_remove:
add:
- ich suche
- suche
- habt ihr
- gibt es
- gebe mir
- gib mir
- zeige mir
- welches gerät
- welche gerät
- welches modell
- welches ist besser
- welches ist am besten
- alternative
- alternativen
- unter anderem
- u a
- welche
- welcher
- welches
- welchen
- sind
- ist
- geeignet
- geeigent
- verfügbarkeit
- verfuegbarkeit
filter_search_tokens:
add:
- auch
- noch
- nochmal
- zusätzlich
- dazu
- davon
- stattdessen
- bitte
- gern
- gerne
- zeige
- zeig
- such
- suche
- finde
- find
- mir
- mal
- von
- im
- in
- für
- fuer
- welche
- welcher
- welches
- welchen
- sind
- ist
- geeignet
- geeigent
- verfügbarkeit
- verfuegbarkeit
- prüfe
- pruefe
- den
- die
- das
- der
- dem
- des
- und
- oder
- sowie
- seine
- seinen
- seiner
- seinem
- seines
- siene
- sienen
- siener
- sienem
- sienes
- gebe
- gib
- nenne
- nenn
- preis
- preise
- preisen
- kostet
- kosten
- ua
- also
- gut
- gute
- guten
- guter
- gutes
- passen
- passend
semantic_shop_search_tokens:
add:
- indikator
- indicator
- reagenz
- reagent
- zubehör
- zubehor
- ersatzteil
- verbrauchsmaterial
- chemie
- indikatorchemie
- reagenzchemie
- kit
- set
- filter
- pumpe
- pumpenkopf
- motorblock
- lösung
- loesung
- solution
- teststreifen
- gerät
- geraet
- messgerät
- messgeraet
- analysegerät
- analysegeraet
- analysator
- monitor
- controller
- system
retrieval:
generic_product_tokens:
add:
- produkt
- produkte
- produktkarte
- titel
- geraet
- gerät
- messgeraet
- messgerät
- wasser
- haerte
- härte
- resthaerte
- resthärte
- analyse
- analysator
- automat
- online
- messung
- messen
- preis
- preise
- kosten
- info
- infos
- passend
- richtige
- richtiges
- geeignet
- geeignete
- welche
- welcher
- welches
- brauche
- suche
important_short_model_tokens:
add:
- th
- tc
- tp
- tm
- ph
- rx
family_descriptor_tokens:
add:
- evo
- eco
- self
- clean
- mini
- pro
- plus
- basic
- lab
- inline
- compact
- panel
- sc
looks_like_reagent_tokens:
add:
- indikator
- reagenz
- reagens
- laborchemikalie
- chemikalie
- sicherheitsdatenblatt
- sdb
- msds
- ufi
- gebinde
- flasche
- ersatzteil
- zubehoer
- zubehör
- service set
- filtereinsatz
- kerzenfilter
- druckregler
looks_like_safety_docs:
add:
- sicherheitsdatenblatt
- sdb
- msds
- gefahrenbewertung
- gefahrenpiktogramm
- signalwort
- lagerung
- transport
- clp
- kennzeichnung
- h290
- pbt
- vpvb
looks_like_reagent_words:
add:
- indikator
- reagenz
- reagens
- chemie
- chemikalie
- sdb
- sicherheitsdatenblatt
- msds
- flasche
- gebinde
looks_like_document_words:
add:
- datenblatt
- dokument
- pdf
- handbuch
- manual
- beschreibung
- sdb
- sicherheitsdatenblatt
- msds
looks_like_safety_words:
add:
- gefahr
- gefahrgut
- clp
- h290
- sicherheit
- kennzeichnung
- transport
- lagerung
- piktogramm
looks_like_device_words:
add:
- geraet
- gerät
- messgeraet
- messgerät
- analysator
- automat
- messung
- messen
- ueberwachung
- überwachung
- online
- monitor
prompt:
technical_product_keywords:
add:
- technisch
- technical
- produkt
- product
- gerät
- device
- modell
- model
- messprinzip
- measurement principle
- schnittstelle
- interface
- relais
- relay
- indikator
- indicator
- grenzwert
- threshold
- messbereich
- measurement range
- minimaler
- minimum
- resthärte
- resthaerte
- °dh
- dh
- spannung
- voltage
- strom
- current
- druck
- pressure
- temperatur
- temperature
- schutzart
- ip
- fehlercode
- error code
- wasserhärte
- hardness
- testomat
- chlor
- chlormessung
accessory_request_keywords:
add:
- passend
- passende
- passendes
- zubehör
- zubehor
- dazu
- indikator
- reagenz
- kit
- set
- zusatz
- ergänzung
- ergaenzung
maps:
shop:
accessory_focus_variants:
indikator:
- indikator
- indikatoren
indikatoren:
- indikator
- indikatoren
reagenz:
- reagenz
- reagenzien
reagenzien:
- reagenz
- reagenzien
ersatzteil:
- ersatzteil
- ersatzteile
ersatzteile:
- ersatzteil
- ersatzteile
service set:
- service set
- serviceset
- service-set
serviceset:
- service set
- serviceset
- service-set
service-set:
- service set
- serviceset
- service-set
commerce_query:
search_token_corrections:
siene: seine
sienen: seinen
siener: seiner
sienem: seinem
sienes: seines
indicatoren: indikatoren
search_token_canonical:
indikatoren: indikator
indicators: indikator
indicator: indikator
reagenzien: reagenz
reagents: reagenz
reagent: reagenz
produkte: produkt

View File

@@ -9,6 +9,7 @@ imports:
- { resource: 'retriex/retrieval.yaml' }
- { resource: 'retriex/language.yaml' }
- { resource: 'retriex/query_enrichment.yaml' }
- { resource: 'retriex/vocabulary.yaml' }
# ------------------------------------------------------------
# Parameters
@@ -112,9 +113,14 @@ services:
$retrievalMaxChunks: '%retriex.model.default_retrieval_max_chunks%'
$retrievalVectorTopK: '%retriex.model.default_retrieval_vector_top_k%'
App\Config\DomainVocabularyConfig:
arguments:
$config: '%retriex.vocabulary.config%'
App\Config\PromptBuilderConfig:
arguments:
$config: '%retriex.prompt.config%'
$vocabulary: '@App\Config\DomainVocabularyConfig'
App\Config\AgentRunnerConfig:
arguments:
@@ -123,7 +129,7 @@ services:
App\Config\NdjsonHybridRetrieverConfig:
arguments:
$config: '%retriex.retrieval.config%'
$vocabulary: '%retriex.retrieval.config%'
$vocabulary: '@App\Config\DomainVocabularyConfig'
App\Config\StopWordsConfig:
arguments:
@@ -136,6 +142,7 @@ services:
App\Config\ShopServiceConfig:
arguments:
$config: '%retriex.shop_matching.config%'
$vocabulary: '@App\Config\DomainVocabularyConfig'
App\Infrastructure\OllamaClient:
arguments:
@@ -176,6 +183,11 @@ services:
App\Intent\CommerceIntentLite: ~
App\Config\CommerceQueryParserConfig:
arguments:
$config: '%retriex.commerce_query.config%'
$vocabulary: '@App\Config\DomainVocabularyConfig'
App\Commerce\CommerceQueryParser: ~
App\Config\SearchRepairConfig:

View File

@@ -6,24 +6,13 @@ namespace App\Config;
final class CommerceQueryParserConfig
{
/**
* @return string[]
*/
public function getKnownBrands(): array
{
return [
private const KNOWN_BRANDS = [
'heyl',
'horiba',
'neomeris',
];
}
/**
* @return string[]
*/
public function getPhrasesToRemove(): array
{
return [
private const PHRASES_TO_REMOVE = [
'ich suche',
'suche',
'habt ihr',
@@ -51,24 +40,8 @@ final class CommerceQueryParserConfig
'verfügbarkeit',
'verfuegbarkeit',
];
}
public function getHistoryContextPattern(): string
{
return 'chat|auch|noch|nochmal|zusätzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt';
}
public function getHistoryContextValuePattern(): string
{
return '/\b(' . $this->getHistoryContextPattern() . ')\b/u';
}
/**
* @return string[]
*/
public function getFilterSearchTokens(): array
{
return [
private const FILTER_SEARCH_TOKENS = [
'auch',
'noch',
'nochmal',
@@ -142,14 +115,8 @@ final class CommerceQueryParserConfig
'passen',
'passend',
];
}
/**
* @return array<string, string>
*/
public function getSearchTokenCorrections(): array
{
return [
private const SEARCH_TOKEN_CORRECTIONS = [
'siene' => 'seine',
'sienen' => 'seinen',
'siener' => 'seiner',
@@ -157,14 +124,8 @@ final class CommerceQueryParserConfig
'sienes' => 'seines',
'indicatoren' => 'indikatoren',
];
}
/**
* @return array<string, string>
*/
public function getSearchTokenCanonicalMap(): array
{
return [
private const SEARCH_TOKEN_CANONICAL_MAP = [
'indikatoren' => 'indikator',
'indicators' => 'indikator',
'indicator' => 'indikator',
@@ -173,6 +134,113 @@ final class CommerceQueryParserConfig
'reagent' => 'reagenz',
'produkte' => 'produkt',
];
private const SEMANTIC_SHOP_SEARCH_TOKENS = [
'indikator',
'indicator',
'reagenz',
'reagent',
'zubehör',
'zubehor',
'ersatzteil',
'verbrauchsmaterial',
'chemie',
'indikatorchemie',
'reagenzchemie',
'kit',
'set',
'filter',
'pumpe',
'pumpenkopf',
'motorblock',
'lösung',
'loesung',
'solution',
'teststreifen',
'gerät',
'geraet',
'messgerät',
'messgeraet',
'analysegerät',
'analysegeraet',
'analysator',
'monitor',
'controller',
'system',
];
/**
* Stores the per-service override config and the optional central vocabulary.
*
* @param array<string, mixed> $config Per-service values; a key present here is read before the vocabulary-backed default.
* @param DomainVocabularyConfig|null $vocabulary Central vocabulary facade; when null, the class constants serve as defaults.
*/
public function __construct(
private readonly array $config = [],
private readonly ?DomainVocabularyConfig $vocabulary = null,
) {
}
/**
 * Effective known-brand list.
 *
 * The `known_brands` key of this service's config is used when it holds a
 * non-empty list; otherwise the `commerce_query.known_brands` vocabulary view
 * applies, which itself falls back to the built-in constant.
 *
 * @return string[]
 */
public function getKnownBrands(): array
{
    $vocabularyDefault = $this->vocabularyView('commerce_query.known_brands', self::KNOWN_BRANDS);

    return $this->stringList('known_brands', $vocabularyDefault);
}
/**
 * Effective list of phrases to remove.
 *
 * The `phrases_to_remove` key of this service's config is used when it holds a
 * non-empty list; otherwise the `commerce_query.phrases_to_remove` vocabulary
 * view applies, which itself falls back to the built-in constant.
 *
 * @return string[]
 */
public function getPhrasesToRemove(): array
{
    $vocabularyDefault = $this->vocabularyView('commerce_query.phrases_to_remove', self::PHRASES_TO_REMOVE);

    return $this->stringList('phrases_to_remove', $vocabularyDefault);
}
/**
 * Returns a regex alternation fragment without delimiters, anchors or flags.
 *
 * getHistoryContextValuePattern() wraps this fragment into a complete pattern.
 * NOTE(review): the alternatives look like cue words for a query that refers
 * back to earlier chat context — confirm against the parser that consumes it.
 */
public function getHistoryContextPattern(): string
{
return 'chat|auch|noch|nochmal|zusätzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt';
}
/**
 * Builds the complete regex around the history-context alternation: one
 * capture group between word boundaries, compiled with the `u` flag.
 */
public function getHistoryContextValuePattern(): string
{
    $alternation = $this->getHistoryContextPattern();

    return '/\b(' . $alternation . ')\b/u';
}
/**
 * Effective filter-search token list.
 *
 * The `filter_search_tokens` key of this service's config is used when it holds
 * a non-empty list; otherwise the `commerce_query.filter_search_tokens`
 * vocabulary view applies, which itself falls back to the built-in constant.
 *
 * @return string[]
 */
public function getFilterSearchTokens(): array
{
    $vocabularyDefault = $this->vocabularyView('commerce_query.filter_search_tokens', self::FILTER_SEARCH_TOKENS);

    return $this->stringList('filter_search_tokens', $vocabularyDefault);
}
/**
 * Effective token-correction map (token => corrected token).
 *
 * The `search_token_corrections` key of this service's config is used when it
 * holds a non-empty map; otherwise the `commerce_query.search_token_corrections`
 * vocabulary map applies, which itself falls back to the built-in constant.
 *
 * @return array<string, string>
 */
public function getSearchTokenCorrections(): array
{
    $vocabularyDefault = $this->vocabularyStringMap(
        'commerce_query.search_token_corrections',
        self::SEARCH_TOKEN_CORRECTIONS
    );

    return $this->stringMap('search_token_corrections', $vocabularyDefault);
}
/**
 * Effective canonicalization map (token => canonical token).
 *
 * The `search_token_canonical_map` key of this service's config is used when it
 * holds a non-empty map; otherwise the `commerce_query.search_token_canonical`
 * vocabulary map applies, which itself falls back to the built-in constant.
 * Note that the config key and the vocabulary path are named differently.
 *
 * @return array<string, string>
 */
public function getSearchTokenCanonicalMap(): array
{
    $vocabularyDefault = $this->vocabularyStringMap(
        'commerce_query.search_token_canonical',
        self::SEARCH_TOKEN_CANONICAL_MAP
    );

    return $this->stringMap('search_token_canonical_map', $vocabularyDefault);
}
/**
@@ -335,39 +403,86 @@ final class CommerceQueryParserConfig
*/
public function getSemanticShopSearchTokens(): array
{
return [
'indikator',
'indicator',
'reagenz',
'reagent',
'zubehör',
'zubehor',
'ersatzteil',
'verbrauchsmaterial',
'chemie',
'indikatorchemie',
'reagenzchemie',
'kit',
'set',
'filter',
'pumpe',
'pumpenkopf',
'motorblock',
'lösung',
'loesung',
'solution',
'teststreifen',
'gerät',
'geraet',
'messgerät',
'messgeraet',
'analysegerät',
'analysegeraet',
'analysator',
'monitor',
'controller',
'system',
];
return $this->stringList(
'semantic_shop_search_tokens',
$this->vocabularyView('commerce_query.semantic_shop_search_tokens', self::SEMANTIC_SHOP_SEARCH_TOKENS)
);
}
/**
 * Resolves a central vocabulary view, or hands back $fallback unchanged when
 * no vocabulary facade was injected.
 *
 * @return string[]
 */
private function vocabularyView(string $path, array $fallback): array
{
    $vocabulary = $this->vocabulary;
    if ($vocabulary === null) {
        return $fallback;
    }

    return $vocabulary->view($path, $fallback);
}
/**
 * Resolves a central vocabulary string map, or hands back $fallback unchanged
 * when no vocabulary facade was injected.
 *
 * @return array<string, string>
 */
private function vocabularyStringMap(string $path, array $fallback): array
{
    $vocabulary = $this->vocabulary;
    if ($vocabulary === null) {
        return $fallback;
    }

    return $vocabulary->stringMap($path, $fallback);
}
/**
 * Reads a list from the service config and normalizes it.
 *
 * Non-scalar entries are ignored, scalars are cast to string and trimmed, and
 * empty strings and repeats are dropped with first-occurrence order kept.
 * $default is returned as-is when the configured value is absent, is not an
 * array, or yields no usable entry.
 *
 * @return string[]
 */
private function stringList(string $path, array $default): array
{
    $configured = $this->value($path, $default);
    if (!is_array($configured)) {
        return $default;
    }

    $terms = [];
    foreach ($configured as $candidate) {
        $term = is_scalar($candidate) ? trim((string) $candidate) : '';
        if ($term !== '' && !in_array($term, $terms, true)) {
            $terms[] = $term;
        }
    }

    if ($terms === []) {
        return $default;
    }

    return $terms;
}
/**
 * Reads a string-to-string map from the service config and normalizes it.
 *
 * Pairs with a non-scalar key or value are skipped; keys and values are cast
 * to string and trimmed, and a pair is kept only when both parts are
 * non-empty. $default is returned as-is when the configured value is absent,
 * is not an array, or yields no usable pair.
 *
 * @return array<string, string>
 */
private function stringMap(string $path, array $default): array
{
    $configured = $this->value($path, $default);
    if (!is_array($configured)) {
        return $default;
    }

    $pairs = [];
    foreach ($configured as $rawKey => $rawValue) {
        if (is_scalar($rawKey) && is_scalar($rawValue)) {
            $name = trim((string) $rawKey);
            $target = trim((string) $rawValue);
            if ($name === '' || $target === '') {
                continue;
            }
            $pairs[$name] = $target;
        }
    }

    if ($pairs === []) {
        return $default;
    }

    return $pairs;
}
/**
 * Walks the service config along a dot-separated path.
 *
 * Returns $default as soon as a segment cannot be resolved, i.e. the current
 * node is not an array or does not contain the segment as a key. A key that is
 * present with a null value counts as resolved.
 */
private function value(string $path, mixed $default): mixed
{
    $node = $this->config;
    foreach (explode('.', $path) as $key) {
        if (is_array($node) && array_key_exists($key, $node)) {
            $node = $node[$key];
            continue;
        }

        return $default;
    }

    return $node;
}
public function buildExactTokenRemovalPattern(string $token): string

View File

@@ -0,0 +1,196 @@
<?php
declare(strict_types=1);
namespace App\Config;
/**
 * Read-only facade over the central domain vocabulary configuration.
 *
 * The configuration tree is read under three top-level keys:
 *  - `classes`: named term lists that a view can pull in through `include`;
 *  - `views`:   term lists addressed by a dot-separated path;
 *  - `maps`:    keyed lookups addressed by a dot-separated path.
 *
 * Every accessor casts scalars to string, trims them, and drops empty strings,
 * non-scalars and repeats (first occurrence wins). When the configured value
 * is missing or normalizes to nothing, the equally normalized caller-supplied
 * fallback is returned instead.
 */
final class DomainVocabularyConfig
{
    /**
     * @param array<string, mixed> $config Vocabulary tree, read under `classes`, `views` and `maps`.
     */
    public function __construct(private readonly array $config = [])
    {
    }

    /**
     * Resolves the view stored at `views.<path>` into an ordered term list.
     *
     * A view definition may be written in one of two shapes:
     *  - a mapping with optional `include` (class names whose terms come first,
     *    in the given order) and `add` (terms appended afterwards) keys;
     *  - a plain list of terms, used as-is. This is the shape the per-service
     *    override keys use, so a list moved into the vocabulary keeps working
     *    instead of being silently replaced by the fallback.
     *
     * @param string   $path     Dot-separated path below `views`.
     * @param string[] $fallback Terms used when the view is missing or resolves to no terms.
     *
     * @return string[]
     */
    public function view(string $path, array $fallback = []): array
    {
        $definition = $this->value('views.' . $path, null);
        if (!is_array($definition)) {
            return $this->uniqueStringList($fallback);
        }

        if (array_is_list($definition)) {
            // Shorthand shape: the definition itself is the term list.
            // An empty list ends up empty here and therefore yields the fallback.
            $terms = $definition;
        } else {
            $terms = [];
            foreach ($this->stringListFromValue($definition['include'] ?? []) as $className) {
                $terms = [...$terms, ...$this->domainClass($className)];
            }
            $terms = [...$terms, ...$this->stringListFromValue($definition['add'] ?? [])];
        }

        $terms = $this->uniqueStringList($terms);

        return $terms !== [] ? $terms : $this->uniqueStringList($fallback);
    }

    /**
     * Returns the terms of the class stored at `classes.<name>`, or an empty
     * list when that class is missing or is not an array.
     *
     * @return string[]
     */
    public function domainClass(string $name): array
    {
        return $this->stringListFromValue($this->value('classes.' . $name, null));
    }

    /**
     * Resolves `maps.<path>` into a map of key => term list.
     *
     * Entries whose value is not an array, or whose key or list normalizes to
     * nothing, are dropped.
     *
     * @param string                  $path     Dot-separated path below `maps`.
     * @param array<string, string[]> $fallback Used when the map is missing or has no usable entry.
     *
     * @return array<string, string[]>
     */
    public function map(string $path, array $fallback = []): array
    {
        $value = $this->value('maps.' . $path, null);
        $out = is_array($value) ? $this->uniqueStringListMap($value) : [];

        return $out !== [] ? $out : $this->uniqueStringListMap($fallback);
    }

    /**
     * Resolves `maps.<path>` into a flat map of key => string.
     *
     * A value given as an array contributes its first usable term; entries whose
     * key or value normalizes to an empty string are dropped.
     *
     * @param string                $path     Dot-separated path below `maps`.
     * @param array<string, string> $fallback Used when the map is missing or has no usable entry.
     *
     * @return array<string, string>
     */
    public function stringMap(string $path, array $fallback = []): array
    {
        $value = $this->value('maps.' . $path, null);
        if (!is_array($value)) {
            return $this->uniqueStringMap($fallback);
        }

        $out = [];
        foreach ($value as $key => $mappedValue) {
            $cleanKey = trim((string) $key);
            if ($cleanKey === '') {
                continue;
            }

            if (is_array($mappedValue)) {
                // Collapse a list value to its first usable term ('' when there is none).
                $mappedValue = $this->stringListFromValue($mappedValue)[0] ?? '';
            }
            if (!is_scalar($mappedValue)) {
                continue;
            }

            $cleanValue = trim((string) $mappedValue);
            if ($cleanValue !== '') {
                $out[$cleanKey] = $cleanValue;
            }
        }

        return $out !== [] ? $out : $this->uniqueStringMap($fallback);
    }

    /**
     * Returns the raw configuration tree exactly as it was injected.
     *
     * @return array<string, mixed>
     */
    public function toArray(): array
    {
        return $this->config;
    }

    /**
     * Normalizes an arbitrary value into a term list; anything that is not an
     * array becomes an empty list.
     *
     * @return string[]
     */
    private function stringListFromValue(mixed $value): array
    {
        return is_array($value) ? $this->uniqueStringList($value) : [];
    }

    /**
     * Casts scalars to string, trims them and keeps the first occurrence of
     * every non-empty term, preserving input order.
     *
     * @return string[]
     */
    private function uniqueStringList(array $items): array
    {
        $out = [];
        foreach ($items as $item) {
            if (!is_scalar($item)) {
                continue;
            }

            $item = trim((string) $item);
            if ($item === '' || in_array($item, $out, true)) {
                continue;
            }

            $out[] = $item;
        }

        return $out;
    }

    /**
     * Normalizes a key => list map, dropping entries with an empty key or an
     * empty (or non-array) list.
     *
     * @return array<string, string[]>
     */
    private function uniqueStringListMap(array $map): array
    {
        $out = [];
        foreach ($map as $key => $items) {
            $cleanKey = trim((string) $key);
            $cleanItems = $this->stringListFromValue($items);
            if ($cleanKey !== '' && $cleanItems !== []) {
                $out[$cleanKey] = $cleanItems;
            }
        }

        return $out;
    }

    /**
     * Normalizes a key => scalar map, dropping entries with a non-scalar value
     * or an empty key or value.
     *
     * @return array<string, string>
     */
    private function uniqueStringMap(array $map): array
    {
        $out = [];
        foreach ($map as $key => $value) {
            if (!is_scalar($value)) {
                continue;
            }

            $cleanKey = trim((string) $key);
            $cleanValue = trim((string) $value);
            if ($cleanKey !== '' && $cleanValue !== '') {
                $out[$cleanKey] = $cleanValue;
            }
        }

        return $out;
    }

    /**
     * Walks the configuration along a dot-separated path and returns $fallback
     * as soon as a segment cannot be resolved.
     */
    private function value(string $path, mixed $fallback): mixed
    {
        $current = $this->config;
        foreach (explode('.', $path) as $segment) {
            if (!is_array($current) || !array_key_exists($segment, $current)) {
                return $fallback;
            }

            $current = $current[$segment];
        }

        return $current;
    }
}

View File

@@ -176,11 +176,10 @@ final class NdjsonHybridRetrieverConfig
/**
* @param array<string, mixed> $config
* @param array<string, mixed> $vocabulary Kept for backwards-compatible service wiring.
*/
public function __construct(
private array $config = [],
private array $vocabulary = [],
private readonly ?DomainVocabularyConfig $vocabulary = null,
) {
}
@@ -307,55 +306,55 @@ final class NdjsonHybridRetrieverConfig
/** @return string[] */
public function genericProductTokens(): array
{
return $this->stringList('generic_product_tokens', self::GENERIC_PRODUCT_TOKEN);
return $this->stringList('generic_product_tokens', $this->vocabularyView('retrieval.generic_product_tokens', self::GENERIC_PRODUCT_TOKEN));
}
/** @return string[] */
public function importantShortModelTokens(): array
{
return $this->stringList('important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN);
return $this->stringList('important_short_model_tokens', $this->vocabularyView('retrieval.important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN));
}
/** @return string[] */
public function familyDescriptorTokens(): array
{
return $this->stringList('family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN);
return $this->stringList('family_descriptor_tokens', $this->vocabularyView('retrieval.family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN));
}
/** @return string[] */
public function looksLikeReagentTokens(): array
{
return $this->stringList('looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS);
return $this->stringList('looks_like_reagent_tokens', $this->vocabularyView('retrieval.looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS));
}
/** @return string[] */
public function looksLikeSafetyDocs(): array
{
return $this->stringList('looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS);
return $this->stringList('looks_like_safety_docs', $this->vocabularyView('retrieval.looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS));
}
/** @return string[] */
public function looksLikeReagentWords(): array
{
return $this->stringList('looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS);
return $this->stringList('looks_like_reagent_words', $this->vocabularyView('retrieval.looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS));
}
/** @return string[] */
public function looksLikeDocumentWords(): array
{
return $this->stringList('looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS);
return $this->stringList('looks_like_document_words', $this->vocabularyView('retrieval.looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS));
}
/** @return string[] */
public function looksLikeSafetyWords(): array
{
return $this->stringList('looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS);
return $this->stringList('looks_like_safety_words', $this->vocabularyView('retrieval.looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS));
}
/** @return string[] */
public function looksLikeDeviceWords(): array
{
return $this->stringList('looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS);
return $this->stringList('looks_like_device_words', $this->vocabularyView('retrieval.looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS));
}
/**
* Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps.
@@ -459,6 +458,12 @@ final class NdjsonHybridRetrieverConfig
* @param string[] $default
* @return string[]
*/
/**
 * Resolves a central vocabulary view, or hands back $fallback unchanged when
 * no vocabulary facade was injected.
 *
 * @return string[]
 */
private function vocabularyView(string $path, array $fallback): array
{
    $vocabulary = $this->vocabulary;
    if ($vocabulary === null) {
        return $fallback;
    }

    return $vocabulary->view($path, $fallback);
}
private function stringList(string $key, array $default): array
{
$value = $this->raw($key, $default);
@@ -492,10 +497,6 @@ final class NdjsonHybridRetrieverConfig
return $this->config[$key];
}
if (array_key_exists($key, $this->vocabulary)) {
return $this->vocabulary[$key];
}
return $default;
}
}

View File

@@ -6,11 +6,74 @@ namespace App\Config;
final class PromptBuilderConfig
{
private const TECHNICAL_PRODUCT_KEYWORDS = [
'technisch',
'technical',
'produkt',
'product',
'gerät',
'device',
'modell',
'model',
'messprinzip',
'measurement principle',
'schnittstelle',
'interface',
'relais',
'relay',
'indikator',
'indicator',
'grenzwert',
'threshold',
'messbereich',
'measurement range',
'minimaler',
'minimum',
'resthärte',
'resthaerte',
'°dh',
'dh',
'spannung',
'voltage',
'strom',
'current',
'druck',
'pressure',
'temperatur',
'temperature',
'schutzart',
'ip',
'fehlercode',
'error code',
'wasserhärte',
'hardness',
'testomat',
'chlor',
'chlormessung',
];
private const ACCESSORY_REQUEST_KEYWORDS = [
'passend',
'passende',
'passendes',
'zubehör',
'zubehor',
'dazu',
'indikator',
'reagenz',
'kit',
'set',
'zusatz',
'ergänzung',
'ergaenzung',
];
/**
* Stores the prompt-builder config and the optional central vocabulary.
*
* @param array<string, mixed> $config Per-service values; a key present here is read before the vocabulary-backed default.
* @param DomainVocabularyConfig|null $vocabulary Central vocabulary facade; when null, the class constants serve as defaults.
*/
public function __construct(
private readonly array $config = [],
private readonly ?DomainVocabularyConfig $vocabulary = null,
) {
}
@@ -88,6 +151,42 @@ final class PromptBuilderConfig
return is_numeric($value) ? (float) $value : $default;
}
/**
 * Reads a list from the prompt config and normalizes it.
 *
 * Non-scalar entries are ignored, scalars are cast to string and trimmed, and
 * empty strings and repeats are dropped with first-occurrence order kept.
 * $default is returned as-is when the configured value is not an array or
 * yields no usable entry.
 *
 * @return string[]
 */
private function getStringList(string $path, array $default): array
{
    $configured = $this->getValue($path, $default);
    if (!is_array($configured)) {
        return $default;
    }

    $keywords = [];
    foreach ($configured as $candidate) {
        $keyword = is_scalar($candidate) ? trim((string) $candidate) : '';
        if ($keyword !== '' && !in_array($keyword, $keywords, true)) {
            $keywords[] = $keyword;
        }
    }

    if ($keywords === []) {
        return $default;
    }

    return $keywords;
}
/**
 * Resolves a central vocabulary view, or hands back $fallback unchanged when
 * no vocabulary facade was injected.
 *
 * @return string[]
 */
private function vocabularyView(string $path, array $fallback): array
{
    $vocabulary = $this->vocabulary;
    if ($vocabulary === null) {
        return $fallback;
    }

    return $vocabulary->view($path, $fallback);
}
private function getValue(string $path, mixed $default): mixed
{
$current = $this->config;
@@ -445,51 +544,10 @@ final class PromptBuilderConfig
*/
public function getTechnicalProductKeywords(): array
{
return [
'technisch',
'technical',
'produkt',
'product',
'gerät',
'device',
'modell',
'model',
'messprinzip',
'measurement principle',
'schnittstelle',
'interface',
'relais',
'relay',
'indikator',
'indicator',
'grenzwert',
'threshold',
'messbereich',
'measurement range',
'minimaler',
'minimum',
'resthärte',
'resthaerte',
'°dh',
'dh',
'spannung',
'voltage',
'strom',
'current',
'druck',
'pressure',
'temperatur',
'temperature',
'schutzart',
'ip',
'fehlercode',
'error code',
'wasserhärte',
'hardness',
'testomat',
'chlor',
'chlormessung',
];
return $this->getStringList(
'technical_product_keywords',
$this->vocabularyView('prompt.technical_product_keywords', self::TECHNICAL_PRODUCT_KEYWORDS)
);
}
/**
@@ -497,21 +555,10 @@ final class PromptBuilderConfig
*/
public function getAccessoryRequestKeywords(): array
{
return [
'passend',
'passende',
'passendes',
'zubehör',
'zubehor',
'dazu',
'indikator',
'reagenz',
'kit',
'set',
'zusatz',
'ergänzung',
'ergaenzung',
];
return $this->getStringList(
'accessory_request_keywords',
$this->vocabularyView('prompt.accessory_request_keywords', self::ACCESSORY_REQUEST_KEYWORDS)
);
}
public function getTechnicalProductModelPattern(): string

View File

@@ -68,8 +68,10 @@ final class ShopServiceConfig
/**
* @param array<string, mixed> $config
*/
public function __construct(private array $config = [])
{
public function __construct(
private array $config = [],
private readonly ?DomainVocabularyConfig $vocabulary = null,
) {
}
public function getTopProductLogLimit(): int
@@ -80,43 +82,43 @@ final class ShopServiceConfig
/** @return string[] */
public function getDeviceFocusKeywords(): array
{
return $this->stringList('device_focus_keywords', self::DEVICE_FOCUS_KEYWORDS);
return $this->stringList('device_focus_keywords', $this->vocabularyView('shop.device_focus', self::DEVICE_FOCUS_KEYWORDS));
}
/**
* Returns the accessory-focus keyword list.
*
* stringList() is asked for the 'accessory_focus_keywords' config key; its default is the
* 'shop.accessory_focus' vocabulary view, which falls back to
* self::ACCESSORY_FOCUS_KEYWORDS when no vocabulary is injected. (The first return below
* is the pre-change line of this rendered diff; the second is its replacement.)
*
* @return string[]
*/
public function getAccessoryFocusKeywords(): array
{
return $this->stringList('accessory_focus_keywords', self::ACCESSORY_FOCUS_KEYWORDS);
return $this->stringList('accessory_focus_keywords', $this->vocabularyView('shop.accessory_focus', self::ACCESSORY_FOCUS_KEYWORDS));
}
/**
* Returns the accessory-focus variant map (string keys mapped to string lists).
*
* stringListMap() is asked for the 'accessory_focus_variant_map' config key; its default
* is the 'shop.accessory_focus_variants' vocabulary map, which falls back to
* self::ACCESSORY_FOCUS_VARIANT_MAP when no vocabulary is injected. (The first return
* below is the pre-change line of this rendered diff; the second is its replacement.)
*
* @return array<string, string[]>
*/
public function getAccessoryFocusVariantMap(): array
{
return $this->stringListMap('accessory_focus_variant_map', self::ACCESSORY_FOCUS_VARIANT_MAP);
return $this->stringListMap('accessory_focus_variant_map', $this->vocabularyMap('shop.accessory_focus_variants', self::ACCESSORY_FOCUS_VARIANT_MAP));
}
/**
* Returns the device-query keyword list.
*
* stringList() is asked for the 'device_query_keywords' config key; its default is the
* 'shop.device_query' vocabulary view, which falls back to self::DEVICE_QUERY_KEYWORDS
* when no vocabulary is injected. (The first return below is the pre-change line of this
* rendered diff; the second is its replacement.)
*
* @return string[]
*/
public function getDeviceQueryKeywords(): array
{
return $this->stringList('device_query_keywords', self::DEVICE_QUERY_KEYWORDS);
return $this->stringList('device_query_keywords', $this->vocabularyView('shop.device_query', self::DEVICE_QUERY_KEYWORDS));
}
/**
* Returns the accessory-query keyword list.
*
* stringList() is asked for the 'accessory_query_keywords' config key; its default is the
* 'shop.accessory_query' vocabulary view, which falls back to
* self::ACCESSORY_QUERY_KEYWORDS when no vocabulary is injected. (The first return below
* is the pre-change line of this rendered diff; the second is its replacement.)
*
* @return string[]
*/
public function getAccessoryQueryKeywords(): array
{
return $this->stringList('accessory_query_keywords', self::ACCESSORY_QUERY_KEYWORDS);
return $this->stringList('accessory_query_keywords', $this->vocabularyView('shop.accessory_query', self::ACCESSORY_QUERY_KEYWORDS));
}
/**
* Returns the accessory-product keyword list.
*
* stringList() is asked for the 'accessory_product_keywords' config key; its default is
* the 'shop.accessory_product' vocabulary view, which falls back to
* self::ACCESSORY_PRODUCT_KEYWORDS when no vocabulary is injected. (The first return below
* is the pre-change line of this rendered diff; the second is its replacement.)
*
* @return string[]
*/
public function getAccessoryProductKeywords(): array
{
return $this->stringList('accessory_product_keywords', self::ACCESSORY_PRODUCT_KEYWORDS);
return $this->stringList('accessory_product_keywords', $this->vocabularyView('shop.accessory_product', self::ACCESSORY_PRODUCT_KEYWORDS));
}
/**
* Returns the device-product keyword list.
*
* stringList() is asked for the 'device_product_keywords' config key; its default is the
* 'shop.device_product' vocabulary view, which falls back to
* self::DEVICE_PRODUCT_KEYWORDS when no vocabulary is injected. (The first return below
* is the pre-change line of this rendered diff; the second is its replacement.)
*
* @return string[]
*/
public function getDeviceProductKeywords(): array
{
return $this->stringList('device_product_keywords', self::DEVICE_PRODUCT_KEYWORDS);
return $this->stringList('device_product_keywords', $this->vocabularyView('shop.device_product', self::DEVICE_PRODUCT_KEYWORDS));
}
public function getExactProductNumberPhraseScore(): int
@@ -368,6 +370,18 @@ final class ShopServiceConfig
* @param string[]|null $emptySafeDefault
* @return string[]
*/
/**
* Resolves a vocabulary list through the optional central vocabulary facade.
*
* Returns $fallback when no DomainVocabularyConfig was injected or when the
* view() lookup evaluates to null; otherwise returns whatever view() yields.
*
* NOTE(review): the docblock fragment directly above this method
* (`@param string[]|null $emptySafeDefault` / `@return string[]`) matches the signature
* of stringList(), not this helper. These helpers appear to have been inserted between
* that docblock and stringList() — confirm and move them so the docblock is attached to
* its method again.
*
* @param string $path Vocabulary view path, e.g. 'shop.device_focus'.
* @param string[] $fallback List returned when the vocabulary provides nothing.
* @return string[]
*/
private function vocabularyView(string $path, array $fallback): array
{
return $this->vocabulary?->view($path, $fallback) ?? $fallback;
}
/**
* Resolves a keyed vocabulary map through the optional central vocabulary facade.
*
* Returns $fallback when no DomainVocabularyConfig was injected or when the
* map() lookup evaluates to null; otherwise returns whatever map() yields.
*
* @param string $path Vocabulary map path, e.g. 'shop.accessory_focus_variants'.
* @param array<string, string[]> $fallback Map returned when the vocabulary provides nothing.
* @return array<string, string[]>
*/
private function vocabularyMap(string $path, array $fallback): array
{
return $this->vocabulary?->map($path, $fallback) ?? $fallback;
}
private function stringList(string $path, array $default, bool $allowEmptyStrings = false, ?array $emptySafeDefault = null): array
{
$value = $this->value($path, $default);

View File

@@ -1125,7 +1125,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$candidates = [];
$seenDocs = [];
foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) {
foreach (array_slice($chunkIds, 0, $this->retrieverConfig->focusedProductWindow()) as $rank => $chunkId) {
$row = $rows[$chunkId] ?? null;
if (!is_array($row)) {
continue;
@@ -1171,7 +1171,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$bestScore = (float)$best['score'];
$gap = $bestScore - $runnerUpScore;
if ($bestScore < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_SCORE || $gap < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_GAP) {
if ($bestScore < $this->retrieverConfig->focusedProductMinScore() || $gap < $this->retrieverConfig->focusedProductMinGap()) {
return null;
}
@@ -1199,10 +1199,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$normalized = $this->normalizeText($prompt);
$tokens = $this->tokenizeText($normalized);
$reagentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_WORDS;
$documentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DOCUMENT_WORDS;
$safetyWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_WORDS;
$deviceWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DEVICE_WORDS;
$reagentWords = $this->retrieverConfig->looksLikeReagentWords();
$documentWords = $this->retrieverConfig->looksLikeDocumentWords();
$safetyWords = $this->retrieverConfig->looksLikeSafetyWords();
$deviceWords = $this->retrieverConfig->looksLikeDeviceWords();
$asksReagent = $this->containsAnyToken($tokens, $reagentWords);
$asksDocument = $this->containsAnyToken($tokens, $documentWords);
@@ -1343,7 +1343,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$documentId,
$chunkIds,
$rows,
min($limit, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MAX_CHUNKS)
min($limit, $this->retrieverConfig->focusedProductMaxChunks())
);
}
@@ -1358,7 +1358,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
{
$docWindow = [];
foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::DOMINANT_DOC_WINDOW) as $chunkId) {
foreach (array_slice($chunkIds, 0, $this->retrieverConfig->dominantDocWindow()) as $chunkId) {
if (!isset($rows[$chunkId]['text'])) {
continue;
}
@@ -1388,7 +1388,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$dominantCount = (int)($counts[$dominantDocId] ?? 0);
if ($dominantCount >= NdjsonHybridRetrieverConfig::DOMINANT_DOC_MIN_HITS) {
if ($dominantCount >= $this->retrieverConfig->dominantDocMinHits()) {
return $dominantDocId;
}
@@ -1450,7 +1450,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return [];
}
$maxFromDoc = min($limit, NdjsonHybridRetrieverConfig::DOMINANT_DOC_MAX_CHUNKS);
$maxFromDoc = min($limit, $this->retrieverConfig->dominantDocMaxChunks());
if ($anchorChunkIndex !== null) {
usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int {
@@ -1550,13 +1550,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
continue;
}
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
if (($docCounter[$docId] ?? 0) >= $this->retrieverConfig->maxChunksPerDoc()) {
continue;
}
if (is_int($chunkIndex)) {
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
if (abs($prevIdx - $chunkIndex) < $this->retrieverConfig->minChunkDistance()) {
continue 2;
}
}
@@ -1609,13 +1609,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
continue;
}
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
if (($docCounter[$docId] ?? 0) >= $this->retrieverConfig->maxChunksPerDoc()) {
continue;
}
if (is_int($chunkIndex)) {
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
if (abs($prevIdx - $chunkIndex) < $this->retrieverConfig->minChunkDistance()) {
continue 2;
}
}
@@ -1715,7 +1715,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
*/
private function isGenericProductToken(string $token): bool
{
static $generic = NdjsonHybridRetrieverConfig::GENERIC_PRODUCT_TOKEN;
// Replacement line of this rendered diff: the token list is now read from the injected
// retriever config on every call instead of the class constant held in a static local.
// NOTE(review): array_fill_keys() below rebuilds a full lookup map per call just to test
// one key. in_array($token, $generic, true), as isImportantShortModelToken() does, would
// avoid that allocation — confirm the list contains only strings before switching, since
// array keys coerce numeric strings to ints while a strict in_array() does not.
$generic = $this->retrieverConfig->genericProductTokens();
return isset(array_fill_keys($generic, true)[$token]);
}
@@ -1724,7 +1725,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
*/
private function isImportantShortModelToken(string $token): bool
{
static $allowed = NdjsonHybridRetrieverConfig::IMPORTANT_SHORT_MODEL_TOKEN;
// Replacement line of this rendered diff: the allow-list is now read from the injected
// retriever config on every call instead of the class constant held in a static local.
// NOTE(review): if this runs per token in a hot loop, confirm the getter is cheap.
$allowed = $this->retrieverConfig->importantShortModelTokens();
// Strict comparison: only an exact string match counts.
return in_array($token, $allowed, true);
}
@@ -1734,7 +1735,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
*/
private function isFamilyDescriptorToken(string $token): bool
{
static $familyDescriptors = NdjsonHybridRetrieverConfig::FAMILY_DESCRIPTOR_TOKEN;
$familyDescriptors = $this->retrieverConfig->familyDescriptorTokens();
return in_array($token, $familyDescriptors, true)
|| $this->isImportantShortModelToken($token)
@@ -1752,7 +1753,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return false;
}
$needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_TOKENS;
$needles = $this->retrieverConfig->looksLikeReagentTokens();
foreach ($needles as $needle) {
if (str_contains($haystack, $needle)) {
@@ -1774,7 +1775,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return false;
}
$needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_DOCS;
$needles = $this->retrieverConfig->looksLikeSafetyDocs();
foreach ($needles as $needle) {
if (str_contains($haystack, $needle)) {