From c439fb99d6f93c92a0f403f03a0996ae12fa11a5 Mon Sep 17 00:00:00 2001 From: team 1 Date: Fri, 24 Apr 2026 18:54:25 +0200 Subject: [PATCH] fix 3 --- config/retriex/commerce.yaml | 233 +++++++++ config/retriex/language.yaml | 52 ++ config/retriex/query_enrichment.yaml | 16 + config/retriex/retrieval.yaml | 145 +++++- config/services.yaml | 19 + src/Config/NdjsonHybridRetrieverConfig.php | 317 +++++++++++- src/Config/QueryEnricherConfig.php | 61 ++- src/Config/RetriexEffectiveConfigProvider.php | 27 +- src/Config/ShopServiceConfig.php | 470 +++++++++--------- src/Config/StopWordsConfig.php | 79 ++- .../Retrieval/NdjsonHybridRetriever.php | 37 +- src/Knowledge/Retrieval/QueryEnricher.php | 6 +- 12 files changed, 1126 insertions(+), 336 deletions(-) create mode 100644 config/retriex/language.yaml create mode 100644 config/retriex/query_enrichment.yaml diff --git a/config/retriex/commerce.yaml b/config/retriex/commerce.yaml index 02ad973..faaf415 100644 --- a/config/retriex/commerce.yaml +++ b/config/retriex/commerce.yaml @@ -10,3 +10,236 @@ parameters: retriex.commerce.search_repair.enabled: true retriex.commerce.search_repair.max_queries: 3 retriex.commerce.search_repair.min_primary_results_without_repair: 2 + + # Shop matching and presentation configuration. + # Defaults are intentionally identical to the previous PHP values. + retriex.shop_matching.config: + top_product_log_limit: 3 + + device_query_keywords: + - analysegerät + - analysegeraet + - analysegeräte + - analysegeraete + - messgerät + - messgeraet + - messgeräte + - messgeraete + - analysator + - analysatoren + - analyzer + - gerät + - geraet + - geräte + - geraete + - monitor + - monitore + - controller + - gerät für + - geraet fuer + - geräte für + - geraete fuer + - system + - systeme + - anlage + - anlagen + + accessory_query_keywords: + - zubehör + - zubehor + - reagenz + - reagenzien + - reagent + - indikator + - indikatoren + - indicator + - kit + - set + - ersatz + - ersatzteil + - ersatzteile + - verbrauchsmaterial + - consumable + - dazu + - passend + - passende + - passendes + - nachfüll + - nachfuell + - refill + - filter + - pumpenkopf + - motorblock + - service set + - serviceset + - service-set + + accessory_product_keywords: + - reagenz + - reagenzien + - reagent + - indikator + - indikatoren + - indicator + - kit + - set + - verbrauchsmaterial + - consumable + - zubehör + - zubehor + - ersatz + - ersatzteil + - ersatzteile + - nachfüll + - nachfuell + - refill + - lösung + - loesung + - solution + - teststreifen + - test strip + - filter + - pumpenkopf + - motorblock + - service set + - serviceset + - service-set + + device_product_keywords: + - analysegerät + - analysegeraet + - analysegeräte + - analysegeraete + - messgerät + - messgeraet + - messgeräte + - messgeraete + - analysator + - analysatoren + - analyzer + - monitor + - monitore + - controller + - online-analysator + - online analysator + - online-analysegerät + - online analysegeraet + - online-analysegeräte + - online analysegeraete + - online analyzer + - online monitor + - system + - systeme + - anlage + - anlagen + - gerät + - geraet + - geräte + - geraete + + device_focus_keywords: + - geräte + - geraete + - gerät + - geraet + - analysegerät + - analysegeraet + - messgerät + - messgeraet + - analysator + - controller + - monitor + + accessory_focus_keywords: + - indikator + - indikatoren + - reagenz + - reagenzien + - zubehör + - zubehor + - ersatzteil + - ersatzteile + - verbrauchsmaterial + - service set + - serviceset + - filter + - pumpenkopf + - motorblock + + accessory_focus_variant_map: + indikator: [indikator, indikatoren] + indikatoren: [indikator, indikatoren] + reagenz: [reagenz, reagenzien] + reagenzien: [reagenz, reagenzien] + ersatzteil: [ersatzteil, ersatzteile] + ersatzteile: [ersatzteil, ersatzteile] + service set: [service set, serviceset, service-set] + serviceset: [service set, serviceset, service-set] + service-set: [service set, serviceset, service-set] + + scores: + exact_product_number_phrase: 160 + exact_product_name_phrase: 90 + exact_manufacturer_match: 40 + brand_contained_in_name: 20 + name_token_overlap_weight: 6 + product_number_token_overlap_weight: 10 + corpus_token_overlap_weight: 2 + name_number_overlap_weight: 18 + product_number_number_overlap_weight: 28 + corpus_number_overlap_weight: 8 + size_match: 12 + availability_bonus: 1 + device_query_device_product_bonus: 60 + device_query_accessory_penalty: 120 + accessory_query_accessory_product_bonus: 30 + accessory_query_device_product_bonus: 10 + + patterns: + contains_digit: '/\d/u' + matching_cleanup: '/[^\p{L}\p{N}]+/u' + whitespace_collapse: '/\s+/u' + token_split: '/[^\p{L}\p{N}]+/u' + + padding: + prefix: ' ' + suffix: ' ' + + price: + normalization_search: ['€', ' ', '.'] + normalization_replace: ['', '', ''] + decimals: 2 + decimal_separator: ',' + thousands_separator: '.' + suffix: ' €' + + custom_fields: + primary: migration_Backup_product_attr1 + secondary: migration_Backup_product_attr2 + use_cases: migration_Backup_product_attr4 + languages: migration_Backup_product_attr5 + + text: + primary_secondary_separator: ': ' + use_cases_label: 'Einsatzgebiete: ' + languages_label: 'Sprachen: ' + custom_field_join_separator: ' | ' + + description: + empty_line_pattern: '/^[ \t]*\R/m' + whitespace_cleanup_pattern: '/[ \t]{2,}/' + max_length: 1500 + + seo: + relative_prefix: '/' + + highlight: + available_label: Verfügbar + unavailable_label: Nicht verfügbar + product_number_prefix: 'Produktnummer: ' + + image: + missing_placeholder: no-image + + deduplication: + separator: '|' diff --git a/config/retriex/language.yaml b/config/retriex/language.yaml new file mode 100644 index 0000000..42081c3 --- /dev/null +++ b/config/retriex/language.yaml @@ -0,0 +1,52 @@ +# Language-level retrieval configuration. +# Defaults are intentionally identical to the previous PHP list. +parameters: + retriex.stopwords.config: + words: + - mit + - der + - die + - das + - ein + - eine + - einer + - eines + - den + - dem + - des + - und + - oder + - aber + - sowie + - ich + - du + - er + - sie + - es + - wir + - ihr + - halt + - eben + - auch + - schon + - noch + - mal + - bitte + - danke + - also + - nun + - tja + - dann + - danach + - davor + - hier + - dort + - heute + - gestern + - morgen + - könnte + - kannst + - kann + - würde + - würdest + - würden diff --git a/config/retriex/query_enrichment.yaml b/config/retriex/query_enrichment.yaml new file mode 100644 index 0000000..46646ff --- /dev/null +++ b/config/retriex/query_enrichment.yaml @@ -0,0 +1,16 @@ +# Query enrichment vocabulary. +# Defaults are intentionally identical to the previous PHP mapping. +parameters: + retriex.query_enrichment.config: + max_expansions: 4 + rules: + Wasserhärte: Resthärte + Gerät: Modell + Indikator: Chemie + Seminar: Webinar + Schulung: Seminar + Indikatoren: Indikator + Wasserhärte-Grenzwert: Resthärte + Resthärte-Grenzwert: Wasserhärte + Grenzwert: Überwachungsbereich + store: shop diff --git a/config/retriex/retrieval.yaml b/config/retriex/retrieval.yaml index d096a53..5fd88d6 100644 --- a/config/retriex/retrieval.yaml +++ b/config/retriex/retrieval.yaml @@ -1,7 +1,7 @@ -# Current 1.4.2 retrieval constants documented as configuration inventory. -# In this round these values are exposed by config dump/validation; the retriever logic remains unchanged. +# Active retrieval configuration. +# Defaults are intentionally identical to the frozen 1.4.2 constants. parameters: - retriex.retrieval.inventory: + retriex.retrieval.config: hard_max_chunks: 6 hard_max_vectork: 18 hard_max_keywordk: 36 @@ -26,3 +26,142 @@ parameters: focused_product_min_score: 10.0 focused_product_min_gap: 4.0 focused_product_max_chunks: 4 + + generic_product_tokens: + - produkt + - produkte + - produktkarte + - titel + - geraet + - gerät + - messgeraet + - messgerät + - wasser + - haerte + - härte + - resthaerte + - resthärte + - analyse + - analysator + - automat + - online + - messung + - messen + - preis + - preise + - kosten + - info + - infos + - passend + - richtige + - richtiges + - geeignet + - geeignete + - welche + - welcher + - welches + - brauche + - suche + + important_short_model_tokens: [th, tc, tp, tm, ph, rx] + + family_descriptor_tokens: + - evo + - eco + - self + - clean + - mini + - pro + - plus + - basic + - lab + - inline + - compact + - panel + - sc + + looks_like_reagent_tokens: + - indikator + - reagenz + - reagens + - laborchemikalie + - chemikalie + - sicherheitsdatenblatt + - sdb + - msds + - ufi + - gebinde + - flasche + - ersatzteil + - zubehoer + - zubehör + - service set + - filtereinsatz + - kerzenfilter + - druckregler + + looks_like_safety_docs: + - sicherheitsdatenblatt + - sdb + - msds + - gefahrenbewertung + - gefahrenpiktogramm + - signalwort + - lagerung + - transport + - clp + - kennzeichnung + - h290 + - pbt + - vpvb + + looks_like_reagent_words: + - indikator + - reagenz + - reagens + - chemie + - chemikalie + - sdb + - sicherheitsdatenblatt + - msds + - flasche + - gebinde + + looks_like_document_words: + - datenblatt + - dokument + - pdf + - handbuch + - manual + - beschreibung + - sdb + - sicherheitsdatenblatt + - msds + + looks_like_safety_words: + - gefahr + - gefahrgut + - clp + - h290 + - sicherheit + - kennzeichnung + - transport + - lagerung + - piktogramm + + looks_like_device_words: + - geraet + - gerät + - messgeraet + - messgerät + - analysator + - automat + - messung + - messen + - ueberwachung + - überwachung + - online + - monitor + + # Backwards-compatible name for existing config diagnostics. + retriex.retrieval.inventory: '%retriex.retrieval.config%' diff --git a/config/services.yaml b/config/services.yaml index 39b350d..76a659b 100644 --- a/config/services.yaml +++ b/config/services.yaml @@ -7,6 +7,8 @@ imports: - { resource: 'retriex/prompt.yaml' } - { resource: 'retriex/agent.yaml' } - { resource: 'retriex/retrieval.yaml' } + - { resource: 'retriex/language.yaml' } + - { resource: 'retriex/query_enrichment.yaml' } # ------------------------------------------------------------ # Parameters @@ -118,6 +120,23 @@ services: arguments: $config: '%retriex.agent.config%' + App\Config\NdjsonHybridRetrieverConfig: + arguments: + $config: '%retriex.retrieval.config%' + $vocabulary: '%retriex.retrieval.config%' + + App\Config\StopWordsConfig: + arguments: + $config: '%retriex.stopwords.config%' + + App\Config\QueryEnricherConfig: + arguments: + $config: '%retriex.query_enrichment.config%' + + App\Config\ShopServiceConfig: + arguments: + $config: '%retriex.shop_matching.config%' + App\Infrastructure\OllamaClient: arguments: $apiUrl: '%env(AI_LLM_API_URL)%' diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index e1f092a..411d66b 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -131,24 +131,24 @@ final class NdjsonHybridRetrieverConfig 'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung', 'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend', 'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher', - 'welches', 'brauche', 'suche' + 'welches', 'brauche', 'suche', ]; - const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx']; + public const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx']; - const FAMILY_DESCRIPTOR_TOKEN = [ + public const FAMILY_DESCRIPTOR_TOKEN = [ 'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab', 'inline', 'compact', 'panel', 'sc', ]; - const LOOKS_LIKE_REAGENT_TOKENS = [ + public const LOOKS_LIKE_REAGENT_TOKENS = [ 'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie', 'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche', 'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz', 'kerzenfilter', 'druckregler', ]; - const LOOKS_LIKE_SAFETY_DOCS = [ + public const LOOKS_LIKE_SAFETY_DOCS = [ 'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung', 'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp', 'kennzeichnung', 'h290', 'pbt', 'vpvb', @@ -174,4 +174,309 @@ final class NdjsonHybridRetrieverConfig 'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor', ]; -} \ No newline at end of file + /** + * @param array $config + * @param array $vocabulary Kept for backwards-compatible service wiring. + */ + public function __construct( + private array $config = [], + private array $vocabulary = [], + ) { + } + + public function hardMaxChunks(): int + { + return $this->intValue('hard_max_chunks', self::HARD_MAX_CHUNKS, 1); + } + + public function hardMaxVectorK(): int + { + return $this->intValue('hard_max_vectork', self::HARD_MAX_VECTORK, 1); + } + + public function hardMaxKeywordK(): int + { + return $this->intValue('hard_max_keywordk', self::HARD_MAX_KEYWORDK, 1); + } + + public function vectorScoreThreshold(): float + { + return $this->floatValue('vector_score_threshold', self::VECTOR_SCORE_THRESHOLD, 0.0, 1.0); + } + + public function thresholdFloor(): float + { + return $this->floatValue('threshold_floor', self::THRESHOLD_FLOOR, 0.0, 1.0); + } + + public function thresholdCeil(): float + { + return $this->floatValue('threshold_ceil', self::THRESHOLD_CEIL, 0.0, 1.0); + } + + public function listBonus(): float + { + return $this->floatValue('list_bonus', self::LIST_BONUS, 1.0); + } + + public function rrfK(): int + { + return $this->intValue('rrf_k', self::RRF_K, 1); + } + + public function keywordTopKMultiplier(): float + { + return $this->floatValue('keyword_topk_multiplier', self::KEYWORD_TOPK_MULTIPLIER, 0.1); + } + + public function keywordScoreThreshold(): float + { + return $this->floatValue('keyword_score_threshold', self::KEYWORD_SCORE_THRESHOLD, 0.0, 1.0); + } + + public function keywordRrfWeight(): float + { + return $this->floatValue('keyword_rrf_weight', self::KEYWORD_RRF_WEIGHT, 0.0); + } + + public function scopedVectorRrfWeight(): float + { + return $this->floatValue('scoped_vector_rrf_weight', self::SCOPED_VECTOR_RRF_WEIGHT, 0.0); + } + + public function scopedKeywordRrfWeight(): float + { + return $this->floatValue('scoped_keyword_rrf_weight', self::SCOPED_KEYWORD_RRF_WEIGHT, 0.0); + } + + public function emptyRrfFallbackTopN(): int + { + return $this->intValue('empty_rrf_fallback_topn', self::EMPTY_RRF_FALLBACK_TOPN, 1); + } + + public function maxChunksPerDoc(): int + { + return $this->intValue('max_chunks_per_doc', self::MAX_CHUNKS_PER_DOC, 1); + } + + public function minChunkDistance(): int + { + return $this->intValue('min_chunk_distance', self::MIN_CHUNK_DISTANCE, 0); + } + + public function dominantDocWindow(): int + { + return $this->intValue('dominant_doc_window', self::DOMINANT_DOC_WINDOW, 1); + } + + public function dominantDocMinHits(): int + { + return $this->intValue('dominant_doc_min_hits', self::DOMINANT_DOC_MIN_HITS, 1); + } + + public function dominantDocMaxChunks(): int + { + return $this->intValue('dominant_doc_max_chunks', self::DOMINANT_DOC_MAX_CHUNKS, 1); + } + + public function exactDocumentMaxChunks(): int + { + return $this->intValue('exact_document_max_chunks', self::EXACT_DOCUMENT_MAX_CHUNKS, 1); + } + + public function focusedProductWindow(): int + { + return $this->intValue('focused_product_window', self::FOCUSED_PRODUCT_WINDOW, 1); + } + + public function focusedProductMinScore(): float + { + return $this->floatValue('focused_product_min_score', self::FOCUSED_PRODUCT_MIN_SCORE, 0.0); + } + + public function focusedProductMinGap(): float + { + return $this->floatValue('focused_product_min_gap', self::FOCUSED_PRODUCT_MIN_GAP, 0.0); + } + + public function focusedProductMaxChunks(): int + { + return $this->intValue('focused_product_max_chunks', self::FOCUSED_PRODUCT_MAX_CHUNKS, 1); + } + + /** @return string[] */ + public function genericProductTokens(): array + { + return $this->stringList('generic_product_tokens', self::GENERIC_PRODUCT_TOKEN); + } + + /** @return string[] */ + public function importantShortModelTokens(): array + { + return $this->stringList('important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN); + } + + /** @return string[] */ + public function familyDescriptorTokens(): array + { + return $this->stringList('family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN); + } + + /** @return string[] */ + public function looksLikeReagentTokens(): array + { + return $this->stringList('looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS); + } + + /** @return string[] */ + public function looksLikeSafetyDocs(): array + { + return $this->stringList('looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS); + } + + /** @return string[] */ + public function looksLikeReagentWords(): array + { + return $this->stringList('looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS); + } + + /** @return string[] */ + public function looksLikeDocumentWords(): array + { + return $this->stringList('looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS); + } + + /** @return string[] */ + public function looksLikeSafetyWords(): array + { + return $this->stringList('looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS); + } + + /** @return string[] */ + public function looksLikeDeviceWords(): array + { + return $this->stringList('looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS); + } + + /** + * @return array + */ + public function toArray(): array + { + return [ + 'hard_max_chunks' => $this->hardMaxChunks(), + 'hard_max_vectork' => $this->hardMaxVectorK(), + 'hard_max_keywordk' => $this->hardMaxKeywordK(), + 'vector_score_threshold' => $this->vectorScoreThreshold(), + 'threshold_floor' => $this->thresholdFloor(), + 'threshold_ceil' => $this->thresholdCeil(), + 'list_bonus' => $this->listBonus(), + 'rrf_k' => $this->rrfK(), + 'keyword_topk_multiplier' => $this->keywordTopKMultiplier(), + 'keyword_score_threshold' => $this->keywordScoreThreshold(), + 'keyword_rrf_weight' => $this->keywordRrfWeight(), + 'scoped_vector_rrf_weight' => $this->scopedVectorRrfWeight(), + 'scoped_keyword_rrf_weight' => $this->scopedKeywordRrfWeight(), + 'empty_rrf_fallback_topn' => $this->emptyRrfFallbackTopN(), + 'max_chunks_per_doc' => $this->maxChunksPerDoc(), + 'min_chunk_distance' => $this->minChunkDistance(), + 'dominant_doc_window' => $this->dominantDocWindow(), + 'dominant_doc_min_hits' => $this->dominantDocMinHits(), + 'dominant_doc_max_chunks' => $this->dominantDocMaxChunks(), + 'exact_document_max_chunks' => $this->exactDocumentMaxChunks(), + 'focused_product_window' => $this->focusedProductWindow(), + 'focused_product_min_score' => $this->focusedProductMinScore(), + 'focused_product_min_gap' => $this->focusedProductMinGap(), + 'focused_product_max_chunks' => $this->focusedProductMaxChunks(), + 'generic_product_tokens' => $this->genericProductTokens(), + 'important_short_model_tokens' => $this->importantShortModelTokens(), + 'family_descriptor_tokens' => $this->familyDescriptorTokens(), + 'looks_like_reagent_tokens' => $this->looksLikeReagentTokens(), + 'looks_like_safety_docs' => $this->looksLikeSafetyDocs(), + 'looks_like_reagent_words' => $this->looksLikeReagentWords(), + 'looks_like_document_words' => $this->looksLikeDocumentWords(), + 'looks_like_safety_words' => $this->looksLikeSafetyWords(), + 'looks_like_device_words' => $this->looksLikeDeviceWords(), + ]; + } + + private function intValue(string $key, int $default, int $min = PHP_INT_MIN, ?int $max = null): int + { + $value = $this->raw($key, $default); + + if (!is_numeric($value)) { + return $default; + } + + $value = (int) $value; + $value = max($min, $value); + + if ($max !== null) { + $value = min($max, $value); + } + + return $value; + } + + private function floatValue(string $key, float $default, float $min = -INF, ?float $max = null): float + { + $value = $this->raw($key, $default); + + if (!is_numeric($value)) { + return $default; + } + + $value = (float) $value; + $value = max($min, $value); + + if ($max !== null) { + $value = min($max, $value); + } + + return $value; + } + + /** + * @param string[] $default + * @return string[] + */ + private function stringList(string $key, array $default): array + { + $value = $this->raw($key, $default); + + if (!is_array($value)) { + return $default; + } + + $out = []; + foreach ($value as $item) { + if (!is_scalar($item)) { + continue; + } + + $item = trim((string) $item); + if ($item === '') { + continue; + } + + if (!in_array($item, $out, true)) { + $out[] = $item; + } + } + + return $out !== [] ? $out : $default; + } + + private function raw(string $key, mixed $default): mixed + { + if (array_key_exists($key, $this->config)) { + return $this->config[$key]; + } + + if (array_key_exists($key, $this->vocabulary)) { + return $this->vocabulary[$key]; + } + + return $default; + } +} diff --git a/src/Config/QueryEnricherConfig.php b/src/Config/QueryEnricherConfig.php index f772eff..009ef46 100644 --- a/src/Config/QueryEnricherConfig.php +++ b/src/Config/QueryEnricherConfig.php @@ -7,38 +7,12 @@ namespace App\Config; final readonly class QueryEnricherConfig { /** - * Keep the enrichment vocabulary in the class for now. - * - * Important: - * - This is intentionally NOT externalized yet. - * - Add or maintain the current project-specific mappings here. - * - The later move to external config/files can happen separately. - * - * Supported shapes: - * - * 1) Simple mapping: - * [ - * 'water hardness' => 'residual hardness', - * 'device' => 'instrument', - * ] - * - * 2) Small synonym groups: - * [ - * ['water hardness', 'residual hardness', 'hardness'], - * ['device', 'instrument', 'meter'], - * ] - * - * The public API stays intentionally simple: - * - getEnrichQueryList(): array - * - * This keeps QueryEnricher generic while the domain vocabulary - * deliberately remains inside this class for now. - * - * Replace the example entries below with your real project mappings. + * Backwards-compatible fallback vocabulary. + * Active values are loaded from retriex.query_enrichment.config when present. * * @var array */ - private const ENRICH_QUERY_LIST = [ + private const DEFAULT_ENRICH_QUERY_LIST = [ 'Wasserhärte' => 'Resthärte', 'Gerät' => 'Modell', 'Indikator' => 'Chemie', @@ -48,9 +22,16 @@ final readonly class QueryEnricherConfig 'Wasserhärte-Grenzwert' => 'Resthärte', 'Resthärte-Grenzwert' => 'Wasserhärte', 'Grenzwert' => 'Überwachungsbereich', - 'store'=>'shop' + 'store' => 'shop', ]; + /** + * @param array $config + */ + public function __construct(private array $config = []) + { + } + /** * Returns a normalized, deduplicated mapping for the QueryEnricher. * @@ -71,8 +52,13 @@ final readonly class QueryEnricherConfig public function getEnrichQueryList(): array { $normalized = []; + $rules = $this->config['rules'] ?? self::DEFAULT_ENRICH_QUERY_LIST; - foreach (self::ENRICH_QUERY_LIST as $key => $value) { + if (!is_array($rules)) { + $rules = self::DEFAULT_ENRICH_QUERY_LIST; + } + + foreach ($rules as $key => $value) { if (is_array($value)) { $this->ingestGroup($normalized, $value); continue; @@ -93,6 +79,17 @@ final readonly class QueryEnricherConfig return $normalized; } + public function getMaxExpansions(): int + { + $value = $this->config['max_expansions'] ?? 4; + + if (!is_numeric($value)) { + return 4; + } + + return max(0, (int) $value); + } + /** * Returns true when at least one valid enrichment rule exists. */ @@ -176,4 +173,4 @@ final readonly class QueryEnricherConfig return trim($value); } -} \ No newline at end of file +} diff --git a/src/Config/RetriexEffectiveConfigProvider.php b/src/Config/RetriexEffectiveConfigProvider.php index aea7595..37ac996 100644 --- a/src/Config/RetriexEffectiveConfigProvider.php +++ b/src/Config/RetriexEffectiveConfigProvider.php @@ -16,6 +16,7 @@ final readonly class RetriexEffectiveConfigProvider private ModelGenerationConfigProvider $modelProvider, private IndexConfigurationProvider $indexProvider, private PromptBuilderConfig $promptConfig, + private NdjsonHybridRetrieverConfig $retrieverConfig, ) { } @@ -144,30 +145,8 @@ final readonly class RetriexEffectiveConfigProvider private function retrievalConfig(): array { return [ - 'hard_max_chunks' => NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS, - 'hard_max_vectork' => NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK, - 'hard_max_keywordk' => NdjsonHybridRetrieverConfig::HARD_MAX_KEYWORDK, - 'vector_score_threshold' => NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD, - 'threshold_floor' => NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR, - 'threshold_ceil' => NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, - 'list_bonus' => NdjsonHybridRetrieverConfig::LIST_BONUS, - 'rrf_k' => NdjsonHybridRetrieverConfig::RRF_K, - 'keyword_topk_multiplier' => NdjsonHybridRetrieverConfig::KEYWORD_TOPK_MULTIPLIER, - 'keyword_score_threshold' => NdjsonHybridRetrieverConfig::KEYWORD_SCORE_THRESHOLD, - 'keyword_rrf_weight' => NdjsonHybridRetrieverConfig::KEYWORD_RRF_WEIGHT, - 'scoped_vector_rrf_weight' => NdjsonHybridRetrieverConfig::SCOPED_VECTOR_RRF_WEIGHT, - 'scoped_keyword_rrf_weight' => NdjsonHybridRetrieverConfig::SCOPED_KEYWORD_RRF_WEIGHT, - 'empty_rrf_fallback_topn' => NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN, - 'max_chunks_per_doc' => NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC, - 'min_chunk_distance' => NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE, - 'dominant_doc_window' => NdjsonHybridRetrieverConfig::DOMINANT_DOC_WINDOW, - 'dominant_doc_min_hits' => NdjsonHybridRetrieverConfig::DOMINANT_DOC_MIN_HITS, - 'dominant_doc_max_chunks' => NdjsonHybridRetrieverConfig::DOMINANT_DOC_MAX_CHUNKS, - 'exact_document_max_chunks' => NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS, - 'focused_product_window' => NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_WINDOW, - 'focused_product_min_score' => NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_SCORE, - 'focused_product_min_gap' => NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_GAP, - 'focused_product_max_chunks' => NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MAX_CHUNKS, + ...$this->retrieverConfig->toArray(), + 'vocabulary' => $this->retrieverConfig->vocabularyToArray(), 'inventory_parameter' => $this->param('retriex.retrieval.inventory', []), ]; } diff --git a/src/Config/ShopServiceConfig.php b/src/Config/ShopServiceConfig.php index 0210a3c..6e2797f 100644 --- a/src/Config/ShopServiceConfig.php +++ b/src/Config/ShopServiceConfig.php @@ -7,451 +7,461 @@ namespace App\Config; final class ShopServiceConfig { public const DEVICE_QUERY_KEYWORDS = [ - 'analysegerät', - 'analysegeraet', - 'analysegeräte', - 'analysegeraete', - 'messgerät', - 'messgeraet', - 'messgeräte', - 'messgeraete', - 'analysator', - 'analysatoren', - 'analyzer', - 'gerät', - 'geraet', - 'geräte', - 'geraete', - 'monitor', - 'monitore', - 'controller', - 'gerät für', - 'geraet fuer', - 'geräte für', - 'geraete fuer', - 'system', - 'systeme', - 'anlage', - 'anlagen', + 'analysegerät', 'analysegeraet', 'analysegeräte', 'analysegeraete', + 'messgerät', 'messgeraet', 'messgeräte', 'messgeraete', + 'analysator', 'analysatoren', 'analyzer', 'gerät', 'geraet', 'geräte', + 'geraete', 'monitor', 'monitore', 'controller', 'gerät für', + 'geraet fuer', 'geräte für', 'geraete fuer', 'system', 'systeme', + 'anlage', 'anlagen', ]; public const ACCESSORY_QUERY_KEYWORDS = [ - 'zubehör', - 'zubehor', - 'reagenz', - 'reagenzien', - 'reagent', - 'indikator', - 'indikatoren', - 'indicator', - 'kit', - 'set', - 'ersatz', - 'ersatzteil', - 'ersatzteile', - 'verbrauchsmaterial', - 'consumable', - 'dazu', - 'passend', - 'passende', - 'passendes', - 'nachfüll', - 'nachfuell', - 'refill', - 'filter', - 'pumpenkopf', - 'motorblock', - 'service set', - 'serviceset', - 'service-set', + 'zubehör', 'zubehor', 'reagenz', 'reagenzien', 'reagent', 'indikator', + 'indikatoren', 'indicator', 'kit', 'set', 'ersatz', 'ersatzteil', + 'ersatzteile', 'verbrauchsmaterial', 'consumable', 'dazu', 'passend', + 'passende', 'passendes', 'nachfüll', 'nachfuell', 'refill', 'filter', + 'pumpenkopf', 'motorblock', 'service set', 'serviceset', 'service-set', ]; public const ACCESSORY_PRODUCT_KEYWORDS = [ - 'reagenz', - 'reagenzien', - 'reagent', - 'indikator', - 'indikatoren', - 'indicator', - 'kit', - 'set', - 'verbrauchsmaterial', - 'consumable', - 'zubehör', - 'zubehor', - 'ersatz', - 'ersatzteil', - 'ersatzteile', - 'nachfüll', - 'nachfuell', - 'refill', - 'lösung', - 'loesung', - 'solution', - 'teststreifen', - 'test strip', - 'filter', - 'pumpenkopf', - 'motorblock', - 'service set', - 'serviceset', - 'service-set', + 'reagenz', 'reagenzien', 'reagent', 'indikator', 'indikatoren', + 'indicator', 'kit', 'set', 'verbrauchsmaterial', 'consumable', + 'zubehör', 'zubehor', 'ersatz', 'ersatzteil', 'ersatzteile', + 'nachfüll', 'nachfuell', 'refill', 'lösung', 'loesung', 'solution', + 'teststreifen', 'test strip', 'filter', 'pumpenkopf', 'motorblock', + 'service set', 'serviceset', 'service-set', ]; public const DEVICE_PRODUCT_KEYWORDS = [ - 'analysegerät', - 'analysegeraet', - 'analysegeräte', - 'analysegeraete', - 'messgerät', - 'messgeraet', - 'messgeräte', - 'messgeraete', - 'analysator', - 'analysatoren', - 'analyzer', - 'monitor', - 'monitore', - 'controller', - 'online-analysator', - 'online analysator', - 'online-analysegerät', - 'online analysegeraet', - 'online-analysegeräte', - 'online analysegeraete', - 'online analyzer', - 'online monitor', - 'system', - 'systeme', - 'anlage', - 'anlagen', - 'gerät', - 'geraet', - 'geräte', - 'geraete', + 'analysegerät', 'analysegeraet', 'analysegeräte', 'analysegeraete', + 'messgerät', 'messgeraet', 'messgeräte', 'messgeraete', + 'analysator', 'analysatoren', 'analyzer', 'monitor', 'monitore', + 'controller', 'online-analysator', 'online analysator', + 'online-analysegerät', 'online analysegeraet', 'online-analysegeräte', + 'online analysegeraete', 'online analyzer', 'online monitor', 'system', + 'systeme', 'anlage', 'anlagen', 'gerät', 'geraet', 'geräte', 'geraete', ]; + private const DEVICE_FOCUS_KEYWORDS = [ + 'geräte', 'geraete', 'gerät', 'geraet', 'analysegerät', 'analysegeraet', + 'messgerät', 'messgeraet', 'analysator', 'controller', 'monitor', + ]; + + private const ACCESSORY_FOCUS_KEYWORDS = [ + 'indikator', 'indikatoren', 'reagenz', 'reagenzien', 'zubehör', + 'zubehor', 'ersatzteil', 'ersatzteile', 'verbrauchsmaterial', + 'service set', 'serviceset', 'filter', 'pumpenkopf', 'motorblock', + ]; + + private const ACCESSORY_FOCUS_VARIANT_MAP = [ + 'indikator' => ['indikator', 'indikatoren'], + 'indikatoren' => ['indikator', 'indikatoren'], + 'reagenz' => ['reagenz', 'reagenzien'], + 'reagenzien' => ['reagenz', 'reagenzien'], + 'ersatzteil' => ['ersatzteil', 'ersatzteile'], + 'ersatzteile' => ['ersatzteil', 'ersatzteile'], + 'service set' => ['service set', 'serviceset', 'service-set'], + 'serviceset' => ['service set', 'serviceset', 'service-set'], + 'service-set' => ['service set', 'serviceset', 'service-set'], + ]; + + /** + * @param array $config + */ + public function __construct(private array $config = []) + { + } + public function getTopProductLogLimit(): int { - return 3; + return $this->int('top_product_log_limit', 3, 0); } - /** - * @return string[] - */ + /** @return string[] */ public function getDeviceFocusKeywords(): array { - return [ - 'geräte', - 'geraete', - 'gerät', - 'geraet', - 'analysegerät', - 'analysegeraet', - 'messgerät', - 'messgeraet', - 'analysator', - 'controller', - 'monitor', - ]; + return $this->stringList('device_focus_keywords', self::DEVICE_FOCUS_KEYWORDS); } - /** - * @return string[] - */ + /** @return string[] */ public function getAccessoryFocusKeywords(): array { - return [ - 'indikator', - 'indikatoren', - 'reagenz', - 'reagenzien', - 'zubehör', - 'zubehor', - 'ersatzteil', - 'ersatzteile', - 'verbrauchsmaterial', - 'service set', - 'serviceset', - 'filter', - 'pumpenkopf', - 'motorblock', - ]; + return $this->stringList('accessory_focus_keywords', self::ACCESSORY_FOCUS_KEYWORDS); } - /** - * @return array - */ + /** @return array */ public function getAccessoryFocusVariantMap(): array { - return [ - 'indikator' => ['indikator', 'indikatoren'], - 'indikatoren' => ['indikator', 'indikatoren'], - 'reagenz' => ['reagenz', 'reagenzien'], - 'reagenzien' => ['reagenz', 'reagenzien'], - 'ersatzteil' => ['ersatzteil', 'ersatzteile'], - 'ersatzteile' => ['ersatzteil', 'ersatzteile'], - 'service set' => ['service set', 'serviceset', 'service-set'], - 'serviceset' => ['service set', 'serviceset', 'service-set'], - 'service-set' => ['service set', 'serviceset', 'service-set'], - ]; + return $this->stringListMap('accessory_focus_variant_map', self::ACCESSORY_FOCUS_VARIANT_MAP); } - /** - * @return string[] - */ + /** @return string[] */ public function getDeviceQueryKeywords(): array { - return self::DEVICE_QUERY_KEYWORDS; + return $this->stringList('device_query_keywords', self::DEVICE_QUERY_KEYWORDS); } - /** - * @return string[] - */ + /** @return string[] */ public function getAccessoryQueryKeywords(): array { - return self::ACCESSORY_QUERY_KEYWORDS; + return $this->stringList('accessory_query_keywords', self::ACCESSORY_QUERY_KEYWORDS); } - /** - * @return string[] - */ + /** @return string[] */ public function getAccessoryProductKeywords(): array { - return self::ACCESSORY_PRODUCT_KEYWORDS; + return $this->stringList('accessory_product_keywords', self::ACCESSORY_PRODUCT_KEYWORDS); } - /** - * @return string[] - */ + /** @return string[] */ public function getDeviceProductKeywords(): array { - return self::DEVICE_PRODUCT_KEYWORDS; + return $this->stringList('device_product_keywords', self::DEVICE_PRODUCT_KEYWORDS); } public function getExactProductNumberPhraseScore(): int { - return 160; + return $this->int('scores.exact_product_number_phrase', 160); } public function getExactProductNamePhraseScore(): int { - return 90; + return $this->int('scores.exact_product_name_phrase', 90); } public function getExactManufacturerMatchScore(): int { - return 40; + return $this->int('scores.exact_manufacturer_match', 40); } public function getBrandContainedInNameScore(): int { - return 20; + return $this->int('scores.brand_contained_in_name', 20); } public function getNameTokenOverlapWeight(): int { - return 6; + return $this->int('scores.name_token_overlap_weight', 6); } public function getProductNumberTokenOverlapWeight(): int { - return 10; + return $this->int('scores.product_number_token_overlap_weight', 10); } public function getCorpusTokenOverlapWeight(): int { - return 2; + return $this->int('scores.corpus_token_overlap_weight', 2); } public function getNameNumberOverlapWeight(): int { - return 18; + return $this->int('scores.name_number_overlap_weight', 18); } public function getProductNumberNumberOverlapWeight(): int { - return 28; + return $this->int('scores.product_number_number_overlap_weight', 28); } public function getCorpusNumberOverlapWeight(): int { - return 8; + return $this->int('scores.corpus_number_overlap_weight', 8); } public function getSizeMatchScore(): int { - return 12; + return $this->int('scores.size_match', 12); } public function getAvailabilityBonusScore(): int { - return 1; + return $this->int('scores.availability_bonus', 1); } public function getDeviceQueryDeviceProductBonus(): int { - return 60; + return $this->int('scores.device_query_device_product_bonus', 60); } public function getDeviceQueryAccessoryPenalty(): int { - return 120; + return $this->int('scores.device_query_accessory_penalty', 120); } public function getAccessoryQueryAccessoryProductBonus(): int { - return 30; + return $this->int('scores.accessory_query_accessory_product_bonus', 30); } public function getAccessoryQueryDeviceProductBonus(): int { - return 10; + return $this->int('scores.accessory_query_device_product_bonus', 10); } public function getContainsDigitPattern(): string { - return '/\d/u'; + return $this->string('patterns.contains_digit', '/\d/u'); } public function getMatchingCleanupPattern(): string { - return '/[^\p{L}\p{N}]+/u'; + return $this->string('patterns.matching_cleanup', '/[^\p{L}\p{N}]+/u'); } public function getWhitespaceCollapsePattern(): string { - return '/\s+/u'; + return $this->string('patterns.whitespace_collapse', '/\s+/u'); } public function getTokenSplitPattern(): string { - return '/[^\p{L}\p{N}]+/u'; + return $this->string('patterns.token_split', '/[^\p{L}\p{N}]+/u'); } public function wrapWithPaddingSpaces(string $value): string { - return ' ' . trim($value) . ' '; + return $this->string('padding.prefix', ' ') . trim($value) . $this->string('padding.suffix', ' '); } - /** - * @return string[] - */ + /** @return string[] */ public function getPriceNormalizationSearch(): array { - return ['€', ' ', '.']; + return $this->stringList('price.normalization_search', ['€', ' ', '.']); } - /** - * @return string[] - */ + /** @return string[] */ public function getPriceNormalizationReplace(): array { - return ['', '', '']; + return $this->stringList('price.normalization_replace', ['', '', ''], true, ['', '', '']); } public function getPrimaryCustomFieldKey(): string { - return 'migration_Backup_product_attr1'; + return $this->string('custom_fields.primary', 'migration_Backup_product_attr1'); } public function getSecondaryCustomFieldKey(): string { - return 'migration_Backup_product_attr2'; + return $this->string('custom_fields.secondary', 'migration_Backup_product_attr2'); } public function getUseCasesCustomFieldKey(): string { - return 'migration_Backup_product_attr4'; + return $this->string('custom_fields.use_cases', 'migration_Backup_product_attr4'); } public function getLanguagesCustomFieldKey(): string { - return 'migration_Backup_product_attr5'; + return $this->string('custom_fields.languages', 'migration_Backup_product_attr5'); } public function getPrimarySecondarySeparator(): string { - return ': '; + return $this->string('text.primary_secondary_separator', ': '); } public function getUseCasesLabel(): string { - return 'Einsatzgebiete: '; + return $this->string('text.use_cases_label', 'Einsatzgebiete: '); } public function getLanguagesLabel(): string { - return 'Sprachen: '; + return $this->string('text.languages_label', 'Sprachen: '); } public function getCustomFieldJoinSeparator(): string { - return ' | '; + return $this->string('text.custom_field_join_separator', ' | '); } public function getDescriptionEmptyLinePattern(): string { - return '/^[ \t]*\R/m'; + return $this->string('description.empty_line_pattern', '/^[ \t]*\R/m'); } public function getDescriptionWhitespaceCleanupPattern(): string { - return '/[ \t]{2,}/'; + return $this->string('description.whitespace_cleanup_pattern', '/[ \t]{2,}/'); } public function getDescriptionMaxLength(): int { - return 1500; + return $this->int('description.max_length', 1500, 0); } public function getPriceDecimals(): int { - return 2; + return $this->int('price.decimals', 2, 0); } public function getPriceDecimalSeparator(): string { - return ','; + return $this->string('price.decimal_separator', ','); } public function getPriceThousandsSeparator(): string { - return '.'; + return $this->string('price.thousands_separator', '.'); } public function getPriceSuffix(): string { - return ' €'; + return $this->string('price.suffix', ' €'); } public function buildRelativeSeoUrl(string $path): string { - return '/' . ltrim($path, '/'); + return $this->string('seo.relative_prefix', '/') . ltrim($path, '/'); } public function getAvailableHighlightLabel(): string { - return 'Verfügbar'; + return $this->string('highlight.available_label', 'Verfügbar'); } public function getUnavailableHighlightLabel(): string { - return 'Nicht verfügbar'; + return $this->string('highlight.unavailable_label', 'Nicht verfügbar'); } public function getProductNumberHighlightPrefix(): string { - return 'Produktnummer: '; + return $this->string('highlight.product_number_prefix', 'Produktnummer: '); } public function getMissingProductImagePlaceholder(): string { - return 'no-image'; + return $this->string('image.missing_placeholder', 'no-image'); } public function getDeduplicationSeparator(): string { - return '|'; + return $this->string('deduplication.separator', '|'); } -} \ No newline at end of file + + private function int(string $path, int $default, int $min = PHP_INT_MIN): int + { + $value = $this->value($path, $default); + + if (!is_numeric($value)) { + return $default; + } + + return max($min, (int) $value); + } + + private function string(string $path, string $default): string + { + $value = $this->value($path, $default); + + if (!is_scalar($value)) { + return $default; + } + + return (string) $value; + } + + /** + * @param string[] $default + * @param string[]|null $emptySafeDefault + * @return string[] + */ + private function stringList(string $path, array $default, bool $allowEmptyStrings = false, ?array $emptySafeDefault = null): array + { + $value = $this->value($path, $default); + + if (!is_array($value)) { + return $emptySafeDefault ?? $default; + } + + $out = []; + foreach ($value as $item) { + if (!is_scalar($item)) { + continue; + } + + $item = (string) $item; + if (!$allowEmptyStrings) { + $item = trim($item); + } + + if (!$allowEmptyStrings && $item === '') { + continue; + } + + if ($allowEmptyStrings || !in_array($item, $out, true)) { + $out[] = $item; + } + } + + if ($out === [] && !$allowEmptyStrings) { + return $emptySafeDefault ?? $default; + } + + return $out; + } + + /** + * @param array $default + * @return array + */ + private function stringListMap(string $path, array $default): array + { + $value = $this->value($path, $default); + + if (!is_array($value)) { + return $default; + } + + $out = []; + foreach ($value as $key => $items) { + if (!is_string($key) || !is_array($items)) { + continue; + } + + $cleanKey = trim($key); + if ($cleanKey === '') { + continue; + } + + $cleanItems = []; + foreach ($items as $item) { + if (!is_scalar($item)) { + continue; + } + + $item = trim((string) $item); + if ($item === '') { + continue; + } + + if (!in_array($item, $cleanItems, true)) { + $cleanItems[] = $item; + } + } + + if ($cleanItems !== []) { + $out[$cleanKey] = $cleanItems; + } + } + + return $out !== [] ? $out : $default; + } + + private function value(string $path, mixed $default): mixed + { + $current = $this->config; + + foreach (explode('.', $path) as $segment) { + if (!is_array($current) || !array_key_exists($segment, $current)) { + return $default; + } + + $current = $current[$segment]; + } + + return $current; + } +} diff --git a/src/Config/StopWordsConfig.php b/src/Config/StopWordsConfig.php index c0797c4..af3b034 100644 --- a/src/Config/StopWordsConfig.php +++ b/src/Config/StopWordsConfig.php @@ -14,27 +14,68 @@ final class StopWordsConfig * - keep question words * - keep domain terms * - remove only structural filler words - * + */ + private const DEFAULT_STOP_WORDS = [ + 'mit', + 'der', 'die', 'das', + 'ein', 'eine', 'einer', 'eines', + 'den', 'dem', 'des', + 'und', 'oder', 'aber', 'sowie', + 'ich', 'du', 'er', 'sie', 'es', + 'wir', 'ihr', + 'halt', 'eben', 'auch', 'schon', + 'noch', 'mal', 'bitte', 'danke', + 'also', 'nun', 'tja', + 'dann', 'danach', 'davor', + 'hier', 'dort', + 'heute', 'gestern', 'morgen', + 'könnte', 'kannst', 'kann', + 'würde', 'würdest', 'würden', + ]; + + /** + * @param array $config + */ + public function __construct(private array $config = []) + { + } + + /** * @return string[] */ public function getStopWords(): array { - return [ - 'mit', - 'der', 'die', 'das', - 'ein', 'eine', 'einer', 'eines', - 'den', 'dem', 'des', - 'und', 'oder', 'aber', 'sowie', - 'ich', 'du', 'er', 'sie', 'es', - 'wir', 'ihr', - 'halt', 'eben', 'auch', 'schon', - 'noch', 'mal', 'bitte', 'danke', - 'also', 'nun', 'tja', - 'dann', 'danach', 'davor', - 'hier', 'dort', - 'heute', 'gestern', 'morgen', - 'könnte', 'kannst', 'kann', - 'würde', 'würdest', 'würden', - ]; + return $this->stringList('words', self::DEFAULT_STOP_WORDS); } -} \ No newline at end of file + + /** + * @param string[] $default + * @return string[] + */ + private function stringList(string $key, array $default): array + { + $value = $this->config[$key] ?? $default; + + if (!is_array($value)) { + return $default; + } + + $out = []; + foreach ($value as $item) { + if (!is_scalar($item)) { + continue; + } + + $item = trim((string) $item); + if ($item === '') { + continue; + } + + if (!in_array($item, $out, true)) { + $out[] = $item; + } + } + + return $out !== [] ? $out : $default; + } +} diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index d2e6526..60f9a45 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -45,6 +45,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface private IntentRouteResolver $routeResolver, private EntityCatalogService $entityCatalogService, private QueryEnricher $queryEnricher, + private NdjsonHybridRetrieverConfig $retrieverConfig, ) { } @@ -211,7 +212,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface if ($exactDocumentMatch !== null) { $selectedChunkIds = $this->selectExactDocumentChunkIds( $exactDocumentMatch['rows'], - max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)), + max(1, min($config->getRetrievalMaxChunks(), $this->retrieverConfig->hardMaxChunks())), $prompt ); @@ -310,8 +311,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface string $salesIntent ): array { - $limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)); - $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK)); + $limit = max(1, min($config->getRetrievalMaxChunks(), $this->retrieverConfig->hardMaxChunks())); + $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), $this->retrieverConfig->hardMaxVectorK())); $isListQuery = $this->intentLite->isListQuery($prompt); @@ -322,7 +323,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return [ 'limit' => $limit, 'is_list_query' => $isListQuery, - 'threshold' => NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD, + 'threshold' => $this->retrieverConfig->vectorScoreThreshold(), 'ranked_chunk_ids' => [], 'rows' => [], 'rrf_scores' => [], @@ -501,9 +502,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface */ private function computeKeywordTopK(int $vectorTopK): int { - $topK = (int) ceil($vectorTopK * NdjsonHybridRetrieverConfig::KEYWORD_TOPK_MULTIPLIER); + $topK = (int) ceil($vectorTopK * $this->retrieverConfig->keywordTopKMultiplier()); - return max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_KEYWORDK)); + return max(1, min($topK, $this->retrieverConfig->hardMaxKeywordK())); } /** @@ -520,7 +521,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface int $vectorTopKBase ): array { - $threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD; + $threshold = $this->retrieverConfig->vectorScoreThreshold(); $topK = $vectorTopKBase; if ( @@ -531,13 +532,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } if ($isListQuery) { - $topK = (int)round($topK * NdjsonHybridRetrieverConfig::LIST_BONUS); + $topK = (int)round($topK * $this->retrieverConfig->listBonus()); } - $topK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK)); + $topK = max(1, min($topK, $this->retrieverConfig->hardMaxVectorK())); $threshold = max( - NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR, - min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold) + $this->retrieverConfig->thresholdFloor(), + min($this->retrieverConfig->thresholdCeil(), $threshold) ); return [$threshold, $topK]; @@ -587,16 +588,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } $rank++; - $rrf = (1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank)) * $weight; + $rrf = (1.0 / ($this->retrieverConfig->rrfK() + $rank)) * $weight; $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf; } }; $apply($globalHits, $vectorThreshold, 1.0); - $apply($scopedHits, $vectorThreshold, $boostScopedVector ? NdjsonHybridRetrieverConfig::SCOPED_VECTOR_RRF_WEIGHT : 1.0); - $apply($keywordHits, NdjsonHybridRetrieverConfig::KEYWORD_SCORE_THRESHOLD, NdjsonHybridRetrieverConfig::KEYWORD_RRF_WEIGHT); - $apply($scopedKeywordHits, NdjsonHybridRetrieverConfig::KEYWORD_SCORE_THRESHOLD, $boostScopedKeyword ? NdjsonHybridRetrieverConfig::SCOPED_KEYWORD_RRF_WEIGHT : NdjsonHybridRetrieverConfig::KEYWORD_RRF_WEIGHT); + $apply($scopedHits, $vectorThreshold, $boostScopedVector ? $this->retrieverConfig->scopedVectorRrfWeight() : 1.0); + $apply($keywordHits, $this->retrieverConfig->keywordScoreThreshold(), $this->retrieverConfig->keywordRrfWeight()); + $apply($scopedKeywordHits, $this->retrieverConfig->keywordScoreThreshold(), $boostScopedKeyword ? $this->retrieverConfig->scopedKeywordRrfWeight() : $this->retrieverConfig->keywordRrfWeight()); return [ 'rrf_scores' => $rrfScores, @@ -621,9 +622,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } $rank++; - $rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank); + $rrf[(string)$hit['chunk_id']] = 1.0 / ($this->retrieverConfig->rrfK() + $rank); - if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) { + if ($rank >= $this->retrieverConfig->emptyRrfFallbackTopN()) { break; } } @@ -649,7 +650,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface private function selectExactDocumentChunkIds(array $rows, int $limit, string $prompt): array { $orderedRows = $this->sortRowsByChunkIndex($rows); - $max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS); + $max = min($limit, $this->retrieverConfig->exactDocumentMaxChunks()); if ($orderedRows === [] || $max <= 0) { return []; diff --git a/src/Knowledge/Retrieval/QueryEnricher.php b/src/Knowledge/Retrieval/QueryEnricher.php index ffe66dc..3bf1105 100644 --- a/src/Knowledge/Retrieval/QueryEnricher.php +++ b/src/Knowledge/Retrieval/QueryEnricher.php @@ -14,9 +14,7 @@ final readonly class QueryEnricher * The enriched semantic query should help vector retrieval, * but must not become bloated enough to dilute the original user intent. */ - private const MAX_EXPANSIONS = 4; - - public function __construct( + public function __construct( private QueryEnricherConfig $config ) { } @@ -95,7 +93,7 @@ final readonly class QueryEnricher $matches[] = $mappedValue; $seenNormalizedExpansions[$normalizedMappedValue] = true; - if (count($matches) >= self::MAX_EXPANSIONS) { + if (count($matches) >= $this->config->getMaxExpansions()) { break; } }