This commit is contained in:
team 1
2026-04-24 18:54:25 +02:00
parent 372a6797fa
commit c439fb99d6
12 changed files with 1126 additions and 336 deletions

View File

@@ -10,3 +10,236 @@ parameters:
retriex.commerce.search_repair.enabled: true
retriex.commerce.search_repair.max_queries: 3
retriex.commerce.search_repair.min_primary_results_without_repair: 2
# Shop matching and presentation configuration.
# Defaults are intentionally identical to the previous PHP values.
retriex.shop_matching.config:
top_product_log_limit: 3
device_query_keywords:
- analysegerät
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- gerät
- geraet
- geräte
- geraete
- monitor
- monitore
- controller
- gerät für
- geraet fuer
- geräte für
- geraete fuer
- system
- systeme
- anlage
- anlagen
accessory_query_keywords:
- zubehör
- zubehor
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- ersatz
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- consumable
- dazu
- passend
- passende
- passendes
- nachfüll
- nachfuell
- refill
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
accessory_product_keywords:
- reagenz
- reagenzien
- reagent
- indikator
- indikatoren
- indicator
- kit
- set
- verbrauchsmaterial
- consumable
- zubehör
- zubehor
- ersatz
- ersatzteil
- ersatzteile
- nachfüll
- nachfuell
- refill
- lösung
- loesung
- solution
- teststreifen
- test strip
- filter
- pumpenkopf
- motorblock
- service set
- serviceset
- service-set
device_product_keywords:
- analysegerät
- analysegeraet
- analysegeräte
- analysegeraete
- messgerät
- messgeraet
- messgeräte
- messgeraete
- analysator
- analysatoren
- analyzer
- monitor
- monitore
- controller
- online-analysator
- online analysator
- online-analysegerät
- online analysegeraet
- online-analysegeräte
- online analysegeraete
- online analyzer
- online monitor
- system
- systeme
- anlage
- anlagen
- gerät
- geraet
- geräte
- geraete
device_focus_keywords:
- geräte
- geraete
- gerät
- geraet
- analysegerät
- analysegeraet
- messgerät
- messgeraet
- analysator
- controller
- monitor
accessory_focus_keywords:
- indikator
- indikatoren
- reagenz
- reagenzien
- zubehör
- zubehor
- ersatzteil
- ersatzteile
- verbrauchsmaterial
- service set
- serviceset
- filter
- pumpenkopf
- motorblock
accessory_focus_variant_map:
indikator: [indikator, indikatoren]
indikatoren: [indikator, indikatoren]
reagenz: [reagenz, reagenzien]
reagenzien: [reagenz, reagenzien]
ersatzteil: [ersatzteil, ersatzteile]
ersatzteile: [ersatzteil, ersatzteile]
service set: [service set, serviceset, service-set]
serviceset: [service set, serviceset, service-set]
service-set: [service set, serviceset, service-set]
scores:
exact_product_number_phrase: 160
exact_product_name_phrase: 90
exact_manufacturer_match: 40
brand_contained_in_name: 20
name_token_overlap_weight: 6
product_number_token_overlap_weight: 10
corpus_token_overlap_weight: 2
name_number_overlap_weight: 18
product_number_number_overlap_weight: 28
corpus_number_overlap_weight: 8
size_match: 12
availability_bonus: 1
device_query_device_product_bonus: 60
device_query_accessory_penalty: 120
accessory_query_accessory_product_bonus: 30
accessory_query_device_product_bonus: 10
patterns:
contains_digit: '/\d/u'
matching_cleanup: '/[^\p{L}\p{N}]+/u'
whitespace_collapse: '/\s+/u'
token_split: '/[^\p{L}\p{N}]+/u'
padding:
prefix: ' '
suffix: ' '
price:
normalization_search: ['€', ' ', '.']
normalization_replace: ['', '', '']
decimals: 2
decimal_separator: ','
thousands_separator: '.'
suffix: ' €'
custom_fields:
primary: migration_Backup_product_attr1
secondary: migration_Backup_product_attr2
use_cases: migration_Backup_product_attr4
languages: migration_Backup_product_attr5
text:
primary_secondary_separator: ': '
use_cases_label: 'Einsatzgebiete: '
languages_label: 'Sprachen: '
custom_field_join_separator: ' | '
description:
empty_line_pattern: '/^[ \t]*\R/m'
whitespace_cleanup_pattern: '/[ \t]{2,}/'
max_length: 1500
seo:
relative_prefix: '/'
highlight:
available_label: Verfügbar
unavailable_label: Nicht verfügbar
product_number_prefix: 'Produktnummer: '
image:
missing_placeholder: no-image
deduplication:
separator: '|'

View File

@@ -0,0 +1,52 @@
# Language-level retrieval configuration.
# Defaults are intentionally identical to the previous PHP list.
parameters:
retriex.stopwords.config:
words:
- mit
- der
- die
- das
- ein
- eine
- einer
- eines
- den
- dem
- des
- und
- oder
- aber
- sowie
- ich
- du
- er
- sie
- es
- wir
- ihr
- halt
- eben
- auch
- schon
- noch
- mal
- bitte
- danke
- also
- nun
- tja
- dann
- danach
- davor
- hier
- dort
- heute
- gestern
- morgen
- könnte
- kannst
- kann
- würde
- würdest
- würden

View File

@@ -0,0 +1,16 @@
# Query enrichment vocabulary.
# Defaults are intentionally identical to the previous PHP mapping.
parameters:
retriex.query_enrichment.config:
max_expansions: 4
rules:
Wasserhärte: Resthärte
Gerät: Modell
Indikator: Chemie
Seminar: Webinar
Schulung: Seminar
Indikatoren: Indikator
Wasserhärte-Grenzwert: Resthärte
Resthärte-Grenzwert: Wasserhärte
Grenzwert: Überwachungsbereich
store: shop

View File

@@ -1,7 +1,7 @@
# Active retrieval configuration (previously documented only as the 1.4.2 constants inventory).
# Defaults are intentionally identical to the frozen 1.4.2 constants.
parameters:
retriex.retrieval.inventory:
retriex.retrieval.config:
hard_max_chunks: 6
hard_max_vectork: 18
hard_max_keywordk: 36
@@ -26,3 +26,142 @@ parameters:
focused_product_min_score: 10.0
focused_product_min_gap: 4.0
focused_product_max_chunks: 4
generic_product_tokens:
- produkt
- produkte
- produktkarte
- titel
- geraet
- gerät
- messgeraet
- messgerät
- wasser
- haerte
- härte
- resthaerte
- resthärte
- analyse
- analysator
- automat
- online
- messung
- messen
- preis
- preise
- kosten
- info
- infos
- passend
- richtige
- richtiges
- geeignet
- geeignete
- welche
- welcher
- welches
- brauche
- suche
important_short_model_tokens: [th, tc, tp, tm, ph, rx]
family_descriptor_tokens:
- evo
- eco
- self
- clean
- mini
- pro
- plus
- basic
- lab
- inline
- compact
- panel
- sc
looks_like_reagent_tokens:
- indikator
- reagenz
- reagens
- laborchemikalie
- chemikalie
- sicherheitsdatenblatt
- sdb
- msds
- ufi
- gebinde
- flasche
- ersatzteil
- zubehoer
- zubehör
- service set
- filtereinsatz
- kerzenfilter
- druckregler
looks_like_safety_docs:
- sicherheitsdatenblatt
- sdb
- msds
- gefahrenbewertung
- gefahrenpiktogramm
- signalwort
- lagerung
- transport
- clp
- kennzeichnung
- h290
- pbt
- vpvb
looks_like_reagent_words:
- indikator
- reagenz
- reagens
- chemie
- chemikalie
- sdb
- sicherheitsdatenblatt
- msds
- flasche
- gebinde
looks_like_document_words:
- datenblatt
- dokument
- pdf
- handbuch
- manual
- beschreibung
- sdb
- sicherheitsdatenblatt
- msds
looks_like_safety_words:
- gefahr
- gefahrgut
- clp
- h290
- sicherheit
- kennzeichnung
- transport
- lagerung
- piktogramm
looks_like_device_words:
- geraet
- gerät
- messgeraet
- messgerät
- analysator
- automat
- messung
- messen
- ueberwachung
- überwachung
- online
- monitor
# Backwards-compatible name for existing config diagnostics.
retriex.retrieval.inventory: '%retriex.retrieval.config%'

View File

@@ -7,6 +7,8 @@ imports:
- { resource: 'retriex/prompt.yaml' }
- { resource: 'retriex/agent.yaml' }
- { resource: 'retriex/retrieval.yaml' }
- { resource: 'retriex/language.yaml' }
- { resource: 'retriex/query_enrichment.yaml' }
# ------------------------------------------------------------
# Parameters
@@ -118,6 +120,23 @@ services:
arguments:
$config: '%retriex.agent.config%'
App\Config\NdjsonHybridRetrieverConfig:
arguments:
$config: '%retriex.retrieval.config%'
$vocabulary: '%retriex.retrieval.config%'
App\Config\StopWordsConfig:
arguments:
$config: '%retriex.stopwords.config%'
App\Config\QueryEnricherConfig:
arguments:
$config: '%retriex.query_enrichment.config%'
App\Config\ShopServiceConfig:
arguments:
$config: '%retriex.shop_matching.config%'
App\Infrastructure\OllamaClient:
arguments:
$apiUrl: '%env(AI_LLM_API_URL)%'

View File

@@ -131,24 +131,24 @@ final class NdjsonHybridRetrieverConfig
'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung',
'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend',
'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
'welches', 'brauche', 'suche'
'welches', 'brauche', 'suche',
];
const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];
public const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];
const FAMILY_DESCRIPTOR_TOKEN = [
public const FAMILY_DESCRIPTOR_TOKEN = [
'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
'inline', 'compact', 'panel', 'sc',
];
const LOOKS_LIKE_REAGENT_TOKENS = [
public const LOOKS_LIKE_REAGENT_TOKENS = [
'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
'kerzenfilter', 'druckregler',
];
const LOOKS_LIKE_SAFETY_DOCS = [
public const LOOKS_LIKE_SAFETY_DOCS = [
'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
'kennzeichnung', 'h290', 'pbt', 'vpvb',
@@ -174,4 +174,309 @@ final class NdjsonHybridRetrieverConfig
'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
];
/**
 * Stores the raw configuration arrays; values are validated lazily by the getters.
 *
 * Both arrays are readonly so the resolved settings cannot change after construction.
 *
 * @param array<string, mixed> $config     Primary source for every key.
 * @param array<string, mixed> $vocabulary Kept for backwards-compatible service wiring;
 *                                         consulted only when a key is missing from $config.
 */
public function __construct(
    private readonly array $config = [],
    private readonly array $vocabulary = [],
) {
}
/**
 * Resolves 'hard_max_chunks' (fallback: self::HARD_MAX_CHUNKS); results below 1 are raised to 1.
 */
public function hardMaxChunks(): int
{
    return $this->intValue(
        key: 'hard_max_chunks',
        default: self::HARD_MAX_CHUNKS,
        min: 1,
    );
}
/**
 * Resolves 'hard_max_vectork' (fallback: self::HARD_MAX_VECTORK); results below 1 are raised to 1.
 */
public function hardMaxVectorK(): int
{
    return $this->intValue(
        key: 'hard_max_vectork',
        default: self::HARD_MAX_VECTORK,
        min: 1,
    );
}
/**
 * Resolves 'hard_max_keywordk' (fallback: self::HARD_MAX_KEYWORDK); results below 1 are raised to 1.
 */
public function hardMaxKeywordK(): int
{
    return $this->intValue(
        key: 'hard_max_keywordk',
        default: self::HARD_MAX_KEYWORDK,
        min: 1,
    );
}
/**
 * Resolves 'vector_score_threshold' (fallback: self::VECTOR_SCORE_THRESHOLD), clamped to 0.0–1.0.
 */
public function vectorScoreThreshold(): float
{
    return $this->floatValue(
        key: 'vector_score_threshold',
        default: self::VECTOR_SCORE_THRESHOLD,
        min: 0.0,
        max: 1.0,
    );
}
/**
 * Resolves 'threshold_floor' (fallback: self::THRESHOLD_FLOOR), clamped to 0.0–1.0.
 */
public function thresholdFloor(): float
{
    return $this->floatValue(
        key: 'threshold_floor',
        default: self::THRESHOLD_FLOOR,
        min: 0.0,
        max: 1.0,
    );
}
/**
 * Resolves 'threshold_ceil' (fallback: self::THRESHOLD_CEIL), clamped to 0.0–1.0.
 *
 * NOTE(review): nothing here enforces threshold_ceil >= threshold_floor — confirm callers cope.
 */
public function thresholdCeil(): float
{
    return $this->floatValue(
        key: 'threshold_ceil',
        default: self::THRESHOLD_CEIL,
        min: 0.0,
        max: 1.0,
    );
}
/**
 * Resolves 'list_bonus' (fallback: self::LIST_BONUS); results below 1.0 are raised to 1.0.
 */
public function listBonus(): float
{
    return $this->floatValue(
        key: 'list_bonus',
        default: self::LIST_BONUS,
        min: 1.0,
    );
}
/**
 * Resolves 'rrf_k' (fallback: self::RRF_K); results below 1 are raised to 1.
 */
public function rrfK(): int
{
    return $this->intValue(
        key: 'rrf_k',
        default: self::RRF_K,
        min: 1,
    );
}
/**
 * Resolves 'keyword_topk_multiplier' (fallback: self::KEYWORD_TOPK_MULTIPLIER);
 * results below 0.1 are raised to 0.1.
 */
public function keywordTopKMultiplier(): float
{
    return $this->floatValue(
        key: 'keyword_topk_multiplier',
        default: self::KEYWORD_TOPK_MULTIPLIER,
        min: 0.1,
    );
}
/**
 * Resolves 'keyword_score_threshold' (fallback: self::KEYWORD_SCORE_THRESHOLD), clamped to 0.0–1.0.
 */
public function keywordScoreThreshold(): float
{
    return $this->floatValue(
        key: 'keyword_score_threshold',
        default: self::KEYWORD_SCORE_THRESHOLD,
        min: 0.0,
        max: 1.0,
    );
}
/**
 * Resolves 'keyword_rrf_weight' (fallback: self::KEYWORD_RRF_WEIGHT); negative results become 0.0.
 */
public function keywordRrfWeight(): float
{
    return $this->floatValue(
        key: 'keyword_rrf_weight',
        default: self::KEYWORD_RRF_WEIGHT,
        min: 0.0,
    );
}
/**
 * Resolves 'scoped_vector_rrf_weight' (fallback: self::SCOPED_VECTOR_RRF_WEIGHT);
 * negative results become 0.0.
 */
public function scopedVectorRrfWeight(): float
{
    return $this->floatValue(
        key: 'scoped_vector_rrf_weight',
        default: self::SCOPED_VECTOR_RRF_WEIGHT,
        min: 0.0,
    );
}
/**
 * Resolves 'scoped_keyword_rrf_weight' (fallback: self::SCOPED_KEYWORD_RRF_WEIGHT);
 * negative results become 0.0.
 */
public function scopedKeywordRrfWeight(): float
{
    return $this->floatValue(
        key: 'scoped_keyword_rrf_weight',
        default: self::SCOPED_KEYWORD_RRF_WEIGHT,
        min: 0.0,
    );
}
/**
 * Resolves 'empty_rrf_fallback_topn' (fallback: self::EMPTY_RRF_FALLBACK_TOPN);
 * results below 1 are raised to 1.
 */
public function emptyRrfFallbackTopN(): int
{
    return $this->intValue(
        key: 'empty_rrf_fallback_topn',
        default: self::EMPTY_RRF_FALLBACK_TOPN,
        min: 1,
    );
}
/**
 * Resolves 'max_chunks_per_doc' (fallback: self::MAX_CHUNKS_PER_DOC); results below 1 are raised to 1.
 */
public function maxChunksPerDoc(): int
{
    return $this->intValue(
        key: 'max_chunks_per_doc',
        default: self::MAX_CHUNKS_PER_DOC,
        min: 1,
    );
}
/**
 * Resolves 'min_chunk_distance' (fallback: self::MIN_CHUNK_DISTANCE); negative results become 0.
 */
public function minChunkDistance(): int
{
    return $this->intValue(
        key: 'min_chunk_distance',
        default: self::MIN_CHUNK_DISTANCE,
        min: 0,
    );
}
/**
 * Resolves 'dominant_doc_window' (fallback: self::DOMINANT_DOC_WINDOW); results below 1 are raised to 1.
 */
public function dominantDocWindow(): int
{
    return $this->intValue(
        key: 'dominant_doc_window',
        default: self::DOMINANT_DOC_WINDOW,
        min: 1,
    );
}
/**
 * Resolves 'dominant_doc_min_hits' (fallback: self::DOMINANT_DOC_MIN_HITS); results below 1 are raised to 1.
 */
public function dominantDocMinHits(): int
{
    return $this->intValue(
        key: 'dominant_doc_min_hits',
        default: self::DOMINANT_DOC_MIN_HITS,
        min: 1,
    );
}
/**
 * Resolves 'dominant_doc_max_chunks' (fallback: self::DOMINANT_DOC_MAX_CHUNKS);
 * results below 1 are raised to 1.
 */
public function dominantDocMaxChunks(): int
{
    return $this->intValue(
        key: 'dominant_doc_max_chunks',
        default: self::DOMINANT_DOC_MAX_CHUNKS,
        min: 1,
    );
}
/**
 * Resolves 'exact_document_max_chunks' (fallback: self::EXACT_DOCUMENT_MAX_CHUNKS);
 * results below 1 are raised to 1.
 */
public function exactDocumentMaxChunks(): int
{
    return $this->intValue(
        key: 'exact_document_max_chunks',
        default: self::EXACT_DOCUMENT_MAX_CHUNKS,
        min: 1,
    );
}
/**
 * Resolves 'focused_product_window' (fallback: self::FOCUSED_PRODUCT_WINDOW);
 * results below 1 are raised to 1.
 */
public function focusedProductWindow(): int
{
    return $this->intValue(
        key: 'focused_product_window',
        default: self::FOCUSED_PRODUCT_WINDOW,
        min: 1,
    );
}
/**
 * Resolves 'focused_product_min_score' (fallback: self::FOCUSED_PRODUCT_MIN_SCORE);
 * negative results become 0.0.
 */
public function focusedProductMinScore(): float
{
    return $this->floatValue(
        key: 'focused_product_min_score',
        default: self::FOCUSED_PRODUCT_MIN_SCORE,
        min: 0.0,
    );
}
/**
 * Resolves 'focused_product_min_gap' (fallback: self::FOCUSED_PRODUCT_MIN_GAP);
 * negative results become 0.0.
 */
public function focusedProductMinGap(): float
{
    return $this->floatValue(
        key: 'focused_product_min_gap',
        default: self::FOCUSED_PRODUCT_MIN_GAP,
        min: 0.0,
    );
}
/**
 * Resolves 'focused_product_max_chunks' (fallback: self::FOCUSED_PRODUCT_MAX_CHUNKS);
 * results below 1 are raised to 1.
 */
public function focusedProductMaxChunks(): int
{
    return $this->intValue(
        key: 'focused_product_max_chunks',
        default: self::FOCUSED_PRODUCT_MAX_CHUNKS,
        min: 1,
    );
}
/**
 * Resolves 'generic_product_tokens' as a trimmed, de-duplicated string list
 * (fallback: self::GENERIC_PRODUCT_TOKEN).
 *
 * @return string[]
 */
public function genericProductTokens(): array
{
    return $this->stringList(
        key: 'generic_product_tokens',
        default: self::GENERIC_PRODUCT_TOKEN,
    );
}
/**
 * Resolves 'important_short_model_tokens' as a trimmed, de-duplicated string list
 * (fallback: self::IMPORTANT_SHORT_MODEL_TOKEN).
 *
 * @return string[]
 */
public function importantShortModelTokens(): array
{
    return $this->stringList(
        key: 'important_short_model_tokens',
        default: self::IMPORTANT_SHORT_MODEL_TOKEN,
    );
}
/**
 * Resolves 'family_descriptor_tokens' as a trimmed, de-duplicated string list
 * (fallback: self::FAMILY_DESCRIPTOR_TOKEN).
 *
 * @return string[]
 */
public function familyDescriptorTokens(): array
{
    return $this->stringList(
        key: 'family_descriptor_tokens',
        default: self::FAMILY_DESCRIPTOR_TOKEN,
    );
}
/**
 * Resolves 'looks_like_reagent_tokens' as a trimmed, de-duplicated string list
 * (fallback: self::LOOKS_LIKE_REAGENT_TOKENS).
 *
 * @return string[]
 */
public function looksLikeReagentTokens(): array
{
    return $this->stringList(
        key: 'looks_like_reagent_tokens',
        default: self::LOOKS_LIKE_REAGENT_TOKENS,
    );
}
/**
 * Resolves 'looks_like_safety_docs' as a trimmed, de-duplicated string list
 * (fallback: self::LOOKS_LIKE_SAFETY_DOCS).
 *
 * @return string[]
 */
public function looksLikeSafetyDocs(): array
{
    return $this->stringList(
        key: 'looks_like_safety_docs',
        default: self::LOOKS_LIKE_SAFETY_DOCS,
    );
}
/**
 * Resolves 'looks_like_reagent_words' as a trimmed, de-duplicated string list
 * (fallback: self::LOOKS_LIKE_REAGENT_WORDS).
 *
 * @return string[]
 */
public function looksLikeReagentWords(): array
{
    return $this->stringList(
        key: 'looks_like_reagent_words',
        default: self::LOOKS_LIKE_REAGENT_WORDS,
    );
}
/**
 * Resolves 'looks_like_document_words' as a trimmed, de-duplicated string list
 * (fallback: self::LOOKS_LIKE_DOCUMENT_WORDS).
 *
 * @return string[]
 */
public function looksLikeDocumentWords(): array
{
    return $this->stringList(
        key: 'looks_like_document_words',
        default: self::LOOKS_LIKE_DOCUMENT_WORDS,
    );
}
/**
 * Resolves 'looks_like_safety_words' as a trimmed, de-duplicated string list
 * (fallback: self::LOOKS_LIKE_SAFETY_WORDS).
 *
 * @return string[]
 */
public function looksLikeSafetyWords(): array
{
    return $this->stringList(
        key: 'looks_like_safety_words',
        default: self::LOOKS_LIKE_SAFETY_WORDS,
    );
}
/**
 * Resolves 'looks_like_device_words' as a trimmed, de-duplicated string list
 * (fallback: self::LOOKS_LIKE_DEVICE_WORDS).
 *
 * @return string[]
 */
public function looksLikeDeviceWords(): array
{
    return $this->stringList(
        key: 'looks_like_device_words',
        default: self::LOOKS_LIKE_DEVICE_WORDS,
    );
}
/**
 * Exports every resolved setting under its configuration key.
 *
 * Scalar limits are listed first, followed by the word lists.
 *
 * @return array<string, mixed>
 */
public function toArray(): array
{
    $limits = [
        'hard_max_chunks' => $this->hardMaxChunks(),
        'hard_max_vectork' => $this->hardMaxVectorK(),
        'hard_max_keywordk' => $this->hardMaxKeywordK(),
        'vector_score_threshold' => $this->vectorScoreThreshold(),
        'threshold_floor' => $this->thresholdFloor(),
        'threshold_ceil' => $this->thresholdCeil(),
        'list_bonus' => $this->listBonus(),
        'rrf_k' => $this->rrfK(),
        'keyword_topk_multiplier' => $this->keywordTopKMultiplier(),
        'keyword_score_threshold' => $this->keywordScoreThreshold(),
        'keyword_rrf_weight' => $this->keywordRrfWeight(),
        'scoped_vector_rrf_weight' => $this->scopedVectorRrfWeight(),
        'scoped_keyword_rrf_weight' => $this->scopedKeywordRrfWeight(),
        'empty_rrf_fallback_topn' => $this->emptyRrfFallbackTopN(),
        'max_chunks_per_doc' => $this->maxChunksPerDoc(),
        'min_chunk_distance' => $this->minChunkDistance(),
        'dominant_doc_window' => $this->dominantDocWindow(),
        'dominant_doc_min_hits' => $this->dominantDocMinHits(),
        'dominant_doc_max_chunks' => $this->dominantDocMaxChunks(),
        'exact_document_max_chunks' => $this->exactDocumentMaxChunks(),
        'focused_product_window' => $this->focusedProductWindow(),
        'focused_product_min_score' => $this->focusedProductMinScore(),
        'focused_product_min_gap' => $this->focusedProductMinGap(),
        'focused_product_max_chunks' => $this->focusedProductMaxChunks(),
    ];

    $wordLists = [
        'generic_product_tokens' => $this->genericProductTokens(),
        'important_short_model_tokens' => $this->importantShortModelTokens(),
        'family_descriptor_tokens' => $this->familyDescriptorTokens(),
        'looks_like_reagent_tokens' => $this->looksLikeReagentTokens(),
        'looks_like_safety_docs' => $this->looksLikeSafetyDocs(),
        'looks_like_reagent_words' => $this->looksLikeReagentWords(),
        'looks_like_document_words' => $this->looksLikeDocumentWords(),
        'looks_like_safety_words' => $this->looksLikeSafetyWords(),
        'looks_like_device_words' => $this->looksLikeDeviceWords(),
    ];

    // The two key sets are disjoint, so the union keeps every entry in listing order.
    return $limits + $wordLists;
}
/**
 * Reads an integer setting and clamps it to the given bounds.
 *
 * Non-numeric values yield $default. So do NAN/INF and numbers whose magnitude
 * reaches the native integer limit (about 2^63 on 64-bit builds): casting those
 * to int does not produce a meaningful value, so they are treated as invalid
 * rather than clamped.
 *
 * @param string   $key     Configuration key to look up.
 * @param int      $default Value used when the key is absent or unusable.
 * @param int      $min     Lower bound applied to the resolved value.
 * @param int|null $max     Optional upper bound; null disables it.
 */
private function intValue(string $key, int $default, int $min = PHP_INT_MIN, ?int $max = null): int
{
    $value = $this->raw($key, $default);
    if (!is_numeric($value)) {
        return $default;
    }

    if (!is_int($value)) {
        // Range-check floats and numeric strings in the float domain before casting.
        $magnitude = (float) $value;
        if (!is_finite($magnitude) || abs($magnitude) >= (float) PHP_INT_MAX) {
            return $default;
        }
    }

    // Casting the original value keeps integer-like strings exact; fractions truncate toward zero.
    $number = max($min, (int) $value);

    return $max === null ? $number : min($max, $number);
}
/**
 * Reads a float setting and clamps it to the given bounds.
 *
 * Non-numeric values and NAN yield $default; NAN cannot be ordered against
 * the bounds, so clamping it would not give a dependable result.
 *
 * @param string     $key     Configuration key to look up.
 * @param float      $default Value used when the key is absent or unusable.
 * @param float      $min     Lower bound applied to the resolved value.
 * @param float|null $max     Optional upper bound; null disables it.
 */
private function floatValue(string $key, float $default, float $min = -INF, ?float $max = null): float
{
    $value = $this->raw($key, $default);
    if (!is_numeric($value)) {
        return $default;
    }

    $number = (float) $value;
    if (is_nan($number)) {
        return $default;
    }

    $number = max($min, $number);

    return $max === null ? $number : min($max, $number);
}
/**
 * Reads a list setting and normalises it to unique, trimmed, non-empty strings.
 *
 * Non-scalar items and items that are empty after trimming are dropped; the
 * first occurrence of each string wins. $default is returned when the setting
 * is not an array or when nothing usable remains.
 *
 * @param string[] $default
 * @return string[]
 */
private function stringList(string $key, array $default): array
{
    $configured = $this->raw($key, $default);
    if (!is_array($configured)) {
        return $default;
    }

    $unique = [];
    foreach ($configured as $entry) {
        if (!is_scalar($entry)) {
            continue;
        }

        $text = trim((string) $entry);
        if ($text !== '') {
            // Keyed by the text itself: later duplicates leave the first entry untouched.
            $unique[$text] ??= $text;
        }
    }

    if ($unique === []) {
        return $default;
    }

    return array_values($unique);
}
/**
 * Looks a key up in $config first, then in $vocabulary, and returns $default
 * when neither array defines it. A key that is present with a null value is returned as null.
 */
private function raw(string $key, mixed $default): mixed
{
    foreach ([$this->config, $this->vocabulary] as $source) {
        if (array_key_exists($key, $source)) {
            return $source[$key];
        }
    }

    return $default;
}
}

View File

@@ -7,38 +7,12 @@ namespace App\Config;
final readonly class QueryEnricherConfig
{
/**
* Keep the enrichment vocabulary in the class for now.
*
* Important:
* - This is intentionally NOT externalized yet.
* - Add or maintain the current project-specific mappings here.
* - The later move to external config/files can happen separately.
*
* Supported shapes:
*
* 1) Simple mapping:
* [
* 'water hardness' => 'residual hardness',
* 'device' => 'instrument',
* ]
*
* 2) Small synonym groups:
* [
* ['water hardness', 'residual hardness', 'hardness'],
* ['device', 'instrument', 'meter'],
* ]
*
* The public API stays intentionally simple:
* - getEnrichQueryList(): array<string,string>
*
* This keeps QueryEnricher generic while the domain vocabulary
* deliberately remains inside this class for now.
*
* Replace the example entries below with your real project mappings.
* Backwards-compatible fallback vocabulary.
* Active values are loaded from retriex.query_enrichment.config when present.
*
* @var array<int|string, mixed>
*/
private const ENRICH_QUERY_LIST = [
private const DEFAULT_ENRICH_QUERY_LIST = [
'Wasserhärte' => 'Resthärte',
'Gerät' => 'Modell',
'Indikator' => 'Chemie',
@@ -48,9 +22,16 @@ final readonly class QueryEnricherConfig
'Wasserhärte-Grenzwert' => 'Resthärte',
'Resthärte-Grenzwert' => 'Wasserhärte',
'Grenzwert' => 'Überwachungsbereich',
'store'=>'shop'
'store' => 'shop',
];
/**
 * Receives the raw enrichment configuration; 'rules' and 'max_expansions'
 * are the keys read by this class.
 *
 * @param array<string, mixed> $config
 */
public function __construct(
    private array $config = [],
) {
}
/**
* Returns a normalized, deduplicated mapping for the QueryEnricher.
*
@@ -71,8 +52,13 @@ final readonly class QueryEnricherConfig
public function getEnrichQueryList(): array
{
$normalized = [];
$rules = $this->config['rules'] ?? self::DEFAULT_ENRICH_QUERY_LIST;
foreach (self::ENRICH_QUERY_LIST as $key => $value) {
if (!is_array($rules)) {
$rules = self::DEFAULT_ENRICH_QUERY_LIST;
}
foreach ($rules as $key => $value) {
if (is_array($value)) {
$this->ingestGroup($normalized, $value);
continue;
@@ -93,6 +79,17 @@ final readonly class QueryEnricherConfig
return $normalized;
}
/**
 * Resolves 'max_expansions': 4 when the key is missing or not numeric,
 * otherwise the integer value with negatives raised to 0.
 */
public function getMaxExpansions(): int
{
    $configured = $this->config['max_expansions'] ?? 4;

    return is_numeric($configured) ? max(0, (int) $configured) : 4;
}
/**
* Returns true when at least one valid enrichment rule exists.
*/

View File

@@ -16,6 +16,7 @@ final readonly class RetriexEffectiveConfigProvider
private ModelGenerationConfigProvider $modelProvider,
private IndexConfigurationProvider $indexProvider,
private PromptBuilderConfig $promptConfig,
private NdjsonHybridRetrieverConfig $retrieverConfig,
) {
}
@@ -144,30 +145,8 @@ final readonly class RetriexEffectiveConfigProvider
/**
 * Builds the retrieval section of the effective configuration.
 *
 * All values come from the injected NdjsonHybridRetrieverConfig, so the result
 * reflects configured overrides rather than the class constants. The
 * 'vocabulary' entry repeats the array-valued settings (the word lists) from
 * toArray(); the retriever config exposes no dedicated vocabulary export in
 * the code visible here.
 *
 * @return array<string, mixed>
 */
private function retrievalConfig(): array
{
    $effective = $this->retrieverConfig->toArray();

    return [
        ...$effective,
        'vocabulary' => array_filter($effective, is_array(...)),
        'inventory_parameter' => $this->param('retriex.retrieval.inventory', []),
    ];
}

View File

@@ -7,451 +7,461 @@ namespace App\Config;
final class ShopServiceConfig
{
/** Fallback list handed to stringList() by getDeviceQueryKeywords(); each keyword appears once. */
public const DEVICE_QUERY_KEYWORDS = [
    'analysegerät', 'analysegeraet', 'analysegeräte', 'analysegeraete',
    'messgerät', 'messgeraet', 'messgeräte', 'messgeraete',
    'analysator', 'analysatoren', 'analyzer', 'gerät', 'geraet', 'geräte',
    'geraete', 'monitor', 'monitore', 'controller', 'gerät für',
    'geraet fuer', 'geräte für', 'geraete fuer', 'system', 'systeme',
    'anlage', 'anlagen',
];
/** Fallback list handed to stringList() by getAccessoryQueryKeywords(); each keyword appears once. */
public const ACCESSORY_QUERY_KEYWORDS = [
    'zubehör', 'zubehor', 'reagenz', 'reagenzien', 'reagent', 'indikator',
    'indikatoren', 'indicator', 'kit', 'set', 'ersatz', 'ersatzteil',
    'ersatzteile', 'verbrauchsmaterial', 'consumable', 'dazu', 'passend',
    'passende', 'passendes', 'nachfüll', 'nachfuell', 'refill', 'filter',
    'pumpenkopf', 'motorblock', 'service set', 'serviceset', 'service-set',
];
/** Fallback list handed to stringList() by getAccessoryProductKeywords(); each keyword appears once. */
public const ACCESSORY_PRODUCT_KEYWORDS = [
    'reagenz', 'reagenzien', 'reagent', 'indikator', 'indikatoren',
    'indicator', 'kit', 'set', 'verbrauchsmaterial', 'consumable',
    'zubehör', 'zubehor', 'ersatz', 'ersatzteil', 'ersatzteile',
    'nachfüll', 'nachfuell', 'refill', 'lösung', 'loesung', 'solution',
    'teststreifen', 'test strip', 'filter', 'pumpenkopf', 'motorblock',
    'service set', 'serviceset', 'service-set',
];
/** Fallback list handed to stringList() by getDeviceProductKeywords(); each keyword appears once. */
public const DEVICE_PRODUCT_KEYWORDS = [
    'analysegerät', 'analysegeraet', 'analysegeräte', 'analysegeraete',
    'messgerät', 'messgeraet', 'messgeräte', 'messgeraete',
    'analysator', 'analysatoren', 'analyzer', 'monitor', 'monitore',
    'controller', 'online-analysator', 'online analysator',
    'online-analysegerät', 'online analysegeraet', 'online-analysegeräte',
    'online analysegeraete', 'online analyzer', 'online monitor', 'system',
    'systeme', 'anlage', 'anlagen', 'gerät', 'geraet', 'geräte', 'geraete',
];
/** Fallback list handed to stringList() by getDeviceFocusKeywords(). */
private const DEVICE_FOCUS_KEYWORDS = [
    'geräte', 'geraete',
    'gerät', 'geraet',
    'analysegerät', 'analysegeraet',
    'messgerät', 'messgeraet',
    'analysator',
    'controller',
    'monitor',
];
/** Fallback list handed to stringList() by getAccessoryFocusKeywords(). */
private const ACCESSORY_FOCUS_KEYWORDS = [
    'indikator', 'indikatoren',
    'reagenz', 'reagenzien',
    'zubehör', 'zubehor',
    'ersatzteil', 'ersatzteile',
    'verbrauchsmaterial',
    'service set', 'serviceset',
    'filter',
    'pumpenkopf',
    'motorblock',
];
/**
 * Fallback map handed to stringListMap() by getAccessoryFocusVariantMap().
 * Every spelling of a term maps to the full set of spellings of that term.
 */
private const ACCESSORY_FOCUS_VARIANT_MAP = [
    // Singular/plural pairs.
    'indikator'   => ['indikator', 'indikatoren'],
    'indikatoren' => ['indikator', 'indikatoren'],
    'reagenz'     => ['reagenz', 'reagenzien'],
    'reagenzien'  => ['reagenz', 'reagenzien'],
    'ersatzteil'  => ['ersatzteil', 'ersatzteile'],
    'ersatzteile' => ['ersatzteil', 'ersatzteile'],
    // Spacing/hyphenation variants.
    'service set' => ['service set', 'serviceset', 'service-set'],
    'serviceset'  => ['service set', 'serviceset', 'service-set'],
    'service-set' => ['service set', 'serviceset', 'service-set'],
];
/**
 * Receives the raw shop-matching configuration read by the accessors of this class.
 *
 * The array is readonly so resolved settings cannot change after construction.
 *
 * @param array<string, mixed> $config
 */
public function __construct(private readonly array $config = [])
{
}
/**
 * Resolves 'top_product_log_limit' through int(), passing 3 as fallback and 0 as third argument
 * (presumably a lower bound — int() is not visible here).
 */
public function getTopProductLogLimit(): int
{
    return $this->int('top_product_log_limit', 3, 0);
}
/**
 * Resolves 'device_focus_keywords', with self::DEVICE_FOCUS_KEYWORDS as fallback.
 *
 * @return string[]
 */
public function getDeviceFocusKeywords(): array
{
    return $this->stringList('device_focus_keywords', self::DEVICE_FOCUS_KEYWORDS);
}
/**
 * Resolves 'accessory_focus_keywords', with self::ACCESSORY_FOCUS_KEYWORDS as fallback.
 *
 * @return string[]
 */
public function getAccessoryFocusKeywords(): array
{
    return $this->stringList('accessory_focus_keywords', self::ACCESSORY_FOCUS_KEYWORDS);
}
/**
 * Resolves 'accessory_focus_variant_map', with self::ACCESSORY_FOCUS_VARIANT_MAP as fallback.
 *
 * @return array<string, string[]>
 */
public function getAccessoryFocusVariantMap(): array
{
    return $this->stringListMap('accessory_focus_variant_map', self::ACCESSORY_FOCUS_VARIANT_MAP);
}
/**
 * Resolves 'device_query_keywords', with self::DEVICE_QUERY_KEYWORDS as fallback.
 *
 * @return string[]
 */
public function getDeviceQueryKeywords(): array
{
    return $this->stringList('device_query_keywords', self::DEVICE_QUERY_KEYWORDS);
}
/**
 * Resolves 'accessory_query_keywords', with self::ACCESSORY_QUERY_KEYWORDS as fallback.
 *
 * @return string[]
 */
public function getAccessoryQueryKeywords(): array
{
    return $this->stringList('accessory_query_keywords', self::ACCESSORY_QUERY_KEYWORDS);
}
/**
 * Resolves 'accessory_product_keywords', with self::ACCESSORY_PRODUCT_KEYWORDS as fallback.
 *
 * @return string[]
 */
public function getAccessoryProductKeywords(): array
{
    return $this->stringList('accessory_product_keywords', self::ACCESSORY_PRODUCT_KEYWORDS);
}
/**
 * Resolves 'device_product_keywords', with self::DEVICE_PRODUCT_KEYWORDS as fallback.
 *
 * @return string[]
 */
public function getDeviceProductKeywords(): array
{
    return $this->stringList('device_product_keywords', self::DEVICE_PRODUCT_KEYWORDS);
}
/** Resolves 'scores.exact_product_number_phrase' through int(), passing 160 as fallback. */
public function getExactProductNumberPhraseScore(): int
{
    return $this->int('scores.exact_product_number_phrase', 160);
}
/** Resolves 'scores.exact_product_name_phrase' through int(), passing 90 as fallback. */
public function getExactProductNamePhraseScore(): int
{
    return $this->int('scores.exact_product_name_phrase', 90);
}
/** Resolves 'scores.exact_manufacturer_match' through int(), passing 40 as fallback. */
public function getExactManufacturerMatchScore(): int
{
    return $this->int('scores.exact_manufacturer_match', 40);
}
/** Resolves 'scores.brand_contained_in_name' through int(), passing 20 as fallback. */
public function getBrandContainedInNameScore(): int
{
    return $this->int('scores.brand_contained_in_name', 20);
}
/** Resolves 'scores.name_token_overlap_weight' through int(), passing 6 as fallback. */
public function getNameTokenOverlapWeight(): int
{
    return $this->int('scores.name_token_overlap_weight', 6);
}
/** Resolves 'scores.product_number_token_overlap_weight' through int(), passing 10 as fallback. */
public function getProductNumberTokenOverlapWeight(): int
{
    return $this->int('scores.product_number_token_overlap_weight', 10);
}
/** Resolves 'scores.corpus_token_overlap_weight' through int(), passing 2 as fallback. */
public function getCorpusTokenOverlapWeight(): int
{
    return $this->int('scores.corpus_token_overlap_weight', 2);
}
/** Resolves 'scores.name_number_overlap_weight' through int(), passing 18 as fallback. */
public function getNameNumberOverlapWeight(): int
{
    return $this->int('scores.name_number_overlap_weight', 18);
}
/** Resolves 'scores.product_number_number_overlap_weight' through int(), passing 28 as fallback. */
public function getProductNumberNumberOverlapWeight(): int
{
    return $this->int('scores.product_number_number_overlap_weight', 28);
}
/** Resolves 'scores.corpus_number_overlap_weight' through int(), passing 8 as fallback. */
public function getCorpusNumberOverlapWeight(): int
{
    return $this->int('scores.corpus_number_overlap_weight', 8);
}
/** Resolves 'scores.size_match' through int(), passing 12 as fallback. */
public function getSizeMatchScore(): int
{
    return $this->int('scores.size_match', 12);
}
/** Resolves 'scores.availability_bonus' through int(), passing 1 as fallback. */
public function getAvailabilityBonusScore(): int
{
    return $this->int('scores.availability_bonus', 1);
}
/** Resolves 'scores.device_query_device_product_bonus' through int(), passing 60 as fallback. */
public function getDeviceQueryDeviceProductBonus(): int
{
    return $this->int('scores.device_query_device_product_bonus', 60);
}
/** Resolves 'scores.device_query_accessory_penalty' through int(), passing 120 as fallback. */
public function getDeviceQueryAccessoryPenalty(): int
{
    return $this->int('scores.device_query_accessory_penalty', 120);
}
/** Resolves 'scores.accessory_query_accessory_product_bonus' through int(), passing 30 as fallback. */
public function getAccessoryQueryAccessoryProductBonus(): int
{
    return $this->int('scores.accessory_query_accessory_product_bonus', 30);
}
/** Resolves 'scores.accessory_query_device_product_bonus' through int(), passing 10 as fallback. */
public function getAccessoryQueryDeviceProductBonus(): int
{
    return $this->int('scores.accessory_query_device_product_bonus', 10);
}
/**
 * Resolves 'patterns.contains_digit' through string(), passing '/\d/u' as fallback.
 *
 * NOTE(review): a configured pattern is not validated here — confirm callers handle an invalid regex.
 */
public function getContainsDigitPattern(): string
{
    return $this->string('patterns.contains_digit', '/\d/u');
}
/** Resolves 'patterns.matching_cleanup' through string(), passing '/[^\p{L}\p{N}]+/u' as fallback. */
public function getMatchingCleanupPattern(): string
{
    return $this->string('patterns.matching_cleanup', '/[^\p{L}\p{N}]+/u');
}
/** Resolves 'patterns.whitespace_collapse' through string(), passing '/\s+/u' as fallback. */
public function getWhitespaceCollapsePattern(): string
{
    return $this->string('patterns.whitespace_collapse', '/\s+/u');
}
/** Resolves 'patterns.token_split' through string(), passing '/[^\p{L}\p{N}]+/u' as fallback. */
public function getTokenSplitPattern(): string
{
    return $this->string('patterns.token_split', '/[^\p{L}\p{N}]+/u');
}
/**
 * Trims $value and surrounds it with the configured 'padding.prefix' and 'padding.suffix'
 * (a single space is passed as fallback for each).
 */
public function wrapWithPaddingSpaces(string $value): string
{
    return $this->string('padding.prefix', ' ') . trim($value) . $this->string('padding.suffix', ' ');
}
/**
 * Resolves 'price.normalization_search', with ['€', ' ', '.'] as fallback.
 *
 * @return string[]
 */
public function getPriceNormalizationSearch(): array
{
    return $this->stringList('price.normalization_search', ['€', ' ', '.']);
}
/**
 * Reads the list of strings stored at `price.normalization_replace`.
 *
 * Empty strings are explicitly allowed here, so configured entries are kept
 * verbatim (no trimming, no de-duplication). A non-array value falls back to
 * three empty strings.
 *
 * @return string[]
 */
public function getPriceNormalizationReplace(): array
{
    return $this->stringList('price.normalization_replace', ['', '', ''], true, ['', '', '']);
}
/**
 * Reads the custom-field key stored at `custom_fields.primary`.
 *
 * @return string Configured key, or `migration_Backup_product_attr1` when the
 *                entry is missing or not scalar.
 */
public function getPrimaryCustomFieldKey(): string
{
    return $this->string('custom_fields.primary', 'migration_Backup_product_attr1');
}
/**
 * Reads the custom-field key stored at `custom_fields.secondary`.
 *
 * @return string Configured key, or `migration_Backup_product_attr2` when the
 *                entry is missing or not scalar.
 */
public function getSecondaryCustomFieldKey(): string
{
    return $this->string('custom_fields.secondary', 'migration_Backup_product_attr2');
}
/**
 * Reads the custom-field key stored at `custom_fields.use_cases`.
 *
 * @return string Configured key, or `migration_Backup_product_attr4` when the
 *                entry is missing or not scalar.
 */
public function getUseCasesCustomFieldKey(): string
{
    return $this->string('custom_fields.use_cases', 'migration_Backup_product_attr4');
}
/**
 * Reads the custom-field key stored at `custom_fields.languages`.
 *
 * @return string Configured key, or `migration_Backup_product_attr5` when the
 *                entry is missing or not scalar.
 */
public function getLanguagesCustomFieldKey(): string
{
    return $this->string('custom_fields.languages', 'migration_Backup_product_attr5');
}
/**
 * Reads the separator stored at `text.primary_secondary_separator`.
 *
 * @return string Configured separator, or `': '` when the entry is missing or
 *                not scalar.
 */
public function getPrimarySecondarySeparator(): string
{
    return $this->string('text.primary_secondary_separator', ': ');
}
/**
 * Reads the label stored at `text.use_cases_label`.
 *
 * @return string Configured label, or `'Einsatzgebiete: '` when the entry is
 *                missing or not scalar.
 */
public function getUseCasesLabel(): string
{
    return $this->string('text.use_cases_label', 'Einsatzgebiete: ');
}
/**
 * Reads the label stored at `text.languages_label`.
 *
 * @return string Configured label, or `'Sprachen: '` when the entry is
 *                missing or not scalar.
 */
public function getLanguagesLabel(): string
{
    return $this->string('text.languages_label', 'Sprachen: ');
}
/**
 * Reads the separator stored at `text.custom_field_join_separator`.
 *
 * @return string Configured separator, or `' | '` when the entry is missing
 *                or not scalar.
 */
public function getCustomFieldJoinSeparator(): string
{
    return $this->string('text.custom_field_join_separator', ' | ');
}
/**
 * Reads the regular expression stored at `description.empty_line_pattern`.
 *
 * @return string Configured pattern, or the built-in default (a line holding
 *                only spaces/tabs followed by a line break) when the entry is
 *                missing or not scalar.
 */
public function getDescriptionEmptyLinePattern(): string
{
    return $this->string('description.empty_line_pattern', '/^[ \t]*\R/m');
}
/**
 * Reads the regular expression stored at `description.whitespace_cleanup_pattern`.
 *
 * @return string Configured pattern, or the built-in default (two or more
 *                consecutive spaces/tabs) when the entry is missing or not
 *                scalar.
 */
public function getDescriptionWhitespaceCleanupPattern(): string
{
    return $this->string('description.whitespace_cleanup_pattern', '/[ \t]{2,}/');
}
/**
 * Reads the integer stored at `description.max_length`.
 *
 * @return int Configured value raised to at least 0, or 1500 when the key is
 *             missing or not numeric.
 */
public function getDescriptionMaxLength(): int
{
    return $this->int('description.max_length', 1500, 0);
}
/**
 * Reads the integer stored at `price.decimals`.
 *
 * @return int Configured value raised to at least 0, or 2 when the key is
 *             missing or not numeric.
 */
public function getPriceDecimals(): int
{
    return $this->int('price.decimals', 2, 0);
}
/**
 * Reads the string stored at `price.decimal_separator`.
 *
 * @return string Configured separator, or `','` when the entry is missing or
 *                not scalar.
 */
public function getPriceDecimalSeparator(): string
{
    return $this->string('price.decimal_separator', ',');
}
/**
 * Reads the string stored at `price.thousands_separator`.
 *
 * @return string Configured separator, or `'.'` when the entry is missing or
 *                not scalar.
 */
public function getPriceThousandsSeparator(): string
{
    return $this->string('price.thousands_separator', '.');
}
/**
 * Reads the string stored at `price.suffix`.
 *
 * @return string Configured suffix, or `' €'` when the entry is missing or
 *                not scalar.
 */
public function getPriceSuffix(): string
{
    return $this->string('price.suffix', ' €');
}
/**
 * Prepends the configured prefix to a path whose leading slashes were removed.
 *
 * The prefix is read from `seo.relative_prefix` and falls back to `'/'`.
 *
 * @param string $path Path with or without leading slashes.
 *
 * @return string Prefix followed by the path stripped of leading `/` characters.
 */
public function buildRelativeSeoUrl(string $path): string
{
    return $this->string('seo.relative_prefix', '/') . ltrim($path, '/');
}
/**
 * Reads the label stored at `highlight.available_label`.
 *
 * @return string Configured label, or `'Verfügbar'` when the entry is missing
 *                or not scalar.
 */
public function getAvailableHighlightLabel(): string
{
    return $this->string('highlight.available_label', 'Verfügbar');
}
/**
 * Reads the label stored at `highlight.unavailable_label`.
 *
 * @return string Configured label, or `'Nicht verfügbar'` when the entry is
 *                missing or not scalar.
 */
public function getUnavailableHighlightLabel(): string
{
    return $this->string('highlight.unavailable_label', 'Nicht verfügbar');
}
/**
 * Reads the prefix stored at `highlight.product_number_prefix`.
 *
 * @return string Configured prefix, or `'Produktnummer: '` when the entry is
 *                missing or not scalar.
 */
public function getProductNumberHighlightPrefix(): string
{
    return $this->string('highlight.product_number_prefix', 'Produktnummer: ');
}
/**
 * Reads the placeholder stored at `image.missing_placeholder`.
 *
 * @return string Configured placeholder, or `'no-image'` when the entry is
 *                missing or not scalar.
 */
public function getMissingProductImagePlaceholder(): string
{
    return $this->string('image.missing_placeholder', 'no-image');
}
/**
 * Reads the separator stored at `deduplication.separator`.
 *
 * @return string Configured separator, or `'|'` when the entry is missing or
 *                not scalar.
 */
public function getDeduplicationSeparator(): string
{
    return $this->string('deduplication.separator', '|');
}
/**
 * Looks up an integer setting by dotted path.
 *
 * @param string $path    Dot-separated path into the config array.
 * @param int    $default Returned as-is when the entry is missing or not numeric.
 * @param int    $min     Lower bound applied to a numeric entry (not to the default).
 *
 * @return int The numeric entry cast to int and raised to at least $min, or $default.
 */
private function int(string $path, int $default, int $min = PHP_INT_MIN): int
{
    $raw = $this->value($path, $default);

    // Non-numeric input bypasses the clamp on purpose: the default is trusted.
    return is_numeric($raw) ? max($min, (int) $raw) : $default;
}
/**
 * Looks up a string setting by dotted path.
 *
 * @param string $path    Dot-separated path into the config array.
 * @param string $default Returned when the entry is missing or not scalar.
 *
 * @return string The scalar entry cast to string, or $default.
 */
private function string(string $path, string $default): string
{
    $raw = $this->value($path, $default);

    return is_scalar($raw) ? (string) $raw : $default;
}
/**
 * Looks up a list of strings by dotted path and normalises it.
 *
 * Non-scalar items are always skipped. In the default mode each item is
 * trimmed, blank items are dropped and duplicates removed; when nothing is
 * left the fallback list is returned. With $allowEmptyStrings every scalar
 * item is kept verbatim (including blanks and duplicates) and an empty
 * result is returned as-is.
 *
 * @param string[]      $default          Used for the lookup and as fallback.
 * @param string[]|null $emptySafeDefault Fallback that takes precedence over $default when given.
 *
 * @return string[]
 */
private function stringList(string $path, array $default, bool $allowEmptyStrings = false, ?array $emptySafeDefault = null): array
{
    $fallback = $emptySafeDefault ?? $default;

    $raw = $this->value($path, $default);
    if (!is_array($raw)) {
        return $fallback;
    }

    $collected = [];
    foreach ($raw as $entry) {
        if (!is_scalar($entry)) {
            continue;
        }

        $text = (string) $entry;

        if ($allowEmptyStrings) {
            // Verbatim mode: positions matter, so nothing is trimmed or de-duplicated.
            $collected[] = $text;
            continue;
        }

        $text = trim($text);
        if ($text !== '' && !in_array($text, $collected, true)) {
            $collected[] = $text;
        }
    }

    if (!$allowEmptyStrings && $collected === []) {
        return $fallback;
    }

    return $collected;
}
/**
 * Looks up a map of string lists by dotted path and normalises it.
 *
 * Only entries with a non-blank string key and an array value are kept.
 * Each list is reduced to its trimmed, non-empty, unique scalar items, and
 * keys whose list ends up empty are dropped. If no entry survives, the
 * default map is returned.
 *
 * @param array<string, string[]> $default
 *
 * @return array<string, string[]>
 */
private function stringListMap(string $path, array $default): array
{
    $raw = $this->value($path, $default);
    if (!is_array($raw)) {
        return $default;
    }

    $map = [];
    foreach ($raw as $name => $entries) {
        if (!is_string($name) || !is_array($entries)) {
            continue;
        }

        $name = trim($name);
        if ($name === '') {
            continue;
        }

        $unique = [];
        foreach ($entries as $entry) {
            if (!is_scalar($entry)) {
                continue;
            }

            $text = trim((string) $entry);
            if ($text !== '' && !in_array($text, $unique, true)) {
                $unique[] = $text;
            }
        }

        if ($unique !== []) {
            $map[$name] = $unique;
        }
    }

    return $map === [] ? $default : $map;
}
/**
 * Resolves a dot-separated path against the config array.
 *
 * @param string $path    Path segments joined by `.`.
 * @param mixed  $default Returned as soon as a segment cannot be resolved.
 *
 * @return mixed The value found at the end of the path (may be null), or $default.
 */
private function value(string $path, mixed $default): mixed
{
    $node = $this->config;

    foreach (explode('.', $path) as $key) {
        // array_key_exists keeps explicitly configured null values reachable.
        if (is_array($node) && array_key_exists($key, $node)) {
            $node = $node[$key];
            continue;
        }

        return $default;
    }

    return $node;
}
}

View File

@@ -14,27 +14,68 @@ final class StopWordsConfig
* - keep question words
* - keep domain terms
* - remove only structural filler words
*
*/
private const DEFAULT_STOP_WORDS = [
// preposition
'mit',
// definite and indefinite articles, including declined forms
'der', 'die', 'das',
'ein', 'eine', 'einer', 'eines',
'den', 'dem', 'des',
// conjunctions
'und', 'oder', 'aber', 'sowie',
// personal pronouns
'ich', 'du', 'er', 'sie', 'es',
'wir', 'ihr',
// filler particles and politeness words
'halt', 'eben', 'auch', 'schon',
'noch', 'mal', 'bitte', 'danke',
'also', 'nun', 'tja',
// temporal and locative adverbs
'dann', 'danach', 'davor',
'hier', 'dort',
'heute', 'gestern', 'morgen',
// modal / conditional verb forms
'könnte', 'kannst', 'kann',
'würde', 'würdest', 'würden',
];
/**
 * Stores the raw configuration array on the instance.
 *
 * The private `stringList()` helper of this class looks keys up in it.
 *
 * @param array<string, mixed> $config Optional settings; defaults to an empty array.
 */
public function __construct(private array $config = [])
{
}
/**
 * Returns the stop words taken from the `words` config key.
 *
 * Configured entries are trimmed and de-duplicated by `stringList()`;
 * `DEFAULT_STOP_WORDS` is used when the key is absent, not an array, or
 * contains no usable entry.
 *
 * @return string[]
 */
public function getStopWords(): array
{
    return $this->stringList('words', self::DEFAULT_STOP_WORDS);
}
/**
 * Reads a top-level config key as a list of trimmed, unique, non-empty strings.
 *
 * Non-scalar items are ignored. The default is returned when the key is
 * missing or null, when the value is not an array, or when no item survives.
 *
 * @param string[] $default
 *
 * @return string[]
 */
private function stringList(string $key, array $default): array
{
    $raw = $this->config[$key] ?? $default;
    if (!is_array($raw)) {
        return $default;
    }

    $unique = [];
    foreach ($raw as $entry) {
        if (!is_scalar($entry)) {
            continue;
        }

        $word = trim((string) $entry);
        if ($word !== '' && !in_array($word, $unique, true)) {
            $unique[] = $word;
        }
    }

    return $unique === [] ? $default : $unique;
}
}

View File

@@ -45,6 +45,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
private IntentRouteResolver $routeResolver,
private EntityCatalogService $entityCatalogService,
private QueryEnricher $queryEnricher,
private NdjsonHybridRetrieverConfig $retrieverConfig,
)
{
}
@@ -211,7 +212,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
if ($exactDocumentMatch !== null) {
$selectedChunkIds = $this->selectExactDocumentChunkIds(
$exactDocumentMatch['rows'],
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)),
max(1, min($config->getRetrievalMaxChunks(), $this->retrieverConfig->hardMaxChunks())),
$prompt
);
@@ -310,8 +311,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
string $salesIntent
): array
{
$limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS));
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
$limit = max(1, min($config->getRetrievalMaxChunks(), $this->retrieverConfig->hardMaxChunks()));
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), $this->retrieverConfig->hardMaxVectorK()));
$isListQuery = $this->intentLite->isListQuery($prompt);
@@ -322,7 +323,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return [
'limit' => $limit,
'is_list_query' => $isListQuery,
'threshold' => NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD,
'threshold' => $this->retrieverConfig->vectorScoreThreshold(),
'ranked_chunk_ids' => [],
'rows' => [],
'rrf_scores' => [],
@@ -501,9 +502,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
*/
private function computeKeywordTopK(int $vectorTopK): int
{
    // Scale the vector top-k by the configured multiplier, rounding up.
    $topK = (int) ceil($vectorTopK * $this->retrieverConfig->keywordTopKMultiplier());

    // Keep the result within [1, configured hard maximum].
    return max(1, min($topK, $this->retrieverConfig->hardMaxKeywordK()));
}
/**
@@ -520,7 +521,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
int $vectorTopKBase
): array
{
$threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD;
$threshold = $this->retrieverConfig->vectorScoreThreshold();
$topK = $vectorTopKBase;
if (
@@ -531,13 +532,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
}
if ($isListQuery) {
$topK = (int)round($topK * NdjsonHybridRetrieverConfig::LIST_BONUS);
$topK = (int)round($topK * $this->retrieverConfig->listBonus());
}
$topK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
$topK = max(1, min($topK, $this->retrieverConfig->hardMaxVectorK()));
$threshold = max(
NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR,
min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold)
$this->retrieverConfig->thresholdFloor(),
min($this->retrieverConfig->thresholdCeil(), $threshold)
);
return [$threshold, $topK];
@@ -587,16 +588,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
}
$rank++;
$rrf = (1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank)) * $weight;
$rrf = (1.0 / ($this->retrieverConfig->rrfK() + $rank)) * $weight;
$rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
}
};
$apply($globalHits, $vectorThreshold, 1.0);
$apply($scopedHits, $vectorThreshold, $boostScopedVector ? NdjsonHybridRetrieverConfig::SCOPED_VECTOR_RRF_WEIGHT : 1.0);
$apply($keywordHits, NdjsonHybridRetrieverConfig::KEYWORD_SCORE_THRESHOLD, NdjsonHybridRetrieverConfig::KEYWORD_RRF_WEIGHT);
$apply($scopedKeywordHits, NdjsonHybridRetrieverConfig::KEYWORD_SCORE_THRESHOLD, $boostScopedKeyword ? NdjsonHybridRetrieverConfig::SCOPED_KEYWORD_RRF_WEIGHT : NdjsonHybridRetrieverConfig::KEYWORD_RRF_WEIGHT);
$apply($scopedHits, $vectorThreshold, $boostScopedVector ? $this->retrieverConfig->scopedVectorRrfWeight() : 1.0);
$apply($keywordHits, $this->retrieverConfig->keywordScoreThreshold(), $this->retrieverConfig->keywordRrfWeight());
$apply($scopedKeywordHits, $this->retrieverConfig->keywordScoreThreshold(), $boostScopedKeyword ? $this->retrieverConfig->scopedKeywordRrfWeight() : $this->retrieverConfig->keywordRrfWeight());
return [
'rrf_scores' => $rrfScores,
@@ -621,9 +622,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
}
$rank++;
$rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
$rrf[(string)$hit['chunk_id']] = 1.0 / ($this->retrieverConfig->rrfK() + $rank);
if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) {
if ($rank >= $this->retrieverConfig->emptyRrfFallbackTopN()) {
break;
}
}
@@ -649,7 +650,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
private function selectExactDocumentChunkIds(array $rows, int $limit, string $prompt): array
{
$orderedRows = $this->sortRowsByChunkIndex($rows);
$max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
$max = min($limit, $this->retrieverConfig->exactDocumentMaxChunks());
if ($orderedRows === [] || $max <= 0) {
return [];

View File

@@ -14,9 +14,7 @@ final readonly class QueryEnricher
* The enriched semantic query should help vector retrieval,
* but must not become bloated enough to dilute the original user intent.
*/
private const MAX_EXPANSIONS = 4;
/**
 * @param QueryEnricherConfig $config Configuration object stored on the instance.
 */
public function __construct(
    private QueryEnricherConfig $config
) {
}
@@ -95,7 +93,7 @@ final readonly class QueryEnricher
$matches[] = $mappedValue;
$seenNormalizedExpansions[$normalizedMappedValue] = true;
if (count($matches) >= self::MAX_EXPANSIONS) {
if (count($matches) >= $this->config->getMaxExpansions()) {
break;
}
}