fix 3
This commit is contained in:
@@ -131,24 +131,24 @@ final class NdjsonHybridRetrieverConfig
|
||||
'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung',
|
||||
'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend',
|
||||
'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
|
||||
'welches', 'brauche', 'suche'
|
||||
'welches', 'brauche', 'suche',
|
||||
];
|
||||
|
||||
const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];
|
||||
public const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];
|
||||
|
||||
const FAMILY_DESCRIPTOR_TOKEN = [
|
||||
public const FAMILY_DESCRIPTOR_TOKEN = [
|
||||
'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
|
||||
'inline', 'compact', 'panel', 'sc',
|
||||
];
|
||||
|
||||
const LOOKS_LIKE_REAGENT_TOKENS = [
|
||||
public const LOOKS_LIKE_REAGENT_TOKENS = [
|
||||
'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
|
||||
'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
|
||||
'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
|
||||
'kerzenfilter', 'druckregler',
|
||||
];
|
||||
|
||||
const LOOKS_LIKE_SAFETY_DOCS = [
|
||||
public const LOOKS_LIKE_SAFETY_DOCS = [
|
||||
'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
|
||||
'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
|
||||
'kennzeichnung', 'h290', 'pbt', 'vpvb',
|
||||
@@ -174,4 +174,309 @@ final class NdjsonHybridRetrieverConfig
|
||||
'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
|
||||
];
|
||||
|
||||
}
|
||||
/**
 * Stores the two override sources read by raw(): $config is checked first, $vocabulary second.
 *
 * @param array<string, mixed> $config     Setting overrides, keyed by setting name.
 * @param array<string, mixed> $vocabulary Kept for backwards-compatible service wiring.
 */
public function __construct(private array $config = [], private array $vocabulary = [])
{
}
|
||||
|
||||
/** Effective `hard_max_chunks`: self::HARD_MAX_CHUNKS unless overridden; never below 1. */
public function hardMaxChunks(): int
{
    $limit = $this->intValue('hard_max_chunks', self::HARD_MAX_CHUNKS, 1);

    return $limit;
}
|
||||
|
||||
/** Effective `hard_max_vectork`: self::HARD_MAX_VECTORK unless overridden; never below 1. */
public function hardMaxVectorK(): int
{
    $limit = $this->intValue('hard_max_vectork', self::HARD_MAX_VECTORK, 1);

    return $limit;
}
|
||||
|
||||
/** Effective `hard_max_keywordk`: self::HARD_MAX_KEYWORDK unless overridden; never below 1. */
public function hardMaxKeywordK(): int
{
    $limit = $this->intValue('hard_max_keywordk', self::HARD_MAX_KEYWORDK, 1);

    return $limit;
}
|
||||
|
||||
/** Effective `vector_score_threshold`: self::VECTOR_SCORE_THRESHOLD unless overridden; clamped to 0.0–1.0. */
public function vectorScoreThreshold(): float
{
    $threshold = $this->floatValue('vector_score_threshold', self::VECTOR_SCORE_THRESHOLD, 0.0, 1.0);

    return $threshold;
}
|
||||
|
||||
/** Effective `threshold_floor`: self::THRESHOLD_FLOOR unless overridden; clamped to 0.0–1.0. */
public function thresholdFloor(): float
{
    $floor = $this->floatValue('threshold_floor', self::THRESHOLD_FLOOR, 0.0, 1.0);

    return $floor;
}
|
||||
|
||||
/** Effective `threshold_ceil`: self::THRESHOLD_CEIL unless overridden; clamped to 0.0–1.0. */
public function thresholdCeil(): float
{
    $ceil = $this->floatValue('threshold_ceil', self::THRESHOLD_CEIL, 0.0, 1.0);

    return $ceil;
}
|
||||
|
||||
/** Effective `list_bonus`: self::LIST_BONUS unless overridden; never below 1.0. */
public function listBonus(): float
{
    $bonus = $this->floatValue('list_bonus', self::LIST_BONUS, 1.0);

    return $bonus;
}
|
||||
|
||||
/** Effective `rrf_k`: self::RRF_K unless overridden; never below 1. */
public function rrfK(): int
{
    $k = $this->intValue('rrf_k', self::RRF_K, 1);

    return $k;
}
|
||||
|
||||
/** Effective `keyword_topk_multiplier`: self::KEYWORD_TOPK_MULTIPLIER unless overridden; never below 0.1. */
public function keywordTopKMultiplier(): float
{
    $multiplier = $this->floatValue('keyword_topk_multiplier', self::KEYWORD_TOPK_MULTIPLIER, 0.1);

    return $multiplier;
}
|
||||
|
||||
/** Effective `keyword_score_threshold`: self::KEYWORD_SCORE_THRESHOLD unless overridden; clamped to 0.0–1.0. */
public function keywordScoreThreshold(): float
{
    $threshold = $this->floatValue('keyword_score_threshold', self::KEYWORD_SCORE_THRESHOLD, 0.0, 1.0);

    return $threshold;
}
|
||||
|
||||
/** Effective `keyword_rrf_weight`: self::KEYWORD_RRF_WEIGHT unless overridden; never below 0.0. */
public function keywordRrfWeight(): float
{
    $weight = $this->floatValue('keyword_rrf_weight', self::KEYWORD_RRF_WEIGHT, 0.0);

    return $weight;
}
|
||||
|
||||
/** Effective `scoped_vector_rrf_weight`: self::SCOPED_VECTOR_RRF_WEIGHT unless overridden; never below 0.0. */
public function scopedVectorRrfWeight(): float
{
    $weight = $this->floatValue('scoped_vector_rrf_weight', self::SCOPED_VECTOR_RRF_WEIGHT, 0.0);

    return $weight;
}
|
||||
|
||||
/** Effective `scoped_keyword_rrf_weight`: self::SCOPED_KEYWORD_RRF_WEIGHT unless overridden; never below 0.0. */
public function scopedKeywordRrfWeight(): float
{
    $weight = $this->floatValue('scoped_keyword_rrf_weight', self::SCOPED_KEYWORD_RRF_WEIGHT, 0.0);

    return $weight;
}
|
||||
|
||||
/** Effective `empty_rrf_fallback_topn`: self::EMPTY_RRF_FALLBACK_TOPN unless overridden; never below 1. */
public function emptyRrfFallbackTopN(): int
{
    $topN = $this->intValue('empty_rrf_fallback_topn', self::EMPTY_RRF_FALLBACK_TOPN, 1);

    return $topN;
}
|
||||
|
||||
/** Effective `max_chunks_per_doc`: self::MAX_CHUNKS_PER_DOC unless overridden; never below 1. */
public function maxChunksPerDoc(): int
{
    $limit = $this->intValue('max_chunks_per_doc', self::MAX_CHUNKS_PER_DOC, 1);

    return $limit;
}
|
||||
|
||||
/** Effective `min_chunk_distance`: self::MIN_CHUNK_DISTANCE unless overridden; never below 0. */
public function minChunkDistance(): int
{
    $distance = $this->intValue('min_chunk_distance', self::MIN_CHUNK_DISTANCE, 0);

    return $distance;
}
|
||||
|
||||
/** Effective `dominant_doc_window`: self::DOMINANT_DOC_WINDOW unless overridden; never below 1. */
public function dominantDocWindow(): int
{
    $window = $this->intValue('dominant_doc_window', self::DOMINANT_DOC_WINDOW, 1);

    return $window;
}
|
||||
|
||||
/** Effective `dominant_doc_min_hits`: self::DOMINANT_DOC_MIN_HITS unless overridden; never below 1. */
public function dominantDocMinHits(): int
{
    $minHits = $this->intValue('dominant_doc_min_hits', self::DOMINANT_DOC_MIN_HITS, 1);

    return $minHits;
}
|
||||
|
||||
/** Effective `dominant_doc_max_chunks`: self::DOMINANT_DOC_MAX_CHUNKS unless overridden; never below 1. */
public function dominantDocMaxChunks(): int
{
    $limit = $this->intValue('dominant_doc_max_chunks', self::DOMINANT_DOC_MAX_CHUNKS, 1);

    return $limit;
}
|
||||
|
||||
/** Effective `exact_document_max_chunks`: self::EXACT_DOCUMENT_MAX_CHUNKS unless overridden; never below 1. */
public function exactDocumentMaxChunks(): int
{
    $limit = $this->intValue('exact_document_max_chunks', self::EXACT_DOCUMENT_MAX_CHUNKS, 1);

    return $limit;
}
|
||||
|
||||
/** Effective `focused_product_window`: self::FOCUSED_PRODUCT_WINDOW unless overridden; never below 1. */
public function focusedProductWindow(): int
{
    $window = $this->intValue('focused_product_window', self::FOCUSED_PRODUCT_WINDOW, 1);

    return $window;
}
|
||||
|
||||
/** Effective `focused_product_min_score`: self::FOCUSED_PRODUCT_MIN_SCORE unless overridden; never below 0.0. */
public function focusedProductMinScore(): float
{
    $minScore = $this->floatValue('focused_product_min_score', self::FOCUSED_PRODUCT_MIN_SCORE, 0.0);

    return $minScore;
}
|
||||
|
||||
/** Effective `focused_product_min_gap`: self::FOCUSED_PRODUCT_MIN_GAP unless overridden; never below 0.0. */
public function focusedProductMinGap(): float
{
    $minGap = $this->floatValue('focused_product_min_gap', self::FOCUSED_PRODUCT_MIN_GAP, 0.0);

    return $minGap;
}
|
||||
|
||||
/** Effective `focused_product_max_chunks`: self::FOCUSED_PRODUCT_MAX_CHUNKS unless overridden; never below 1. */
public function focusedProductMaxChunks(): int
{
    $limit = $this->intValue('focused_product_max_chunks', self::FOCUSED_PRODUCT_MAX_CHUNKS, 1);

    return $limit;
}
|
||||
|
||||
/**
 * Effective `generic_product_tokens` list; self::GENERIC_PRODUCT_TOKEN when no usable override exists.
 *
 * @return string[]
 */
public function genericProductTokens(): array
{
    $tokens = $this->stringList('generic_product_tokens', self::GENERIC_PRODUCT_TOKEN);

    return $tokens;
}
|
||||
|
||||
/**
 * Effective `important_short_model_tokens` list; self::IMPORTANT_SHORT_MODEL_TOKEN when no usable override exists.
 *
 * @return string[]
 */
public function importantShortModelTokens(): array
{
    $tokens = $this->stringList('important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN);

    return $tokens;
}
|
||||
|
||||
/**
 * Effective `family_descriptor_tokens` list; self::FAMILY_DESCRIPTOR_TOKEN when no usable override exists.
 *
 * @return string[]
 */
public function familyDescriptorTokens(): array
{
    $tokens = $this->stringList('family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN);

    return $tokens;
}
|
||||
|
||||
/**
 * Effective `looks_like_reagent_tokens` list; self::LOOKS_LIKE_REAGENT_TOKENS when no usable override exists.
 *
 * @return string[]
 */
public function looksLikeReagentTokens(): array
{
    $tokens = $this->stringList('looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS);

    return $tokens;
}
|
||||
|
||||
/**
 * Effective `looks_like_safety_docs` list; self::LOOKS_LIKE_SAFETY_DOCS when no usable override exists.
 *
 * @return string[]
 */
public function looksLikeSafetyDocs(): array
{
    $terms = $this->stringList('looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS);

    return $terms;
}
|
||||
|
||||
/**
 * Effective `looks_like_reagent_words` list; self::LOOKS_LIKE_REAGENT_WORDS when no usable override exists.
 *
 * @return string[]
 */
public function looksLikeReagentWords(): array
{
    $words = $this->stringList('looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS);

    return $words;
}
|
||||
|
||||
/**
 * Effective `looks_like_document_words` list; self::LOOKS_LIKE_DOCUMENT_WORDS when no usable override exists.
 *
 * @return string[]
 */
public function looksLikeDocumentWords(): array
{
    $words = $this->stringList('looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS);

    return $words;
}
|
||||
|
||||
/**
 * Effective `looks_like_safety_words` list; self::LOOKS_LIKE_SAFETY_WORDS when no usable override exists.
 *
 * @return string[]
 */
public function looksLikeSafetyWords(): array
{
    $words = $this->stringList('looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS);

    return $words;
}
|
||||
|
||||
/**
 * Effective `looks_like_device_words` list; self::LOOKS_LIKE_DEVICE_WORDS when no usable override exists.
 *
 * @return string[]
 */
public function looksLikeDeviceWords(): array
{
    $words = $this->stringList('looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS);

    return $words;
}
|
||||
|
||||
/**
 * Snapshot of every effective setting, keyed by its configuration name.
 *
 * Each value comes from the matching public getter, so overrides, fallbacks and
 * clamping are already applied. Numeric settings come first, word lists after.
 *
 * @return array<string, mixed>
 */
public function toArray(): array
{
    $numericSettings = [
        'hard_max_chunks' => $this->hardMaxChunks(),
        'hard_max_vectork' => $this->hardMaxVectorK(),
        'hard_max_keywordk' => $this->hardMaxKeywordK(),
        'vector_score_threshold' => $this->vectorScoreThreshold(),
        'threshold_floor' => $this->thresholdFloor(),
        'threshold_ceil' => $this->thresholdCeil(),
        'list_bonus' => $this->listBonus(),
        'rrf_k' => $this->rrfK(),
        'keyword_topk_multiplier' => $this->keywordTopKMultiplier(),
        'keyword_score_threshold' => $this->keywordScoreThreshold(),
        'keyword_rrf_weight' => $this->keywordRrfWeight(),
        'scoped_vector_rrf_weight' => $this->scopedVectorRrfWeight(),
        'scoped_keyword_rrf_weight' => $this->scopedKeywordRrfWeight(),
        'empty_rrf_fallback_topn' => $this->emptyRrfFallbackTopN(),
        'max_chunks_per_doc' => $this->maxChunksPerDoc(),
        'min_chunk_distance' => $this->minChunkDistance(),
        'dominant_doc_window' => $this->dominantDocWindow(),
        'dominant_doc_min_hits' => $this->dominantDocMinHits(),
        'dominant_doc_max_chunks' => $this->dominantDocMaxChunks(),
        'exact_document_max_chunks' => $this->exactDocumentMaxChunks(),
        'focused_product_window' => $this->focusedProductWindow(),
        'focused_product_min_score' => $this->focusedProductMinScore(),
        'focused_product_min_gap' => $this->focusedProductMinGap(),
        'focused_product_max_chunks' => $this->focusedProductMaxChunks(),
    ];

    $wordLists = [
        'generic_product_tokens' => $this->genericProductTokens(),
        'important_short_model_tokens' => $this->importantShortModelTokens(),
        'family_descriptor_tokens' => $this->familyDescriptorTokens(),
        'looks_like_reagent_tokens' => $this->looksLikeReagentTokens(),
        'looks_like_safety_docs' => $this->looksLikeSafetyDocs(),
        'looks_like_reagent_words' => $this->looksLikeReagentWords(),
        'looks_like_document_words' => $this->looksLikeDocumentWords(),
        'looks_like_safety_words' => $this->looksLikeSafetyWords(),
        'looks_like_device_words' => $this->looksLikeDeviceWords(),
    ];

    // Keys are distinct strings, so array_merge() simply concatenates in order.
    return array_merge($numericSettings, $wordLists);
}
|
||||
|
||||
/**
 * Resolves an integer setting through raw() and clamps it to [$min, $max].
 *
 * A value that is present but unusable — not numeric, or numeric but not finite
 * (INF, NAN, overflowing strings such as "1e999") — yields $default as-is.
 * Fractional values are truncated by the int cast.
 *
 * @param string   $key     Setting name passed to raw().
 * @param int      $default Used when the setting is missing or unusable.
 * @param int      $min     Lower bound applied to a usable value.
 * @param int|null $max     Optional upper bound; null leaves the value unbounded above.
 */
private function intValue(string $key, int $default, int $min = PHP_INT_MIN, ?int $max = null): int
{
    $value = $this->raw($key, $default);

    // is_numeric() also accepts INF/NAN floats; casting those to int gives no
    // meaningful number, so they are treated like any other invalid value.
    if (!is_numeric($value) || !is_finite((float) $value)) {
        return $default;
    }

    $value = max($min, (int) $value);

    return $max === null ? $value : min($max, $value);
}
|
||||
|
||||
/**
 * Resolves a float setting through raw() and clamps it to [$min, $max].
 *
 * A value that is present but unusable — not numeric, or not finite (INF, NAN,
 * overflowing strings such as "1e999") — yields $default as-is. Without this guard
 * NAN would silently collapse to $min and INF would pass through whenever $max is null.
 *
 * @param string     $key     Setting name passed to raw().
 * @param float      $default Used when the setting is missing or unusable.
 * @param float      $min     Lower bound applied to a usable value.
 * @param float|null $max     Optional upper bound; null leaves the value unbounded above.
 */
private function floatValue(string $key, float $default, float $min = -INF, ?float $max = null): float
{
    $value = $this->raw($key, $default);

    if (!is_numeric($value)) {
        return $default;
    }

    $value = (float) $value;

    // Comparisons against NAN are always false, so max()/min() cannot clamp it.
    if (!is_finite($value)) {
        return $default;
    }

    $value = max($min, $value);

    return $max === null ? $value : min($max, $value);
}
|
||||
|
||||
/**
 * Resolves a list-of-strings setting through raw().
 *
 * Non-scalar entries are dropped, the rest are cast to string and trimmed, empty
 * strings are discarded and duplicates removed (first occurrence wins). $default is
 * returned when the raw value is not an array or nothing usable remains.
 *
 * @param string[] $default
 * @return string[]
 */
private function stringList(string $key, array $default): array
{
    $candidate = $this->raw($key, $default);

    if (!is_array($candidate)) {
        return $default;
    }

    $cleaned = [];
    foreach ($candidate as $entry) {
        // Non-scalars map to '' so they fall out together with blank strings.
        $text = is_scalar($entry) ? trim((string) $entry) : '';

        if ($text !== '') {
            $cleaned[] = $text;
        }
    }

    // All entries are strings here, so string comparison equals the strict check.
    $unique = array_values(array_unique($cleaned));

    return $unique === [] ? $default : $unique;
}
|
||||
|
||||
/**
 * Looks up $key in $this->config first, then in $this->vocabulary.
 *
 * Uses array_key_exists(), so an explicitly stored null counts as "found" and is
 * returned instead of $default.
 */
private function raw(string $key, mixed $default): mixed
{
    foreach ([$this->config, $this->vocabulary] as $source) {
        if (array_key_exists($key, $source)) {
            return $source[$key];
        }
    }

    return $default;
}
|
||||
}
|
||||
|
||||
@@ -7,38 +7,12 @@ namespace App\Config;
|
||||
final readonly class QueryEnricherConfig
|
||||
{
|
||||
/**
|
||||
* Keep the enrichment vocabulary in the class for now.
|
||||
*
|
||||
* Important:
|
||||
* - This is intentionally NOT externalized yet.
|
||||
* - Add or maintain the current project-specific mappings here.
|
||||
* - The later move to external config/files can happen separately.
|
||||
*
|
||||
* Supported shapes:
|
||||
*
|
||||
* 1) Simple mapping:
|
||||
* [
|
||||
* 'water hardness' => 'residual hardness',
|
||||
* 'device' => 'instrument',
|
||||
* ]
|
||||
*
|
||||
* 2) Small synonym groups:
|
||||
* [
|
||||
* ['water hardness', 'residual hardness', 'hardness'],
|
||||
* ['device', 'instrument', 'meter'],
|
||||
* ]
|
||||
*
|
||||
* The public API stays intentionally simple:
|
||||
* - getEnrichQueryList(): array<string,string>
|
||||
*
|
||||
* This keeps QueryEnricher generic while the domain vocabulary
|
||||
* deliberately remains inside this class for now.
|
||||
*
|
||||
* Replace the example entries below with your real project mappings.
|
||||
* Backwards-compatible fallback vocabulary.
|
||||
* Active values are loaded from retriex.query_enrichment.config when present.
|
||||
*
|
||||
* @var array<int|string, mixed>
|
||||
*/
|
||||
private const ENRICH_QUERY_LIST = [
|
||||
private const DEFAULT_ENRICH_QUERY_LIST = [
|
||||
'Wasserhärte' => 'Resthärte',
|
||||
'Gerät' => 'Modell',
|
||||
'Indikator' => 'Chemie',
|
||||
@@ -48,9 +22,16 @@ final readonly class QueryEnricherConfig
|
||||
'Wasserhärte-Grenzwert' => 'Resthärte',
|
||||
'Resthärte-Grenzwert' => 'Wasserhärte',
|
||||
'Grenzwert' => 'Überwachungsbereich',
|
||||
'store'=>'shop'
|
||||
'store' => 'shop',
|
||||
];
|
||||
|
||||
/**
 * @param array<string, mixed> $config Optional overrides; the visible methods read the
 *                                     `rules` and `max_expansions` entries.
 */
public function __construct(
    private array $config = [],
) {
}
|
||||
|
||||
/**
|
||||
* Returns a normalized, deduplicated mapping for the QueryEnricher.
|
||||
*
|
||||
@@ -71,8 +52,13 @@ final readonly class QueryEnricherConfig
|
||||
public function getEnrichQueryList(): array
|
||||
{
|
||||
$normalized = [];
|
||||
$rules = $this->config['rules'] ?? self::DEFAULT_ENRICH_QUERY_LIST;
|
||||
|
||||
foreach (self::ENRICH_QUERY_LIST as $key => $value) {
|
||||
if (!is_array($rules)) {
|
||||
$rules = self::DEFAULT_ENRICH_QUERY_LIST;
|
||||
}
|
||||
|
||||
foreach ($rules as $key => $value) {
|
||||
if (is_array($value)) {
|
||||
$this->ingestGroup($normalized, $value);
|
||||
continue;
|
||||
@@ -93,6 +79,17 @@ final readonly class QueryEnricherConfig
|
||||
return $normalized;
|
||||
}
|
||||
|
||||
/**
 * Resolves the `max_expansions` config entry as a non-negative integer.
 *
 * Falls back to 4 when the entry is missing, not numeric, or not a finite number
 * (INF, NAN, "1e999"); negative values are raised to 0.
 */
public function getMaxExpansions(): int
{
    $fallback = 4;
    $value = $this->config['max_expansions'] ?? $fallback;

    // is_numeric() accepts INF/NAN; an int cast of those is meaningless, so they
    // are handled like any other invalid value instead of silently becoming 0.
    if (!is_numeric($value) || !is_finite((float) $value)) {
        return $fallback;
    }

    return max(0, (int) $value);
}
|
||||
|
||||
/**
|
||||
* Returns true when at least one valid enrichment rule exists.
|
||||
*/
|
||||
@@ -176,4 +173,4 @@ final readonly class QueryEnricherConfig
|
||||
|
||||
return trim($value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@ final readonly class RetriexEffectiveConfigProvider
|
||||
private ModelGenerationConfigProvider $modelProvider,
|
||||
private IndexConfigurationProvider $indexProvider,
|
||||
private PromptBuilderConfig $promptConfig,
|
||||
private NdjsonHybridRetrieverConfig $retrieverConfig,
|
||||
) {
|
||||
}
|
||||
|
||||
@@ -144,30 +145,8 @@ final readonly class RetriexEffectiveConfigProvider
|
||||
private function retrievalConfig(): array
{
    // The retriever config object is the single source of truth: toArray() already
    // carries every retrieval setting with overrides applied. Listing the class
    // constants here as well was redundant, because the spread overwrote each of them.
    return [
        ...$this->retrieverConfig->toArray(),
        'vocabulary' => $this->retrieverConfig->vocabularyToArray(),
        'inventory_parameter' => $this->param('retriex.retrieval.inventory', []),
    ];
}
|
||||
|
||||
@@ -7,451 +7,461 @@ namespace App\Config;
|
||||
final class ShopServiceConfig
|
||||
{
|
||||
public const DEVICE_QUERY_KEYWORDS = [
|
||||
'analysegerät',
|
||||
'analysegeraet',
|
||||
'analysegeräte',
|
||||
'analysegeraete',
|
||||
'messgerät',
|
||||
'messgeraet',
|
||||
'messgeräte',
|
||||
'messgeraete',
|
||||
'analysator',
|
||||
'analysatoren',
|
||||
'analyzer',
|
||||
'gerät',
|
||||
'geraet',
|
||||
'geräte',
|
||||
'geraete',
|
||||
'monitor',
|
||||
'monitore',
|
||||
'controller',
|
||||
'gerät für',
|
||||
'geraet fuer',
|
||||
'geräte für',
|
||||
'geraete fuer',
|
||||
'system',
|
||||
'systeme',
|
||||
'anlage',
|
||||
'anlagen',
|
||||
'analysegerät', 'analysegeraet', 'analysegeräte', 'analysegeraete',
|
||||
'messgerät', 'messgeraet', 'messgeräte', 'messgeraete',
|
||||
'analysator', 'analysatoren', 'analyzer', 'gerät', 'geraet', 'geräte',
|
||||
'geraete', 'monitor', 'monitore', 'controller', 'gerät für',
|
||||
'geraet fuer', 'geräte für', 'geraete fuer', 'system', 'systeme',
|
||||
'anlage', 'anlagen',
|
||||
];
|
||||
|
||||
public const ACCESSORY_QUERY_KEYWORDS = [
|
||||
'zubehör',
|
||||
'zubehor',
|
||||
'reagenz',
|
||||
'reagenzien',
|
||||
'reagent',
|
||||
'indikator',
|
||||
'indikatoren',
|
||||
'indicator',
|
||||
'kit',
|
||||
'set',
|
||||
'ersatz',
|
||||
'ersatzteil',
|
||||
'ersatzteile',
|
||||
'verbrauchsmaterial',
|
||||
'consumable',
|
||||
'dazu',
|
||||
'passend',
|
||||
'passende',
|
||||
'passendes',
|
||||
'nachfüll',
|
||||
'nachfuell',
|
||||
'refill',
|
||||
'filter',
|
||||
'pumpenkopf',
|
||||
'motorblock',
|
||||
'service set',
|
||||
'serviceset',
|
||||
'service-set',
|
||||
'zubehör', 'zubehor', 'reagenz', 'reagenzien', 'reagent', 'indikator',
|
||||
'indikatoren', 'indicator', 'kit', 'set', 'ersatz', 'ersatzteil',
|
||||
'ersatzteile', 'verbrauchsmaterial', 'consumable', 'dazu', 'passend',
|
||||
'passende', 'passendes', 'nachfüll', 'nachfuell', 'refill', 'filter',
|
||||
'pumpenkopf', 'motorblock', 'service set', 'serviceset', 'service-set',
|
||||
];
|
||||
|
||||
public const ACCESSORY_PRODUCT_KEYWORDS = [
|
||||
'reagenz',
|
||||
'reagenzien',
|
||||
'reagent',
|
||||
'indikator',
|
||||
'indikatoren',
|
||||
'indicator',
|
||||
'kit',
|
||||
'set',
|
||||
'verbrauchsmaterial',
|
||||
'consumable',
|
||||
'zubehör',
|
||||
'zubehor',
|
||||
'ersatz',
|
||||
'ersatzteil',
|
||||
'ersatzteile',
|
||||
'nachfüll',
|
||||
'nachfuell',
|
||||
'refill',
|
||||
'lösung',
|
||||
'loesung',
|
||||
'solution',
|
||||
'teststreifen',
|
||||
'test strip',
|
||||
'filter',
|
||||
'pumpenkopf',
|
||||
'motorblock',
|
||||
'service set',
|
||||
'serviceset',
|
||||
'service-set',
|
||||
'reagenz', 'reagenzien', 'reagent', 'indikator', 'indikatoren',
|
||||
'indicator', 'kit', 'set', 'verbrauchsmaterial', 'consumable',
|
||||
'zubehör', 'zubehor', 'ersatz', 'ersatzteil', 'ersatzteile',
|
||||
'nachfüll', 'nachfuell', 'refill', 'lösung', 'loesung', 'solution',
|
||||
'teststreifen', 'test strip', 'filter', 'pumpenkopf', 'motorblock',
|
||||
'service set', 'serviceset', 'service-set',
|
||||
];
|
||||
|
||||
public const DEVICE_PRODUCT_KEYWORDS = [
|
||||
'analysegerät',
|
||||
'analysegeraet',
|
||||
'analysegeräte',
|
||||
'analysegeraete',
|
||||
'messgerät',
|
||||
'messgeraet',
|
||||
'messgeräte',
|
||||
'messgeraete',
|
||||
'analysator',
|
||||
'analysatoren',
|
||||
'analyzer',
|
||||
'monitor',
|
||||
'monitore',
|
||||
'controller',
|
||||
'online-analysator',
|
||||
'online analysator',
|
||||
'online-analysegerät',
|
||||
'online analysegeraet',
|
||||
'online-analysegeräte',
|
||||
'online analysegeraete',
|
||||
'online analyzer',
|
||||
'online monitor',
|
||||
'system',
|
||||
'systeme',
|
||||
'anlage',
|
||||
'anlagen',
|
||||
'gerät',
|
||||
'geraet',
|
||||
'geräte',
|
||||
'geraete',
|
||||
'analysegerät', 'analysegeraet', 'analysegeräte', 'analysegeraete',
|
||||
'messgerät', 'messgeraet', 'messgeräte', 'messgeraete',
|
||||
'analysator', 'analysatoren', 'analyzer', 'monitor', 'monitore',
|
||||
'controller', 'online-analysator', 'online analysator',
|
||||
'online-analysegerät', 'online analysegeraet', 'online-analysegeräte',
|
||||
'online analysegeraete', 'online analyzer', 'online monitor', 'system',
|
||||
'systeme', 'anlage', 'anlagen', 'gerät', 'geraet', 'geräte', 'geraete',
|
||||
];
|
||||
|
||||
private const DEVICE_FOCUS_KEYWORDS = [
|
||||
'geräte', 'geraete', 'gerät', 'geraet', 'analysegerät', 'analysegeraet',
|
||||
'messgerät', 'messgeraet', 'analysator', 'controller', 'monitor',
|
||||
];
|
||||
|
||||
private const ACCESSORY_FOCUS_KEYWORDS = [
|
||||
'indikator', 'indikatoren', 'reagenz', 'reagenzien', 'zubehör',
|
||||
'zubehor', 'ersatzteil', 'ersatzteile', 'verbrauchsmaterial',
|
||||
'service set', 'serviceset', 'filter', 'pumpenkopf', 'motorblock',
|
||||
];
|
||||
|
||||
private const ACCESSORY_FOCUS_VARIANT_MAP = [
|
||||
'indikator' => ['indikator', 'indikatoren'],
|
||||
'indikatoren' => ['indikator', 'indikatoren'],
|
||||
'reagenz' => ['reagenz', 'reagenzien'],
|
||||
'reagenzien' => ['reagenz', 'reagenzien'],
|
||||
'ersatzteil' => ['ersatzteil', 'ersatzteile'],
|
||||
'ersatzteile' => ['ersatzteil', 'ersatzteile'],
|
||||
'service set' => ['service set', 'serviceset', 'service-set'],
|
||||
'serviceset' => ['service set', 'serviceset', 'service-set'],
|
||||
'service-set' => ['service set', 'serviceset', 'service-set'],
|
||||
];
|
||||
|
||||
/**
 * @param array<string, mixed> $config Override values; presumably what the int()/string()/
 *                                     stringList() helpers read (they are outside this view).
 */
public function __construct(
    private array $config = [],
) {
}
|
||||
|
||||
/** Reads `top_product_log_limit` through int(); default 3, third argument 0 (presumably the lower bound). */
public function getTopProductLogLimit(): int
{
    return $this->int('top_product_log_limit', 3, 0);
}
|
||||
|
||||
/**
 * Reads `device_focus_keywords` through stringList(); default self::DEVICE_FOCUS_KEYWORDS.
 *
 * @return string[]
 */
public function getDeviceFocusKeywords(): array
{
    return $this->stringList('device_focus_keywords', self::DEVICE_FOCUS_KEYWORDS);
}
|
||||
|
||||
/**
 * Reads `accessory_focus_keywords` through stringList(); default self::ACCESSORY_FOCUS_KEYWORDS.
 *
 * @return string[]
 */
public function getAccessoryFocusKeywords(): array
{
    return $this->stringList('accessory_focus_keywords', self::ACCESSORY_FOCUS_KEYWORDS);
}
|
||||
|
||||
/**
 * Reads `accessory_focus_variant_map` through stringListMap(); default self::ACCESSORY_FOCUS_VARIANT_MAP.
 *
 * @return array<string, string[]>
 */
public function getAccessoryFocusVariantMap(): array
{
    return $this->stringListMap('accessory_focus_variant_map', self::ACCESSORY_FOCUS_VARIANT_MAP);
}
|
||||
|
||||
/**
 * Reads `device_query_keywords` through stringList(); default self::DEVICE_QUERY_KEYWORDS.
 *
 * @return string[]
 */
public function getDeviceQueryKeywords(): array
{
    return $this->stringList('device_query_keywords', self::DEVICE_QUERY_KEYWORDS);
}
|
||||
|
||||
/**
 * Reads `accessory_query_keywords` through stringList(); default self::ACCESSORY_QUERY_KEYWORDS.
 *
 * @return string[]
 */
public function getAccessoryQueryKeywords(): array
{
    return $this->stringList('accessory_query_keywords', self::ACCESSORY_QUERY_KEYWORDS);
}
|
||||
|
||||
/**
 * Reads `accessory_product_keywords` through stringList(); default self::ACCESSORY_PRODUCT_KEYWORDS.
 *
 * @return string[]
 */
public function getAccessoryProductKeywords(): array
{
    return $this->stringList('accessory_product_keywords', self::ACCESSORY_PRODUCT_KEYWORDS);
}
|
||||
|
||||
/**
 * Reads `device_product_keywords` through stringList(); default self::DEVICE_PRODUCT_KEYWORDS.
 *
 * @return string[]
 */
public function getDeviceProductKeywords(): array
{
    return $this->stringList('device_product_keywords', self::DEVICE_PRODUCT_KEYWORDS);
}
|
||||
|
||||
/** Reads `scores.exact_product_number_phrase` through int(); default 160. */
public function getExactProductNumberPhraseScore(): int
{
    return $this->int('scores.exact_product_number_phrase', 160);
}
|
||||
|
||||
/** Reads `scores.exact_product_name_phrase` through int(); default 90. */
public function getExactProductNamePhraseScore(): int
{
    return $this->int('scores.exact_product_name_phrase', 90);
}
|
||||
|
||||
/** Reads `scores.exact_manufacturer_match` through int(); default 40. */
public function getExactManufacturerMatchScore(): int
{
    return $this->int('scores.exact_manufacturer_match', 40);
}
|
||||
|
||||
/** Reads `scores.brand_contained_in_name` through int(); default 20. */
public function getBrandContainedInNameScore(): int
{
    return $this->int('scores.brand_contained_in_name', 20);
}
|
||||
|
||||
/** Reads `scores.name_token_overlap_weight` through int(); default 6. */
public function getNameTokenOverlapWeight(): int
{
    return $this->int('scores.name_token_overlap_weight', 6);
}
|
||||
|
||||
/** Reads `scores.product_number_token_overlap_weight` through int(); default 10. */
public function getProductNumberTokenOverlapWeight(): int
{
    return $this->int('scores.product_number_token_overlap_weight', 10);
}
|
||||
|
||||
/** Reads `scores.corpus_token_overlap_weight` through int(); default 2. */
public function getCorpusTokenOverlapWeight(): int
{
    return $this->int('scores.corpus_token_overlap_weight', 2);
}
|
||||
|
||||
/** Reads `scores.name_number_overlap_weight` through int(); default 18. */
public function getNameNumberOverlapWeight(): int
{
    return $this->int('scores.name_number_overlap_weight', 18);
}
|
||||
|
||||
/** Reads `scores.product_number_number_overlap_weight` through int(); default 28. */
public function getProductNumberNumberOverlapWeight(): int
{
    return $this->int('scores.product_number_number_overlap_weight', 28);
}
|
||||
|
||||
/** Reads `scores.corpus_number_overlap_weight` through int(); default 8. */
public function getCorpusNumberOverlapWeight(): int
{
    return $this->int('scores.corpus_number_overlap_weight', 8);
}
|
||||
|
||||
/** Reads `scores.size_match` through int(); default 12. */
public function getSizeMatchScore(): int
{
    return $this->int('scores.size_match', 12);
}
|
||||
|
||||
/** Reads `scores.availability_bonus` through int(); default 1. */
public function getAvailabilityBonusScore(): int
{
    return $this->int('scores.availability_bonus', 1);
}
|
||||
|
||||
/** Reads `scores.device_query_device_product_bonus` through int(); default 60. */
public function getDeviceQueryDeviceProductBonus(): int
{
    return $this->int('scores.device_query_device_product_bonus', 60);
}
|
||||
|
||||
/** Reads `scores.device_query_accessory_penalty` through int(); default 120. */
public function getDeviceQueryAccessoryPenalty(): int
{
    return $this->int('scores.device_query_accessory_penalty', 120);
}
|
||||
|
||||
/** Reads `scores.accessory_query_accessory_product_bonus` through int(); default 30. */
public function getAccessoryQueryAccessoryProductBonus(): int
{
    return $this->int('scores.accessory_query_accessory_product_bonus', 30);
}
|
||||
|
||||
/** Reads `scores.accessory_query_device_product_bonus` through int(); default 10. */
public function getAccessoryQueryDeviceProductBonus(): int
{
    return $this->int('scores.accessory_query_device_product_bonus', 10);
}
|
||||
|
||||
/** Reads `patterns.contains_digit` through string(); default is the regex '/\d/u'. */
public function getContainsDigitPattern(): string
{
    return $this->string('patterns.contains_digit', '/\d/u');
}
|
||||
|
||||
/** Reads `patterns.matching_cleanup` through string(); default matches runs of non-letter, non-number characters. */
public function getMatchingCleanupPattern(): string
{
    return $this->string('patterns.matching_cleanup', '/[^\p{L}\p{N}]+/u');
}
|
||||
|
||||
/** Reads `patterns.whitespace_collapse` through string(); default is the regex '/\s+/u'. */
public function getWhitespaceCollapsePattern(): string
{
    return $this->string('patterns.whitespace_collapse', '/\s+/u');
}
|
||||
|
||||
/** Reads `patterns.token_split` through string(); default matches runs of non-letter, non-number characters. */
public function getTokenSplitPattern(): string
{
    return $this->string('patterns.token_split', '/[^\p{L}\p{N}]+/u');
}
|
||||
|
||||
/**
 * Trims $value and wraps it in the `padding.prefix` / `padding.suffix` strings
 * (each read through string(), each defaulting to a single space).
 */
public function wrapWithPaddingSpaces(string $value): string
{
    return $this->string('padding.prefix', ' ') . trim($value) . $this->string('padding.suffix', ' ');
}
|
||||
|
||||
/**
 * Reads `price.normalization_search` through stringList(); default ['€', ' ', '.'].
 *
 * @return string[]
 */
public function getPriceNormalizationSearch(): array
{
    return $this->stringList('price.normalization_search', ['€', ' ', '.']);
}
|
||||
|
||||
/**
 * Reads `price.normalization_replace` through stringList(); default ['', '', ''].
 *
 * The extra `true, ['', '', '']` arguments are passed to the helper unchanged;
 * presumably they allow empty strings to survive — helper is outside this view, confirm there.
 *
 * @return string[]
 */
public function getPriceNormalizationReplace(): array
{
    return $this->stringList('price.normalization_replace', ['', '', ''], true, ['', '', '']);
}
|
||||
|
||||
/** Reads `custom_fields.primary` through string(); default 'migration_Backup_product_attr1'. */
public function getPrimaryCustomFieldKey(): string
{
    return $this->string('custom_fields.primary', 'migration_Backup_product_attr1');
}
|
||||
|
||||
/** Reads `custom_fields.secondary` through string(); default 'migration_Backup_product_attr2'. */
public function getSecondaryCustomFieldKey(): string
{
    return $this->string('custom_fields.secondary', 'migration_Backup_product_attr2');
}
|
||||
|
||||
/** Reads `custom_fields.use_cases` through string(); default 'migration_Backup_product_attr4'. */
public function getUseCasesCustomFieldKey(): string
{
    return $this->string('custom_fields.use_cases', 'migration_Backup_product_attr4');
}
|
||||
|
||||
/** Reads `custom_fields.languages` through string(); default 'migration_Backup_product_attr5'. */
public function getLanguagesCustomFieldKey(): string
{
    return $this->string('custom_fields.languages', 'migration_Backup_product_attr5');
}
|
||||
|
||||
/** Reads `text.primary_secondary_separator` through string(); default ': '. */
public function getPrimarySecondarySeparator(): string
{
    return $this->string('text.primary_secondary_separator', ': ');
}
|
||||
|
||||
/** Reads `text.use_cases_label` through string(); default 'Einsatzgebiete: '. */
public function getUseCasesLabel(): string
{
    return $this->string('text.use_cases_label', 'Einsatzgebiete: ');
}
|
||||
|
||||
/**
 * Returns the languages label configured under `text.languages_label`.
 *
 * Falls back to 'Sprachen: ' when the key is absent or non-scalar.
 */
public function getLanguagesLabel(): string
{
    return $this->string('text.languages_label', 'Sprachen: ');
}
|
||||
|
||||
/**
 * Returns the join separator configured under `text.custom_field_join_separator`.
 *
 * Falls back to ' | ' when the key is absent or non-scalar.
 */
public function getCustomFieldJoinSeparator(): string
{
    return $this->string('text.custom_field_join_separator', ' | ');
}
|
||||
|
||||
/**
 * Returns the pattern configured under `description.empty_line_pattern`.
 *
 * Falls back to '/^[ \t]*\R/m' when the key is absent or non-scalar.
 * The configured value is returned as-is; it is not validated as a regex here.
 */
public function getDescriptionEmptyLinePattern(): string
{
    return $this->string('description.empty_line_pattern', '/^[ \t]*\R/m');
}
|
||||
|
||||
/**
 * Returns the pattern configured under `description.whitespace_cleanup_pattern`.
 *
 * Falls back to '/[ \t]{2,}/' when the key is absent or non-scalar.
 * The configured value is returned as-is; it is not validated as a regex here.
 */
public function getDescriptionWhitespaceCleanupPattern(): string
{
    return $this->string('description.whitespace_cleanup_pattern', '/[ \t]{2,}/');
}
|
||||
|
||||
/**
 * Returns the integer configured under `description.max_length`.
 *
 * Numeric values are cast to int and raised to at least 0; a missing or
 * non-numeric value yields 1500.
 */
public function getDescriptionMaxLength(): int
{
    return $this->int('description.max_length', 1500, 0);
}
|
||||
|
||||
/**
 * Returns the integer configured under `price.decimals`.
 *
 * Numeric values are cast to int and raised to at least 0; a missing or
 * non-numeric value yields 2.
 */
public function getPriceDecimals(): int
{
    return $this->int('price.decimals', 2, 0);
}
|
||||
|
||||
/**
 * Returns the decimal separator configured under `price.decimal_separator`.
 *
 * Falls back to ',' when the key is absent or non-scalar.
 */
public function getPriceDecimalSeparator(): string
{
    return $this->string('price.decimal_separator', ',');
}
|
||||
|
||||
/**
 * Returns the thousands separator configured under `price.thousands_separator`.
 *
 * Falls back to '.' when the key is absent or non-scalar.
 */
public function getPriceThousandsSeparator(): string
{
    return $this->string('price.thousands_separator', '.');
}
|
||||
|
||||
/**
 * Returns the price suffix configured under `price.suffix`.
 *
 * Falls back to ' €' when the key is absent or non-scalar. The value is not
 * trimmed, so a leading space in the configuration is preserved.
 */
public function getPriceSuffix(): string
{
    return $this->string('price.suffix', ' €');
}
|
||||
|
||||
/**
 * Builds a relative URL by prepending the configured prefix to $path.
 *
 * Leading slashes are stripped from $path first, so the result never starts
 * with a doubled slash when the default prefix '/' is in effect. The prefix
 * is read from `seo.relative_prefix`.
 */
public function buildRelativeSeoUrl(string $path): string
{
    $prefix = $this->string('seo.relative_prefix', '/');

    return $prefix . ltrim($path, '/');
}
|
||||
|
||||
/**
 * Returns the label configured under `highlight.available_label`.
 *
 * Falls back to 'Verfügbar' when the key is absent or non-scalar.
 */
public function getAvailableHighlightLabel(): string
{
    return $this->string('highlight.available_label', 'Verfügbar');
}
|
||||
|
||||
/**
 * Returns the label configured under `highlight.unavailable_label`.
 *
 * Falls back to 'Nicht verfügbar' when the key is absent or non-scalar.
 */
public function getUnavailableHighlightLabel(): string
{
    return $this->string('highlight.unavailable_label', 'Nicht verfügbar');
}
|
||||
|
||||
/**
 * Returns the prefix configured under `highlight.product_number_prefix`.
 *
 * Falls back to 'Produktnummer: ' when the key is absent or non-scalar.
 */
public function getProductNumberHighlightPrefix(): string
{
    return $this->string('highlight.product_number_prefix', 'Produktnummer: ');
}
|
||||
|
||||
/**
 * Returns the placeholder configured under `image.missing_placeholder`.
 *
 * Falls back to 'no-image' when the key is absent or non-scalar.
 */
public function getMissingProductImagePlaceholder(): string
{
    return $this->string('image.missing_placeholder', 'no-image');
}
|
||||
|
||||
/**
 * Returns the separator configured under `deduplication.separator`.
 *
 * Falls back to '|' when the key is absent or non-scalar.
 */
public function getDeduplicationSeparator(): string
{
    return $this->string('deduplication.separator', '|');
}
|
||||
}
|
||||
|
||||
/**
 * Reads an integer setting from the configuration tree.
 *
 * A numeric value (including the default returned for a missing path) is cast
 * to int and raised to at least $min. A value that is present but not numeric
 * yields $default unchanged, without clamping.
 */
private function int(string $path, int $default, int $min = PHP_INT_MIN): int
{
    $raw = $this->value($path, $default);

    return is_numeric($raw)
        ? max($min, (int) $raw)
        : $default;
}
|
||||
|
||||
/**
 * Reads a string setting from the configuration tree.
 *
 * Any scalar found at $path is cast to string without trimming; arrays,
 * objects and null yield $default instead.
 */
private function string(string $path, string $default): string
{
    $raw = $this->value($path, $default);

    return is_scalar($raw) ? (string) $raw : $default;
}
|
||||
|
||||
/**
 * Reads a list of strings from the configuration tree.
 *
 * Two modes, selected by $allowEmptyStrings:
 *  - false: scalar entries are cast to string, trimmed, emptied entries are
 *    dropped and duplicates removed; an empty result yields the fallback.
 *  - true: scalar entries are cast to string and kept verbatim (no trimming,
 *    no de-duplication); an empty result is returned as an empty array.
 *
 * Non-scalar entries are skipped in both modes. The fallback is
 * $emptySafeDefault when given, otherwise $default; it is also returned when
 * the configured value is not an array.
 *
 * @param string[] $default
 * @param string[]|null $emptySafeDefault
 * @return string[]
 */
private function stringList(string $path, array $default, bool $allowEmptyStrings = false, ?array $emptySafeDefault = null): array
{
    $fallback = $emptySafeDefault ?? $default;
    $raw = $this->value($path, $default);

    if (!is_array($raw)) {
        return $fallback;
    }

    if ($allowEmptyStrings) {
        // Verbatim mode: every scalar entry survives exactly as configured.
        $verbatim = [];
        foreach ($raw as $entry) {
            if (is_scalar($entry)) {
                $verbatim[] = (string) $entry;
            }
        }

        return $verbatim;
    }

    $unique = [];
    foreach ($raw as $entry) {
        if (!is_scalar($entry)) {
            continue;
        }

        $text = trim((string) $entry);
        if ($text !== '' && !in_array($text, $unique, true)) {
            $unique[] = $text;
        }
    }

    return $unique === [] ? $fallback : $unique;
}
|
||||
|
||||
/**
 * Reads a map of string lists from the configuration tree.
 *
 * Only entries with a string key and an array value are considered. Keys are
 * trimmed and skipped when empty; list items are cast to string, trimmed,
 * skipped when empty and de-duplicated. Keys whose list ends up empty are
 * omitted. If the configured value is not an array, or nothing survives the
 * cleanup, $default is returned.
 *
 * @param array<string, string[]> $default
 * @return array<string, string[]>
 */
private function stringListMap(string $path, array $default): array
{
    $raw = $this->value($path, $default);

    if (!is_array($raw)) {
        return $default;
    }

    $map = [];
    foreach ($raw as $name => $entries) {
        if (!is_string($name) || !is_array($entries)) {
            continue;
        }

        $label = trim($name);
        if ($label === '') {
            continue;
        }

        $unique = [];
        foreach ($entries as $entry) {
            if (!is_scalar($entry)) {
                continue;
            }

            $text = trim((string) $entry);
            if ($text !== '' && !in_array($text, $unique, true)) {
                $unique[] = $text;
            }
        }

        if ($unique !== []) {
            // A later key that trims to the same label replaces the earlier list.
            $map[$label] = $unique;
        }
    }

    return $map === [] ? $default : $map;
}
|
||||
|
||||
/**
 * Resolves a dot-separated path inside the configuration array.
 *
 * Each path segment must exist as a key of the array reached so far;
 * otherwise $default is returned. A key that exists with a null value counts
 * as present, so null can be returned.
 */
private function value(string $path, mixed $default): mixed
{
    $node = $this->config;

    foreach (explode('.', $path) as $key) {
        if (is_array($node) && array_key_exists($key, $node)) {
            $node = $node[$key];
            continue;
        }

        return $default;
    }

    return $node;
}
|
||||
}
|
||||
|
||||
@@ -14,27 +14,68 @@ final class StopWordsConfig
|
||||
* - keep question words
|
||||
* - keep domain terms
|
||||
* - remove only structural filler words
|
||||
*
|
||||
*/
|
||||
private const DEFAULT_STOP_WORDS = [
    // preposition
    'mit',
    // articles, including declined forms
    'der', 'die', 'das', 'ein', 'eine', 'einer', 'eines', 'den', 'dem', 'des',
    // conjunctions
    'und', 'oder', 'aber', 'sowie',
    // personal pronouns
    'ich', 'du', 'er', 'sie', 'es', 'wir', 'ihr',
    // filler and politeness words
    'halt', 'eben', 'auch', 'schon', 'noch', 'mal', 'bitte', 'danke', 'also', 'nun', 'tja',
    // temporal and spatial adverbs
    'dann', 'danach', 'davor', 'hier', 'dort', 'heute', 'gestern', 'morgen',
    // modal verb forms
    'könnte', 'kannst', 'kann', 'würde', 'würdest', 'würden',
];
|
||||
|
||||
/**
 * Stores the raw configuration array; defaults to an empty array.
 *
 * @param array<string, mixed> $config
 */
public function __construct(
    private array $config = [],
) {
}
|
||||
|
||||
/**
 * Returns the effective stop-word list.
 *
 * The list is read from the `words` configuration key and cleaned by
 * stringList(); DEFAULT_STOP_WORDS is used when that key is missing, is not
 * an array, or contains no usable entries.
 *
 * @return string[]
 */
public function getStopWords(): array
{
    return $this->stringList('words', self::DEFAULT_STOP_WORDS);
}
|
||||
}
|
||||
|
||||
/**
 * Reads a top-level configuration entry as a cleaned list of strings.
 *
 * Scalar items are cast to string and trimmed; empty items and duplicates
 * are dropped, non-scalar items are skipped. $default is returned when the
 * entry is missing or null, is not an array, or yields no items.
 *
 * @param string[] $default
 * @return string[]
 */
private function stringList(string $key, array $default): array
{
    $raw = $this->config[$key] ?? $default;

    if (!is_array($raw)) {
        return $default;
    }

    $unique = [];
    foreach ($raw as $entry) {
        if (!is_scalar($entry)) {
            continue;
        }

        $text = trim((string) $entry);
        if ($text !== '' && !in_array($text, $unique, true)) {
            $unique[] = $text;
        }
    }

    return $unique === [] ? $default : $unique;
}
|
||||
}
|
||||
|
||||
@@ -45,6 +45,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
private IntentRouteResolver $routeResolver,
|
||||
private EntityCatalogService $entityCatalogService,
|
||||
private QueryEnricher $queryEnricher,
|
||||
private NdjsonHybridRetrieverConfig $retrieverConfig,
|
||||
)
|
||||
{
|
||||
}
|
||||
@@ -211,7 +212,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
if ($exactDocumentMatch !== null) {
|
||||
$selectedChunkIds = $this->selectExactDocumentChunkIds(
|
||||
$exactDocumentMatch['rows'],
|
||||
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)),
|
||||
max(1, min($config->getRetrievalMaxChunks(), $this->retrieverConfig->hardMaxChunks())),
|
||||
$prompt
|
||||
);
|
||||
|
||||
@@ -310,8 +311,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
string $salesIntent
|
||||
): array
|
||||
{
|
||||
$limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS));
|
||||
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
|
||||
$limit = max(1, min($config->getRetrievalMaxChunks(), $this->retrieverConfig->hardMaxChunks()));
|
||||
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), $this->retrieverConfig->hardMaxVectorK()));
|
||||
|
||||
$isListQuery = $this->intentLite->isListQuery($prompt);
|
||||
|
||||
@@ -322,7 +323,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return [
|
||||
'limit' => $limit,
|
||||
'is_list_query' => $isListQuery,
|
||||
'threshold' => NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD,
|
||||
'threshold' => $this->retrieverConfig->vectorScoreThreshold(),
|
||||
'ranked_chunk_ids' => [],
|
||||
'rows' => [],
|
||||
'rrf_scores' => [],
|
||||
@@ -501,9 +502,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
*/
|
||||
private function computeKeywordTopK(int $vectorTopK): int
{
    // Scale the vector top-k by the multiplier from the injected retriever config, rounding up.
    $scaled = (int) ceil($vectorTopK * $this->retrieverConfig->keywordTopKMultiplier());

    // Clamp to the range [1, hardMaxKeywordK()].
    return max(1, min($scaled, $this->retrieverConfig->hardMaxKeywordK()));
}
|
||||
|
||||
/**
|
||||
@@ -520,7 +521,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
int $vectorTopKBase
|
||||
): array
|
||||
{
|
||||
$threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD;
|
||||
$threshold = $this->retrieverConfig->vectorScoreThreshold();
|
||||
$topK = $vectorTopKBase;
|
||||
|
||||
if (
|
||||
@@ -531,13 +532,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
if ($isListQuery) {
|
||||
$topK = (int)round($topK * NdjsonHybridRetrieverConfig::LIST_BONUS);
|
||||
$topK = (int)round($topK * $this->retrieverConfig->listBonus());
|
||||
}
|
||||
|
||||
$topK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
|
||||
$topK = max(1, min($topK, $this->retrieverConfig->hardMaxVectorK()));
|
||||
$threshold = max(
|
||||
NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR,
|
||||
min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold)
|
||||
$this->retrieverConfig->thresholdFloor(),
|
||||
min($this->retrieverConfig->thresholdCeil(), $threshold)
|
||||
);
|
||||
|
||||
return [$threshold, $topK];
|
||||
@@ -587,16 +588,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
$rank++;
|
||||
$rrf = (1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank)) * $weight;
|
||||
$rrf = (1.0 / ($this->retrieverConfig->rrfK() + $rank)) * $weight;
|
||||
|
||||
$rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
|
||||
}
|
||||
};
|
||||
|
||||
$apply($globalHits, $vectorThreshold, 1.0);
|
||||
$apply($scopedHits, $vectorThreshold, $boostScopedVector ? NdjsonHybridRetrieverConfig::SCOPED_VECTOR_RRF_WEIGHT : 1.0);
|
||||
$apply($keywordHits, NdjsonHybridRetrieverConfig::KEYWORD_SCORE_THRESHOLD, NdjsonHybridRetrieverConfig::KEYWORD_RRF_WEIGHT);
|
||||
$apply($scopedKeywordHits, NdjsonHybridRetrieverConfig::KEYWORD_SCORE_THRESHOLD, $boostScopedKeyword ? NdjsonHybridRetrieverConfig::SCOPED_KEYWORD_RRF_WEIGHT : NdjsonHybridRetrieverConfig::KEYWORD_RRF_WEIGHT);
|
||||
$apply($scopedHits, $vectorThreshold, $boostScopedVector ? $this->retrieverConfig->scopedVectorRrfWeight() : 1.0);
|
||||
$apply($keywordHits, $this->retrieverConfig->keywordScoreThreshold(), $this->retrieverConfig->keywordRrfWeight());
|
||||
$apply($scopedKeywordHits, $this->retrieverConfig->keywordScoreThreshold(), $boostScopedKeyword ? $this->retrieverConfig->scopedKeywordRrfWeight() : $this->retrieverConfig->keywordRrfWeight());
|
||||
|
||||
return [
|
||||
'rrf_scores' => $rrfScores,
|
||||
@@ -621,9 +622,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
$rank++;
|
||||
$rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
|
||||
$rrf[(string)$hit['chunk_id']] = 1.0 / ($this->retrieverConfig->rrfK() + $rank);
|
||||
|
||||
if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) {
|
||||
if ($rank >= $this->retrieverConfig->emptyRrfFallbackTopN()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -649,7 +650,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
private function selectExactDocumentChunkIds(array $rows, int $limit, string $prompt): array
|
||||
{
|
||||
$orderedRows = $this->sortRowsByChunkIndex($rows);
|
||||
$max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
|
||||
$max = min($limit, $this->retrieverConfig->exactDocumentMaxChunks());
|
||||
|
||||
if ($orderedRows === [] || $max <= 0) {
|
||||
return [];
|
||||
|
||||
@@ -14,9 +14,7 @@ final readonly class QueryEnricher
|
||||
* The enriched semantic query should help vector retrieval,
|
||||
* but must not become bloated enough to dilute the original user intent.
|
||||
*/
|
||||
private const MAX_EXPANSIONS = 4;
|
||||
|
||||
public function __construct(
|
||||
public function __construct(
|
||||
private QueryEnricherConfig $config
|
||||
) {
|
||||
}
|
||||
@@ -95,7 +93,7 @@ final readonly class QueryEnricher
|
||||
$matches[] = $mappedValue;
|
||||
$seenNormalizedExpansions[$normalizedMappedValue] = true;
|
||||
|
||||
if (count($matches) >= self::MAX_EXPANSIONS) {
|
||||
if (count($matches) >= $this->config->getMaxExpansions()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user