503 lines
17 KiB
PHP
503 lines
17 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Config;
|
|
|
|
/**
 * Typed accessors for the tuning parameters of the NDJSON hybrid retriever.
 *
 * Every accessor looks up an optional override in the injected $config array
 * and falls back to the matching class constant. Numeric overrides are clamped
 * to the bounds the accessor passes to intValue()/floatValue(); values that are
 * not numeric (or not finite) are ignored in favour of the default.
 *
 * Vocabulary lists are resolved in this order:
 *   1. explicit list under the accessor's key in $config,
 *   2. the optional DomainVocabularyConfig view,
 *   3. the built-in class constant.
 */
final class NdjsonHybridRetrieverConfig
{
    /**
     * Maximum number of chunks the retriever may finally hand to the model.
     *
     * Rationale:
     * - enough room for the stronger hybrid pipeline
     * - still conservative enough to avoid prompt bloat
     */
    public const HARD_MAX_CHUNKS = 6;

    /**
     * Hard upper bound for vector retrieval candidate size.
     *
     * Rationale:
     * - the pipeline now combines primary vector, secondary vector,
     *   lexical, scoped retrieval and re-ranking
     * - the old limit would constrain recall too early
     * - still capped to keep latency controlled
     */
    public const HARD_MAX_VECTORK = 18;

    /**
     * Default semantic score threshold for vector hits.
     *
     * Rationale:
     * - slightly relaxed compared to stricter pure-vector setups
     * - the system now has more safeguards:
     *   lexical cross-signals, scoped retrieval, title/meta boost, selection rules
     */
    public const VECTOR_SCORE_THRESHOLD = 0.83;

    /**
     * Lower safety boundary for dynamic threshold adjustments.
     *
     * Rationale:
     * - prevents the system from getting too noisy in fallback cases
     * - still allows recovery when exact signals are sparse
     */
    public const THRESHOLD_FLOOR = 0.75;

    /**
     * Upper safety boundary for dynamic threshold adjustments.
     *
     * Rationale:
     * - protects objection/pricing/list adjustments from becoming too strict
     * - keeps retrieval from collapsing into empty result sets too easily
     */
    public const THRESHOLD_CEIL = 0.90;

    /**
     * Additional candidate expansion factor for list-like prompts.
     *
     * Rationale:
     * - list requests benefit from wider candidate recall
     * - too high would create noise across multiple retrieval channels
     */
    public const LIST_BONUS = 1.35;

    /**
     * Reciprocal Rank Fusion constant.
     *
     * Rationale:
     * - keep rank importance meaningful
     * - but not so aggressive that one retrieval source dominates too hard
     */
    public const RRF_K = 50;

    /**
     * Keyword retrieval is fused with vector retrieval as a factual safety net.
     * It protects exact values, ranges, thresholds, model codes and domain terms
     * that semantic retrieval can miss or rank too low.
     */
    public const HARD_MAX_KEYWORDK = 36;
    public const KEYWORD_TOPK_MULTIPLIER = 2.0;
    public const KEYWORD_SCORE_THRESHOLD = 0.35;
    public const KEYWORD_RRF_WEIGHT = 1.15;
    public const SCOPED_VECTOR_RRF_WEIGHT = 1.20;
    public const SCOPED_KEYWORD_RRF_WEIGHT = 1.30;

    /**
     * Fallback size when thresholded fusion yields no candidates.
     *
     * Rationale:
     * - slightly larger safety net for the richer hybrid stack
     * - helps no-tag and low-signal cases without exploding context
     *
     * NOTE(review): the value is 1, which is also the lower bound enforced by
     * emptyRrfFallbackTopN(); confirm the "slightly larger" wording is still accurate.
     */
    public const EMPTY_RRF_FALLBACK_TOPN = 1;

    /**
     * Maximum number of chunks allowed from one document in spread mode.
     *
     * Rationale:
     * - preserve diversity across documents
     * - still allow coherent multi-chunk retrieval from strong sources
     */
    public const MAX_CHUNKS_PER_DOC = 2;

    /**
     * Minimum distance between chunk indices from the same document
     * during spread-style selection.
     *
     * Rationale:
     * - reduce near-duplicate neighboring chunks
     * - still allow relevant continuation when needed
     */
    public const MIN_CHUNK_DISTANCE = 2;

    /**
     * When one document clearly dominates the top-ranked window,
     * temporarily switch from "spread" mode to "dominant document" mode.
     */
    public const DOMINANT_DOC_WINDOW = 6;
    public const DOMINANT_DOC_MIN_HITS = 3;
    public const DOMINANT_DOC_MAX_CHUNKS = 4;
    public const EXACT_DOCUMENT_MAX_CHUNKS = 6;
    public const FOCUSED_PRODUCT_WINDOW = 8;
    public const FOCUSED_PRODUCT_MIN_SCORE = 10.0;
    public const FOCUSED_PRODUCT_MIN_GAP = 4.0;
    public const FOCUSED_PRODUCT_MAX_CHUNKS = 4;

    /** Built-in fallback list returned by genericProductTokens(). */
    public const GENERIC_PRODUCT_TOKEN = [
        'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät',
        'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte',
        'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung',
        'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend',
        'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
        'welches', 'brauche', 'suche',
    ];

    /** Built-in fallback list returned by importantShortModelTokens(). */
    public const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];

    /** Built-in fallback list returned by familyDescriptorTokens(). */
    public const FAMILY_DESCRIPTOR_TOKEN = [
        'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
        'inline', 'compact', 'panel', 'sc',
    ];

    /** Built-in fallback list returned by looksLikeReagentTokens(). */
    public const LOOKS_LIKE_REAGENT_TOKENS = [
        'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
        'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
        'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
        'kerzenfilter', 'druckregler',
    ];

    /** Built-in fallback list returned by looksLikeSafetyDocs(). */
    public const LOOKS_LIKE_SAFETY_DOCS = [
        'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
        'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
        'kennzeichnung', 'h290', 'pbt', 'vpvb',
    ];

    /** Built-in fallback list returned by looksLikeReagentWords(). */
    public const LOOKS_LIKE_REAGENT_WORDS = [
        'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb',
        'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde',
    ];

    /** Built-in fallback list returned by looksLikeDocumentWords(). */
    public const LOOKS_LIKE_DOCUMENT_WORDS = [
        'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung',
        'sdb', 'sicherheitsdatenblatt', 'msds',
    ];

    /** Built-in fallback list returned by looksLikeSafetyWords(). */
    public const LOOKS_LIKE_SAFETY_WORDS = [
        'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung',
        'transport', 'lagerung', 'piktogramm',
    ];

    /** Built-in fallback list returned by looksLikeDeviceWords(). */
    public const LOOKS_LIKE_DEVICE_WORDS = [
        'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat',
        'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
    ];

    /**
     * @param array<string, mixed>        $config     Optional overrides, keyed by the snake_case
     *                                                names used in toArray().
     * @param DomainVocabularyConfig|null $vocabulary Optional vocabulary source consulted for the
     *                                                token/word lists before the built-in constants.
     */
    public function __construct(
        private readonly array $config = [],
        private readonly ?DomainVocabularyConfig $vocabulary = null,
    ) {
    }

    /** Override key "hard_max_chunks"; never lower than 1. */
    public function hardMaxChunks(): int
    {
        return $this->intValue('hard_max_chunks', self::HARD_MAX_CHUNKS, 1);
    }

    /** Override key "hard_max_vectork"; never lower than 1. */
    public function hardMaxVectorK(): int
    {
        return $this->intValue('hard_max_vectork', self::HARD_MAX_VECTORK, 1);
    }

    /** Override key "hard_max_keywordk"; never lower than 1. */
    public function hardMaxKeywordK(): int
    {
        return $this->intValue('hard_max_keywordk', self::HARD_MAX_KEYWORDK, 1);
    }

    /** Override key "vector_score_threshold"; clamped to [0.0, 1.0]. */
    public function vectorScoreThreshold(): float
    {
        return $this->floatValue('vector_score_threshold', self::VECTOR_SCORE_THRESHOLD, 0.0, 1.0);
    }

    /**
     * Override key "threshold_floor"; clamped to [0.0, 1.0].
     *
     * The result is not cross-checked against thresholdCeil(); callers that need
     * floor <= ceil must verify it themselves.
     */
    public function thresholdFloor(): float
    {
        return $this->floatValue('threshold_floor', self::THRESHOLD_FLOOR, 0.0, 1.0);
    }

    /**
     * Override key "threshold_ceil"; clamped to [0.0, 1.0].
     *
     * The result is not cross-checked against thresholdFloor().
     */
    public function thresholdCeil(): float
    {
        return $this->floatValue('threshold_ceil', self::THRESHOLD_CEIL, 0.0, 1.0);
    }

    /** Override key "list_bonus"; never lower than 1.0. */
    public function listBonus(): float
    {
        return $this->floatValue('list_bonus', self::LIST_BONUS, 1.0);
    }

    /** Override key "rrf_k"; never lower than 1. */
    public function rrfK(): int
    {
        return $this->intValue('rrf_k', self::RRF_K, 1);
    }

    /** Override key "keyword_topk_multiplier"; never lower than 0.1. */
    public function keywordTopKMultiplier(): float
    {
        return $this->floatValue('keyword_topk_multiplier', self::KEYWORD_TOPK_MULTIPLIER, 0.1);
    }

    /** Override key "keyword_score_threshold"; clamped to [0.0, 1.0]. */
    public function keywordScoreThreshold(): float
    {
        return $this->floatValue('keyword_score_threshold', self::KEYWORD_SCORE_THRESHOLD, 0.0, 1.0);
    }

    /** Override key "keyword_rrf_weight"; never negative. */
    public function keywordRrfWeight(): float
    {
        return $this->floatValue('keyword_rrf_weight', self::KEYWORD_RRF_WEIGHT, 0.0);
    }

    /** Override key "scoped_vector_rrf_weight"; never negative. */
    public function scopedVectorRrfWeight(): float
    {
        return $this->floatValue('scoped_vector_rrf_weight', self::SCOPED_VECTOR_RRF_WEIGHT, 0.0);
    }

    /** Override key "scoped_keyword_rrf_weight"; never negative. */
    public function scopedKeywordRrfWeight(): float
    {
        return $this->floatValue('scoped_keyword_rrf_weight', self::SCOPED_KEYWORD_RRF_WEIGHT, 0.0);
    }

    /** Override key "empty_rrf_fallback_topn"; never lower than 1. */
    public function emptyRrfFallbackTopN(): int
    {
        return $this->intValue('empty_rrf_fallback_topn', self::EMPTY_RRF_FALLBACK_TOPN, 1);
    }

    /** Override key "max_chunks_per_doc"; never lower than 1. */
    public function maxChunksPerDoc(): int
    {
        return $this->intValue('max_chunks_per_doc', self::MAX_CHUNKS_PER_DOC, 1);
    }

    /** Override key "min_chunk_distance"; never negative (0 is allowed). */
    public function minChunkDistance(): int
    {
        return $this->intValue('min_chunk_distance', self::MIN_CHUNK_DISTANCE, 0);
    }

    /** Override key "dominant_doc_window"; never lower than 1. */
    public function dominantDocWindow(): int
    {
        return $this->intValue('dominant_doc_window', self::DOMINANT_DOC_WINDOW, 1);
    }

    /** Override key "dominant_doc_min_hits"; never lower than 1. */
    public function dominantDocMinHits(): int
    {
        return $this->intValue('dominant_doc_min_hits', self::DOMINANT_DOC_MIN_HITS, 1);
    }

    /** Override key "dominant_doc_max_chunks"; never lower than 1. */
    public function dominantDocMaxChunks(): int
    {
        return $this->intValue('dominant_doc_max_chunks', self::DOMINANT_DOC_MAX_CHUNKS, 1);
    }

    /** Override key "exact_document_max_chunks"; never lower than 1. */
    public function exactDocumentMaxChunks(): int
    {
        return $this->intValue('exact_document_max_chunks', self::EXACT_DOCUMENT_MAX_CHUNKS, 1);
    }

    /** Override key "focused_product_window"; never lower than 1. */
    public function focusedProductWindow(): int
    {
        return $this->intValue('focused_product_window', self::FOCUSED_PRODUCT_WINDOW, 1);
    }

    /** Override key "focused_product_min_score"; never negative. */
    public function focusedProductMinScore(): float
    {
        return $this->floatValue('focused_product_min_score', self::FOCUSED_PRODUCT_MIN_SCORE, 0.0);
    }

    /** Override key "focused_product_min_gap"; never negative. */
    public function focusedProductMinGap(): float
    {
        return $this->floatValue('focused_product_min_gap', self::FOCUSED_PRODUCT_MIN_GAP, 0.0);
    }

    /** Override key "focused_product_max_chunks"; never lower than 1. */
    public function focusedProductMaxChunks(): int
    {
        return $this->intValue('focused_product_max_chunks', self::FOCUSED_PRODUCT_MAX_CHUNKS, 1);
    }

    /** @return string[] */
    public function genericProductTokens(): array
    {
        return $this->stringList(
            'generic_product_tokens',
            $this->vocabularyView('retrieval.generic_product_tokens', self::GENERIC_PRODUCT_TOKEN),
        );
    }

    /** @return string[] */
    public function importantShortModelTokens(): array
    {
        return $this->stringList(
            'important_short_model_tokens',
            $this->vocabularyView('retrieval.important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN),
        );
    }

    /** @return string[] */
    public function familyDescriptorTokens(): array
    {
        return $this->stringList(
            'family_descriptor_tokens',
            $this->vocabularyView('retrieval.family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN),
        );
    }

    /** @return string[] */
    public function looksLikeReagentTokens(): array
    {
        return $this->stringList(
            'looks_like_reagent_tokens',
            $this->vocabularyView('retrieval.looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS),
        );
    }

    /** @return string[] */
    public function looksLikeSafetyDocs(): array
    {
        return $this->stringList(
            'looks_like_safety_docs',
            $this->vocabularyView('retrieval.looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS),
        );
    }

    /** @return string[] */
    public function looksLikeReagentWords(): array
    {
        return $this->stringList(
            'looks_like_reagent_words',
            $this->vocabularyView('retrieval.looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS),
        );
    }

    /** @return string[] */
    public function looksLikeDocumentWords(): array
    {
        return $this->stringList(
            'looks_like_document_words',
            $this->vocabularyView('retrieval.looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS),
        );
    }

    /** @return string[] */
    public function looksLikeSafetyWords(): array
    {
        return $this->stringList(
            'looks_like_safety_words',
            $this->vocabularyView('retrieval.looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS),
        );
    }

    /** @return string[] */
    public function looksLikeDeviceWords(): array
    {
        return $this->stringList(
            'looks_like_device_words',
            $this->vocabularyView('retrieval.looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS),
        );
    }

    /**
     * Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps.
     *
     * @return array<string, array<int, string>>
     */
    public function vocabularyToArray(): array
    {
        return [
            'generic_product_tokens' => $this->genericProductTokens(),
            'important_short_model_tokens' => $this->importantShortModelTokens(),
            'family_descriptor_tokens' => $this->familyDescriptorTokens(),
            'looks_like_reagent_tokens' => $this->looksLikeReagentTokens(),
            'looks_like_safety_docs' => $this->looksLikeSafetyDocs(),
            'looks_like_reagent_words' => $this->looksLikeReagentWords(),
            'looks_like_document_words' => $this->looksLikeDocumentWords(),
            'looks_like_safety_words' => $this->looksLikeSafetyWords(),
            'looks_like_device_words' => $this->looksLikeDeviceWords(),
        ];
    }

    /**
     * Effective configuration: all numeric settings followed by the vocabulary lists.
     *
     * @return array<string, mixed>
     */
    public function toArray(): array
    {
        return [
            'hard_max_chunks' => $this->hardMaxChunks(),
            'hard_max_vectork' => $this->hardMaxVectorK(),
            'hard_max_keywordk' => $this->hardMaxKeywordK(),
            'vector_score_threshold' => $this->vectorScoreThreshold(),
            'threshold_floor' => $this->thresholdFloor(),
            'threshold_ceil' => $this->thresholdCeil(),
            'list_bonus' => $this->listBonus(),
            'rrf_k' => $this->rrfK(),
            'keyword_topk_multiplier' => $this->keywordTopKMultiplier(),
            'keyword_score_threshold' => $this->keywordScoreThreshold(),
            'keyword_rrf_weight' => $this->keywordRrfWeight(),
            'scoped_vector_rrf_weight' => $this->scopedVectorRrfWeight(),
            'scoped_keyword_rrf_weight' => $this->scopedKeywordRrfWeight(),
            'empty_rrf_fallback_topn' => $this->emptyRrfFallbackTopN(),
            'max_chunks_per_doc' => $this->maxChunksPerDoc(),
            'min_chunk_distance' => $this->minChunkDistance(),
            'dominant_doc_window' => $this->dominantDocWindow(),
            'dominant_doc_min_hits' => $this->dominantDocMinHits(),
            'dominant_doc_max_chunks' => $this->dominantDocMaxChunks(),
            'exact_document_max_chunks' => $this->exactDocumentMaxChunks(),
            'focused_product_window' => $this->focusedProductWindow(),
            'focused_product_min_score' => $this->focusedProductMinScore(),
            'focused_product_min_gap' => $this->focusedProductMinGap(),
            'focused_product_max_chunks' => $this->focusedProductMaxChunks(),
            // Single source of truth for the vocabulary section; keys and order
            // are exactly those of vocabularyToArray().
            ...$this->vocabularyToArray(),
        ];
    }

    /**
     * Reads an integer setting and clamps it to [$min, $max].
     *
     * Returns $default when the configured value is not numeric or is a
     * non-finite float (NAN/INF). Fractional values are truncated by the int cast.
     *
     * @param string   $key     Key looked up in the injected config array.
     * @param int      $default Value used when the key is absent or unusable.
     * @param int      $min     Inclusive lower bound.
     * @param int|null $max     Inclusive upper bound, or null for no upper bound.
     */
    private function intValue(string $key, int $default, int $min = PHP_INT_MIN, ?int $max = null): int
    {
        $value = $this->raw($key, $default);

        if (!is_numeric($value)) {
            return $default;
        }

        // NAN and INF satisfy is_numeric() but have no meaningful integer value;
        // casting them would silently yield 0.
        if (is_float($value) && !is_finite($value)) {
            return $default;
        }

        $number = max($min, (int) $value);

        return $max !== null ? min($max, $number) : $number;
    }

    /**
     * Reads a float setting and clamps it to [$min, $max].
     *
     * Returns $default when the configured value is not numeric or does not
     * convert to a finite float (NAN/INF), because comparisons against NAN make
     * the max()/min() clamp unreliable and INF would escape an open upper bound.
     *
     * @param string     $key     Key looked up in the injected config array.
     * @param float      $default Value used when the key is absent or unusable.
     * @param float      $min     Inclusive lower bound.
     * @param float|null $max     Inclusive upper bound, or null for no upper bound.
     */
    private function floatValue(string $key, float $default, float $min = -INF, ?float $max = null): float
    {
        $value = $this->raw($key, $default);

        if (!is_numeric($value)) {
            return $default;
        }

        $number = (float) $value;

        if (!is_finite($number)) {
            return $default;
        }

        $number = max($min, $number);

        return $max !== null ? min($max, $number) : $number;
    }

    /**
     * Resolves a vocabulary list through the optional DomainVocabularyConfig.
     *
     * Returns $fallback when no vocabulary object was injected or when its
     * view() call yields null. What view() does with $path and $fallback is
     * defined by DomainVocabularyConfig, which is not part of this class.
     *
     * @param string[] $fallback
     *
     * @return string[]
     */
    private function vocabularyView(string $path, array $fallback): array
    {
        return $this->vocabulary?->view($path, $fallback) ?? $fallback;
    }

    /**
     * Reads a list setting and normalises it to unique, trimmed, non-empty strings.
     *
     * Non-scalar items are skipped, scalar items are cast to string and trimmed,
     * and duplicates are dropped while keeping first-seen order. Returns $default
     * unchanged when the configured value is not an array or when nothing usable
     * remains after normalisation.
     *
     * @param string[] $default
     *
     * @return string[]
     */
    private function stringList(string $key, array $default): array
    {
        $value = $this->raw($key, $default);

        if (!is_array($value)) {
            return $default;
        }

        $out = [];
        foreach ($value as $item) {
            if (!is_scalar($item)) {
                continue;
            }

            $item = trim((string) $item);
            if ($item === '' || in_array($item, $out, true)) {
                continue;
            }

            $out[] = $item;
        }

        return $out !== [] ? $out : $default;
    }

    /**
     * Returns the configured value for $key, or $default when the key is absent.
     *
     * array_key_exists() is used so that an explicit null in the config is
     * returned as-is instead of being treated as "not set".
     */
    private function raw(string $key, mixed $default): mixed
    {
        return array_key_exists($key, $this->config) ? $this->config[$key] : $default;
    }
}
|