This commit is contained in:
team 1
2026-04-24 18:54:25 +02:00
parent 372a6797fa
commit c439fb99d6
12 changed files with 1126 additions and 336 deletions

View File

@@ -131,24 +131,24 @@ final class NdjsonHybridRetrieverConfig
'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung',
'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend',
'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
'welches', 'brauche', 'suche'
'welches', 'brauche', 'suche',
];
const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];
public const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];
const FAMILY_DESCRIPTOR_TOKEN = [
public const FAMILY_DESCRIPTOR_TOKEN = [
'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
'inline', 'compact', 'panel', 'sc',
];
const LOOKS_LIKE_REAGENT_TOKENS = [
public const LOOKS_LIKE_REAGENT_TOKENS = [
'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
'kerzenfilter', 'druckregler',
];
const LOOKS_LIKE_SAFETY_DOCS = [
public const LOOKS_LIKE_SAFETY_DOCS = [
'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
'kennzeichnung', 'h290', 'pbt', 'vpvb',
@@ -174,4 +174,309 @@ final class NdjsonHybridRetrieverConfig
'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
];
}
/**
 * Stores the raw override maps that the typed accessors of this class read from.
 *
 * @param array<string, mixed> $config     Primary overrides; a key present here takes precedence.
 * @param array<string, mixed> $vocabulary Secondary overrides, consulted only for keys that are
 *                                         absent from $config (kept so existing service wiring
 *                                         continues to work).
 */
public function __construct(
    private array $config = [],
    private array $vocabulary = [],
) {}
/**
 * Integer setting `hard_max_chunks`.
 *
 * Falls back to self::HARD_MAX_CHUNKS when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function hardMaxChunks(): int
{
    return $this->intValue(
        key: 'hard_max_chunks',
        default: self::HARD_MAX_CHUNKS,
        min: 1,
    );
}
/**
 * Integer setting `hard_max_vectork`.
 *
 * Falls back to self::HARD_MAX_VECTORK when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function hardMaxVectorK(): int
{
    return $this->intValue(
        key: 'hard_max_vectork',
        default: self::HARD_MAX_VECTORK,
        min: 1,
    );
}
/**
 * Integer setting `hard_max_keywordk`.
 *
 * Falls back to self::HARD_MAX_KEYWORDK when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function hardMaxKeywordK(): int
{
    return $this->intValue(
        key: 'hard_max_keywordk',
        default: self::HARD_MAX_KEYWORDK,
        min: 1,
    );
}
/**
 * Float setting `vector_score_threshold`.
 *
 * Falls back to self::VECTOR_SCORE_THRESHOLD when the key is unset or not numeric;
 * configured values are clamped to the range [0.0, 1.0].
 */
public function vectorScoreThreshold(): float
{
    return $this->floatValue(
        key: 'vector_score_threshold',
        default: self::VECTOR_SCORE_THRESHOLD,
        min: 0.0,
        max: 1.0,
    );
}
/**
 * Float setting `threshold_floor`.
 *
 * Falls back to self::THRESHOLD_FLOOR when the key is unset or not numeric;
 * configured values are clamped to the range [0.0, 1.0].
 */
public function thresholdFloor(): float
{
    return $this->floatValue(
        key: 'threshold_floor',
        default: self::THRESHOLD_FLOOR,
        min: 0.0,
        max: 1.0,
    );
}
/**
 * Float setting `threshold_ceil`.
 *
 * Falls back to self::THRESHOLD_CEIL when the key is unset or not numeric;
 * configured values are clamped to the range [0.0, 1.0].
 */
public function thresholdCeil(): float
{
    return $this->floatValue(
        key: 'threshold_ceil',
        default: self::THRESHOLD_CEIL,
        min: 0.0,
        max: 1.0,
    );
}
/**
 * Float setting `list_bonus`.
 *
 * Falls back to self::LIST_BONUS when the key is unset or not numeric;
 * configured values below 1.0 are raised to 1.0.
 */
public function listBonus(): float
{
    return $this->floatValue(
        key: 'list_bonus',
        default: self::LIST_BONUS,
        min: 1.0,
    );
}
/**
 * Integer setting `rrf_k`.
 *
 * Falls back to self::RRF_K when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function rrfK(): int
{
    return $this->intValue(
        key: 'rrf_k',
        default: self::RRF_K,
        min: 1,
    );
}
/**
 * Float setting `keyword_topk_multiplier`.
 *
 * Falls back to self::KEYWORD_TOPK_MULTIPLIER when the key is unset or not numeric;
 * configured values below 0.1 are raised to 0.1.
 */
public function keywordTopKMultiplier(): float
{
    return $this->floatValue(
        key: 'keyword_topk_multiplier',
        default: self::KEYWORD_TOPK_MULTIPLIER,
        min: 0.1,
    );
}
/**
 * Float setting `keyword_score_threshold`.
 *
 * Falls back to self::KEYWORD_SCORE_THRESHOLD when the key is unset or not numeric;
 * configured values are clamped to the range [0.0, 1.0].
 */
public function keywordScoreThreshold(): float
{
    return $this->floatValue(
        key: 'keyword_score_threshold',
        default: self::KEYWORD_SCORE_THRESHOLD,
        min: 0.0,
        max: 1.0,
    );
}
/**
 * Float setting `keyword_rrf_weight`.
 *
 * Falls back to self::KEYWORD_RRF_WEIGHT when the key is unset or not numeric;
 * negative configured values are raised to 0.0.
 */
public function keywordRrfWeight(): float
{
    return $this->floatValue(
        key: 'keyword_rrf_weight',
        default: self::KEYWORD_RRF_WEIGHT,
        min: 0.0,
    );
}
/**
 * Float setting `scoped_vector_rrf_weight`.
 *
 * Falls back to self::SCOPED_VECTOR_RRF_WEIGHT when the key is unset or not numeric;
 * negative configured values are raised to 0.0.
 */
public function scopedVectorRrfWeight(): float
{
    return $this->floatValue(
        key: 'scoped_vector_rrf_weight',
        default: self::SCOPED_VECTOR_RRF_WEIGHT,
        min: 0.0,
    );
}
/**
 * Float setting `scoped_keyword_rrf_weight`.
 *
 * Falls back to self::SCOPED_KEYWORD_RRF_WEIGHT when the key is unset or not numeric;
 * negative configured values are raised to 0.0.
 */
public function scopedKeywordRrfWeight(): float
{
    return $this->floatValue(
        key: 'scoped_keyword_rrf_weight',
        default: self::SCOPED_KEYWORD_RRF_WEIGHT,
        min: 0.0,
    );
}
/**
 * Integer setting `empty_rrf_fallback_topn`.
 *
 * Falls back to self::EMPTY_RRF_FALLBACK_TOPN when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function emptyRrfFallbackTopN(): int
{
    return $this->intValue(
        key: 'empty_rrf_fallback_topn',
        default: self::EMPTY_RRF_FALLBACK_TOPN,
        min: 1,
    );
}
/**
 * Integer setting `max_chunks_per_doc`.
 *
 * Falls back to self::MAX_CHUNKS_PER_DOC when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function maxChunksPerDoc(): int
{
    return $this->intValue(
        key: 'max_chunks_per_doc',
        default: self::MAX_CHUNKS_PER_DOC,
        min: 1,
    );
}
/**
 * Integer setting `min_chunk_distance`.
 *
 * Falls back to self::MIN_CHUNK_DISTANCE when the key is unset or not numeric;
 * negative configured values are raised to 0.
 */
public function minChunkDistance(): int
{
    return $this->intValue(
        key: 'min_chunk_distance',
        default: self::MIN_CHUNK_DISTANCE,
        min: 0,
    );
}
/**
 * Integer setting `dominant_doc_window`.
 *
 * Falls back to self::DOMINANT_DOC_WINDOW when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function dominantDocWindow(): int
{
    return $this->intValue(
        key: 'dominant_doc_window',
        default: self::DOMINANT_DOC_WINDOW,
        min: 1,
    );
}
/**
 * Integer setting `dominant_doc_min_hits`.
 *
 * Falls back to self::DOMINANT_DOC_MIN_HITS when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function dominantDocMinHits(): int
{
    return $this->intValue(
        key: 'dominant_doc_min_hits',
        default: self::DOMINANT_DOC_MIN_HITS,
        min: 1,
    );
}
/**
 * Integer setting `dominant_doc_max_chunks`.
 *
 * Falls back to self::DOMINANT_DOC_MAX_CHUNKS when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function dominantDocMaxChunks(): int
{
    return $this->intValue(
        key: 'dominant_doc_max_chunks',
        default: self::DOMINANT_DOC_MAX_CHUNKS,
        min: 1,
    );
}
/**
 * Integer setting `exact_document_max_chunks`.
 *
 * Falls back to self::EXACT_DOCUMENT_MAX_CHUNKS when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function exactDocumentMaxChunks(): int
{
    return $this->intValue(
        key: 'exact_document_max_chunks',
        default: self::EXACT_DOCUMENT_MAX_CHUNKS,
        min: 1,
    );
}
/**
 * Integer setting `focused_product_window`.
 *
 * Falls back to self::FOCUSED_PRODUCT_WINDOW when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function focusedProductWindow(): int
{
    return $this->intValue(
        key: 'focused_product_window',
        default: self::FOCUSED_PRODUCT_WINDOW,
        min: 1,
    );
}
/**
 * Float setting `focused_product_min_score`.
 *
 * Falls back to self::FOCUSED_PRODUCT_MIN_SCORE when the key is unset or not numeric;
 * negative configured values are raised to 0.0.
 */
public function focusedProductMinScore(): float
{
    return $this->floatValue(
        key: 'focused_product_min_score',
        default: self::FOCUSED_PRODUCT_MIN_SCORE,
        min: 0.0,
    );
}
/**
 * Float setting `focused_product_min_gap`.
 *
 * Falls back to self::FOCUSED_PRODUCT_MIN_GAP when the key is unset or not numeric;
 * negative configured values are raised to 0.0.
 */
public function focusedProductMinGap(): float
{
    return $this->floatValue(
        key: 'focused_product_min_gap',
        default: self::FOCUSED_PRODUCT_MIN_GAP,
        min: 0.0,
    );
}
/**
 * Integer setting `focused_product_max_chunks`.
 *
 * Falls back to self::FOCUSED_PRODUCT_MAX_CHUNKS when the key is unset or not numeric;
 * configured values below 1 are raised to 1.
 */
public function focusedProductMaxChunks(): int
{
    return $this->intValue(
        key: 'focused_product_max_chunks',
        default: self::FOCUSED_PRODUCT_MAX_CHUNKS,
        min: 1,
    );
}
/**
 * String-list setting `generic_product_tokens`.
 *
 * Entries are trimmed and de-duplicated. Falls back to self::GENERIC_PRODUCT_TOKEN
 * when the key is unset, is not an array, or yields no non-empty scalar entries.
 *
 * @return string[]
 */
public function genericProductTokens(): array
{
    return $this->stringList(
        key: 'generic_product_tokens',
        default: self::GENERIC_PRODUCT_TOKEN,
    );
}
/**
 * String-list setting `important_short_model_tokens`.
 *
 * Entries are trimmed and de-duplicated. Falls back to self::IMPORTANT_SHORT_MODEL_TOKEN
 * when the key is unset, is not an array, or yields no non-empty scalar entries.
 *
 * @return string[]
 */
public function importantShortModelTokens(): array
{
    return $this->stringList(
        key: 'important_short_model_tokens',
        default: self::IMPORTANT_SHORT_MODEL_TOKEN,
    );
}
/**
 * String-list setting `family_descriptor_tokens`.
 *
 * Entries are trimmed and de-duplicated. Falls back to self::FAMILY_DESCRIPTOR_TOKEN
 * when the key is unset, is not an array, or yields no non-empty scalar entries.
 *
 * @return string[]
 */
public function familyDescriptorTokens(): array
{
    return $this->stringList(
        key: 'family_descriptor_tokens',
        default: self::FAMILY_DESCRIPTOR_TOKEN,
    );
}
/**
 * String-list setting `looks_like_reagent_tokens`.
 *
 * Entries are trimmed and de-duplicated. Falls back to self::LOOKS_LIKE_REAGENT_TOKENS
 * when the key is unset, is not an array, or yields no non-empty scalar entries.
 *
 * @return string[]
 */
public function looksLikeReagentTokens(): array
{
    return $this->stringList(
        key: 'looks_like_reagent_tokens',
        default: self::LOOKS_LIKE_REAGENT_TOKENS,
    );
}
/**
 * String-list setting `looks_like_safety_docs`.
 *
 * Entries are trimmed and de-duplicated. Falls back to self::LOOKS_LIKE_SAFETY_DOCS
 * when the key is unset, is not an array, or yields no non-empty scalar entries.
 *
 * @return string[]
 */
public function looksLikeSafetyDocs(): array
{
    return $this->stringList(
        key: 'looks_like_safety_docs',
        default: self::LOOKS_LIKE_SAFETY_DOCS,
    );
}
/**
 * String-list setting `looks_like_reagent_words`.
 *
 * Entries are trimmed and de-duplicated. Falls back to self::LOOKS_LIKE_REAGENT_WORDS
 * when the key is unset, is not an array, or yields no non-empty scalar entries.
 *
 * @return string[]
 */
public function looksLikeReagentWords(): array
{
    return $this->stringList(
        key: 'looks_like_reagent_words',
        default: self::LOOKS_LIKE_REAGENT_WORDS,
    );
}
/**
 * String-list setting `looks_like_document_words`.
 *
 * Entries are trimmed and de-duplicated. Falls back to self::LOOKS_LIKE_DOCUMENT_WORDS
 * when the key is unset, is not an array, or yields no non-empty scalar entries.
 *
 * @return string[]
 */
public function looksLikeDocumentWords(): array
{
    return $this->stringList(
        key: 'looks_like_document_words',
        default: self::LOOKS_LIKE_DOCUMENT_WORDS,
    );
}
/**
 * String-list setting `looks_like_safety_words`.
 *
 * Entries are trimmed and de-duplicated. Falls back to self::LOOKS_LIKE_SAFETY_WORDS
 * when the key is unset, is not an array, or yields no non-empty scalar entries.
 *
 * @return string[]
 */
public function looksLikeSafetyWords(): array
{
    return $this->stringList(
        key: 'looks_like_safety_words',
        default: self::LOOKS_LIKE_SAFETY_WORDS,
    );
}
/**
 * String-list setting `looks_like_device_words`.
 *
 * Entries are trimmed and de-duplicated. Falls back to self::LOOKS_LIKE_DEVICE_WORDS
 * when the key is unset, is not an array, or yields no non-empty scalar entries.
 *
 * @return string[]
 */
public function looksLikeDeviceWords(): array
{
    return $this->stringList(
        key: 'looks_like_device_words',
        default: self::LOOKS_LIKE_DEVICE_WORDS,
    );
}
/**
 * Exports every effective setting, keyed by its configuration key.
 *
 * Each value is obtained through the matching public accessor, so the result
 * reflects defaults and clamping rather than the raw input maps. Numeric
 * settings come first, followed by the string-list settings.
 *
 * @return array<string, mixed>
 */
public function toArray(): array
{
    // Integer and float settings.
    $numericSettings = [
        'hard_max_chunks' => $this->hardMaxChunks(),
        'hard_max_vectork' => $this->hardMaxVectorK(),
        'hard_max_keywordk' => $this->hardMaxKeywordK(),
        'vector_score_threshold' => $this->vectorScoreThreshold(),
        'threshold_floor' => $this->thresholdFloor(),
        'threshold_ceil' => $this->thresholdCeil(),
        'list_bonus' => $this->listBonus(),
        'rrf_k' => $this->rrfK(),
        'keyword_topk_multiplier' => $this->keywordTopKMultiplier(),
        'keyword_score_threshold' => $this->keywordScoreThreshold(),
        'keyword_rrf_weight' => $this->keywordRrfWeight(),
        'scoped_vector_rrf_weight' => $this->scopedVectorRrfWeight(),
        'scoped_keyword_rrf_weight' => $this->scopedKeywordRrfWeight(),
        'empty_rrf_fallback_topn' => $this->emptyRrfFallbackTopN(),
        'max_chunks_per_doc' => $this->maxChunksPerDoc(),
        'min_chunk_distance' => $this->minChunkDistance(),
        'dominant_doc_window' => $this->dominantDocWindow(),
        'dominant_doc_min_hits' => $this->dominantDocMinHits(),
        'dominant_doc_max_chunks' => $this->dominantDocMaxChunks(),
        'exact_document_max_chunks' => $this->exactDocumentMaxChunks(),
        'focused_product_window' => $this->focusedProductWindow(),
        'focused_product_min_score' => $this->focusedProductMinScore(),
        'focused_product_min_gap' => $this->focusedProductMinGap(),
        'focused_product_max_chunks' => $this->focusedProductMaxChunks(),
    ];

    // String-list settings.
    $listSettings = [
        'generic_product_tokens' => $this->genericProductTokens(),
        'important_short_model_tokens' => $this->importantShortModelTokens(),
        'family_descriptor_tokens' => $this->familyDescriptorTokens(),
        'looks_like_reagent_tokens' => $this->looksLikeReagentTokens(),
        'looks_like_safety_docs' => $this->looksLikeSafetyDocs(),
        'looks_like_reagent_words' => $this->looksLikeReagentWords(),
        'looks_like_document_words' => $this->looksLikeDocumentWords(),
        'looks_like_safety_words' => $this->looksLikeSafetyWords(),
        'looks_like_device_words' => $this->looksLikeDeviceWords(),
    ];

    // The key sets are disjoint, so the union keeps every entry in declaration order.
    return $numericSettings + $listSettings;
}
/**
 * Reads an integer setting and clamps it to the given bounds.
 *
 * The raw value is looked up via raw(). $default is returned as-is (without
 * clamping) when the raw value is not numeric or is not a finite number.
 *
 * @param string   $key     Setting key to look up.
 * @param int      $default Value returned when the setting is missing or unusable.
 * @param int      $min     Lower bound applied to a usable value.
 * @param int|null $max     Optional upper bound applied to a usable value.
 */
private function intValue(string $key, int $default, int $min = PHP_INT_MIN, ?int $max = null): int
{
    $value = $this->raw($key, $default);
    if (!is_numeric($value)) {
        return $default;
    }

    // NAN/INF floats and overflowing numeric strings such as "1e999" pass
    // is_numeric() but become 0 when cast to int, which would then be clamped
    // to $min instead of falling back to the default.
    if (!is_finite((float) $value)) {
        return $default;
    }

    $value = max($min, (int) $value);

    return $max === null ? $value : min($max, $value);
}
/**
 * Reads a float setting and clamps it to the given bounds.
 *
 * The raw value is looked up via raw(). $default is returned as-is (without
 * clamping) when the raw value is not numeric or is NAN. Infinite values are
 * still accepted and clamped like any other number.
 *
 * @param string     $key     Setting key to look up.
 * @param float      $default Value returned when the setting is missing or unusable.
 * @param float      $min     Lower bound applied to a usable value.
 * @param float|null $max     Optional upper bound applied to a usable value.
 */
private function floatValue(string $key, float $default, float $min = -INF, ?float $max = null): float
{
    $value = $this->raw($key, $default);
    if (!is_numeric($value)) {
        return $default;
    }

    $value = (float) $value;

    // NAN passes is_numeric(), but every comparison against it is false, so
    // max()/min() clamping would give an argument-order-dependent result.
    if (is_nan($value)) {
        return $default;
    }

    $value = max($min, $value);

    return $max === null ? $value : min($max, $value);
}
/**
 * Reads a list-of-strings setting and normalises its entries.
 *
 * Non-scalar entries are dropped; the remaining ones are cast to string,
 * trimmed, stripped of empty strings and de-duplicated (first occurrence wins).
 * The result is re-indexed from 0. $default is returned when the raw value is
 * not an array or when nothing survives normalisation.
 *
 * @param string[] $default
 * @return string[]
 */
private function stringList(string $key, array $default): array
{
    $candidate = $this->raw($key, $default);
    if (!is_array($candidate)) {
        return $default;
    }

    $scalars = array_filter(
        $candidate,
        static fn (mixed $entry): bool => is_scalar($entry),
    );
    $trimmed = array_map(
        static fn (mixed $entry): string => trim((string) $entry),
        $scalars,
    );
    $nonEmpty = array_filter(
        $trimmed,
        static fn (string $entry): bool => $entry !== '',
    );

    // array_unique() compares string representations by default, which on
    // strings is an exact match, and it keeps the first occurrence.
    $normalised = array_values(array_unique($nonEmpty));

    if ($normalised === []) {
        return $default;
    }

    return $normalised;
}
/**
 * Looks up the unprocessed value stored for a key.
 *
 * $this->config is checked first, then $this->vocabulary. A key that exists
 * with a null value still counts as present. $default is returned only when
 * neither map contains the key.
 */
private function raw(string $key, mixed $default): mixed
{
    foreach ([$this->config, $this->vocabulary] as $source) {
        if (array_key_exists($key, $source)) {
            return $source[$key];
        }
    }

    return $default;
}
}