This commit is contained in:
team 1
2026-04-30 20:10:56 +02:00
parent aa4aa009e4
commit 0442a24d4c
2 changed files with 137 additions and 234 deletions

View File

@@ -0,0 +1,60 @@
# RetrieX Patch 10 - NdjsonHybridRetrieverConfig YAML-only
## Ziel
Dieser Patch entfernt die verbliebenen PHP-Defaults aus `NdjsonHybridRetrieverConfig`.
Die Retrieval-Werte liegen bereits in `config/retriex/retrieval.yaml`. PHP liest diese Werte jetzt als Pflichtkonfiguration und wirft bei fehlenden oder ungültigen Werten klare Exceptions, statt still auf PHP-Konstanten zurückzufallen.
## Geändert
- `src/Config/NdjsonHybridRetrieverConfig.php`
## YAML-only
Umgestellt wurden:
- Retrieval-Limits und Thresholds
- RRF-/Keyword-Gewichte
- Dominant-/Focused-Document-Werte
- Retrieval-Tokenlisten:
  - `generic_product_tokens`
  - `important_short_model_tokens`
  - `family_descriptor_tokens`
  - `looks_like_reagent_tokens`
  - `looks_like_safety_docs`
  - `looks_like_reagent_words`
  - `looks_like_document_words`
  - `looks_like_safety_words`
  - `looks_like_device_words`
## Nicht geändert
- keine Änderung an `retrieval.yaml`
- keine Änderung an Retrieval-Algorithmus oder Scoring-Logik
- keine Änderung an PromptBuilder
- keine Änderung an Commerce/Shop
- keine Änderung an AgentRunner
- keine Änderung an SSE/Frontend
## Wichtig
Die YAML-Werte waren bereits vorhanden. Der Patch entfernt nur die PHP-Fallback-Ebene.
## Nach dem Einspielen prüfen
```bash
php bin/console cache:clear
php bin/console mto:agent:config:validate
php bin/console mto:agent:config:audit-source --details
php bin/console mto:agent:regression:test
```
Erwartung:
- `NdjsonHybridRetrieverConfig` taucht nicht mehr als `yaml_with_php_fallback` auf.
- Die Regression bleibt grün, insbesondere:
  - `important_short_model_token_th/tc/tp/tm/ph/rx`
  - `retrieval_reagent_word_indikator`
  - `retrieval_device_word_geraet`
  - Testomat-808-/Indikator-300-Baseline

View File

@@ -4,358 +4,192 @@ declare(strict_types=1);
namespace App\Config; namespace App\Config;
use InvalidArgumentException;
final class NdjsonHybridRetrieverConfig final class NdjsonHybridRetrieverConfig
{ {
/**
* Maximum number of chunks the retriever may finally hand to the model.
*
* Rationale:
* - enough room for the stronger hybrid pipeline
* - still conservative enough to avoid prompt bloat
*/
public const HARD_MAX_CHUNKS = 6;
/**
* Hard upper bound for vector retrieval candidate size.
*
* Rationale:
* - the pipeline now combines primary vector, secondary vector,
* lexical, scoped retrieval and re-ranking
* - the old limit would constrain recall too early
* - still capped to keep latency controlled
*/
public const HARD_MAX_VECTORK = 18;
/**
* Default semantic score threshold for vector hits.
*
* Rationale:
* - slightly relaxed compared to stricter pure-vector setups
* - the system now has more safeguards:
* lexical cross-signals, scoped retrieval, title/meta boost, selection rules
*/
public const VECTOR_SCORE_THRESHOLD = 0.83;
/**
* Lower safety boundary for dynamic threshold adjustments.
*
* Rationale:
* - prevents the system from getting too noisy in fallback cases
* - still allows recovery when exact signals are sparse
*/
public const THRESHOLD_FLOOR = 0.75;
/**
* Upper safety boundary for dynamic threshold adjustments.
*
* Rationale:
* - protects objection/pricing/list adjustments from becoming too strict
* - keeps retrieval from collapsing into empty result sets too easily
*/
public const THRESHOLD_CEIL = 0.90;
/**
* Additional candidate expansion factor for list-like prompts.
*
* Rationale:
* - list requests benefit from wider candidate recall
* - too high would create noise across multiple retrieval channels
*/
public const LIST_BONUS = 1.35;
/**
* Reciprocal Rank Fusion constant.
*
* Rationale:
* - keep rank importance meaningful
* - but not so aggressive that one retrieval source dominates too hard
*/
public const RRF_K = 50;
/**
* Keyword retrieval is fused with vector retrieval as a factual safety net.
* It protects exact values, ranges, thresholds, model codes and domain terms
* that semantic retrieval can miss or rank too low.
*/
public const HARD_MAX_KEYWORDK = 36;
public const KEYWORD_TOPK_MULTIPLIER = 2.0;
public const KEYWORD_SCORE_THRESHOLD = 0.35;
public const KEYWORD_RRF_WEIGHT = 1.15;
public const SCOPED_VECTOR_RRF_WEIGHT = 1.20;
public const SCOPED_KEYWORD_RRF_WEIGHT = 1.30;
/**
* Fallback size when thresholded fusion yields no candidates.
*
* Rationale:
* - slightly larger safety net for the richer hybrid stack
* - helps no-tag and low-signal cases without exploding context
*/
public const EMPTY_RRF_FALLBACK_TOPN = 1;
/**
* Maximum number of chunks allowed from one document in spread mode.
*
* Rationale:
* - preserve diversity across documents
* - still allow coherent multi-chunk retrieval from strong sources
*/
public const MAX_CHUNKS_PER_DOC = 2;
/**
* Minimum distance between chunk indices from the same document
* during spread-style selection.
*
* Rationale:
* - reduce near-duplicate neighboring chunks
* - still allow relevant continuation when needed
*/
public const MIN_CHUNK_DISTANCE = 2;
/**
* When one document clearly dominates the top-ranked window,
* temporarily switch from "spread" mode to "dominant document" mode.
*/
public const DOMINANT_DOC_WINDOW = 6;
public const DOMINANT_DOC_MIN_HITS = 3;
public const DOMINANT_DOC_MAX_CHUNKS = 4;
public const EXACT_DOCUMENT_MAX_CHUNKS = 6;
public const FOCUSED_PRODUCT_WINDOW = 8;
public const FOCUSED_PRODUCT_MIN_SCORE = 10.0;
public const FOCUSED_PRODUCT_MIN_GAP = 4.0;
public const FOCUSED_PRODUCT_MAX_CHUNKS = 4;
public const GENERIC_PRODUCT_TOKEN = [
'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät',
'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte',
'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung',
'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend',
'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
'welches', 'brauche', 'suche',
];
public const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];
public const FAMILY_DESCRIPTOR_TOKEN = [
'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
'inline', 'compact', 'panel', 'sc',
];
public const LOOKS_LIKE_REAGENT_TOKENS = [
'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
'kerzenfilter', 'druckregler',
];
public const LOOKS_LIKE_SAFETY_DOCS = [
'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
'kennzeichnung', 'h290', 'pbt', 'vpvb',
];
public const LOOKS_LIKE_REAGENT_WORDS = [
'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb',
'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde',
];
public const LOOKS_LIKE_DOCUMENT_WORDS = [
'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung',
'sdb', 'sicherheitsdatenblatt', 'msds',
];
public const LOOKS_LIKE_SAFETY_WORDS = [
'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung',
'transport', 'lagerung', 'piktogramm',
];
public const LOOKS_LIKE_DEVICE_WORDS = [
'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat',
'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
];
/** /**
* @param array<string, mixed> $config * @param array<string, mixed> $config
*/ */
public function __construct( public function __construct(
private array $config = [], private array $config = [],
private readonly ?DomainVocabularyConfig $vocabulary = null,
) { ) {
} }
public function hardMaxChunks(): int public function hardMaxChunks(): int
{ {
return $this->intValue('hard_max_chunks', self::HARD_MAX_CHUNKS, 1); return $this->requiredInt('hard_max_chunks', 1);
} }
public function hardMaxVectorK(): int public function hardMaxVectorK(): int
{ {
return $this->intValue('hard_max_vectork', self::HARD_MAX_VECTORK, 1); return $this->requiredInt('hard_max_vectork', 1);
} }
public function hardMaxKeywordK(): int public function hardMaxKeywordK(): int
{ {
return $this->intValue('hard_max_keywordk', self::HARD_MAX_KEYWORDK, 1); return $this->requiredInt('hard_max_keywordk', 1);
} }
public function vectorScoreThreshold(): float public function vectorScoreThreshold(): float
{ {
return $this->floatValue('vector_score_threshold', self::VECTOR_SCORE_THRESHOLD, 0.0, 1.0); return $this->requiredFloat('vector_score_threshold', 0.0, 1.0);
} }
public function thresholdFloor(): float public function thresholdFloor(): float
{ {
return $this->floatValue('threshold_floor', self::THRESHOLD_FLOOR, 0.0, 1.0); return $this->requiredFloat('threshold_floor', 0.0, 1.0);
} }
public function thresholdCeil(): float public function thresholdCeil(): float
{ {
return $this->floatValue('threshold_ceil', self::THRESHOLD_CEIL, 0.0, 1.0); return $this->requiredFloat('threshold_ceil', 0.0, 1.0);
} }
public function listBonus(): float public function listBonus(): float
{ {
return $this->floatValue('list_bonus', self::LIST_BONUS, 1.0); return $this->requiredFloat('list_bonus', 1.0);
} }
public function rrfK(): int public function rrfK(): int
{ {
return $this->intValue('rrf_k', self::RRF_K, 1); return $this->requiredInt('rrf_k', 1);
} }
public function keywordTopKMultiplier(): float public function keywordTopKMultiplier(): float
{ {
return $this->floatValue('keyword_topk_multiplier', self::KEYWORD_TOPK_MULTIPLIER, 0.1); return $this->requiredFloat('keyword_topk_multiplier', 0.1);
} }
public function keywordScoreThreshold(): float public function keywordScoreThreshold(): float
{ {
return $this->floatValue('keyword_score_threshold', self::KEYWORD_SCORE_THRESHOLD, 0.0, 1.0); return $this->requiredFloat('keyword_score_threshold', 0.0, 1.0);
} }
public function keywordRrfWeight(): float public function keywordRrfWeight(): float
{ {
return $this->floatValue('keyword_rrf_weight', self::KEYWORD_RRF_WEIGHT, 0.0); return $this->requiredFloat('keyword_rrf_weight', 0.0);
} }
public function scopedVectorRrfWeight(): float public function scopedVectorRrfWeight(): float
{ {
return $this->floatValue('scoped_vector_rrf_weight', self::SCOPED_VECTOR_RRF_WEIGHT, 0.0); return $this->requiredFloat('scoped_vector_rrf_weight', 0.0);
} }
public function scopedKeywordRrfWeight(): float public function scopedKeywordRrfWeight(): float
{ {
return $this->floatValue('scoped_keyword_rrf_weight', self::SCOPED_KEYWORD_RRF_WEIGHT, 0.0); return $this->requiredFloat('scoped_keyword_rrf_weight', 0.0);
} }
public function emptyRrfFallbackTopN(): int public function emptyRrfFallbackTopN(): int
{ {
return $this->intValue('empty_rrf_fallback_topn', self::EMPTY_RRF_FALLBACK_TOPN, 1); return $this->requiredInt('empty_rrf_fallback_topn', 1);
} }
public function maxChunksPerDoc(): int public function maxChunksPerDoc(): int
{ {
return $this->intValue('max_chunks_per_doc', self::MAX_CHUNKS_PER_DOC, 1); return $this->requiredInt('max_chunks_per_doc', 1);
} }
public function minChunkDistance(): int public function minChunkDistance(): int
{ {
return $this->intValue('min_chunk_distance', self::MIN_CHUNK_DISTANCE, 0); return $this->requiredInt('min_chunk_distance', 0);
} }
public function dominantDocWindow(): int public function dominantDocWindow(): int
{ {
return $this->intValue('dominant_doc_window', self::DOMINANT_DOC_WINDOW, 1); return $this->requiredInt('dominant_doc_window', 1);
} }
public function dominantDocMinHits(): int public function dominantDocMinHits(): int
{ {
return $this->intValue('dominant_doc_min_hits', self::DOMINANT_DOC_MIN_HITS, 1); return $this->requiredInt('dominant_doc_min_hits', 1);
} }
public function dominantDocMaxChunks(): int public function dominantDocMaxChunks(): int
{ {
return $this->intValue('dominant_doc_max_chunks', self::DOMINANT_DOC_MAX_CHUNKS, 1); return $this->requiredInt('dominant_doc_max_chunks', 1);
} }
public function exactDocumentMaxChunks(): int public function exactDocumentMaxChunks(): int
{ {
return $this->intValue('exact_document_max_chunks', self::EXACT_DOCUMENT_MAX_CHUNKS, 1); return $this->requiredInt('exact_document_max_chunks', 1);
} }
public function focusedProductWindow(): int public function focusedProductWindow(): int
{ {
return $this->intValue('focused_product_window', self::FOCUSED_PRODUCT_WINDOW, 1); return $this->requiredInt('focused_product_window', 1);
} }
public function focusedProductMinScore(): float public function focusedProductMinScore(): float
{ {
return $this->floatValue('focused_product_min_score', self::FOCUSED_PRODUCT_MIN_SCORE, 0.0); return $this->requiredFloat('focused_product_min_score', 0.0);
} }
public function focusedProductMinGap(): float public function focusedProductMinGap(): float
{ {
return $this->floatValue('focused_product_min_gap', self::FOCUSED_PRODUCT_MIN_GAP, 0.0); return $this->requiredFloat('focused_product_min_gap', 0.0);
} }
public function focusedProductMaxChunks(): int public function focusedProductMaxChunks(): int
{ {
return $this->intValue('focused_product_max_chunks', self::FOCUSED_PRODUCT_MAX_CHUNKS, 1); return $this->requiredInt('focused_product_max_chunks', 1);
} }
/** @return string[] */ /** @return string[] */
public function genericProductTokens(): array public function genericProductTokens(): array
{ {
return $this->stringList('generic_product_tokens', $this->vocabularyView('retrieval.generic_product_tokens', self::GENERIC_PRODUCT_TOKEN)); return $this->requiredStringList('generic_product_tokens');
} }
/** @return string[] */ /** @return string[] */
public function importantShortModelTokens(): array public function importantShortModelTokens(): array
{ {
return $this->stringList('important_short_model_tokens', $this->vocabularyView('retrieval.important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN)); return $this->requiredStringList('important_short_model_tokens');
} }
/** @return string[] */ /** @return string[] */
public function familyDescriptorTokens(): array public function familyDescriptorTokens(): array
{ {
return $this->stringList('family_descriptor_tokens', $this->vocabularyView('retrieval.family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN)); return $this->requiredStringList('family_descriptor_tokens');
} }
/** @return string[] */ /** @return string[] */
public function looksLikeReagentTokens(): array public function looksLikeReagentTokens(): array
{ {
return $this->stringList('looks_like_reagent_tokens', $this->vocabularyView('retrieval.looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS)); return $this->requiredStringList('looks_like_reagent_tokens');
} }
/** @return string[] */ /** @return string[] */
public function looksLikeSafetyDocs(): array public function looksLikeSafetyDocs(): array
{ {
return $this->stringList('looks_like_safety_docs', $this->vocabularyView('retrieval.looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS)); return $this->requiredStringList('looks_like_safety_docs');
} }
/** @return string[] */ /** @return string[] */
public function looksLikeReagentWords(): array public function looksLikeReagentWords(): array
{ {
return $this->stringList('looks_like_reagent_words', $this->vocabularyView('retrieval.looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS)); return $this->requiredStringList('looks_like_reagent_words');
} }
/** @return string[] */ /** @return string[] */
public function looksLikeDocumentWords(): array public function looksLikeDocumentWords(): array
{ {
return $this->stringList('looks_like_document_words', $this->vocabularyView('retrieval.looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS)); return $this->requiredStringList('looks_like_document_words');
} }
/** @return string[] */ /** @return string[] */
public function looksLikeSafetyWords(): array public function looksLikeSafetyWords(): array
{ {
return $this->stringList('looks_like_safety_words', $this->vocabularyView('retrieval.looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS)); return $this->requiredStringList('looks_like_safety_words');
} }
/** @return string[] */ /** @return string[] */
public function looksLikeDeviceWords(): array public function looksLikeDeviceWords(): array
{ {
return $this->stringList('looks_like_device_words', $this->vocabularyView('retrieval.looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS)); return $this->requiredStringList('looks_like_device_words');
} }
/** /**
* Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps. * Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps.
* *
@@ -418,58 +252,53 @@ final class NdjsonHybridRetrieverConfig
]; ];
} }
/**
 * Reads a mandatory integer setting and validates it against the allowed range.
 *
 * Whole-number input in any numeric form (18, "18", 18.0) is accepted.
 * Fractional input is rejected rather than truncated.
 *
 * @param string   $key Key looked up in the retrieval config array.
 * @param int      $min Smallest accepted value (inclusive).
 * @param int|null $max Largest accepted value (inclusive); null disables the upper bound.
 *
 * @throws InvalidArgumentException When the key is missing, not numeric,
 *                                  not a whole number, or outside [$min, $max].
 */
private function requiredInt(string $key, int $min = PHP_INT_MIN, ?int $max = null): int
{
    $value = $this->requiredValue($key);

    if (!is_numeric($value)) {
        throw $this->invalid($key, 'must be numeric');
    }

    // is_numeric() also accepts 2.7 or "1.5"; a bare (int) cast would silently
    // truncate those, so anything that is not a whole number is rejected here.
    if (!is_int($value)) {
        $asFloat = (float) $value;

        if (!is_finite($asFloat) || floor($asFloat) !== $asFloat) {
            throw $this->invalid($key, 'must be an integer');
        }
    }

    $value = (int) $value;

    if ($value < $min) {
        throw $this->invalid($key, sprintf('must be greater than or equal to %d', $min));
    }

    if ($max !== null && $value > $max) {
        throw $this->invalid($key, sprintf('must be less than or equal to %d', $max));
    }

    return $value;
}
/**
 * Reads a mandatory floating-point setting and validates it against the allowed range.
 *
 * @param string     $key Key looked up in the retrieval config array.
 * @param float      $min Smallest accepted value (inclusive).
 * @param float|null $max Largest accepted value (inclusive); null disables the upper bound.
 *
 * @throws InvalidArgumentException When the key is missing, not numeric, NaN,
 *                                  or outside [$min, $max].
 */
private function requiredFloat(string $key, float $min = -INF, ?float $max = null): float
{
    $value = $this->requiredValue($key);

    if (!is_numeric($value)) {
        throw $this->invalid($key, 'must be numeric');
    }

    $value = (float) $value;

    // NaN passes is_numeric() and compares false against every bound, so
    // without this guard it would slip through both range checks below.
    if (is_nan($value)) {
        throw $this->invalid($key, 'must be numeric');
    }

    if ($value < $min) {
        throw $this->invalid($key, sprintf('must be greater than or equal to %s', (string) $min));
    }

    if ($max !== null && $value > $max) {
        throw $this->invalid($key, sprintf('must be less than or equal to %s', (string) $max));
    }

    return $value;
}
/**
* @param string[] $default
* @return string[]
*/
/** @return string[] */ /** @return string[] */
private function vocabularyView(string $path, array $fallback): array private function requiredStringList(string $key): array
{ {
return $this->vocabulary?->view($path, $fallback) ?? $fallback; $value = $this->requiredValue($key);
}
private function stringList(string $key, array $default): array
{
$value = $this->raw($key, $default);
if (!is_array($value)) { if (!is_array($value)) {
return $default; throw $this->invalid($key, 'must be a list of non-empty strings');
} }
$out = []; $out = [];
@@ -488,15 +317,29 @@ final class NdjsonHybridRetrieverConfig
} }
} }
return $out !== [] ? $out : $default; if ($out === []) {
throw $this->invalid($key, 'must contain at least one non-empty string');
} }
private function raw(string $key, mixed $default): mixed return $out;
}
private function requiredValue(string $key): mixed
{ {
if (array_key_exists($key, $this->config)) { if (!array_key_exists($key, $this->config)) {
throw $this->missing($key);
}
return $this->config[$key]; return $this->config[$key];
} }
return $default; private function missing(string $key): InvalidArgumentException
{
return new InvalidArgumentException(sprintf('RetrieX retrieval config "%s" is missing.', $key));
}
private function invalid(string $key, string $reason): InvalidArgumentException
{
return new InvalidArgumentException(sprintf('RetrieX retrieval config "%s" %s.', $key, $reason));
} }
} }