diff --git a/RETRIEX_PATCH_10_RETRIEVAL_CONFIG_YAML_ONLY_README.md b/RETRIEX_PATCH_10_RETRIEVAL_CONFIG_YAML_ONLY_README.md new file mode 100644 index 0000000..3c37ef7 --- /dev/null +++ b/RETRIEX_PATCH_10_RETRIEVAL_CONFIG_YAML_ONLY_README.md @@ -0,0 +1,60 @@ +# RetrieX Patch 10 - NdjsonHybridRetrieverConfig YAML-only + +## Ziel + +Dieser Patch entfernt die verbliebenen PHP-Defaults aus `NdjsonHybridRetrieverConfig`. + +Die Retrieval-Werte liegen bereits in `config/retriex/retrieval.yaml`. PHP liest diese Werte jetzt als Pflichtkonfiguration und wirft bei fehlenden oder ungültigen Werten klare Exceptions, statt still auf PHP-Konstanten zurückzufallen. + +## Geändert + +- `src/Config/NdjsonHybridRetrieverConfig.php` + +## YAML-only + +Umgestellt wurden: + +- Retrieval-Limits und Thresholds +- RRF-/Keyword-Gewichte +- Dominant-Document-, Exact-Document- und Focused-Product-Werte +- Retrieval-Tokenlisten: + - `generic_product_tokens` + - `important_short_model_tokens` + - `family_descriptor_tokens` + - `looks_like_reagent_tokens` + - `looks_like_safety_docs` + - `looks_like_reagent_words` + - `looks_like_document_words` + - `looks_like_safety_words` + - `looks_like_device_words` + +## Nicht geändert + +- keine Änderung an `retrieval.yaml` +- keine Änderung an Retrieval-Algorithmus oder Scoring-Logik +- keine Änderung an PromptBuilder +- keine Änderung an Commerce/Shop +- keine Änderung an AgentRunner +- keine Änderung an SSE/Frontend + +## Wichtig + +Die YAML-Werte waren bereits vorhanden. Der Patch entfernt nur die Fallback-Ebene in PHP: die Klassenkonstanten sowie den optionalen Konstruktor-Parameter `$vocabulary` (`DomainVocabularyConfig`), über den die Tokenlisten bisher als Zwischen-Fallback aufgelöst wurden. Service-Definitionen, die dieses zweite Konstruktor-Argument explizit übergeben, sollten nach dem Einspielen geprüft werden. + +## Nach dem Einspielen prüfen + +```bash +php bin/console cache:clear +php bin/console mto:agent:config:validate +php bin/console mto:agent:config:audit-source --details +php bin/console mto:agent:regression:test +``` + +Erwartung: + +- `NdjsonHybridRetrieverConfig` taucht nicht mehr als `yaml_with_php_fallback` auf. 
+- Die Regression bleibt grün, insbesondere: + - `important_short_model_token_th/tc/tp/tm/ph/rx` + - `retrieval_reagent_word_indikator` + - `retrieval_device_word_geraet` + - Testomat-808-/Indikator-300-Baseline diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index ce85e6d..e72d1b5 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -4,358 +4,192 @@ declare(strict_types=1); namespace App\Config; +use InvalidArgumentException; + final class NdjsonHybridRetrieverConfig { - /** - * Maximum number of chunks the retriever may finally hand to the model. - * - * Rationale: - * - enough room for the stronger hybrid pipeline - * - still conservative enough to avoid prompt bloat - */ - public const HARD_MAX_CHUNKS = 6; - - /** - * Hard upper bound for vector retrieval candidate size. - * - * Rationale: - * - the pipeline now combines primary vector, secondary vector, - * lexical, scoped retrieval and re-ranking - * - the old limit would constrain recall too early - * - still capped to keep latency controlled - */ - public const HARD_MAX_VECTORK = 18; - - /** - * Default semantic score threshold for vector hits. - * - * Rationale: - * - slightly relaxed compared to stricter pure-vector setups - * - the system now has more safeguards: - * lexical cross-signals, scoped retrieval, title/meta boost, selection rules - */ - public const VECTOR_SCORE_THRESHOLD = 0.83; - - /** - * Lower safety boundary for dynamic threshold adjustments. - * - * Rationale: - * - prevents the system from getting too noisy in fallback cases - * - still allows recovery when exact signals are sparse - */ - public const THRESHOLD_FLOOR = 0.75; - - /** - * Upper safety boundary for dynamic threshold adjustments. 
- * - * Rationale: - * - protects objection/pricing/list adjustments from becoming too strict - * - keeps retrieval from collapsing into empty result sets too easily - */ - public const THRESHOLD_CEIL = 0.90; - - /** - * Additional candidate expansion factor for list-like prompts. - * - * Rationale: - * - list requests benefit from wider candidate recall - * - too high would create noise across multiple retrieval channels - */ - public const LIST_BONUS = 1.35; - - /** - * Reciprocal Rank Fusion constant. - * - * Rationale: - * - keep rank importance meaningful - * - but not so aggressive that one retrieval source dominates too hard - */ - public const RRF_K = 50; - - /** - * Keyword retrieval is fused with vector retrieval as a factual safety net. - * It protects exact values, ranges, thresholds, model codes and domain terms - * that semantic retrieval can miss or rank too low. - */ - public const HARD_MAX_KEYWORDK = 36; - public const KEYWORD_TOPK_MULTIPLIER = 2.0; - public const KEYWORD_SCORE_THRESHOLD = 0.35; - public const KEYWORD_RRF_WEIGHT = 1.15; - public const SCOPED_VECTOR_RRF_WEIGHT = 1.20; - public const SCOPED_KEYWORD_RRF_WEIGHT = 1.30; - - /** - * Fallback size when thresholded fusion yields no candidates. - * - * Rationale: - * - slightly larger safety net for the richer hybrid stack - * - helps no-tag and low-signal cases without exploding context - */ - public const EMPTY_RRF_FALLBACK_TOPN = 1; - - /** - * Maximum number of chunks allowed from one document in spread mode. - * - * Rationale: - * - preserve diversity across documents - * - still allow coherent multi-chunk retrieval from strong sources - */ - public const MAX_CHUNKS_PER_DOC = 2; - - /** - * Minimum distance between chunk indices from the same document - * during spread-style selection. 
- * - * Rationale: - * - reduce near-duplicate neighboring chunks - * - still allow relevant continuation when needed - */ - public const MIN_CHUNK_DISTANCE = 2; - - /** - * When one document clearly dominates the top-ranked window, - * temporarily switch from "spread" mode to "dominant document" mode. - */ - public const DOMINANT_DOC_WINDOW = 6; - public const DOMINANT_DOC_MIN_HITS = 3; - public const DOMINANT_DOC_MAX_CHUNKS = 4; - public const EXACT_DOCUMENT_MAX_CHUNKS = 6; - public const FOCUSED_PRODUCT_WINDOW = 8; - public const FOCUSED_PRODUCT_MIN_SCORE = 10.0; - public const FOCUSED_PRODUCT_MIN_GAP = 4.0; - public const FOCUSED_PRODUCT_MAX_CHUNKS = 4; - - public const GENERIC_PRODUCT_TOKEN = [ - 'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät', - 'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte', - 'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung', - 'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend', - 'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher', - 'welches', 'brauche', 'suche', - ]; - - public const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx']; - - public const FAMILY_DESCRIPTOR_TOKEN = [ - 'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab', - 'inline', 'compact', 'panel', 'sc', - ]; - - public const LOOKS_LIKE_REAGENT_TOKENS = [ - 'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie', - 'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche', - 'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz', - 'kerzenfilter', 'druckregler', - ]; - - public const LOOKS_LIKE_SAFETY_DOCS = [ - 'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung', - 'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp', - 'kennzeichnung', 'h290', 'pbt', 'vpvb', - ]; - - public const LOOKS_LIKE_REAGENT_WORDS = [ - 'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb', - 
'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde', - ]; - - public const LOOKS_LIKE_DOCUMENT_WORDS = [ - 'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung', - 'sdb', 'sicherheitsdatenblatt', 'msds', - ]; - - public const LOOKS_LIKE_SAFETY_WORDS = [ - 'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung', - 'transport', 'lagerung', 'piktogramm', - ]; - - public const LOOKS_LIKE_DEVICE_WORDS = [ - 'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat', - 'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor', - ]; - /** * @param array $config */ public function __construct( private array $config = [], - private readonly ?DomainVocabularyConfig $vocabulary = null, ) { } public function hardMaxChunks(): int { - return $this->intValue('hard_max_chunks', self::HARD_MAX_CHUNKS, 1); + return $this->requiredInt('hard_max_chunks', 1); } public function hardMaxVectorK(): int { - return $this->intValue('hard_max_vectork', self::HARD_MAX_VECTORK, 1); + return $this->requiredInt('hard_max_vectork', 1); } public function hardMaxKeywordK(): int { - return $this->intValue('hard_max_keywordk', self::HARD_MAX_KEYWORDK, 1); + return $this->requiredInt('hard_max_keywordk', 1); } public function vectorScoreThreshold(): float { - return $this->floatValue('vector_score_threshold', self::VECTOR_SCORE_THRESHOLD, 0.0, 1.0); + return $this->requiredFloat('vector_score_threshold', 0.0, 1.0); } public function thresholdFloor(): float { - return $this->floatValue('threshold_floor', self::THRESHOLD_FLOOR, 0.0, 1.0); + return $this->requiredFloat('threshold_floor', 0.0, 1.0); } public function thresholdCeil(): float { - return $this->floatValue('threshold_ceil', self::THRESHOLD_CEIL, 0.0, 1.0); + return $this->requiredFloat('threshold_ceil', 0.0, 1.0); } public function listBonus(): float { - return $this->floatValue('list_bonus', self::LIST_BONUS, 1.0); + return $this->requiredFloat('list_bonus', 1.0); } public function rrfK(): 
int { - return $this->intValue('rrf_k', self::RRF_K, 1); + return $this->requiredInt('rrf_k', 1); } public function keywordTopKMultiplier(): float { - return $this->floatValue('keyword_topk_multiplier', self::KEYWORD_TOPK_MULTIPLIER, 0.1); + return $this->requiredFloat('keyword_topk_multiplier', 0.1); } public function keywordScoreThreshold(): float { - return $this->floatValue('keyword_score_threshold', self::KEYWORD_SCORE_THRESHOLD, 0.0, 1.0); + return $this->requiredFloat('keyword_score_threshold', 0.0, 1.0); } public function keywordRrfWeight(): float { - return $this->floatValue('keyword_rrf_weight', self::KEYWORD_RRF_WEIGHT, 0.0); + return $this->requiredFloat('keyword_rrf_weight', 0.0); } public function scopedVectorRrfWeight(): float { - return $this->floatValue('scoped_vector_rrf_weight', self::SCOPED_VECTOR_RRF_WEIGHT, 0.0); + return $this->requiredFloat('scoped_vector_rrf_weight', 0.0); } public function scopedKeywordRrfWeight(): float { - return $this->floatValue('scoped_keyword_rrf_weight', self::SCOPED_KEYWORD_RRF_WEIGHT, 0.0); + return $this->requiredFloat('scoped_keyword_rrf_weight', 0.0); } public function emptyRrfFallbackTopN(): int { - return $this->intValue('empty_rrf_fallback_topn', self::EMPTY_RRF_FALLBACK_TOPN, 1); + return $this->requiredInt('empty_rrf_fallback_topn', 1); } public function maxChunksPerDoc(): int { - return $this->intValue('max_chunks_per_doc', self::MAX_CHUNKS_PER_DOC, 1); + return $this->requiredInt('max_chunks_per_doc', 1); } public function minChunkDistance(): int { - return $this->intValue('min_chunk_distance', self::MIN_CHUNK_DISTANCE, 0); + return $this->requiredInt('min_chunk_distance', 0); } public function dominantDocWindow(): int { - return $this->intValue('dominant_doc_window', self::DOMINANT_DOC_WINDOW, 1); + return $this->requiredInt('dominant_doc_window', 1); } public function dominantDocMinHits(): int { - return $this->intValue('dominant_doc_min_hits', self::DOMINANT_DOC_MIN_HITS, 1); + return 
$this->requiredInt('dominant_doc_min_hits', 1); } public function dominantDocMaxChunks(): int { - return $this->intValue('dominant_doc_max_chunks', self::DOMINANT_DOC_MAX_CHUNKS, 1); + return $this->requiredInt('dominant_doc_max_chunks', 1); } public function exactDocumentMaxChunks(): int { - return $this->intValue('exact_document_max_chunks', self::EXACT_DOCUMENT_MAX_CHUNKS, 1); + return $this->requiredInt('exact_document_max_chunks', 1); } public function focusedProductWindow(): int { - return $this->intValue('focused_product_window', self::FOCUSED_PRODUCT_WINDOW, 1); + return $this->requiredInt('focused_product_window', 1); } public function focusedProductMinScore(): float { - return $this->floatValue('focused_product_min_score', self::FOCUSED_PRODUCT_MIN_SCORE, 0.0); + return $this->requiredFloat('focused_product_min_score', 0.0); } public function focusedProductMinGap(): float { - return $this->floatValue('focused_product_min_gap', self::FOCUSED_PRODUCT_MIN_GAP, 0.0); + return $this->requiredFloat('focused_product_min_gap', 0.0); } public function focusedProductMaxChunks(): int { - return $this->intValue('focused_product_max_chunks', self::FOCUSED_PRODUCT_MAX_CHUNKS, 1); + return $this->requiredInt('focused_product_max_chunks', 1); } /** @return string[] */ public function genericProductTokens(): array { - return $this->stringList('generic_product_tokens', $this->vocabularyView('retrieval.generic_product_tokens', self::GENERIC_PRODUCT_TOKEN)); + return $this->requiredStringList('generic_product_tokens'); } /** @return string[] */ public function importantShortModelTokens(): array { - return $this->stringList('important_short_model_tokens', $this->vocabularyView('retrieval.important_short_model_tokens', self::IMPORTANT_SHORT_MODEL_TOKEN)); + return $this->requiredStringList('important_short_model_tokens'); } /** @return string[] */ public function familyDescriptorTokens(): array { - return $this->stringList('family_descriptor_tokens', 
$this->vocabularyView('retrieval.family_descriptor_tokens', self::FAMILY_DESCRIPTOR_TOKEN)); + return $this->requiredStringList('family_descriptor_tokens'); } /** @return string[] */ public function looksLikeReagentTokens(): array { - return $this->stringList('looks_like_reagent_tokens', $this->vocabularyView('retrieval.looks_like_reagent_tokens', self::LOOKS_LIKE_REAGENT_TOKENS)); + return $this->requiredStringList('looks_like_reagent_tokens'); } /** @return string[] */ public function looksLikeSafetyDocs(): array { - return $this->stringList('looks_like_safety_docs', $this->vocabularyView('retrieval.looks_like_safety_docs', self::LOOKS_LIKE_SAFETY_DOCS)); + return $this->requiredStringList('looks_like_safety_docs'); } /** @return string[] */ public function looksLikeReagentWords(): array { - return $this->stringList('looks_like_reagent_words', $this->vocabularyView('retrieval.looks_like_reagent_words', self::LOOKS_LIKE_REAGENT_WORDS)); + return $this->requiredStringList('looks_like_reagent_words'); } /** @return string[] */ public function looksLikeDocumentWords(): array { - return $this->stringList('looks_like_document_words', $this->vocabularyView('retrieval.looks_like_document_words', self::LOOKS_LIKE_DOCUMENT_WORDS)); + return $this->requiredStringList('looks_like_document_words'); } /** @return string[] */ public function looksLikeSafetyWords(): array { - return $this->stringList('looks_like_safety_words', $this->vocabularyView('retrieval.looks_like_safety_words', self::LOOKS_LIKE_SAFETY_WORDS)); + return $this->requiredStringList('looks_like_safety_words'); } /** @return string[] */ public function looksLikeDeviceWords(): array { - return $this->stringList('looks_like_device_words', $this->vocabularyView('retrieval.looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS)); + return $this->requiredStringList('looks_like_device_words'); } + /** * Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps. 
* @@ -418,58 +252,53 @@ final class NdjsonHybridRetrieverConfig ]; } - private function intValue(string $key, int $default, int $min = PHP_INT_MIN, ?int $max = null): int + private function requiredInt(string $key, int $min = PHP_INT_MIN, ?int $max = null): int { - $value = $this->raw($key, $default); + $value = $this->requiredValue($key); if (!is_numeric($value)) { - return $default; + throw $this->invalid($key, 'must be numeric'); } $value = (int) $value; - $value = max($min, $value); + if ($value < $min) { + throw $this->invalid($key, sprintf('must be greater than or equal to %d', $min)); + } - if ($max !== null) { - $value = min($max, $value); + if ($max !== null && $value > $max) { + throw $this->invalid($key, sprintf('must be less than or equal to %d', $max)); } return $value; } - private function floatValue(string $key, float $default, float $min = -INF, ?float $max = null): float + private function requiredFloat(string $key, float $min = -INF, ?float $max = null): float { - $value = $this->raw($key, $default); + $value = $this->requiredValue($key); if (!is_numeric($value)) { - return $default; + throw $this->invalid($key, 'must be numeric'); } $value = (float) $value; - $value = max($min, $value); + if ($value < $min) { + throw $this->invalid($key, sprintf('must be greater than or equal to %s', (string) $min)); + } - if ($max !== null) { - $value = min($max, $value); + if ($max !== null && $value > $max) { + throw $this->invalid($key, sprintf('must be less than or equal to %s', (string) $max)); } return $value; } - /** - * @param string[] $default - * @return string[] - */ /** @return string[] */ - private function vocabularyView(string $path, array $fallback): array + private function requiredStringList(string $key): array { - return $this->vocabulary?->view($path, $fallback) ?? 
$fallback; - } - - private function stringList(string $key, array $default): array - { - $value = $this->raw($key, $default); + $value = $this->requiredValue($key); if (!is_array($value)) { - return $default; + throw $this->invalid($key, 'must be a list of non-empty strings'); } $out = []; @@ -488,15 +317,29 @@ final class NdjsonHybridRetrieverConfig } } - return $out !== [] ? $out : $default; - } - - private function raw(string $key, mixed $default): mixed - { - if (array_key_exists($key, $this->config)) { - return $this->config[$key]; + if ($out === []) { + throw $this->invalid($key, 'must contain at least one non-empty string'); } - return $default; + return $out; + } + + private function requiredValue(string $key): mixed + { + if (!array_key_exists($key, $this->config)) { + throw $this->missing($key); + } + + return $this->config[$key]; + } + + private function missing(string $key): InvalidArgumentException + { + return new InvalidArgumentException(sprintf('RetrieX retrieval config "%s" is missing.', $key)); + } + + private function invalid(string $key, string $reason): InvalidArgumentException + { + return new InvalidArgumentException(sprintf('RetrieX retrieval config "%s" %s.', $key, $reason)); } }