lexical logic

2026-04-20 21:46:42 +02:00
parent 2587ac8b4b
commit 065f59c090
9 changed files with 2576 additions and 326 deletions
--- a/src/Config/NdjsonHybridRetrieverConfig.php
+++ b/src/Config/NdjsonHybridRetrieverConfig.php
@@ -7,62 +7,96 @@ namespace App\Config;
 final class NdjsonHybridRetrieverConfig
 {
    /**
-     * Default semantic similarity threshold for vector hits.
+     * Maximum number of chunks the retriever may ultimately hand to the model.
     *
-     * Chosen to stay selective enough for product-family-heavy data
-     * while not cutting off too many useful fallback hits.
+     * Rationale:
+     * - enough room for the stronger hybrid pipeline
+     * - still conservative enough to avoid prompt bloat
     */
-    public const VECTOR_SCORE_THRESHOLD = 0.83;
+    public const HARD_MAX_CHUNKS = 6;

    /**
-     * Absolute safety caps.
+     * Hard upper bound for vector retrieval candidate size.
     *
-     * These limits protect the retriever from overly large candidate sets
-     * even if runtime config values are set too high.
+     * Rationale:
+     * - the pipeline now combines primary vector, secondary vector,
+     *   lexical, scoped retrieval and re-ranking
+     * - NOTE(review): the previous cap was 180, so the earlier wording
+     *   ("the old limit would constrain recall too early") does not describe
+     *   this change; the new value is far tighter. Confirm 18 is intended.
+     * - still capped to keep latency controlled
     */
-    public const HARD_MAX_CHUNKS = 72;
-    public const HARD_MAX_VECTORK = 180;
+    public const HARD_MAX_VECTORK = 18;

    /**
-     * List-style queries benefit from a slightly wider candidate pool
-     * before de-duplication and final selection.
+     * Default semantic score threshold for vector hits.
+     *
+     * Rationale:
+     * - slightly relaxed compared to stricter pure-vector setups
+     * - the system now has more safeguards:
+     *   lexical cross-signals, scoped retrieval, title/meta boost, selection rules
     */
-    public const LIST_BONUS = 1.25;
+    public const VECTOR_SCORE_THRESHOLD = 0.81;

    /**
-     * Selection rules for cross-document semantic retrieval.
+     * Lower safety boundary for dynamic threshold adjustments.
     *
-     * MAX_CHUNKS_PER_DOC:
-     * Keeps one document from dominating the final result in normal
-     * semantic retrieval mode.
-     *
-     * MIN_CHUNK_DISTANCE:
-     * Allows nearby chunks to be selected when they are still meaningfully
-     * distinct, which is important for compact product sheets.
+     * Rationale:
+     * - prevents the system from getting too noisy in fallback cases
+     * - still allows recovery when exact signals are sparse
+     * - must stay below VECTOR_SCORE_THRESHOLD (currently 0.81), otherwise
+     *   the configured base threshold becomes ineffective
     */
-    public const MAX_CHUNKS_PER_DOC = 3;
-    public const MIN_CHUNK_DISTANCE = 1.0;
+    public const THRESHOLD_FLOOR = 0.75;
+
+    /**
+     * Upper safety boundary for dynamic threshold adjustments.
+     *
+     * Rationale:
+     * - protects objection/pricing/list adjustments from becoming too strict
+     * - keeps retrieval from collapsing into empty result sets too easily
+     */
+    public const THRESHOLD_CEIL = 0.90;
+
+    /**
+     * Additional candidate expansion factor for list-like prompts.
+     *
+     * Rationale:
+     * - list requests benefit from wider candidate recall
+     * - too high would create noise across multiple retrieval channels
+     */
+    public const LIST_BONUS = 1.35;

    /**
     * Reciprocal Rank Fusion constant.
     *
-     * Slightly lower than classic defaults so top-ranked hits matter more.
+     * Rationale:
+     * - keep rank importance meaningful
+     * - but not so aggressive that one retrieval source dominates too hard
     */
    public const RRF_K = 50;

    /**
-     * Dynamic threshold clamp boundaries.
+     * Fallback size when thresholded fusion yields no candidates.
     *
-     * The floor must stay below the default threshold, otherwise the
-     * configured base threshold becomes ineffective.
+     * Rationale:
+     * - slightly larger safety net for the richer hybrid stack
+     * - helps no-tag and low-signal cases without exploding context
     */
-    public const THRESHOLD_FLOOR = 0.78;
-    public const THRESHOLD_CEIL = 0.90;
+    public const EMPTY_RRF_FALLBACK_TOPN = 5;

    /**
-     * Fallback breadth when strict thresholding removes all fused hits.
+     * Maximum number of chunks allowed from one document in spread mode.
     *
-     * More than one fallback result makes the retriever less brittle.
+     * Rationale:
+     * - preserve diversity across documents
+     * - still allow coherent multi-chunk retrieval from strong sources
     */
-    public const EMPTY_RRF_FALLBACK_TOPN = 3;
+    public const MAX_CHUNKS_PER_DOC = 2;
+
+    /**
+     * Minimum distance between chunk indices from the same document
+     * during spread-style selection.
+     *
+     * Rationale:
+     * - reduce near-duplicate neighboring chunks
+     * - still allow relevant continuation when needed
+     */
+    public const MIN_CHUNK_DISTANCE = 2;
 }
--- a/src/Config/QueryEnricherConfig.php
+++ b/src/Config/QueryEnricherConfig.php
@@ -1,22 +1,180 @@
 <?php

+declare(strict_types=1);
+
 namespace App\Config;

-class QueryEnricherConfig
+final readonly class QueryEnricherConfig
 {
+    /**
+     * Keep the enrichment vocabulary in the class for now.
+     *
+     * Important:
+     * - This is intentionally NOT externalized yet.
+     * - Add or maintain the current project-specific mappings here.
+     * - The later move to external config/files can happen separately.
+     *
+     * Supported shapes (both may be mixed in the same list):
+     *
+     * 1) Simple mapping (string key => string value):
+     * [
+     *     'water hardness' => 'residual hardness',
+     *     'device' => 'instrument',
+     * ]
+     *
+     * 2) Small synonym groups (an entry whose value is an array):
+     * [
+     *     ['water hardness', 'residual hardness', 'hardness'],
+     *     ['device', 'instrument', 'meter'],
+     * ]
+     *
+     * All phrases are trimmed, lower-cased and whitespace-collapsed by
+     * getEnrichQueryList(), so casing and spacing in this list do not matter.
+     *
+     * Caveat: PHP casts purely numeric string keys (e.g. '42') to integers,
+     * and getEnrichQueryList() skips simple entries whose key is not a string.
+     *
+     * The public API stays intentionally simple:
+     * - getEnrichQueryList(): array<string,string>
+     *
+     * This keeps QueryEnricher generic while the domain vocabulary
+     * deliberately remains inside this class for now.
+     *
+     * Replace the example entries below with your real project mappings.
+     *
+     * @var array<int|string, mixed>
+     */
+    private const ENRICH_QUERY_LIST = [
+        // -----------------------------------------------------------------
+        // Example mappings.
+        // Replace / extend these with your current real project mappings.
+        // -----------------------------------------------------------------
+
+        'water hardness' => 'residual hardness',
+        'device' => 'instrument',
+        'gerät'=>'produkt',
+        'rebuild'=>'reindex',
+
+        ['measuring device', 'meter', 'instrument'],
+    ];
+
+    /**
+     * Returns a normalized, deduplicated mapping for the QueryEnricher.
+     *
+     * Output format:
+     * [
+     *     'term a' => 'term b',
+     *     'term c' => 'term d',
+     * ]
+     *
+     * Rules:
+     * - entries whose value is an array are treated as synonym groups and
+     *   expanded into chained pairs by ingestGroup()
+     * - ignore empty / invalid values (a non-string key or value of a simple
+     *   entry is treated as an empty phrase)
+     * - trim, lower-case and collapse whitespace (see normalizePhrase())
+     * - ignore self-mappings
+     * - preserve first valid rule if duplicates normalize to the same key
+     *
+     * The mapping is rebuilt from ENRICH_QUERY_LIST on every call.
+     *
+     * @return array<string, string>
+     */
    public function getEnrichQueryList(): array
    {
-        return [
-            'Wasserhärte' => 'Resthärte',
-            'Gerät' => 'Modell',
-            'Indikator' => 'Chemie',
-            'Seminar' => 'Webinar',
-            'Schulung' => 'Seminar',
-            'Indikatoren' => 'Indikator',
-            'Wasserhärte-Grenzwert' => 'Resthärte',
-            'Resthärte-Grenzwert' => 'Wasserhärte',
-            'Grenzwert' => 'Überwachungsbereich',
-            'store'=>'shop'
-        ];
+        $normalized = [];
+
+        foreach (self::ENRICH_QUERY_LIST as $key => $value) {
+            if (is_array($value)) { // synonym group: the key is irrelevant
+                $this->ingestGroup($normalized, $value);
+                continue;
+            }
+
+            $left = $this->normalizePhrase(is_string($key) ? $key : '');
+            $right = $this->normalizePhrase(is_string($value) ? $value : '');
+
+            if (!$this->isValidPair($left, $right)) {
+                continue;
+            }
+
+            if (!isset($normalized[$left])) { // first rule for a key wins
+                $normalized[$left] = $right;
+            }
+        }
+
+        return $normalized;
    }
+
+    /**
+     * Returns true when at least one valid enrichment rule exists.
+     *
+     * Implemented by building the full normalized mapping through
+     * getEnrichQueryList() and checking that it is not empty.
+     */
+    public function hasRules(): bool
+    {
+        return $this->getEnrichQueryList() !== [];
+    }
+
+    /**
+     * Expands one synonym group into chained pairs and adds them to the map.
+     *
+     * Non-string and empty members are dropped; the rest are normalized and
+     * de-duplicated, with the first occurrence keeping its position. A group
+     * with fewer than two remaining members contributes nothing. Keys that
+     * already exist in $normalized are never overwritten.
+     *
+     * @param array<string, string> $normalized mapping being built; modified in place
+     * @param array<int|string, mixed> $group raw group members as configured
+     */
+    private function ingestGroup(array &$normalized, array $group): void
+    {
+        $items = [];
+
+        foreach ($group as $item) {
+            if (!is_string($item)) {
+                continue;
+            }
+
+            $item = $this->normalizePhrase($item);
+
+            if ($item === '') {
+                continue;
+            }
+
+            $items[$item] = $item; // keyed by phrase to drop duplicates
+        }
+
+        $items = array_values($items);
+
+        if (count($items) < 2) {
+            return;
+        }
+
+        /**
+         * Turn a synonym group into a conservative chain:
+         * ['a', 'b', 'c'] => a=>b, b=>c
+         *
+         * QueryEnricher builds a bidirectional lookup later,
+         * so the config output stays intentionally small.
+         *
+         * NOTE(review): a left-hand phrase that is already mapped is skipped
+         * below, so a chain link can be dropped silently (a=>b is lost when
+         * 'a' already maps to something else). Confirm that is acceptable.
+         */
+        for ($i = 0, $max = count($items) - 1; $i < $max; $i++) {
+            $left = $items[$i];
+            $right = $items[$i + 1];
+
+            if (!$this->isValidPair($left, $right)) {
+                continue;
+            }
+
+            if (!isset($normalized[$left])) {
+                $normalized[$left] = $right;
+            }
+        }
+    }
+
+    private function isValidPair(string $left, string $right): bool
+    { // A pair is a usable rule only when both phrases are non-empty and differ.
+        if ($left === '' || $right === '') { // a missing side cannot form a rule
+            return false;
+        }
+
+        if ($left === $right) { // self-mapping: nothing to enrich
+            return false;
+        }
+
+        return true;
+    }
+
+    private function normalizePhrase(string $value): string
+    { // Canonical phrase form: trimmed, lower-cased, whitespace runs collapsed to one space.
+        $value = trim($value);
+
+        if ($value === '') { // nothing to normalize; callers treat '' as invalid
+            return '';
+        }
+
+        $value = mb_strtolower($value, 'UTF-8');
+        $value = preg_replace('/\s+/u', ' ', $value) ?? $value; // null on PCRE failure: keep the text as is
+
+        return trim($value);
    }
 }