add tagging

2026-02-21 16:23:34 +01:00
parent 5a3852db12
commit cf5b473034
23 changed files with 1984 additions and 85 deletions
--- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
+++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
@@ -4,33 +4,75 @@ declare(strict_types=1);

 namespace App\Knowledge\Retrieval;

-use App\Knowledge\QueryCleaner;
+use App\Knowledge\ChunkManager;
+use App\Tag\TagRoutingService;
 use App\Vector\VectorSearchClient;

 final class NdjsonHybridRetriever implements RetrieverInterface
 {
-    private const VECTOR_SCORE_THRESHOLD = 0.25;
+    private const VECTOR_SCORE_THRESHOLD = 0.65;
+
+    /**
+     * Wenn Tag-Routing aktiv ist, erhöhen wir TopK,
+     * weil wir danach per document_id filtern.
+     */
+    private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10;
+
+    /**
+     * Keyword-Scan: Mindest-Trefferanzahl an Terms, damit ein Chunk als Kandidat gilt.
+     */
+    private const KEYWORD_MIN_HITS = 1;

    public function __construct(
+        private readonly ChunkManager       $chunkManager,
        private readonly NdjsonChunkLookup  $lookup,
        private readonly VectorSearchClient $vectorClient,
-        private readonly QueryCleaner       $queryCleaner,
-        private readonly int                $maxChunks = 25,
-        private readonly int                $vectorTopK = 10,
-    )
-    {
-    }
+        private readonly TagRoutingService  $tagRouting,
+        private readonly int                $maxChunks = 3,
+        private readonly int                $vectorTopK = 5,
+    ) {}

    public function retrieve(string $prompt, int $limit = null): array
    {
-        $limit = $this->maxChunks;
-        $keywordChunks = [];
-        $query = $this->queryCleaner->clean($prompt);
+        $limit ??= $this->maxChunks;

-        // Vector / enrichment
-        $hits = $this->vectorClient->search($query, $this->vectorTopK);
+        // ---------------------------------------------------------
+        // 0) Tag-Routing FIRST (soft gate)
+        // ---------------------------------------------------------
+        $candidateDocIds = $this->tagRouting->route($prompt);
+
+        $candidateSet = null;
+
+        if (is_array($candidateDocIds) && $candidateDocIds !== []) {
+            $candidateSet = array_fill_keys($candidateDocIds, true);
+        }
+
+        // ---------------------------------------------------------
+        // 1) Keyword first (simple streaming scan)
+        // ---------------------------------------------------------
+        $terms = $this->extractTerms($prompt);
+
+        $keywordChunks = $this->keywordSearchStreaming($terms, $limit, $candidateSet);
+
+        if (\count($keywordChunks) >= $limit) {
+            return array_slice($keywordChunks, 0, $limit);
+        }
+
+        // ---------------------------------------------------------
+        // 2) Vector fallback / enrichment
+        //    - If routed: increase TopK, then filter by document_id
+        //    - Soft fallback: if filtering yields nothing -> global vector once
+        // ---------------------------------------------------------
+        $topK = $this->vectorTopK;
+
+        if ($candidateSet !== null) {
+            $topK = max($this->vectorTopK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $this->vectorTopK);
+            $topK = min($topK, 200); // guardrail
+        }
+
+        $hits = $this->vectorClient->search($prompt, $topK);
        if ($hits === []) {
-            return $this->diversifyByDevice($keywordChunks, $limit, 1);
+            return $keywordChunks;
        }

        $chunkIds = [];
@@ -45,73 +87,78 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        }

        if ($chunkIds === []) {
-            return $this->diversifyByDevice($keywordChunks, $limit, 1);
+            return $keywordChunks;
        }

        $rows = $this->lookup->findByChunkIds($chunkIds);

-        foreach ($chunkIds as $id) {
+        // routed filtering by document_id
+        $finalChunkIds = $chunkIds;
+
+        if ($candidateSet !== null) {
+            $filtered = [];
+
+            foreach ($chunkIds as $id) {
+                $row = $rows[$id] ?? null;
+                if (!is_array($row)) {
+                    continue;
+                }
+                $docId = $row['document_id'] ?? null;
+                if (!is_string($docId) || !isset($candidateSet[$docId])) {
+                    continue;
+                }
+                $filtered[] = $id;
+            }
+
+            // Soft fallback: if routing filtered everything away, retry global vector once
+            if ($filtered === []) {
+                $hits2 = $this->vectorClient->search($prompt, $this->vectorTopK);
+                if ($hits2 === []) {
+                    return $keywordChunks;
+                }
+
+                $chunkIds2 = [];
+                foreach ($hits2 as $hit) {
+                    if (!isset($hit['chunk_id'], $hit['score'])) {
+                        continue;
+                    }
+                    if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
+                        continue;
+                    }
+                    $chunkIds2[] = (string)$hit['chunk_id'];
+                }
+
+                if ($chunkIds2 === []) {
+                    return $keywordChunks;
+                }
+
+                $rows = $this->lookup->findByChunkIds($chunkIds2);
+                $finalChunkIds = $chunkIds2;
+            } else {
+                $finalChunkIds = $filtered;
+            }
+        }
+
+        foreach ($finalChunkIds as $id) {
            if (!isset($rows[$id]['text']) || !is_string($rows[$id]['text'])) {
                continue;
            }
            $keywordChunks[] = trim($rows[$id]['text']);
        }

-        // dedupe
+        // ---------------------------------------------------------
+        // 3) dedupe + limit
+        // ---------------------------------------------------------
        $seen = [];
-        $deduped = [];
+        $out = [];

        foreach ($keywordChunks as $chunk) {
-            $key = mb_strtolower(preg_replace('/\s+/u', ' ', $chunk));
+            $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
-            $deduped[] = $chunk;
-        }
-
-        // diversify
-        return $this->diversifyByDevice($deduped, $limit, 1);
-    }
-
-    private function extractTerms(string $text): array
-    {
-        $text = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));
-
-        return array_values(array_filter(
-            explode(' ', $text),
-            static fn(string $w) => mb_strlen($w) > 2
-        ));
-    }
-
-    private function extractDevice(string $chunk): string
-    {
-        $firstLine = explode("\n", $chunk, 2)[0] ?? '';
-        return trim($firstLine);
-    }
-
-    private function diversifyByDevice(array $chunks, int $limit, int $maxPerDevice = 1): array
-    {
-        $seenDevices = [];
-        $out = [];
-
-        foreach ($chunks as $chunk) {
-            $device = $this->extractDevice($chunk);
-
-            if ($device === '') {
-                continue;
-            }
-
-            if (!isset($seenDevices[$device])) {
-                $seenDevices[$device] = 0;
-            }
-
-            if ($seenDevices[$device] >= $maxPerDevice) {
-                continue;
-            }
-
            $out[] = $chunk;
-            $seenDevices[$device]++;

            if (\count($out) >= $limit) {
                break;
@@ -120,4 +167,116 @@ final class NdjsonHybridRetriever implements RetrieverInterface

        return $out;
    }
-}
+
+    /**
+     * Streaming keyword search over the rows yielded by ChunkManager::streamAll().
+     *
+     * - Score = number of terms found in the chunk text (case-insensitive).
+     * - With a candidate set (tag routing), rows whose document_id is not in
+     *   the set are skipped before any scoring happens.
+     * - Result order: higher score first; on ties the shorter text wins.
+     * - The scan stops early once $limit chunks matching all terms were collected.
+     *
+     * @param string[] $terms search terms; empty strings are ignored
+     * @param int $limit maximum number of texts to return; values below 1 yield []
+     * @param array<string,true>|null $candidateSet allowed document ids, null = no filter
+     * @return string[] trimmed chunk texts, best match first
+     */
+    private function keywordSearchStreaming(array $terms, int $limit, ?array $candidateSet): array
+    {
+        // Empty needles can never match; dropping them keeps the perfect score reachable.
+        $terms = array_values(array_filter($terms, static fn ($t): bool => $t !== ''));
+
+        if ($terms === [] || $limit < 1) {
+            return [];
+        }
+
+        $maxScore = \count($terms);
+
+        // Best candidates so far, sorted; each item = ['score' => int, 'text' => string].
+        $top = [];
+
+        foreach ($this->chunkManager->streamAll() as $row) {
+            $text = $row['text'] ?? null;
+            if (!is_string($text) || $text === '') {
+                continue;
+            }
+
+            if ($candidateSet !== null) {
+                $docId = $row['document_id'] ?? null;
+                if (!is_string($docId) || !isset($candidateSet[$docId])) {
+                    continue;
+                }
+            }
+
+            // mb_stripos() is already case-insensitive, so no lowercased copy is needed.
+            $score = 0;
+            foreach ($terms as $t) {
+                if (mb_stripos($text, $t) !== false) {
+                    $score++;
+                }
+            }
+
+            if ($score < self::KEYWORD_MIN_HITS) {
+                continue;
+            }
+
+            $top[] = [
+                'score' => $score,
+                'text'  => trim($text),
+            ];
+
+            // Keep only the best $limit items (plain re-sort; $limit is expected to be tiny).
+            usort($top, static function (array $a, array $b): int {
+                // higher score first
+                $cmp = ($b['score'] <=> $a['score']);
+                if ($cmp !== 0) {
+                    return $cmp;
+                }
+                // shorter chunk first (often more precise)
+                return (mb_strlen($a['text']) <=> mb_strlen($b['text']));
+            });
+
+            if (\count($top) > $limit) {
+                $top = array_slice($top, 0, $limit);
+            }
+
+            // Early exit only once EVERY kept item is a perfect match. $top is sorted
+            // best-first, so it suffices to test the last (weakest) slot.
+            if (\count($top) === $limit && $top[$limit - 1]['score'] >= $maxScore) {
+                break;
+            }
+        }
+
+        // Strip the scores; callers only need the texts.
+        return array_column($top, 'text');
+    }
+
+    /**
+     * Minimal term extraction (stable behaviour, little magic).
+     *
+     * Removes every character that is neither a letter, a digit nor whitespace,
+     * lowercases the rest and splits it on any whitespace run (spaces, tabs,
+     * newlines). Terms of one or two characters are dropped.
+     *
+     * @param string $text free-form input text
+     * @return string[] unique terms in order of first occurrence
+     */
+    private function extractTerms(string $text): array
+    {
+        $text = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));
+
+        // Split on any whitespace run rather than on ' ' only: the filter above
+        // keeps tabs and newlines, which would otherwise glue neighbouring words
+        // into a single term that can never match.
+        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
+        if ($words === false) {
+            return [];
+        }
+
+        $terms = array_filter($words, static fn (string $w): bool => mb_strlen($w) > 2);
+
+        // array_unique() keeps the first occurrence; array_values() restores list keys.
+        return array_values(array_unique($terms));
+    }
+}