diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 7d313ef..7d90615 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -14,7 +14,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface { private const VECTOR_SCORE_THRESHOLD = 0.25; private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10; - private const KEYWORD_MIN_HITS = 1; private const HARD_MAX_CHUNKS = 200; private const HARD_MAX_VECTORK = 200; @@ -27,9 +26,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface private readonly ModelGenerationConfigRepository $configRepository, ) {} - /** - * Normalbetrieb – ausschließlich aktive Config. - */ public function retrieve(string $prompt): array { $config = $this->configRepository->findActiveForModel(); @@ -41,37 +37,32 @@ final class NdjsonHybridRetriever implements RetrieverInterface return $this->retrieveInternal($prompt, $config); } - /** - * Admin-Testbetrieb – explizite Config. - * Verändert KEINEN globalen Zustand. - */ public function retrieveForConfig(string $prompt, ModelGenerationConfig $config): array { return $this->retrieveInternal($prompt, $config); } - /** - * Zentrale Retrieval-Logik (keine Duplikation). 
- */ private function retrieveInternal(string $prompt, ModelGenerationConfig $config): array { $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK)); - // --------------------------------------------------------- - // 1) Tag-Vector FIRST -> candidateSet (DocIDs) - // --------------------------------------------------------- - $candidateDocIds = $this->tagRouting->route($prompt); // <= DAS muss intern auf Tag-Vector gehen + $isListQuery = $this->isListQuery($prompt); + + // ------------------------------- + // 1) Tag Routing + // ------------------------------- + $candidateDocIds = $this->tagRouting->route($prompt); $candidateSet = null; if (is_array($candidateDocIds) && $candidateDocIds !== []) { $candidateSet = array_fill_keys($candidateDocIds, true); } - // --------------------------------------------------------- - // 2) Vector chunks (primary) - // --------------------------------------------------------- + // ------------------------------- + // 2) Vector Search + // ------------------------------- $topK = $vectorTopKBase; if ($candidateSet !== null) { @@ -84,85 +75,183 @@ final class NdjsonHybridRetriever implements RetrieverInterface $hits = $this->vectorClient->search($prompt, $topK); if ($hits === []) { - // Tags-only System: kein Vector-Hit -> keine Chunks - return []; + return $candidateSet !== null + ? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit) + : []; } + // ------------------------------- + // 3) Chunk-IDs + Lookup einmalig + // ------------------------------- $chunkIds = []; + foreach ($hits as $hit) { if (!isset($hit['chunk_id'], $hit['score'])) { continue; } + if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) { continue; } + $chunkIds[] = (string)$hit['chunk_id']; } if ($chunkIds === []) { - return []; + return $candidateSet !== null + ? 
$this->fallbackChunksFromCandidateDocs($candidateSet, $limit) + : []; } $rows = $this->lookup->findByChunkIds($chunkIds); - $finalChunkIds = $chunkIds; - // --------------------------------------------------------- - // 3) Routed filtering (wenn candidateSet vorhanden) - // --------------------------------------------------------- - if ($candidateSet !== null) { - $filtered = []; + // ------------------------------- + // 4) Listen-Modus → Dokument-Ranking + // ------------------------------- + if ($isListQuery && $candidateSet !== null) { - foreach ($chunkIds as $id) { - $row = $rows[$id] ?? null; - if (!is_array($row)) { - continue; - } + $rankedDocIds = $this->rankDocumentsFromHits($hits, $rows, $candidateSet); - $docId = $row['document_id'] ?? null; - if (!is_string($docId) || !isset($candidateSet[$docId])) { - continue; - } - - $filtered[] = $id; + if ($rankedDocIds === []) { + return $this->fallbackChunksFromCandidateDocs($candidateSet, $limit); } - // Wenn Routing ALLES wegfiltert -> einmal global retry - if ($filtered === []) { - $hits2 = $this->vectorClient->search($prompt, $vectorTopKBase); - if ($hits2 === []) { - return []; + $topDocIds = array_slice($rankedDocIds, 0, $limit); + + return $this->collectBestChunkPerDocument($topDocIds, $hits, $rows); + } + + // ------------------------------- + // 5) Normaler Chunk-Modus + // ------------------------------- + return $this->collectTexts($chunkIds, $rows, $limit); + } + + // ========================================================= + // LIST QUERY DETECTION + // ========================================================= + + private function isListQuery(string $prompt): bool + { + $prompt = mb_strtolower($prompt); + + return str_contains($prompt, 'liste') + || str_contains($prompt, 'zeige') + || str_contains($prompt, 'nenn') + || str_contains($prompt, 'welche') + || preg_match('/\b\d+\b/', $prompt) === 1; + } + + // ========================================================= + // DOCUMENT RANKING + // 
========================================================= + + private function rankDocumentsFromHits( + array $hits, + array $rows, + array $candidateSet + ): array { + $documentScores = []; + + foreach ($hits as $hit) { + if (!isset($hit['chunk_id'], $hit['score'])) { + continue; + } + + $chunkId = (string)$hit['chunk_id']; + + if (!isset($rows[$chunkId])) { + continue; + } + + $row = $rows[$chunkId]; + $docId = $row['document_id'] ?? null; + + if (!is_string($docId) || !isset($candidateSet[$docId])) { + continue; + } + + $documentScores[$docId][] = (float)$hit['score']; + } + + if ($documentScores === []) { + return []; + } + + $ranked = []; + + foreach ($documentScores as $docId => $scores) { + rsort($scores); + $topScores = array_slice($scores, 0, 3); + $ranked[$docId] = array_sum($topScores) / count($topScores); + } + + arsort($ranked); + + return array_keys($ranked); + } + + private function collectBestChunkPerDocument( + array $docIds, + array $hits, + array $rows + ): array { + $result = []; + + foreach ($docIds as $docId) { + + $bestScore = -INF; + $bestText = null; + + foreach ($hits as $hit) { + if (!isset($hit['chunk_id'], $hit['score'])) { + continue; } - $chunkIds2 = []; - foreach ($hits2 as $hit) { - if (!isset($hit['chunk_id'], $hit['score'])) { - continue; - } - if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) { - continue; - } - $chunkIds2[] = (string)$hit['chunk_id']; + $chunkId = (string)$hit['chunk_id']; + + if (!isset($rows[$chunkId])) { + continue; } - if ($chunkIds2 === []) { - return []; + $row = $rows[$chunkId]; + + if (($row['document_id'] ?? null) !== $docId) { + continue; } - $rows = $this->lookup->findByChunkIds($chunkIds2); - $finalChunkIds = $chunkIds2; - } else { - $finalChunkIds = $filtered; + if ((float)$hit['score'] > $bestScore) { + $bestScore = (float)$hit['score']; + $bestText = $row['text'] ?? 
null; + } + } + + if (is_string($bestText) && $bestText !== '') { + $result[] = trim($bestText); } } - // --------------------------------------------------------- - // 4) Collect texts + Dedupe + Limit - // --------------------------------------------------------- + return $result; + } + + // ========================================================= + // FALLBACK + NORMAL MODE + // ========================================================= + + private function fallbackChunksFromCandidateDocs(array $candidateSet, int $limit): array + { $seen = []; $out = []; - foreach ($finalChunkIds as $id) { - $text = $rows[$id]['text'] ?? null; + foreach ($this->chunkManager->streamAll() as $row) { + $docId = $row['document_id'] ?? null; + + if (!is_string($docId) || !isset($candidateSet[$docId])) { + continue; + } + + $text = $row['text'] ?? null; + if (!is_string($text) || $text === '') { continue; } @@ -185,84 +274,29 @@ final class NdjsonHybridRetriever implements RetrieverInterface return $out; } - private function keywordSearchStreaming(array $terms, int $limit, ?array $candidateSet): array + private function collectTexts(array $chunkIds, array $rows, int $limit): array { - if ($terms === []) { - return []; - } - - $maxScore = \count($terms); - $top = []; - - foreach ($this->chunkManager->streamAll() as $row) { - $text = $row['text'] ?? null; - - if (!is_string($text) || $text === '') { - continue; - } - - if ($candidateSet !== null) { - $docId = $row['document_id'] ?? 
null; - - if (!is_string($docId) || !isset($candidateSet[$docId])) { - continue; - } - } - - $haystack = mb_strtolower($text); - $score = 0; - - foreach ($terms as $t) { - if ($t !== '' && mb_stripos($haystack, $t) !== false) { - $score++; - } - } - - if ($score < self::KEYWORD_MIN_HITS) { - continue; - } - - $top[] = [ - 'score' => $score, - 'text' => trim($text), - ]; - - usort($top, static function (array $a, array $b): int { - $cmp = ($b['score'] <=> $a['score']); - if ($cmp !== 0) { - return $cmp; - } - return (mb_strlen($a['text']) <=> mb_strlen($b['text'])); - }); - - if (\count($top) > $limit) { - $top = array_slice($top, 0, $limit); - } - - if (\count($top) === $limit && ($top[0]['score'] ?? 0) >= $maxScore) { - break; - } - } - - return array_map(static fn($item) => (string)$item['text'], $top); - } - - private function extractTerms(string $text): array - { - $text = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text)); - - $parts = array_values(array_filter( - explode(' ', $text), - static fn(string $w) => mb_strlen($w) > 2 - )); - $seen = []; $out = []; - foreach ($parts as $w) { - if (!isset($seen[$w])) { - $seen[$w] = true; - $out[] = $w; + foreach ($chunkIds as $id) { + if (!isset($rows[$id]['text'])) { + continue; + } + + $chunk = trim($rows[$id]['text']); + + $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)); + + if (isset($seen[$key])) { + continue; + } + + $seen[$key] = true; + $out[] = $chunk; + + if (\count($out) >= $limit) { + break; } }