diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 41f0540..89ea827 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -37,6 +37,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface { } + // ========================================================= + // PRODUCTION (UNVERÄNDERTES VERHALTEN) + // ========================================================= + public function retrieve(string $prompt): array { $config = $this->configRepository->findActiveForModel(); @@ -49,6 +53,120 @@ final class NdjsonHybridRetriever implements RetrieverInterface } public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array + { + $core = $this->runCore($prompt, $config, false); + + if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) { + return []; + } + + // ✅ ORIGINAL: Normal Mode -> Sales optimized selection + if (!$core['is_list_query']) { + return $this->collectSalesOptimized( + $core['ranked_chunk_ids'], + $core['rows'], + $core['limit'] + ); + } + + // ✅ ORIGINAL: List Mode -> simple collectTexts + return $this->collectTexts( + $core['ranked_chunk_ids'], + $core['rows'], + $core['limit'] + ); + } + + // ========================================================= + // DEBUG (NEU, ABER NICHT IM PRODUKTIONS-PFAD) + // ========================================================= + + /** + * Gibt genau DIE Treffer zurück, die auch in Produktion ausgewählt werden, + * plus Scores/Meta pro ausgewähltem Chunk. + * + * @return array + */ + public function retrieveDebug(string $prompt): array + { + $config = $this->configRepository->findActiveForModel(); + + if ($config === null) { + throw new \RuntimeException('No active ModelGenerationConfig found.'); + } + + $core = $this->runCore($prompt, $config, true); + + if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) { + return []; + } + + // 1) Production-like selection: wir selektieren Texte, + // aber in Debug brauchen wir die ChunkIds dazu. + $selectedChunkIds = $core['is_list_query'] + ? $this->selectChunkIdsListMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']) + : $this->selectChunkIdsSalesMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']); + + if ($selectedChunkIds === []) { + return []; + } + + // 2) Ausgabe inklusive Scores + $out = []; + $rank = 0; + + foreach ($selectedChunkIds as $chunkId) { + if (!isset($core['rows'][$chunkId])) { + continue; + } + + $rank++; + $text = trim((string)($core['rows'][$chunkId]['text'] ?? '')); + + $out[] = [ + 'rank' => $rank, + 'chunk_id' => $chunkId, + 'document_id' => isset($core['rows'][$chunkId]['document_id']) ? (string)$core['rows'][$chunkId]['document_id'] : null, + 'raw_score' => isset($core['raw_scores'][$chunkId]) ? (float)$core['raw_scores'][$chunkId] : null, + 'rrf_score' => isset($core['rrf_scores'][$chunkId]) ? (float)$core['rrf_scores'][$chunkId] : null, + 'threshold' => (float)$core['threshold'], + 'intent' => (string)$core['sales_intent'], + 'is_list_query'=> (bool)$core['is_list_query'], + 'text' => $text, + ]; + } + + return $out; + } + + // ========================================================= + // CORE PIPELINE (einmalig, shared) + // ========================================================= + + /** + * @return array{ + * limit:int, + * is_list_query:bool, + * sales_intent:string, + * threshold:float, + * topk:int, + * ranked_chunk_ids: string[], + * rows: array>, + * rrf_scores: array, + * raw_scores: array + * } + */ + private function runCore(string $prompt, ModelGenerationConfig $config, bool $withScores): array { $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK)); @@ -61,16 +179,13 @@ final class NdjsonHybridRetriever implements RetrieverInterface $cleanQuery = $prompt; } - // ------------------------------------------------- - // Intent-based adjustments - // ------------------------------------------------- - + // Intent-based adjustments (identisch zur Produktionslogik) $threshold = self::VECTOR_SCORE_THRESHOLD; $topK = $vectorTopKBase; switch ($salesIntent) { case SalesIntentLite::PRICING: - $threshold += 0.02; // more precision + $threshold += 0.02; break; case SalesIntentLite::COMPARISON: @@ -101,10 +216,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $topK = max(1, min($topK, self::HARD_MAX_VECTORK)); - // ------------------------------------------------- - // Tag routing - // ------------------------------------------------- - + // Tag routing (identisch) $candidateDocIds = $this->tagRouting->route($cleanQuery); $candidateSet = null; @@ -112,63 +224,228 @@ final class NdjsonHybridRetriever implements RetrieverInterface $candidateSet = array_fill_keys($candidateDocIds, true); } - // ------------------------------------------------- - // Dual search - // ------------------------------------------------- - + // Dual search (identisch) $globalHits = $this->vectorClient->search($cleanQuery, $topK); $scopedHits = []; if ($candidateSet !== null) { - $scopedHits = $this->vectorClient->searchScoped( - $cleanQuery, - $topK, - array_keys($candidateSet) - ); + $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, array_keys($candidateSet)); } if ($globalHits === [] && $scopedHits === []) { - return []; + return [ + 'limit' => $limit, + 'is_list_query' => $isListQuery, + 'sales_intent' => (string)$salesIntent, + 'threshold' => $threshold, + 'topk' => $topK, + 'ranked_chunk_ids' => [], + 'rows' => [], + 'rrf_scores' => [], + 'raw_scores' => [], + ]; } - // ------------------------------------------------- - // RRF Fusion - // ------------------------------------------------- - $rrfScores = []; + $rawScores = []; - $this->applyRrf($globalHits, $rrfScores, $threshold); - $this->applyRrf($scopedHits, $rrfScores, $threshold, $salesIntent === SalesIntentLite::OBJECTION); + // RRF (identisch) + optional raw capture + $this->applyRrfWithOptionalRaw($globalHits, $rrfScores, $rawScores, $threshold, false, $withScores); + $this->applyRrfWithOptionalRaw( + $scopedHits, + $rrfScores, + $rawScores, + $threshold, + $salesIntent === SalesIntentLite::OBJECTION, + $withScores + ); if ($rrfScores === []) { - return []; + return [ + 'limit' => $limit, + 'is_list_query' => $isListQuery, + 'sales_intent' => (string)$salesIntent, + 'threshold' => $threshold, + 'topk' => $topK, + 'ranked_chunk_ids' => [], + 'rows' => [], + 'rrf_scores' => [], + 'raw_scores' => $rawScores, + ]; } arsort($rrfScores); $rankedChunkIds = array_keys($rrfScores); $rows = $this->lookup->findByChunkIds($rankedChunkIds); - if ($rows === []) { - return []; - } - if (!$isListQuery) { - return $this->collectSalesOptimized( - $rankedChunkIds, - $rows, - $limit - ); - } - - return $this->collectTexts($rankedChunkIds, $rows, $limit); + return [ + 'limit' => $limit, + 'is_list_query' => $isListQuery, + 'sales_intent' => (string)$salesIntent, + 'threshold' => $threshold, + 'topk' => $topK, + 'ranked_chunk_ids' => $rankedChunkIds, + 'rows' => $rows, + 'rrf_scores' => $rrfScores, + 'raw_scores' => $rawScores, + ]; } + /** + * Gleiche Logik wie applyRrf(), aber optional mit raw-score capture. + * + * @param array $hits + * @param array $rrfScores + * @param array $rawScores + */ + private function applyRrfWithOptionalRaw( + array $hits, + array &$rrfScores, + array &$rawScores, + float $threshold, + bool $boost = false, + bool $captureRaw = false + ): void { + $rank = 0; + + foreach ($hits as $hit) { + if (!isset($hit['chunk_id'], $hit['score'])) { + continue; + } + + $raw = (float)$hit['score']; + if ($raw < $threshold) { + continue; + } + + $chunkId = (string)$hit['chunk_id']; + + if ($captureRaw) { + // wenn global+scoped vorkommt: bestes raw behalten + if (!isset($rawScores[$chunkId]) || $raw > $rawScores[$chunkId]) { + $rawScores[$chunkId] = $raw; + } + } + + $rank++; + $rrf = 1 / (self::RRF_K + $rank); + + if ($boost) { + $rrf *= 1.2; + } + + if (!isset($rrfScores[$chunkId])) { + $rrfScores[$chunkId] = 0.0; + } + + $rrfScores[$chunkId] += $rrf; + } + } + + // ========================================================= + // DEBUG SELECTION HELPERS (identisch zu Produktionsregeln) + // ========================================================= + + /** + * List-Mode nutzt exakt collectTexts() Regeln, aber gibt ChunkIds zurück. + * + * @return string[] + */ + private function selectChunkIdsListMode(array $chunkIds, array $rows, int $limit): array + { + $seen = []; + $out = []; + + foreach ($chunkIds as $id) { + if (!isset($rows[$id]['text'])) { + continue; + } + + $chunk = trim((string)$rows[$id]['text']); + if ($chunk === '') { + continue; + } + + $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)); + + if (isset($seen[$key])) { + continue; + } + + $seen[$key] = true; + $out[] = (string)$id; + + if (\count($out) >= $limit) { + break; + } + } + + return $out; + } + + /** + * Normal-Mode nutzt exakt collectSalesOptimized() Regeln, aber gibt ChunkIds zurück. + * + * @return string[] + */ + private function selectChunkIdsSalesMode(array $chunkIds, array $rows, int $limit): array + { + $out = []; + $docCounter = []; + $docChunkPositions = []; + + foreach ($chunkIds as $chunkId) { + if (!isset($rows[$chunkId]['text'])) { + continue; + } + + $docId = $rows[$chunkId]['document_id'] ?? null; + $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null; + + if (!is_string($docId)) { + continue; + } + + if (($docCounter[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) { + continue; + } + + if (is_int($chunkIndex)) { + $prev = $docChunkPositions[$docId] ?? []; + foreach ($prev as $prevIdx) { + if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) { + continue 2; + } + } + $docChunkPositions[$docId][] = $chunkIndex; + } + + $text = trim((string)$rows[$chunkId]['text']); + if ($text === '') { + continue; + } + + $out[] = (string)$chunkId; + $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1; + + if (\count($out) >= $limit) { + break; + } + } + + return $out; + } + + // ========================================================= + // ORIGINAL METHODS (UNVERÄNDERT) + // ========================================================= + private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void { $rank = 0; foreach ($hits as $hit) { - if (!isset($hit['chunk_id'], $hit['score'])) { continue; } @@ -184,7 +461,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $rrf = 1 / (self::RRF_K + $rank); if ($boost) { - $rrf *= 1.2; // scoped boost for objections + $rrf *= 1.2; } if (!isset($rrfScores[$chunkId])) { @@ -202,7 +479,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface $docChunkPositions = []; foreach ($chunkIds as $chunkId) { - if (!isset($rows[$chunkId]['text'])) { continue; } @@ -236,7 +512,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $out[] = $text; $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1; - if (count($out) >= $limit) { + if (\count($out) >= $limit) { break; } } @@ -250,7 +526,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface $out = []; foreach ($chunkIds as $id) { - if (!isset($rows[$id]['text'])) { continue; }