From eb9ab2ec48b3e638b298237e8a41ed3d7b0bb380 Mon Sep 17 00:00:00 2001 From: team2 Date: Sun, 1 Mar 2026 10:48:23 +0100 Subject: [PATCH] optimize document loader --- .../Retrieval/NdjsonHybridRetriever.php | 419 +++++++++--------- 1 file changed, 199 insertions(+), 220 deletions(-) diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 538f48b..22073b5 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -18,15 +18,22 @@ final class NdjsonHybridRetriever implements RetrieverInterface { private const VECTOR_SCORE_THRESHOLD = 0.72; + // Guardrails private const HARD_MAX_CHUNKS = 90; private const HARD_MAX_VECTORK = 250; - private const LIST_BONUS = 1.5; + private const LIST_BONUS = 1.25; + // Selection / Fusion private const MAX_CHUNKS_PER_DOC = 2; private const MIN_CHUNK_DISTANCE = 2; private const RRF_K = 60; + // Hardening (nur Edge-Cases; Standardverhalten bleibt gleich) + private const THRESHOLD_FLOOR = 0.65; + private const THRESHOLD_CEIL = 0.90; + private const EMPTY_RRF_FALLBACK_TOPN = 5; + public function __construct( private readonly NdjsonChunkLookup $lookup, private readonly VectorSearchClient $vectorClient, @@ -66,10 +73,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface $entityLabel = $this->catalogIntent->detect($prompt); // 2) Intent (regelbasiert) - $intent = (string)($this->salesIntentLite->detect($prompt)['intent'] ?? SalesIntentLite::DISCOVERY); + $salesIntent = $this->detectSalesIntent($prompt); // 3) Route bestimmen (Intent + Entity) - $route = $this->routeResolver->resolve($intent, $entityLabel); + $route = $this->routeResolver->resolve($salesIntent, $entityLabel); // 4) Early Exit nur für catalog_list if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) { @@ -84,41 +91,30 @@ final class NdjsonHybridRetriever implements RetrieverInterface // NORMALER CORE // ------------------------------------------------------------ - $core = $this->runCore($prompt, $config, false); + $core = $this->runCore($prompt, $config, false, $salesIntent); if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) { return []; } - // ✅ ORIGINAL: Normal Mode -> Sales optimized selection if (!$core['is_list_query']) { - return $this->collectSalesOptimized( - $core['ranked_chunk_ids'], - $core['rows'], - $core['limit'] - ); + $selectedIds = $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']); + return $this->collectTextsFromIds($selectedIds, $core['rows']); } - // ✅ ORIGINAL: List Mode -> simple collectTexts - return $this->collectTexts( - $core['ranked_chunk_ids'], - $core['rows'], - $core['limit'] - ); + $selectedIds = $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']); + return $this->collectTextsFromIds($selectedIds, $core['rows']); } // ========================================================= - // DEBUG (unverändert, kein Early-Exit damit Debug immer Core zeigt) + // DEBUG (deterministisch: gleiche Intent-Bestimmung wie Prod) // ========================================================= /** - * Gibt genau DIE Treffer zurück, die auch in Produktion ausgewählt werden, - * plus Scores/Meta pro ausgewähltem Chunk. - * * @return arrayrunCore($prompt, $config, true); + $salesIntent = $this->detectSalesIntent($prompt); + + // Debug zeigt Core ohne Early Exit, aber mit identischem Intent-Input. + $core = $this->runCore($prompt, $config, true, $salesIntent); if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) { return []; } $selectedChunkIds = $core['is_list_query'] - ? $this->selectChunkIdsListMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']) - : $this->selectChunkIdsSalesMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']); + ? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']) + : $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']); if ($selectedChunkIds === []) { return []; @@ -162,7 +161,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $out[] = [ 'rank' => $rank, - 'chunk_id' => $chunkId, + 'chunk_id' => (string)$chunkId, 'document_id' => isset($core['rows'][$chunkId]['document_id']) ? (string)$core['rows'][$chunkId]['document_id'] : null, 'raw_score' => isset($core['raw_scores'][$chunkId]) ? (float)$core['raw_scores'][$chunkId] : null, 'rrf_score' => isset($core['rrf_scores'][$chunkId]) ? (float)$core['rrf_scores'][$chunkId] : null, @@ -177,7 +176,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface } // ========================================================= - // CORE PIPELINE (einmalig, shared) + // CORE PIPELINE // ========================================================= /** @@ -187,25 +186,109 @@ final class NdjsonHybridRetriever implements RetrieverInterface * sales_intent:string, * threshold:float, * topk:int, - * ranked_chunk_ids: string[], - * rows: array>, - * rrf_scores: array, - * raw_scores: array + * ranked_chunk_ids:string[], + * rows:array>, + * rrf_scores:array, + * raw_scores:array * } */ - private function runCore(string $prompt, ModelGenerationConfig $config, bool $withScores): array - { + private function runCore( + string $prompt, + ModelGenerationConfig $config, + bool $withScores, + string $salesIntent + ): array { $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK)); $isListQuery = $this->intentLite->isListQuery($prompt); - $salesIntent = $this->salesIntentLite->detect($prompt)['intent']; $cleanQuery = $this->queryCleaner->clean($prompt); if ($cleanQuery === '') { $cleanQuery = $prompt; } + [$threshold, $topK] = $this->computeThresholdAndTopK($salesIntent, $isListQuery, $vectorTopKBase); + + // Candidate Routing (keine Set-Map nötig; scoped nur wenn IDs existieren) + $candidateDocIds = $this->tagRouting->route($cleanQuery); + $candidateDocIds = is_array($candidateDocIds) ? array_values(array_unique(array_filter($candidateDocIds, 'is_string'))) : []; + + $globalHits = $this->vectorClient->search($cleanQuery, $topK); + + $scopedHits = []; + if ($candidateDocIds !== []) { + $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds); + } + + if ($globalHits === [] && $scopedHits === []) { + return [ + 'limit' => $limit, + 'is_list_query' => $isListQuery, + 'sales_intent' => $salesIntent, + 'threshold' => $threshold, + 'topk' => $topK, + 'ranked_chunk_ids' => [], + 'rows' => [], + 'rrf_scores' => [], + 'raw_scores' => [], + ]; + } + + $fused = $this->fuseHits( + $globalHits, + $scopedHits, + $threshold, + $salesIntent === SalesIntentLite::OBJECTION, + $withScores + ); + + $rrfScores = $fused['rrf_scores']; + $rawScores = $fused['raw_scores']; + + // 🛡 Hardening: wenn Threshold alles rausfiltert, aber globale Hits existieren, + // nehmen wir Top-N als minimalen Kontext. Greift nur in Edge-Cases. + if ($rrfScores === [] && $globalHits !== []) { + $rrfScores = $this->fallbackRrfFromHits($globalHits, self::EMPTY_RRF_FALLBACK_TOPN); + } + + if ($rrfScores === []) { + return [ + 'limit' => $limit, + 'is_list_query' => $isListQuery, + 'sales_intent' => $salesIntent, + 'threshold' => $threshold, + 'topk' => $topK, + 'ranked_chunk_ids' => [], + 'rows' => [], + 'rrf_scores' => [], + 'raw_scores' => $rawScores, + ]; + } + + arsort($rrfScores); + $rankedChunkIds = array_keys($rrfScores); + + $rows = $this->lookup->findByChunkIds($rankedChunkIds); + + return [ + 'limit' => $limit, + 'is_list_query' => $isListQuery, + 'sales_intent' => $salesIntent, + 'threshold' => $threshold, + 'topk' => $topK, + 'ranked_chunk_ids' => $rankedChunkIds, + 'rows' => $rows, + 'rrf_scores' => $rrfScores, + 'raw_scores' => $rawScores, + ]; + } + + /** + * @return array{0: float, 1: int} threshold, topK + */ + private function computeThresholdAndTopK(string $salesIntent, bool $isListQuery, int $vectorTopKBase): array + { $threshold = self::VECTOR_SCORE_THRESHOLD; $topK = $vectorTopKBase; @@ -242,124 +325,106 @@ final class NdjsonHybridRetriever implements RetrieverInterface $topK = max(1, min($topK, self::HARD_MAX_VECTORK)); - $candidateDocIds = $this->tagRouting->route($cleanQuery); - $candidateSet = null; + // Enterprise clamp: verhindert Drift, ohne den aktuellen Normalfall zu ändern. + $threshold = max(self::THRESHOLD_FLOOR, min(self::THRESHOLD_CEIL, $threshold)); - if (is_array($candidateDocIds) && $candidateDocIds !== []) { - $candidateSet = array_fill_keys($candidateDocIds, true); - } - - $globalHits = $this->vectorClient->search($cleanQuery, $topK); - - $scopedHits = []; - if ($candidateSet !== null) { - $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, array_keys($candidateSet)); - } - - if ($globalHits === [] && $scopedHits === []) { - return [ - 'limit' => $limit, - 'is_list_query' => $isListQuery, - 'sales_intent' => (string)$salesIntent, - 'threshold' => $threshold, - 'topk' => $topK, - 'ranked_chunk_ids' => [], - 'rows' => [], - 'rrf_scores' => [], - 'raw_scores' => [], - ]; - } + return [$threshold, $topK]; + } + /** + * @return array{ + * rrf_scores: array, + * raw_scores: array + * } + */ + private function fuseHits( + array $globalHits, + array $scopedHits, + float $threshold, + bool $boostScoped, + bool $captureRaw + ): array { $rrfScores = []; $rawScores = []; - $this->applyRrfWithOptionalRaw($globalHits, $rrfScores, $rawScores, $threshold, false, $withScores); - $this->applyRrfWithOptionalRaw( - $scopedHits, - $rrfScores, - $rawScores, - $threshold, - $salesIntent === SalesIntentLite::OBJECTION, - $withScores - ); + $apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void { + $rank = 0; - if ($rrfScores === []) { - return [ - 'limit' => $limit, - 'is_list_query' => $isListQuery, - 'sales_intent' => (string)$salesIntent, - 'threshold' => $threshold, - 'topk' => $topK, - 'ranked_chunk_ids' => [], - 'rows' => [], - 'rrf_scores' => [], - 'raw_scores' => $rawScores, - ]; - } + foreach ($hits as $hit) { + if (!isset($hit['chunk_id'], $hit['score'])) { + continue; + } - arsort($rrfScores); - $rankedChunkIds = array_keys($rrfScores); + $raw = (float)$hit['score']; + if ($raw < $threshold) { + continue; + } - $rows = $this->lookup->findByChunkIds($rankedChunkIds); + $chunkId = (string)$hit['chunk_id']; + + if ($captureRaw) { + $rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw); + } + + $rank++; + $rrf = 1.0 / (self::RRF_K + $rank); + + if ($boost) { + $rrf *= 1.2; + } + + $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf; + } + }; + + $apply($globalHits, false); + $apply($scopedHits, $boostScoped); return [ - 'limit' => $limit, - 'is_list_query' => $isListQuery, - 'sales_intent' => (string)$salesIntent, - 'threshold' => $threshold, - 'topk' => $topK, - 'ranked_chunk_ids' => $rankedChunkIds, - 'rows' => $rows, 'rrf_scores' => $rrfScores, 'raw_scores' => $rawScores, ]; } - private function applyRrfWithOptionalRaw( - array $hits, - array &$rrfScores, - array &$rawScores, - float $threshold, - bool $boost = false, - bool $captureRaw = false - ): void + /** + * Minimaler Fallback: baut RRF nur aus der Reihenfolge (ohne Threshold), + * damit Edge-Cases nicht leer laufen. + * + * @return array + */ + private function fallbackRrfFromHits(array $hits, int $topN): array { + $rrf = []; $rank = 0; foreach ($hits as $hit) { - if (!isset($hit['chunk_id'], $hit['score'])) { + if (!isset($hit['chunk_id'])) { continue; } - $raw = (float)$hit['score']; - if ($raw < $threshold) { - continue; - } - - $chunkId = (string)$hit['chunk_id']; - - if ($captureRaw) { - if (!isset($rawScores[$chunkId]) || $raw > $rawScores[$chunkId]) { - $rawScores[$chunkId] = $raw; - } - } - $rank++; - $rrf = 1 / (self::RRF_K + $rank); + $chunkId = (string)$hit['chunk_id']; + $rrf[$chunkId] = 1.0 / (self::RRF_K + $rank); - if ($boost) { - $rrf *= 1.2; + if ($rank >= $topN) { + break; } - - if (!isset($rrfScores[$chunkId])) { - $rrfScores[$chunkId] = 0.0; - } - - $rrfScores[$chunkId] += $rrf; } + + return $rrf; } - private function selectChunkIdsListMode(array $chunkIds, array $rows, int $limit): array + private function detectSalesIntent(string $prompt): string + { + $data = $this->salesIntentLite->detect($prompt); + return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY); + } + + // ========================================================= + // SELECTION (shared) + // ========================================================= + + private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array { $seen = []; $out = []; @@ -374,7 +439,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface continue; } - $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)); + // Dedupe Key (billig & stabil) + $key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk))); if (isset($seen[$key])) { continue; @@ -391,7 +457,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface return $out; } - private function selectChunkIdsSalesMode(array $chunkIds, array $rows, int $limit): array + private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array { $out = []; $docCounter = []; @@ -439,88 +505,12 @@ final class NdjsonHybridRetriever implements RetrieverInterface return $out; } - private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void + // ========================================================= + // COLLECT (shared) + // ========================================================= + + private function collectTextsFromIds(array $chunkIds, array $rows): array { - $rank = 0; - - foreach ($hits as $hit) { - if (!isset($hit['chunk_id'], $hit['score'])) { - continue; - } - - $raw = (float)$hit['score']; - if ($raw < $threshold) { - continue; - } - - $chunkId = (string)$hit['chunk_id']; - - $rank++; - $rrf = 1 / (self::RRF_K + $rank); - - if ($boost) { - $rrf *= 1.2; - } - - if (!isset($rrfScores[$chunkId])) { - $rrfScores[$chunkId] = 0.0; - } - - $rrfScores[$chunkId] += $rrf; - } - } - - private function collectSalesOptimized(array $chunkIds, array $rows, int $limit): array - { - $out = []; - $docCounter = []; - $docChunkPositions = []; - - foreach ($chunkIds as $chunkId) { - if (!isset($rows[$chunkId]['text'])) { - continue; - } - - $docId = $rows[$chunkId]['document_id'] ?? null; - $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null; - - if (!is_string($docId)) { - continue; - } - - if (($docCounter[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) { - continue; - } - - if (is_int($chunkIndex)) { - $prev = $docChunkPositions[$docId] ?? []; - foreach ($prev as $prevIdx) { - if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) { - continue 2; - } - } - $docChunkPositions[$docId][] = $chunkIndex; - } - - $text = trim((string)$rows[$chunkId]['text']); - if ($text === '') { - continue; - } - - $out[] = $text; - $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1; - - if (\count($out) >= $limit) { - break; - } - } - - return $out; - } - - private function collectTexts(array $chunkIds, array $rows, int $limit): array - { - $seen = []; $out = []; foreach ($chunkIds as $id) { @@ -528,23 +518,12 @@ final class NdjsonHybridRetriever implements RetrieverInterface continue; } - $chunk = trim((string)$rows[$id]['text']); - if ($chunk === '') { + $text = trim((string)$rows[$id]['text']); + if ($text === '') { continue; } - $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)); - - if (isset($seen[$key])) { - continue; - } - - $seen[$key] = true; - $out[] = $chunk; - - if (\count($out) >= $limit) { - break; - } + $out[] = $text; } return $out;