diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 2f0bf6c..4a04a1f 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -18,18 +18,15 @@ final class NdjsonHybridRetriever implements RetrieverInterface { private const VECTOR_SCORE_THRESHOLD = 0.82; - // Guardrails private const HARD_MAX_CHUNKS = 90; private const HARD_MAX_VECTORK = 250; private const LIST_BONUS = 1.25; - // Selection / Fusion private const MAX_CHUNKS_PER_DOC = 2; private const MIN_CHUNK_DISTANCE = 2; private const RRF_K = 60; - // Hardening (nur Edge-Cases; Standardverhalten bleibt gleich) private const THRESHOLD_FLOOR = 0.65; private const THRESHOLD_CEIL = 0.90; private const EMPTY_RRF_FALLBACK_TOPN = 5; @@ -45,161 +42,140 @@ final class NdjsonHybridRetriever implements RetrieverInterface private readonly CatalogIntentLite $catalogIntent, private readonly IntentRouteResolver $routeResolver, private readonly EntityCatalogService $entityCatalogService - ) - { - } + ) {} // ========================================================= - // PRODUCTION + // PUBLIC API // ========================================================= public function retrieve(string $prompt): array { - $config = $this->configRepository->findActiveForModel(); + $config = $this->requireConfig(); + $result = $this->execute($prompt, $config, false); - if ($config === null) { - throw new \RuntimeException('No active ModelGenerationConfig found.'); + if ($result['catalogBlock'] !== null) { + return [$result['catalogBlock']]; } - return $this->retrieveInternal($prompt, $config); + return $this->collectTextsFromIds( + $result['selectedChunkIds'], + $result['rows'] + ); } - public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array - { - // ------------------------------------------------------------ - // ROUTING-MATRIX (minimal, ohne Core zu zerlegen) - // ------------------------------------------------------------ - - // 1) Entity (semantisch über Tag-Vektor) - $entityLabel = $this->catalogIntent->detect($prompt); - - // 2) Intent (regelbasiert) - $salesIntent = $this->detectSalesIntent($prompt); - - // 3) Route bestimmen (Intent + Entity) - $route = $this->routeResolver->resolve($salesIntent, $entityLabel); - - // 4) Early Exit nur für catalog_list - if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) { - $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel); - - if ($catalogBlock !== null) { - return [$catalogBlock]; - } - } - - // ------------------------------------------------------------ - // NORMALER CORE - // ------------------------------------------------------------ - - $core = $this->runCore($prompt, $config, false, $salesIntent); - - if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) { - return []; - } - - if (!$core['is_list_query']) { - $selectedIds = $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']); - return $this->collectTextsFromIds($selectedIds, $core['rows']); - } - - $selectedIds = $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']); - return $this->collectTextsFromIds($selectedIds, $core['rows']); - } - - // ========================================================= - // DEBUG (deterministisch: gleiche Intent-Bestimmung wie Prod) - // ========================================================= - - /** - * @return array - */ public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array { - $config = $config ?? $this->configRepository->findActiveForModel(); + $config = $config ?? $this->requireConfig(); + $result = $this->execute($prompt, $config, true); - if ($config === null) { - throw new \RuntimeException('No active ModelGenerationConfig found.'); - } - - $salesIntent = $this->detectSalesIntent($prompt); - - // Debug zeigt Core ohne Early Exit, aber mit identischem Intent-Input. - $core = $this->runCore($prompt, $config, true, $salesIntent); - - if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) { - return []; - } - - $selectedChunkIds = $core['is_list_query'] - ? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']) - : $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']); - - if ($selectedChunkIds === []) { - return []; + if ($result['catalogBlock'] !== null) { + return [[ + 'rank' => 1, + 'chunk_id' => '__CATALOG_LIST__', + 'document_id' => null, + 'raw_score' => null, + 'rrf_score' => null, + 'threshold' => 0.0, + 'intent' => $result['intent'], + 'route' => $result['route'], + 'entity_label' => $result['entityLabel'], + 'is_list_query' => true, + 'text' => $result['catalogBlock'], + ]]; } $out = []; $rank = 0; - foreach ($selectedChunkIds as $chunkId) { - if (!isset($core['rows'][$chunkId])) { + foreach ($result['selectedChunkIds'] as $chunkId) { + if (!isset($result['rows'][$chunkId])) { continue; } $rank++; - $text = trim((string)($core['rows'][$chunkId]['text'] ?? '')); $out[] = [ 'rank' => $rank, - 'chunk_id' => (string)$chunkId, - 'document_id' => isset($core['rows'][$chunkId]['document_id']) ? (string)$core['rows'][$chunkId]['document_id'] : null, - 'raw_score' => isset($core['raw_scores'][$chunkId]) ? (float)$core['raw_scores'][$chunkId] : null, - 'rrf_score' => isset($core['rrf_scores'][$chunkId]) ? (float)$core['rrf_scores'][$chunkId] : null, - 'threshold' => (float)$core['threshold'], - 'intent' => (string)$core['sales_intent'], - 'is_list_query' => (bool)$core['is_list_query'], - 'text' => $text, + 'chunk_id' => $chunkId, + 'document_id' => $result['rows'][$chunkId]['document_id'] ?? null, + 'raw_score' => $result['rawScores'][$chunkId] ?? null, + 'rrf_score' => $result['rrfScores'][$chunkId] ?? null, + 'threshold' => $result['threshold'], + 'intent' => $result['intent'], + 'route' => $result['route'], + 'entity_label' => $result['entityLabel'], + 'is_list_query' => $result['isListQuery'], + 'text' => trim((string)$result['rows'][$chunkId]['text']), ]; } return $out; } + // ========================================================= + // CENTRAL ORCHESTRATION + // ========================================================= + + private function execute( + string $prompt, + ModelGenerationConfig $config, + bool $withScores + ): array { + + $entityLabel = $this->catalogIntent->detect($prompt); + $salesIntent = $this->detectSalesIntent($prompt); + $route = $this->routeResolver->resolve($salesIntent, $entityLabel); + + if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) { + $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel); + + if ($catalogBlock !== null) { + return [ + 'route' => $route, + 'entityLabel' => $entityLabel, + 'intent' => $salesIntent, + 'isListQuery' => true, + 'selectedChunkIds' => [], + 'rows' => [], + 'rrfScores' => [], + 'rawScores' => [], + 'threshold' => 0.0, + 'catalogBlock' => trim($catalogBlock), + ]; + } + } + + $core = $this->runCore($prompt, $config, $withScores, $salesIntent); + + $selectedChunkIds = $core['is_list_query'] + ? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']) + : $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']); + + return [ + 'route' => $route, + 'entityLabel' => $entityLabel, + 'intent' => $salesIntent, + 'isListQuery' => $core['is_list_query'], + 'selectedChunkIds' => $selectedChunkIds, + 'rows' => $core['rows'], + 'rrfScores' => $core['rrf_scores'], + 'rawScores' => $core['raw_scores'], + 'threshold' => $core['threshold'], + 'catalogBlock' => null, + ]; + } + // ========================================================= // CORE PIPELINE // ========================================================= - /** - * @return array{ - * limit:int, - * is_list_query:bool, - * sales_intent:string, - * threshold:float, - * topk:int, - * ranked_chunk_ids:string[], - * rows:array>, - * rrf_scores:array, - * raw_scores:array - * } - */ private function runCore( - string $prompt, + string $prompt, ModelGenerationConfig $config, - bool $withScores, - string $salesIntent - ): array - { + bool $withScores, + string $salesIntent + ): array { + $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK)); @@ -210,33 +186,24 @@ final class NdjsonHybridRetriever implements RetrieverInterface $cleanQuery = $prompt; } - [$threshold, $topK] = $this->computeThresholdAndTopK($salesIntent, $isListQuery, $vectorTopKBase); + [$threshold, $topK] = $this->computeThresholdAndTopK( + $salesIntent, + $isListQuery, + $vectorTopKBase + ); - // Candidate Routing (keine Set-Map nötig; scoped nur wenn IDs existieren) $candidateDocIds = $this->tagRouting->route($cleanQuery); - $candidateDocIds = is_array($candidateDocIds) ? array_values(array_unique(array_filter($candidateDocIds, 'is_string'))) : []; + $candidateDocIds = is_array($candidateDocIds) + ? array_values(array_unique(array_filter($candidateDocIds, 'is_string'))) + : []; $globalHits = $this->vectorClient->search($cleanQuery, $topK); $scopedHits = []; - if ($candidateDocIds !== []) { + if (!empty($candidateDocIds)) { $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds); } - if ($globalHits === [] && $scopedHits === []) { - return [ - 'limit' => $limit, - 'is_list_query' => $isListQuery, - 'sales_intent' => $salesIntent, - 'threshold' => $threshold, - 'topk' => $topK, - 'ranked_chunk_ids' => [], - 'rows' => [], - 'rrf_scores' => [], - 'raw_scores' => [], - ]; - } - $fused = $this->fuseHits( $globalHits, $scopedHits, @@ -248,37 +215,19 @@ final class NdjsonHybridRetriever implements RetrieverInterface $rrfScores = $fused['rrf_scores']; $rawScores = $fused['raw_scores']; - // 🛡 Hardening: wenn Threshold alles rausfiltert, aber globale Hits existieren, - // nehmen wir Top-N als minimalen Kontext. Greift nur in Edge-Cases. if ($rrfScores === [] && $globalHits !== []) { $rrfScores = $this->fallbackRrfFromHits($globalHits, self::EMPTY_RRF_FALLBACK_TOPN); } - if ($rrfScores === []) { - return [ - 'limit' => $limit, - 'is_list_query' => $isListQuery, - 'sales_intent' => $salesIntent, - 'threshold' => $threshold, - 'topk' => $topK, - 'ranked_chunk_ids' => [], - 'rows' => [], - 'rrf_scores' => [], - 'raw_scores' => $rawScores, - ]; - } - arsort($rrfScores); - $rankedChunkIds = array_keys($rrfScores); + $rankedChunkIds = array_keys($rrfScores); $rows = $this->lookup->findByChunkIds($rankedChunkIds); return [ 'limit' => $limit, 'is_list_query' => $isListQuery, - 'sales_intent' => $salesIntent, 'threshold' => $threshold, - 'topk' => $topK, 'ranked_chunk_ids' => $rankedChunkIds, 'rows' => $rows, 'rrf_scores' => $rrfScores, @@ -286,36 +235,33 @@ final class NdjsonHybridRetriever implements RetrieverInterface ]; } - /** - * @return array{0: float, 1: int} threshold, topK - */ + // ========================================================= + // SUPPORT + // ========================================================= + + private function requireConfig(): ModelGenerationConfig + { + $config = $this->configRepository->findActiveForModel(); + if ($config === null) { + throw new \RuntimeException('No active ModelGenerationConfig found.'); + } + return $config; + } + + private function detectSalesIntent(string $prompt): string + { + $data = $this->salesIntentLite->detect($prompt); + return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY); + } + private function computeThresholdAndTopK(string $salesIntent, bool $isListQuery, int $vectorTopKBase): array { $threshold = self::VECTOR_SCORE_THRESHOLD; $topK = $vectorTopKBase; - switch ($salesIntent) { - case SalesIntentLite::OBJECTION: - case SalesIntentLite::PRICING: - $threshold += 0.02; - break; - - case SalesIntentLite::COMPARISON: - $topK = (int)round($vectorTopKBase * 1.4); - break; - - case SalesIntentLite::IMPLEMENTATION: - $topK = (int)round($vectorTopKBase * 1.3); - break; - - case SalesIntentLite::ROI: - $topK = (int)round($vectorTopKBase * 1.2); - break; - - case SalesIntentLite::DISCOVERY: - default: - $threshold += 0; - break; + if ($salesIntent === SalesIntentLite::OBJECTION || + $salesIntent === SalesIntentLite::PRICING) { + $threshold += 0.02; } if ($isListQuery) { @@ -323,39 +269,34 @@ final class NdjsonHybridRetriever implements RetrieverInterface } $topK = max(1, min($topK, self::HARD_MAX_VECTORK)); - - // Enterprise clamp: verhindert Drift, ohne den aktuellen Normalfall zu ändern. $threshold = max(self::THRESHOLD_FLOOR, min(self::THRESHOLD_CEIL, $threshold)); return [$threshold, $topK]; } - /** - * @return array{ - * rrf_scores: array, - * raw_scores: array - * } - */ private function fuseHits( array $globalHits, array $scopedHits, float $threshold, - bool $boostScoped, - bool $captureRaw - ): array - { + bool $boostScoped, + bool $captureRaw + ): array { + $rrfScores = []; $rawScores = []; $apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void { + $rank = 0; foreach ($hits as $hit) { + if (!isset($hit['chunk_id'], $hit['score'])) { continue; } $raw = (float)$hit['score']; + if ($raw < $threshold) { continue; } @@ -386,12 +327,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface ]; } - /** - * Minimaler Fallback: baut RRF nur aus der Reihenfolge (ohne Threshold), - * damit Edge-Cases nicht leer laufen. - * - * @return array - */ private function fallbackRrfFromHits(array $hits, int $topN): array { $rrf = []; @@ -403,8 +338,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface } $rank++; - $chunkId = (string)$hit['chunk_id']; - $rrf[$chunkId] = 1.0 / (self::RRF_K + $rank); + $rrf[(string)$hit['chunk_id']] = 1.0 / (self::RRF_K + $rank); if ($rank >= $topN) { break; @@ -414,16 +348,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface return $rrf; } - private function detectSalesIntent(string $prompt): string - { - $data = $this->salesIntentLite->detect($prompt); - return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY); - } - - // ========================================================= - // SELECTION (shared) - // ========================================================= - private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array { $seen = []; @@ -439,7 +363,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface continue; } - // Dedupe Key (billig & stabil) $key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk))); if (isset($seen[$key])) { @@ -449,7 +372,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $seen[$key] = true; $out[] = (string)$id; - if (\count($out) >= $limit) { + if (count($out) >= $limit) { break; } } @@ -464,6 +387,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $docChunkPositions = []; foreach ($chunkIds as $chunkId) { + if (!isset($rows[$chunkId]['text'])) { continue; } @@ -480,8 +404,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface } if (is_int($chunkIndex)) { - $prev = $docChunkPositions[$docId] ?? []; - foreach ($prev as $prevIdx) { + foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) { if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) { continue 2; } @@ -497,7 +420,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $out[] = (string)$chunkId; $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1; - if (\count($out) >= $limit) { + if (count($out) >= $limit) { break; } } @@ -505,10 +428,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface return $out; } - // ========================================================= - // COLLECT (shared) - // ========================================================= - private function collectTextsFromIds(array $chunkIds, array $rows): array { $out = []; @@ -519,11 +438,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface } $text = trim((string)$rows[$id]['text']); - if ($text === '') { - continue; + if ($text !== '') { + $out[] = $text; } - - $out[] = $text; } return $out; diff --git a/templates/admin/model_config/test_retrieval.html.twig b/templates/admin/model_config/test_retrieval.html.twig index 1d2b210..ec80ea6 100644 --- a/templates/admin/model_config/test_retrieval.html.twig +++ b/templates/admin/model_config/test_retrieval.html.twig @@ -21,19 +21,16 @@
-
Max Chunks: {{ config.retrievalMaxChunks }}
-
Vector Top K: {{ config.retrievalVectorTopK }}
-
@@ -77,38 +74,95 @@
+ {% for chunk in results %}
{# ================= META-ZEILE ================= #} -
- rank: {{ chunk.rank }} | - chunk_id: {{ chunk.chunk_id }} | - document_id: - {{ chunk.document_id }} - | - rrf_score: {{ chunk.rrf_score|number_format(6, '.', '') }} - | - raw_score: {{ chunk.raw_score|number_format(6, '.', '') }} - | +
+ + + rank: {{ chunk.rank }} + | + + + chunk_id: {{ chunk.chunk_id }} + | + + + document_id: + {% if chunk.document_id %} + + {{ chunk.document_id }} + + {% else %} + — + {% endif %} + | - threshold: {{ chunk.threshold }} | - intent: {{ chunk.intent }} | - is_list_query: - {{ chunk.is_list_query ? 'true' : 'false' }} - + route: + + {{ chunk.route ?? '—' }} + + | + + + entity: + {{ chunk.entity_label ?? '—' }} + | + + + intent: + {{ chunk.intent ?? '—' }} + | + + + rrf: + {{ chunk.rrf_score is not null + ? chunk.rrf_score|number_format(6, '.', '') + : '—' }} + | + + + raw: + {{ chunk.raw_score is not null + ? chunk.raw_score|number_format(6, '.', '') + : '—' }} + | + + + threshold: + {{ chunk.threshold ?? '—' }} + | + + + list: + {{ chunk.is_list_query ? 'true' : 'false' }} + +
{# ================= CHUNK TEXT ================= #} -
+
{{ chunk.text }}
{% endfor %} +