configRepository->findActiveForModel();
        if ($config === null) {
            throw new \RuntimeException('No active ModelGenerationConfig found.');
        }

        return $this->retrieveInternal($prompt, $config);
    }

    /**
     * Production retrieval path for an explicitly supplied generation config.
     *
     * Order of evaluation:
     *  1. Entity catalog early exit: when the catalog-intent detector returns a term and the
     *     entity catalog service returns a block for it, that block is the sole result and the
     *     vector pipeline is not run.
     *  2. Otherwise the shared core pipeline ranks chunks, and the final selection depends on
     *     whether the prompt was classified as a list query.
     *
     * @return array Either a single-element array holding the catalog block, or the trimmed
     *               chunk texts in rank order; empty when nothing qualifies.
     */
    public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
    {
        // Entity catalog early exit (now also active in the admin test).
        $entityTerm = $this->catalogIntent->detect($prompt);
        if ($entityTerm !== null) {
            $catalogBlock = $this->entityCatalogService->listByTerm($entityTerm);
            if ($catalogBlock !== null) {
                return [$catalogBlock];
            }
        }

        $core = $this->runCore($prompt, $config, false);

        if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
            return [];
        }

        // Normal mode: sales-optimised selection (per-document cap and minimum chunk distance).
        if (!$core['is_list_query']) {
            return $this->collectSalesOptimized(
                $core['ranked_chunk_ids'],
                $core['rows'],
                $core['limit']
            );
        }

        // List mode: plain de-duplicated texts.
        return $this->collectTexts(
            $core['ranked_chunk_ids'],
            $core['rows'],
            $core['limit']
        );
    }

    // =========================================================
    // DEBUG (new, but not on the production path)
    // =========================================================

    /**
     * Returns the chunks that the vector pipeline selects in production, plus scores and
     * metadata for each selected chunk.
     *
     * The selection uses the same selectChunkIds*() methods that the production collectors
     * delegate to, so both paths pick the same chunk IDs.
     *
     * NOTE(review): unlike retrieveInternal(), this method does not apply the entity catalog
     * early exit, so for prompts that production answers from the catalog the output here
     * shows vector hits that production would not use. Confirm whether that is intended.
     *
     * @return array List of rows with the keys rank, chunk_id, document_id, raw_score,
     *               rrf_score, threshold, intent, is_list_query and text.
     *
     * @throws \RuntimeException When no active ModelGenerationConfig exists.
     */
    public function retrieveDebug(string $prompt): array
    {
        $config = $this->configRepository->findActiveForModel();
        if ($config === null) {
            throw new \RuntimeException('No active ModelGenerationConfig found.');
        }

        $core = $this->runCore($prompt, $config, true);

        if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
            return [];
        }

        $selectedChunkIds = $core['is_list_query']
            ? $this->selectChunkIdsListMode($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
            : $this->selectChunkIdsSalesMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']);

        if ($selectedChunkIds === []) {
            return [];
        }

        $out = [];
        $rank = 0;

        foreach ($selectedChunkIds as $chunkId) {
            if (!isset($core['rows'][$chunkId])) {
                continue;
            }

            $row = $core['rows'][$chunkId];

            // The rank counts emitted rows only, so it stays gap-free.
            $rank++;

            $out[] = [
                'rank' => $rank,
                'chunk_id' => $chunkId,
                'document_id' => isset($row['document_id']) ? (string)$row['document_id'] : null,
                'raw_score' => isset($core['raw_scores'][$chunkId]) ? (float)$core['raw_scores'][$chunkId] : null,
                'rrf_score' => isset($core['rrf_scores'][$chunkId]) ? (float)$core['rrf_scores'][$chunkId] : null,
                'threshold' => (float)$core['threshold'],
                'intent' => (string)$core['sales_intent'],
                'is_list_query' => (bool)$core['is_list_query'],
                'text' => trim((string)($row['text'] ?? '')),
            ];
        }

        return $out;
    }

    // =========================================================
    // CORE PIPELINE (single implementation, shared)
    // =========================================================

    /**
     * Runs intent detection, vector search and rank fusion once for both the production and
     * the debug path.
     *
     * Intent-specific tuning, relative to VECTOR_SCORE_THRESHOLD and the configured top-k:
     *  - PRICING:        threshold + 0.02
     *  - COMPARISON:     top-k * 1.4
     *  - OBJECTION:      threshold - 0.02 (scoped hits are additionally boosted during fusion)
     *  - IMPLEMENTATION: top-k * 1.3
     *  - ROI:            top-k * 1.2
     *  - DISCOVERY and anything else: threshold - 0.03
     * List queries multiply top-k by LIST_BONUS; the result is clamped to [1, HARD_MAX_VECTORK].
     *
     * @param bool $withScores When true, the best raw vector score per chunk is captured in
     *                         raw_scores; otherwise raw_scores stays empty.
     *
     * @return array{
     *   limit:int,
     *   is_list_query:bool,
     *   sales_intent:string,
     *   threshold:float,
     *   topk:int,
     *   ranked_chunk_ids: string[],
     *   rows: array<string, array<string, mixed>>,
     *   rrf_scores: array<string, float>,
     *   raw_scores: array<string, float>
     * }
     */
    private function runCore(string $prompt, ModelGenerationConfig $config, bool $withScores): array
    {
        $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
        $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));

        $isListQuery = $this->intentLite->isListQuery($prompt);
        $salesIntent = $this->salesIntentLite->detect($prompt)['intent'];

        // Fall back to the raw prompt when cleaning strips everything.
        $cleanQuery = $this->queryCleaner->clean($prompt);
        if ($cleanQuery === '') {
            $cleanQuery = $prompt;
        }

        $threshold = self::VECTOR_SCORE_THRESHOLD;
        $topK = $vectorTopKBase;

        switch ($salesIntent) {
            case SalesIntentLite::PRICING:
                $threshold += 0.02;
                break;
            case SalesIntentLite::COMPARISON:
                $topK = (int)round($vectorTopKBase * 1.4);
                break;
            case SalesIntentLite::OBJECTION:
                $threshold -= 0.02;
                break;
            case SalesIntentLite::IMPLEMENTATION:
                $topK = (int)round($vectorTopKBase * 1.3);
                break;
            case SalesIntentLite::ROI:
                $topK = (int)round($vectorTopKBase * 1.2);
                break;
            case SalesIntentLite::DISCOVERY:
            default:
                $threshold -= 0.03;
                break;
        }

        if ($isListQuery) {
            $topK = (int)round($topK * self::LIST_BONUS);
        }

        $topK = max(1, min($topK, self::HARD_MAX_VECTORK));

        // Result skeleton shared by the early exits and the final return.
        $result = [
            'limit' => $limit,
            'is_list_query' => $isListQuery,
            'sales_intent' => (string)$salesIntent,
            'threshold' => $threshold,
            'topk' => $topK,
            'ranked_chunk_ids' => [],
            'rows' => [],
            'rrf_scores' => [],
            'raw_scores' => [],
        ];

        // Using the routed document IDs as array keys removes duplicates before the scoped search.
        $candidateDocIds = $this->tagRouting->route($cleanQuery);
        $candidateSet = null;
        if (is_array($candidateDocIds) && $candidateDocIds !== []) {
            $candidateSet = array_fill_keys($candidateDocIds, true);
        }

        $globalHits = $this->vectorClient->search($cleanQuery, $topK);

        $scopedHits = [];
        if ($candidateSet !== null) {
            $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, array_keys($candidateSet));
        }

        if ($globalHits === [] && $scopedHits === []) {
            return $result;
        }

        $rrfScores = [];
        $rawScores = [];

        $this->applyRrfWithOptionalRaw($globalHits, $rrfScores, $rawScores, $threshold, false, $withScores);
        $this->applyRrfWithOptionalRaw(
            $scopedHits,
            $rrfScores,
            $rawScores,
            $threshold,
            $salesIntent === SalesIntentLite::OBJECTION,
            $withScores
        );

        // Raw scores are only recorded for hits that passed the threshold, so this stays empty
        // whenever no fused score exists; the assignment keeps the original return shape explicit.
        $result['raw_scores'] = $rawScores;

        if ($rrfScores === []) {
            return $result;
        }

        // Highest fused score first; the keys are the chunk IDs.
        arsort($rrfScores);
        $rankedChunkIds = array_keys($rrfScores);

        $result['ranked_chunk_ids'] = $rankedChunkIds;
        $result['rows'] = $this->lookup->findByChunkIds($rankedChunkIds);
        $result['rrf_scores'] = $rrfScores;

        return $result;
    }

    /**
     * Adds the reciprocal-rank-fusion contribution of one hit list to $rrfScores.
     *
     * Hits lacking chunk_id or score, and hits scoring below $threshold, are skipped and do
     * not consume a rank. Each remaining hit adds 1 / (RRF_K + rank) to its chunk, multiplied
     * by 1.2 when $boost is set.
     *
     * @param array $hits       Hits, each expected to carry chunk_id and score.
     * @param array $rrfScores  Accumulated fused score per chunk ID (updated in place).
     * @param array $rawScores  Best raw score per chunk ID (updated in place if $captureRaw).
     * @param float $threshold  Minimum raw score a hit needs to be counted.
     * @param bool  $boost      Whether to weight this list's contributions by 1.2.
     * @param bool  $captureRaw Whether to record raw scores at all.
     */
    private function applyRrfWithOptionalRaw(
        array $hits,
        array &$rrfScores,
        array &$rawScores,
        float $threshold,
        bool $boost = false,
        bool $captureRaw = false
    ): void {
        $rank = 0;

        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }

            $raw = (float)$hit['score'];
            if ($raw < $threshold) {
                continue;
            }

            $chunkId = (string)$hit['chunk_id'];

            // Keep the highest raw score when a chunk appears in several hit lists.
            if ($captureRaw) {
                if (!isset($rawScores[$chunkId]) || $raw > $rawScores[$chunkId]) {
                    $rawScores[$chunkId] = $raw;
                }
            }

            $rank++;
            $rrf = 1 / (self::RRF_K + $rank);

            if ($boost) {
                $rrf *= 1.2;
            }

            if (!isset($rrfScores[$chunkId])) {
                $rrfScores[$chunkId] = 0.0;
            }

            $rrfScores[$chunkId] += $rrf;
        }
    }

    /**
     * List-mode selection: walks the ranked chunk IDs and keeps the first $limit chunks whose
     * text is non-blank and not a duplicate of an earlier one.
     *
     * Two texts count as duplicates when they are equal after trimming, collapsing whitespace
     * runs to a single space and lower-casing.
     *
     * @param array $chunkIds Chunk IDs in rank order.
     * @param array $rows     Chunk rows keyed by chunk ID; only the text key is read.
     * @param int   $limit    Maximum number of IDs to return.
     *
     * @return string[] Selected chunk IDs in rank order.
     */
    private function selectChunkIdsListMode(array $chunkIds, array $rows, int $limit): array
    {
        $seen = [];
        $out = [];

        foreach ($chunkIds as $id) {
            if (!isset($rows[$id]['text'])) {
                continue;
            }

            $chunk = trim((string)$rows[$id]['text']);
            if ($chunk === '') {
                continue;
            }

            // preg_replace() returns null when the subject is not valid UTF-8 (the /u modifier
            // is set). Fall back to the trimmed text so such chunks do not all share the empty
            // key and get discarded as duplicates of each other.
            $collapsed = preg_replace('/\s+/u', ' ', $chunk);
            $key = mb_strtolower($collapsed ?? $chunk);

            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = (string)$id;

            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Sales-mode selection: walks the ranked chunk IDs and keeps the first $limit chunks while
     * limiting each document to MAX_CHUNKS_PER_DOC chunks and requiring a gap of at least
     * MIN_CHUNK_DISTANCE between the chunk_index values of chunks kept from the same document.
     *
     * Rows without text, with blank text, or whose document_id is not a string are skipped.
     * The distance rule only applies when chunk_index is an integer.
     *
     * @param array $chunkIds Chunk IDs in rank order.
     * @param array $rows     Chunk rows keyed by chunk ID (text, document_id, chunk_index).
     * @param int   $limit    Maximum number of IDs to return.
     *
     * @return string[] Selected chunk IDs in rank order.
     */
    private function selectChunkIdsSalesMode(array $chunkIds, array $rows, int $limit): array
    {
        $out = [];
        $docCounter = [];
        $docChunkPositions = [];

        foreach ($chunkIds as $chunkId) {
            if (!isset($rows[$chunkId]['text'])) {
                continue;
            }

            // Reject blank chunks before any bookkeeping: a chunk that is never emitted must
            // not reserve a position and thereby suppress usable neighbours.
            if (trim((string)$rows[$chunkId]['text']) === '') {
                continue;
            }

            $docId = $rows[$chunkId]['document_id'] ?? null;
            if (!is_string($docId)) {
                continue;
            }

            if (($docCounter[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) {
                continue;
            }

            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
            if (is_int($chunkIndex)) {
                foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
                    if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) {
                        // Too close to a chunk already kept from this document.
                        continue 2;
                    }
                }

                $docChunkPositions[$docId][] = $chunkIndex;
            }

            $out[] = (string)$chunkId;
            $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;

            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Rank fusion without raw-score capture.
     *
     * Thin wrapper around applyRrfWithOptionalRaw() so the fusion formula exists only once.
     * NOTE(review): no caller is visible in this part of the class; confirm whether the
     * method is still needed.
     *
     * @param array $hits      Hits, each expected to carry chunk_id and score.
     * @param array $rrfScores Accumulated fused score per chunk ID (updated in place).
     * @param float $threshold Minimum raw score a hit needs to be counted.
     * @param bool  $boost     Whether to weight this list's contributions by 1.2.
     */
    private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void
    {
        $discardedRawScores = [];

        $this->applyRrfWithOptionalRaw($hits, $rrfScores, $discardedRawScores, $threshold, $boost, false);
    }

    /**
     * Production collector for normal (non-list) queries.
     *
     * Delegates the choice of chunks to selectChunkIdsSalesMode(), which the debug path also
     * uses, and returns the trimmed text of each selected chunk.
     *
     * @param array $chunkIds Chunk IDs in rank order.
     * @param array $rows     Chunk rows keyed by chunk ID.
     * @param int   $limit    Maximum number of texts to return.
     *
     * @return string[] Trimmed chunk texts in rank order.
     */
    private function collectSalesOptimized(array $chunkIds, array $rows, int $limit): array
    {
        $out = [];

        foreach ($this->selectChunkIdsSalesMode($chunkIds, $rows, $limit) as $chunkId) {
            $out[] = trim((string)$rows[$chunkId]['text']);
        }

        return $out;
    }

    /**
     * Production collector for list queries.
     *
     * Delegates the choice of chunks to selectChunkIdsListMode(), which the debug path also
     * uses, and returns the trimmed text of each selected chunk.
     *
     * @param array $chunkIds Chunk IDs in rank order.
     * @param array $rows     Chunk rows keyed by chunk ID.
     * @param int   $limit    Maximum number of texts to return.
     *
     * @return string[] Trimmed, de-duplicated chunk texts in rank order.
     */
    private function collectTexts(array $chunkIds, array $rows, int $limit): array
    {
        $out = [];

        foreach ($this->selectChunkIdsListMode($chunkIds, $rows, $limit) as $id) {
            $out[] = trim((string)$rows[$id]['text']);
        }

        return $out;
    }
}