requireConfig();
        $result = $this->execute($prompt, $config, false);

        if ($result['catalogBlock'] !== null) {
            return [$result['catalogBlock']];
        }

        return $this->collectTextsFromIds(
            $result['selectedChunkIds'],
            $result['rows']
        );
    }

    /**
     * Runs the retrieval pipeline with score capture enabled and returns one
     * diagnostic row per selected chunk instead of bare texts.
     *
     * When the catalog route produced a block, a single synthetic row with the
     * chunk id "__CATALOG_LIST__" is returned and no vector results are listed.
     *
     * @param string                     $prompt Raw user prompt.
     * @param ModelGenerationConfig|null $config Explicit config; when null the
     *                                           active one is loaded via requireConfig().
     *
     * @return array<int, array<string, mixed>> Rows with the keys rank, chunk_id,
     *         document_id, raw_score, rrf_score, threshold, intent, route,
     *         entity_label, is_list_query and text.
     *
     * @throws \RuntimeException When no config is passed and none is active.
     */
    public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
    {
        $config = $config ?? $this->requireConfig();
        $result = $this->execute($prompt, $config, true);

        if ($result['catalogBlock'] !== null) {
            return [[
                'rank' => 1,
                'chunk_id' => '__CATALOG_LIST__',
                'document_id' => null,
                'raw_score' => null,
                'rrf_score' => null,
                'threshold' => 0.0,
                'intent' => $result['intent'],
                'route' => $result['route'],
                'entity_label' => $result['entityLabel'],
                'is_list_query' => true,
                'text' => $result['catalogBlock'],
            ]];
        }

        $out = [];
        $rank = 0;

        foreach ($result['selectedChunkIds'] as $chunkId) {
            if (!isset($result['rows'][$chunkId])) {
                continue;
            }

            // Rank counts only the rows that are actually emitted.
            $rank++;

            $out[] = [
                'rank' => $rank,
                'chunk_id' => $chunkId,
                'document_id' => $result['rows'][$chunkId]['document_id'] ?? null,
                'raw_score' => $result['rawScores'][$chunkId] ?? null,
                'rrf_score' => $result['rrfScores'][$chunkId] ?? null,
                'threshold' => $result['threshold'],
                'intent' => $result['intent'],
                'route' => $result['route'],
                'entity_label' => $result['entityLabel'],
                'is_list_query' => $result['isListQuery'],
                // Only the row itself was checked above, so guard the text key too.
                'text' => trim((string)($result['rows'][$chunkId]['text'] ?? '')),
            ];
        }

        return $out;
    }

    // =========================================================
    // CENTRAL ORCHESTRATION
    // =========================================================

    /**
     * Resolves the route for a prompt and produces either a catalog block or a
     * selection of chunk ids from the core pipeline.
     *
     * The catalog service is consulted only when the route is
     * ROUTE_CATALOG_LIST and an entity label was detected; if it returns null
     * the method falls through to the core pipeline.
     *
     * @param string                $prompt     Raw user prompt.
     * @param ModelGenerationConfig $config     Retrieval limits.
     * @param bool                  $withScores Whether raw hit scores are captured.
     *
     * @return array<string, mixed> Keys: route, entityLabel, intent, isListQuery,
     *         selectedChunkIds, rows, rrfScores, rawScores, threshold, catalogBlock.
     */
    private function execute(
        string $prompt,
        ModelGenerationConfig $config,
        bool $withScores
    ): array {
        $entityLabel = $this->catalogIntent->detect($prompt);
        $salesIntent = $this->detectSalesIntent($prompt);
        $route = $this->routeResolver->resolve($salesIntent, $entityLabel);

        if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
            $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);

            if ($catalogBlock !== null) {
                return [
                    'route' => $route,
                    'entityLabel' => $entityLabel,
                    'intent' => $salesIntent,
                    'isListQuery' => true,
                    'selectedChunkIds' => [],
                    'rows' => [],
                    'rrfScores' => [],
                    'rawScores' => [],
                    'threshold' => 0.0,
                    'catalogBlock' => trim($catalogBlock),
                ];
            }
        }

        $core = $this->runCore($prompt, $config, $withScores, $salesIntent);

        // List queries are deduplicated by text; everything else goes through
        // the per-document cap and chunk-distance rules.
        $selectedChunkIds = $core['is_list_query']
            ? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
            : $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);

        return [
            'route' => $route,
            'entityLabel' => $entityLabel,
            'intent' => $salesIntent,
            'isListQuery' => $core['is_list_query'],
            'selectedChunkIds' => $selectedChunkIds,
            'rows' => $core['rows'],
            'rrfScores' => $core['rrf_scores'],
            'rawScores' => $core['raw_scores'],
            'threshold' => $core['threshold'],
            'catalogBlock' => null,
        ];
    }

    // =========================================================
    // CORE PIPELINE
    // =========================================================

    /**
     * Searches globally and (when tag routing yields document ids) scoped to
     * those documents, fuses both hit lists into RRF scores, and loads the rows
     * for the ranked chunk ids.
     *
     * If fusion yields no scores although the global search returned hits, the
     * ranking falls back to the first EMPTY_RRF_FALLBACK_TOPN global hits.
     *
     * @param string                $prompt      Raw user prompt.
     * @param ModelGenerationConfig $config      Source of max-chunks and vector top-k.
     * @param bool                  $withScores  Whether raw hit scores are captured.
     * @param string                $salesIntent Detected sales intent.
     *
     * @return array<string, mixed> Keys: limit, is_list_query, threshold,
     *         ranked_chunk_ids, rows, rrf_scores, raw_scores.
     */
    private function runCore(
        string $prompt,
        ModelGenerationConfig $config,
        bool $withScores,
        string $salesIntent
    ): array {
        // Configured values are clamped to [1, hard maximum].
        $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
        $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));

        $isListQuery = $this->intentLite->isListQuery($prompt);

        $cleanQuery = $this->queryCleaner->clean($prompt);
        if ($cleanQuery === '') {
            // Cleaning removed everything; search with the raw prompt instead.
            $cleanQuery = $prompt;
        }

        [$threshold, $topK] = $this->computeThresholdAndTopK(
            $salesIntent,
            $isListQuery,
            $vectorTopKBase
        );

        $candidateDocIds = $this->tagRouting->route($cleanQuery);
        $candidateDocIds = is_array($candidateDocIds)
            ? array_values(array_unique(array_filter($candidateDocIds, 'is_string')))
            : [];

        $globalHits = $this->vectorClient->search($cleanQuery, $topK);

        $scopedHits = [];
        if (!empty($candidateDocIds)) {
            $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
        }

        $fused = $this->fuseHits(
            $globalHits,
            $scopedHits,
            $threshold,
            $salesIntent === SalesIntentLite::OBJECTION,
            $withScores
        );

        $rrfScores = $fused['rrf_scores'];
        $rawScores = $fused['raw_scores'];

        if ($rrfScores === [] && $globalHits !== []) {
            $rrfScores = $this->fallbackRrfFromHits($globalHits, self::EMPTY_RRF_FALLBACK_TOPN);
        }

        arsort($rrfScores);
        $rankedChunkIds = array_keys($rrfScores);

        $rows = $this->lookup->findByChunkIds($rankedChunkIds);

        return [
            'limit' => $limit,
            'is_list_query' => $isListQuery,
            'threshold' => $threshold,
            'ranked_chunk_ids' => $rankedChunkIds,
            'rows' => $rows,
            'rrf_scores' => $rrfScores,
            'raw_scores' => $rawScores,
        ];
    }

    // =========================================================
    // SUPPORT
    // =========================================================

    /**
     * Loads the active generation config from the repository.
     *
     * @throws \RuntimeException When the repository returns null.
     */
    private function requireConfig(): ModelGenerationConfig
    {
        $config = $this->configRepository->findActiveForModel();

        if ($config === null) {
            throw new \RuntimeException('No active ModelGenerationConfig found.');
        }

        return $config;
    }

    /**
     * Returns the 'intent' entry of the detector result as a string, defaulting
     * to SalesIntentLite::DISCOVERY when that entry is missing or null.
     */
    private function detectSalesIntent(string $prompt): string
    {
        $data = $this->salesIntentLite->detect($prompt);

        return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY);
    }

    /**
     * Derives the score threshold and vector top-k for one query.
     *
     * OBJECTION and PRICING intents raise the threshold by 0.02; list queries
     * scale top-k by LIST_BONUS. Both results are clamped afterwards.
     *
     * @return array{0: float, 1: int} [threshold, topK]
     */
    private function computeThresholdAndTopK(string $salesIntent, bool $isListQuery, int $vectorTopKBase): array
    {
        $threshold = self::VECTOR_SCORE_THRESHOLD;
        $topK = $vectorTopKBase;

        if ($salesIntent === SalesIntentLite::OBJECTION || $salesIntent === SalesIntentLite::PRICING) {
            $threshold += 0.02;
        }

        if ($isListQuery) {
            $topK = (int)round($topK * self::LIST_BONUS);
        }

        $topK = max(1, min($topK, self::HARD_MAX_VECTORK));
        $threshold = max(self::THRESHOLD_FLOOR, min(self::THRESHOLD_CEIL, $threshold));

        return [$threshold, $topK];
    }

    /**
     * Combines the global and scoped hit lists into per-chunk RRF scores.
     *
     * Hits without 'chunk_id' or 'score', or scoring below the threshold, are
     * skipped and do not consume a rank. A chunk present in both lists gets the
     * sum of both contributions.
     *
     * @param array $globalHits  Hits from the unscoped search.
     * @param array $scopedHits  Hits from the document-scoped search.
     * @param float $threshold   Minimum raw score for a hit to count.
     * @param bool  $boostScoped Multiply scoped contributions by 1.2.
     * @param bool  $captureRaw  Record the highest raw score seen per chunk.
     *
     * @return array{rrf_scores: array, raw_scores: array}
     */
    private function fuseHits(
        array $globalHits,
        array $scopedHits,
        float $threshold,
        bool $boostScoped,
        bool $captureRaw
    ): array {
        $rrfScores = [];
        $rawScores = [];

        $apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void {
            $rank = 0;

            foreach ($hits as $hit) {
                if (!isset($hit['chunk_id'], $hit['score'])) {
                    continue;
                }

                $raw = (float)$hit['score'];
                if ($raw < $threshold) {
                    continue;
                }

                $chunkId = (string)$hit['chunk_id'];

                if ($captureRaw) {
                    $rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw);
                }

                // Rank is the position among accepted hits of this list.
                $rank++;
                $rrf = 1.0 / (self::RRF_K + $rank);

                if ($boost) {
                    $rrf *= 1.2;
                }

                $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
            }
        };

        $apply($globalHits, false);
        $apply($scopedHits, $boostScoped);

        return [
            'rrf_scores' => $rrfScores,
            'raw_scores' => $rawScores,
        ];
    }

    /**
     * Builds RRF scores from hit order alone, ignoring raw scores, and stops
     * once $topN distinct chunks have been ranked.
     *
     * @param array $hits Hits in result order; entries without 'chunk_id' are skipped.
     * @param int   $topN Number of chunks after which ranking stops.
     *
     * @return array<int|string, float> RRF score per chunk id.
     */
    private function fallbackRrfFromHits(array $hits, int $topN): array
    {
        $rrf = [];
        $rank = 0;

        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'])) {
                continue;
            }

            $chunkId = (string)$hit['chunk_id'];

            // A repeated id keeps its first (best) rank and does not use up a
            // slot; previously the later, worse score overwrote the earlier one.
            if (isset($rrf[$chunkId])) {
                continue;
            }

            $rank++;
            $rrf[$chunkId] = 1.0 / (self::RRF_K + $rank);

            if ($rank >= $topN) {
                break;
            }
        }

        return $rrf;
    }

    /**
     * Picks up to $limit chunk ids in ranked order, dropping chunks whose text
     * is empty or duplicates an earlier one (compared case-insensitively with
     * whitespace runs collapsed).
     *
     * @param array $chunkIds Ranked chunk ids.
     * @param array $rows     Rows keyed by chunk id; each needs a 'text' entry.
     * @param int   $limit    Maximum number of ids to return.
     *
     * @return string[]
     */
    private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
    {
        $seen = [];
        $out = [];

        foreach ($chunkIds as $id) {
            if (!isset($rows[$id]['text'])) {
                continue;
            }

            $chunk = trim((string)$rows[$id]['text']);
            if ($chunk === '') {
                continue;
            }

            // preg_replace() returns null when the /u pattern meets invalid
            // UTF-8. Casting that to '' would give every such chunk the same
            // hash and discard all but the first, so hash the raw text instead.
            $normalized = preg_replace('/\s+/u', ' ', $chunk) ?? $chunk;

            $key = md5(mb_strtolower($normalized));
            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = (string)$id;

            if (count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Picks up to $limit chunk ids in ranked order while limiting how many come
     * from one document (MAX_CHUNKS_PER_DOC) and skipping chunks whose index is
     * closer than MIN_CHUNK_DISTANCE to an already selected chunk of the same
     * document.
     *
     * Rows without text, with empty text, or without a string document_id are
     * skipped. The distance rule applies only when chunk_index is an int.
     *
     * @param array $chunkIds Ranked chunk ids.
     * @param array $rows     Rows keyed by chunk id.
     * @param int   $limit    Maximum number of ids to return.
     *
     * @return string[]
     */
    private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
    {
        $out = [];
        $docCounter = [];
        $docChunkPositions = [];

        foreach ($chunkIds as $chunkId) {
            if (!isset($rows[$chunkId]['text'])) {
                continue;
            }

            // Reject empty chunks before any bookkeeping: recording the position
            // of a chunk that is never selected would wrongly block its
            // neighbours through the distance rule.
            if (trim((string)$rows[$chunkId]['text']) === '') {
                continue;
            }

            $docId = $rows[$chunkId]['document_id'] ?? null;
            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;

            if (!is_string($docId)) {
                continue;
            }

            if (($docCounter[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) {
                continue;
            }

            if (is_int($chunkIndex)) {
                foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
                    if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) {
                        continue 2;
                    }
                }

                $docChunkPositions[$docId][] = $chunkIndex;
            }

            $out[] = (string)$chunkId;
            $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;

            if (count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Returns the trimmed, non-empty texts of the given chunk ids, in order.
     *
     * @param array $chunkIds Chunk ids to resolve.
     * @param array $rows     Rows keyed by chunk id.
     *
     * @return string[]
     */
    private function collectTextsFromIds(array $chunkIds, array $rows): array
    {
        $out = [];

        foreach ($chunkIds as $id) {
            if (!isset($rows[$id]['text'])) {
                continue;
            }

            $text = trim((string)$rows[$id]['text']);
            if ($text !== '') {
                $out[] = $text;
            }
        }

        return $out;
    }
}