diff --git a/src/Agent/PromptBuilder.php b/src/Agent/PromptBuilder.php index 02a856b..5ae3419 100644 --- a/src/Agent/PromptBuilder.php +++ b/src/Agent/PromptBuilder.php @@ -60,7 +60,6 @@ final readonly class PromptBuilder * @param ShopProductResult[] $shopResults * @param bool|null $fullContext * @param string|null $swagFullOutPut - * @return string */ public function build( string $prompt, @@ -71,11 +70,42 @@ final readonly class PromptBuilder ?bool $fullContext = false, ?string $swagFullOutPut = '' ): string { + $prompt = $this->normalizeBlockText($prompt); + $urlContent = $this->normalizeBlockText($urlContent); + $swagFullOutPut = $this->normalizeNullableBlockText($swagFullOutPut); + + $systemBlock = $this->buildSystemBlock(); + $shopBlock = $this->buildShopBlock($shopResults, $swagFullOutPut); + $knowledgeBlock = $this->buildKnowledgeBlock($knowledgeChunks, $urlContent, $prompt); + $userBlock = $this->buildUserBlock($prompt); + + // Build fixed blocks first so history only receives the remaining budget. + $fixedPrompt = $this->implodeBlocks([ + $systemBlock, + $shopBlock, + $knowledgeBlock, + $userBlock, + ]); + + $contextBlock = $this->buildContextBlock( + userId: $userId, + fixedPrompt: $fixedPrompt, + fullContext: (bool) $fullContext + ); + + return $this->implodeBlocks([ + $systemBlock, + $shopBlock, + $knowledgeBlock, + $contextBlock, + $userBlock, + ]); + } + + private function buildSystemBlock(): string + { $now = (new DateTimeImmutable())->format('Y-m-d H:i:s'); - // ------------------------------------------------------------ - // 1) SYSTEM INSTRUCTIONS - // ------------------------------------------------------------ $activePrompt = $this->systemPromptRepository->findActive(); if (!$activePrompt) { @@ -83,46 +113,13 @@ final readonly class PromptBuilder } $activeSystemPrompt = str_replace('{% now %}', $now, $activePrompt->getContent()); - $systemBlock = "SYSTEM:\n" . $activeSystemPrompt; - // ------------------------------------------------------------ - // 2) PRIORITIZED FIXED BLOCKS - // ------------------------------------------------------------ - $shopBlock = $this->buildShopBlock($shopResults, $swagFullOutPut); - $knowledgeBlock = $this->buildKnowledgeBlock($knowledgeChunks, $urlContent); - $userBlock = "USER QUESTION:\n" . $prompt; + return "SYSTEM:\n" . $this->normalizeBlockText($activeSystemPrompt); + } - // Build all fixed blocks first so history only gets the remaining budget. - $fixedBlocks = array_filter([ - $systemBlock, - $shopBlock, - $knowledgeBlock, - $userBlock, - ]); - - $fixedPrompt = implode("\n\n", $fixedBlocks); - - // ------------------------------------------------------------ - // 3) CONVERSATION CONTEXT (AUTHORITATIVE, FILLS REMAINING SPACE) - // ------------------------------------------------------------ - $contextBlock = $this->buildContextBlock( - userId: $userId, - fixedPrompt: $fixedPrompt, - fullContext: (bool) $fullContext - ); - - // ------------------------------------------------------------ - // 4) FINAL PROMPT ASSEMBLY - // ------------------------------------------------------------ - $blocks = array_filter([ - $systemBlock, - $shopBlock, - $knowledgeBlock, - $contextBlock, - $userBlock, - ]); - - return implode("\n\n", $blocks); + private function buildUserBlock(string $prompt): string + { + return "USER QUESTION:\n" . $prompt; } /** @@ -151,33 +148,36 @@ final readonly class PromptBuilder ); } + $history = $this->normalizeBlockText($history); + if ($history === '') { return ''; } return - "CONVERSATION CONTEXT (authoritative):\n" . - "The following messages are the previous turns of this conversation.\n" . - "They must be considered when answering the next question.\n\n" . + "CONVERSATION CONTEXT (contextual only):\n" . + "The following messages are previous turns of this conversation.\n" . + "Use them to resolve references, follow-up questions, and user intent.\n" . + "They must not override retrieved factual knowledge or live shop data.\n\n" . $history; } /** - * Build the shop block with the highest business priority. + * Build the shop block with the highest business priority for product facts. */ private function buildShopBlock(array $shopResults, ?string $swagFullOutPut): string { $parts = []; - if ($swagFullOutPut !== null && trim($swagFullOutPut) !== '') { + if ($swagFullOutPut !== null && $swagFullOutPut !== '') { $parts[] = "SHOP SEARCH QUERY:\n" . - trim($swagFullOutPut) . "\n" . + $swagFullOutPut . "\n" . "Source: Shop Search"; } if ($shopResults === []) { - return implode("\n\n", $parts); + return $this->implodeBlocks($parts); } $isDetailed = count($shopResults) <= 5; @@ -190,19 +190,19 @@ final readonly class PromptBuilder $n = $i + 1; $entryParts = [ - "[{$n}] " . $product->name, + "[{$n}] " . $this->normalizeBlockText($product->name), ]; if ($product->productNumber) { - $entryParts[] = "Product number: " . $product->productNumber; + $entryParts[] = "Product number: " . $this->normalizeBlockText($product->productNumber); } if ($product->manufacturer) { - $entryParts[] = "Manufacturer: " . $product->manufacturer; + $entryParts[] = "Manufacturer: " . $this->normalizeBlockText($product->manufacturer); } if ($product->price) { - $entryParts[] = "Price: " . $product->price; + $entryParts[] = "Price: " . $this->normalizeBlockText($product->price); } if ($product->available !== null) { @@ -210,23 +210,27 @@ final readonly class PromptBuilder } foreach ($product->highlights as $highlight) { - $entryParts[] = "- " . $highlight; + $highlight = $this->normalizeBlockText((string) $highlight); + + if ($highlight !== '') { + $entryParts[] = "- " . $highlight; + } } if ($product->url) { - $entryParts[] = "URL: " . $product->url; + $entryParts[] = "URL: " . $this->normalizeBlockText($product->url); } if ($product->productImage) { - $entryParts[] = "Product image: " . $product->productImage; + $entryParts[] = "Product image: " . $this->normalizeBlockText($product->productImage); } if ($isDetailed && $product->description) { - $entryParts[] = "Description: " . $product->description; + $entryParts[] = "Description: " . $this->normalizeBlockText($product->description); } if ($product->customFields) { - $entryParts[] = "Meta information: " . $product->customFields; + $entryParts[] = "Meta information: " . $this->normalizeBlockText($product->customFields); } $lines[] = implode("\n", $entryParts); @@ -235,41 +239,75 @@ final readonly class PromptBuilder if ($lines !== []) { $parts[] = "LIVE SHOP RESULTS (authoritative for products):\n" . + "Use these results as authoritative for product identity, availability, pricing, and shop-visible product details.\n" . + "If retrieved documents conflict with live shop data on product availability or price, prefer the live shop data.\n" . + "Do not infer undocumented technical specifications from live shop data.\n\n" . implode("\n\n", $lines); } - return implode("\n\n", $parts); + return $this->implodeBlocks($parts); } /** - * Build the supporting knowledge block. + * Build the knowledge block. + * + * Retrieved knowledge is authoritative for factual statements that are present in the sources. + * Missing facts must not be invented. */ - private function buildKnowledgeBlock(array $knowledgeChunks, string $urlContent): string + private function buildKnowledgeBlock(array $knowledgeChunks, string $urlContent, string $prompt): string { $knowledgeParts = []; + $isTechnicalProductQuestion = $this->isLikelyTechnicalProductQuestion($prompt); if ($knowledgeChunks !== []) { $lines = []; foreach ($knowledgeChunks as $i => $chunk) { + $chunk = $this->normalizeBlockText((string) $chunk); + + if ($chunk === '') { + continue; + } + $n = $i + 1; $lines[] = "[{$n}] {$chunk}"; } - $knowledgeParts[] = - "RETRIEVED KNOWLEDGE (supporting):\n" . - "Source: Documents\n" . - implode("\n\n", $lines); + if ($lines !== []) { + $knowledgeParts[] = + "FACT GROUNDING RULES:\n" . + "- Use retrieved knowledge as authoritative for factual answers.\n" . + "- Extract concrete values exactly when they are present, including units, ranges, model names, indicator names, IP classes, temperatures, pressures, dimensions, counts, relay outputs, current outputs, and error codes.\n" . + "- Do not invent missing values.\n" . + "- Do not replace missing values with estimates, defaults, or typical industry assumptions.\n" . + "- Do not claim that information is missing if it appears in the provided sources.\n" . + "- Do not compare with other products unless those products are also present in the provided sources.\n" . + "- Prefer source-faithful wording over persuasive wording.\n" . + "- Avoid marketing language such as 'ideal', 'perfect', 'unverzichtbar', or 'state-of-the-art'.\n" . + "- Clearly separate explicit facts from inferences.\n" . + "- If an inference is necessary, label it with 'Inference:'.\n" . + ($isTechnicalProductQuestion + ? "- For technical product questions, answer primarily with explicitly stated facts.\n" . + "- Keep interpretations minimal and do not generalize application areas beyond the provided sources.\n" . + "- If the retrieved knowledge describes one specific named product, stay within that product and do not merge related product families or variants.\n" . + "- Prefer neutral technical wording over evaluative summaries.\n" . + "- If a detail is not explicitly stated in the provided sources, say so plainly.\n" + : "" + ) . "\n" . + "RETRIEVED KNOWLEDGE (authoritative for facts):\n" . + "Source: Documents\n" . + implode("\n\n", $lines); + } } if ($urlContent !== '') { $knowledgeParts[] = - "CONTENT FROM URL (supporting):\n" . + "CONTENT FROM URL (authoritative if user-provided):\n" . "Source: URL\n" . $urlContent; } - return implode("\n\n", $knowledgeParts); + return $this->implodeBlocks($knowledgeParts); } /** @@ -309,6 +347,85 @@ final readonly class PromptBuilder return max(0, $remaining); } + private function implodeBlocks(array $blocks): string + { + $filtered = array_values(array_filter( + array_map( + fn ($block): string => is_string($block) ? $this->normalizeBlockText($block) : '', + $blocks + ), + static fn (string $block): bool => $block !== '' + )); + + return implode("\n\n", $filtered); + } + + private function normalizeNullableBlockText(?string $value): ?string + { + if ($value === null) { + return null; + } + + $normalized = $this->normalizeBlockText($value); + + return $normalized === '' ? null : $normalized; + } + + private function normalizeBlockText(string $value): string + { + $value = str_replace(["\r\n", "\r"], "\n", $value); + $value = str_replace("\u{00A0}", ' ', $value); + $value = trim($value); + + $value = preg_replace("/\n{3,}/", "\n\n", $value) ?? $value; + $value = preg_replace("/[ \t]+\n/", "\n", $value) ?? $value; + $value = preg_replace("/[ \t]{2,}/", " ", $value) ?? $value; + + return $value; + } + + private function isLikelyTechnicalProductQuestion(string $prompt): bool + { + $normalized = mb_strtolower($prompt, 'UTF-8'); + + $keywords = [ + 'technisch', + 'technical', + 'produkt', + 'product', + 'gerät', + 'device', + 'modell', + 'model', + 'messprinzip', + 'schnittstelle', + 'relais', + 'indikator', + 'spannung', + 'strom', + 'druck', + 'temperatur', + 'schutzart', + 'fehlercode', + 'wasserhärte', + 'testomat', + ]; + + $matches = 0; + + foreach ($keywords as $keyword) { + if (str_contains($normalized, $keyword)) { + $matches++; + } + } + + if ($matches >= 2) { + return true; + } + + return preg_match('/\b[\p{L}]{2,}\s?\d{2,5}\b/u', $prompt) === 1; + } + private function clamp(int $value, int $min, int $max): int { return max($min, min($max, $value)); diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index 7485640..b954e59 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -1,21 +1,68 @@ chunkManager->streamAll() as $row) { $id = $row['chunk_id'] ?? null; + if (!is_string($id) || !isset($wanted[$id])) { continue; } $found[$id] = $row; - if (\count($found) === \count($wanted)) { + if (count($found) === count($wanted)) { break; } } return $found; } -} + + /** + * Returns all chunks of one document keyed by chunk_id. + * + * @return array> + */ + public function findByDocumentId(string $documentId): array + { + $rows = []; + + foreach ($this->chunkManager->streamAll() as $row) { + $rowDocumentId = $row['document_id'] ?? null; + $chunkId = $row['chunk_id'] ?? null; + + if ($rowDocumentId !== $documentId || !is_string($chunkId) || $chunkId === '') { + continue; + } + + $rows[$chunkId] = $row; + } + + return $rows; + } + + /** + * Resolves the best exact document title match from the user prompt. + * + * Matching rules: + * - the normalized prompt must contain the full normalized document title + * - titles containing digits are preferred, e.g. "Testomat 808" + * - longer exact titles win over shorter generic titles + * + * @return array{ + * document_id:string, + * document_title:string, + * rows:array> + * }|null + */ + public function findBestExactDocumentByPrompt(string $prompt): ?array + { + $normalizedPrompt = $this->normalizeText($prompt); + + if ($normalizedPrompt === '') { + return null; + } + + $documents = []; + + foreach ($this->chunkManager->streamAll() as $row) { + $documentId = $row['document_id'] ?? null; + $chunkId = $row['chunk_id'] ?? null; + + if (!is_string($documentId) || $documentId === '' || !is_string($chunkId) || $chunkId === '') { + continue; + } + + if (!isset($documents[$documentId])) { + $documentTitle = $this->extractDocumentTitle($row); + + if ($documentTitle === '') { + continue; + } + + $documents[$documentId] = [ + 'document_id' => $documentId, + 'document_title' => $documentTitle, + 'normalized_title' => $this->normalizeText($documentTitle), + 'rows' => [], + ]; + } + + $documents[$documentId]['rows'][$chunkId] = $row; + } + + $best = null; + $bestScore = null; + + foreach ($documents as $document) { + $normalizedTitle = $document['normalized_title']; + + if (!$this->isConfidentTitleMatch($normalizedPrompt, $normalizedTitle)) { + continue; + } + + $score = mb_strlen($normalizedTitle, 'UTF-8'); + + if (preg_match('/\d/u', $normalizedTitle) === 1) { + $score += 1000; + } + + if ($best === null || $score > $bestScore) { + $best = $document; + $bestScore = $score; + } + } + + if ($best === null) { + return null; + } + + return [ + 'document_id' => $best['document_id'], + 'document_title' => $best['document_title'], + 'rows' => $best['rows'], + ]; + } + + /** + * @param array $row + */ + private function extractDocumentTitle(array $row): string + { + $metadataTitle = $row['metadata']['document_title'] ?? null; + + if (is_string($metadataTitle) && trim($metadataTitle) !== '') { + return trim($metadataTitle); + } + + $text = (string) ($row['text'] ?? ''); + + if ( + $text !== '' && + preg_match('/^#\s*Produkt\s+Titel:\s*`?([^`\n]+)`?/imu', $text, $matches) === 1 + ) { + return trim((string) ($matches[1] ?? '')); + } + + return ''; + } + + private function isConfidentTitleMatch(string $normalizedPrompt, string $normalizedTitle): bool + { + if ($normalizedPrompt === '' || $normalizedTitle === '') { + return false; + } + + $paddedPrompt = ' ' . $normalizedPrompt . ' '; + $paddedTitle = ' ' . $normalizedTitle . ' '; + + if (!str_contains($paddedPrompt, $paddedTitle)) { + return false; + } + + $tokens = preg_split('/\s+/u', $normalizedTitle, -1, PREG_SPLIT_NO_EMPTY) ?: []; + + $significantTokens = array_values(array_filter( + $tokens, + static fn (string $token): bool => mb_strlen($token, 'UTF-8') >= 3 || preg_match('/\d/u', $token) === 1 + )); + + return count($significantTokens) >= 2 || preg_match('/\d/u', $normalizedTitle) === 1; + } + + private function normalizeText(string $value): string + { + $value = mb_strtolower(trim($value), 'UTF-8'); + $value = str_replace(['-', '/', '_'], ' ', $value); + $value = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $value) ?? $value; + $value = preg_replace('/\s+/u', ' ', $value) ?? $value; + + return trim($value); + } +} \ No newline at end of file diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 523ff30..2b7ca25 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -23,6 +23,7 @@ use RuntimeException; * Main responsibilities: * - detect high-level request intent * - optionally short-circuit to catalog list output + * - resolve exact document-title matches before semantic retrieval * - run vector retrieval globally and optionally document-scoped * - fuse both result sets with RRF-style scoring * - apply selection rules for list queries vs. sales-style queries @@ -30,6 +31,15 @@ use RuntimeException; */ final readonly class NdjsonHybridRetriever implements RetrieverInterface { + /** + * When one document clearly dominates the top-ranked window, + * temporarily switch from "spread" mode to "dominant document" mode. + */ + private const DOMINANT_DOC_WINDOW = 6; + private const DOMINANT_DOC_MIN_HITS = 3; + private const DOMINANT_DOC_MAX_CHUNKS = 4; + private const EXACT_DOCUMENT_MAX_CHUNKS = 6; + public function __construct( private NdjsonChunkLookup $lookup, private VectorSearchClient $vectorClient, @@ -58,6 +68,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface * - executes the full orchestration pipeline * - if the route resolves to a catalog list, returns the catalog block only * - otherwise returns the selected chunk texts + * * @throws Exception */ public function retrieve(string $prompt): array @@ -65,12 +76,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $config = $this->requireConfig(); $result = $this->execute($prompt, $config, false); - // Catalog list responses bypass normal chunk retrieval completely. if ($result['catalogBlock'] !== null) { return [$result['catalogBlock']]; } - // No selected chunks means no usable retrieval result. if ($result['selectedChunkIds'] === []) { return []; } @@ -90,6 +99,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface * - fused RRF scores * - intent / route information * - threshold and list-query flags + * * @throws Exception */ public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array @@ -97,12 +107,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $config = $config ?? $this->requireConfig(); $result = $this->execute($prompt, $config, true); - // For catalog list routes we expose a synthetic debug row. if ($result['catalogBlock'] !== null) { return [[ 'rank' => 1, 'chunk_id' => '__CATALOG_LIST__', 'document_id' => null, + 'chunk_index' => null, 'raw_score' => null, 'rrf_score' => null, 'threshold' => 0.0, @@ -110,6 +120,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'route' => $result['route'], 'entity_label' => $result['entityLabel'], 'is_list_query' => true, + 'selection_mode' => 'catalog_list', 'text' => $result['catalogBlock'], ]]; } @@ -122,8 +133,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $rank = 0; foreach ($result['selectedChunkIds'] as $chunkId) { - - // Skip ids that could not be resolved to real chunk rows. if (!isset($result['rows'][$chunkId])) { continue; } @@ -134,6 +143,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'rank' => $rank, 'chunk_id' => $chunkId, 'document_id' => $result['rows'][$chunkId]['document_id'] ?? null, + 'chunk_index' => $result['rows'][$chunkId]['chunk_index'] ?? null, 'raw_score' => $result['rawScores'][$chunkId] ?? null, 'rrf_score' => $result['rrfScores'][$chunkId] ?? null, 'threshold' => $result['threshold'], @@ -141,6 +151,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'route' => $result['route'], 'entity_label' => $result['entityLabel'], 'is_list_query' => $result['isListQuery'], + 'selection_mode' => $result['selectionMode'], 'text' => trim((string)$result['rows'][$chunkId]['text']), ]; } @@ -159,8 +170,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface * 1. Detect catalog entity and sales intent * 2. Resolve route * 3. If route is a catalog list route, try direct catalog output - * 4. Otherwise, run the normal hybrid retrieval core - * 5. Select final chunk ids depending on query type + * 4. If prompt matches one exact document title, use exact-document fast path + * 5. Otherwise, run the normal hybrid retrieval core + * 6. Select final chunk ids depending on query type + * * @throws Exception */ private function execute( @@ -169,16 +182,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface bool $withScores ): array { - $entityLabel = $this->catalogIntent->detect($prompt); $salesIntent = $this->detectSalesIntent($prompt); $route = $this->routeResolver->resolve($salesIntent, $entityLabel); - // Fast path: - // If the route explicitly asks for a catalog list and we have an entity label, - // we return a prebuilt catalog block instead of semantic chunk retrieval. if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) { - $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel); if ($catalogBlock !== null) { @@ -187,6 +195,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'entityLabel' => $entityLabel, 'intent' => $salesIntent, 'isListQuery' => true, + 'selectionMode' => 'catalog_list', 'selectedChunkIds' => [], 'rows' => [], 'rrfScores' => [], @@ -197,15 +206,40 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } } + $exactDocumentMatch = $this->lookup->findBestExactDocumentByPrompt($prompt); + + if ($exactDocumentMatch !== null) { + $selectedChunkIds = $this->selectExactDocumentChunkIds( + $exactDocumentMatch['rows'], + max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)) + ); + + if ($selectedChunkIds !== []) { + return [ + 'route' => $route, + 'entityLabel' => $entityLabel, + 'intent' => $salesIntent, + 'isListQuery' => false, + 'selectionMode' => 'exact_document_title', + 'selectedChunkIds' => $selectedChunkIds, + 'rows' => $exactDocumentMatch['rows'], + 'rrfScores' => $this->buildExactDocumentScores($selectedChunkIds), + 'rawScores' => [], + 'threshold' => 1.0, + 'catalogBlock' => null, + ]; + } + } + $core = $this->runCore($prompt, $config, $withScores, $salesIntent); - // No ranked chunks or no resolved rows means retrieval produced nothing usable. if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) { return [ 'route' => $route, 'entityLabel' => $entityLabel, 'intent' => $salesIntent, 'isListQuery' => $core['is_list_query'], + 'selectionMode' => null, 'selectedChunkIds' => [], 'rows' => [], 'rrfScores' => [], @@ -215,18 +249,30 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ]; } - // Selection strategy depends on query type: - // - list queries prefer deduplicated chunks - // - sales queries prefer spread across docs / chunk distance - $selectedChunkIds = $core['is_list_query'] - ? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']) - : $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']); + if ($core['is_list_query']) { + $selectedChunkIds = $this->selectListChunkIds( + $core['ranked_chunk_ids'], + $core['rows'], + $core['limit'] + ); + $selectionMode = 'list_deduplicated'; + } else { + $salesSelection = $this->selectSalesChunkIds( + $core['ranked_chunk_ids'], + $core['rows'], + $core['limit'] + ); + + $selectedChunkIds = $salesSelection['ids']; + $selectionMode = $salesSelection['mode']; + } return [ 'route' => $route, 'entityLabel' => $entityLabel, 'intent' => $salesIntent, 'isListQuery' => $core['is_list_query'], + 'selectionMode' => $selectionMode, 'selectedChunkIds' => $selectedChunkIds, 'rows' => $core['rows'], 'rrfScores' => $core['rrf_scores'], @@ -252,6 +298,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface * - run global and optional scoped vector search * - fuse hits * - resolve chunk ids to chunk rows + * * @throws Exception */ private function runCore( @@ -261,17 +308,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface string $salesIntent ): array { - $limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK)); $isListQuery = $this->intentLite->isListQuery($prompt); - // The prompt is normalized first, then enriched before retrieval. $cleanQuery = $this->queryCleaner->clean($prompt); $cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery); - // Empty cleaned query means retrieval would be meaningless. if ($cleanQuery === '') { return [ 'limit' => $limit, @@ -290,22 +334,21 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $vectorTopKBase ); - // Tag routing tries to narrow retrieval to relevant document ids. $candidateDocIds = $this->tagRouting->route($cleanQuery); $candidateDocIds = is_array($candidateDocIds) - ? array_values(array_unique(array_filter($candidateDocIds, 'is_string'))) + ? array_values(array_unique(array_filter( + $candidateDocIds, + static fn(mixed $value): bool => is_string($value) && $value !== '' + ))) : []; - // Always run a global search. $globalHits = $this->vectorClient->search($cleanQuery, $topK); - // Optionally run a scoped search if tag routing yielded document candidates. $scopedHits = []; if ($candidateDocIds !== []) { $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds); } - // Nothing found at all. if ($globalHits === [] && $scopedHits === []) { return [ 'limit' => $limit, @@ -318,25 +361,19 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ]; } - // Fuse global and scoped hits with optional scoped boost. $fused = $this->fuseHits( $globalHits, $scopedHits, $threshold, - $salesIntent === SalesIntentLite::OBJECTION, + $scopedHits !== [], $withScores ); $rrfScores = $fused['rrf_scores']; $rawScores = $fused['raw_scores']; - // Fallback: - // If all hits were filtered by threshold but global hits exist, - // derive a weak RRF ranking from the raw hit order. if ($rrfScores === [] && $globalHits !== []) { - $rrfScores = $this->fallbackRrfFromHits( - $globalHits - ); + $rrfScores = $this->fallbackRrfFromHits($globalHits); } if ($rrfScores === []) { @@ -351,11 +388,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ]; } - // Highest fused score first. arsort($rrfScores); $rankedChunkIds = array_keys($rrfScores); - // Resolve the ranking to actual NDJSON chunk rows. $rows = $this->lookup->findByChunkIds($rankedChunkIds); return [ @@ -381,9 +416,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface private function requireConfig(): ModelGenerationConfig { $config = $this->configRepository->findActiveForModel(); + if ($config === null) { throw new RuntimeException('No active ModelGenerationConfig found.'); } + return $config; } @@ -395,6 +432,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface private function detectSalesIntent(string $prompt): string { $data = $this->salesIntentLite->detect($prompt); + return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY); } @@ -412,7 +450,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface int $vectorTopKBase ): array { - $threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD; $topK = $vectorTopKBase; @@ -428,7 +465,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } $topK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK)); - $threshold = max(NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR, min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold)); + $threshold = max( + NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR, + min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold) + ); return [$threshold, $topK]; } @@ -450,31 +490,25 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface bool $captureRaw ): array { - $rrfScores = []; $rawScores = []; $apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void { - $rank = 0; foreach ($hits as $hit) { - - // Every hit must provide a chunk id and a numeric score. if (!isset($hit['chunk_id'], $hit['score'])) { continue; } $raw = (float)$hit['score']; - // Threshold is applied before rank fusion. if ($raw < $threshold) { continue; } $chunkId = (string)$hit['chunk_id']; - // Store the best raw score per chunk for debug inspection. if ($captureRaw) { $rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw); } @@ -482,12 +516,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $rank++; $rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank); - // Scoped result lists can get a slight relevance bonus. if ($boost) { $rrf *= 1.2; } - // Scores from multiple hit lists accumulate. $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf; } }; @@ -513,7 +545,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $rank = 0; foreach ($hits as $hit) { - if (!isset($hit['chunk_id'])) { continue; } @@ -529,6 +560,68 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $rrf; } + /** + * Selects a coherent chunk window from one exact document title match. + * + * For exact product questions we prefer a pure document slice over + * cross-document fusion to avoid mixing neighbouring product families. + * + * @param array> $rows + * @return string[] + */ + private function selectExactDocumentChunkIds(array $rows, int $limit): array + { + uasort($rows, static function (array $a, array $b): int { + $aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX; + $bIndex = is_int($b['chunk_index'] ?? null) ? (int)$b['chunk_index'] : PHP_INT_MAX; + + if ($aIndex !== $bIndex) { + return $aIndex <=> $bIndex; + } + + return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? '')); + }); + + $selected = []; + $max = min($limit, self::EXACT_DOCUMENT_MAX_CHUNKS); + + foreach ($rows as $row) { + $chunkId = $row['chunk_id'] ?? null; + $text = trim((string)($row['text'] ?? '')); + + if (!is_string($chunkId) || $chunkId === '' || $text === '') { + continue; + } + + $selected[] = $chunkId; + + if (count($selected) >= $max) { + break; + } + } + + return $selected; + } + + /** + * Builds synthetic scores for exact-title fast-path selections. + * + * These scores are only used for debug output consistency. + * + * @param string[] $chunkIds + * @return array + */ + private function buildExactDocumentScores(array $chunkIds): array + { + $scores = []; + + foreach (array_values($chunkIds) as $rank => $chunkId) { + $scores[(string)$chunkId] = 1.0 / (1 + $rank); + } + + return $scores; + } + /** * Selection strategy for list-style queries. * @@ -543,7 +636,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $out = []; foreach ($chunkIds as $id) { - if (!isset($rows[$id]['text'])) { continue; } @@ -553,7 +645,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - // Deduplicate by normalized chunk text. $key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk))); if (isset($seen[$key])) { @@ -574,18 +665,242 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface /** * Selection strategy for sales-oriented queries. * - * Goal: - * - avoid overloading the result with chunks from the same document - * - avoid chunks that are too close to each other in the same document - * - preserve top-ranked relevance while improving contextual spread + * Modes: + * - exact_document_title: + * used when the prompt clearly contains one exact document title + * and the answer should stay strictly within that document + * + * - sales_dominant_document: + * used when one document clearly dominates the top hit window + * and coherent neighbouring chunks from that document are more + * useful than cross-document spread + * + * - sales_spread: + * default mode that spreads chunks across documents and enforces + * distance between chunk positions of the same document */ private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array { - $out = []; + $dominantDocId = $this->detectDominantTopDocument($chunkIds, $rows); + + if ($dominantDocId !== null) { + $dominantChunkIds = $this->selectDominantDocumentChunkIds( + $dominantDocId, + $chunkIds, + $rows, + $limit + ); + + if ($dominantChunkIds !== []) { + return [ + 'ids' => $this->fillRemainingSalesChunkIds( + $dominantChunkIds, + $chunkIds, + $rows, + $limit + ), + 'mode' => 'sales_dominant_document', + ]; + } + } + + return [ + 'ids' => $this->selectSalesChunkIdsSpread($chunkIds, $rows, $limit), + 'mode' => 'sales_spread', + ]; + } + + /** + * Detects whether one document clearly dominates the first ranked window. + * + * This is especially useful for product-sheet style documents where + * several adjacent chunks belong together and should be passed to the model + * as one coherent factual block. + */ + private function detectDominantTopDocument(array $chunkIds, array $rows): ?string + { + $docWindow = []; + + foreach (array_slice($chunkIds, 0, self::DOMINANT_DOC_WINDOW) as $chunkId) { + if (!isset($rows[$chunkId]['text'])) { + continue; + } + + $text = trim((string)$rows[$chunkId]['text']); + $docId = $rows[$chunkId]['document_id'] ?? null; + + if ($text === '' || !is_string($docId) || $docId === '') { + continue; + } + + $docWindow[] = $docId; + } + + if (count($docWindow) < 2) { + return null; + } + + $counts = array_count_values($docWindow); + arsort($counts); + + $dominantDocId = array_key_first($counts); + + if (!is_string($dominantDocId) || $dominantDocId === '') { + return null; + } + + $dominantCount = (int)($counts[$dominantDocId] ?? 0); + + if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) { + return $dominantDocId; + } + + $first = $docWindow[0] ?? null; + $second = $docWindow[1] ?? null; + + if ($dominantCount >= 2 && $first === $dominantDocId && $second === $dominantDocId) { + return $dominantDocId; + } + + return null; + } + + /** + * Selects a coherent chunk window from the dominant document. + * + * Strategy: + * - use the highest-ranked chunk of that document as anchor + * - prefer neighbouring chunk indices around that anchor + * - sort the final selection by chunk index for prompt coherence + */ + private function selectDominantDocumentChunkIds( + string $documentId, + array $chunkIds, + array $rows, + int $limit + ): array + { + $docHits = []; + $anchorChunkIndex = null; + + foreach ($chunkIds as $rank => $chunkId) { + if (!isset($rows[$chunkId]['text'])) { + continue; + } + + $text = trim((string)$rows[$chunkId]['text']); + $docId = $rows[$chunkId]['document_id'] ?? null; + + if ($text === '' || $docId !== $documentId) { + continue; + } + + $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null; + $chunkIndex = is_int($chunkIndex) ? $chunkIndex : null; + + if ($anchorChunkIndex === null && $chunkIndex !== null) { + $anchorChunkIndex = $chunkIndex; + } + + $docHits[] = [ + 'id' => (string)$chunkId, + 'rank' => $rank, + 'chunk_index' => $chunkIndex, + ]; + } + + if ($docHits === []) { + return []; + } + + $maxFromDoc = min($limit, self::DOMINANT_DOC_MAX_CHUNKS); + + if ($anchorChunkIndex !== null) { + usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int { + $aDistance = $a['chunk_index'] === null ? PHP_INT_MAX : abs($a['chunk_index'] - $anchorChunkIndex); + $bDistance = $b['chunk_index'] === null ? PHP_INT_MAX : abs($b['chunk_index'] - $anchorChunkIndex); + + if ($aDistance !== $bDistance) { + return $aDistance <=> $bDistance; + } + + return $a['rank'] <=> $b['rank']; + }); + } else { + usort($docHits, static fn(array $a, array $b): int => $a['rank'] <=> $b['rank']); + } + + $selected = array_slice($docHits, 0, $maxFromDoc); + + usort($selected, static function (array $a, array $b): int { + $aIndex = $a['chunk_index']; + $bIndex = $b['chunk_index']; + + if ($aIndex === null && $bIndex === null) { + return $a['rank'] <=> $b['rank']; + } + + if ($aIndex === null) { + return 1; + } + + if ($bIndex === null) { + return -1; + } + + if ($aIndex !== $bIndex) { + return $aIndex <=> $bIndex; + } + + return $a['rank'] <=> $b['rank']; + }); + + return array_map( + static fn(array $row): string => $row['id'], + $selected + ); + } + + /** + * Fills the remaining sales slots after a dominant document selection. + * + * The already selected dominant-document chunks stay fixed. + * Remaining slots are filled with the normal spread strategy. + */ + private function fillRemainingSalesChunkIds( + array $seedChunkIds, + array $chunkIds, + array $rows, + int $limit + ): array + { + $out = array_values(array_unique(array_map('strval', $seedChunkIds))); + + if (count($out) >= $limit) { + return array_slice($out, 0, $limit); + } + + $selected = array_fill_keys($out, true); $docCounter = []; $docChunkPositions = []; + foreach ($out as $chunkId) { + $docId = $rows[$chunkId]['document_id'] ?? null; + $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null; + + if (is_string($docId) && $docId !== '') { + $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1; + + if (is_int($chunkIndex)) { + $docChunkPositions[$docId][] = $chunkIndex; + } + } + } + foreach ($chunkIds as $chunkId) { + if (isset($selected[$chunkId])) { + continue; + } if (!isset($rows[$chunkId]['text'])) { continue; @@ -594,23 +909,80 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $docId = $rows[$chunkId]['document_id'] ?? null; $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null; - // Sales selection requires a valid document context. - if (!is_string($docId)) { + if (!is_string($docId) || $docId === '') { continue; } - // Limit how many chunks may come from the same document. if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) { continue; } - // Enforce a minimum distance between chunk positions of the same document. if (is_int($chunkIndex)) { foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) { if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) { continue 2; } } + } + + $text = trim((string)$rows[$chunkId]['text']); + if ($text === '') { + continue; + } + + $out[] = (string)$chunkId; + $selected[$chunkId] = true; + $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1; + + if (is_int($chunkIndex)) { + $docChunkPositions[$docId][] = $chunkIndex; + } + + if (count($out) >= $limit) { + break; + } + } + + return $out; + } + + /** + * Default spread selection for sales-oriented queries. + * + * Goal: + * - avoid overloading the result with chunks from the same document + * - avoid chunks that are too close to each other in the same document + * - preserve top-ranked relevance while improving contextual spread + */ + private function selectSalesChunkIdsSpread(array $chunkIds, array $rows, int $limit): array + { + $out = []; + $docCounter = []; + $docChunkPositions = []; + + foreach ($chunkIds as $chunkId) { + if (!isset($rows[$chunkId]['text'])) { + continue; + } + + $docId = $rows[$chunkId]['document_id'] ?? null; + $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null; + + if (!is_string($docId) || $docId === '') { + continue; + } + + if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) { + continue; + } + + if (is_int($chunkIndex)) { + foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) { + if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) { + continue 2; + } + } + $docChunkPositions[$docId][] = $chunkIndex; } @@ -638,7 +1010,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $out = []; foreach ($chunkIds as $id) { - if (!isset($rows[$id]['text'])) { continue; }