From ce859b9662b9d10ab8f57f8900263d6d4e0f8f76 Mon Sep 17 00:00:00 2001 From: team 1 Date: Tue, 21 Apr 2026 17:20:16 +0200 Subject: [PATCH] fine tuning rag --- .../Retrieval/NdjsonHybridRetriever.php | 1631 +++++++---------- 1 file changed, 699 insertions(+), 932 deletions(-) diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index a29e86f..fa55f3f 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -25,8 +25,7 @@ use RuntimeException; * - optionally short-circuit to catalog list output * - resolve exact document-title matches before semantic retrieval * - run vector retrieval globally and optionally document-scoped - * - run lexical retrieval globally and optionally document-scoped - * - fuse all result sets with RRF-style scoring + * - fuse both result sets with RRF-style scoring * - apply selection rules for list queries vs. sales-style queries * - return either plain chunk texts or debug metadata */ @@ -40,83 +39,40 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface private const DOMINANT_DOC_MIN_HITS = 3; private const DOMINANT_DOC_MAX_CHUNKS = 4; private const EXACT_DOCUMENT_MAX_CHUNKS = 6; - - /** - * Conservative no-tag fallback: - * derive a temporary document scope only when the top global vector hits - * show repeated evidence for the same document(s). - */ - private const PSEUDO_SCOPE_GLOBAL_WINDOW = 10; - private const PSEUDO_SCOPE_MIN_DOC_HITS = 2; - private const PSEUDO_SCOPE_MAX_DOCS = 3; - - /** - * Soft document candidates are derived from global lexical hits first. - * This stage is placed between tag-routing and vector-based pseudo scope. - */ - private const SOFT_DOC_CANDIDATE_WINDOW = 8; - private const SOFT_DOC_CANDIDATE_MIN_DOC_HITS = 2; - private const SOFT_DOC_CANDIDATE_MAX_DOCS = 3; - private const SOFT_DOC_TOP_SCORE_MIN = 0.98; - - /** - * Scoped retrieval is useful in both cases, but true tag-routing should - * stay stronger than soft candidates and pseudo-scoping. - */ - private const TAG_SCOPED_VECTOR_BOOST = 1.20; - private const SOFT_DOC_SCOPED_VECTOR_BOOST = 1.12; - private const PSEUDO_SCOPED_VECTOR_BOOST = 1.08; - - /** - * Secondary vector query should help recall/robustness, but must not - * overpower the primary enriched semantic query. - */ - private const SECONDARY_GLOBAL_VECTOR_BOOST = 0.93; - private const SECONDARY_SCOPED_VECTOR_MULTIPLIER = 0.95; - - /** - * Lexical retrieval should support precision, but not overpower vector routing. - */ - private const LEXICAL_SCORE_THRESHOLD = 0.18; - private const GLOBAL_LEXICAL_BOOST = 0.90; - private const TAG_SCOPED_LEXICAL_BOOST = 1.04; - private const SOFT_DOC_SCOPED_LEXICAL_BOOST = 1.02; - private const PSEUDO_SCOPED_LEXICAL_BOOST = 1.00; - - /** - * Conservative re-rank stage based on document title / metadata alignment. - * - * This is intentionally applied after fusion so it sharpens ranking - * without replacing the underlying retrieval sources. - */ - private const TITLE_MATCH_BASE_BOOST = 0.04; - private const TITLE_MATCH_MAX_BOOST = 0.18; - private const FILE_MATCH_BASE_BOOST = 0.02; - private const FILE_MATCH_MAX_BOOST = 0.08; - private const META_MATCH_MAX_BOOST = 0.04; - private const EXACT_TITLE_PHRASE_BOOST = 0.08; - private const EXACT_FILE_PHRASE_BOOST = 0.04; - private const MAX_TITLE_METADATA_BOOST = 0.22; + private const FOCUSED_PRODUCT_WINDOW = 8; + private const FOCUSED_PRODUCT_MIN_SCORE = 10.0; + private const FOCUSED_PRODUCT_MIN_GAP = 4.0; + private const FOCUSED_PRODUCT_MAX_CHUNKS = 4; public function __construct( - private NdjsonChunkLookup $lookup, - private VectorSearchClient $vectorClient, - private NdjsonKeywordRetriever $keywordRetriever, - private TagRoutingService $tagRouting, + private NdjsonChunkLookup $lookup, + private VectorSearchClient $vectorClient, + private TagRoutingService $tagRouting, private ModelGenerationConfigRepository $configRepository, - private QueryCleaner $queryCleaner, - private IntentLite $intentLite, - private SalesIntentLite $salesIntentLite, - private CatalogIntentLite $catalogIntent, - private IntentRouteResolver $routeResolver, - private EntityCatalogService $entityCatalogService, - private QueryEnricher $queryEnricher, - ) { + private QueryCleaner $queryCleaner, + private IntentLite $intentLite, + private SalesIntentLite $salesIntentLite, + private CatalogIntentLite $catalogIntent, + private IntentRouteResolver $routeResolver, + private EntityCatalogService $entityCatalogService, + private QueryEnricher $queryEnricher, + ) + { } + // ========================================================= + // PUBLIC API + // ========================================================= + /** * Returns the final retrieval payload as plain text chunks. * + * Behaviour: + * - loads active retrieval config + * - executes the full orchestration pipeline + * - if the route resolves to a catalog list, returns the catalog block only + * - otherwise returns the selected chunk texts + * * @throws Exception */ public function retrieve(string $prompt): array @@ -141,6 +97,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface /** * Returns a debug-friendly retrieval result with scoring/meta information. * + * This method is used for inspection and tuning: + * - selected chunk ids + * - raw vector scores + * - fused RRF scores + * - intent / route information + * - threshold and list-query flags + * * @throws Exception */ public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array @@ -155,40 +118,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'document_id' => null, 'chunk_index' => null, 'raw_score' => null, - 'raw_vector_score' => null, - 'raw_keyword_score' => null, 'rrf_score' => null, 'threshold' => 0.0, - 'lexical_threshold' => self::LEXICAL_SCORE_THRESHOLD, 'intent' => $result['intent'], 'route' => $result['route'], 'entity_label' => $result['entityLabel'], 'is_list_query' => true, 'selection_mode' => 'catalog_list', - 'scope_mode' => 'catalog_list', - 'clean_query' => null, - 'semantic_query' => null, - 'secondary_vector_query' => null, - 'lexical_query' => null, - 'tag_candidate_doc_ids' => [], - 'soft_document_candidate_doc_ids' => [], - 'pseudo_scope_doc_ids' => [], - 'global_hit_count' => 0, - 'scoped_hit_count' => 0, - 'global_vector_hit_count' => 0, - 'global_primary_vector_hit_count' => 0, - 'global_secondary_vector_hit_count' => 0, - 'global_keyword_hit_count' => 0, - 'scoped_vector_hit_count' => 0, - 'scoped_primary_vector_hit_count' => 0, - 'scoped_secondary_vector_hit_count' => 0, - 'scoped_keyword_hit_count' => 0, - 'scoped_boost_factor' => 0.0, - 'scoped_vector_boost_factor' => 0.0, - 'secondary_scoped_vector_boost_factor' => 0.0, - 'scoped_keyword_boost_factor' => 0.0, - 'title_metadata_boost' => 0.0, - 'title_metadata_doc_boosts' => [], 'text' => $result['catalogBlock'], ]]; } @@ -207,49 +143,19 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $rank++; - $rawVectorScore = $result['rawVectorScores'][$chunkId] ?? null; - $rawKeywordScore = $result['rawKeywordScores'][$chunkId] ?? null; - $out[] = [ 'rank' => $rank, 'chunk_id' => $chunkId, 'document_id' => $result['rows'][$chunkId]['document_id'] ?? null, 'chunk_index' => $result['rows'][$chunkId]['chunk_index'] ?? null, - 'raw_score' => $this->maxNullableFloat($rawVectorScore, $rawKeywordScore), - 'raw_vector_score' => $rawVectorScore, - 'raw_keyword_score' => $rawKeywordScore, + 'raw_score' => $result['rawScores'][$chunkId] ?? null, 'rrf_score' => $result['rrfScores'][$chunkId] ?? null, 'threshold' => $result['threshold'], - 'lexical_threshold' => self::LEXICAL_SCORE_THRESHOLD, 'intent' => $result['intent'], 'route' => $result['route'], 'entity_label' => $result['entityLabel'], 'is_list_query' => $result['isListQuery'], 'selection_mode' => $result['selectionMode'], - 'scope_mode' => $result['scopeMode'], - 'clean_query' => $result['cleanQuery'], - 'semantic_query' => $result['semanticQuery'], - 'secondary_vector_query' => $result['secondaryVectorQuery'], - 'lexical_query' => $result['lexicalQuery'], - 'tag_candidate_doc_ids' => $result['tagCandidateDocIds'], - 'soft_document_candidate_doc_ids' => $result['softDocumentCandidateDocIds'], - 'pseudo_scope_doc_ids' => $result['pseudoScopeDocIds'], - 'global_hit_count' => $result['globalHitCount'], - 'scoped_hit_count' => $result['scopedHitCount'], - 'global_vector_hit_count' => $result['globalVectorHitCount'], - 'global_primary_vector_hit_count' => $result['globalPrimaryVectorHitCount'], - 'global_secondary_vector_hit_count' => $result['globalSecondaryVectorHitCount'], - 'global_keyword_hit_count' => $result['globalKeywordHitCount'], - 'scoped_vector_hit_count' => $result['scopedVectorHitCount'], - 'scoped_primary_vector_hit_count' => $result['scopedPrimaryVectorHitCount'], - 'scoped_secondary_vector_hit_count' => $result['scopedSecondaryVectorHitCount'], - 'scoped_keyword_hit_count' => $result['scopedKeywordHitCount'], - 'scoped_boost_factor' => $result['scopedBoostFactor'], - 'scoped_vector_boost_factor' => $result['scopedVectorBoostFactor'], - 'secondary_scoped_vector_boost_factor' => $result['secondaryScopedVectorBoostFactor'], - 'scoped_keyword_boost_factor' => $result['scopedKeywordBoostFactor'], - 'title_metadata_boost' => $result['titleMetadataBoosts'][$chunkId] ?? 0.0, - 'title_metadata_doc_boosts' => $result['titleMetadataDocBoosts'], 'text' => trim((string)$result['rows'][$chunkId]['text']), ]; } @@ -257,16 +163,29 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } + // ========================================================= + // CENTRAL ORCHESTRATION + // ========================================================= + /** * Central orchestration entrypoint. * + * Pipeline: + * 1. Detect catalog entity and sales intent + * 2. Resolve route + * 3. If route is a catalog list route, try direct catalog output + * 4. If prompt matches one exact document title, use exact-document fast path + * 5. Otherwise, run the normal hybrid retrieval core + * 6. Select final chunk ids depending on query type + * * @throws Exception */ private function execute( - string $prompt, + string $prompt, ModelGenerationConfig $config, - bool $withScores - ): array { + bool $withScores + ): array + { $entityLabel = $this->catalogIntent->detect($prompt); $salesIntent = $this->detectSalesIntent($prompt); $route = $this->routeResolver->resolve($salesIntent, $entityLabel); @@ -281,35 +200,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'intent' => $salesIntent, 'isListQuery' => true, 'selectionMode' => 'catalog_list', - 'scopeMode' => 'catalog_list', - 'cleanQuery' => null, - 'semanticQuery' => null, - 'secondaryVectorQuery' => null, - 'lexicalQuery' => null, - 'tagCandidateDocIds' => [], - 'softDocumentCandidateDocIds' => [], - 'pseudoScopeDocIds' => [], - 'globalHitCount' => 0, - 'scopedHitCount' => 0, - 'globalVectorHitCount' => 0, - 'globalPrimaryVectorHitCount' => 0, - 'globalSecondaryVectorHitCount' => 0, - 'globalKeywordHitCount' => 0, - 'scopedVectorHitCount' => 0, - 'scopedPrimaryVectorHitCount' => 0, - 'scopedSecondaryVectorHitCount' => 0, - 'scopedKeywordHitCount' => 0, - 'scopedBoostFactor' => 0.0, - 'scopedVectorBoostFactor' => 0.0, - 'secondaryScopedVectorBoostFactor' => 0.0, - 'scopedKeywordBoostFactor' => 0.0, 'selectedChunkIds' => [], 'rows' => [], 'rrfScores' => [], - 'rawVectorScores' => [], - 'rawKeywordScores' => [], - 'titleMetadataBoosts' => [], - 'titleMetadataDocBoosts' => [], + 'rawScores' => [], 'threshold' => 0.0, 'catalogBlock' => trim($catalogBlock), ]; @@ -331,35 +225,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'intent' => $salesIntent, 'isListQuery' => false, 'selectionMode' => 'exact_document_title', - 'scopeMode' => 'exact_document_title', - 'cleanQuery' => null, - 'semanticQuery' => null, - 'secondaryVectorQuery' => null, - 'lexicalQuery' => null, - 'tagCandidateDocIds' => [], - 'softDocumentCandidateDocIds' => [], - 'pseudoScopeDocIds' => [], - 'globalHitCount' => 0, - 'scopedHitCount' => 0, - 'globalVectorHitCount' => 0, - 'globalPrimaryVectorHitCount' => 0, - 'globalSecondaryVectorHitCount' => 0, - 'globalKeywordHitCount' => 0, - 'scopedVectorHitCount' => 0, - 'scopedPrimaryVectorHitCount' => 0, - 'scopedSecondaryVectorHitCount' => 0, - 'scopedKeywordHitCount' => 0, - 'scopedBoostFactor' => 0.0, - 'scopedVectorBoostFactor' => 0.0, - 'secondaryScopedVectorBoostFactor' => 0.0, - 'scopedKeywordBoostFactor' => 0.0, 'selectedChunkIds' => $selectedChunkIds, 'rows' => $exactDocumentMatch['rows'], 'rrfScores' => $this->buildExactDocumentScores($selectedChunkIds), - 'rawVectorScores' => [], - 'rawKeywordScores' => [], - 'titleMetadataBoosts' => [], - 'titleMetadataDocBoosts' => [], + 'rawScores' => [], 'threshold' => 1.0, 'catalogBlock' => null, ]; @@ -375,39 +244,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'intent' => $salesIntent, 'isListQuery' => $core['is_list_query'], 'selectionMode' => null, - 'scopeMode' => $core['scope_mode'], - 'cleanQuery' => $core['clean_query'], - 'semanticQuery' => $core['semantic_query'], - 'secondaryVectorQuery' => $core['secondary_vector_query'], - 'lexicalQuery' => $core['lexical_query'], - 'tagCandidateDocIds' => $core['tag_candidate_doc_ids'], - 'softDocumentCandidateDocIds' => $core['soft_document_candidate_doc_ids'], - 'pseudoScopeDocIds' => $core['pseudo_scope_doc_ids'], - 'globalHitCount' => $core['global_hit_count'], - 'scopedHitCount' => $core['scoped_hit_count'], - 'globalVectorHitCount' => $core['global_vector_hit_count'], - 'globalPrimaryVectorHitCount' => $core['global_primary_vector_hit_count'], - 'globalSecondaryVectorHitCount' => $core['global_secondary_vector_hit_count'], - 'globalKeywordHitCount' => $core['global_keyword_hit_count'], - 'scopedVectorHitCount' => $core['scoped_vector_hit_count'], - 'scopedPrimaryVectorHitCount' => $core['scoped_primary_vector_hit_count'], - 'scopedSecondaryVectorHitCount' => $core['scoped_secondary_vector_hit_count'], - 'scopedKeywordHitCount' => $core['scoped_keyword_hit_count'], - 'scopedBoostFactor' => max( - $core['scoped_vector_boost_factor'], - $core['secondary_scoped_vector_boost_factor'], - $core['scoped_keyword_boost_factor'] - ), - 'scopedVectorBoostFactor' => $core['scoped_vector_boost_factor'], - 'secondaryScopedVectorBoostFactor' => $core['secondary_scoped_vector_boost_factor'], - 'scopedKeywordBoostFactor' => $core['scoped_keyword_boost_factor'], 'selectedChunkIds' => [], 'rows' => [], 'rrfScores' => [], - 'rawVectorScores' => [], - 'rawKeywordScores' => [], - 'titleMetadataBoosts' => $core['title_metadata_boosts'], - 'titleMetadataDocBoosts' => $core['title_metadata_doc_boosts'], + 'rawScores' => [], 'threshold' => $core['threshold'], 'catalogBlock' => null, ]; @@ -422,6 +262,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $selectionMode = 'list_deduplicated'; } else { $salesSelection = $this->selectSalesChunkIds( + $prompt, $core['ranked_chunk_ids'], $core['rows'], $core['limit'] @@ -437,272 +278,107 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'intent' => $salesIntent, 'isListQuery' => $core['is_list_query'], 'selectionMode' => $selectionMode, - 'scopeMode' => $core['scope_mode'], - 'cleanQuery' => $core['clean_query'], - 'semanticQuery' => $core['semantic_query'], - 'secondaryVectorQuery' => $core['secondary_vector_query'], - 'lexicalQuery' => $core['lexical_query'], - 'tagCandidateDocIds' => $core['tag_candidate_doc_ids'], - 'softDocumentCandidateDocIds' => $core['soft_document_candidate_doc_ids'], - 'pseudoScopeDocIds' => $core['pseudo_scope_doc_ids'], - 'globalHitCount' => $core['global_hit_count'], - 'scopedHitCount' => $core['scoped_hit_count'], - 'globalVectorHitCount' => $core['global_vector_hit_count'], - 'globalPrimaryVectorHitCount' => $core['global_primary_vector_hit_count'], - 'globalSecondaryVectorHitCount' => $core['global_secondary_vector_hit_count'], - 'globalKeywordHitCount' => $core['global_keyword_hit_count'], - 'scopedVectorHitCount' => $core['scoped_vector_hit_count'], - 'scopedPrimaryVectorHitCount' => $core['scoped_primary_vector_hit_count'], - 'scopedSecondaryVectorHitCount' => $core['scoped_secondary_vector_hit_count'], - 'scopedKeywordHitCount' => $core['scoped_keyword_hit_count'], - 'scopedBoostFactor' => max( - $core['scoped_vector_boost_factor'], - $core['secondary_scoped_vector_boost_factor'], - $core['scoped_keyword_boost_factor'] - ), - 'scopedVectorBoostFactor' => $core['scoped_vector_boost_factor'], - 'secondaryScopedVectorBoostFactor' => $core['secondary_scoped_vector_boost_factor'], - 'scopedKeywordBoostFactor' => $core['scoped_keyword_boost_factor'], 'selectedChunkIds' => $selectedChunkIds, 'rows' => $core['rows'], 'rrfScores' => $core['rrf_scores'], - 'rawVectorScores' => $core['raw_vector_scores'], - 'rawKeywordScores' => $core['raw_keyword_scores'], - 'titleMetadataBoosts' => $core['title_metadata_boosts'], - 'titleMetadataDocBoosts' => $core['title_metadata_doc_boosts'], + 'rawScores' => $core['raw_scores'], 'threshold' => $core['threshold'], 'catalogBlock' => null, ]; } + // ========================================================= + // CORE PIPELINE + // ========================================================= + /** * Executes the actual hybrid retrieval logic. * + * Steps: + * - derive limits from config within hard safety caps + * - detect whether the prompt is a "list query" + * - clean and enrich the prompt + * - compute threshold + vector topK based on intent/query type + * - route query into candidate document ids via tag routing + * - run global and optional scoped vector search + * - fuse hits + * - resolve chunk ids to chunk rows + * * @throws Exception */ private function runCore( - string $prompt, + string $prompt, ModelGenerationConfig $config, - bool $withScores, - string $salesIntent - ): array { + bool $withScores, + string $salesIntent + ): array + { $limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK)); $isListQuery = $this->intentLite->isListQuery($prompt); $cleanQuery = $this->queryCleaner->clean($prompt); + $cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery); if ($cleanQuery === '') { return [ 'limit' => $limit, 'is_list_query' => $isListQuery, 'threshold' => NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD, - 'clean_query' => '', - 'semantic_query' => '', - 'secondary_vector_query' => '', - 'lexical_query' => '', - 'scope_mode' => 'none', - 'tag_candidate_doc_ids' => [], - 'soft_document_candidate_doc_ids' => [], - 'pseudo_scope_doc_ids' => [], - 'global_hit_count' => 0, - 'scoped_hit_count' => 0, - 'global_vector_hit_count' => 0, - 'global_primary_vector_hit_count' => 0, - 'global_secondary_vector_hit_count' => 0, - 'global_keyword_hit_count' => 0, - 'scoped_vector_hit_count' => 0, - 'scoped_primary_vector_hit_count' => 0, - 'scoped_secondary_vector_hit_count' => 0, - 'scoped_keyword_hit_count' => 0, - 'scoped_vector_boost_factor' => 0.0, - 'secondary_scoped_vector_boost_factor' => 0.0, - 'scoped_keyword_boost_factor' => 0.0, 'ranked_chunk_ids' => [], 'rows' => [], 'rrf_scores' => [], - 'raw_vector_scores' => [], - 'raw_keyword_scores' => [], - 'title_metadata_boosts' => [], - 'title_metadata_doc_boosts' => [], + 'raw_scores' => [], ]; } - $semanticQuery = $this->queryEnricher->enrichPrompt($cleanQuery); - $secondaryVectorQuery = $cleanQuery !== $semanticQuery ? $cleanQuery : ''; - $lexicalQuery = $cleanQuery; - [$threshold, $topK] = $this->computeThresholdAndTopK( $salesIntent, $isListQuery, $vectorTopKBase ); - $tagCandidateDocIds = $this->tagRouting->route($semanticQuery); - $tagCandidateDocIds = is_array($tagCandidateDocIds) + $candidateDocIds = $this->tagRouting->route($cleanQuery); + $candidateDocIds = is_array($candidateDocIds) ? array_values(array_unique(array_filter( - $tagCandidateDocIds, + $candidateDocIds, static fn(mixed $value): bool => is_string($value) && $value !== '' ))) : []; - $globalPrimaryVectorHits = $this->vectorClient->search($semanticQuery, $topK); - $globalSecondaryVectorHits = $secondaryVectorQuery !== '' - ? $this->vectorClient->search($secondaryVectorQuery, $topK) - : []; - $globalKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK); + $globalHits = $this->vectorClient->search($cleanQuery, $topK); - $softDocumentCandidateDocIds = []; - $pseudoScopeDocIds = []; - $scopeMode = 'none'; - - $scopedVectorBoostFactor = 0.0; - $secondaryScopedVectorBoostFactor = 0.0; - $scopedKeywordBoostFactor = 0.0; - - $scopedPrimaryVectorHits = []; - $scopedSecondaryVectorHits = []; - $scopedKeywordHits = []; - - if ($tagCandidateDocIds !== []) { - $scopeMode = 'tag_routing'; - $scopedVectorBoostFactor = self::TAG_SCOPED_VECTOR_BOOST; - $secondaryScopedVectorBoostFactor = self::TAG_SCOPED_VECTOR_BOOST * self::SECONDARY_SCOPED_VECTOR_MULTIPLIER; - $scopedKeywordBoostFactor = self::TAG_SCOPED_LEXICAL_BOOST; - - $scopedPrimaryVectorHits = $this->vectorClient->searchScoped($semanticQuery, $topK, $tagCandidateDocIds); - $scopedSecondaryVectorHits = $secondaryVectorQuery !== '' - ? $this->vectorClient->searchScoped($secondaryVectorQuery, $topK, $tagCandidateDocIds) - : []; - $scopedKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK, $tagCandidateDocIds); - } else { - $softDocumentCandidateDocIds = $this->deriveSoftDocumentCandidateDocIds($globalKeywordHits); - - if ($softDocumentCandidateDocIds !== []) { - $scopeMode = 'soft_document_candidate'; - $scopedVectorBoostFactor = self::SOFT_DOC_SCOPED_VECTOR_BOOST; - $secondaryScopedVectorBoostFactor = self::SOFT_DOC_SCOPED_VECTOR_BOOST * self::SECONDARY_SCOPED_VECTOR_MULTIPLIER; - $scopedKeywordBoostFactor = self::SOFT_DOC_SCOPED_LEXICAL_BOOST; - - $scopedPrimaryVectorHits = $this->vectorClient->searchScoped($semanticQuery, $topK, $softDocumentCandidateDocIds); - $scopedSecondaryVectorHits = $secondaryVectorQuery !== '' - ? $this->vectorClient->searchScoped($secondaryVectorQuery, $topK, $softDocumentCandidateDocIds) - : []; - $scopedKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK, $softDocumentCandidateDocIds); - } else { - $pseudoScopeDocIds = $this->derivePseudoScopeDocumentIds($globalPrimaryVectorHits); - - if ($pseudoScopeDocIds !== []) { - $scopeMode = 'pseudo_scope'; - $scopedVectorBoostFactor = self::PSEUDO_SCOPED_VECTOR_BOOST; - $secondaryScopedVectorBoostFactor = self::PSEUDO_SCOPED_VECTOR_BOOST * self::SECONDARY_SCOPED_VECTOR_MULTIPLIER; - $scopedKeywordBoostFactor = self::PSEUDO_SCOPED_LEXICAL_BOOST; - - $scopedPrimaryVectorHits = $this->vectorClient->searchScoped($semanticQuery, $topK, $pseudoScopeDocIds); - $scopedSecondaryVectorHits = $secondaryVectorQuery !== '' - ? $this->vectorClient->searchScoped($secondaryVectorQuery, $topK, $pseudoScopeDocIds) - : []; - $scopedKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK, $pseudoScopeDocIds); - } - } + $scopedHits = []; + if ($candidateDocIds !== []) { + $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds); } - if ( - $globalPrimaryVectorHits === [] - && $globalSecondaryVectorHits === [] - && $globalKeywordHits === [] - && $scopedPrimaryVectorHits === [] - && $scopedSecondaryVectorHits === [] - && $scopedKeywordHits === [] - ) { + if ($globalHits === [] && $scopedHits === []) { return [ 'limit' => $limit, 'is_list_query' => $isListQuery, 'threshold' => $threshold, - 'clean_query' => $cleanQuery, - 'semantic_query' => $semanticQuery, - 'secondary_vector_query' => $secondaryVectorQuery, - 'lexical_query' => $lexicalQuery, - 'scope_mode' => $scopeMode, - 'tag_candidate_doc_ids' => $tagCandidateDocIds, - 'soft_document_candidate_doc_ids' => $softDocumentCandidateDocIds, - 'pseudo_scope_doc_ids' => $pseudoScopeDocIds, - 'global_hit_count' => 0, - 'scoped_hit_count' => 0, - 'global_vector_hit_count' => 0, - 'global_primary_vector_hit_count' => 0, - 'global_secondary_vector_hit_count' => 0, - 'global_keyword_hit_count' => 0, - 'scoped_vector_hit_count' => 0, - 'scoped_primary_vector_hit_count' => 0, - 'scoped_secondary_vector_hit_count' => 0, - 'scoped_keyword_hit_count' => 0, - 'scoped_vector_boost_factor' => $scopedVectorBoostFactor, - 'secondary_scoped_vector_boost_factor' => $secondaryScopedVectorBoostFactor, - 'scoped_keyword_boost_factor' => $scopedKeywordBoostFactor, 'ranked_chunk_ids' => [], 'rows' => [], 'rrf_scores' => [], - 'raw_vector_scores' => [], - 'raw_keyword_scores' => [], - 'title_metadata_boosts' => [], - 'title_metadata_doc_boosts' => [], + 'raw_scores' => [], ]; } - $fused = $this->fuseHitSources([ - [ - 'hits' => $globalPrimaryVectorHits, - 'threshold' => $threshold, - 'boost' => 1.0, - 'bucket' => 'vector', - ], - [ - 'hits' => $globalSecondaryVectorHits, - 'threshold' => $threshold, - 'boost' => self::SECONDARY_GLOBAL_VECTOR_BOOST, - 'bucket' => 'vector', - ], - [ - 'hits' => $globalKeywordHits, - 'threshold' => self::LEXICAL_SCORE_THRESHOLD, - 'boost' => self::GLOBAL_LEXICAL_BOOST, - 'bucket' => 'keyword', - ], - [ - 'hits' => $scopedPrimaryVectorHits, - 'threshold' => $threshold, - 'boost' => $scopedVectorBoostFactor, - 'bucket' => 'vector', - ], - [ - 'hits' => $scopedSecondaryVectorHits, - 'threshold' => $threshold, - 'boost' => $secondaryScopedVectorBoostFactor, - 'bucket' => 'vector', - ], - [ - 'hits' => $scopedKeywordHits, - 'threshold' => self::LEXICAL_SCORE_THRESHOLD, - 'boost' => $scopedKeywordBoostFactor, - 'bucket' => 'keyword', - ], - ], $withScores); + $fused = $this->fuseHits( + $globalHits, + $scopedHits, + $threshold, + $scopedHits !== [], + $withScores + ); $rrfScores = $fused['rrf_scores']; - $rawVectorScores = $fused['raw_vector_scores']; - $rawKeywordScores = $fused['raw_keyword_scores']; + $rawScores = $fused['raw_scores']; - if ($rrfScores === []) { - $rrfScores = $this->fallbackRrfFromSources( - $globalPrimaryVectorHits, - $globalSecondaryVectorHits, - $globalKeywordHits, - $scopedPrimaryVectorHits, - $scopedSecondaryVectorHits, - $scopedKeywordHits - ); + if ($rrfScores === [] && $globalHits !== []) { + $rrfScores = $this->fallbackRrfFromHits($globalHits); } if ($rrfScores === []) { @@ -710,45 +386,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'limit' => $limit, 'is_list_query' => $isListQuery, 'threshold' => $threshold, - 'clean_query' => $cleanQuery, - 'semantic_query' => $semanticQuery, - 'secondary_vector_query' => $secondaryVectorQuery, - 'lexical_query' => $lexicalQuery, - 'scope_mode' => $scopeMode, - 'tag_candidate_doc_ids' => $tagCandidateDocIds, - 'soft_document_candidate_doc_ids' => $softDocumentCandidateDocIds, - 'pseudo_scope_doc_ids' => $pseudoScopeDocIds, - 'global_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits) + count($globalKeywordHits), - 'scoped_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits) + count($scopedKeywordHits), - 'global_vector_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits), - 'global_primary_vector_hit_count' => count($globalPrimaryVectorHits), - 'global_secondary_vector_hit_count' => count($globalSecondaryVectorHits), - 'global_keyword_hit_count' => count($globalKeywordHits), - 'scoped_vector_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits), - 'scoped_primary_vector_hit_count' => count($scopedPrimaryVectorHits), - 'scoped_secondary_vector_hit_count' => count($scopedSecondaryVectorHits), - 'scoped_keyword_hit_count' => count($scopedKeywordHits), - 'scoped_vector_boost_factor' => $scopedVectorBoostFactor, - 'secondary_scoped_vector_boost_factor' => $secondaryScopedVectorBoostFactor, - 'scoped_keyword_boost_factor' => $scopedKeywordBoostFactor, 'ranked_chunk_ids' => [], 'rows' => [], 'rrf_scores' => [], - 'raw_vector_scores' => $rawVectorScores, - 'raw_keyword_scores' => $rawKeywordScores, - 'title_metadata_boosts' => [], - 'title_metadata_doc_boosts' => [], + 'raw_scores' => $rawScores, ]; } - $rows = $this->lookup->findByChunkIds(array_keys($rrfScores)); - - [$rrfScores, $titleMetadataBoosts, $titleMetadataDocBoosts] = $this->applyTitleMetadataBoosts( - $rrfScores, - $rows, - $lexicalQuery - ); - arsort($rrfScores); $rankedChunkIds = array_keys($rrfScores); @@ -758,38 +402,22 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'limit' => $limit, 'is_list_query' => $isListQuery, 'threshold' => $threshold, - 'clean_query' => $cleanQuery, - 'semantic_query' => $semanticQuery, - 'secondary_vector_query' => $secondaryVectorQuery, - 'lexical_query' => $lexicalQuery, - 'scope_mode' => $scopeMode, - 'tag_candidate_doc_ids' => $tagCandidateDocIds, - 'soft_document_candidate_doc_ids' => $softDocumentCandidateDocIds, - 'pseudo_scope_doc_ids' => $pseudoScopeDocIds, - 'global_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits) + count($globalKeywordHits), - 'scoped_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits) + count($scopedKeywordHits), - 'global_vector_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits), - 'global_primary_vector_hit_count' => count($globalPrimaryVectorHits), - 'global_secondary_vector_hit_count' => count($globalSecondaryVectorHits), - 'global_keyword_hit_count' => count($globalKeywordHits), - 'scoped_vector_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits), - 'scoped_primary_vector_hit_count' => count($scopedPrimaryVectorHits), - 'scoped_secondary_vector_hit_count' => count($scopedSecondaryVectorHits), - 'scoped_keyword_hit_count' => count($scopedKeywordHits), - 'scoped_vector_boost_factor' => $scopedVectorBoostFactor, - 'secondary_scoped_vector_boost_factor' => $secondaryScopedVectorBoostFactor, - 'scoped_keyword_boost_factor' => $scopedKeywordBoostFactor, 'ranked_chunk_ids' => $rankedChunkIds, 'rows' => $rows, 'rrf_scores' => $rrfScores, - 'raw_vector_scores' => $rawVectorScores, - 'raw_keyword_scores' => $rawKeywordScores, - 'title_metadata_boosts' => $titleMetadataBoosts, - 'title_metadata_doc_boosts' => $titleMetadataDocBoosts, + 'raw_scores' => $rawScores, ]; } + // ========================================================= + // SUPPORT + // ========================================================= + /** + * Loads the active model generation config. + * + * Retrieval is not allowed to proceed without an active config. + */ private function requireConfig(): ModelGenerationConfig { $config = $this->configRepository->findActiveForModel(); @@ -801,18 +429,32 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $config; } + /** + * Extracts the normalized sales intent string from the intent detector. + * + * Falls back to DISCOVERY when the detector payload is incomplete. + */ private function detectSalesIntent(string $prompt): string { $data = $this->salesIntentLite->detect($prompt); - return (string) ($data['intent'] ?? SalesIntentLite::DISCOVERY); + return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY); } + /** + * Computes retrieval threshold and vector topK. + * + * Rules: + * - objection/pricing intents are slightly stricter + * - list queries are allowed to retrieve a wider candidate set + * - all values are clamped to global hard limits + */ private function computeThresholdAndTopK( string $salesIntent, - bool $isListQuery, - int $vectorTopKBase - ): array { + bool $isListQuery, + int $vectorTopKBase + ): array + { $threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD; $topK = $vectorTopKBase; @@ -824,7 +466,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } if ($isListQuery) { - $topK = (int) round($topK * NdjsonHybridRetrieverConfig::LIST_BONUS); + $topK = (int)round($topK * NdjsonHybridRetrieverConfig::LIST_BONUS); } $topK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK)); @@ -837,175 +479,26 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } /** - * @param array> $globalKeywordHits - * @return string[] + * Fuses multiple hit lists into one RRF-style score map. + * + * Notes: + * - only hits above threshold are considered + * - rank position within each hit list contributes to the final score + * - scoped hits can be boosted + * - raw scores are optionally captured for debug output */ - private function deriveSoftDocumentCandidateDocIds(array $globalKeywordHits): array - { - $window = array_slice($globalKeywordHits, 0, self::SOFT_DOC_CANDIDATE_WINDOW); - $stats = []; - - foreach ($window as $rank => $hit) { - $documentId = $hit['document_id'] ?? null; - - if (!is_string($documentId) || $documentId === '') { - continue; - } - - $score = isset($hit['score']) && is_numeric($hit['score']) - ? (float) $hit['score'] - : 0.0; - - if (!isset($stats[$documentId])) { - $stats[$documentId] = [ - 'document_id' => $documentId, - 'count' => 0, - 'best_rank' => $rank, - 'best_score' => $score, - ]; - } - - $stats[$documentId]['count']++; - $stats[$documentId]['best_rank'] = min($stats[$documentId]['best_rank'], $rank); - $stats[$documentId]['best_score'] = max($stats[$documentId]['best_score'], $score); - } - - if ($stats === []) { - return []; - } - - uasort($stats, static function (array $a, array $b): int { - if ($a['count'] !== $b['count']) { - return $b['count'] <=> $a['count']; - } - - if (abs((float) $a['best_score'] - (float) $b['best_score']) > 0.000001) { - return ((float) $b['best_score'] <=> (float) $a['best_score']); - } - - return $a['best_rank'] <=> $b['best_rank']; - }); - - $selected = []; - - foreach ($stats as $row) { - $count = (int) $row['count']; - $bestRank = (int) $row['best_rank']; - $bestScore = (float) $row['best_score']; - - if ( - $count < self::SOFT_DOC_CANDIDATE_MIN_DOC_HITS - && !($bestRank === 0 && $bestScore >= self::SOFT_DOC_TOP_SCORE_MIN) - ) { - continue; - } - - $selected[] = (string) $row['document_id']; - - if (count($selected) >= self::SOFT_DOC_CANDIDATE_MAX_DOCS) { - break; - } - } - - return $selected; - } - - /** - * @param array> $globalPrimaryVectorHits - * @return string[] - */ - private function derivePseudoScopeDocumentIds(array $globalPrimaryVectorHits): array - { - $window = array_slice($globalPrimaryVectorHits, 0, self::PSEUDO_SCOPE_GLOBAL_WINDOW); - $stats = []; - - foreach ($window as $rank => $hit) { - $documentId = $hit['document_id'] ?? null; - - if (!is_string($documentId) || $documentId === '') { - continue; - } - - $score = isset($hit['score']) && is_numeric($hit['score']) - ? (float) $hit['score'] - : 0.0; - - if (!isset($stats[$documentId])) { - $stats[$documentId] = [ - 'document_id' => $documentId, - 'count' => 0, - 'best_rank' => $rank, - 'best_score' => $score, - ]; - } - - $stats[$documentId]['count']++; - $stats[$documentId]['best_rank'] = min($stats[$documentId]['best_rank'], $rank); - $stats[$documentId]['best_score'] = max($stats[$documentId]['best_score'], $score); - } - - if ($stats === []) { - return []; - } - - uasort($stats, static function (array $a, array $b): int { - if ($a['count'] !== $b['count']) { - return $b['count'] <=> $a['count']; - } - - if (abs((float) $a['best_score'] - (float) $b['best_score']) > 0.000001) { - return ((float) $b['best_score'] <=> (float) $a['best_score']); - } - - return $a['best_rank'] <=> $b['best_rank']; - }); - - $selected = []; - - foreach ($stats as $row) { - if ((int) $row['count'] < self::PSEUDO_SCOPE_MIN_DOC_HITS) { - continue; - } - - $selected[] = (string) $row['document_id']; - - if (count($selected) >= self::PSEUDO_SCOPE_MAX_DOCS) { - break; - } - } - - return $selected; - } - - /** - * @param array>, - * threshold: float, - * boost: float, - * bucket: string - * }> $sources - * @return array{ - * rrf_scores: array, - * raw_vector_scores: array, - * raw_keyword_scores: array - * } - */ - private function fuseHitSources(array $sources, bool $captureRaw): array + private function fuseHits( + array $globalHits, + array $scopedHits, + float $threshold, + bool $boostScoped, + bool $captureRaw + ): array { $rrfScores = []; - $rawVectorScores = []; - $rawKeywordScores = []; - - foreach ($sources as $source) { - $hits = $source['hits']; - $threshold = (float) $source['threshold']; - $boost = max(0.0, (float) $source['boost']); - $bucket = (string) $source['bucket']; - - if ($hits === [] || $boost <= 0.0) { - continue; - } + $rawScores = []; + $apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void { $rank = 0; foreach ($hits as $hit) { @@ -1013,83 +506,85 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - $raw = (float) $hit['score']; + $raw = (float)$hit['score']; if ($raw < $threshold) { continue; } - $chunkId = (string) $hit['chunk_id']; + $chunkId = (string)$hit['chunk_id']; if ($captureRaw) { - if ($bucket === 'vector') { - $rawVectorScores[$chunkId] = max($rawVectorScores[$chunkId] ?? 0.0, $raw); - } elseif ($bucket === 'keyword') { - $rawKeywordScores[$chunkId] = max($rawKeywordScores[$chunkId] ?? 0.0, $raw); - } + $rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw); } $rank++; $rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank); - $rrf *= $boost; + + if ($boost) { + $rrf *= 1.2; + } $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf; } - } + }; + + $apply($globalHits, false); + $apply($scopedHits, $boostScoped); return [ 'rrf_scores' => $rrfScores, - 'raw_vector_scores' => $rawVectorScores, - 'raw_keyword_scores' => $rawKeywordScores, + 'raw_scores' => $rawScores, ]; } /** - * @param array> ...$sourceLists - * @return array + * Builds a fallback RRF ranking purely from hit order. + * + * Used when thresholding removed all fused candidates but + * the global hit list itself still exists. */ - private function fallbackRrfFromSources(array ...$sourceLists): array + private function fallbackRrfFromHits(array $hits): array { - foreach ($sourceLists as $hits) { - $rrf = []; - $rank = 0; + $rrf = []; + $rank = 0; - foreach ($hits as $hit) { - if (!isset($hit['chunk_id'])) { - continue; - } - - $rank++; - $rrf[(string) $hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank); - - if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) { - break; - } + foreach ($hits as $hit) { + if (!isset($hit['chunk_id'])) { + continue; } - if ($rrf !== []) { - return $rrf; + $rank++; + $rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank); + + if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) { + break; } } - return []; + return $rrf; } /** + * Selects a coherent chunk window from one exact document title match. + * + * For exact product questions we prefer a pure document slice over + * cross-document fusion to avoid mixing neighbouring product families. + * * @param array> $rows * @return string[] */ private function selectExactDocumentChunkIds(array $rows, int $limit): array { uasort($rows, static function (array $a, array $b): int { - $aIndex = is_int($a['chunk_index'] ?? null) ? (int) $a['chunk_index'] : PHP_INT_MAX; - $bIndex = is_int($b['chunk_index'] ?? null) ? (int) $b['chunk_index'] : PHP_INT_MAX; + $aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX; + $bIndex = is_int($b['chunk_index'] ?? null) ? (int)$b['chunk_index'] : PHP_INT_MAX; if ($aIndex !== $bIndex) { return $aIndex <=> $bIndex; } - return strcmp((string) ($a['chunk_id'] ?? ''), (string) ($b['chunk_id'] ?? '')); + return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? '')); }); $selected = []; @@ -1097,7 +592,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface foreach ($rows as $row) { $chunkId = $row['chunk_id'] ?? null; - $text = trim((string) ($row['text'] ?? '')); + $text = trim((string)($row['text'] ?? '')); if (!is_string($chunkId) || $chunkId === '' || $text === '') { continue; @@ -1114,6 +609,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } /** + * Builds synthetic scores for exact-title fast-path selections. + * + * These scores are only used for debug output consistency. + * * @param string[] $chunkIds * @return array */ @@ -1122,12 +621,20 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $scores = []; foreach (array_values($chunkIds) as $rank => $chunkId) { - $scores[(string) $chunkId] = 1.0 / (1 + $rank); + $scores[(string)$chunkId] = 1.0 / (1 + $rank); } return $scores; } + /** + * Selection strategy for list-style queries. + * + * Goal: + * - avoid near-identical chunks + * - prefer diverse list entries + * - stop once the configured limit is reached + */ private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array { $seen = []; @@ -1138,19 +645,19 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - $chunk = trim((string) $rows[$id]['text']); + $chunk = trim((string)$rows[$id]['text']); if ($chunk === '') { continue; } - $key = md5(mb_strtolower((string) (preg_replace('/\s+/u', ' ', $chunk) ?? $chunk))); + $key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk))); if (isset($seen[$key])) { continue; } $seen[$key] = true; - $out[] = (string) $id; + $out[] = (string)$id; if (count($out) >= $limit) { break; @@ -1160,8 +667,43 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } - private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array + /** + * Selection strategy for sales-oriented queries. + * + * Modes: + * - exact_document_title: + * used when the prompt clearly contains one exact document title + * and the answer should stay strictly within that document + * + * - sales_dominant_document: + * used when one document clearly dominates the top hit window + * and coherent neighbouring chunks from that document are more + * useful than cross-document spread + * + * - sales_spread: + * default mode that spreads chunks across documents and enforces + * distance between chunk positions of the same document + */ + private function selectSalesChunkIds(string $prompt, array $chunkIds, array $rows, int $limit): array { + $focusedDocId = $this->resolveFocusedSalesDocumentId($prompt, $chunkIds, $rows); + + if ($focusedDocId !== null) { + $focusedChunkIds = $this->selectFocusedProductChunkIds( + $focusedDocId, + $chunkIds, + $rows, + $limit + ); + + if ($focusedChunkIds !== []) { + return [ + 'ids' => $focusedChunkIds, + 'mode' => 'sales_product_dominant_document', + ]; + } + } + $dominantDocId = $this->detectDominantTopDocument($chunkIds, $rows); if ($dominantDocId !== null) { @@ -1191,6 +733,265 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ]; } + + /** + * Resolves a strongly focused product document before normal sales spreading. + * + * This protects against classic false positives where neighbouring products, + * indicators or safety sheets outrank the actually requested device. + */ + private function resolveFocusedSalesDocumentId(string $prompt, array $chunkIds, array $rows): ?string + { + $promptProfile = $this->buildPromptProductProfile($prompt); + + if ($promptProfile['anchors'] === []) { + return null; + } + + $candidates = []; + $seenDocs = []; + + foreach (array_slice($chunkIds, 0, self::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) { + $row = $rows[$chunkId] ?? null; + if (!is_array($row)) { + continue; + } + + $documentId = $row['document_id'] ?? null; + if (!is_string($documentId) || $documentId === '' || isset($seenDocs[$documentId])) { + continue; + } + + $title = $this->extractDocumentTitle($row); + if ($title === '') { + continue; + } + + $seenDocs[$documentId] = true; + $score = $this->scoreFocusedProductCandidate($promptProfile, $title, $row, $rank); + + $candidates[] = [ + 'document_id' => $documentId, + 'score' => $score, + ]; + } + + if ($candidates === []) { + return null; + } + + usort($candidates, static function (array $a, array $b): int { + if ($a['score'] === $b['score']) { + return strcmp((string)$a['document_id'], (string)$b['document_id']); + } + + return $b['score'] <=> $a['score']; + }); + + $best = $candidates[0] ?? null; + if ($best === null) { + return null; + } + + $runnerUpScore = (float)($candidates[1]['score'] ?? -INF); + $bestScore = (float)$best['score']; + $gap = $bestScore - $runnerUpScore; + + if ($bestScore < self::FOCUSED_PRODUCT_MIN_SCORE || $gap < self::FOCUSED_PRODUCT_MIN_GAP) { + return null; + } + + $documentId = $best['document_id'] ?? null; + + return is_string($documentId) && $documentId !== '' ? $documentId : null; + } + + /** + * Builds a small prompt profile used for focused product dominance decisions. + * + * @return array{ + * normalized:string, + * anchors:string[], + * family_tokens:string[], + * number_tokens:string[], + * asks_reagent:bool, + * asks_document:bool, + * asks_safety:bool, + * asks_device:bool + * } + */ + private function buildPromptProductProfile(string $prompt): array + { + $normalized = $this->normalizeText($prompt); + $tokens = $this->tokenizeText($normalized); + + $reagentWords = [ + 'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb', + 'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde', + ]; + $documentWords = [ + 'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung', + 'sdb', 'sicherheitsdatenblatt', 'msds', + ]; + $safetyWords = [ + 'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung', + 'transport', 'lagerung', 'piktogramm', + ]; + $deviceWords = [ + 'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat', + 'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor', + ]; + + $asksReagent = $this->containsAnyToken($tokens, $reagentWords); + $asksDocument = $this->containsAnyToken($tokens, $documentWords); + $asksSafety = $this->containsAnyToken($tokens, $safetyWords); + $asksDevice = $this->containsAnyToken($tokens, $deviceWords) || (!$asksReagent && !$asksDocument && !$asksSafety); + + $anchors = []; + $familyTokens = []; + $numberTokens = []; + + foreach ($tokens as $token) { + if ($this->isGenericProductToken($token)) { + continue; + } + + if (preg_match('/\d/u', $token) === 1) { + $anchors[] = $token; + $numberTokens[] = $token; + $familyTokens[] = $token; + continue; + } + + if ($this->isImportantShortModelToken($token)) { + $anchors[] = $token; + $familyTokens[] = $token; + continue; + } + + if (mb_strlen($token, 'UTF-8') >= 3) { + $anchors[] = $token; + + if ($this->isFamilyDescriptorToken($token)) { + $familyTokens[] = $token; + } + } + } + + return [ + 'normalized' => $normalized, + 'anchors' => array_values(array_unique($anchors)), + 'family_tokens' => array_values(array_unique($familyTokens)), + 'number_tokens' => array_values(array_unique($numberTokens)), + 'asks_reagent' => $asksReagent, + 'asks_document' => $asksDocument, + 'asks_safety' => $asksSafety, + 'asks_device' => $asksDevice, + ]; + } + + /** + * Scores one candidate document for focused product selection. + */ + private function scoreFocusedProductCandidate(array $promptProfile, string $title, array $row, int $rank): float + { + $titleNormalized = $this->normalizeText($title); + $titleTokens = $this->tokenizeText($titleNormalized); + $titleTokenMap = array_fill_keys($titleTokens, true); + $textNormalized = $this->normalizeText((string)($row['text'] ?? '')); + + $score = max(0.0, 5.0 - $rank); + + if ($titleNormalized !== '' && str_contains(' ' . $promptProfile['normalized'] . ' ', ' ' . $titleNormalized . ' ')) { + $score += 24.0; + } + + $matchedAnchors = 0; + foreach ($promptProfile['anchors'] as $anchor) { + if (isset($titleTokenMap[$anchor])) { + $matchedAnchors++; + $score += $this->isImportantShortModelToken($anchor) ? 4.0 : 3.5; + continue; + } + + if (str_contains(' ' . $titleNormalized . ' ', ' ' . $anchor . ' ')) { + $matchedAnchors++; + $score += 3.0; + continue; + } + + $score -= $this->isFamilyDescriptorToken($anchor) ? 3.5 : 2.0; + } + + foreach ($promptProfile['number_tokens'] as $numberToken) { + if (isset($titleTokenMap[$numberToken])) { + $score += 4.0; + } else { + $score -= 5.0; + } + } + + foreach ($promptProfile['family_tokens'] as $familyToken) { + if (isset($titleTokenMap[$familyToken])) { + $score += 4.0; + } else { + $score -= 4.5; + } + } + + if ($promptProfile['asks_device']) { + if ($this->looksLikeReagentOrAccessoryDocument($row, $titleNormalized, $textNormalized)) { + $score -= 12.0; + } + + if ($this->looksLikeSafetyDocument($row, $titleNormalized, $textNormalized)) { + $score -= 8.0; + } + } + + if ($promptProfile['asks_reagent'] && $this->looksLikeReagentOrAccessoryDocument($row, $titleNormalized, $textNormalized)) { + $score += 6.0; + } + + if (($promptProfile['asks_document'] || $promptProfile['asks_safety']) && $this->looksLikeSafetyDocument($row, $titleNormalized, $textNormalized)) { + $score += 4.0; + } + + if ($matchedAnchors === 0) { + $score -= 10.0; + } + + return $score; + } + + /** + * Selects only the focused product document chunks. + * + * In this strict mode we intentionally do not fill remaining slots with + * neighbouring products, because that would reintroduce the original bug. + */ + private function selectFocusedProductChunkIds( + string $documentId, + array $chunkIds, + array $rows, + int $limit + ): array + { + return $this->selectDominantDocumentChunkIds( + $documentId, + $chunkIds, + $rows, + min($limit, self::FOCUSED_PRODUCT_MAX_CHUNKS) + ); + } + + /** + * Detects whether one document clearly dominates the first ranked window. + * + * This is especially useful for product-sheet style documents where + * several adjacent chunks belong together and should be passed to the model + * as one coherent factual block. + */ private function detectDominantTopDocument(array $chunkIds, array $rows): ?string { $docWindow = []; @@ -1200,7 +1001,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - $text = trim((string) $rows[$chunkId]['text']); + $text = trim((string)$rows[$chunkId]['text']); $docId = $rows[$chunkId]['document_id'] ?? null; if ($text === '' || !is_string($docId) || $docId === '') { @@ -1223,7 +1024,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return null; } - $dominantCount = (int) ($counts[$dominantDocId] ?? 0); + $dominantCount = (int)($counts[$dominantDocId] ?? 0); if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) { return $dominantDocId; @@ -1239,12 +1040,21 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return null; } + /** + * Selects a coherent chunk window from the dominant document. + * + * Strategy: + * - use the highest-ranked chunk of that document as anchor + * - prefer neighbouring chunk indices around that anchor + * - sort the final selection by chunk index for prompt coherence + */ private function selectDominantDocumentChunkIds( string $documentId, - array $chunkIds, - array $rows, - int $limit - ): array { + array $chunkIds, + array $rows, + int $limit + ): array + { $docHits = []; $anchorChunkIndex = null; @@ -1253,7 +1063,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - $text = trim((string) $rows[$chunkId]['text']); + $text = trim((string)$rows[$chunkId]['text']); $docId = $rows[$chunkId]['document_id'] ?? null; if ($text === '' || $docId !== $documentId) { @@ -1268,7 +1078,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } $docHits[] = [ - 'id' => (string) $chunkId, + 'id' => (string)$chunkId, 'rank' => $rank, 'chunk_index' => $chunkIndex, ]; @@ -1326,12 +1136,19 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ); } + /** + * Fills the remaining sales slots after a dominant document selection. + * + * The already selected dominant-document chunks stay fixed. + * Remaining slots are filled with the normal spread strategy. + */ private function fillRemainingSalesChunkIds( array $seedChunkIds, array $chunkIds, array $rows, - int $limit - ): array { + int $limit + ): array + { $out = array_values(array_unique(array_map('strval', $seedChunkIds))); if (count($out) >= $limit) { @@ -1383,12 +1200,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } } - $text = trim((string) $rows[$chunkId]['text']); + $text = trim((string)$rows[$chunkId]['text']); if ($text === '') { continue; } - $out[] = (string) $chunkId; + $out[] = (string)$chunkId; $selected[$chunkId] = true; $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1; @@ -1404,6 +1221,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } + /** + * Default spread selection for sales-oriented queries. + * + * Goal: + * - avoid overloading the result with chunks from the same document + * - avoid chunks that are too close to each other in the same document + * - preserve top-ranked relevance while improving contextual spread + */ private function selectSalesChunkIdsSpread(array $chunkIds, array $rows, int $limit): array { $out = []; @@ -1436,12 +1261,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $docChunkPositions[$docId][] = $chunkIndex; } - $text = trim((string) $rows[$chunkId]['text']); + $text = trim((string)$rows[$chunkId]['text']); if ($text === '') { continue; } - $out[] = (string) $chunkId; + $out[] = (string)$chunkId; $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1; if (count($out) >= $limit) { @@ -1452,6 +1277,177 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } + + /** + * Extracts the document title from metadata or from the first product-title heading. + */ + private function extractDocumentTitle(array $row): string + { + $metadataTitle = $row['metadata']['document_title'] ?? null; + + if (is_string($metadataTitle) && trim($metadataTitle) !== '') { + return trim($metadataTitle); + } + + $text = (string)($row['text'] ?? ''); + + if ( + $text !== '' && + preg_match('/^#\s*Produkt\s+Titel:\s*`?([^`\n]+)`?/imu', $text, $matches) === 1 + ) { + return trim((string)($matches[1] ?? '')); + } + + return ''; + } + + /** + * Normalizes text for token-safe product comparisons. + */ + private function normalizeText(string $value): string + { + $value = mb_strtolower(trim($value), 'UTF-8'); + $value = str_replace(['-', '/', '_'], ' ', $value); + $value = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $value) ?? $value; + $value = preg_replace('/\s+/u', ' ', $value) ?? $value; + + return trim($value); + } + + /** + * Tokenizes normalized text. + * + * @return string[] + */ + private function tokenizeText(string $value): array + { + if ($value === '') { + return []; + } + + return preg_split('/\s+/u', $value, -1, PREG_SPLIT_NO_EMPTY) ?: []; + } + + /** + * Returns true when at least one token from the haystack matches the given words. + */ + private function containsAnyToken(array $tokens, array $needles): bool + { + if ($tokens === [] || $needles === []) { + return false; + } + + $tokenMap = array_fill_keys($tokens, true); + + foreach ($needles as $needle) { + if (isset($tokenMap[$needle])) { + return true; + } + } + + return false; + } + + /** + * Generic product words must not drive product dominance decisions. + */ + private function isGenericProductToken(string $token): bool + { + static $generic = [ + 'der', 'die', 'das', 'ein', 'eine', 'einen', 'einem', 'und', 'oder', 'mit', + 'fuer', 'für', 'von', 'im', 'in', 'am', 'an', 'auf', 'zu', 'zum', 'zur', + 'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät', + 'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte', + 'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung', + 'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend', + 'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher', + 'welches', 'brauche', 'suche', 'bitte', 'fuer', 'gegen', 'und', 'oder', + ]; + + return isset(array_fill_keys($generic, true)[$token]); + } + + /** + * Short technical model codes like TH or TC are allowed as anchors. + */ + private function isImportantShortModelToken(string $token): bool + { + static $allowed = ['th', 'tc', 'tp', 'tm', 'ph', 'rx']; + + return in_array($token, $allowed, true); + } + + /** + * Family descriptors are strong product differentiators. + */ + private function isFamilyDescriptorToken(string $token): bool + { + static $familyDescriptors = [ + 'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab', + 'inline', 'compact', 'panel', 'sc', + ]; + + return in_array($token, $familyDescriptors, true) + || $this->isImportantShortModelToken($token) + || preg_match('/\d/u', $token) === 1; + } + + /** + * Heuristic classifier for indicator, reagent, accessory and spare-part documents. + */ + private function looksLikeReagentOrAccessoryDocument(array $row, string $titleNormalized, string $textNormalized): bool + { + $haystack = trim($titleNormalized . ' ' . $textNormalized); + + if ($haystack === '') { + return false; + } + + $needles = [ + 'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie', + 'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche', + 'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz', + 'kerzenfilter', 'druckregler', + ]; + + foreach ($needles as $needle) { + if (str_contains($haystack, $needle)) { + return true; + } + } + + return false; + } + + /** + * Heuristic classifier for safety-style documents. + */ + private function looksLikeSafetyDocument(array $row, string $titleNormalized, string $textNormalized): bool + { + $haystack = trim($titleNormalized . ' ' . $textNormalized); + + if ($haystack === '') { + return false; + } + + $needles = [ + 'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung', + 'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp', + 'kennzeichnung', 'h290', 'pbt', 'vpvb', + ]; + + foreach ($needles as $needle) { + if (str_contains($haystack, $needle)) { + return true; + } + } + + return false; + } + + /** + * Converts selected chunk ids into the final plain text result list. + */ private function collectTextsFromIds(array $chunkIds, array $rows): array { $out = []; @@ -1461,7 +1457,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - $text = trim((string) $rows[$id]['text']); + $text = trim((string)$rows[$id]['text']); if ($text !== '') { $out[] = $text; @@ -1470,233 +1466,4 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } - - /** - * Applies a conservative document-level re-rank based on title / metadata matching. - * - * This is intentionally executed after source fusion. It should sharpen ranking - * for clearly matching documents, but never replace the underlying retrieval logic. - * - * @param array $rrfScores - * @param array> $rows - * @return array{0: array, 1: array, 2: array} - */ - private function applyTitleMetadataBoosts(array $rrfScores, array $rows, string $lexicalQuery): array - { - $normalizedQuery = $this->normalizeForMatching($lexicalQuery); - $queryTokens = $this->tokenizeNormalizedQuery($normalizedQuery); - - if ($normalizedQuery === '' || $queryTokens === [] || $rrfScores === [] || $rows === []) { - return [$rrfScores, [], []]; - } - - $documentBoosts = []; - - foreach ($rows as $row) { - $documentId = $row['document_id'] ?? null; - - if (!is_string($documentId) || $documentId === '' || isset($documentBoosts[$documentId])) { - continue; - } - - $documentBoosts[$documentId] = $this->computeDocumentMetadataBoost( - $row, - $normalizedQuery, - $queryTokens - ); - } - - if ($documentBoosts === []) { - return [$rrfScores, [], []]; - } - - $chunkBoosts = []; - - foreach ($rrfScores as $chunkId => $score) { - $row = $rows[$chunkId] ?? null; - - if (!is_array($row)) { - continue; - } - - $documentId = $row['document_id'] ?? null; - - if (!is_string($documentId) || $documentId === '') { - continue; - } - - $boost = $documentBoosts[$documentId] ?? 0.0; - - if ($boost <= 0.0) { - continue; - } - - $rrfScores[$chunkId] = $score * (1.0 + $boost); - $chunkBoosts[$chunkId] = $boost; - } - - return [$rrfScores, $chunkBoosts, $documentBoosts]; - } - - /** - * @param array $row - * @param string[] $queryTokens - */ - private function computeDocumentMetadataBoost(array $row, string $normalizedQuery, array $queryTokens): float - { - $documentTitle = $this->normalizeForMatching($this->extractMetadataString($row, [ - 'document_title', - 'title', - ])); - - $fileName = $this->normalizeForMatching($this->extractMetadataString($row, [ - 'file_name', - 'filename', - 'original_filename', - 'source_name', - 'document_name', - ])); - - $metaText = $this->normalizeForMatching($this->extractMetadataString($row, [ - 'source_path', - 'path', - 'heading', - 'section_title', - 'category', - ])); - - $boost = 0.0; - - $titleCoverage = $this->computeNormalizedTokenCoverage($queryTokens, $documentTitle); - if ($titleCoverage > 0.0) { - $boost += min( - self::TITLE_MATCH_MAX_BOOST, - self::TITLE_MATCH_BASE_BOOST + ($titleCoverage * self::TITLE_MATCH_MAX_BOOST) - ); - } - - $fileCoverage = $this->computeNormalizedTokenCoverage($queryTokens, $fileName); - if ($fileCoverage > 0.0) { - $boost += min( - self::FILE_MATCH_MAX_BOOST, - self::FILE_MATCH_BASE_BOOST + ($fileCoverage * self::FILE_MATCH_MAX_BOOST) - ); - } - - $metaCoverage = $this->computeNormalizedTokenCoverage($queryTokens, $metaText); - if ($metaCoverage > 0.0) { - $boost += min( - self::META_MATCH_MAX_BOOST, - $metaCoverage * self::META_MATCH_MAX_BOOST - ); - } - - if (str_contains($normalizedQuery, ' ')) { - if ($documentTitle !== '' && str_contains(' ' . $documentTitle . ' ', ' ' . $normalizedQuery . ' ')) { - $boost += self::EXACT_TITLE_PHRASE_BOOST; - } - - if ($fileName !== '' && str_contains(' ' . $fileName . ' ', ' ' . $normalizedQuery . ' ')) { - $boost += self::EXACT_FILE_PHRASE_BOOST; - } - } - - return min(self::MAX_TITLE_METADATA_BOOST, $boost); - } - - /** - * @param array $row - * @param string[] $preferredKeys - */ - private function extractMetadataString(array $row, array $preferredKeys): string - { - foreach ($preferredKeys as $key) { - $topLevel = $row[$key] ?? null; - if (is_string($topLevel) && trim($topLevel) !== '') { - return trim($topLevel); - } - - $metadata = $row['metadata'] ?? null; - if (is_array($metadata)) { - $value = $metadata[$key] ?? null; - if (is_string($value) && trim($value) !== '') { - return trim($value); - } - } - } - - return ''; - } - - /** - * @param string[] $queryTokens - */ - private function computeNormalizedTokenCoverage(array $queryTokens, string $normalizedHaystack): float - { - if ($queryTokens === [] || $normalizedHaystack === '') { - return 0.0; - } - - $matched = 0; - - foreach ($queryTokens as $token) { - if ($token === '') { - continue; - } - - if (str_contains(' ' . $normalizedHaystack . ' ', ' ' . $token . ' ')) { - $matched++; - } - } - - if ($matched < 1) { - return 0.0; - } - - return $matched / max(1, count($queryTokens)); - } - - /** - * @return string[] - */ - private function tokenizeNormalizedQuery(string $normalizedQuery): array - { - if ($normalizedQuery === '') { - return []; - } - - $tokens = preg_split('/\s+/u', $normalizedQuery, -1, PREG_SPLIT_NO_EMPTY) ?: []; - $tokens = array_values(array_unique(array_filter( - $tokens, - static fn (string $token): bool => mb_strlen($token, 'UTF-8') >= 2 - ))); - - return $tokens; - } - - private function normalizeForMatching(string $value): string - { - $value = mb_strtolower(trim($value), 'UTF-8'); - $value = preg_replace('/[^\p{L}\p{N}]+/u', ' ', $value) ?? $value; - $value = preg_replace('/\s+/u', ' ', $value) ?? $value; - - return trim($value); - } - - private function maxNullableFloat(?float $a, ?float $b): ?float - { - if ($a === null && $b === null) { - return null; - } - - if ($a === null) { - return $b; - } - - if ($b === null) { - return $a; - } - - return max($a, $b); - } } \ No newline at end of file