From b800c1fc8f00866a9940b17337adbb7f947644e9 Mon Sep 17 00:00:00 2001 From: team 1 Date: Fri, 24 Apr 2026 11:22:25 +0200 Subject: [PATCH] fix retrieve 1 --- src/Config/NdjsonHybridRetrieverConfig.php | 12 +++ .../Retrieval/NdjsonHybridRetriever.php | 99 ++++++++++++++++--- 2 files changed, 100 insertions(+), 11 deletions(-) diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index dbab01b..e1f092a 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -72,6 +72,18 @@ final class NdjsonHybridRetrieverConfig */ public const RRF_K = 50; + /** + * Keyword retrieval is fused with vector retrieval as a factual safety net. It protects exact values, ranges, + * thresholds, model codes and domain terms that semantic retrieval can miss or rank too low. The constants below + * cap and scale the keyword topK, set the keyword score threshold, and weight each source's RRF contribution. + */ + public const HARD_MAX_KEYWORDK = 36; + public const KEYWORD_TOPK_MULTIPLIER = 2.0; + public const KEYWORD_SCORE_THRESHOLD = 0.35; + public const KEYWORD_RRF_WEIGHT = 1.15; + public const SCOPED_VECTOR_RRF_WEIGHT = 1.20; + public const SCOPED_KEYWORD_RRF_WEIGHT = 1.30; + /** * Fallback size when thresholded fusion yields no candidates. 
* diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 175c68d..4b0e4a6 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -34,6 +34,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface public function __construct( private NdjsonChunkLookup $lookup, + private NdjsonKeywordRetriever $keywordRetriever, private VectorSearchClient $vectorClient, private TagRoutingService $tagRouting, private ModelGenerationConfigRepository $configRepository, @@ -178,6 +179,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $salesIntent = $this->detectSalesIntent($prompt); $route = $this->routeResolver->resolve($salesIntent, $entityLabel); + if ( + $route === IntentRouteResolver::ROUTE_CATALOG_LIST + && !$this->shouldUseCatalogListShortcut($prompt, $salesIntent) + ) { + $route = IntentRouteResolver::ROUTE_NORMAL; + } + if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) { $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel); @@ -336,13 +344,23 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface : []; $globalHits = $this->vectorClient->search($cleanQuery, $topK); + $keywordHits = $this->keywordRetriever->search( + $cleanQuery, + $this->computeKeywordTopK($topK) + ); $scopedHits = []; + $scopedKeywordHits = []; if ($candidateDocIds !== []) { $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds); + $scopedKeywordHits = $this->keywordRetriever->search( + $cleanQuery, + $this->computeKeywordTopK($topK), + $candidateDocIds + ); } - if ($globalHits === [] && $scopedHits === []) { + if ($globalHits === [] && $scopedHits === [] && $keywordHits === [] && $scopedKeywordHits === []) { return [ 'limit' => $limit, 'is_list_query' => $isListQuery, @@ -357,8 +375,11 @@ final readonly class NdjsonHybridRetriever 
implements RetrieverInterface $fused = $this->fuseHits( $globalHits, $scopedHits, + $keywordHits, + $scopedKeywordHits, $threshold, $scopedHits !== [], + $scopedKeywordHits !== [], $withScores ); @@ -429,6 +450,61 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY); } + /** + * Decides whether the catalog-list shortcut may be used: only for DISCOVERY intent, and only when the prompt is + * a list query (per isListQuery) or its normalized text matches one of the German list/catalog patterns below. + * Factual questions such as "what is the lowest threshold" must continue through normal retrieval, otherwise + * the system can return a product list instead of the requested value. + */ + private function shouldUseCatalogListShortcut(string $prompt, string $salesIntent): bool + { + if ($salesIntent !== SalesIntentLite::DISCOVERY) { + return false; + } + + if ($this->intentLite->isListQuery($prompt)) { + return true; + } + + $normalized = $this->normalizeText($prompt); + + if ($normalized === '') { + return false; + } + + $patterns = [ + '/\balle\b/u', + '/\bliste\b/u', + '/\bauflistung\b/u', + '/\buebersicht\b/u', + '/\bübersicht\b/u', + '/\bsortiment\b/u', + '/\bwelche\b.*\b(gibt|verfügbar|verfuegbar|existieren)\b/u', + '/\bzeige\b.*\b(produkte|geraete|geräte|modelle|artikel)\b/u', + '/\bwas\b.*\b(gibt es|verfügbar|verfuegbar)\b/u', + ]; + + foreach ($patterns as $pattern) { + if (preg_match($pattern, $normalized) === 1) { + return true; + } + } + + return false; + } + + /** + * Derives the keyword topK from the vector topK: multiplied by KEYWORD_TOPK_MULTIPLIER, rounded up, and + * clamped to the range 1..HARD_MAX_KEYWORDK. Keyword retrieval is cheap and looks slightly wider than vector + * retrieval because it acts as a factual safety net for numbers, ranges, thresholds and exact technical terms. + */ + private function computeKeywordTopK(int $vectorTopK): int + { + $topK = (int) ceil($vectorTopK * NdjsonHybridRetrieverConfig::KEYWORD_TOPK_MULTIPLIER); + + return max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_KEYWORDK)); + } + /** * Computes retrieval threshold and vector topK. 
* @@ -478,15 +554,18 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface private function fuseHits( array $globalHits, array $scopedHits, - float $threshold, - bool $boostScoped, + array $keywordHits, + array $scopedKeywordHits, + float $vectorThreshold, + bool $boostScopedVector, + bool $boostScopedKeyword, bool $captureRaw ): array { $rrfScores = []; $rawScores = []; - $apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void { + $apply = function (array $hits, float $threshold, float $weight) use (&$rrfScores, &$rawScores, $captureRaw): void { $rank = 0; foreach ($hits as $hit) { @@ -507,18 +586,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } $rank++; - $rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank); - - if ($boost) { - $rrf *= 1.2; - } + $rrf = (1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank)) * $weight; $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf; } }; - $apply($globalHits, false); - $apply($scopedHits, $boostScoped); + $apply($globalHits, $vectorThreshold, 1.0); + $apply($scopedHits, $vectorThreshold, $boostScopedVector ? NdjsonHybridRetrieverConfig::SCOPED_VECTOR_RRF_WEIGHT : 1.0); + $apply($keywordHits, NdjsonHybridRetrieverConfig::KEYWORD_SCORE_THRESHOLD, NdjsonHybridRetrieverConfig::KEYWORD_RRF_WEIGHT); + $apply($scopedKeywordHits, NdjsonHybridRetrieverConfig::KEYWORD_SCORE_THRESHOLD, $boostScopedKeyword ? NdjsonHybridRetrieverConfig::SCOPED_KEYWORD_RRF_WEIGHT : NdjsonHybridRetrieverConfig::KEYWORD_RRF_WEIGHT); return [ 'rrf_scores' => $rrfScores,