harden retrieve logic

2026-04-17 14:52:53 +02:00
parent ae2b52ad18
commit 5c9d81adeb
4 changed files with 838 additions and 141 deletions
--- a/src/Knowledge/Retrieval/NdjsonChunkLookup.php
+++ b/src/Knowledge/Retrieval/NdjsonChunkLookup.php
@@ -1,6 +1,5 @@
 <?php

-
 declare(strict_types=1);

 namespace App\Knowledge\Retrieval;
@@ -26,17 +25,180 @@ final readonly class NdjsonChunkLookup

        foreach ($this->chunkManager->streamAll() as $row) {
            $id = $row['chunk_id'] ?? null;
+
            if (!is_string($id) || !isset($wanted[$id])) {
                continue;
            }

            $found[$id] = $row;

-            if (\count($found) === \count($wanted)) {
+            if (count($found) === count($wanted)) {
                break;
            }
        }

        return $found;
    }
-}
+
+    /**
+     * Collects every chunk row that belongs to the given document.
+     *
+     * The chunk stream is scanned once from start to end. Rows of other
+     * documents and rows without a usable (non-empty string) chunk_id are
+     * ignored. When a chunk_id occurs more than once, the row streamed last
+     * is the one that is kept.
+     *
+     * @param string $documentId Value a row's document_id must be identical to.
+     *
+     * @return array<string,array<string,mixed>> Matching rows keyed by chunk_id.
+     */
+    public function findByDocumentId(string $documentId): array
+    {
+        $matches = [];
+
+        foreach ($this->chunkManager->streamAll() as $candidate) {
+            if (($candidate['document_id'] ?? null) !== $documentId) {
+                continue;
+            }
+
+            $chunkId = $candidate['chunk_id'] ?? null;
+
+            if (is_string($chunkId) && $chunkId !== '') {
+                $matches[$chunkId] = $candidate;
+            }
+        }
+
+        return $matches;
+    }
+
+    /**
+     * Resolves the best exact document title match from the user prompt.
+     *
+     * Matching rules:
+     * - the normalized prompt must contain the full normalized document title
+     * - titles containing digits are preferred, e.g. "Testomat 808"
+     * - longer exact titles win over shorter generic titles
+     * - on equal score the document whose title was seen first in the stream wins
+     *
+     * Every valid row is kept for its document, including rows streamed before
+     * the row that revealed the document title, so the returned row set does
+     * not depend on where the title-bearing chunk sits in the stream.
+     *
+     * Note: all valid rows of the stream are buffered in memory until the best
+     * document is known.
+     *
+     * @param string $prompt Raw user prompt; it is normalized internally.
+     *
+     * @return array{
+     *     document_id:string,
+     *     document_title:string,
+     *     rows:array<string,array<string,mixed>>
+     * }|null Null when the prompt is empty after normalization or no title matches confidently.
+     */
+    public function findBestExactDocumentByPrompt(string $prompt): ?array
+    {
+        $normalizedPrompt = $this->normalizeText($prompt);
+
+        if ($normalizedPrompt === '') {
+            return null;
+        }
+
+        $rowsByDocument = [];
+        $titlesByDocument = [];
+
+        foreach ($this->chunkManager->streamAll() as $row) {
+            $documentId = $row['document_id'] ?? null;
+            $chunkId = $row['chunk_id'] ?? null;
+
+            if (!is_string($documentId) || $documentId === '' || !is_string($chunkId) || $chunkId === '') {
+                continue;
+            }
+
+            // Buffer the row before looking at the title: a document's title may
+            // only become known on a later row, and earlier rows must not be lost.
+            $rowsByDocument[$documentId][$chunkId] = $row;
+
+            if (isset($titlesByDocument[$documentId])) {
+                continue;
+            }
+
+            $documentTitle = $this->extractDocumentTitle($row);
+
+            if ($documentTitle !== '') {
+                $titlesByDocument[$documentId] = $documentTitle;
+            }
+        }
+
+        $bestDocumentId = null;
+        $bestScore = 0;
+
+        foreach ($titlesByDocument as $documentId => $documentTitle) {
+            $normalizedTitle = $this->normalizeText($documentTitle);
+
+            if (!$this->isConfidentTitleMatch($normalizedPrompt, $normalizedTitle)) {
+                continue;
+            }
+
+            // Longer titles score higher; a digit in the title outweighs any length difference.
+            $score = mb_strlen($normalizedTitle, 'UTF-8');
+
+            if (preg_match('/\d/u', $normalizedTitle) === 1) {
+                $score += 1000;
+            }
+
+            if ($bestDocumentId === null || $score > $bestScore) {
+                // PHP stores decimal-integer-like string keys ("123") as int keys;
+                // cast back so the returned document_id is always a string.
+                $bestDocumentId = (string) $documentId;
+                $bestScore = $score;
+            }
+        }
+
+        if ($bestDocumentId === null) {
+            return null;
+        }
+
+        return [
+            'document_id' => $bestDocumentId,
+            'document_title' => $titlesByDocument[$bestDocumentId],
+            'rows' => $rowsByDocument[$bestDocumentId],
+        ];
+    }
+
+    /**
+     * Derives the document title of a chunk row.
+     *
+     * The value in metadata.document_title takes precedence. When it is missing
+     * or blank, the chunk text is searched for a "# Produkt Titel: ..." heading
+     * line whose value may be wrapped in backticks.
+     *
+     * @param array<string,mixed> $row
+     *
+     * @return string Trimmed title, or an empty string when none can be derived.
+     */
+    private function extractDocumentTitle(array $row): string
+    {
+        $fromMetadata = $row['metadata']['document_title'] ?? null;
+
+        if (is_string($fromMetadata)) {
+            $fromMetadata = trim($fromMetadata);
+
+            if ($fromMetadata !== '') {
+                return $fromMetadata;
+            }
+        }
+
+        $body = (string) ($row['text'] ?? '');
+
+        if ($body === '') {
+            return '';
+        }
+
+        $headingFound = preg_match('/^#\s*Produkt\s+Titel:\s*`?([^`\n]+)`?/imu', $body, $heading) === 1;
+
+        return $headingFound ? trim((string) ($heading[1] ?? '')) : '';
+    }
+
+    /**
+     * Decides whether a normalized title occurs in the normalized prompt
+     * specifically enough to be trusted.
+     *
+     * The title must appear in the prompt as a whole-word phrase. In addition
+     * it must either contain a digit or consist of at least two significant
+     * tokens, so that a short generic one-word title never matches.
+     *
+     * @param string $normalizedPrompt Prompt as produced by normalizeText().
+     * @param string $normalizedTitle  Title as produced by normalizeText().
+     */
+    private function isConfidentTitleMatch(string $normalizedPrompt, string $normalizedTitle): bool
+    {
+        if ($normalizedPrompt === '' || $normalizedTitle === '') {
+            return false;
+        }
+
+        // Padding both sides with a space turns the substring test into a whole-word phrase test.
+        if (!str_contains(" {$normalizedPrompt} ", " {$normalizedTitle} ")) {
+            return false;
+        }
+
+        if (preg_match('/\d/u', $normalizedTitle) === 1) {
+            return true;
+        }
+
+        // From here on the title has no digit, so a token is significant by length alone.
+        $significantCount = 0;
+
+        foreach (preg_split('/\s+/u', $normalizedTitle, -1, PREG_SPLIT_NO_EMPTY) ?: [] as $token) {
+            if (mb_strlen($token, 'UTF-8') >= 3) {
+                $significantCount++;
+            }
+        }
+
+        return $significantCount >= 2;
+    }
+
+    /**
+     * Canonicalizes text for title comparison.
+     *
+     * The value is trimmed and lower-cased; "-", "/", "_" and every other run of
+     * characters that is neither letter, digit nor whitespace becomes a space;
+     * whitespace runs collapse to a single space; the result is trimmed again.
+     */
+    private function normalizeText(string $value): string
+    {
+        $normalized = str_replace(['-', '/', '_'], ' ', mb_strtolower(trim($value), 'UTF-8'));
+
+        foreach (['/[^\p{L}\p{N}\s]+/u', '/\s+/u'] as $pattern) {
+            // preg_replace() yields null on a PCRE failure; keep the previous value in that case.
+            $normalized = preg_replace($pattern, ' ', $normalized) ?? $normalized;
+        }
+
+        return trim($normalized);
+    }
+}
--- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
+++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
@@ -23,6 +23,7 @@ use RuntimeException;
 * Main responsibilities:
 * - detect high-level request intent
 * - optionally short-circuit to catalog list output
+ * - resolve exact document-title matches before semantic retrieval
 * - run vector retrieval globally and optionally document-scoped
 * - fuse both result sets with RRF-style scoring
 * - apply selection rules for list queries vs. sales-style queries
@@ -30,6 +31,15 @@ use RuntimeException;
 */
 final readonly class NdjsonHybridRetriever implements RetrieverInterface
 {
+    /**
+     * When one document clearly dominates the top-ranked window,
+     * temporarily switch from "spread" mode to "dominant document" mode.
+     *
+     * Size of that window: only this many of the best-ranked chunk ids are
+     * inspected when looking for a dominant document.
+     */
+    private const DOMINANT_DOC_WINDOW = 6;
+
+    /**
+     * Number of hits inside the window from which a document counts as
+     * dominant regardless of where those hits are ranked.
+     */
+    private const DOMINANT_DOC_MIN_HITS = 3;
+
+    /** Upper bound for the chunks taken from the dominant document itself. */
+    private const DOMINANT_DOC_MAX_CHUNKS = 4;
+
+    /** Upper bound for the chunks returned by the exact-document-title selection. */
+    private const EXACT_DOCUMENT_MAX_CHUNKS = 6;
+
    public function __construct(
        private NdjsonChunkLookup               $lookup,
        private VectorSearchClient              $vectorClient,
@@ -58,6 +68,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
     * - executes the full orchestration pipeline
     * - if the route resolves to a catalog list, returns the catalog block only
     * - otherwise returns the selected chunk texts
+     *
     * @throws Exception
     */
    public function retrieve(string $prompt): array
@@ -65,12 +76,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        $config = $this->requireConfig();
        $result = $this->execute($prompt, $config, false);

-        // Catalog list responses bypass normal chunk retrieval completely.
        if ($result['catalogBlock'] !== null) {
            return [$result['catalogBlock']];
        }

-        // No selected chunks means no usable retrieval result.
        if ($result['selectedChunkIds'] === []) {
            return [];
        }
@@ -90,6 +99,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
     * - fused RRF scores
     * - intent / route information
     * - threshold and list-query flags
+     *
     * @throws Exception
     */
    public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
@@ -97,12 +107,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        $config = $config ?? $this->requireConfig();
        $result = $this->execute($prompt, $config, true);

-        // For catalog list routes we expose a synthetic debug row.
        if ($result['catalogBlock'] !== null) {
            return [[
                'rank' => 1,
                'chunk_id' => '__CATALOG_LIST__',
                'document_id' => null,
+                'chunk_index' => null,
                'raw_score' => null,
                'rrf_score' => null,
                'threshold' => 0.0,
@@ -110,6 +120,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
                'route' => $result['route'],
                'entity_label' => $result['entityLabel'],
                'is_list_query' => true,
+                'selection_mode' => 'catalog_list',
                'text' => $result['catalogBlock'],
            ]];
        }
@@ -122,8 +133,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        $rank = 0;

        foreach ($result['selectedChunkIds'] as $chunkId) {
-
-            // Skip ids that could not be resolved to real chunk rows.
            if (!isset($result['rows'][$chunkId])) {
                continue;
            }
@@ -134,6 +143,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
                'rank' => $rank,
                'chunk_id' => $chunkId,
                'document_id' => $result['rows'][$chunkId]['document_id'] ?? null,
+                'chunk_index' => $result['rows'][$chunkId]['chunk_index'] ?? null,
                'raw_score' => $result['rawScores'][$chunkId] ?? null,
                'rrf_score' => $result['rrfScores'][$chunkId] ?? null,
                'threshold' => $result['threshold'],
@@ -141,6 +151,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
                'route' => $result['route'],
                'entity_label' => $result['entityLabel'],
                'is_list_query' => $result['isListQuery'],
+                'selection_mode' => $result['selectionMode'],
                'text' => trim((string)$result['rows'][$chunkId]['text']),
            ];
        }
@@ -159,8 +170,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
     * 1. Detect catalog entity and sales intent
     * 2. Resolve route
     * 3. If route is a catalog list route, try direct catalog output
-     * 4. Otherwise, run the normal hybrid retrieval core
-     * 5. Select final chunk ids depending on query type
+     * 4. If prompt matches one exact document title, use exact-document fast path
+     * 5. Otherwise, run the normal hybrid retrieval core
+     * 6. Select final chunk ids depending on query type
+     *
     * @throws Exception
     */
    private function execute(
@@ -169,16 +182,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        bool                  $withScores
    ): array
    {
-
        $entityLabel = $this->catalogIntent->detect($prompt);
        $salesIntent = $this->detectSalesIntent($prompt);
        $route = $this->routeResolver->resolve($salesIntent, $entityLabel);

-        // Fast path:
-        // If the route explicitly asks for a catalog list and we have an entity label,
-        // we return a prebuilt catalog block instead of semantic chunk retrieval.
        if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
-
            $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);

            if ($catalogBlock !== null) {
@@ -187,6 +195,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
                    'entityLabel' => $entityLabel,
                    'intent' => $salesIntent,
                    'isListQuery' => true,
+                    'selectionMode' => 'catalog_list',
                    'selectedChunkIds' => [],
                    'rows' => [],
                    'rrfScores' => [],
@@ -197,15 +206,40 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
            }
        }

+        $exactDocumentMatch = $this->lookup->findBestExactDocumentByPrompt($prompt);
+
+        if ($exactDocumentMatch !== null) {
+            $selectedChunkIds = $this->selectExactDocumentChunkIds(
+                $exactDocumentMatch['rows'],
+                max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS))
+            );
+
+            if ($selectedChunkIds !== []) {
+                return [
+                    'route' => $route,
+                    'entityLabel' => $entityLabel,
+                    'intent' => $salesIntent,
+                    'isListQuery' => false,
+                    'selectionMode' => 'exact_document_title',
+                    'selectedChunkIds' => $selectedChunkIds,
+                    'rows' => $exactDocumentMatch['rows'],
+                    'rrfScores' => $this->buildExactDocumentScores($selectedChunkIds),
+                    'rawScores' => [],
+                    'threshold' => 1.0,
+                    'catalogBlock' => null,
+                ];
+            }
+        }
+
        $core = $this->runCore($prompt, $config, $withScores, $salesIntent);

-        // No ranked chunks or no resolved rows means retrieval produced nothing usable.
        if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
            return [
                'route' => $route,
                'entityLabel' => $entityLabel,
                'intent' => $salesIntent,
                'isListQuery' => $core['is_list_query'],
+                'selectionMode' => null,
                'selectedChunkIds' => [],
                'rows' => [],
                'rrfScores' => [],
@@ -215,18 +249,30 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
            ];
        }

-        // Selection strategy depends on query type:
-        // - list queries prefer deduplicated chunks
-        // - sales queries prefer spread across docs / chunk distance
-        $selectedChunkIds = $core['is_list_query']
-            ? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
-            : $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
+        if ($core['is_list_query']) {
+            $selectedChunkIds = $this->selectListChunkIds(
+                $core['ranked_chunk_ids'],
+                $core['rows'],
+                $core['limit']
+            );
+            $selectionMode = 'list_deduplicated';
+        } else {
+            $salesSelection = $this->selectSalesChunkIds(
+                $core['ranked_chunk_ids'],
+                $core['rows'],
+                $core['limit']
+            );
+
+            $selectedChunkIds = $salesSelection['ids'];
+            $selectionMode = $salesSelection['mode'];
+        }

        return [
            'route' => $route,
            'entityLabel' => $entityLabel,
            'intent' => $salesIntent,
            'isListQuery' => $core['is_list_query'],
+            'selectionMode' => $selectionMode,
            'selectedChunkIds' => $selectedChunkIds,
            'rows' => $core['rows'],
            'rrfScores' => $core['rrf_scores'],
@@ -252,6 +298,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
     * - run global and optional scoped vector search
     * - fuse hits
     * - resolve chunk ids to chunk rows
+     *
     * @throws Exception
     */
    private function runCore(
@@ -261,17 +308,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        string                $salesIntent
    ): array
    {
-
        $limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS));
        $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));

        $isListQuery = $this->intentLite->isListQuery($prompt);

-        // The prompt is normalized first, then enriched before retrieval.
        $cleanQuery = $this->queryCleaner->clean($prompt);
        $cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery);

-        // Empty cleaned query means retrieval would be meaningless.
        if ($cleanQuery === '') {
            return [
                'limit' => $limit,
@@ -290,22 +334,21 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
            $vectorTopKBase
        );

-        // Tag routing tries to narrow retrieval to relevant document ids.
        $candidateDocIds = $this->tagRouting->route($cleanQuery);
        $candidateDocIds = is_array($candidateDocIds)
-            ? array_values(array_unique(array_filter($candidateDocIds, 'is_string')))
+            ? array_values(array_unique(array_filter(
+                $candidateDocIds,
+                static fn(mixed $value): bool => is_string($value) && $value !== ''
+            )))
            : [];

-        // Always run a global search.
        $globalHits = $this->vectorClient->search($cleanQuery, $topK);

-        // Optionally run a scoped search if tag routing yielded document candidates.
        $scopedHits = [];
        if ($candidateDocIds !== []) {
            $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
        }

-        // Nothing found at all.
        if ($globalHits === [] && $scopedHits === []) {
            return [
                'limit' => $limit,
@@ -318,25 +361,19 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
            ];
        }

-        // Fuse global and scoped hits with optional scoped boost.
        $fused = $this->fuseHits(
            $globalHits,
            $scopedHits,
            $threshold,
-            $salesIntent === SalesIntentLite::OBJECTION,
+            $scopedHits !== [],
            $withScores
        );

        $rrfScores = $fused['rrf_scores'];
        $rawScores = $fused['raw_scores'];

-        // Fallback:
-        // If all hits were filtered by threshold but global hits exist,
-        // derive a weak RRF ranking from the raw hit order.
        if ($rrfScores === [] && $globalHits !== []) {
-            $rrfScores = $this->fallbackRrfFromHits(
-                $globalHits
-            );
+            $rrfScores = $this->fallbackRrfFromHits($globalHits);
        }

        if ($rrfScores === []) {
@@ -351,11 +388,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
            ];
        }

-        // Highest fused score first.
        arsort($rrfScores);
        $rankedChunkIds = array_keys($rrfScores);

-        // Resolve the ranking to actual NDJSON chunk rows.
        $rows = $this->lookup->findByChunkIds($rankedChunkIds);

        return [
@@ -381,9 +416,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
    private function requireConfig(): ModelGenerationConfig
    {
        $config = $this->configRepository->findActiveForModel();
+
        if ($config === null) {
            throw new RuntimeException('No active ModelGenerationConfig found.');
        }
+
        return $config;
    }

@@ -395,6 +432,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
    private function detectSalesIntent(string $prompt): string
    {
        $data = $this->salesIntentLite->detect($prompt);
+
        return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY);
    }

@@ -412,7 +450,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        int    $vectorTopKBase
    ): array
    {
-
        $threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD;
        $topK = $vectorTopKBase;

@@ -428,7 +465,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        }

        $topK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
-        $threshold = max(NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR, min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold));
+        $threshold = max(
+            NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR,
+            min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold)
+        );

        return [$threshold, $topK];
    }
@@ -450,31 +490,25 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        bool  $captureRaw
    ): array
    {
-
        $rrfScores = [];
        $rawScores = [];

        $apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void {
-
            $rank = 0;

            foreach ($hits as $hit) {
-
-                // Every hit must provide a chunk id and a numeric score.
                if (!isset($hit['chunk_id'], $hit['score'])) {
                    continue;
                }

                $raw = (float)$hit['score'];

-                // Threshold is applied before rank fusion.
                if ($raw < $threshold) {
                    continue;
                }

                $chunkId = (string)$hit['chunk_id'];

-                // Store the best raw score per chunk for debug inspection.
                if ($captureRaw) {
                    $rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw);
                }
@@ -482,12 +516,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
                $rank++;
                $rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);

-                // Scoped result lists can get a slight relevance bonus.
                if ($boost) {
                    $rrf *= 1.2;
                }

-                // Scores from multiple hit lists accumulate.
                $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
            }
        };
@@ -513,7 +545,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        $rank = 0;

        foreach ($hits as $hit) {
-
            if (!isset($hit['chunk_id'])) {
                continue;
            }
@@ -529,6 +560,68 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        return $rrf;
    }

+    /**
+     * Picks the leading chunks of one exactly matched document in reading order.
+     *
+     * For exact product questions a pure slice of a single document is
+     * preferred over cross-document fusion, so that neighbouring product
+     * families are not mixed into the answer.
+     *
+     * Rows are ordered by integer chunk_index (rows without one go last, ties
+     * are broken by chunk_id). Rows without a usable chunk_id or with blank
+     * text are skipped. Selection stops at min($limit, EXACT_DOCUMENT_MAX_CHUNKS).
+     *
+     * @param array<string,array<string,mixed>> $rows
+     * @return string[]
+     */
+    private function selectExactDocumentChunkIds(array $rows, int $limit): array
+    {
+        // Rows without an integer chunk_index are pushed to the very end.
+        $position = static fn (array $row): int => is_int($row['chunk_index'] ?? null)
+            ? $row['chunk_index']
+            : PHP_INT_MAX;
+
+        uasort(
+            $rows,
+            static fn (array $left, array $right): int => ($position($left) <=> $position($right))
+                ?: strcmp((string)($left['chunk_id'] ?? ''), (string)($right['chunk_id'] ?? ''))
+        );
+
+        $cap = min($limit, self::EXACT_DOCUMENT_MAX_CHUNKS);
+        $picked = [];
+
+        foreach ($rows as $row) {
+            $chunkId = $row['chunk_id'] ?? null;
+            $hasUsableId = is_string($chunkId) && $chunkId !== '';
+
+            if (!$hasUsableId || trim((string)($row['text'] ?? '')) === '') {
+                continue;
+            }
+
+            $picked[] = $chunkId;
+
+            if (count($picked) >= $cap) {
+                break;
+            }
+        }
+
+        return $picked;
+    }
+
+    /**
+     * Produces synthetic scores for chunks selected by the exact-title fast path.
+     *
+     * The n-th chunk id (counting from 1) receives the score 1/n. The values
+     * exist only so that debug output has the same shape as for fused results.
+     *
+     * @param string[] $chunkIds Selected chunk ids in their final order.
+     * @return array<string,float> Score per chunk id.
+     */
+    private function buildExactDocumentScores(array $chunkIds): array
+    {
+        $scores = [];
+        $position = 0;
+
+        foreach ($chunkIds as $chunkId) {
+            $position++;
+            $scores[(string)$chunkId] = 1.0 / $position;
+        }
+
+        return $scores;
+    }
+
    /**
     * Selection strategy for list-style queries.
     *
@@ -543,7 +636,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        $out = [];

        foreach ($chunkIds as $id) {
-
            if (!isset($rows[$id]['text'])) {
                continue;
            }
@@ -553,7 +645,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
                continue;
            }

-            // Deduplicate by normalized chunk text.
            $key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)));

            if (isset($seen[$key])) {
@@ -574,18 +665,242 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
    /**
     * Selection strategy for sales-oriented queries.
     *
-     * Goal:
-     * - avoid overloading the result with chunks from the same document
-     * - avoid chunks that are too close to each other in the same document
-     * - preserve top-ranked relevance while improving contextual spread
+     * Returns the selected chunk ids together with the name of the selection
+     * mode that produced them.
+     *
+     * Modes returned by this method:
+     * - sales_dominant_document:
+     *   used when one document clearly dominates the top hit window
+     *   and coherent neighbouring chunks from that document are more
+     *   useful than cross-document spread; slots left over after the
+     *   dominant-document chunks are filled from the remaining ranking
+     *
+     * - sales_spread:
+     *   default mode that spreads chunks across documents and enforces
+     *   distance between chunk positions of the same document
+     *
+     * The exact_document_title mode is not produced here; execute() resolves
+     * exact title matches before the sales selection is reached.
+     *
+     * @param array $chunkIds Ranked chunk ids, best first.
+     * @param array $rows     Chunk rows keyed by chunk id.
+     * @param int   $limit    Maximum number of chunk ids to select.
+     *
+     * @return array{ids: array, mode: string}
     */
    private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
    {
-        $out = [];
+        $dominantDocId = $this->detectDominantTopDocument($chunkIds, $rows);
+
+        if ($dominantDocId !== null) {
+            $dominantChunkIds = $this->selectDominantDocumentChunkIds(
+                $dominantDocId,
+                $chunkIds,
+                $rows,
+                $limit
+            );
+
+            // An empty dominant selection falls through to the spread strategy below.
+            if ($dominantChunkIds !== []) {
+                return [
+                    'ids' => $this->fillRemainingSalesChunkIds(
+                        $dominantChunkIds,
+                        $chunkIds,
+                        $rows,
+                        $limit
+                    ),
+                    'mode' => 'sales_dominant_document',
+                ];
+            }
+        }
+
+        return [
+            'ids' => $this->selectSalesChunkIdsSpread($chunkIds, $rows, $limit),
+            'mode' => 'sales_spread',
+        ];
+    }
+
+    /**
+     * Detects whether one document clearly dominates the first ranked window.
+     *
+     * This is especially useful for product-sheet style documents where
+     * several adjacent chunks belong together and should be passed to the model
+     * as one coherent factual block.
+     *
+     * Only the first DOMINANT_DOC_WINDOW ranked chunk ids are inspected, and
+     * only those that resolve to a row with non-blank text and a non-empty
+     * string document_id. The most frequent document in that window is
+     * dominant when
+     * - it owns at least DOMINANT_DOC_MIN_HITS of those chunks, or
+     * - it owns both of the two best-ranked usable chunks.
+     *
+     * @param array $chunkIds Ranked chunk ids, best first.
+     * @param array $rows     Chunk rows keyed by chunk id.
+     *
+     * @return string|null Document id of the dominant document, or null when there is none.
+     */
+    private function detectDominantTopDocument(array $chunkIds, array $rows): ?string
+    {
+        $docWindow = [];
+
+        foreach (array_slice($chunkIds, 0, self::DOMINANT_DOC_WINDOW) as $chunkId) {
+            if (!isset($rows[$chunkId]['text'])) {
+                continue;
+            }
+
+            $text = trim((string)$rows[$chunkId]['text']);
+            $docId = $rows[$chunkId]['document_id'] ?? null;
+
+            if ($text === '' || !is_string($docId) || $docId === '') {
+                continue;
+            }
+
+            $docWindow[] = $docId;
+        }
+
+        // Dominance needs at least two usable chunks to compare.
+        if (count($docWindow) < 2) {
+            return null;
+        }
+
+        // arsort() is stable, so on equal counts the document seen first stays in front.
+        $counts = array_count_values($docWindow);
+        arsort($counts);
+
+        // PHP stores decimal-integer-like string keys ("123") as int keys, so the
+        // key has to be cast back; otherwise numeric document ids would never be
+        // recognised as dominant and the strict comparisons below would fail.
+        $dominantDocId = (string) array_key_first($counts);
+        $dominantCount = $counts[$dominantDocId];
+
+        if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) {
+            return $dominantDocId;
+        }
+
+        // Fewer hits still count when the document holds both top positions.
+        if ($docWindow[0] === $dominantDocId && $docWindow[1] === $dominantDocId) {
+            return $dominantDocId;
+        }
+
+        return null;
+    }
+
+    /**
+     * Builds a coherent chunk selection from the dominant document.
+     *
+     * Only chunks of that document that occur in the ranked list and have
+     * non-blank text are candidates.
+     *
+     * Strategy:
+     * - the chunk index of the best-ranked candidate that has an integer
+     *   chunk_index serves as anchor
+     * - candidates closest to the anchor are preferred (rank breaks ties);
+     *   at most min($limit, DOMINANT_DOC_MAX_CHUNKS) of them are kept
+     * - the kept candidates are returned ordered by chunk index for prompt
+     *   coherence; candidates without an index come last, ordered by rank
+     *
+     * @return string[] Selected chunk ids, empty when the document has no candidates.
+     */
+    private function selectDominantDocumentChunkIds(
+        string $documentId,
+        array  $chunkIds,
+        array  $rows,
+        int    $limit
+    ): array
+    {
+        $candidates = [];
+        $anchor = null;
+
+        foreach ($chunkIds as $rank => $chunkId) {
+            if (!isset($rows[$chunkId]['text'])) {
+                continue;
+            }
+
+            $isBlank = trim((string)$rows[$chunkId]['text']) === '';
+
+            if ($isBlank || ($rows[$chunkId]['document_id'] ?? null) !== $documentId) {
+                continue;
+            }
+
+            $index = $rows[$chunkId]['chunk_index'] ?? null;
+
+            if (!is_int($index)) {
+                $index = null;
+            }
+
+            // The first candidate that carries an index defines the anchor.
+            $anchor ??= $index;
+
+            $candidates[] = [
+                'id' => (string)$chunkId,
+                'rank' => $rank,
+                'chunk_index' => $index,
+            ];
+        }
+
+        if ($candidates === []) {
+            return [];
+        }
+
+        // Without an anchor no candidate has an index, so every distance is equal
+        // and the ordering degrades to plain rank order.
+        $distance = static fn (array $hit) => $anchor === null || $hit['chunk_index'] === null
+            ? PHP_INT_MAX
+            : abs($hit['chunk_index'] - $anchor);
+
+        usort(
+            $candidates,
+            static fn (array $left, array $right): int =>
+                [$distance($left), $left['rank']] <=> [$distance($right), $right['rank']]
+        );
+
+        $kept = array_slice($candidates, 0, min($limit, self::DOMINANT_DOC_MAX_CHUNKS));
+
+        usort($kept, static fn (array $left, array $right): int => match (true) {
+            $left['chunk_index'] === null && $right['chunk_index'] === null => $left['rank'] <=> $right['rank'],
+            $left['chunk_index'] === null => 1,
+            $right['chunk_index'] === null => -1,
+            default => [$left['chunk_index'], $left['rank']] <=> [$right['chunk_index'], $right['rank']],
+        });
+
+        return array_column($kept, 'id');
+    }
+
+    /**
+     * Fills the remaining sales slots after a dominant document selection.
+     *
+     * The already selected dominant-document chunks stay fixed.
+     * Remaining slots are filled with the normal spread strategy.
+     */
+    private function fillRemainingSalesChunkIds(
+        array $seedChunkIds,
+        array $chunkIds,
+        array $rows,
+        int   $limit
+    ): array
+    {
+        $out = array_values(array_unique(array_map('strval', $seedChunkIds)));
+
+        if (count($out) >= $limit) {
+            return array_slice($out, 0, $limit);
+        }
+
+        $selected = array_fill_keys($out, true);
        $docCounter = [];
        $docChunkPositions = [];

+        foreach ($out as $chunkId) {
+            $docId = $rows[$chunkId]['document_id'] ?? null;
+            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
+
+            if (is_string($docId) && $docId !== '') {
+                $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
+
+                if (is_int($chunkIndex)) {
+                    $docChunkPositions[$docId][] = $chunkIndex;
+                }
+            }
+        }
+
        foreach ($chunkIds as $chunkId) {
+            if (isset($selected[$chunkId])) {
+                continue;
+            }

            if (!isset($rows[$chunkId]['text'])) {
                continue;
@@ -594,23 +909,80 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
            $docId = $rows[$chunkId]['document_id'] ?? null;
            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;

-            // Sales selection requires a valid document context.
-            if (!is_string($docId)) {
+            if (!is_string($docId) || $docId === '') {
                continue;
            }

-            // Limit how many chunks may come from the same document.
            if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
                continue;
            }

-            // Enforce a minimum distance between chunk positions of the same document.
            if (is_int($chunkIndex)) {
                foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
                    if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
                        continue 2;
                    }
                }
+            }
+
+            $text = trim((string)$rows[$chunkId]['text']);
+            if ($text === '') {
+                continue;
+            }
+
+            $out[] = (string)$chunkId;
+            $selected[$chunkId] = true;
+            $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
+
+            if (is_int($chunkIndex)) {
+                $docChunkPositions[$docId][] = $chunkIndex;
+            }
+
+            if (count($out) >= $limit) {
+                break;
+            }
+        }
+
+        return $out;
+    }
+
+    /**
+     * Default spread selection for sales-oriented queries.
+     *
+     * Goal:
+     * - avoid overloading the result with chunks from the same document
+     * - avoid chunks that are too close to each other in the same document
+     * - preserve top-ranked relevance while improving contextual spread
+     */
+    private function selectSalesChunkIdsSpread(array $chunkIds, array $rows, int $limit): array
+    {
+        $out = [];
+        $docCounter = [];
+        $docChunkPositions = [];
+
+        foreach ($chunkIds as $chunkId) {
+            if (!isset($rows[$chunkId]['text'])) {
+                continue;
+            }
+
+            $docId = $rows[$chunkId]['document_id'] ?? null;
+            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
+
+            if (!is_string($docId) || $docId === '') {
+                continue;
+            }
+
+            if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
+                continue;
+            }
+
+            if (is_int($chunkIndex)) {
+                foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
+                    if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
+                        continue 2;
+                    }
+                }
+
                $docChunkPositions[$docId][] = $chunkIndex;
            }

@@ -638,7 +1010,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
        $out = [];

        foreach ($chunkIds as $id) {
-
            if (!isset($rows[$id]['text'])) {
                continue;
            }