MtoRagSystem/src/Knowledge/Retrieval/NdjsonHybridRetriever.php

<?php

declare(strict_types=1);

namespace App\Knowledge\Retrieval;

use App\Catalog\EntityCatalogService;
use App\Config\NdjsonHybridRetrieverConfig;
use App\Entity\ModelGenerationConfig;
use App\Intent\CatalogIntentLite;
use App\Intent\IntentLite;
use App\Intent\SalesIntentLite;
use App\Repository\ModelGenerationConfigRepository;
use App\Routing\IntentRouteResolver;
use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient;
use Doctrine\DBAL\Exception;
use RuntimeException;

/**
 * Hybrid retriever for NDJSON-based knowledge chunks.
 *
 * Main responsibilities:
 * - detect high-level request intent
 * - optionally short-circuit to catalog list output
 * - resolve exact document-title matches before semantic retrieval
 * - run vector and keyword retrieval globally and optionally document-scoped
 * - fuse both result sets with RRF-style scoring
 * - apply selection rules for list queries vs. sales-style queries
 * - return either plain chunk texts or debug metadata
 */
final readonly class NdjsonHybridRetriever implements RetrieverInterface
{

    /**
     * Wires the retriever's collaborators.
     *
     * Every dependency is a promoted private property. The enclosing class is
     * declared readonly, so none of them can be reassigned after construction.
     */
    public function __construct(
        private NdjsonChunkLookup $lookup,
        private NdjsonKeywordRetriever $keywordRetriever,
        private VectorSearchClient $vectorClient,
        private TagRoutingService $tagRouting,
        private ModelGenerationConfigRepository $configRepository,
        private QueryCleaner $queryCleaner,
        private IntentLite $intentLite,
        private SalesIntentLite $salesIntentLite,
        private CatalogIntentLite $catalogIntent,
        private IntentRouteResolver $routeResolver,
        private EntityCatalogService $entityCatalogService,
        private QueryEnricher $queryEnricher,
        private NdjsonHybridRetrieverConfig $retrieverConfig,
    ) {
    }

    // =========================================================
    // PUBLIC API
    // =========================================================

    /**
     * Runs retrieval for a prompt and returns plain text only.
     *
     * The active generation config is loaded first (requireConfig() throws a
     * RuntimeException when there is none), then the orchestration pipeline
     * runs without raw-score capture.
     *
     * @return array A one-element list holding the catalog block when the
     *               pipeline produced one, an empty list when no chunk was
     *               selected, otherwise the result of collectTextsFromIds()
     *               for the selected chunk ids and their rows.
     *
     * @throws Exception
     */
    public function retrieve(string $prompt): array
    {
        $outcome = $this->execute($prompt, $this->requireConfig(), false);

        $catalogText = $outcome['catalogBlock'];
        if ($catalogText !== null) {
            return [$catalogText];
        }

        $chunkIds = $outcome['selectedChunkIds'];

        return $chunkIds === []
            ? []
            : $this->collectTextsFromIds($chunkIds, $outcome['rows']);
    }

    /**
     * Returns the retrieval result enriched with scoring and routing metadata.
     *
     * Intended for inspection and tuning. Each entry carries:
     * - rank, chunk id, document id and chunk index
     * - raw score and fused RRF score (null when not available)
     * - threshold, intent, route and entity label
     * - list-query flag, selection mode and the trimmed chunk text
     *
     * When the pipeline resolves to a catalog list, exactly one synthetic
     * entry with chunk id "__CATALOG_LIST__" is returned instead.
     *
     * @param ModelGenerationConfig|null $config Config to run with; the active
     *                                           config is loaded when omitted.
     *
     * @return array<int,array<string,mixed>> Empty when nothing was selected.
     *
     * @throws Exception
     */
    public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
    {
        $config ??= $this->requireConfig();
        $result = $this->execute($prompt, $config, true);

        if ($result['catalogBlock'] !== null) {
            return [[
                'rank' => 1,
                'chunk_id' => '__CATALOG_LIST__',
                'document_id' => null,
                'chunk_index' => null,
                'raw_score' => null,
                'rrf_score' => null,
                'threshold' => 0.0,
                'intent' => $result['intent'],
                'route' => $result['route'],
                'entity_label' => $result['entityLabel'],
                'is_list_query' => true,
                'selection_mode' => 'catalog_list',
                'text' => $result['catalogBlock'],
            ]];
        }

        $out = [];

        foreach ($result['selectedChunkIds'] as $chunkId) {
            // Selected ids without a resolved row are skipped and do not
            // consume a rank.
            $row = $result['rows'][$chunkId] ?? null;
            if ($row === null) {
                continue;
            }

            $out[] = [
                'rank' => count($out) + 1,
                'chunk_id' => $chunkId,
                'document_id' => $row['document_id'] ?? null,
                'chunk_index' => $row['chunk_index'] ?? null,
                'raw_score' => $result['rawScores'][$chunkId] ?? null,
                'rrf_score' => $result['rrfScores'][$chunkId] ?? null,
                'threshold' => $result['threshold'],
                'intent' => $result['intent'],
                'route' => $result['route'],
                'entity_label' => $result['entityLabel'],
                'is_list_query' => $result['isListQuery'],
                'selection_mode' => $result['selectionMode'],
                // Guarded like the other row fields: a row without "text"
                // yields an empty string instead of an undefined-key warning.
                'text' => trim((string)($row['text'] ?? '')),
            ];
        }

        return $out;
    }

    // =========================================================
    // CENTRAL ORCHESTRATION
    // =========================================================

    /**
     * Central orchestration entrypoint shared by retrieve() and retrieveDebug().
     *
     * Pipeline:
     * 1. Detect the catalog entity label and the sales intent, resolve a route.
     * 2. Downgrade a catalog-list route to the normal route when the prompt
     *    does not qualify for the catalog shortcut.
     * 3. On a catalog-list route with a detected entity, return the catalog
     *    block directly if the catalog service yields one.
     * 4. If the prompt matches an exact document, select chunks from that
     *    document only (returned when at least one chunk is selected).
     * 5. Otherwise run the hybrid core and pick chunk ids with the list or
     *    sales selection strategy.
     *
     * @return array{
     *     route: mixed,
     *     entityLabel: mixed,
     *     intent: string,
     *     isListQuery: mixed,
     *     selectionMode: mixed,
     *     selectedChunkIds: array,
     *     rows: array,
     *     rrfScores: array,
     *     rawScores: array,
     *     threshold: mixed,
     *     catalogBlock: ?string
     * }
     *
     * @throws Exception
     */
    private function execute(
        string                $prompt,
        ModelGenerationConfig $config,
        bool                  $withScores
    ): array
    {
        $entityLabel = $this->catalogIntent->detect($prompt);
        $salesIntent = $this->detectSalesIntent($prompt);
        $route = $this->routeResolver->resolve($salesIntent, $entityLabel);

        $catalogRoute = IntentRouteResolver::ROUTE_CATALOG_LIST;

        if ($route === $catalogRoute && !$this->shouldUseCatalogListShortcut($prompt, $salesIntent)) {
            $route = IntentRouteResolver::ROUTE_NORMAL;
        }

        // The first three keys are identical in every result shape below.
        $head = [
            'route' => $route,
            'entityLabel' => $entityLabel,
            'intent' => $salesIntent,
        ];

        if ($route === $catalogRoute && $entityLabel !== null) {
            $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);

            if ($catalogBlock !== null) {
                return $head + [
                    'isListQuery' => true,
                    'selectionMode' => 'catalog_list',
                    'selectedChunkIds' => [],
                    'rows' => [],
                    'rrfScores' => [],
                    'rawScores' => [],
                    'threshold' => 0.0,
                    'catalogBlock' => trim($catalogBlock),
                ];
            }
        }

        $exactDocumentMatch = $this->lookup->findBestExactDocumentByPrompt($prompt);

        if ($exactDocumentMatch !== null) {
            $chunkCap = max(
                1,
                min($config->getRetrievalMaxChunks(), $this->retrieverConfig->hardMaxChunks())
            );
            $exactIds = $this->selectExactDocumentChunkIds($exactDocumentMatch['rows'], $chunkCap, $prompt);

            // An exact match that yields no usable chunk falls through to the
            // regular hybrid retrieval below.
            if ($exactIds !== []) {
                return $head + [
                    'isListQuery' => false,
                    'selectionMode' => 'exact_document_title',
                    'selectedChunkIds' => $exactIds,
                    'rows' => $exactDocumentMatch['rows'],
                    'rrfScores' => $this->buildExactDocumentScores($exactIds),
                    'rawScores' => [],
                    'threshold' => 1.0,
                    'catalogBlock' => null,
                ];
            }
        }

        $core = $this->runCore($prompt, $config, $withScores, $salesIntent);
        $hasCandidates = $core['ranked_chunk_ids'] !== [] && $core['rows'] !== [];

        if (!$hasCandidates) {
            return $head + [
                'isListQuery' => $core['is_list_query'],
                'selectionMode' => null,
                'selectedChunkIds' => [],
                'rows' => [],
                'rrfScores' => [],
                'rawScores' => [],
                'threshold' => $core['threshold'],
                'catalogBlock' => null,
            ];
        }

        if ($core['is_list_query']) {
            $selectionMode = 'list_deduplicated';
            $selectedChunkIds = $this->selectListChunkIds(
                $core['ranked_chunk_ids'],
                $core['rows'],
                $core['limit']
            );
        } else {
            $salesSelection = $this->selectSalesChunkIds(
                $prompt,
                $core['ranked_chunk_ids'],
                $core['rows'],
                $core['limit']
            );

            $selectionMode = $salesSelection['mode'];
            $selectedChunkIds = $salesSelection['ids'];
        }

        return $head + [
            'isListQuery' => $core['is_list_query'],
            'selectionMode' => $selectionMode,
            'selectedChunkIds' => $selectedChunkIds,
            'rows' => $core['rows'],
            'rrfScores' => $core['rrf_scores'],
            'rawScores' => $core['raw_scores'],
            'threshold' => $core['threshold'],
            'catalogBlock' => null,
        ];
    }

    // =========================================================
    // CORE PIPELINE
    // =========================================================

    /**
     * Executes the hybrid (vector + keyword) retrieval core.
     *
     * Steps:
     * - derive the chunk limit and base vector topK from config, clamped to
     *   the retriever's hard caps (never below 1)
     * - detect whether the prompt is a list query
     * - clean and enrich the prompt; an empty query ends retrieval early
     * - compute threshold + vector topK from intent and query type
     * - route the query to candidate document ids via tag routing
     * - run global vector + keyword search and, when candidates exist, the
     *   same two searches scoped to those documents
     * - fuse all hit lists into one RRF score map
     * - resolve the ranked chunk ids to chunk rows
     *
     * When thresholding removes every fused candidate the result is empty;
     * the order-only fallback (fallbackRrfFromHits) is currently not applied.
     *
     * @return array{
     *     limit: int,
     *     is_list_query: mixed,
     *     threshold: mixed,
     *     ranked_chunk_ids: string[],
     *     rows: array,
     *     rrf_scores: array,
     *     raw_scores: array
     * }
     *
     * @throws Exception
     */
    private function runCore(
        string                $prompt,
        ModelGenerationConfig $config,
        bool                  $withScores,
        string                $salesIntent
    ): array
    {
        $limit = max(1, min($config->getRetrievalMaxChunks(), $this->retrieverConfig->hardMaxChunks()));
        $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), $this->retrieverConfig->hardMaxVectorK()));

        $isListQuery = $this->intentLite->isListQuery($prompt);

        $cleanQuery = $this->queryCleaner->clean($prompt);
        $cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery);

        // Shared shape for every "nothing found" outcome.
        $emptyResult = static fn(int|float $threshold, array $rawScores = []): array => [
            'limit' => $limit,
            'is_list_query' => $isListQuery,
            'threshold' => $threshold,
            'ranked_chunk_ids' => [],
            'rows' => [],
            'rrf_scores' => [],
            'raw_scores' => $rawScores,
        ];

        if ($cleanQuery === '') {
            return $emptyResult($this->retrieverConfig->vectorScoreThreshold());
        }

        [$threshold, $topK] = $this->computeThresholdAndTopK(
            $salesIntent,
            $isListQuery,
            $vectorTopKBase
        );

        // Keep only distinct, non-empty string document ids from tag routing.
        $candidateDocIds = $this->tagRouting->route($cleanQuery);
        $candidateDocIds = is_array($candidateDocIds)
            ? array_values(array_unique(array_filter(
                $candidateDocIds,
                static fn(mixed $value): bool => is_string($value) && $value !== ''
            )))
            : [];

        // Same keyword budget for the global and the scoped search.
        $keywordTopK = $this->computeKeywordTopK($topK);

        $globalHits = $this->vectorClient->search($cleanQuery, $topK);
        $keywordHits = $this->keywordRetriever->search($cleanQuery, $keywordTopK);

        $scopedHits = [];
        $scopedKeywordHits = [];
        if ($candidateDocIds !== []) {
            $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
            $scopedKeywordHits = $this->keywordRetriever->search($cleanQuery, $keywordTopK, $candidateDocIds);
        }

        if ($globalHits === [] && $scopedHits === [] && $keywordHits === [] && $scopedKeywordHits === []) {
            return $emptyResult($threshold);
        }

        $fused = $this->fuseHits(
            $globalHits,
            $scopedHits,
            $keywordHits,
            $scopedKeywordHits,
            $threshold,
            $scopedHits !== [],
            $scopedKeywordHits !== [],
            $withScores
        );

        $rrfScores = $fused['rrf_scores'];
        $rawScores = $fused['raw_scores'];

        if ($rrfScores === []) {
            return $emptyResult($threshold, $rawScores);
        }

        arsort($rrfScores);

        // PHP stores integer-like string keys ("42") as ints, so array_keys()
        // can hand back ints. Chunk ids are strings everywhere else in this
        // class, so they are normalised before leaving this method.
        $rankedChunkIds = array_map(strval(...), array_keys($rrfScores));

        $rows = $this->lookup->findByChunkIds($rankedChunkIds);

        return [
            'limit' => $limit,
            'is_list_query' => $isListQuery,
            'threshold' => $threshold,
            'ranked_chunk_ids' => $rankedChunkIds,
            'rows' => $rows,
            'rrf_scores' => $rrfScores,
            'raw_scores' => $rawScores,
        ];
    }

    // =========================================================
    // SUPPORT
    // =========================================================

    /**
     * Fetches the active model generation config from the repository.
     *
     * @throws RuntimeException When the repository reports no active config;
     *                          retrieval cannot proceed without one.
     */
    private function requireConfig(): ModelGenerationConfig
    {
        return $this->configRepository->findActiveForModel()
            ?? throw new RuntimeException('No active ModelGenerationConfig found.');
    }

    /**
     * Reads the "intent" entry of the sales-intent detector payload as a string.
     *
     * A missing (or null) "intent" entry is replaced by
     * SalesIntentLite::DISCOVERY.
     */
    private function detectSalesIntent(string $prompt): string
    {
        $detection = $this->salesIntentLite->detect($prompt);
        $intent = $detection['intent'] ?? SalesIntentLite::DISCOVERY;

        return (string)$intent;
    }

    /**
     * Decides whether a catalog-list route may answer with the catalog block.
     *
     * The shortcut is reserved for genuine list/catalog requests. A factual
     * question such as "what is the lowest threshold" has to go through normal
     * retrieval; otherwise the caller would get a product list instead of the
     * requested value.
     *
     * Allowed only for the DISCOVERY intent, and then only when the prompt is
     * a list query or its normalized form matches one of the configured
     * shortcut patterns.
     */
    private function shouldUseCatalogListShortcut(string $prompt, string $salesIntent): bool
    {
        $isDiscovery = $salesIntent === SalesIntentLite::DISCOVERY;
        if (!$isDiscovery) {
            return false;
        }

        if ($this->intentLite->isListQuery($prompt)) {
            return true;
        }

        $normalizedPrompt = $this->normalizeText($prompt);

        return $normalizedPrompt !== ''
            && $this->matchesAnyPattern(
                $normalizedPrompt,
                $this->retrieverConfig->catalogListShortcutPatterns()
            );
    }

    /**
     * Derives the keyword-search topK from the vector topK.
     *
     * Keyword retrieval is cheap and acts as a factual safety net for numbers,
     * ranges, thresholds and exact technical terms, so it is scaled by the
     * configured multiplier (rounded up) and then clamped to
     * [1, hardMaxKeywordK].
     */
    private function computeKeywordTopK(int $vectorTopK): int
    {
        $scaled = (int) ceil($vectorTopK * $this->retrieverConfig->keywordTopKMultiplier());
        $capped = min($scaled, $this->retrieverConfig->hardMaxKeywordK());

        return max(1, $capped);
    }

    /**
     * Computes the vector score threshold and the vector topK for one query.
     *
     * Rules:
     * - OBJECTION and PRICING intents raise the base threshold by 0.02
     * - list queries multiply topK by the configured list bonus (rounded)
     * - topK is clamped to [1, hardMaxVectorK]
     * - the threshold is clamped to [thresholdFloor, thresholdCeil]
     *
     * @return array{0: mixed, 1: int} Tuple of [threshold, topK].
     */
    private function computeThresholdAndTopK(
        string $salesIntent,
        bool   $isListQuery,
        int    $vectorTopKBase
    ): array
    {
        $threshold = $this->retrieverConfig->vectorScoreThreshold();

        $stricterIntents = [SalesIntentLite::OBJECTION, SalesIntentLite::PRICING];
        if (in_array($salesIntent, $stricterIntents, true)) {
            $threshold += 0.02;
        }

        $topK = $isListQuery
            ? (int)round($vectorTopKBase * $this->retrieverConfig->listBonus())
            : $vectorTopKBase;

        $clampedTopK = max(1, min($topK, $this->retrieverConfig->hardMaxVectorK()));

        $floor = $this->retrieverConfig->thresholdFloor();
        $clampedThreshold = max($floor, min($this->retrieverConfig->thresholdCeil(), $threshold));

        return [$clampedThreshold, $clampedTopK];
    }

    /**
     * Fuses up to four hit lists into one RRF-style score map.
     *
     * Notes:
     * - a hit needs both "chunk_id" and "score"; incomplete hits are ignored
     * - a hit scoring below its list's threshold is dropped and does not
     *   consume a rank position
     * - each surviving hit adds weight / (rrfK + rank) to its chunk's score
     * - vector lists use $vectorThreshold, keyword lists use the configured
     *   keyword threshold
     * - scoped lists use their dedicated weights when the boost flags are set
     * - with $captureRaw the highest raw score seen per chunk is recorded
     *
     * @return array{rrf_scores: array, raw_scores: array}
     */
    private function fuseHits(
        array $globalHits,
        array $scopedHits,
        array $keywordHits,
        array $scopedKeywordHits,
        float $vectorThreshold,
        bool  $boostScopedVector,
        bool  $boostScopedKeyword,
        bool  $captureRaw
    ): array
    {
        // One entry per hit list: [hits, minimum raw score, RRF weight].
        $passes = [
            [$globalHits, $vectorThreshold, 1.0],
            [
                $scopedHits,
                $vectorThreshold,
                $boostScopedVector ? $this->retrieverConfig->scopedVectorRrfWeight() : 1.0,
            ],
            [
                $keywordHits,
                $this->retrieverConfig->keywordScoreThreshold(),
                $this->retrieverConfig->keywordRrfWeight(),
            ],
            [
                $scopedKeywordHits,
                $this->retrieverConfig->keywordScoreThreshold(),
                $boostScopedKeyword
                    ? $this->retrieverConfig->scopedKeywordRrfWeight()
                    : $this->retrieverConfig->keywordRrfWeight(),
            ],
        ];

        $fusedScores = [];
        $bestRawScores = [];

        foreach ($passes as [$hits, $minScore, $weight]) {
            $position = 0;

            foreach ($hits as $hit) {
                if (!isset($hit['chunk_id'], $hit['score'])) {
                    continue;
                }

                $rawScore = (float)$hit['score'];
                if ($rawScore < $minScore) {
                    continue;
                }

                $id = (string)$hit['chunk_id'];

                if ($captureRaw) {
                    $bestRawScores[$id] = max($bestRawScores[$id] ?? 0.0, $rawScore);
                }

                // Rank counts accepted hits only, starting at 1 per list.
                $position++;
                $contribution = (1.0 / ($this->retrieverConfig->rrfK() + $position)) * $weight;

                $fusedScores[$id] = ($fusedScores[$id] ?? 0.0) + $contribution;
            }
        }

        return [
            'rrf_scores' => $fusedScores,
            'raw_scores' => $bestRawScores,
        ];
    }

    /**
     * Builds a fallback RRF ranking purely from hit order, ignoring scores.
     *
     * Meant for the case where thresholding removed every fused candidate
     * although the hit list itself is not empty. Hits without a "chunk_id"
     * are skipped and do not consume a rank. At most emptyRrfFallbackTopN
     * ranked hits are considered; a non-positive limit yields an empty map.
     *
     * @param array<int,array<string,mixed>> $hits Hits in ranking order.
     * @return array<string,float> chunk id => 1 / (rrfK + rank)
     */
    private function fallbackRrfFromHits(array $hits): array
    {
        $topN = $this->retrieverConfig->emptyRrfFallbackTopN();

        // The limit used to be checked only after inserting, so a limit of 0
        // (or less) still produced one entry.
        if ($topN <= 0) {
            return [];
        }

        $rrfK = $this->retrieverConfig->rrfK();
        $rrf = [];
        $rank = 0;

        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'])) {
                continue;
            }

            $rank++;
            $rrf[(string)$hit['chunk_id']] = 1.0 / ($rrfK + $rank);

            if ($rank >= $topN) {
                break;
            }
        }

        return $rrf;
    }

    /**
     * Picks the most relevant chunks of one exactly matched document.
     *
     * Taking the first N chunks is too weak for follow-up questions: the title
     * identifies the document, but the detail asked for (an indicator, range,
     * threshold, interface, relay, error code, ...) may live in a later chunk.
     *
     * The method therefore stays inside the given rows and scores each chunk:
     * - +6 per matching query token containing a digit
     * - +5 per matching configured detail token
     * - +2 per other matching query token
     * - plus the indicator-table bonus from scoreExactDocumentDetailFocus()
     * - plus a small position bonus (1.0 for the first row, shrinking by 0.05
     *   per row, never negative) so overview chunks stay competitive
     *
     * The best-scoring chunks (ties: earlier row first) are kept and then put
     * back into document order for prompt readability. When the prompt yields
     * no usable query tokens, the first valid chunks are returned instead.
     *
     * @param array<string,array<string,mixed>> $rows
     * @param int $limit Upper bound; additionally capped by exactDocumentMaxChunks.
     * @return string[]
     */
    private function selectExactDocumentChunkIds(array $rows, int $limit, string $prompt): array
    {
        $documentRows = $this->sortRowsByChunkIndex($rows);
        $budget = min($limit, $this->retrieverConfig->exactDocumentMaxChunks());

        if ($documentRows === [] || $budget <= 0) {
            return [];
        }

        $queryTokens = $this->expandExactSelectionTokenVariants(
            $this->buildExactDocumentSelectionTokens($prompt)
        );

        if ($queryTokens === []) {
            return $this->firstChunkIdsFromRows($documentRows, $budget);
        }

        $detailFocus = $this->buildExactDocumentDetailFocus($prompt);
        $candidates = [];

        foreach ($documentRows as $position => $row) {
            $chunkId = $row['chunk_id'] ?? null;
            $text = trim((string)($row['text'] ?? ''));

            $isUsable = is_string($chunkId) && $chunkId !== '' && $text !== '';
            if (!$isUsable) {
                continue;
            }

            $haystack = $this->normalizeText($this->extractDocumentTitle($row) . ' ' . $text);
            $haystackTokenSet = array_fill_keys(
                $this->expandExactSelectionTokenVariants($this->tokenizeText($haystack)),
                true
            );

            $score = 0.0;
            foreach ($queryTokens as $token) {
                if (!isset($haystackTokenSet[$token])) {
                    continue;
                }

                $score += match (true) {
                    preg_match('/\d/u', $token) === 1 => 6.0,
                    $this->isExactDetailToken($token) => 5.0,
                    default => 2.0,
                };
            }

            $score += $this->scoreExactDocumentDetailFocus($detailFocus, $haystack, $text);

            // Early chunks get a slight head start for overview facts, small
            // enough that strongly matching detail chunks still win.
            $score += max(0.0, 1.0 - ($position * 0.05));

            $chunkIndex = $row['chunk_index'] ?? null;

            $candidates[] = [
                'id' => $chunkId,
                'score' => $score,
                'order' => $position,
                'chunk_index' => is_int($chunkIndex) ? $chunkIndex : null,
            ];
        }

        // Best score first; equal scores keep document order.
        usort(
            $candidates,
            static fn(array $a, array $b): int => ($b['score'] <=> $a['score']) ?: ($a['order'] <=> $b['order'])
        );

        $picked = array_slice($candidates, 0, $budget);

        // Back into reading order: by chunk index, rows without an index last,
        // ties resolved by original row position.
        usort($picked, static function (array $left, array $right): int {
            $leftIndex = $left['chunk_index'];
            $rightIndex = $right['chunk_index'];

            if (($leftIndex === null) !== ($rightIndex === null)) {
                return $leftIndex === null ? 1 : -1;
            }

            if ($leftIndex !== null && $leftIndex !== $rightIndex) {
                return $leftIndex <=> $rightIndex;
            }

            return $left['order'] <=> $right['order'];
        });

        return array_column($picked, 'id');
    }

    /**
     * Returns the rows as a list ordered by chunk index.
     *
     * Rows whose "chunk_index" is not an int sort last; rows sharing an index
     * are ordered by their "chunk_id" compared as strings. Original array keys
     * are discarded.
     *
     * @param array<string,array<string,mixed>> $rows
     * @return array<int,array<string,mixed>>
     */
    private function sortRowsByChunkIndex(array $rows): array
    {
        $ordered = array_values($rows);

        $indexOf = static fn(array $row): int => is_int($row['chunk_index'] ?? null)
            ? $row['chunk_index']
            : PHP_INT_MAX;

        usort($ordered, static function (array $left, array $right) use ($indexOf): int {
            return ($indexOf($left) <=> $indexOf($right))
                ?: strcmp((string)($left['chunk_id'] ?? ''), (string)($right['chunk_id'] ?? ''));
        });

        return $ordered;
    }

    /**
     * Collects chunk ids from the rows in the given order until the limit is hit.
     *
     * A row counts only when its "chunk_id" is a non-empty string and its
     * trimmed "text" is not empty.
     *
     * @param array<int,array<string,mixed>> $rows
     * @return string[]
     */
    private function firstChunkIdsFromRows(array $rows, int $limit): array
    {
        $ids = [];

        foreach ($rows as $row) {
            $id = $row['chunk_id'] ?? null;
            $hasText = trim((string)($row['text'] ?? '')) !== '';
            $hasId = is_string($id) && $id !== '';

            if ($hasId && $hasText) {
                $ids[] = $id;

                if (count($ids) >= $limit) {
                    return $ids;
                }
            }
        }

        return $ids;
    }

    /**
     * Extracts the distinct prompt tokens used to score exact-document chunks.
     *
     * The prompt is normalized and tokenized; configured generic tokens are
     * dropped, and a token is kept only when it contains a digit or is at
     * least three characters long. First occurrences keep their order.
     *
     * @return string[]
     */
    private function buildExactDocumentSelectionTokens(string $prompt): array
    {
        $promptTokens = $this->tokenizeText($this->normalizeText($prompt));

        $informative = array_filter(
            $promptTokens,
            fn(string $token): bool => !$this->isGenericExactSelectionToken($token)
                && (preg_match('/\d/u', $token) === 1 || mb_strlen($token, 'UTF-8') >= 3)
        );

        return array_values(array_unique($informative));
    }

    /**
     * Replaces every token by its variants and returns the distinct result.
     *
     * Variants come from exactSelectionTokenVariants(); empty strings are
     * removed and duplicates collapse onto their first occurrence.
     *
     * @param string[] $tokens
     * @return string[]
     */
    private function expandExactSelectionTokenVariants(array $tokens): array
    {
        $collected = [];

        foreach ($tokens as $token) {
            array_push($collected, ...$this->exactSelectionTokenVariants($token));
        }

        $nonEmpty = array_filter(
            $collected,
            static fn(string $variant): bool => $variant !== ''
        );

        return array_values(array_unique($nonEmpty));
    }

    /**
     * Lists the matching variants of one token, the trimmed token itself first.
     *
     * Two sources add variants:
     * - suffix stripping: for tokens of five or more characters, every listed
     *   suffix the token ends with is removed, and the remaining stem is kept
     *   when it still has at least three characters
     * - configured prefixes: when the token starts with a configured prefix,
     *   all variants configured for that prefix are appended
     *
     * @return string[] Distinct variants; empty when the trimmed token is empty.
     */
    private function exactSelectionTokenVariants(string $token): array
    {
        $token = trim($token);

        if ($token === '') {
            return [];
        }

        $variants = [$token];
        $length = mb_strlen($token, 'UTF-8');

        if ($length >= 5) {
            foreach (['typen', 'innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'] as $suffix) {
                if (!str_ends_with($token, $suffix)) {
                    continue;
                }

                $stem = mb_substr($token, 0, $length - mb_strlen($suffix, 'UTF-8'), 'UTF-8');

                if (mb_strlen($stem, 'UTF-8') >= 3) {
                    $variants[] = $stem;
                }
            }
        }

        foreach ($this->retrieverConfig->exactSelectionTokenVariantPrefixes() as $prefix => $configuredVariants) {
            // Prefixes are array keys, and PHP stores integer-like string keys
            // ("420") as ints. Without the cast str_starts_with() would throw a
            // TypeError under strict_types for such a prefix.
            if (!str_starts_with($token, (string)$prefix)) {
                continue;
            }

            foreach ($configuredVariants as $variant) {
                $variants[] = $variant;
            }
        }

        return array_values(array_unique($variants));
    }

    /**
     * Detects which kind of detail the prompt is asking about.
     *
     * Currently only one focus exists: "asks_indicator" is true when the
     * expanded prompt tokens contain a configured indicator-question token, or
     * the normalized prompt contains a configured indicator-question phrase.
     *
     * @return array{asks_indicator:bool}
     */
    private function buildExactDocumentDetailFocus(string $prompt): array
    {
        $normalizedPrompt = $this->normalizeText($prompt);
        $expandedTokens = $this->expandExactSelectionTokenVariants(
            $this->tokenizeText($normalizedPrompt)
        );
        $tokenSet = array_fill_keys($expandedTokens, true);

        $hasIndicatorToken = $this->containsAnyConfiguredToken(
            $tokenSet,
            $this->retrieverConfig->exactSelectionIndicatorQuestionTokens()
        );

        if ($hasIndicatorToken) {
            return ['asks_indicator' => true];
        }

        return [
            'asks_indicator' => $this->containsAnyConfiguredPhrase(
                $normalizedPrompt,
                $this->retrieverConfig->exactSelectionIndicatorQuestionPhrases()
            ),
        ];
    }

    /**
     * Scores how well a chunk answers an indicator-style follow-up question.
     *
     * Only used inside an already matched exact document, so the bonus cannot
     * leak into shop searches or broad product discovery. Returns 0.0 unless
     * the prompt asks about an indicator. Otherwise the bonuses add up:
     * - +14 raw text matches a configured table-heading pattern
     * - +10 raw text matches a configured table-header pattern
     * - +8  raw text matches a configured table-row pattern
     * - +5  normalized haystack contains both a required primary term and a
     *       required context term
     *
     * @param array{asks_indicator:bool} $detailFocus
     */
    private function scoreExactDocumentDetailFocus(array $detailFocus, string $normalizedHaystack, string $rawText): float
    {
        if (!$detailFocus['asks_indicator']) {
            return 0.0;
        }

        $config = $this->retrieverConfig;
        $bonus = 0.0;

        $bonus += $this->matchesAnyPattern($rawText, $config->exactSelectionIndicatorTableHeadingPatterns())
            ? 14.0
            : 0.0;
        $bonus += $this->matchesAnyPattern($rawText, $config->exactSelectionIndicatorTableHeaderPatterns())
            ? 10.0
            : 0.0;
        $bonus += $this->matchesAnyPattern($rawText, $config->exactSelectionIndicatorTableRowPatterns())
            ? 8.0
            : 0.0;

        $mentionsPrimaryTerm = $this->containsAnyConfiguredPhrase(
            $normalizedHaystack,
            $config->exactSelectionIndicatorTableRequiredPrimaryTerms()
        );

        if (
            $mentionsPrimaryTerm
            && $this->containsAnyConfiguredPhrase(
                $normalizedHaystack,
                $config->exactSelectionIndicatorTableRequiredContextTerms()
            )
        ) {
            $bonus += 5.0;
        }

        return $bonus;
    }

    /**
     * Tells whether the token is one of the configured exact-detail tokens
     * (strict string comparison).
     */
    private function isExactDetailToken(string $token): bool
    {
        $detailTokens = $this->retrieverConfig->exactDetailTokens();

        return in_array($token, $detailTokens, true);
    }

    /**
     * Tells whether the token is one of the configured generic tokens that
     * are ignored during exact-document selection (strict string comparison).
     */
    private function isGenericExactSelectionToken(string $token): bool
    {
        $genericTokens = $this->retrieverConfig->genericExactSelectionTokens();

        return in_array($token, $genericTokens, true);
    }

    /**
     * Checks whether the value matches at least one regular expression.
     *
     * Patterns are tried in order and evaluation stops at the first match.
     * An empty pattern list never matches.
     *
     * @param string[] $patterns Complete PCRE patterns, delimiters included.
     */
    private function matchesAnyPattern(string $value, array $patterns): bool
    {
        foreach ($patterns as $regex) {
            $matched = preg_match($regex, $value) === 1;

            if (!$matched) {
                continue;
            }

            return true;
        }

        return false;
    }

    /**
     * Checks whether any needle is present as a key of the token set.
     *
     * @param array<string, bool> $tokens Token set keyed by token.
     * @param string[] $needles
     */
    private function containsAnyConfiguredToken(array $tokens, array $needles): bool
    {
        foreach ($needles as $candidate) {
            if (!isset($tokens[$candidate])) {
                continue;
            }

            return true;
        }

        return false;
    }

    /**
     * Checks whether the haystack contains at least one of the phrases.
     *
     * Empty phrases are ignored, because an empty needle would match any
     * haystack.
     *
     * @param string[] $phrases
     */
    private function containsAnyConfiguredPhrase(string $haystack, array $phrases): bool
    {
        foreach ($phrases as $needle) {
            if ($needle === '') {
                continue;
            }

            if (str_contains($haystack, $needle)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Produces synthetic reciprocal-rank scores for exact-title fast-path selections.
     *
     * The first chunk id receives 1.0, the second 0.5, the third 1/3 and so on.
     * These scores are only used for debug output consistency.
     *
     * @param string[] $chunkIds
     * @return array<string,float>
     */
    private function buildExactDocumentScores(array $chunkIds): array
    {
        $scoreByChunkId = [];
        $position = 0;

        foreach ($chunkIds as $chunkId) {
            // Position is counted independently of the input keys.
            $scoreByChunkId[(string)$chunkId] = 1.0 / (1 + $position);
            $position++;
        }

        return $scoreByChunkId;
    }

    /**
     * Selection strategy for list-style queries.
     *
     * Goal:
     * - avoid near-identical chunks
     * - prefer diverse list entries
     * - stop once the configured limit is reached
     *
     * Two chunks count as duplicates when their trimmed text is equal after
     * collapsing whitespace runs and lower-casing.
     *
     * @param array<int|string> $chunkIds ranked chunk ids, best first
     * @param array<int|string, mixed> $rows chunk rows keyed by chunk id; only the "text" field is read
     * @param int $limit maximum number of ids to return; values <= 0 yield an empty list
     * @return string[] selected chunk ids in ranking order
     */
    private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
    {
        if ($limit <= 0) {
            return [];
        }

        $seen = [];
        $out = [];

        foreach ($chunkIds as $id) {
            if (!isset($rows[$id]['text'])) {
                continue;
            }

            $chunk = trim((string)$rows[$id]['text']);
            if ($chunk === '') {
                continue;
            }

            // preg_replace() returns null on a PCRE failure (e.g. malformed UTF-8).
            // Falling back to the raw chunk keeps such chunks distinguishable instead
            // of hashing all of them to the same empty-string key.
            $collapsed = preg_replace('/\s+/u', ' ', $chunk) ?? $chunk;
            $key = md5(mb_strtolower($collapsed, 'UTF-8'));

            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = (string)$id;

            if (count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Selection strategy for sales-oriented queries.
     *
     * The strategies are tried in this order; the first one that yields at
     * least one chunk id wins:
     *
     * - sales_product_dominant_document:
     *   the prompt resolves to one strongly focused product document; only
     *   chunks of that document are returned, remaining slots stay empty
     *
     * - sales_dominant_document:
     *   one document dominates the top hit window; its coherent chunk window
     *   is kept and the remaining slots are filled via the spread rules
     *
     * - sales_spread:
     *   default mode that spreads chunks across documents and enforces
     *   distance between chunk positions of the same document
     *
     * @return array{ids: string[], mode: string}
     */
    private function selectSalesChunkIds(string $prompt, array $chunkIds, array $rows, int $limit): array
    {
        $focusedDocId = $this->resolveFocusedSalesDocumentId($prompt, $chunkIds, $rows);
        $focusedIds = $focusedDocId === null
            ? []
            : $this->selectFocusedProductChunkIds($focusedDocId, $chunkIds, $rows, $limit);

        if ($focusedIds !== []) {
            return ['ids' => $focusedIds, 'mode' => 'sales_product_dominant_document'];
        }

        $dominantDocId = $this->detectDominantTopDocument($chunkIds, $rows);
        $dominantIds = $dominantDocId === null
            ? []
            : $this->selectDominantDocumentChunkIds($dominantDocId, $chunkIds, $rows, $limit);

        if ($dominantIds !== []) {
            $filledIds = $this->fillRemainingSalesChunkIds($dominantIds, $chunkIds, $rows, $limit);

            return ['ids' => $filledIds, 'mode' => 'sales_dominant_document'];
        }

        $spreadIds = $this->selectSalesChunkIdsSpread($chunkIds, $rows, $limit);

        return ['ids' => $spreadIds, 'mode' => 'sales_spread'];
    }


    /**
     * Resolves a strongly focused product document before normal sales spreading.
     *
     * Every distinct document inside the configured top window is scored once,
     * using the first of its chunks that yields a non-empty title. The best
     * document is accepted only when its score reaches the configured minimum
     * and leads the runner-up by the configured gap.
     *
     * This protects against classic false positives where neighbouring products,
     * indicators or safety sheets outrank the actually requested device.
     *
     * @return string|null the focused document id, or null when no document is clearly focused
     */
    private function resolveFocusedSalesDocumentId(string $prompt, array $chunkIds, array $rows): ?string
    {
        $profile = $this->buildPromptProductProfile($prompt);

        if ($profile['anchors'] === []) {
            return null;
        }

        $scoredDocuments = [];
        $visitedDocuments = [];

        foreach (array_slice($chunkIds, 0, $this->retrieverConfig->focusedProductWindow()) as $rank => $chunkId) {
            $row = $rows[$chunkId] ?? null;
            $documentId = is_array($row) ? ($row['document_id'] ?? null) : null;

            if (!is_string($documentId) || $documentId === '' || isset($visitedDocuments[$documentId])) {
                continue;
            }

            $title = $this->extractDocumentTitle($row);

            // A chunk without a title does not mark its document as visited,
            // so a later chunk of the same document may still be scored.
            if ($title !== '') {
                $visitedDocuments[$documentId] = true;
                $scoredDocuments[] = [
                    'document_id' => $documentId,
                    'score' => $this->scoreFocusedProductCandidate($profile, $title, $row, $rank),
                ];
            }
        }

        if ($scoredDocuments === []) {
            return null;
        }

        // Highest score first; equal scores are ordered by document id for determinism.
        usort(
            $scoredDocuments,
            static fn (array $left, array $right): int => $left['score'] === $right['score']
                ? strcmp($left['document_id'], $right['document_id'])
                : $right['score'] <=> $left['score']
        );

        $bestScore = (float)$scoredDocuments[0]['score'];
        // With a single candidate the gap is infinite, so only the minimum score decides.
        $runnerUpScore = (float)($scoredDocuments[1]['score'] ?? -INF);

        if ($bestScore < $this->retrieverConfig->focusedProductMinScore()) {
            return null;
        }

        if ($bestScore - $runnerUpScore < $this->retrieverConfig->focusedProductMinGap()) {
            return null;
        }

        return $scoredDocuments[0]['document_id'];
    }

    /**
     * Derives the prompt profile that drives focused product dominance decisions.
     *
     * Token classification (generic product tokens are dropped first):
     * - tokens containing a digit become anchor, family and number tokens
     * - configured short model tokens become anchor and family tokens
     * - any other token of at least three characters becomes an anchor and,
     *   when it is a family descriptor, also a family token
     *
     * A device is assumed to be requested when a device word is present or
     * when neither reagent, document nor safety words are present.
     *
     * @return array{
     *     normalized:string,
     *     anchors:string[],
     *     family_tokens:string[],
     *     number_tokens:string[],
     *     asks_reagent:bool,
     *     asks_document:bool,
     *     asks_safety:bool,
     *     asks_device:bool
     * }
     */
    private function buildPromptProductProfile(string $prompt): array
    {
        $normalizedPrompt = $this->normalizeText($prompt);
        $promptTokens = $this->tokenizeText($normalizedPrompt);

        $reagentVocabulary = $this->retrieverConfig->looksLikeReagentWords();
        $documentVocabulary = $this->retrieverConfig->looksLikeDocumentWords();
        $safetyVocabulary = $this->retrieverConfig->looksLikeSafetyWords();
        $deviceVocabulary = $this->retrieverConfig->looksLikeDeviceWords();

        $wantsReagent = $this->containsAnyToken($promptTokens, $reagentVocabulary);
        $wantsDocument = $this->containsAnyToken($promptTokens, $documentVocabulary);
        $wantsSafety = $this->containsAnyToken($promptTokens, $safetyVocabulary);
        $wantsDevice = $this->containsAnyToken($promptTokens, $deviceVocabulary)
            || !($wantsReagent || $wantsDocument || $wantsSafety);

        $anchorList = [];
        $familyList = [];
        $numberList = [];

        foreach ($promptTokens as $token) {
            if ($this->isGenericProductToken($token)) {
                continue;
            }

            $hasDigit = preg_match('/\d/u', $token) === 1;

            if ($hasDigit || $this->isImportantShortModelToken($token)) {
                $anchorList[] = $token;
                $familyList[] = $token;

                if ($hasDigit) {
                    $numberList[] = $token;
                }

                continue;
            }

            if (mb_strlen($token, 'UTF-8') < 3) {
                continue;
            }

            $anchorList[] = $token;

            if ($this->isFamilyDescriptorToken($token)) {
                $familyList[] = $token;
            }
        }

        return [
            'normalized' => $normalizedPrompt,
            'anchors' => array_values(array_unique($anchorList)),
            'family_tokens' => array_values(array_unique($familyList)),
            'number_tokens' => array_values(array_unique($numberList)),
            'asks_reagent' => $wantsReagent,
            'asks_document' => $wantsDocument,
            'asks_safety' => $wantsSafety,
            'asks_device' => $wantsDevice,
        ];
    }

    /**
     * Scores one candidate document for focused product selection.
     *
     * Score components, applied in this order:
     * - rank bonus: 5 minus the rank, never below 0
     * - +24 when the whole normalized title occurs in the prompt on word boundaries
     * - per anchor: bonus when found in the title, penalty otherwise
     * - per number token and per family token: bonus when found, penalty otherwise
     * - document-type adjustments for reagent/accessory and safety documents
     * - -10 when no anchor matched at all
     *
     * @param array $promptProfile profile as returned by buildPromptProductProfile()
     * @param string $title document title of the candidate
     * @param array $row chunk row of the candidate; its "text" field is used for type heuristics
     * @param int $rank zero-based position of the chunk inside the inspected window
     */
    private function scoreFocusedProductCandidate(array $promptProfile, string $title, array $row, int $rank): float
    {
        $normalizedTitle = $this->normalizeText($title);
        $titleTokenSet = array_fill_keys($this->tokenizeText($normalizedTitle), true);
        $normalizedBody = $this->normalizeText((string)($row['text'] ?? ''));
        $paddedTitle = ' ' . $normalizedTitle . ' ';

        // Both classifiers are evaluated lazily, only where a rule actually needs them.
        $isReagentDoc = fn (): bool => $this->looksLikeReagentOrAccessoryDocument($row, $normalizedTitle, $normalizedBody);
        $isSafetyDoc = fn (): bool => $this->looksLikeSafetyDocument($row, $normalizedTitle, $normalizedBody);

        $score = max(0.0, 5.0 - $rank);

        if ($normalizedTitle !== '' && str_contains(' ' . $promptProfile['normalized'] . ' ', $paddedTitle)) {
            $score += 24.0;
        }

        $anchorHits = 0;
        foreach ($promptProfile['anchors'] as $anchor) {
            if (isset($titleTokenSet[$anchor])) {
                $anchorHits++;
                $score += $this->isImportantShortModelToken($anchor) ? 4.0 : 3.5;
            } elseif (str_contains($paddedTitle, ' ' . $anchor . ' ')) {
                // NOTE(review): for single-token anchors this looks equivalent to the
                // token lookup above, so it presumably only matters for multi-word
                // anchors - confirm against the callers.
                $anchorHits++;
                $score += 3.0;
            } else {
                $score -= $this->isFamilyDescriptorToken($anchor) ? 3.5 : 2.0;
            }
        }

        foreach ($promptProfile['number_tokens'] as $numberToken) {
            $score += isset($titleTokenSet[$numberToken]) ? 4.0 : -5.0;
        }

        foreach ($promptProfile['family_tokens'] as $familyToken) {
            $score += isset($titleTokenSet[$familyToken]) ? 4.0 : -4.5;
        }

        if ($promptProfile['asks_device']) {
            // A device request must not be answered from consumable or safety sheets.
            if ($isReagentDoc()) {
                $score -= 12.0;
            }

            if ($isSafetyDoc()) {
                $score -= 8.0;
            }
        }

        if ($promptProfile['asks_reagent'] && $isReagentDoc()) {
            $score += 6.0;
        }

        if (($promptProfile['asks_document'] || $promptProfile['asks_safety']) && $isSafetyDoc()) {
            $score += 4.0;
        }

        if ($anchorHits === 0) {
            $score -= 10.0;
        }

        return $score;
    }

    /**
     * Returns chunks of the focused product document only.
     *
     * In this strict mode the remaining slots are intentionally not filled with
     * neighbouring products, because that would reintroduce the original bug.
     *
     * The request limit is first capped by the configured focused-product maximum;
     * the delegated dominant-document selection then applies its own cap as well.
     */
    private function selectFocusedProductChunkIds(
        string $documentId,
        array  $chunkIds,
        array  $rows,
        int    $limit
    ): array
    {
        $focusedLimit = min($limit, $this->retrieverConfig->focusedProductMaxChunks());

        return $this->selectDominantDocumentChunkIds($documentId, $chunkIds, $rows, $focusedLimit);
    }

    /**
     * Detects whether one document clearly dominates the first ranked window.
     *
     * This is especially useful for product-sheet style documents where
     * several adjacent chunks belong together and should be passed to the model
     * as one coherent factual block.
     *
     * A document dominates when
     * - it owns at least the configured minimum number of hits in the window, or
     * - it owns at least two hits and both the first and the second usable hit.
     *
     * Chunks without text or without a non-empty string document id are ignored.
     *
     * @param array<int|string> $chunkIds ranked chunk ids, best first
     * @param array<int|string, mixed> $rows chunk rows keyed by chunk id
     * @return string|null the dominant document id, or null when no document dominates
     */
    private function detectDominantTopDocument(array $chunkIds, array $rows): ?string
    {
        $docWindow = [];

        foreach (array_slice($chunkIds, 0, $this->retrieverConfig->dominantDocWindow()) as $chunkId) {
            if (!isset($rows[$chunkId]['text'])) {
                continue;
            }

            $text = trim((string)$rows[$chunkId]['text']);
            $docId = $rows[$chunkId]['document_id'] ?? null;

            if ($text === '' || !is_string($docId) || $docId === '') {
                continue;
            }

            $docWindow[] = $docId;
        }

        if (count($docWindow) < 2) {
            return null;
        }

        $counts = array_count_values($docWindow);
        // arsort() is stable, so on equal counts the document seen first stays in front.
        arsort($counts);

        // PHP stores numeric-string array keys such as "42" as integers. Cast the key
        // back to string, otherwise documents with numeric ids could never be reported
        // as dominant and the strict comparisons below would always fail for them.
        $dominantDocId = (string)array_key_first($counts);
        $dominantCount = (int)($counts[$dominantDocId] ?? 0);

        if ($dominantCount >= $this->retrieverConfig->dominantDocMinHits()) {
            return $dominantDocId;
        }

        // Fallback rule: the two best usable hits both belong to the same document.
        if ($dominantCount >= 2 && $docWindow[0] === $dominantDocId && $docWindow[1] === $dominantDocId) {
            return $dominantDocId;
        }

        return null;
    }

    /**
     * Selects a coherent chunk window from the dominant document.
     *
     * Strategy:
     * - use the highest-ranked chunk of that document that carries an integer
     *   chunk index as anchor
     * - prefer neighbouring chunk indices around that anchor
     * - sort the final selection by chunk index for prompt coherence
     *
     * At most min($limit, configured dominant-document maximum) chunks are returned.
     * Chunks without text, with empty text or from other documents are ignored.
     *
     * @param string $documentId document whose chunks should be selected
     * @param array $chunkIds ranked chunk ids, best first
     * @param array $rows chunk rows keyed by chunk id ("text", "document_id", "chunk_index" are read)
     * @param int $limit upper bound requested by the caller
     * @return string[] selected chunk ids, ordered by chunk index (chunks without index last)
     */
    private function selectDominantDocumentChunkIds(
        string $documentId,
        array  $chunkIds,
        array  $rows,
        int    $limit
    ): array
    {
        $docHits = [];
        $anchorChunkIndex = null;

        // Collect every usable hit of the requested document together with its rank.
        foreach ($chunkIds as $rank => $chunkId) {
            if (!isset($rows[$chunkId]['text'])) {
                continue;
            }

            $text = trim((string)$rows[$chunkId]['text']);
            $docId = $rows[$chunkId]['document_id'] ?? null;

            if ($text === '' || $docId !== $documentId) {
                continue;
            }

            // Only integer chunk indices are trusted; anything else is treated as unknown.
            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
            $chunkIndex = is_int($chunkIndex) ? $chunkIndex : null;

            // The first hit with a known index becomes the anchor position.
            if ($anchorChunkIndex === null && $chunkIndex !== null) {
                $anchorChunkIndex = $chunkIndex;
            }

            $docHits[] = [
                'id' => (string)$chunkId,
                'rank' => $rank,
                'chunk_index' => $chunkIndex,
            ];
        }

        if ($docHits === []) {
            return [];
        }

        $maxFromDoc = min($limit, $this->retrieverConfig->dominantDocMaxChunks());

        if ($anchorChunkIndex !== null) {
            // Closest to the anchor first; hits without index go last; rank breaks ties.
            usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int {
                $aDistance = $a['chunk_index'] === null ? PHP_INT_MAX : abs($a['chunk_index'] - $anchorChunkIndex);
                $bDistance = $b['chunk_index'] === null ? PHP_INT_MAX : abs($b['chunk_index'] - $anchorChunkIndex);

                if ($aDistance !== $bDistance) {
                    return $aDistance <=> $bDistance;
                }

                return $a['rank'] <=> $b['rank'];
            });
        } else {
            // No hit carries a chunk index: fall back to plain ranking order.
            usort($docHits, static fn(array $a, array $b): int => $a['rank'] <=> $b['rank']);
        }

        $selected = array_slice($docHits, 0, $maxFromDoc);

        // Re-order the chosen hits by chunk index; hits without index come last, rank breaks ties.
        usort($selected, static function (array $a, array $b): int {
            $aIndex = $a['chunk_index'];
            $bIndex = $b['chunk_index'];

            if ($aIndex === null && $bIndex === null) {
                return $a['rank'] <=> $b['rank'];
            }

            if ($aIndex === null) {
                return 1;
            }

            if ($bIndex === null) {
                return -1;
            }

            if ($aIndex !== $bIndex) {
                return $aIndex <=> $bIndex;
            }

            return $a['rank'] <=> $b['rank'];
        });

        return array_map(
            static fn(array $row): string => $row['id'],
            $selected
        );
    }

    /**
     * Fills the remaining sales slots after a dominant document selection.
     *
     * The already selected dominant-document chunks stay fixed.
     * Remaining slots are filled with the normal spread strategy: the configured
     * per-document maximum and minimum chunk distance both take the seed chunks
     * into account.
     *
     * @param array $seedChunkIds chunk ids that are already selected and must be kept
     * @param array $chunkIds ranked chunk ids, best first
     * @param array $rows chunk rows keyed by chunk id ("text", "document_id", "chunk_index" are read)
     * @param int $limit maximum number of ids to return
     * @return string[] seed ids followed by the added ids in ranking order
     */
    private function fillRemainingSalesChunkIds(
        array $seedChunkIds,
        array $chunkIds,
        array $rows,
        int   $limit
    ): array
    {
        $out = array_values(array_unique(array_map('strval', $seedChunkIds)));

        // The seeds alone already satisfy the limit: nothing to fill.
        if (count($out) >= $limit) {
            return array_slice($out, 0, $limit);
        }

        $selected = array_fill_keys($out, true);
        $docCounter = [];
        $docChunkPositions = [];

        // Register the seeds so that the per-document cap and distance rule respect them.
        foreach ($out as $chunkId) {
            $docId = $rows[$chunkId]['document_id'] ?? null;
            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;

            if (is_string($docId) && $docId !== '') {
                $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;

                if (is_int($chunkIndex)) {
                    $docChunkPositions[$docId][] = $chunkIndex;
                }
            }
        }

        foreach ($chunkIds as $chunkId) {
            if (isset($selected[$chunkId])) {
                continue;
            }

            if (!isset($rows[$chunkId]['text'])) {
                continue;
            }

            $docId = $rows[$chunkId]['document_id'] ?? null;
            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;

            if (!is_string($docId) || $docId === '') {
                continue;
            }

            // Per-document cap.
            if (($docCounter[$docId] ?? 0) >= $this->retrieverConfig->maxChunksPerDoc()) {
                continue;
            }

            // Skip chunks that sit too close to an already selected chunk of the same document.
            if (is_int($chunkIndex)) {
                foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
                    if (abs($prevIdx - $chunkIndex) < $this->retrieverConfig->minChunkDistance()) {
                        continue 2;
                    }
                }
            }

            $text = trim((string)$rows[$chunkId]['text']);
            if ($text === '') {
                continue;
            }

            // The position is recorded only after the chunk has actually been accepted.
            $out[] = (string)$chunkId;
            $selected[$chunkId] = true;
            $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;

            if (is_int($chunkIndex)) {
                $docChunkPositions[$docId][] = $chunkIndex;
            }

            if (count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Default spread selection for sales-oriented queries.
     *
     * Goal:
     * - avoid overloading the result with chunks from the same document
     * - avoid chunks that are too close to each other in the same document
     * - preserve top-ranked relevance while improving contextual spread
     *
     * Chunks without text, with empty text or without a non-empty string
     * document id are ignored and never influence the distance rule.
     *
     * @param array<int|string> $chunkIds ranked chunk ids, best first
     * @param array<int|string, mixed> $rows chunk rows keyed by chunk id
     * @param int $limit maximum number of ids to return; values <= 0 yield an empty list
     * @return string[] selected chunk ids in ranking order
     */
    private function selectSalesChunkIdsSpread(array $chunkIds, array $rows, int $limit): array
    {
        if ($limit <= 0) {
            return [];
        }

        $maxPerDoc = $this->retrieverConfig->maxChunksPerDoc();
        $minDistance = $this->retrieverConfig->minChunkDistance();

        $out = [];
        $docCounter = [];
        $docChunkPositions = [];

        foreach ($chunkIds as $chunkId) {
            if (!isset($rows[$chunkId]['text'])) {
                continue;
            }

            // Reject empty chunks before any bookkeeping, so that a chunk which is
            // never selected cannot block its neighbours through the distance rule.
            if (trim((string)$rows[$chunkId]['text']) === '') {
                continue;
            }

            $docId = $rows[$chunkId]['document_id'] ?? null;
            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;

            if (!is_string($docId) || $docId === '') {
                continue;
            }

            if (($docCounter[$docId] ?? 0) >= $maxPerDoc) {
                continue;
            }

            if (is_int($chunkIndex)) {
                foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
                    if (abs($prevIdx - $chunkIndex) < $minDistance) {
                        continue 2;
                    }
                }
            }

            $out[] = (string)$chunkId;
            $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;

            if (is_int($chunkIndex)) {
                $docChunkPositions[$docId][] = $chunkIndex;
            }

            if (count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }


    /**
     * Reads the document title of a chunk row.
     *
     * The metadata title wins when it is a non-blank string. Otherwise the first
     * "# Produkt Titel:" heading inside the chunk text is used. An empty string
     * is returned when neither source provides a title.
     */
    private function extractDocumentTitle(array $row): string
    {
        $fromMetadata = $row['metadata']['document_title'] ?? null;

        if (is_string($fromMetadata)) {
            $trimmedMetadataTitle = trim($fromMetadata);

            if ($trimmedMetadataTitle !== '') {
                return $trimmedMetadataTitle;
            }
        }

        $body = (string)($row['text'] ?? '');

        if ($body === '') {
            return '';
        }

        $headingFound = preg_match('/^#\s*Produkt\s+Titel:\s*`?([^`\n]+)`?/imu', $body, $hit) === 1;

        return $headingFound ? trim((string)($hit[1] ?? '')) : '';
    }

    /**
     * Normalizes text for token-safe product comparisons.
     *
     * The value is trimmed and lower-cased, separators and every other
     * non-letter/non-digit character become spaces, and whitespace runs
     * collapse into one space. If a regex step fails, its input is kept.
     */
    private function normalizeText(string $value): string
    {
        $normalized = strtr(mb_strtolower(trim($value), 'UTF-8'), '-/_', '   ');

        foreach (['/[^\p{L}\p{N}\s]+/u', '/\s+/u'] as $pattern) {
            $normalized = preg_replace($pattern, ' ', $normalized) ?? $normalized;
        }

        return trim($normalized);
    }

    /**
     * Splits normalized text into its whitespace-separated tokens.
     *
     * An empty input, a failed split or a split without tokens yields an empty list.
     *
     * @return string[]
     */
    private function tokenizeText(string $value): array
    {
        if ($value !== '') {
            $parts = preg_split('/\s+/u', $value, -1, PREG_SPLIT_NO_EMPTY);

            if ($parts !== false && $parts !== []) {
                return $parts;
            }
        }

        return [];
    }

    /**
     * Reports whether at least one needle occurs in the token list.
     *
     * Empty token or needle lists never match.
     */
    private function containsAnyToken(array $tokens, array $needles): bool
    {
        if ($tokens === [] || $needles === []) {
            return false;
        }

        // One lookup table for all needles.
        $knownTokens = array_fill_keys($tokens, true);

        foreach ($needles as $candidate) {
            if (!isset($knownTokens[$candidate])) {
                continue;
            }

            return true;
        }

        return false;
    }

    /**
     * Generic product words must not drive product dominance decisions.
     *
     * Uses the same strict list lookup as the other configured token checks
     * instead of building a throw-away lookup map for a single probe on every
     * call; this method runs once per prompt token.
     *
     * Assumes the configured generic product tokens are strings, like the other
     * configured token lists - TODO confirm against the config class.
     */
    private function isGenericProductToken(string $token): bool
    {
        return in_array($token, $this->retrieverConfig->genericProductTokens(), true);
    }

    /**
     * Short technical model codes like TH or TC are allowed as anchors.
     *
     * The token must be identical to one of the configured short model tokens.
     */
    private function isImportantShortModelToken(string $token): bool
    {
        return in_array($token, $this->retrieverConfig->importantShortModelTokens(), true);
    }

    /**
     * Family descriptors are strong product differentiators.
     *
     * A token qualifies when it is a configured family descriptor, a configured
     * short model token, or contains at least one digit.
     */
    private function isFamilyDescriptorToken(string $token): bool
    {
        if (in_array($token, $this->retrieverConfig->familyDescriptorTokens(), true)) {
            return true;
        }

        if ($this->isImportantShortModelToken($token)) {
            return true;
        }

        return preg_match('/\d/u', $token) === 1;
    }

    /**
     * Heuristic classifier for indicator, reagent, accessory and spare-part documents.
     *
     * A document matches when its normalized title or text contains at least one
     * of the configured reagent tokens as a substring. Empty configured tokens
     * are ignored: an empty needle is contained in every string and would
     * otherwise classify every document as a match.
     *
     * @param array $row chunk row; currently unused, kept for signature compatibility
     * @param string $titleNormalized normalized document title
     * @param string $textNormalized normalized chunk text
     */
    private function looksLikeReagentOrAccessoryDocument(array $row, string $titleNormalized, string $textNormalized): bool
    {
        $haystack = trim($titleNormalized . ' ' . $textNormalized);

        if ($haystack === '') {
            return false;
        }

        // The shared phrase helper skips empty needles.
        return $this->containsAnyConfiguredPhrase($haystack, $this->retrieverConfig->looksLikeReagentTokens());
    }

    /**
     * Heuristic classifier for safety-style documents.
     *
     * A document matches when its normalized title or text contains at least one
     * of the configured safety-document markers as a substring. Empty configured
     * markers are ignored: an empty needle is contained in every string and would
     * otherwise classify every document as a match.
     *
     * @param array $row chunk row; currently unused, kept for signature compatibility
     * @param string $titleNormalized normalized document title
     * @param string $textNormalized normalized chunk text
     */
    private function looksLikeSafetyDocument(array $row, string $titleNormalized, string $textNormalized): bool
    {
        $haystack = trim($titleNormalized . ' ' . $textNormalized);

        if ($haystack === '') {
            return false;
        }

        // The shared phrase helper skips empty needles.
        return $this->containsAnyConfiguredPhrase($haystack, $this->retrieverConfig->looksLikeSafetyDocs());
    }

    /**
     * Maps the selected chunk ids to their trimmed texts.
     *
     * Ids without a row, without a text field or with blank text are dropped;
     * the order of the remaining texts follows the order of the ids.
     */
    private function collectTextsFromIds(array $chunkIds, array $rows): array
    {
        $texts = [];

        foreach ($chunkIds as $chunkId) {
            $text = trim((string)($rows[$chunkId]['text'] ?? ''));

            if ($text === '') {
                continue;
            }

            $texts[] = $text;
        }

        return $texts;
    }
}