requireConfig();
        $result = $this->execute($prompt, $config, false);

        if ($result['catalogBlock'] !== null) {
            return [$result['catalogBlock']];
        }

        if ($result['selectedChunkIds'] === []) {
            return [];
        }

        return $this->collectTextsFromIds(
            $result['selectedChunkIds'],
            $result['rows']
        );
    }

    /**
     * Returns the retrieval result as a list of debug rows, one per selected chunk.
     *
     * Each row carries the rank, chunk/document identifiers, the raw vector score
     * and fused RRF score (null when not available for that chunk), the threshold,
     * the detected intent and route, the entity label, the list-query flag, the
     * selection mode and the trimmed chunk text.
     *
     * A direct catalog answer is reported as one synthetic row with the chunk id
     * "__CATALOG_LIST__". Selected ids that have no row are skipped and do not
     * consume a rank.
     *
     * @param ModelGenerationConfig|null $config falls back to the active config when null
     *
     * @return array<int, array<string, mixed>>
     *
     * @throws Exception
     */
    public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
    {
        $config ??= $this->requireConfig();
        $result = $this->execute($prompt, $config, true);

        if ($result['catalogBlock'] !== null) {
            return [[
                'rank' => 1,
                'chunk_id' => '__CATALOG_LIST__',
                'document_id' => null,
                'chunk_index' => null,
                'raw_score' => null,
                'rrf_score' => null,
                'threshold' => 0.0,
                'intent' => $result['intent'],
                'route' => $result['route'],
                'entity_label' => $result['entityLabel'],
                'is_list_query' => true,
                'selection_mode' => 'catalog_list',
                'text' => $result['catalogBlock'],
            ]];
        }

        $debugRows = [];

        foreach ($result['selectedChunkIds'] as $chunkId) {
            $row = $result['rows'][$chunkId] ?? null;
            if ($row === null) {
                continue;
            }

            $debugRows[] = [
                'rank' => count($debugRows) + 1,
                'chunk_id' => $chunkId,
                'document_id' => $row['document_id'] ?? null,
                'chunk_index' => $row['chunk_index'] ?? null,
                'raw_score' => $result['rawScores'][$chunkId] ?? null,
                'rrf_score' => $result['rrfScores'][$chunkId] ?? null,
                'threshold' => $result['threshold'],
                'intent' => $result['intent'],
                'route' => $result['route'],
                'entity_label' => $result['entityLabel'],
                'is_list_query' => $result['isListQuery'],
                'selection_mode' => $result['selectionMode'],
                // A row without a text key yields an empty string instead of a warning.
                'text' => trim((string)($row['text'] ?? '')),
            ];
        }

        return $debugRows;
    }

    // =========================================================
    // CENTRAL ORCHESTRATION
    // =========================================================

    /**
     * Central orchestration entrypoint.
     *
     * Pipeline:
     * 1. Detect catalog entity and sales intent
     * 2. Resolve route
     * 3. If route is a catalog list route, try direct catalog output
     * 4. If prompt matches one exact document title, use exact-document fast path
     * 5. Otherwise, run the normal hybrid retrieval core
     * 6. Select final chunk ids depending on query type
     *
     * The result always has the keys route, entityLabel, intent, isListQuery,
     * selectionMode, selectedChunkIds, rows, rrfScores, rawScores, threshold
     * and catalogBlock.
     *
     * @return array<string, mixed>
     *
     * @throws Exception
     */
    private function execute(
        string $prompt,
        ModelGenerationConfig $config,
        bool $withScores
    ): array {
        $entityLabel = $this->catalogIntent->detect($prompt);
        $salesIntent = $this->detectSalesIntent($prompt);
        $route = $this->routeResolver->resolve($salesIntent, $entityLabel);

        // Fields shared by every result shape; the union operator keeps them first.
        $base = [
            'route' => $route,
            'entityLabel' => $entityLabel,
            'intent' => $salesIntent,
        ];

        if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
            $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);

            if ($catalogBlock !== null) {
                return $base + [
                    'isListQuery' => true,
                    'selectionMode' => 'catalog_list',
                    'selectedChunkIds' => [],
                    'rows' => [],
                    'rrfScores' => [],
                    'rawScores' => [],
                    'threshold' => 0.0,
                    'catalogBlock' => trim($catalogBlock),
                ];
            }
        }

        $exactDocumentMatch = $this->lookup->findBestExactDocumentByPrompt($prompt);

        if ($exactDocumentMatch !== null) {
            // Consumers of "rows" look rows up by chunk id, while the selector reads
            // the id from each row's own "chunk_id" field. Re-key the rows by that
            // field so both agree regardless of how the lookup keyed its result.
            $exactRows = [];
            foreach ($exactDocumentMatch['rows'] as $row) {
                $rowChunkId = is_array($row) ? ($row['chunk_id'] ?? null) : null;
                if (is_string($rowChunkId) && $rowChunkId !== '') {
                    $exactRows[$rowChunkId] = $row;
                }
            }

            $selectedChunkIds = $this->selectExactDocumentChunkIds(
                $exactRows,
                max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS))
            );

            if ($selectedChunkIds !== []) {
                return $base + [
                    'isListQuery' => false,
                    'selectionMode' => 'exact_document_title',
                    'selectedChunkIds' => $selectedChunkIds,
                    'rows' => $exactRows,
                    'rrfScores' => $this->buildExactDocumentScores($selectedChunkIds),
                    'rawScores' => [],
                    'threshold' => 1.0,
                    'catalogBlock' => null,
                ];
            }
        }

        $core = $this->runCore($prompt, $config, $withScores, $salesIntent);

        if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
            return $base + [
                'isListQuery' => $core['is_list_query'],
                'selectionMode' => null,
                'selectedChunkIds' => [],
                'rows' => [],
                'rrfScores' => [],
                'rawScores' => [],
                'threshold' => $core['threshold'],
                'catalogBlock' => null,
            ];
        }

        if ($core['is_list_query']) {
            $selectedChunkIds = $this->selectListChunkIds(
                $core['ranked_chunk_ids'],
                $core['rows'],
                $core['limit']
            );
            $selectionMode = 'list_deduplicated';
        } else {
            $salesSelection = $this->selectSalesChunkIds(
                $prompt,
                $core['ranked_chunk_ids'],
                $core['rows'],
                $core['limit']
            );
            $selectedChunkIds = $salesSelection['ids'];
            $selectionMode = $salesSelection['mode'];
        }

        return $base + [
            'isListQuery' => $core['is_list_query'],
            'selectionMode' => $selectionMode,
            'selectedChunkIds' => $selectedChunkIds,
            'rows' => $core['rows'],
            'rrfScores' => $core['rrf_scores'],
            'rawScores' => $core['raw_scores'],
            'threshold' => $core['threshold'],
            'catalogBlock' => null,
        ];
    }

    // =========================================================
    // CORE PIPELINE
    // =========================================================

    /**
     * Executes the actual hybrid retrieval logic.
*
     * Steps:
     * - derive limits from config within hard safety caps
     * - detect whether the prompt is a "list query"
     * - clean and enrich the prompt
     * - compute threshold + vector topK based on intent/query type
     * - route query into candidate document ids via tag routing
     * - run global and optional scoped vector search
     * - fuse hits
     * - resolve chunk ids to chunk rows
     *
     * The result always has the keys limit, is_list_query, threshold,
     * ranked_chunk_ids, rows, rrf_scores and raw_scores. ranked_chunk_ids is
     * ordered by descending fused score and always contains strings.
     *
     * @return array<string, mixed>
     *
     * @throws Exception
     */
    private function runCore(
        string $prompt,
        ModelGenerationConfig $config,
        bool $withScores,
        string $salesIntent
    ): array {
        $limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS));
        $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));

        $isListQuery = $this->intentLite->isListQuery($prompt);

        $cleanQuery = $this->queryCleaner->clean($prompt);
        $cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery);

        // Every "nothing found" exit shares this shape; only threshold and raw scores vary.
        $emptyResult = static fn (float|int $threshold, array $rawScores = []): array => [
            'limit' => $limit,
            'is_list_query' => $isListQuery,
            'threshold' => $threshold,
            'ranked_chunk_ids' => [],
            'rows' => [],
            'rrf_scores' => [],
            'raw_scores' => $rawScores,
        ];

        if ($cleanQuery === '') {
            return $emptyResult(NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD);
        }

        [$threshold, $topK] = $this->computeThresholdAndTopK(
            $salesIntent,
            $isListQuery,
            $vectorTopKBase
        );

        // Keep only unique, non-empty string document ids from the tag router.
        $candidateDocIds = $this->tagRouting->route($cleanQuery);
        $candidateDocIds = is_array($candidateDocIds)
            ? array_values(array_unique(array_filter(
                $candidateDocIds,
                static fn (mixed $value): bool => is_string($value) && $value !== ''
            )))
            : [];

        $globalHits = $this->vectorClient->search($cleanQuery, $topK);
        $scopedHits = $candidateDocIds !== []
            ? $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds)
            : [];

        if ($globalHits === [] && $scopedHits === []) {
            return $emptyResult($threshold);
        }

        $fused = $this->fuseHits(
            $globalHits,
            $scopedHits,
            $threshold,
            $scopedHits !== [],
            $withScores
        );

        $rrfScores = $fused['rrf_scores'];
        $rawScores = $fused['raw_scores'];

        // Thresholding removed everything: fall back to the plain order of the global hits.
        if ($rrfScores === [] && $globalHits !== []) {
            $rrfScores = $this->fallbackRrfFromHits($globalHits);
        }

        if ($rrfScores === []) {
            return $emptyResult($threshold, $rawScores);
        }

        arsort($rrfScores);

        // PHP stores numeric-looking string keys as ints, so array_keys() would hand
        // back ints for such chunk ids. Cast them back so the lookup and all later
        // consumers always receive string ids.
        $rankedChunkIds = array_map(
            static fn (int|string $chunkId): string => (string)$chunkId,
            array_keys($rrfScores)
        );

        $rows = $this->lookup->findByChunkIds($rankedChunkIds);

        return [
            'limit' => $limit,
            'is_list_query' => $isListQuery,
            'threshold' => $threshold,
            'ranked_chunk_ids' => $rankedChunkIds,
            'rows' => $rows,
            'rrf_scores' => $rrfScores,
            'raw_scores' => $rawScores,
        ];
    }

    // =========================================================
    // SUPPORT
    // =========================================================

    /**
     * Loads the active model generation config.
     *
     * Retrieval is not allowed to proceed without an active config.
     *
     * @throws RuntimeException when the repository returns no active config
     */
    private function requireConfig(): ModelGenerationConfig
    {
        return $this->configRepository->findActiveForModel()
            ?? throw new RuntimeException('No active ModelGenerationConfig found.');
    }

    /**
     * Extracts the normalized sales intent string from the intent detector.
     *
     * Falls back to DISCOVERY when the detector payload is incomplete.
*/
    private function detectSalesIntent(string $prompt): string
    {
        $payload = $this->salesIntentLite->detect($prompt);

        return (string)($payload['intent'] ?? SalesIntentLite::DISCOVERY);
    }

    /**
     * Computes retrieval threshold and vector topK.
     *
     * Rules:
     * - objection/pricing intents raise the threshold by 0.02
     * - list queries scale topK by the configured list bonus
     * - topK is clamped to [1, HARD_MAX_VECTORK], the threshold to
     *   [THRESHOLD_FLOOR, THRESHOLD_CEIL]
     *
     * @return array{0: float|int, 1: int} [threshold, topK]
     */
    private function computeThresholdAndTopK(
        string $salesIntent,
        bool $isListQuery,
        int $vectorTopKBase
    ): array {
        $threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD;

        $isStrictIntent = in_array(
            $salesIntent,
            [SalesIntentLite::OBJECTION, SalesIntentLite::PRICING],
            true
        );
        if ($isStrictIntent) {
            $threshold += 0.02;
        }

        $topK = $isListQuery
            ? (int)round($vectorTopKBase * NdjsonHybridRetrieverConfig::LIST_BONUS)
            : $vectorTopKBase;

        $topK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
        $threshold = max(
            NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR,
            min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold)
        );

        return [$threshold, $topK];
    }

    /**
     * Fuses the global and scoped hit lists into one RRF-style score map.
     *
     * Notes:
     * - hits without chunk_id/score, or with a score below the threshold, are
     *   skipped and do not consume a rank
     * - each surviving hit contributes 1 / (RRF_K + rank), where rank is its
     *   1-based position among the surviving hits of its own list
     * - scoped contributions are multiplied by 1.2 when $boostScoped is true
     * - with $captureRaw, the highest raw score seen per chunk is recorded
     *
     * @return array{rrf_scores: array<int|string, float>, raw_scores: array<int|string, float>}
     */
    private function fuseHits(
        array $globalHits,
        array $scopedHits,
        float $threshold,
        bool $boostScoped,
        bool $captureRaw
    ): array {
        $rrfScores = [];
        $rawScores = [];

        $hitLists = [
            [$globalHits, false],
            [$scopedHits, $boostScoped],
        ];

        foreach ($hitLists as [$hits, $boost]) {
            $rank = 0;

            foreach ($hits as $hit) {
                if (!isset($hit['chunk_id'], $hit['score'])) {
                    continue;
                }

                $raw = (float)$hit['score'];
                if ($raw < $threshold) {
                    continue;
                }

                $chunkId = (string)$hit['chunk_id'];

                if ($captureRaw) {
                    // Seed with the real score rather than 0.0, so a negative raw
                    // score is reported as-is instead of being clamped to zero.
                    $rawScores[$chunkId] = isset($rawScores[$chunkId])
                        ? max($rawScores[$chunkId], $raw)
                        : $raw;
                }

                $rank++;
                $contribution = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
                if ($boost) {
                    $contribution *= 1.2;
                }

                $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $contribution;
            }
        }

        return [
            'rrf_scores' => $rrfScores,
            'raw_scores' => $rawScores,
        ];
    }

    /**
     * Builds a fallback RRF ranking purely from hit order.
     *
     * Used when thresholding removed all fused candidates but
     * the global hit list itself still exists.
     *
     * At most EMPTY_RRF_FALLBACK_TOPN distinct chunk ids are scored. A chunk id
     * that shows up again later in the list keeps the score of its first
     * (best-ranked) occurrence and does not consume another rank.
     *
     * @return array<int|string, float>
     */
    private function fallbackRrfFromHits(array $hits): array
    {
        $scores = [];
        $rank = 0;

        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'])) {
                continue;
            }

            $chunkId = (string)$hit['chunk_id'];
            if (isset($scores[$chunkId])) {
                // Never let a worse-ranked duplicate overwrite the better score.
                continue;
            }

            $rank++;
            $scores[$chunkId] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);

            if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) {
                break;
            }
        }

        return $scores;
    }

    /**
     * Selects a coherent chunk window from one exact document title match.
     *
     * For exact product questions we prefer a pure document slice over
     * cross-document fusion to avoid mixing neighbouring product families.
     *
     * Rows are ordered by integer chunk_index (rows without one go last), ties
     * broken by chunk_id. Rows with a missing/empty chunk_id or empty text are
     * skipped. Selection stops at min($limit, EXACT_DOCUMENT_MAX_CHUNKS).
     *
     * @param array<int|string, array<string, mixed>> $rows
     * @return string[]
     */
    private function selectExactDocumentChunkIds(array $rows, int $limit): array
    {
        $sortIndex = static fn (array $row): int => is_int($row['chunk_index'] ?? null)
            ? $row['chunk_index']
            : PHP_INT_MAX;

        uasort($rows, static function (array $a, array $b) use ($sortIndex): int {
            $byIndex = $sortIndex($a) <=> $sortIndex($b);
            if ($byIndex !== 0) {
                return $byIndex;
            }

            return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? ''));
        });

        $max = min($limit, self::EXACT_DOCUMENT_MAX_CHUNKS);
        $selected = [];

        foreach ($rows as $row) {
            $chunkId = $row['chunk_id'] ?? null;
            $hasText = trim((string)($row['text'] ?? '')) !== '';

            if (!is_string($chunkId) || $chunkId === '' || !$hasText) {
                continue;
            }

            $selected[] = $chunkId;

            if (count($selected) >= $max) {
                break;
            }
        }

        return $selected;
    }

    /**
     * Builds synthetic scores for exact-title fast-path selections.
     *
     * These scores are only used for debug output consistency.
*
     * The n-th id (0-based) receives the score 1 / (1 + n).
     *
     * @param string[] $chunkIds
     * @return array<string, float>
     */
    private function buildExactDocumentScores(array $chunkIds): array
    {
        $scores = [];
        $position = 0;

        foreach ($chunkIds as $chunkId) {
            $scores[(string)$chunkId] = 1.0 / (1 + $position);
            $position++;
        }

        return $scores;
    }

    /**
     * Selection strategy for list-style queries.
     *
     * Goal:
     * - avoid near-identical chunks
     * - prefer diverse list entries
     * - stop once the configured limit is reached
     *
     * Two chunks count as identical when their text matches after trimming,
     * collapsing whitespace and lower-casing.
     *
     * @return string[]
     */
    private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
    {
        $seenFingerprints = [];
        $selected = [];

        foreach ($chunkIds as $id) {
            if (!isset($rows[$id]['text'])) {
                continue;
            }

            $text = trim((string)$rows[$id]['text']);
            if ($text === '') {
                continue;
            }

            $fingerprint = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $text)));
            if (isset($seenFingerprints[$fingerprint])) {
                continue;
            }

            $seenFingerprints[$fingerprint] = true;
            $selected[] = (string)$id;

            if (count($selected) >= $limit) {
                break;
            }
        }

        return $selected;
    }

    /**
     * Selection strategy for sales-oriented queries.
     *
     * Modes, tried in this order:
     * - sales_product_dominant_document:
     *   the prompt points clearly at one product document within the top
     *   window; only chunks of that document are returned and remaining
     *   slots are deliberately left empty
     *
     * - sales_dominant_document:
     *   one document clearly dominates the top hit window; its coherent
     *   neighbouring chunks are taken first and the remaining slots are
     *   filled with the spread strategy
     *
     * - sales_spread:
     *   default mode that spreads chunks across documents and enforces
     *   distance between chunk positions of the same document
     *
     * @return array{ids: string[], mode: string}
     */
    private function selectSalesChunkIds(string $prompt, array $chunkIds, array $rows, int $limit): array
    {
        $focusedDocId = $this->resolveFocusedSalesDocumentId($prompt, $chunkIds, $rows);
        if ($focusedDocId !== null) {
            $focusedChunkIds = $this->selectFocusedProductChunkIds($focusedDocId, $chunkIds, $rows, $limit);

            if ($focusedChunkIds !== []) {
                return [
                    'ids' => $focusedChunkIds,
                    'mode' => 'sales_product_dominant_document',
                ];
            }
        }

        $dominantDocId = $this->detectDominantTopDocument($chunkIds, $rows);
        if ($dominantDocId !== null) {
            $dominantChunkIds = $this->selectDominantDocumentChunkIds($dominantDocId, $chunkIds, $rows, $limit);

            if ($dominantChunkIds !== []) {
                return [
                    'ids' => $this->fillRemainingSalesChunkIds($dominantChunkIds, $chunkIds, $rows, $limit),
                    'mode' => 'sales_dominant_document',
                ];
            }
        }

        return [
            'ids' => $this->selectSalesChunkIdsSpread($chunkIds, $rows, $limit),
            'mode' => 'sales_spread',
        ];
    }

    /**
     * Resolves a strongly focused product document before normal sales spreading.
     *
     * This protects against classic false positives where neighbouring products,
     * indicators or safety sheets outrank the actually requested device.
     *
     * Each distinct document in the first FOCUSED_PRODUCT_WINDOW ranked chunks is
     * scored once, using the first of its chunks that yields a title. The best
     * document wins only if its score reaches FOCUSED_PRODUCT_MIN_SCORE and leads
     * the runner-up by at least FOCUSED_PRODUCT_MIN_GAP; a single candidate has
     * an infinite gap. Returns null when the prompt has no anchor tokens.
     */
    private function resolveFocusedSalesDocumentId(string $prompt, array $chunkIds, array $rows): ?string
    {
        $promptProfile = $this->buildPromptProductProfile($prompt);
        if ($promptProfile['anchors'] === []) {
            return null;
        }

        $candidates = [];
        $seenDocs = [];

        foreach (array_slice($chunkIds, 0, self::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) {
            $row = $rows[$chunkId] ?? null;
            if (!is_array($row)) {
                continue;
            }

            $documentId = $row['document_id'] ?? null;
            if (!is_string($documentId) || $documentId === '' || isset($seenDocs[$documentId])) {
                continue;
            }

            // A chunk without a title does not mark its document as seen, so a
            // later chunk of the same document may still provide the title.
            $title = $this->extractDocumentTitle($row);
            if ($title === '') {
                continue;
            }

            $seenDocs[$documentId] = true;
            $candidates[] = [
                'document_id' => $documentId,
                'score' => $this->scoreFocusedProductCandidate($promptProfile, $title, $row, $rank),
            ];
        }

        if ($candidates === []) {
            return null;
        }

        // Highest score first; equal scores are ordered by document id for determinism.
        usort(
            $candidates,
            static fn (array $a, array $b): int => ($b['score'] <=> $a['score'])
                ?: strcmp($a['document_id'], $b['document_id'])
        );

        $bestScore = (float)$candidates[0]['score'];
        $runnerUpScore = (float)($candidates[1]['score'] ?? -INF);

        if (
            $bestScore < self::FOCUSED_PRODUCT_MIN_SCORE
            || ($bestScore - $runnerUpScore) < self::FOCUSED_PRODUCT_MIN_GAP
        ) {
            return null;
        }

        return $candidates[0]['document_id'];
    }

    /**
     * Builds a small prompt profile used for focused product dominance decisions.
     *
     * Token classification (generic product tokens are ignored entirely):
     * - tokens containing a digit become anchor, number and family tokens
     * - important short model codes become anchor and family tokens
     * - any other token of at least 3 characters becomes an anchor, and a
     *   family token too if it is a family descriptor
     *
     * asks_device is also true when none of the reagent/document/safety word
     * lists matched, i.e. a device question is the default assumption.
     *
     * @return array{
     *   normalized:string,
     *   anchors:string[],
     *   family_tokens:string[],
     *   number_tokens:string[],
     *   asks_reagent:bool,
     *   asks_document:bool,
     *   asks_safety:bool,
     *   asks_device:bool
     * }
     */
    private function buildPromptProductProfile(string $prompt): array
    {
        $normalized = $this->normalizeText($prompt);
        $tokens = $this->tokenizeText($normalized);

        $reagentWords = [
            'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie',
            'sdb', 'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde',
        ];
        $documentWords = [
            'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual',
            'beschreibung', 'sdb', 'sicherheitsdatenblatt', 'msds',
        ];
        $safetyWords = [
            'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit',
            'kennzeichnung', 'transport', 'lagerung', 'piktogramm',
        ];
        $deviceWords = [
            'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat',
            'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
        ];

        $asksReagent = $this->containsAnyToken($tokens, $reagentWords);
        $asksDocument = $this->containsAnyToken($tokens, $documentWords);
        $asksSafety = $this->containsAnyToken($tokens, $safetyWords);
        $asksDevice = $this->containsAnyToken($tokens, $deviceWords)
            || (!$asksReagent && !$asksDocument && !$asksSafety);

        $anchors = [];
        $familyTokens = [];
        $numberTokens = [];

        foreach ($tokens as $token) {
            if ($this->isGenericProductToken($token)) {
                continue;
            }

            if (preg_match('/\d/u', $token) === 1) {
                $anchors[] = $token;
                $numberTokens[] = $token;
                $familyTokens[] = $token;
            } elseif ($this->isImportantShortModelToken($token)) {
                $anchors[] = $token;
                $familyTokens[] = $token;
            } elseif (mb_strlen($token, 'UTF-8') >= 3) {
                $anchors[] = $token;
                if ($this->isFamilyDescriptorToken($token)) {
                    $familyTokens[] = $token;
                }
            }
        }

        return [
            'normalized' => $normalized,
            'anchors' => array_values(array_unique($anchors)),
            'family_tokens' => array_values(array_unique($familyTokens)),
            'number_tokens' => array_values(array_unique($numberTokens)),
            'asks_reagent' => $asksReagent,
            'asks_document' => $asksDocument,
            'asks_safety' => $asksSafety,
            'asks_device' => $asksDevice,
        ];
    }

    /**
     * Scores one candidate document for focused product selection.
     *
     * The score starts with a rank bonus of max(0, 5 - rank) and is then moved by:
     * - +24 when the whole normalized title occurs in the prompt on word boundaries
     * - anchors, number tokens and family tokens found in / missing from the title
     *   (a token may be counted in more than one of these groups)
     * - penalties for reagent/accessory or safety documents when a device is asked for
     * - bonuses when a reagent, document or safety sheet is asked for and the
     *   document looks like one
     * - -10 when no anchor matched the title at all
     *
     * @param array<string, mixed> $promptProfile profile built by buildPromptProductProfile()
     */
    private function scoreFocusedProductCandidate(array $promptProfile, string $title, array $row, int $rank): float
    {
        $titleNormalized = $this->normalizeText($title);
        $titleTokenMap = array_fill_keys($this->tokenizeText($titleNormalized), true);
        $textNormalized = $this->normalizeText((string)($row['text'] ?? ''));
        $paddedTitle = ' ' . $titleNormalized . ' ';

        $score = max(0.0, 5.0 - $rank);

        if ($titleNormalized !== '' && str_contains(' ' . $promptProfile['normalized'] . ' ', $paddedTitle)) {
            $score += 24.0;
        }

        $matchedAnchors = 0;
        foreach ($promptProfile['anchors'] as $anchor) {
            if (isset($titleTokenMap[$anchor])) {
                $matchedAnchors++;
                $score += $this->isImportantShortModelToken($anchor) ? 4.0 : 3.5;
            } elseif (str_contains($paddedTitle, ' ' . $anchor . ' ')) {
                $matchedAnchors++;
                $score += 3.0;
            } else {
                $score -= $this->isFamilyDescriptorToken($anchor) ? 3.5 : 2.0;
            }
        }

        foreach ($promptProfile['number_tokens'] as $numberToken) {
            $score += isset($titleTokenMap[$numberToken]) ? 4.0 : -5.0;
        }

        foreach ($promptProfile['family_tokens'] as $familyToken) {
            $score += isset($titleTokenMap[$familyToken]) ? 4.0 : -4.5;
        }

        $isReagentDoc = $this->looksLikeReagentOrAccessoryDocument($row, $titleNormalized, $textNormalized);
        $isSafetyDoc = $this->looksLikeSafetyDocument($row, $titleNormalized, $textNormalized);

        if ($promptProfile['asks_device']) {
            if ($isReagentDoc) {
                $score -= 12.0;
            }
            if ($isSafetyDoc) {
                $score -= 8.0;
            }
        }

        if ($promptProfile['asks_reagent'] && $isReagentDoc) {
            $score += 6.0;
        }

        if (($promptProfile['asks_document'] || $promptProfile['asks_safety']) && $isSafetyDoc) {
            $score += 4.0;
        }

        if ($matchedAnchors === 0) {
            $score -= 10.0;
        }

        return $score;
    }

    /**
     * Selects only the focused product document chunks.
     *
     * In this strict mode we intentionally do not fill remaining slots with
     * neighbouring products, because that would reintroduce the original bug.
     * The limit is additionally capped at FOCUSED_PRODUCT_MAX_CHUNKS.
     *
     * @return string[]
     */
    private function selectFocusedProductChunkIds(
        string $documentId,
        array $chunkIds,
        array $rows,
        int $limit
    ): array {
        $cappedLimit = min($limit, self::FOCUSED_PRODUCT_MAX_CHUNKS);

        return $this->selectDominantDocumentChunkIds($documentId, $chunkIds, $rows, $cappedLimit);
    }

    /**
     * Detects whether one document clearly dominates the first ranked window.
     *
     * This is especially useful for product-sheet style documents where
     * several adjacent chunks belong together and should be passed to the model
     * as one coherent factual block.
     *
     * Only chunks with non-empty text and a non-empty string document id in the
     * first DOMINANT_DOC_WINDOW ranked ids are counted. A document dominates when
     * it has at least DOMINANT_DOC_MIN_HITS hits in that window, or when it has
     * at least two hits and owns both of the first two counted chunks.
     */
    private function detectDominantTopDocument(array $chunkIds, array $rows): ?string
    {
        $docWindow = [];

        foreach (array_slice($chunkIds, 0, self::DOMINANT_DOC_WINDOW) as $chunkId) {
            if (!isset($rows[$chunkId]['text'])) {
                continue;
            }

            $docId = $rows[$chunkId]['document_id'] ?? null;
            if (trim((string)$rows[$chunkId]['text']) === '' || !is_string($docId) || $docId === '') {
                continue;
            }

            $docWindow[] = $docId;
        }

        if (count($docWindow) < 2) {
            return null;
        }

        $counts = array_count_values($docWindow);
        arsort($counts);

        // array_count_values() uses the ids as array keys, and PHP stores
        // numeric-looking keys as ints. Cast back to string so such document ids
        // are not rejected and still compare equal to the window entries below.
        $dominantDocId = (string)array_key_first($counts);
        if ($dominantDocId === '') {
            return null;
        }

        $dominantCount = (int)($counts[$dominantDocId] ?? 0);
        if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) {
            return $dominantDocId;
        }

        $ownsTopTwo = $docWindow[0] === $dominantDocId && $docWindow[1] === $dominantDocId;

        return ($dominantCount >= 2 && $ownsTopTwo) ? $dominantDocId : null;
    }

    /**
     * Selects a coherent chunk window from the dominant document.
     *
     * Strategy:
     * - use the highest-ranked chunk of that document that has an integer
     *   chunk index as anchor
     * - prefer chunk indices closest to that anchor (ties: better rank first);
     *   chunks without an index come last
     * - keep at most min($limit, DOMINANT_DOC_MAX_CHUNKS) chunks
     * - sort the final selection by chunk index for prompt coherence
     *
     * @return string[]
     */
    private function selectDominantDocumentChunkIds(
        string $documentId,
        array $chunkIds,
        array $rows,
        int $limit
    ): array {
        $docHits = [];
        $anchorChunkIndex = null;

        foreach ($chunkIds as $rank => $chunkId) {
            if (!isset($rows[$chunkId]['text'])) {
                continue;
            }

            $isEmpty = trim((string)$rows[$chunkId]['text']) === '';
            if ($isEmpty || ($rows[$chunkId]['document_id'] ?? null) !== $documentId) {
                continue;
            }

            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
            if (!is_int($chunkIndex)) {
                $chunkIndex = null;
            }

            $anchorChunkIndex ??= $chunkIndex;

            $docHits[] = [
                'id' => (string)$chunkId,
                'rank' => $rank,
                'chunk_index' => $chunkIndex,
            ];
        }

        if ($docHits === []) {
            return [];
        }

        // Without an anchor every hit lacks an index, so every distance is
        // PHP_INT_MAX and the ordering degrades to plain rank order.
        $distance = static fn (array $hit): int => $hit['chunk_index'] === null
            ? PHP_INT_MAX
            : abs($hit['chunk_index'] - $anchorChunkIndex);

        usort(
            $docHits,
            static fn (array $a, array $b): int => ($distance($a) <=> $distance($b))
                ?: ($a['rank'] <=> $b['rank'])
        );

        $selected = array_slice($docHits, 0, min($limit, self::DOMINANT_DOC_MAX_CHUNKS));

        // Document order for the prompt: indexed chunks ascending, unindexed ones
        // last, rank as the final tie-breaker.
        usort(
            $selected,
            static fn (array $a, array $b): int =>
                [$a['chunk_index'] === null, $a['chunk_index'], $a['rank']]
                <=> [$b['chunk_index'] === null, $b['chunk_index'], $b['rank']]
        );

        return array_column($selected, 'id');
    }

    /**
     * Fills the remaining sales slots after a dominant document selection.
     *
     * The already selected dominant-document chunks stay fixed.
     * Remaining slots are filled with the normal spread strategy: at most
     * MAX_CHUNKS_PER_DOC chunks per document, and chunks of one document must be
     * at least MIN_CHUNK_DISTANCE index positions apart. Seed chunks count
     * towards both rules. A position is only recorded for chunks that are
     * actually accepted.
     *
     * @return string[]
     */
    private function fillRemainingSalesChunkIds(
        array $seedChunkIds,
        array $chunkIds,
        array $rows,
        int $limit
    ): array {
        $out = array_values(array_unique(array_map('strval', $seedChunkIds)));
        if (count($out) >= $limit) {
            return array_slice($out, 0, $limit);
        }

        $selected = array_fill_keys($out, true);
        $docCounter = [];
        $docChunkPositions = [];

        // Register the seed chunks so they count against the per-document rules.
        foreach ($out as $seedId) {
            $docId = $rows[$seedId]['document_id'] ?? null;
            if (!is_string($docId) || $docId === '') {
                continue;
            }

            $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;

            $chunkIndex = $rows[$seedId]['chunk_index'] ?? null;
            if (is_int($chunkIndex)) {
                $docChunkPositions[$docId][] = $chunkIndex;
            }
        }

        foreach ($chunkIds as $chunkId) {
            if (isset($selected[$chunkId]) || !isset($rows[$chunkId]['text'])) {
                continue;
            }

            $docId = $rows[$chunkId]['document_id'] ?? null;
            if (!is_string($docId) || $docId === '') {
                continue;
            }

            if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
                continue;
            }

            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
            if (is_int($chunkIndex)) {
                foreach ($docChunkPositions[$docId] ?? [] as $takenIndex) {
                    if (abs($takenIndex - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
                        continue 2;
                    }
                }
            }

            if (trim((string)$rows[$chunkId]['text']) === '') {
                continue;
            }

            $out[] = (string)$chunkId;
            $selected[$chunkId] = true;
            $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
            if (is_int($chunkIndex)) {
                $docChunkPositions[$docId][] = $chunkIndex;
            }

            if (count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Default spread selection for sales-oriented queries.
     *
     * Goal:
     * - avoid overloading the result with chunks from the same document
     * - avoid chunks that are too close to each other in the same document
     * - preserve top-ranked relevance while improving contextual spread
     *
     * This is the fill strategy started from an empty seed, so both code paths
     * apply identical rules. In particular a chunk with empty text is skipped
     * without reserving its chunk position; previously such a chunk could block
     * its non-empty neighbours through the distance rule.
     *
     * $limit is expected to be at least 1, which runCore() guarantees.
     *
     * @return string[]
     */
    private function selectSalesChunkIdsSpread(array $chunkIds, array $rows, int $limit): array
    {
        return $this->fillRemainingSalesChunkIds([], $chunkIds, $rows, $limit);
    }

    /**
     * Extracts the document title from metadata or from the first product-title heading.
     *
     * Returns an empty string when neither source yields a title.
     */
    private function extractDocumentTitle(array $row): string
    {
        $metadataTitle = $row['metadata']['document_title'] ?? null;
        if (is_string($metadataTitle)) {
            $metadataTitle = trim($metadataTitle);
            if ($metadataTitle !== '') {
                return $metadataTitle;
            }
        }

        $text = (string)($row['text'] ?? '');
        if ($text === '') {
            return '';
        }

        $found = preg_match('/^#\s*Produkt\s+Titel:\s*`?([^`\n]+)`?/imu', $text, $matches);

        return $found === 1 ? trim((string)($matches[1] ?? '')) : '';
    }

    /**
     * Normalizes text for token-safe product comparisons.
     *
     * Lower-cases, turns "-", "/" and "_" into spaces, replaces every run of
     * characters that are neither letters, digits nor whitespace with a space,
     * and collapses whitespace. If a regex step fails, its input is kept.
     */
    private function normalizeText(string $value): string
    {
        $normalized = mb_strtolower(trim($value), 'UTF-8');
        $normalized = str_replace(['-', '/', '_'], ' ', $normalized);

        foreach (['/[^\p{L}\p{N}\s]+/u', '/\s+/u'] as $pattern) {
            $normalized = preg_replace($pattern, ' ', $normalized) ?? $normalized;
        }

        return trim($normalized);
    }

    /**
     * Tokenizes normalized text on whitespace.
     *
     * @return string[]
     */
    private function tokenizeText(string $value): array
    {
        if ($value === '') {
            return [];
        }

        $tokens = preg_split('/\s+/u', $value, -1, PREG_SPLIT_NO_EMPTY);

        return $tokens ?: [];
    }

    /**
     * Returns true when at least one of the given words occurs among the tokens.
     *
     * @param string[] $tokens
     * @param string[] $needles
     */
    private function containsAnyToken(array $tokens, array $needles): bool
    {
        if ($tokens === [] || $needles === []) {
            return false;
        }

        $tokenMap = array_fill_keys($tokens, true);

        foreach ($needles as $needle) {
            if (isset($tokenMap[$needle])) {
                return true;
            }
        }

        return false;
    }

    /**
     * Generic product words must not drive product dominance decisions.
*/
    private function isGenericProductToken(string $token): bool
    {
        // Built once per process; every later call is a single hash lookup
        // instead of rebuilding the map from the word list.
        static $genericMap = null;

        $genericMap ??= array_fill_keys([
            'der', 'die', 'das', 'ein', 'eine', 'einen', 'einem', 'und', 'oder', 'mit', 'fuer', 'für', 'von',
            'im', 'in', 'am', 'an', 'auf', 'zu', 'zum', 'zur', 'produkt', 'produkte', 'produktkarte', 'titel',
            'geraet', 'gerät', 'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte', 'resthärte',
            'analyse', 'analysator', 'automat', 'online', 'messung', 'messen', 'preis', 'preise', 'kosten',
            'info', 'infos', 'passend', 'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
            'welches', 'brauche', 'suche', 'bitte', 'gegen',
        ], true);

        return isset($genericMap[$token]);
    }

    /**
     * Short technical model codes like TH or TC are allowed as anchors.
     *
     * The token is compared as-is against the lower-case code list.
     */
    private function isImportantShortModelToken(string $token): bool
    {
        static $shortModelCodes = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];

        return in_array($token, $shortModelCodes, true);
    }

    /**
     * Family descriptors are strong product differentiators.
     *
     * A token qualifies when it is a known descriptor word, an important short
     * model code, or contains at least one digit.
     */
    private function isFamilyDescriptorToken(string $token): bool
    {
        static $familyDescriptors = [
            'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic',
            'lab', 'inline', 'compact', 'panel', 'sc',
        ];

        if (in_array($token, $familyDescriptors, true)) {
            return true;
        }

        if ($this->isImportantShortModelToken($token)) {
            return true;
        }

        return preg_match('/\d/u', $token) === 1;
    }

    /**
     * Heuristic classifier for indicator, reagent, accessory and spare-part documents.
     *
     * Plain substring search over the normalized title and text, so a keyword
     * also matches inside a longer word. $row is currently not inspected.
     */
    private function looksLikeReagentOrAccessoryDocument(array $row, string $titleNormalized, string $textNormalized): bool
    {
        $haystack = trim($titleNormalized . ' ' . $textNormalized);
        if ($haystack === '') {
            return false;
        }

        $keywords = [
            'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
            'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
            'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
            'kerzenfilter', 'druckregler',
        ];

        foreach ($keywords as $keyword) {
            if (str_contains($haystack, $keyword)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Heuristic classifier for safety-style documents.
     *
     * Plain substring search over the normalized title and text, so a keyword
     * also matches inside a longer word. $row is currently not inspected.
     */
    private function looksLikeSafetyDocument(array $row, string $titleNormalized, string $textNormalized): bool
    {
        $haystack = trim($titleNormalized . ' ' . $textNormalized);
        if ($haystack === '') {
            return false;
        }

        $keywords = [
            'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung', 'gefahrenpiktogramm',
            'signalwort', 'lagerung', 'transport', 'clp', 'kennzeichnung', 'h290', 'pbt', 'vpvb',
        ];

        foreach ($keywords as $keyword) {
            if (str_contains($haystack, $keyword)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Converts selected chunk ids into the final plain text result list.
     *
     * Ids without a row text and texts that are empty after trimming are dropped;
     * the order of the ids is preserved.
     *
     * @return string[]
     */
    private function collectTextsFromIds(array $chunkIds, array $rows): array
    {
        $texts = [];

        foreach ($chunkIds as $id) {
            if (!isset($rows[$id]['text'])) {
                continue;
            }

            $text = trim((string)$rows[$id]['text']);
            if ($text === '') {
                continue;
            }

            $texts[] = $text;
        }

        return $texts;
    }
}