Files
MtoRagSystem/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
2026-04-21 17:20:16 +02:00

1469 lines
45 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Catalog\EntityCatalogService;
use App\Config\NdjsonHybridRetrieverConfig;
use App\Entity\ModelGenerationConfig;
use App\Intent\CatalogIntentLite;
use App\Intent\IntentLite;
use App\Intent\SalesIntentLite;
use App\Repository\ModelGenerationConfigRepository;
use App\Routing\IntentRouteResolver;
use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient;
use Doctrine\DBAL\Exception;
use RuntimeException;
/**
* Hybrid retriever for NDJSON-based knowledge chunks.
*
* Main responsibilities:
* - detect high-level request intent
* - optionally short-circuit to catalog list output
* - resolve exact document-title matches before semantic retrieval
* - run vector retrieval globally and optionally document-scoped
* - fuse both result sets with RRF-style scoring
* - apply selection rules for list queries vs. sales-style queries
* - return either plain chunk texts or debug metadata
*/
final readonly class NdjsonHybridRetriever implements RetrieverInterface
{
/**
* Tuning constants for the document-focused selection modes.
*
* When one document clearly dominates the top-ranked window,
* selection temporarily switches from "spread" mode to "dominant document" mode.
*/
// Number of top-ranked chunks inspected by detectDominantTopDocument().
private const DOMINANT_DOC_WINDOW = 6;
// Hits inside that window from which a document counts as dominant outright.
private const DOMINANT_DOC_MIN_HITS = 3;
// Upper bound of chunks taken from one document by selectDominantDocumentChunkIds().
private const DOMINANT_DOC_MAX_CHUNKS = 4;
// Upper bound of chunks returned by the exact-document-title fast path.
private const EXACT_DOCUMENT_MAX_CHUNKS = 6;
// Number of top-ranked chunks inspected by resolveFocusedSalesDocumentId().
private const FOCUSED_PRODUCT_WINDOW = 8;
// Minimum candidate score required before a document is treated as the focused product.
private const FOCUSED_PRODUCT_MIN_SCORE = 10.0;
// Minimum score lead over the runner-up document required for focused selection.
private const FOCUSED_PRODUCT_MIN_GAP = 4.0;
// Chunk cap requested by selectFocusedProductChunkIds(); the dominant-document cap still applies on top.
private const FOCUSED_PRODUCT_MAX_CHUNKS = 4;
/**
 * Receives every collaborator of the retrieval pipeline via constructor promotion.
 *
 * The class is declared readonly, so none of these can be replaced after construction.
 */
public function __construct(
    private NdjsonChunkLookup $lookup,
    private VectorSearchClient $vectorClient,
    private TagRoutingService $tagRouting,
    private ModelGenerationConfigRepository $configRepository,
    private QueryCleaner $queryCleaner,
    private IntentLite $intentLite,
    private SalesIntentLite $salesIntentLite,
    private CatalogIntentLite $catalogIntent,
    private IntentRouteResolver $routeResolver,
    private EntityCatalogService $entityCatalogService,
    private QueryEnricher $queryEnricher,
) {}
// =========================================================
// PUBLIC API
// =========================================================
/**
 * Runs the retrieval pipeline and returns the resulting context as plain text.
 *
 * - the active retrieval config is loaded first
 * - a catalog block produced by the pipeline wins and is returned as the only element
 * - otherwise the texts of the selected chunks are returned in selection order
 * - an empty selection yields an empty array
 *
 * @return string[]
 *
 * @throws Exception
 * @throws RuntimeException when no active ModelGenerationConfig exists
 */
public function retrieve(string $prompt): array
{
    $outcome = $this->execute($prompt, $this->requireConfig(), false);

    $catalogBlock = $outcome['catalogBlock'];
    if ($catalogBlock !== null) {
        return [$catalogBlock];
    }

    $chunkIds = $outcome['selectedChunkIds'];

    return $chunkIds === []
        ? []
        : $this->collectTextsFromIds($chunkIds, $outcome['rows']);
}
/**
 * Returns the retrieval result together with scoring and routing metadata.
 *
 * Intended for inspection and tuning. Every returned entry carries:
 * rank, chunk_id, document_id, chunk_index, raw_score, rrf_score, threshold,
 * intent, route, entity_label, is_list_query, selection_mode and text.
 *
 * A catalog result is reported as one synthetic entry with the chunk id
 * "__CATALOG_LIST__". Selected ids without a resolved row are skipped and do
 * not consume a rank. A row without a "text" key is reported with an empty
 * text instead of triggering an undefined-key warning.
 *
 * @param ModelGenerationConfig|null $config config to use; the active config is loaded when null
 *
 * @return array<int,array<string,mixed>>
 *
 * @throws Exception
 * @throws RuntimeException when no config is passed and no active ModelGenerationConfig exists
 */
public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
{
    $config ??= $this->requireConfig();
    $result = $this->execute($prompt, $config, true);

    if ($result['catalogBlock'] !== null) {
        return [[
            'rank' => 1,
            'chunk_id' => '__CATALOG_LIST__',
            'document_id' => null,
            'chunk_index' => null,
            'raw_score' => null,
            'rrf_score' => null,
            'threshold' => 0.0,
            'intent' => $result['intent'],
            'route' => $result['route'],
            'entity_label' => $result['entityLabel'],
            'is_list_query' => true,
            'selection_mode' => 'catalog_list',
            'text' => $result['catalogBlock'],
        ]];
    }

    $out = [];
    $rank = 0;
    foreach ($result['selectedChunkIds'] as $chunkId) {
        $row = $result['rows'][$chunkId] ?? null;
        if ($row === null) {
            continue;
        }

        $rank++;
        $out[] = [
            'rank' => $rank,
            'chunk_id' => $chunkId,
            'document_id' => $row['document_id'] ?? null,
            'chunk_index' => $row['chunk_index'] ?? null,
            'raw_score' => $result['rawScores'][$chunkId] ?? null,
            'rrf_score' => $result['rrfScores'][$chunkId] ?? null,
            'threshold' => $result['threshold'],
            'intent' => $result['intent'],
            'route' => $result['route'],
            'entity_label' => $result['entityLabel'],
            'is_list_query' => $result['isListQuery'],
            'selection_mode' => $result['selectionMode'],
            // Guarded: a resolved row is not guaranteed to carry a text key.
            'text' => trim((string)($row['text'] ?? '')),
        ];
    }

    return $out;
}
// =========================================================
// CENTRAL ORCHESTRATION
// =========================================================
/**
 * Central orchestration entrypoint shared by retrieve() and retrieveDebug().
 *
 * Order of evaluation:
 * 1. detect the catalog entity and the sales intent, then resolve the route
 * 2. on a catalog-list route with a detected entity, return the catalog block
 *    if the catalog service delivers one
 * 3. if the lookup reports an exact document match for the prompt, return a
 *    slice of that document
 * 4. otherwise run the hybrid retrieval core and pick the final chunk ids with
 *    the list strategy or the sales strategy
 *
 * Every result uses the same keys: route, entityLabel, intent, isListQuery,
 * selectionMode, selectedChunkIds, rows, rrfScores, rawScores, threshold,
 * catalogBlock.
 *
 * @throws Exception
 */
private function execute(
    string $prompt,
    ModelGenerationConfig $config,
    bool $withScores
): array {
    $entityLabel = $this->catalogIntent->detect($prompt);
    $salesIntent = $this->detectSalesIntent($prompt);
    $route = $this->routeResolver->resolve($salesIntent, $entityLabel);

    // Leading keys that are identical in every result built below.
    $envelope = [
        'route' => $route,
        'entityLabel' => $entityLabel,
        'intent' => $salesIntent,
    ];

    if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
        $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);
        if ($catalogBlock !== null) {
            return $envelope + [
                'isListQuery' => true,
                'selectionMode' => 'catalog_list',
                'selectedChunkIds' => [],
                'rows' => [],
                'rrfScores' => [],
                'rawScores' => [],
                'threshold' => 0.0,
                'catalogBlock' => trim($catalogBlock),
            ];
        }
    }

    $exactMatch = $this->lookup->findBestExactDocumentByPrompt($prompt);
    if ($exactMatch !== null) {
        $chunkBudget = max(
            1,
            min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)
        );
        $exactChunkIds = $this->selectExactDocumentChunkIds($exactMatch['rows'], $chunkBudget);

        // An exact match without usable chunks falls through to the hybrid core.
        if ($exactChunkIds !== []) {
            return $envelope + [
                'isListQuery' => false,
                'selectionMode' => 'exact_document_title',
                'selectedChunkIds' => $exactChunkIds,
                'rows' => $exactMatch['rows'],
                'rrfScores' => $this->buildExactDocumentScores($exactChunkIds),
                'rawScores' => [],
                'threshold' => 1.0,
                'catalogBlock' => null,
            ];
        }
    }

    $core = $this->runCore($prompt, $config, $withScores, $salesIntent);
    $envelope['isListQuery'] = $core['is_list_query'];

    if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
        return $envelope + [
            'selectionMode' => null,
            'selectedChunkIds' => [],
            'rows' => [],
            'rrfScores' => [],
            'rawScores' => [],
            'threshold' => $core['threshold'],
            'catalogBlock' => null,
        ];
    }

    if ($core['is_list_query']) {
        $selectionMode = 'list_deduplicated';
        $selectedChunkIds = $this->selectListChunkIds(
            $core['ranked_chunk_ids'],
            $core['rows'],
            $core['limit']
        );
    } else {
        ['ids' => $selectedChunkIds, 'mode' => $selectionMode] = $this->selectSalesChunkIds(
            $prompt,
            $core['ranked_chunk_ids'],
            $core['rows'],
            $core['limit']
        );
    }

    return $envelope + [
        'selectionMode' => $selectionMode,
        'selectedChunkIds' => $selectedChunkIds,
        'rows' => $core['rows'],
        'rrfScores' => $core['rrf_scores'],
        'rawScores' => $core['raw_scores'],
        'threshold' => $core['threshold'],
        'catalogBlock' => null,
    ];
}
// =========================================================
// CORE PIPELINE
// =========================================================
/**
 * Executes the hybrid vector retrieval and ranks the candidate chunks.
 *
 * Steps:
 * - clamp chunk limit and vector topK from the config to the hard caps
 * - detect whether the prompt is a list query
 * - clean and enrich the prompt; an empty query ends the run early
 * - compute threshold and topK for the intent / query type
 * - obtain candidate document ids from tag routing (non-empty strings only, deduplicated)
 * - run the global search and, when candidates exist, the scoped search
 * - fuse both hit lists; if thresholding removed everything, fall back to the
 *   plain order of the global hits
 * - sort by fused score (descending) and resolve the chunk ids to rows
 *
 * The result always has the keys limit, is_list_query, threshold,
 * ranked_chunk_ids, rows, rrf_scores and raw_scores.
 *
 * @throws Exception
 */
private function runCore(
    string $prompt,
    ModelGenerationConfig $config,
    bool $withScores,
    string $salesIntent
): array {
    $limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS));
    $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
    $isListQuery = $this->intentLite->isListQuery($prompt);

    // Shared shape of every "nothing found" result.
    $noCandidates = static fn (float|int $threshold, array $rawScores = []): array => [
        'limit' => $limit,
        'is_list_query' => $isListQuery,
        'threshold' => $threshold,
        'ranked_chunk_ids' => [],
        'rows' => [],
        'rrf_scores' => [],
        'raw_scores' => $rawScores,
    ];

    $cleanQuery = $this->queryEnricher->enrichPrompt($this->queryCleaner->clean($prompt));
    if ($cleanQuery === '') {
        return $noCandidates(NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD);
    }

    [$threshold, $topK] = $this->computeThresholdAndTopK($salesIntent, $isListQuery, $vectorTopKBase);

    $candidateDocIds = [];
    $routedDocIds = $this->tagRouting->route($cleanQuery);
    if (is_array($routedDocIds)) {
        foreach ($routedDocIds as $routedDocId) {
            $isUsable = is_string($routedDocId) && $routedDocId !== '';
            if ($isUsable && !in_array($routedDocId, $candidateDocIds, true)) {
                $candidateDocIds[] = $routedDocId;
            }
        }
    }

    $globalHits = $this->vectorClient->search($cleanQuery, $topK);
    $scopedHits = $candidateDocIds === []
        ? []
        : $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);

    if ($globalHits === [] && $scopedHits === []) {
        return $noCandidates($threshold);
    }

    ['rrf_scores' => $rrfScores, 'raw_scores' => $rawScores] = $this->fuseHits(
        $globalHits,
        $scopedHits,
        $threshold,
        $scopedHits !== [],
        $withScores
    );

    if ($rrfScores === [] && $globalHits !== []) {
        $rrfScores = $this->fallbackRrfFromHits($globalHits);
    }
    if ($rrfScores === []) {
        return $noCandidates($threshold, $rawScores);
    }

    arsort($rrfScores);
    $rankedChunkIds = array_keys($rrfScores);

    return [
        'limit' => $limit,
        'is_list_query' => $isListQuery,
        'threshold' => $threshold,
        'ranked_chunk_ids' => $rankedChunkIds,
        'rows' => $this->lookup->findByChunkIds($rankedChunkIds),
        'rrf_scores' => $rrfScores,
        'raw_scores' => $rawScores,
    ];
}
// =========================================================
// SUPPORT
// =========================================================
/**
 * Returns the active model generation config.
 *
 * @throws RuntimeException when the repository has no active config
 */
private function requireConfig(): ModelGenerationConfig
{
    return $this->configRepository->findActiveForModel()
        ?? throw new RuntimeException('No active ModelGenerationConfig found.');
}
/**
 * Reads the "intent" entry from the sales intent detector result.
 *
 * SalesIntentLite::DISCOVERY is used when that entry is missing or null.
 */
private function detectSalesIntent(string $prompt): string
{
    $detection = $this->salesIntentLite->detect($prompt);
    $intent = $detection['intent'] ?? SalesIntentLite::DISCOVERY;

    return (string)$intent;
}
/**
 * Derives the score threshold and the vector topK for one request.
 *
 * - objection and pricing intents raise the base threshold by 0.02
 * - list queries multiply topK by the configured list bonus
 * - topK is clamped to [1, HARD_MAX_VECTORK]
 * - the threshold is clamped to [THRESHOLD_FLOOR, THRESHOLD_CEIL]
 *
 * @return array{0: float|int, 1: int} threshold first, topK second
 */
private function computeThresholdAndTopK(
    string $salesIntent,
    bool $isListQuery,
    int $vectorTopKBase
): array {
    $strictIntents = [SalesIntentLite::OBJECTION, SalesIntentLite::PRICING];

    $threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD;
    if (in_array($salesIntent, $strictIntents, true)) {
        $threshold += 0.02;
    }

    $topK = $isListQuery
        ? (int)round($vectorTopKBase * NdjsonHybridRetrieverConfig::LIST_BONUS)
        : $vectorTopKBase;

    $clampedTopK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
    $clampedThreshold = max(
        NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR,
        min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold)
    );

    return [$clampedThreshold, $clampedTopK];
}
/**
 * Fuses the global and the scoped hit list into one RRF-style score map.
 *
 * - hits without chunk_id or score are ignored
 * - hits scoring below the threshold are ignored (a score equal to it passes)
 * - the rank used for 1 / (RRF_K + rank) counts accepted hits only, per list
 * - contributions of the scoped list are multiplied by 1.2 when $boostScoped is set
 * - contributions for the same chunk id are summed
 * - with $captureRaw the highest raw score per chunk id is recorded
 *
 * @return array{rrf_scores: array<array-key,float>, raw_scores: array<array-key,float>}
 */
private function fuseHits(
    array $globalHits,
    array $scopedHits,
    float $threshold,
    bool $boostScoped,
    bool $captureRaw
): array {
    $rrfScores = [];
    $rawScores = [];

    $passes = [
        [$globalHits, false],
        [$scopedHits, $boostScoped],
    ];

    foreach ($passes as [$hits, $boost]) {
        $rank = 0;
        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }

            $rawScore = (float)$hit['score'];
            if ($rawScore < $threshold) {
                continue;
            }

            $chunkId = (string)$hit['chunk_id'];
            if ($captureRaw) {
                $rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $rawScore);
            }

            $rank++;
            $contribution = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
            if ($boost) {
                $contribution *= 1.2;
            }

            $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $contribution;
        }
    }

    return [
        'rrf_scores' => $rrfScores,
        'raw_scores' => $rawScores,
    ];
}
/**
 * Builds an RRF-style ranking from hit order alone, ignoring scores.
 *
 * Used when thresholding removed every fused candidate although global hits
 * exist. Hits without chunk_id are skipped; the scan stops once
 * EMPTY_RRF_FALLBACK_TOPN hits have been ranked.
 *
 * @return array<array-key,float>
 */
private function fallbackRrfFromHits(array $hits): array
{
    $scores = [];
    $position = 0;

    foreach ($hits as $hit) {
        if (!isset($hit['chunk_id'])) {
            continue;
        }

        $position++;
        $chunkId = (string)$hit['chunk_id'];
        $scores[$chunkId] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $position);

        if ($position >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) {
            break;
        }
    }

    return $scores;
}
/**
 * Picks the leading chunks of one exactly matched document.
 *
 * Rows are ordered by integer chunk_index (rows without one go last) and then
 * by chunk_id. Rows without a non-empty string chunk_id or without text are
 * skipped. At most min($limit, EXACT_DOCUMENT_MAX_CHUNKS) ids are returned.
 *
 * For exact product questions a pure document slice is preferred over
 * cross-document fusion to avoid mixing neighbouring product families.
 *
 * @param array<string,array<string,mixed>> $rows
 * @return string[]
 */
private function selectExactDocumentChunkIds(array $rows, int $limit): array
{
    $position = static fn (array $row): int => is_int($row['chunk_index'] ?? null)
        ? $row['chunk_index']
        : PHP_INT_MAX;

    uasort(
        $rows,
        static fn (array $left, array $right): int => ($position($left) <=> $position($right))
            ?: strcmp((string)($left['chunk_id'] ?? ''), (string)($right['chunk_id'] ?? ''))
    );

    $cap = min($limit, self::EXACT_DOCUMENT_MAX_CHUNKS);
    $picked = [];

    foreach ($rows as $row) {
        $chunkId = $row['chunk_id'] ?? null;
        $hasId = is_string($chunkId) && $chunkId !== '';
        $hasText = trim((string)($row['text'] ?? '')) !== '';

        if ($hasId && $hasText) {
            $picked[] = $chunkId;
            if (count($picked) >= $cap) {
                break;
            }
        }
    }

    return $picked;
}
/**
 * Assigns synthetic scores 1, 1/2, 1/3, ... to the given ids in order.
 *
 * These values exist only so that the exact-title fast path reports scores
 * in the same result field as the fused retrieval path.
 *
 * @param string[] $chunkIds
 * @return array<string,float>
 */
private function buildExactDocumentScores(array $chunkIds): array
{
    $scores = [];
    $position = 0;

    foreach ($chunkIds as $chunkId) {
        $scores[(string)$chunkId] = 1.0 / (1 + $position);
        $position++;
    }

    return $scores;
}
/**
 * Selection strategy for list-style queries.
 *
 * Walks the ranked ids and keeps the first occurrence of every distinct text.
 * Two texts count as identical when they match after trimming, collapsing
 * whitespace runs and lower-casing. Ids without a row text or with an empty
 * text are skipped; the walk stops once $limit ids are collected.
 *
 * @return string[]
 */
private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
{
    $fingerprints = [];
    $picked = [];

    foreach ($chunkIds as $chunkId) {
        if (!isset($rows[$chunkId]['text'])) {
            continue;
        }

        $text = trim((string)$rows[$chunkId]['text']);
        if ($text === '') {
            continue;
        }

        $collapsed = (string)preg_replace('/\s+/u', ' ', $text);
        $fingerprint = md5(mb_strtolower($collapsed));
        if (isset($fingerprints[$fingerprint])) {
            continue;
        }

        $fingerprints[$fingerprint] = true;
        $picked[] = (string)$chunkId;

        if (count($picked) >= $limit) {
            break;
        }
    }

    return $picked;
}
/**
 * Selection strategy for sales-oriented (non-list) queries.
 *
 * The first applicable mode wins:
 *
 * - sales_product_dominant_document:
 *   the prompt points at one product document among the top hits; only
 *   chunks of that document are returned, remaining slots stay empty
 *
 * - sales_dominant_document:
 *   one document dominates the top hit window; its neighbouring chunks come
 *   first and the remaining slots are filled with the spread rules
 *
 * - sales_spread:
 *   default mode that spreads chunks across documents and enforces a
 *   distance between chunk positions of the same document
 *
 * @return array{ids: string[], mode: string}
 */
private function selectSalesChunkIds(string $prompt, array $chunkIds, array $rows, int $limit): array
{
    $focusedDocId = $this->resolveFocusedSalesDocumentId($prompt, $chunkIds, $rows);
    $focusedIds = $focusedDocId === null
        ? []
        : $this->selectFocusedProductChunkIds($focusedDocId, $chunkIds, $rows, $limit);
    if ($focusedIds !== []) {
        return [
            'ids' => $focusedIds,
            'mode' => 'sales_product_dominant_document',
        ];
    }

    $dominantDocId = $this->detectDominantTopDocument($chunkIds, $rows);
    $dominantIds = $dominantDocId === null
        ? []
        : $this->selectDominantDocumentChunkIds($dominantDocId, $chunkIds, $rows, $limit);
    if ($dominantIds !== []) {
        return [
            'ids' => $this->fillRemainingSalesChunkIds($dominantIds, $chunkIds, $rows, $limit),
            'mode' => 'sales_dominant_document',
        ];
    }

    return [
        'ids' => $this->selectSalesChunkIdsSpread($chunkIds, $rows, $limit),
        'mode' => 'sales_spread',
    ];
}
/**
 * Resolves a strongly focused product document before normal sales spreading.
 *
 * Each distinct document among the first FOCUSED_PRODUCT_WINDOW ranked chunks
 * is scored once, using its first chunk that yields a document title. The
 * best document is returned only if its score reaches FOCUSED_PRODUCT_MIN_SCORE
 * and leads the runner-up by at least FOCUSED_PRODUCT_MIN_GAP. Equal scores
 * are ordered by document id.
 *
 * This protects against false positives where neighbouring products,
 * indicators or safety sheets outrank the actually requested device.
 *
 * @return string|null the focused document id, or null when no document qualifies
 */
private function resolveFocusedSalesDocumentId(string $prompt, array $chunkIds, array $rows): ?string
{
    $profile = $this->buildPromptProductProfile($prompt);
    if ($profile['anchors'] === []) {
        return null;
    }

    $scored = [];
    $scoredDocs = [];

    foreach (array_slice($chunkIds, 0, self::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) {
        $row = $rows[$chunkId] ?? null;
        if (!is_array($row)) {
            continue;
        }

        $documentId = $row['document_id'] ?? null;
        $isNewDocument = is_string($documentId) && $documentId !== '' && !isset($scoredDocs[$documentId]);
        if (!$isNewDocument) {
            continue;
        }

        // A chunk without a title does not mark the document as seen, so a
        // later chunk of the same document can still provide the title.
        $title = $this->extractDocumentTitle($row);
        if ($title === '') {
            continue;
        }

        $scoredDocs[$documentId] = true;
        $scored[] = [
            'document_id' => $documentId,
            'score' => $this->scoreFocusedProductCandidate($profile, $title, $row, $rank),
        ];
    }

    if ($scored === []) {
        return null;
    }

    usort(
        $scored,
        static fn (array $left, array $right): int => ($right['score'] <=> $left['score'])
            ?: strcmp((string)$left['document_id'], (string)$right['document_id'])
    );

    $topScore = (float)$scored[0]['score'];
    // With a single candidate the lead is infinite and only the minimum score applies.
    $secondScore = (float)($scored[1]['score'] ?? -INF);

    $isStrongEnough = $topScore >= self::FOCUSED_PRODUCT_MIN_SCORE;
    $isClearlyAhead = ($topScore - $secondScore) >= self::FOCUSED_PRODUCT_MIN_GAP;

    return $isStrongEnough && $isClearlyAhead ? $scored[0]['document_id'] : null;
}
/**
 * Builds the prompt profile used for focused product dominance decisions.
 *
 * Token classification (generic product tokens are ignored entirely):
 * - tokens containing a digit become anchor, number token and family token
 * - short model codes become anchor and family token
 * - other tokens of at least three characters become anchors, and family
 *   tokens as well when they are family descriptors
 *
 * asks_device is also true when the prompt asks for neither reagent,
 * document nor safety information.
 *
 * @return array{
 *     normalized:string,
 *     anchors:string[],
 *     family_tokens:string[],
 *     number_tokens:string[],
 *     asks_reagent:bool,
 *     asks_document:bool,
 *     asks_safety:bool,
 *     asks_device:bool
 * }
 */
private function buildPromptProductProfile(string $prompt): array
{
    $normalized = $this->normalizeText($prompt);
    $tokens = $this->tokenizeText($normalized);

    $asksReagent = $this->containsAnyToken($tokens, [
        'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb',
        'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde',
    ]);
    $asksDocument = $this->containsAnyToken($tokens, [
        'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung',
        'sdb', 'sicherheitsdatenblatt', 'msds',
    ]);
    $asksSafety = $this->containsAnyToken($tokens, [
        'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung',
        'transport', 'lagerung', 'piktogramm',
    ]);
    $mentionsDevice = $this->containsAnyToken($tokens, [
        'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat',
        'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
    ]);
    $asksNothingElse = !$asksReagent && !$asksDocument && !$asksSafety;

    $anchors = [];
    $familyTokens = [];
    $numberTokens = [];

    foreach ($tokens as $token) {
        if ($this->isGenericProductToken($token)) {
            continue;
        }

        $hasDigit = preg_match('/\d/u', $token) === 1;
        $isShortModel = !$hasDigit && $this->isImportantShortModelToken($token);

        // Plain words need at least three characters to act as an anchor.
        if (!$hasDigit && !$isShortModel && mb_strlen($token, 'UTF-8') < 3) {
            continue;
        }

        $anchors[] = $token;
        if ($hasDigit) {
            $numberTokens[] = $token;
        }
        if ($hasDigit || $isShortModel || $this->isFamilyDescriptorToken($token)) {
            $familyTokens[] = $token;
        }
    }

    return [
        'normalized' => $normalized,
        'anchors' => array_values(array_unique($anchors)),
        'family_tokens' => array_values(array_unique($familyTokens)),
        'number_tokens' => array_values(array_unique($numberTokens)),
        'asks_reagent' => $asksReagent,
        'asks_document' => $asksDocument,
        'asks_safety' => $asksSafety,
        'asks_device' => $mentionsDevice || $asksNothingElse,
    ];
}
/**
* Scores one candidate document for focused product selection.
*
* The score is additive; higher means the document title fits the prompt better:
* - rank bonus: 5 minus the rank of the scored chunk, never below 0
* - +24 when the whole normalized title occurs in the prompt as a space-delimited phrase
* - per prompt anchor: bonus when it is a title token, penalty otherwise
* - per number token and per family token: bonus when it is a title token, penalty otherwise
* - device questions penalize reagent/accessory and safety documents
* - reagent, document and safety questions reward the matching document kind
* - -10 when not a single anchor matched
*
* Tokens containing a digit are listed as anchor, number token and family token
* by buildPromptProductProfile(), so they are weighted in all three loops.
*
* @param array $promptProfile profile as returned by buildPromptProductProfile()
* @param string $title document title of the candidate
* @param array $row chunk row of the candidate; its text feeds the document-kind heuristics
* @param int $rank position of the chunk within the inspected window
*/
private function scoreFocusedProductCandidate(array $promptProfile, string $title, array $row, int $rank): float
{
$titleNormalized = $this->normalizeText($title);
$titleTokens = $this->tokenizeText($titleNormalized);
$titleTokenMap = array_fill_keys($titleTokens, true);
$textNormalized = $this->normalizeText((string)($row['text'] ?? ''));
// Earlier ranks start with a small head start.
$score = max(0.0, 5.0 - $rank);
// Padding both sides with spaces restricts the match to whole words.
if ($titleNormalized !== '' && str_contains(' ' . $promptProfile['normalized'] . ' ', ' ' . $titleNormalized . ' ')) {
$score += 24.0;
}
$matchedAnchors = 0;
foreach ($promptProfile['anchors'] as $anchor) {
if (isset($titleTokenMap[$anchor])) {
$matchedAnchors++;
$score += $this->isImportantShortModelToken($anchor) ? 4.0 : 3.5;
continue;
}
// NOTE(review): anchors are single tokens and the token map is built from the same
// normalized title, so this fallback looks unreachable - confirm before relying on it.
if (str_contains(' ' . $titleNormalized . ' ', ' ' . $anchor . ' ')) {
$matchedAnchors++;
$score += 3.0;
continue;
}
// A missing family descriptor counts more heavily than a missing plain word.
$score -= $this->isFamilyDescriptorToken($anchor) ? 3.5 : 2.0;
}
foreach ($promptProfile['number_tokens'] as $numberToken) {
if (isset($titleTokenMap[$numberToken])) {
$score += 4.0;
} else {
$score -= 5.0;
}
}
foreach ($promptProfile['family_tokens'] as $familyToken) {
if (isset($titleTokenMap[$familyToken])) {
$score += 4.0;
} else {
$score -= 4.5;
}
}
// Both penalties can apply to the same document.
if ($promptProfile['asks_device']) {
if ($this->looksLikeReagentOrAccessoryDocument($row, $titleNormalized, $textNormalized)) {
$score -= 12.0;
}
if ($this->looksLikeSafetyDocument($row, $titleNormalized, $textNormalized)) {
$score -= 8.0;
}
}
if ($promptProfile['asks_reagent'] && $this->looksLikeReagentOrAccessoryDocument($row, $titleNormalized, $textNormalized)) {
$score += 6.0;
}
if (($promptProfile['asks_document'] || $promptProfile['asks_safety']) && $this->looksLikeSafetyDocument($row, $titleNormalized, $textNormalized)) {
$score += 4.0;
}
if ($matchedAnchors === 0) {
$score -= 10.0;
}
return $score;
}
/**
 * Selects chunks of the focused product document only.
 *
 * Delegates to the dominant-document selection with the limit lowered to
 * FOCUSED_PRODUCT_MAX_CHUNKS. Remaining slots are intentionally not filled
 * with neighbouring products, because that would reintroduce the mixing
 * this mode exists to prevent.
 *
 * @return string[]
 */
private function selectFocusedProductChunkIds(
    string $documentId,
    array $chunkIds,
    array $rows,
    int $limit
): array {
    $focusedLimit = min($limit, self::FOCUSED_PRODUCT_MAX_CHUNKS);

    return $this->selectDominantDocumentChunkIds($documentId, $chunkIds, $rows, $focusedLimit);
}
/**
 * Detects whether one document clearly dominates the first ranked window.
 *
 * Only chunks with non-empty text and a non-empty string document id inside
 * the first DOMINANT_DOC_WINDOW ranked ids are counted. A document is dominant
 * when it
 * - owns at least DOMINANT_DOC_MIN_HITS of those chunks, or
 * - owns at least two of them including the two highest-ranked ones.
 *
 * This is especially useful for product-sheet style documents where several
 * adjacent chunks belong together and should reach the model as one block.
 *
 * @return string|null the dominant document id, or null when no document dominates
 */
private function detectDominantTopDocument(array $chunkIds, array $rows): ?string
{
    $docWindow = [];
    foreach (array_slice($chunkIds, 0, self::DOMINANT_DOC_WINDOW) as $chunkId) {
        if (!isset($rows[$chunkId]['text'])) {
            continue;
        }

        $text = trim((string)$rows[$chunkId]['text']);
        $docId = $rows[$chunkId]['document_id'] ?? null;
        if ($text === '' || !is_string($docId) || $docId === '') {
            continue;
        }

        $docWindow[] = $docId;
    }

    if (count($docWindow) < 2) {
        return null;
    }

    $counts = array_count_values($docWindow);
    arsort($counts);

    // PHP stores integer-like string keys ("42") as int keys, so the key must be
    // cast back; an is_string() check here would reject every such document id.
    $dominantDocId = (string)array_key_first($counts);
    $dominantCount = $counts[$dominantDocId];

    if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) {
        return $dominantDocId;
    }

    $leadsWindow = $docWindow[0] === $dominantDocId && $docWindow[1] === $dominantDocId;

    return $dominantCount >= 2 && $leadsWindow ? $dominantDocId : null;
}
/**
 * Selects a coherent chunk window from the dominant document.
 *
 * - the anchor is the highest-ranked chunk of the document that has an integer chunk_index
 * - chunks closest to the anchor index are preferred, ties go to the better rank
 * - at most min($limit, DOMINANT_DOC_MAX_CHUNKS) chunks are kept
 * - the kept chunks are returned ordered by chunk_index (chunks without one go last)
 *
 * @return string[]
 */
private function selectDominantDocumentChunkIds(
    string $documentId,
    array $chunkIds,
    array $rows,
    int $limit
): array {
    $hits = [];
    $anchorIndex = null;

    foreach ($chunkIds as $rank => $chunkId) {
        $row = $rows[$chunkId] ?? null;
        if (!isset($row['text'])) {
            continue;
        }
        if (trim((string)$row['text']) === '' || ($row['document_id'] ?? null) !== $documentId) {
            continue;
        }

        $index = is_int($row['chunk_index'] ?? null) ? $row['chunk_index'] : null;
        $anchorIndex ??= $index;

        $hits[] = [
            'id' => (string)$chunkId,
            'rank' => $rank,
            'chunk_index' => $index,
        ];
    }

    if ($hits === []) {
        return [];
    }

    // Without an anchor every hit lacks an index, so all distances tie and the
    // order falls back to rank alone.
    $distance = static fn (array $hit) => $anchorIndex === null || $hit['chunk_index'] === null
        ? PHP_INT_MAX
        : abs($hit['chunk_index'] - $anchorIndex);

    usort(
        $hits,
        static fn (array $left, array $right): int => [$distance($left), $left['rank']]
            <=> [$distance($right), $right['rank']]
    );

    $window = array_slice($hits, 0, min($limit, self::DOMINANT_DOC_MAX_CHUNKS));

    // Document order for prompt coherence: indexed chunks first, then by index, then by rank.
    $readingOrder = static fn (array $hit): array => [
        $hit['chunk_index'] === null,
        $hit['chunk_index'] ?? 0,
        $hit['rank'],
    ];
    usort(
        $window,
        static fn (array $left, array $right): int => $readingOrder($left) <=> $readingOrder($right)
    );

    return array_column($window, 'id');
}
/**
 * Fills the remaining sales slots after a dominant document selection.
 *
 * The seed ids stay fixed at the front (deduplicated, cut to $limit if
 * needed). Further ranked ids are appended under the spread rules, with the
 * seeds already counted: at most MAX_CHUNKS_PER_DOC chunks per document and
 * at least MIN_CHUNK_DISTANCE between indexed chunks of the same document.
 *
 * @return string[]
 */
private function fillRemainingSalesChunkIds(
    array $seedChunkIds,
    array $chunkIds,
    array $rows,
    int $limit
): array {
    $out = array_values(array_unique(array_map(strval(...), $seedChunkIds)));
    if (count($out) >= $limit) {
        return array_slice($out, 0, $limit);
    }

    $selected = array_fill_keys($out, true);
    $docCounter = [];
    $docChunkPositions = [];

    // Books one accepted chunk against its document.
    $register = static function (string $docId, mixed $chunkIndex) use (&$docCounter, &$docChunkPositions): void {
        $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
        if (is_int($chunkIndex)) {
            $docChunkPositions[$docId][] = $chunkIndex;
        }
    };

    foreach ($out as $seedId) {
        $seedDocId = $rows[$seedId]['document_id'] ?? null;
        if (is_string($seedDocId) && $seedDocId !== '') {
            $register($seedDocId, $rows[$seedId]['chunk_index'] ?? null);
        }
    }

    foreach ($chunkIds as $chunkId) {
        if (isset($selected[$chunkId])) {
            continue;
        }

        $row = $rows[$chunkId] ?? null;
        $docId = $row['document_id'] ?? null;
        if (!isset($row['text']) || !is_string($docId) || $docId === '') {
            continue;
        }
        if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
            continue;
        }

        $chunkIndex = $row['chunk_index'] ?? null;
        if (is_int($chunkIndex)) {
            foreach ($docChunkPositions[$docId] ?? [] as $takenIndex) {
                if (abs($takenIndex - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
                    continue 2;
                }
            }
        }

        if (trim((string)$row['text']) === '') {
            continue;
        }

        $out[] = (string)$chunkId;
        $selected[$chunkId] = true;
        $register($docId, $chunkIndex);

        if (count($out) >= $limit) {
            break;
        }
    }

    return $out;
}
/**
 * Default spread selection for sales-oriented queries.
 *
 * Walks the ranked ids in order and accepts a chunk when
 * - it has non-empty text and a non-empty string document id
 * - its document has fewer than MAX_CHUNKS_PER_DOC accepted chunks
 * - its integer chunk_index keeps at least MIN_CHUNK_DISTANCE to every
 *   already accepted chunk of the same document
 *
 * Only accepted chunks reserve a position. A chunk rejected for empty text
 * must not block its neighbours, which matches fillRemainingSalesChunkIds().
 *
 * @return string[]
 */
private function selectSalesChunkIdsSpread(array $chunkIds, array $rows, int $limit): array
{
    $out = [];
    $docCounter = [];
    $docChunkPositions = [];

    foreach ($chunkIds as $chunkId) {
        $row = $rows[$chunkId] ?? null;
        if (!isset($row['text'])) {
            continue;
        }

        $docId = $row['document_id'] ?? null;
        if (!is_string($docId) || $docId === '') {
            continue;
        }

        // Checked before any bookkeeping so that an unusable chunk leaves no trace.
        if (trim((string)$row['text']) === '') {
            continue;
        }

        if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
            continue;
        }

        $chunkIndex = $row['chunk_index'] ?? null;
        if (is_int($chunkIndex)) {
            foreach ($docChunkPositions[$docId] ?? [] as $takenIndex) {
                if (abs($takenIndex - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
                    continue 2;
                }
            }
            $docChunkPositions[$docId][] = $chunkIndex;
        }

        $out[] = (string)$chunkId;
        $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;

        if (count($out) >= $limit) {
            break;
        }
    }

    return $out;
}
/**
 * Determines the document title of a chunk row.
 *
 * The metadata entry "document_title" is preferred. Otherwise the first
 * "# Produkt Titel: ..." heading line in the chunk text is used, with
 * optional backticks around the title removed. Returns an empty string when
 * neither source yields a title.
 */
private function extractDocumentTitle(array $row): string
{
    $fromMetadata = $row['metadata']['document_title'] ?? null;
    if (is_string($fromMetadata)) {
        $fromMetadata = trim($fromMetadata);
        if ($fromMetadata !== '') {
            return $fromMetadata;
        }
    }

    $text = (string)($row['text'] ?? '');
    if ($text === '') {
        return '';
    }

    $found = preg_match('/^#\s*Produkt\s+Titel:\s*`?([^`\n]+)`?/imu', $text, $matches) === 1;

    return $found ? trim((string)($matches[1] ?? '')) : '';
}
/**
 * Normalizes text for token-based product comparisons.
 *
 * The value is trimmed and lower-cased, "-", "/" and "_" become spaces, every
 * other run of characters that is neither letter, digit nor whitespace
 * becomes a space, and whitespace runs collapse to a single space. If a
 * regex step fails, its input is carried forward unchanged.
 */
private function normalizeText(string $value): string
{
    $lowered = mb_strtolower(trim($value), 'UTF-8');
    $separated = str_replace(['-', '/', '_'], ' ', $lowered);
    $stripped = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $separated) ?? $separated;
    $collapsed = preg_replace('/\s+/u', ' ', $stripped) ?? $stripped;

    return trim($collapsed);
}
/**
 * Splits normalized text into tokens at whitespace.
 *
 * An empty input, and a failing split, both produce an empty list.
 *
 * @return string[]
 */
private function tokenizeText(string $value): array
{
    if ($value === '') {
        return [];
    }

    $parts = preg_split('/\s+/u', $value, -1, PREG_SPLIT_NO_EMPTY);

    return $parts === false || $parts === [] ? [] : $parts;
}
/**
 * Returns true when at least one of the tokens equals one of the given words.
 *
 * @param string[] $tokens
 * @param string[] $needles
 */
private function containsAnyToken(array $tokens, array $needles): bool
{
    if ($tokens === [] || $needles === []) {
        return false;
    }

    $wanted = array_fill_keys($needles, true);
    foreach ($tokens as $token) {
        if (isset($wanted[$token])) {
            return true;
        }
    }

    return false;
}
/**
 * Tells whether a token is a generic word that must not drive product dominance decisions.
 *
 * Covers German articles, prepositions, conjunctions and generic product or
 * request words. The lookup map is built once and reused on later calls
 * instead of being rebuilt for every token.
 */
private function isGenericProductToken(string $token): bool
{
    static $genericMap = null;

    $genericMap ??= array_fill_keys([
        'der', 'die', 'das', 'ein', 'eine', 'einen', 'einem', 'und', 'oder', 'mit',
        'fuer', 'für', 'von', 'im', 'in', 'am', 'an', 'auf', 'zu', 'zum', 'zur',
        'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät',
        'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte',
        'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung',
        'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend',
        'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
        'welches', 'brauche', 'suche', 'bitte', 'gegen',
    ], true);

    return isset($genericMap[$token]);
}
/**
 * Tells whether a token is one of the short technical model codes
 * (th, tc, tp, tm, ph, rx) that are allowed as anchors despite their length.
 */
private function isImportantShortModelToken(string $token): bool
{
    return match ($token) {
        'th', 'tc', 'tp', 'tm', 'ph', 'rx' => true,
        default => false,
    };
}
/**
 * Tells whether a token is a strong product-family differentiator.
 *
 * That is the case for tokens containing a digit, for the short model codes
 * and for the known family words such as "evo" or "compact".
 */
private function isFamilyDescriptorToken(string $token): bool
{
    if (preg_match('/\d/u', $token) === 1) {
        return true;
    }
    if ($this->isImportantShortModelToken($token)) {
        return true;
    }

    $familyWords = [
        'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
        'inline', 'compact', 'panel', 'sc',
    ];

    return in_array($token, $familyWords, true);
}
/**
 * Heuristic classifier for indicator, reagent, accessory and spare-part documents.
 *
 * Returns true when the normalized title or text contains one of the marker
 * words. Matching is by substring, so a marker also matches inside a longer
 * word. The $row argument is not consulted.
 */
private function looksLikeReagentOrAccessoryDocument(array $row, string $titleNormalized, string $textNormalized): bool
{
    $haystack = trim($titleNormalized . ' ' . $textNormalized);
    if ($haystack === '') {
        return false;
    }

    // NOTE(review): short markers such as "ufi" or "sdb" can also hit inside
    // unrelated words - confirm that this is acceptable for the corpus.
    $markers = [
        'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
        'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
        'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
        'kerzenfilter', 'druckregler',
    ];

    $found = array_filter(
        $markers,
        static fn (string $marker): bool => str_contains($haystack, $marker)
    );

    return $found !== [];
}
/**
 * Heuristic classifier for safety-style documents.
 *
 * Returns true when the normalized title or text contains one of the marker
 * words. Matching is by substring, so a marker also matches inside a longer
 * word. The $row argument is not consulted.
 */
private function looksLikeSafetyDocument(array $row, string $titleNormalized, string $textNormalized): bool
{
    $haystack = trim($titleNormalized . ' ' . $textNormalized);
    if ($haystack === '') {
        return false;
    }

    // NOTE(review): "transport", "lagerung" and the short codes can also occur in
    // ordinary product texts - confirm that this is acceptable for the corpus.
    $markers = [
        'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
        'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
        'kennzeichnung', 'h290', 'pbt', 'vpvb',
    ];

    $found = array_filter(
        $markers,
        static fn (string $marker): bool => str_contains($haystack, $marker)
    );

    return $found !== [];
}
/**
 * Converts selected chunk ids into the final plain text result list.
 *
 * The order of the ids is kept. Ids without a row text and rows whose text
 * is empty after trimming are left out.
 *
 * @return string[]
 */
private function collectTextsFromIds(array $chunkIds, array $rows): array
{
    $texts = [];

    foreach ($chunkIds as $chunkId) {
        $text = trim((string)($rows[$chunkId]['text'] ?? ''));
        if ($text !== '') {
            $texts[] = $text;
        }
    }

    return $texts;
}
}