1816 lines
56 KiB
PHP
1816 lines
56 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Knowledge\Retrieval;
|
|
|
|
use App\Catalog\EntityCatalogService;
|
|
use App\Config\NdjsonHybridRetrieverConfig;
|
|
use App\Entity\ModelGenerationConfig;
|
|
use App\Intent\CatalogIntentLite;
|
|
use App\Intent\IntentLite;
|
|
use App\Intent\SalesIntentLite;
|
|
use App\Repository\ModelGenerationConfigRepository;
|
|
use App\Routing\IntentRouteResolver;
|
|
use App\Tag\TagRoutingService;
|
|
use App\Vector\VectorSearchClient;
|
|
use Doctrine\DBAL\Exception;
|
|
use RuntimeException;
|
|
|
|
/**
|
|
* Hybrid retriever for NDJSON-based knowledge chunks.
|
|
*
|
|
* Main responsibilities:
|
|
* - detect high-level request intent
|
|
* - optionally short-circuit to catalog list output
|
|
* - resolve exact document-title matches before semantic retrieval
|
|
* - run vector retrieval globally and optionally document-scoped
|
|
* - fuse both result sets with RRF-style scoring
|
|
* - apply selection rules for list queries vs. sales-style queries
|
|
* - return either plain chunk texts or debug metadata
|
|
*/
|
|
final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|
{
|
|
|
|
/**
 * Wires the retriever's collaborators through constructor promotion.
 *
 * Each argument becomes a private property of this readonly class;
 * the constructor body itself performs no work.
 */
public function __construct(
    private NdjsonChunkLookup $lookup,
    private NdjsonKeywordRetriever $keywordRetriever,
    private VectorSearchClient $vectorClient,
    private TagRoutingService $tagRouting,
    private ModelGenerationConfigRepository $configRepository,
    private QueryCleaner $queryCleaner,
    private IntentLite $intentLite,
    private SalesIntentLite $salesIntentLite,
    private CatalogIntentLite $catalogIntent,
    private IntentRouteResolver $routeResolver,
    private EntityCatalogService $entityCatalogService,
    private QueryEnricher $queryEnricher,
    private NdjsonHybridRetrieverConfig $retrieverConfig,
) {
}
|
|
|
|
// =========================================================
|
|
// PUBLIC API
|
|
// =========================================================
|
|
|
|
/**
 * Runs the retrieval pipeline and returns plain text only.
 *
 * Result shape:
 * - a one-element array holding the catalog block when the pipeline
 *   produced one
 * - an empty array when no chunk ids were selected
 * - otherwise whatever collectTextsFromIds() builds from the selected
 *   ids and their rows
 *
 * The active generation config is loaded first; execution is requested
 * without raw-score capture.
 *
 * @throws Exception
 */
public function retrieve(string $prompt): array
{
    $outcome = $this->execute($prompt, $this->requireConfig(), false);

    $catalogBlock = $outcome['catalogBlock'];
    if ($catalogBlock !== null) {
        return [$catalogBlock];
    }

    $chunkIds = $outcome['selectedChunkIds'];

    return $chunkIds === []
        ? []
        : $this->collectTextsFromIds($chunkIds, $outcome['rows']);
}
|
|
|
|
/**
 * Runs the retrieval pipeline and returns one debug record per result.
 *
 * Each record carries: rank, chunk_id, document_id, chunk_index,
 * raw_score, rrf_score, threshold, intent, route, entity_label,
 * is_list_query, selection_mode and the trimmed chunk text.
 *
 * When the pipeline produced a catalog block, a single synthetic record
 * with chunk_id "__CATALOG_LIST__" is returned instead.
 *
 * Selected ids without a matching row are skipped and do not consume a
 * rank. A row without a "text" entry yields an empty text instead of
 * triggering an undefined-key warning.
 *
 * @param ModelGenerationConfig|null $config Config override; the active
 *                                           config is loaded when null.
 *
 * @return array<int,array<string,mixed>>
 *
 * @throws Exception
 */
public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
{
    $config ??= $this->requireConfig();
    $result = $this->execute($prompt, $config, true);

    if ($result['catalogBlock'] !== null) {
        return [[
            'rank' => 1,
            'chunk_id' => '__CATALOG_LIST__',
            'document_id' => null,
            'chunk_index' => null,
            'raw_score' => null,
            'rrf_score' => null,
            'threshold' => 0.0,
            'intent' => $result['intent'],
            'route' => $result['route'],
            'entity_label' => $result['entityLabel'],
            'is_list_query' => true,
            'selection_mode' => 'catalog_list',
            'text' => $result['catalogBlock'],
        ]];
    }

    $out = [];
    $rank = 0;

    // An empty selection simply produces an empty list.
    foreach ($result['selectedChunkIds'] as $chunkId) {
        $row = $result['rows'][$chunkId] ?? null;

        if ($row === null) {
            continue;
        }

        $rank++;

        $out[] = [
            'rank' => $rank,
            'chunk_id' => $chunkId,
            'document_id' => $row['document_id'] ?? null,
            'chunk_index' => $row['chunk_index'] ?? null,
            'raw_score' => $result['rawScores'][$chunkId] ?? null,
            'rrf_score' => $result['rrfScores'][$chunkId] ?? null,
            'threshold' => $result['threshold'],
            'intent' => $result['intent'],
            'route' => $result['route'],
            'entity_label' => $result['entityLabel'],
            'is_list_query' => $result['isListQuery'],
            'selection_mode' => $result['selectionMode'],
            // Default to '' so a row lacking "text" cannot raise a warning.
            'text' => trim((string)($row['text'] ?? '')),
        ];
    }

    return $out;
}
|
|
|
|
// =========================================================
|
|
// CENTRAL ORCHESTRATION
|
|
// =========================================================
|
|
|
|
/**
 * Central orchestration entry point shared by retrieve() and retrieveDebug().
 *
 * Decision order:
 *  1. Detect the catalog entity label and the sales intent, then resolve
 *     the route from both.
 *  2. A catalog-list route is downgraded to the normal route unless
 *     shouldUseCatalogListShortcut() approves the prompt.
 *  3. On a catalog-list route with a non-null entity label the catalog
 *     service is queried; a non-null block is returned immediately.
 *  4. If the lookup reports an exact document for the prompt, chunks are
 *     picked from that document only; a non-empty pick is returned
 *     immediately with synthetic scores and threshold 1.0.
 *  5. Otherwise runCore() runs and its ranked ids are reduced with the
 *     list strategy or the sales strategy.
 *
 * Every return value uses the same keys: route, entityLabel, intent,
 * isListQuery, selectionMode, selectedChunkIds, rows, rrfScores,
 * rawScores, threshold, catalogBlock.
 *
 * @throws Exception
 */
private function execute(
    string $prompt,
    ModelGenerationConfig $config,
    bool $withScores
): array {
    $entityLabel = $this->catalogIntent->detect($prompt);
    $salesIntent = $this->detectSalesIntent($prompt);
    $route = $this->routeResolver->resolve($salesIntent, $entityLabel);

    $catalogRouteRequested = $route === IntentRouteResolver::ROUTE_CATALOG_LIST;
    if ($catalogRouteRequested && !$this->shouldUseCatalogListShortcut($prompt, $salesIntent)) {
        $route = IntentRouteResolver::ROUTE_NORMAL;
    }

    // Fields that are identical in every result variant below.
    $head = [
        'route' => $route,
        'entityLabel' => $entityLabel,
        'intent' => $salesIntent,
    ];

    if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
        $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);

        if ($catalogBlock !== null) {
            return [
                ...$head,
                'isListQuery' => true,
                'selectionMode' => 'catalog_list',
                'selectedChunkIds' => [],
                'rows' => [],
                'rrfScores' => [],
                'rawScores' => [],
                'threshold' => 0.0,
                'catalogBlock' => trim($catalogBlock),
            ];
        }
    }

    $exactDocumentMatch = $this->lookup->findBestExactDocumentByPrompt($prompt);

    if ($exactDocumentMatch !== null) {
        $chunkCap = max(
            1,
            min($config->getRetrievalMaxChunks(), $this->retrieverConfig->hardMaxChunks())
        );
        $exactChunkIds = $this->selectExactDocumentChunkIds(
            $exactDocumentMatch['rows'],
            $chunkCap,
            $prompt
        );

        // An empty pick falls through to the regular hybrid retrieval.
        if ($exactChunkIds !== []) {
            return [
                ...$head,
                'isListQuery' => false,
                'selectionMode' => 'exact_document_title',
                'selectedChunkIds' => $exactChunkIds,
                'rows' => $exactDocumentMatch['rows'],
                'rrfScores' => $this->buildExactDocumentScores($exactChunkIds),
                'rawScores' => [],
                'threshold' => 1.0,
                'catalogBlock' => null,
            ];
        }
    }

    $core = $this->runCore($prompt, $config, $withScores, $salesIntent);

    $nothingRetrieved = $core['ranked_chunk_ids'] === [] || $core['rows'] === [];
    if ($nothingRetrieved) {
        return [
            ...$head,
            'isListQuery' => $core['is_list_query'],
            'selectionMode' => null,
            'selectedChunkIds' => [],
            'rows' => [],
            'rrfScores' => [],
            'rawScores' => [],
            'threshold' => $core['threshold'],
            'catalogBlock' => null,
        ];
    }

    if ($core['is_list_query']) {
        $selectionMode = 'list_deduplicated';
        $selectedChunkIds = $this->selectListChunkIds(
            $core['ranked_chunk_ids'],
            $core['rows'],
            $core['limit']
        );
    } else {
        ['ids' => $selectedChunkIds, 'mode' => $selectionMode] = $this->selectSalesChunkIds(
            $prompt,
            $core['ranked_chunk_ids'],
            $core['rows'],
            $core['limit']
        );
    }

    return [
        ...$head,
        'isListQuery' => $core['is_list_query'],
        'selectionMode' => $selectionMode,
        'selectedChunkIds' => $selectedChunkIds,
        'rows' => $core['rows'],
        'rrfScores' => $core['rrf_scores'],
        'rawScores' => $core['raw_scores'],
        'threshold' => $core['threshold'],
        'catalogBlock' => null,
    ];
}
|
|
|
|
// =========================================================
|
|
// CORE PIPELINE
|
|
// =========================================================
|
|
|
|
/**
 * Hybrid retrieval core: vector search plus keyword search, fused into
 * one ranking.
 *
 * Flow:
 * - clamp the chunk limit and the base vector top-K to the hard caps
 * - ask the intent detector whether the prompt is a list query
 * - clean and enrich the prompt; an empty query ends the run early
 * - derive threshold and vector top-K
 * - route the query to candidate document ids (non-empty strings only,
 *   de-duplicated)
 * - run global vector + keyword search, and the scoped variants when
 *   candidate documents exist
 * - fuse all hit lists; with no fused score the run ends early
 * - sort descending by fused score and load the chunk rows
 *
 * Every return value has the keys limit, is_list_query, threshold,
 * ranked_chunk_ids, rows, rrf_scores and raw_scores.
 *
 * @throws Exception
 */
private function runCore(
    string $prompt,
    ModelGenerationConfig $config,
    bool $withScores,
    string $salesIntent
): array {
    $limit = max(1, min($config->getRetrievalMaxChunks(), $this->retrieverConfig->hardMaxChunks()));
    $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), $this->retrieverConfig->hardMaxVectorK()));

    $isListQuery = $this->intentLite->isListQuery($prompt);

    // Leading and trailing fragments shared by every result of this method.
    $resultHead = [
        'limit' => $limit,
        'is_list_query' => $isListQuery,
    ];
    $noRanking = [
        'ranked_chunk_ids' => [],
        'rows' => [],
        'rrf_scores' => [],
    ];

    $cleanQuery = $this->queryEnricher->enrichPrompt($this->queryCleaner->clean($prompt));

    if ($cleanQuery === '') {
        return [
            ...$resultHead,
            'threshold' => $this->retrieverConfig->vectorScoreThreshold(),
            ...$noRanking,
            'raw_scores' => [],
        ];
    }

    [$threshold, $topK] = $this->computeThresholdAndTopK($salesIntent, $isListQuery, $vectorTopKBase);

    $routedDocIds = $this->tagRouting->route($cleanQuery);
    $candidateDocIds = [];
    if (is_array($routedDocIds)) {
        $candidateDocIds = array_values(array_unique(array_filter(
            $routedDocIds,
            static fn(mixed $value): bool => is_string($value) && $value !== ''
        )));
    }

    $globalHits = $this->vectorClient->search($cleanQuery, $topK);

    $keywordTopK = $this->computeKeywordTopK($topK);
    $keywordHits = $this->keywordRetriever->search($cleanQuery, $keywordTopK);

    $scopedHits = [];
    $scopedKeywordHits = [];

    if ($candidateDocIds !== []) {
        $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
        $scopedKeywordHits = $this->keywordRetriever->search($cleanQuery, $keywordTopK, $candidateDocIds);
    }

    $allSourcesEmpty = $globalHits === []
        && $scopedHits === []
        && $keywordHits === []
        && $scopedKeywordHits === [];

    if ($allSourcesEmpty) {
        return [
            ...$resultHead,
            'threshold' => $threshold,
            ...$noRanking,
            'raw_scores' => [],
        ];
    }

    $fused = $this->fuseHits(
        $globalHits,
        $scopedHits,
        $keywordHits,
        $scopedKeywordHits,
        $threshold,
        $scopedHits !== [],
        $scopedKeywordHits !== [],
        $withScores
    );

    $rrfScores = $fused['rrf_scores'];
    $rawScores = $fused['raw_scores'];

    // The order-based fallback (fallbackRrfFromHits) is switched off here:
    // when thresholding removes every candidate the run ends without hits.
    if ($rrfScores === []) {
        return [
            ...$resultHead,
            'threshold' => $threshold,
            ...$noRanking,
            'raw_scores' => $rawScores,
        ];
    }

    arsort($rrfScores);
    $rankedChunkIds = array_keys($rrfScores);

    return [
        ...$resultHead,
        'threshold' => $threshold,
        'ranked_chunk_ids' => $rankedChunkIds,
        'rows' => $this->lookup->findByChunkIds($rankedChunkIds),
        'rrf_scores' => $rrfScores,
        'raw_scores' => $rawScores,
    ];
}
|
|
|
|
// =========================================================
|
|
// SUPPORT
|
|
// =========================================================
|
|
|
|
/**
 * Returns the active model generation config.
 *
 * @throws RuntimeException when the repository reports no active config
 */
private function requireConfig(): ModelGenerationConfig
{
    return $this->configRepository->findActiveForModel()
        ?? throw new RuntimeException('No active ModelGenerationConfig found.');
}
|
|
|
|
/**
 * Reads the "intent" entry from the sales intent detector's payload.
 *
 * A missing or null entry is replaced by SalesIntentLite::DISCOVERY;
 * the value is always returned as a string.
 */
private function detectSalesIntent(string $prompt): string
{
    $payload = $this->salesIntentLite->detect($prompt);
    $intent = $payload['intent'] ?? SalesIntentLite::DISCOVERY;

    return (string)$intent;
}
|
|
|
|
/**
 * Decides whether a catalog-list route may bypass normal retrieval.
 *
 * The shortcut is reserved for genuine list/catalog requests. A factual
 * question (for example "what is the lowest threshold") has to go
 * through regular retrieval, otherwise the caller would answer with a
 * product list instead of the requested value.
 *
 * Rules, in order:
 * - any sales intent other than DISCOVERY rejects the shortcut
 * - a prompt the intent detector classifies as list query accepts it
 * - otherwise the normalized prompt must be non-empty and match one of
 *   the configured catalog shortcut patterns
 */
private function shouldUseCatalogListShortcut(string $prompt, string $salesIntent): bool
{
    if ($salesIntent !== SalesIntentLite::DISCOVERY) {
        return false;
    }

    if ($this->intentLite->isListQuery($prompt)) {
        return true;
    }

    $normalizedPrompt = $this->normalizeText($prompt);

    return $normalizedPrompt !== ''
        && $this->matchesAnyPattern(
            $normalizedPrompt,
            $this->retrieverConfig->catalogListShortcutPatterns()
        );
}
|
|
|
|
/**
 * Derives the keyword top-K from the vector top-K.
 *
 * Keyword search is cheap and serves as a factual safety net for
 * numbers, ranges, thresholds and exact technical terms, so it is allowed
 * to look wider than vector search: the vector top-K is multiplied by the
 * configured factor, rounded up, capped at the hard keyword maximum and
 * never drops below 1.
 */
private function computeKeywordTopK(int $vectorTopK): int
{
    $scaled = (int) ceil($vectorTopK * $this->retrieverConfig->keywordTopKMultiplier());
    $capped = min($scaled, $this->retrieverConfig->hardMaxKeywordK());

    return max(1, $capped);
}
|
|
|
|
/**
 * Derives the score threshold and the vector top-K for one query.
 *
 * - OBJECTION and PRICING intents raise the configured threshold by 0.02
 * - list queries multiply the base top-K by the configured list bonus
 *   (rounded to the nearest integer)
 * - top-K is clamped to [1, hardMaxVectorK]
 * - the threshold is clamped to [thresholdFloor, thresholdCeil]
 *
 * @return array{0:mixed,1:int} threshold first, top-K second
 */
private function computeThresholdAndTopK(
    string $salesIntent,
    bool $isListQuery,
    int $vectorTopKBase
): array {
    $threshold = $this->retrieverConfig->vectorScoreThreshold();
    $stricterIntents = [SalesIntentLite::OBJECTION, SalesIntentLite::PRICING];

    if (in_array($salesIntent, $stricterIntents, true)) {
        $threshold += 0.02;
    }

    $topK = $isListQuery
        ? (int)round($vectorTopKBase * $this->retrieverConfig->listBonus())
        : $vectorTopKBase;

    $clampedTopK = max(1, min($topK, $this->retrieverConfig->hardMaxVectorK()));
    $clampedThreshold = max(
        $this->retrieverConfig->thresholdFloor(),
        min($this->retrieverConfig->thresholdCeil(), $threshold)
    );

    return [$clampedThreshold, $clampedTopK];
}
|
|
|
|
/**
 * Fuses up to four hit lists into one RRF-style score map.
 *
 * For every list:
 * - hits without "chunk_id" or "score" are ignored
 * - hits whose score is below the list's threshold are ignored
 * - each accepted hit adds weight / (rrfK + rank) to its chunk, where
 *   rank counts accepted hits of that list only, starting at 1
 *
 * Vector lists use $vectorThreshold, keyword lists use the configured
 * keyword threshold. Scoped lists use their configured scoped weight
 * when the matching boost flag is set.
 *
 * When $captureRaw is true, the highest raw score seen per chunk is
 * recorded. The maximum is seeded with the first observed score, so
 * negative scores are reported as they are rather than as 0.0.
 *
 * @return array{rrf_scores:array<string|int,float>,raw_scores:array<string|int,float>}
 */
private function fuseHits(
    array $globalHits,
    array $scopedHits,
    array $keywordHits,
    array $scopedKeywordHits,
    float $vectorThreshold,
    bool $boostScopedVector,
    bool $boostScopedKeyword,
    bool $captureRaw
): array {
    $rrfScores = [];
    $rawScores = [];

    // Loop-invariant: read the RRF constant once for all four lists.
    $rrfK = $this->retrieverConfig->rrfK();

    $apply = static function (array $hits, float $threshold, float $weight) use (
        &$rrfScores,
        &$rawScores,
        $captureRaw,
        $rrfK
    ): void {
        $rank = 0;

        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }

            $raw = (float)$hit['score'];

            if ($raw < $threshold) {
                continue;
            }

            $chunkId = (string)$hit['chunk_id'];

            if ($captureRaw) {
                $rawScores[$chunkId] = isset($rawScores[$chunkId])
                    ? max($rawScores[$chunkId], $raw)
                    : $raw;
            }

            // Rank advances only for accepted hits.
            $rank++;

            $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + (1.0 / ($rrfK + $rank)) * $weight;
        }
    };

    $keywordThreshold = $this->retrieverConfig->keywordScoreThreshold();

    $apply($globalHits, $vectorThreshold, 1.0);
    $apply(
        $scopedHits,
        $vectorThreshold,
        $boostScopedVector ? $this->retrieverConfig->scopedVectorRrfWeight() : 1.0
    );
    $apply($keywordHits, $keywordThreshold, $this->retrieverConfig->keywordRrfWeight());
    $apply(
        $scopedKeywordHits,
        $keywordThreshold,
        $boostScopedKeyword
            ? $this->retrieverConfig->scopedKeywordRrfWeight()
            : $this->retrieverConfig->keywordRrfWeight()
    );

    return [
        'rrf_scores' => $rrfScores,
        'raw_scores' => $rawScores,
    ];
}
|
|
|
|
/**
 * Builds an RRF score map from hit order alone, ignoring raw scores.
 *
 * Hits without "chunk_id" are skipped and do not consume a rank. The
 * loop stops once the accepted-hit count reaches the configured
 * fallback top-N.
 *
 * NOTE(review): the only call visible in this file (inside runCore) is
 * commented out, so this helper appears to be unused at the moment.
 *
 * @return array<string|int,float>
 */
private function fallbackRrfFromHits(array $hits): array
{
    $scores = [];
    $position = 0;

    foreach ($hits as $hit) {
        if (isset($hit['chunk_id'])) {
            $position++;
            $scores[(string)$hit['chunk_id']] = 1.0 / ($this->retrieverConfig->rrfK() + $position);

            if ($position >= $this->retrieverConfig->emptyRrfFallbackTopN()) {
                break;
            }
        }
    }

    return $scores;
}
|
|
|
|
/**
 * Picks the most relevant chunks of one exactly matched document.
 *
 * Taking the first N chunks is not enough for follow-up questions: the
 * title pins the document, but the answer may sit in a later chunk (an
 * indicator, range, threshold, interface, relay or error code). So the
 * rows are scored against the prompt and only then cut to size:
 *
 * - rows are put in chunk-index order
 * - the cap is the smaller of $limit and the configured exact-document
 *   maximum
 * - with no usable prompt tokens the first chunks are returned
 * - every row with a non-empty string id and non-empty text is scored:
 *   +6 per matching token containing a digit, +5 per matching configured
 *   detail token, +2 per other matching token, plus the detail-focus
 *   bonus and a small position bonus that shrinks by 0.05 per row
 * - the best-scoring rows are kept (ties: earlier row first) and then
 *   put back into document order for readability
 *
 * @param array<string,array<string,mixed>> $rows
 * @return string[]
 */
private function selectExactDocumentChunkIds(array $rows, int $limit, string $prompt): array
{
    $documentRows = $this->sortRowsByChunkIndex($rows);
    $cap = min($limit, $this->retrieverConfig->exactDocumentMaxChunks());

    if ($cap <= 0 || $documentRows === []) {
        return [];
    }

    $promptTokens = $this->expandExactSelectionTokenVariants(
        $this->buildExactDocumentSelectionTokens($prompt)
    );

    if ($promptTokens === []) {
        return $this->firstChunkIdsFromRows($documentRows, $cap);
    }

    $detailFocus = $this->buildExactDocumentDetailFocus($prompt);
    $candidates = [];

    foreach ($documentRows as $position => $row) {
        $chunkId = $row['chunk_id'] ?? null;
        $text = trim((string)($row['text'] ?? ''));

        if (!is_string($chunkId) || $chunkId === '' || $text === '') {
            continue;
        }

        $haystack = $this->normalizeText($this->extractDocumentTitle($row) . ' ' . $text);
        $haystackTokens = array_fill_keys(
            $this->expandExactSelectionTokenVariants($this->tokenizeText($haystack)),
            true
        );

        $score = 0.0;

        foreach ($promptTokens as $token) {
            if (!isset($haystackTokens[$token])) {
                continue;
            }

            $score += match (true) {
                preg_match('/\d/u', $token) === 1 => 6.0,
                $this->isExactDetailToken($token) => 5.0,
                default => 2.0,
            };
        }

        $score += $this->scoreExactDocumentDetailFocus($detailFocus, $haystack, $text);

        // Early chunks stay slightly competitive for overview facts
        // without hiding strongly matching detail chunks.
        $score += max(0.0, 1.0 - ($position * 0.05));

        $chunkIndex = $row['chunk_index'] ?? null;

        $candidates[] = [
            'id' => $chunkId,
            'score' => $score,
            'order' => $position,
            'chunk_index' => is_int($chunkIndex) ? $chunkIndex : null,
        ];
    }

    if ($candidates === []) {
        return [];
    }

    // Best score first; equal scores keep document order.
    usort(
        $candidates,
        static fn(array $left, array $right): int
            => ($right['score'] <=> $left['score']) ?: ($left['order'] <=> $right['order'])
    );

    $selected = array_slice($candidates, 0, $cap);

    // Back into document order: known chunk indexes ascending, rows
    // without an index last, remaining ties by position.
    usort($selected, static function (array $left, array $right): int {
        $leftIndex = $left['chunk_index'];
        $rightIndex = $right['chunk_index'];

        if ($leftIndex !== null && $rightIndex !== null && $leftIndex !== $rightIndex) {
            return $leftIndex <=> $rightIndex;
        }

        if (($leftIndex === null) !== ($rightIndex === null)) {
            return $leftIndex === null ? 1 : -1;
        }

        return $left['order'] <=> $right['order'];
    });

    return array_column($selected, 'id');
}
|
|
|
|
/**
 * Returns the rows as a re-indexed list in chunk-index order.
 *
 * Rows whose "chunk_index" is not an integer sort last; rows with the
 * same index are ordered by byte-wise comparison of their "chunk_id".
 *
 * @param array<string,array<string,mixed>> $rows
 * @return array<int,array<string,mixed>>
 */
private function sortRowsByChunkIndex(array $rows): array
{
    $indexOf = static function (array $row): int {
        $index = $row['chunk_index'] ?? null;

        return is_int($index) ? $index : PHP_INT_MAX;
    };

    // usort discards the original keys, which yields the list directly.
    usort(
        $rows,
        static fn(array $left, array $right): int
            => ($indexOf($left) <=> $indexOf($right))
                ?: strcmp((string)($left['chunk_id'] ?? ''), (string)($right['chunk_id'] ?? ''))
    );

    return $rows;
}
|
|
|
|
/**
 * Collects chunk ids from the front of the row list.
 *
 * Only rows with a non-empty string "chunk_id" and non-blank "text" are
 * taken. Collection stops as soon as the number of ids reaches $limit.
 *
 * @param array<int,array<string,mixed>> $rows
 * @return string[]
 */
private function firstChunkIdsFromRows(array $rows, int $limit): array
{
    $ids = [];

    foreach ($rows as $row) {
        $id = $row['chunk_id'] ?? null;
        $hasText = trim((string)($row['text'] ?? '')) !== '';

        if (is_string($id) && $id !== '' && $hasText) {
            $ids[] = $id;

            if (count($ids) >= $limit) {
                break;
            }
        }
    }

    return $ids;
}
|
|
|
|
/**
 * Extracts the prompt tokens used to score chunks of an exact document.
 *
 * The prompt is normalized and tokenized. Tokens on the configured
 * generic list are dropped; of the rest, a token is kept when it
 * contains a digit or is at least three characters long. Duplicates are
 * removed.
 *
 * @return string[]
 */
private function buildExactDocumentSelectionTokens(string $prompt): array
{
    $kept = [];

    foreach ($this->tokenizeText($this->normalizeText($prompt)) as $token) {
        if ($this->isGenericExactSelectionToken($token)) {
            continue;
        }

        $isInformative = preg_match('/\d/u', $token) === 1
            || mb_strlen($token, 'UTF-8') >= 3;

        if ($isInformative) {
            $kept[] = $token;
        }
    }

    return array_values(array_unique($kept));
}
|
|
|
|
/**
 * Replaces every token by the full set of its variants.
 *
 * The variants of all tokens are concatenated in token order; empty
 * strings and duplicates are removed and the result is re-indexed.
 *
 * @param string[] $tokens
 * @return string[]
 */
private function expandExactSelectionTokenVariants(array $tokens): array
{
    $expanded = [];

    foreach ($tokens as $token) {
        array_push($expanded, ...$this->exactSelectionTokenVariants($token));
    }

    $nonEmpty = array_filter(
        $expanded,
        static fn(string $token): bool => $token !== ''
    );

    return array_values(array_unique($nonEmpty));
}
|
|
|
|
/**
 * Returns a token together with its stem and configured variants.
 *
 * - the trimmed token itself always comes first; a blank token yields []
 * - tokens of five or more characters are stemmed by removing each
 *   matching suffix from a fixed list; a stem is kept when at least
 *   three characters remain. Several suffixes can match the same token,
 *   and each contributes its own stem.
 *   NOTE(review): the suffix list looks like German inflection endings;
 *   confirm against the corpus language.
 * - for every configured prefix the token starts with, all variants
 *   configured for that prefix are appended
 *
 * Prefixes are array keys of the config map. PHP stores numeric-string
 * keys as integers, so each key is cast back to string before it is
 * passed to str_starts_with(), which rejects integers under strict types.
 *
 * @return string[]
 */
private function exactSelectionTokenVariants(string $token): array
{
    $token = trim($token);

    if ($token === '') {
        return [];
    }

    $variants = [$token];
    $length = mb_strlen($token, 'UTF-8');

    if ($length >= 5) {
        foreach (['typen', 'innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'] as $suffix) {
            if (!str_ends_with($token, $suffix)) {
                continue;
            }

            $stem = mb_substr($token, 0, $length - mb_strlen($suffix, 'UTF-8'), 'UTF-8');

            if (mb_strlen($stem, 'UTF-8') >= 3) {
                $variants[] = $stem;
            }
        }
    }

    foreach ($this->retrieverConfig->exactSelectionTokenVariantPrefixes() as $prefix => $configuredVariants) {
        // A key such as "4000" arrives here as int 4000.
        if (!str_starts_with($token, (string)$prefix)) {
            continue;
        }

        foreach ($configuredVariants as $variant) {
            $variants[] = $variant;
        }
    }

    return array_values(array_unique($variants));
}
|
|
|
|
/**
 * Detects which kind of detail the prompt asks for.
 *
 * Currently a single flag: "asks_indicator" is true when the expanded
 * prompt tokens contain one of the configured indicator question
 * tokens, or when the normalized prompt contains one of the configured
 * indicator question phrases.
 *
 * @return array{asks_indicator:bool}
 */
private function buildExactDocumentDetailFocus(string $prompt): array
{
    $normalizedPrompt = $this->normalizeText($prompt);
    $expandedTokens = $this->expandExactSelectionTokenVariants(
        $this->tokenizeText($normalizedPrompt)
    );
    $tokenSet = array_fill_keys($expandedTokens, true);

    $mentionsIndicatorToken = $this->containsAnyConfiguredToken(
        $tokenSet,
        $this->retrieverConfig->exactSelectionIndicatorQuestionTokens()
    );

    return [
        'asks_indicator' => $mentionsIndicatorToken || $this->containsAnyConfiguredPhrase(
            $normalizedPrompt,
            $this->retrieverConfig->exactSelectionIndicatorQuestionPhrases()
        ),
    ];
}
|
|
|
|
/**
 * Bonus score for chunks that look like the indicator detail asked for.
 *
 * Applies only when the prompt asks for an indicator; otherwise the
 * bonus is 0.0. This scoring is used on chunks of one already matched
 * document, so it cannot influence shop searches or broad product
 * discovery.
 *
 * Bonuses add up:
 * - +14 raw text matches a configured table heading pattern
 * - +10 raw text matches a configured table header pattern
 * - +8  raw text matches a configured table row pattern
 * - +5  normalized haystack contains a required primary term and a
 *       required context term
 *
 * @param array{asks_indicator:bool} $detailFocus
 */
private function scoreExactDocumentDetailFocus(array $detailFocus, string $normalizedHaystack, string $rawText): float
{
    if (!$detailFocus['asks_indicator']) {
        return 0.0;
    }

    $patternBonuses = [
        [$this->retrieverConfig->exactSelectionIndicatorTableHeadingPatterns(), 14.0],
        [$this->retrieverConfig->exactSelectionIndicatorTableHeaderPatterns(), 10.0],
        [$this->retrieverConfig->exactSelectionIndicatorTableRowPatterns(), 8.0],
    ];

    $bonus = 0.0;

    foreach ($patternBonuses as [$patterns, $points]) {
        if ($this->matchesAnyPattern($rawText, $patterns)) {
            $bonus += $points;
        }
    }

    $hasPrimaryTerm = $this->containsAnyConfiguredPhrase(
        $normalizedHaystack,
        $this->retrieverConfig->exactSelectionIndicatorTableRequiredPrimaryTerms()
    );

    $hasBothTerms = $hasPrimaryTerm && $this->containsAnyConfiguredPhrase(
        $normalizedHaystack,
        $this->retrieverConfig->exactSelectionIndicatorTableRequiredContextTerms()
    );

    if ($hasBothTerms) {
        $bonus += 5.0;
    }

    return $bonus;
}
|
|
|
|
/**
 * Tells whether the token is on the configured exact-detail token list
 * (strict comparison).
 */
private function isExactDetailToken(string $token): bool
{
    $detailTokens = $this->retrieverConfig->exactDetailTokens();

    return in_array($token, $detailTokens, true);
}
|
|
|
|
/**
 * Tells whether the token is on the configured generic-token list
 * (strict comparison).
 */
private function isGenericExactSelectionToken(string $token): bool
{
    $genericTokens = $this->retrieverConfig->genericExactSelectionTokens();

    return in_array($token, $genericTokens, true);
}
|
|
|
|
/**
 * Tells whether the value matches at least one of the regex patterns.
 *
 * Patterns are tried in order and the first match ends the search.
 * An empty pattern list never matches.
 *
 * @param string[] $patterns complete PCRE patterns including delimiters
 */
private function matchesAnyPattern(string $value, array $patterns): bool
{
    foreach ($patterns as $regex) {
        $matchCount = preg_match($regex, $value);

        if ($matchCount === 1) {
            return true;
        }
    }

    return false;
}
|
|
|
|
/**
 * Tells whether at least one needle is a key of the token set.
 *
 * @param array<string, bool> $tokens token set keyed by token
 * @param string[] $needles
 */
private function containsAnyConfiguredToken(array $tokens, array $needles): bool
{
    $found = false;

    foreach ($needles as $candidate) {
        if (isset($tokens[$candidate])) {
            $found = true;
            break;
        }
    }

    return $found;
}
|
|
|
|
/**
 * Tells whether the haystack contains at least one of the phrases.
 *
 * Empty phrases are ignored, since they would match every haystack.
 *
 * @param string[] $phrases
 */
private function containsAnyConfiguredPhrase(string $haystack, array $phrases): bool
{
    foreach ($phrases as $candidate) {
        if ($candidate === '') {
            continue;
        }

        if (str_contains($haystack, $candidate)) {
            return true;
        }
    }

    return false;
}
|
|
|
|
/**
 * Produces synthetic scores for the exact-title fast path.
 *
 * The n-th chunk id (counting from 1) gets the score 1 / n. The
 * scores exist only so that debug output has the same shape as the
 * regular retrieval path.
 *
 * @param string[] $chunkIds
 * @return array<string,float>
 */
private function buildExactDocumentScores(array $chunkIds): array
{
    $scores = [];
    $position = 0;

    foreach ($chunkIds as $chunkId) {
        $position++;
        $scores[(string)$chunkId] = 1.0 / $position;
    }

    return $scores;
}
|
|
|
|
/**
 * Chunk selection for list-style queries.
 *
 * Walks the ranked ids in order and keeps each chunk whose text has not
 * been seen yet. Two texts count as equal when they match after
 * trimming, collapsing whitespace runs to one space and lower-casing.
 * Ids without a row text, or with blank text, are skipped. Collection
 * stops once the number of kept ids reaches $limit.
 *
 * @return string[]
 */
private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
{
    $fingerprints = [];
    $picked = [];

    foreach ($chunkIds as $chunkId) {
        $rawText = $rows[$chunkId]['text'] ?? null;

        if ($rawText === null) {
            continue;
        }

        $text = trim((string)$rawText);

        if ($text === '') {
            continue;
        }

        $collapsed = (string)preg_replace('/\s+/u', ' ', $text);
        $fingerprint = md5(mb_strtolower($collapsed));

        if (isset($fingerprints[$fingerprint])) {
            continue;
        }

        $fingerprints[$fingerprint] = true;
        $picked[] = (string)$chunkId;

        if (count($picked) >= $limit) {
            break;
        }
    }

    return $picked;
}
|
|
|
|
/**
 * Chunk selection for non-list (sales-style) queries.
 *
 * Three strategies are tried in order; the first one that yields chunk
 * ids wins:
 *
 * - sales_product_dominant_document:
 *     the prompt points strongly at one product document; chunks are
 *     selected for that document only
 *
 * - sales_dominant_document:
 *     one document dominates the top of the ranking; its chunks are
 *     selected first and the remaining slots are then filled from the
 *     ranked list
 *
 * - sales_spread:
 *     default; delegates to the spread selection
 *
 * @return array{ids:array,mode:string}
 */
private function selectSalesChunkIds(string $prompt, array $chunkIds, array $rows, int $limit): array
{
    $focusedDocId = $this->resolveFocusedSalesDocumentId($prompt, $chunkIds, $rows);
    $focusedIds = $focusedDocId === null
        ? []
        : $this->selectFocusedProductChunkIds($focusedDocId, $chunkIds, $rows, $limit);

    if ($focusedIds !== []) {
        return [
            'ids' => $focusedIds,
            'mode' => 'sales_product_dominant_document',
        ];
    }

    $dominantDocId = $this->detectDominantTopDocument($chunkIds, $rows);
    $dominantIds = $dominantDocId === null
        ? []
        : $this->selectDominantDocumentChunkIds($dominantDocId, $chunkIds, $rows, $limit);

    if ($dominantIds !== []) {
        return [
            'ids' => $this->fillRemainingSalesChunkIds($dominantIds, $chunkIds, $rows, $limit),
            'mode' => 'sales_dominant_document',
        ];
    }

    return [
        'ids' => $this->selectSalesChunkIdsSpread($chunkIds, $rows, $limit),
        'mode' => 'sales_spread',
    ];
}
|
|
|
|
|
|
/**
 * Finds the one product document the prompt is clearly about, if any.
 *
 * This runs before the normal sales spreading and guards against a
 * classic false positive: neighbouring products, indicators or safety
 * sheets outranking the device that was actually requested.
 *
 * Procedure:
 * - build a product profile from the prompt; without anchors there is
 *   no focus
 * - inspect the top of the ranking (configured window size) and score
 *   each document once, using the first of its chunks that has a title
 * - order candidates by score, highest first (equal scores: by document
 *   id)
 * - accept the best candidate only when its score reaches the
 *   configured minimum and its lead over the runner-up reaches the
 *   configured minimum gap; a single candidate has an infinite lead
 *
 * @return string|null document id of the focused document, or null
 */
private function resolveFocusedSalesDocumentId(string $prompt, array $chunkIds, array $rows): ?string
{
    $promptProfile = $this->buildPromptProductProfile($prompt);

    if ($promptProfile['anchors'] === []) {
        return null;
    }

    $window = array_slice($chunkIds, 0, $this->retrieverConfig->focusedProductWindow());
    $scoredDocs = [];
    $candidates = [];

    foreach ($window as $rank => $chunkId) {
        $row = $rows[$chunkId] ?? null;
        if (!is_array($row)) {
            continue;
        }

        $documentId = $row['document_id'] ?? null;
        $isUsableId = is_string($documentId) && $documentId !== '';
        if (!$isUsableId || isset($scoredDocs[$documentId])) {
            continue;
        }

        // A chunk without title does not mark its document as scored,
        // so a later chunk of the same document may still qualify.
        $title = $this->extractDocumentTitle($row);
        if ($title === '') {
            continue;
        }

        $scoredDocs[$documentId] = true;
        $candidates[] = [
            'document_id' => $documentId,
            'score' => $this->scoreFocusedProductCandidate($promptProfile, $title, $row, $rank),
        ];
    }

    if ($candidates === []) {
        return null;
    }

    usort(
        $candidates,
        static fn(array $left, array $right): int => $left['score'] === $right['score']
            ? strcmp((string)$left['document_id'], (string)$right['document_id'])
            : $right['score'] <=> $left['score']
    );

    $winner = $candidates[0];
    $winnerScore = (float)$winner['score'];
    $runnerUpScore = (float)($candidates[1]['score'] ?? -INF);

    $tooWeak = $winnerScore < $this->retrieverConfig->focusedProductMinScore()
        || ($winnerScore - $runnerUpScore) < $this->retrieverConfig->focusedProductMinGap();

    return $tooWeak ? null : $winner['document_id'];
}
|
|
|
|
/**
 * Derives the token profile of a prompt that drives focused product scoring.
 *
 * Token classification (generic product tokens are ignored entirely):
 * - tokens containing a digit count as anchor, family token and number token
 * - configured short model codes count as anchor and family token
 * - any other token of at least three characters counts as anchor, and also
 *   as family token when isFamilyDescriptorToken() accepts it
 *
 * asks_device is true when a device word occurs, or when none of the
 * reagent / document / safety word lists matched.
 *
 * @return array{
 *   normalized:string,
 *   anchors:string[],
 *   family_tokens:string[],
 *   number_tokens:string[],
 *   asks_reagent:bool,
 *   asks_document:bool,
 *   asks_safety:bool,
 *   asks_device:bool
 * }
 */
private function buildPromptProductProfile(string $prompt): array
{
    $normalizedPrompt = $this->normalizeText($prompt);
    $promptTokens = $this->tokenizeText($normalizedPrompt);

    $wantsReagent = $this->containsAnyToken($promptTokens, $this->retrieverConfig->looksLikeReagentWords());
    $wantsDocument = $this->containsAnyToken($promptTokens, $this->retrieverConfig->looksLikeDocumentWords());
    $wantsSafety = $this->containsAnyToken($promptTokens, $this->retrieverConfig->looksLikeSafetyWords());
    $mentionsDevice = $this->containsAnyToken($promptTokens, $this->retrieverConfig->looksLikeDeviceWords());

    // Device intent is also the default when no other intent was recognised.
    $wantsDevice = $mentionsDevice || !($wantsReagent || $wantsDocument || $wantsSafety);

    $anchorList = [];
    $familyList = [];
    $numberList = [];

    foreach ($promptTokens as $word) {
        if ($this->isGenericProductToken($word)) {
            continue;
        }

        $hasDigit = preg_match('/\d/u', $word) === 1;

        if ($hasDigit || $this->isImportantShortModelToken($word)) {
            $anchorList[] = $word;
            $familyList[] = $word;

            if ($hasDigit) {
                $numberList[] = $word;
            }

            continue;
        }

        // Remaining short tokens carry too little signal to act as anchors.
        if (mb_strlen($word, 'UTF-8') < 3) {
            continue;
        }

        $anchorList[] = $word;

        if ($this->isFamilyDescriptorToken($word)) {
            $familyList[] = $word;
        }
    }

    $distinct = static fn (array $list): array => array_values(array_unique($list));

    return [
        'normalized' => $normalizedPrompt,
        'anchors' => $distinct($anchorList),
        'family_tokens' => $distinct($familyList),
        'number_tokens' => $distinct($numberList),
        'asks_reagent' => $wantsReagent,
        'asks_document' => $wantsDocument,
        'asks_safety' => $wantsSafety,
        'asks_device' => $wantsDevice,
    ];
}
|
|
|
|
/**
 * Scores how well one candidate document matches the prompt profile.
 *
 * Score components, applied in this order:
 * - rank bonus: 5 minus the rank, never below 0
 * - +24 when the whole normalized title occurs in the prompt on token boundaries
 * - per anchor: +4 / +3.5 for a title token hit (short model code / other),
 *   +3 for a boundary substring hit, otherwise -3.5 / -2
 *   (family descriptor / other)
 * - per number token: +4 when present in the title, else -5
 * - per family token: +4 when present in the title, else -4.5
 * - device prompts: -12 for reagent/accessory documents, -8 for safety documents
 * - reagent prompts: +6 for reagent/accessory documents
 * - document or safety prompts: +4 for safety documents
 * - -10 when no anchor matched at all
 *
 * Because number and short-model tokens are members of several profile lists,
 * they are deliberately weighted more than once.
 *
 * @param array  $promptProfile profile as built by buildPromptProductProfile()
 * @param string $title         document title of the candidate
 * @param array  $row           chunk row of the candidate; its 'text' is inspected
 * @param int    $rank          position of the chunk in the ranking window (0 = best)
 */
private function scoreFocusedProductCandidate(array $promptProfile, string $title, array $row, int $rank): float
{
    $normalizedTitle = $this->normalizeText($title);
    $titleLookup = array_fill_keys($this->tokenizeText($normalizedTitle), true);
    $normalizedBody = $this->normalizeText((string) ($row['text'] ?? ''));

    // Space padding turns substring checks into whole-token checks.
    $paddedTitle = ' ' . $normalizedTitle . ' ';

    $score = max(0.0, 5.0 - $rank);

    if ($normalizedTitle !== '' && str_contains(' ' . $promptProfile['normalized'] . ' ', $paddedTitle)) {
        $score += 24.0;
    }

    $anchorHits = 0;

    foreach ($promptProfile['anchors'] as $anchor) {
        if (isset($titleLookup[$anchor])) {
            $anchorHits++;
            $score += $this->isImportantShortModelToken($anchor) ? 4.0 : 3.5;
        } elseif (str_contains($paddedTitle, ' ' . $anchor . ' ')) {
            $anchorHits++;
            $score += 3.0;
        } else {
            $score -= $this->isFamilyDescriptorToken($anchor) ? 3.5 : 2.0;
        }
    }

    foreach ($promptProfile['number_tokens'] as $numberToken) {
        if (isset($titleLookup[$numberToken])) {
            $score += 4.0;
        } else {
            $score -= 5.0;
        }
    }

    foreach ($promptProfile['family_tokens'] as $familyToken) {
        if (isset($titleLookup[$familyToken])) {
            $score += 4.0;
        } else {
            $score -= 4.5;
        }
    }

    $asksDevice = (bool) $promptProfile['asks_device'];
    $asksReagent = (bool) $promptProfile['asks_reagent'];
    $asksPaperwork = $promptProfile['asks_document'] || $promptProfile['asks_safety'];

    // Each classifier is evaluated at most once, and only when a rule needs it.
    $isReagentDoc = ($asksDevice || $asksReagent)
        && $this->looksLikeReagentOrAccessoryDocument($row, $normalizedTitle, $normalizedBody);
    $isSafetyDoc = ($asksDevice || $asksPaperwork)
        && $this->looksLikeSafetyDocument($row, $normalizedTitle, $normalizedBody);

    if ($asksDevice && $isReagentDoc) {
        $score -= 12.0;
    }

    if ($asksDevice && $isSafetyDoc) {
        $score -= 8.0;
    }

    if ($asksReagent && $isReagentDoc) {
        $score += 6.0;
    }

    if ($asksPaperwork && $isSafetyDoc) {
        $score += 4.0;
    }

    if ($anchorHits === 0) {
        $score -= 10.0;
    }

    return $score;
}
|
|
|
|
/**
 * Returns chunks of the focused product document only.
 *
 * This strict mode deliberately leaves unused slots empty instead of filling
 * them with neighbouring products, because that would reintroduce the very
 * mix-up the focused mode exists to prevent.
 *
 * The limit is capped by the configured focused-product maximum and then
 * delegated to selectDominantDocumentChunkIds(), which applies its own
 * dominant-document maximum on top.
 *
 * @param string $documentId focused document id
 * @param array  $chunkIds   chunk ids in ranking order (best first)
 * @param array  $rows       chunk rows keyed by chunk id
 * @param int    $limit      maximum number of chunk ids requested by the caller
 *
 * @return string[]
 */
private function selectFocusedProductChunkIds(
    string $documentId,
    array $chunkIds,
    array $rows,
    int $limit
): array
{
    $focusedLimit = min($limit, $this->retrieverConfig->focusedProductMaxChunks());

    return $this->selectDominantDocumentChunkIds($documentId, $chunkIds, $rows, $focusedLimit);
}
|
|
|
|
/**
 * Detects whether one document clearly dominates the first ranked window.
 *
 * Only chunks with non-empty text and a non-empty string document id are
 * counted. A document is dominant when it either
 * - reaches the configured minimum number of hits inside the window, or
 * - owns at least two hits including both of the first two counted chunks.
 *
 * This is especially useful for product-sheet style documents where several
 * adjacent chunks belong together and should be passed to the model as one
 * coherent factual block.
 *
 * @param array $chunkIds chunk ids in ranking order (best first)
 * @param array $rows     chunk rows keyed by chunk id
 *
 * @return string|null id of the dominant document, or null when none dominates
 */
private function detectDominantTopDocument(array $chunkIds, array $rows): ?string
{
    $docWindow = [];

    foreach (array_slice($chunkIds, 0, $this->retrieverConfig->dominantDocWindow()) as $chunkId) {
        if (!isset($rows[$chunkId]['text'])) {
            continue;
        }

        $docId = $rows[$chunkId]['document_id'] ?? null;

        if (!is_string($docId) || $docId === '' || trim((string) $rows[$chunkId]['text']) === '') {
            continue;
        }

        $docWindow[] = $docId;
    }

    // Dominance needs at least two usable chunks to compare.
    if (count($docWindow) < 2) {
        return null;
    }

    $counts = array_count_values($docWindow);
    // arsort() is stable, so on equal counts the document seen first stays first.
    arsort($counts);

    $topKey = array_key_first($counts);

    if ($topKey === null) {
        return null;
    }

    // PHP stores integer-like string keys ("42") as int keys. Casting back
    // restores the original id string; without it a numeric document id would
    // never be reported as dominant and the comparisons below would fail.
    $dominantDocId = (string) $topKey;
    $dominantCount = $counts[$topKey];

    if ($dominantCount >= $this->retrieverConfig->dominantDocMinHits()) {
        return $dominantDocId;
    }

    // Weaker rule: the two best counted chunks both belong to the top document.
    if ($dominantCount >= 2 && $docWindow[0] === $dominantDocId && $docWindow[1] === $dominantDocId) {
        return $dominantDocId;
    }

    return null;
}
|
|
|
|
/**
 * Picks a coherent group of chunks from one document.
 *
 * Procedure:
 * - collect every ranked chunk of the document that has non-empty text
 * - the first collected chunk with an integer chunk index becomes the anchor
 * - prefer chunks whose index is closest to the anchor (ties: better rank);
 *   chunks without an index come last; without any anchor, rank decides
 * - keep at most min($limit, configured dominant-document maximum) chunks
 * - return the kept ids ordered by chunk index so the text reads in document
 *   order (chunks without an index last, ties by rank)
 *
 * @param string $documentId document whose chunks are selected
 * @param array  $chunkIds   chunk ids in ranking order (best first)
 * @param array  $rows       chunk rows keyed by chunk id
 * @param int    $limit      maximum number of chunk ids requested by the caller
 *
 * @return string[]
 */
private function selectDominantDocumentChunkIds(
    string $documentId,
    array $chunkIds,
    array $rows,
    int $limit
): array
{
    $hits = [];
    $anchorIndex = null;

    foreach ($chunkIds as $rank => $chunkId) {
        if (!isset($rows[$chunkId]['text'])) {
            continue;
        }

        $belongsToDocument = ($rows[$chunkId]['document_id'] ?? null) === $documentId;

        if (trim((string) $rows[$chunkId]['text']) === '' || !$belongsToDocument) {
            continue;
        }

        $rawIndex = $rows[$chunkId]['chunk_index'] ?? null;
        $index = is_int($rawIndex) ? $rawIndex : null;

        // Stays null until the first hit with a usable index shows up.
        $anchorIndex ??= $index;

        $hits[] = [
            'id' => (string) $chunkId,
            'rank' => $rank,
            'chunk_index' => $index,
        ];
    }

    if ($hits === []) {
        return [];
    }

    $maxFromDoc = min($limit, $this->retrieverConfig->dominantDocMaxChunks());

    // Without an anchor every hit gets the same distance, so rank alone decides.
    $distance = static fn (?int $index): int|float => ($anchorIndex === null || $index === null)
        ? PHP_INT_MAX
        : abs($index - $anchorIndex);

    usort(
        $hits,
        static fn (array $left, array $right): int =>
            ($distance($left['chunk_index']) <=> $distance($right['chunk_index']))
                ?: ($left['rank'] <=> $right['rank'])
    );

    $kept = array_slice($hits, 0, $maxFromDoc);

    // Restore document order for prompt coherence.
    usort($kept, static function (array $left, array $right): int {
        $leftIndex = $left['chunk_index'];
        $rightIndex = $right['chunk_index'];

        if ($leftIndex !== null && $rightIndex !== null && $leftIndex !== $rightIndex) {
            return $leftIndex <=> $rightIndex;
        }

        // Exactly one side lacks an index: that side sorts last.
        if (($leftIndex === null) !== ($rightIndex === null)) {
            return $leftIndex === null ? 1 : -1;
        }

        return $left['rank'] <=> $right['rank'];
    });

    return array_column($kept, 'id');
}
|
|
|
|
/**
 * Tops up an already chosen seed selection until the limit is reached.
 *
 * The seed chunk ids keep their position at the front of the result. Further
 * chunks are taken in ranking order, skipping chunks that
 * - are already selected, have no or only blank text, or lack a document id
 * - belong to a document that already holds the configured per-document maximum
 * - sit closer than the configured minimum chunk distance to a chunk already
 *   taken from the same document (seed chunks count as taken)
 *
 * @param array $seedChunkIds chunk ids that are kept unconditionally
 * @param array $chunkIds     chunk ids in ranking order (best first)
 * @param array $rows         chunk rows keyed by chunk id
 * @param int   $limit        maximum number of chunk ids to return
 *
 * @return string[]
 */
private function fillRemainingSalesChunkIds(
    array $seedChunkIds,
    array $chunkIds,
    array $rows,
    int $limit
): array
{
    $result = array_values(array_unique(array_map(strval(...), $seedChunkIds)));

    if (count($result) >= $limit) {
        return array_slice($result, 0, $limit);
    }

    $taken = array_fill_keys($result, true);
    $perDocCount = [];
    $perDocPositions = [];

    // Seed chunks occupy document slots and positions like any other pick.
    foreach ($result as $seedId) {
        $seedDocId = $rows[$seedId]['document_id'] ?? null;

        if (!is_string($seedDocId) || $seedDocId === '') {
            continue;
        }

        $perDocCount[$seedDocId] = ($perDocCount[$seedDocId] ?? 0) + 1;

        $seedIndex = $rows[$seedId]['chunk_index'] ?? null;
        if (is_int($seedIndex)) {
            $perDocPositions[$seedDocId][] = $seedIndex;
        }
    }

    $maxPerDoc = $this->retrieverConfig->maxChunksPerDoc();
    $minDistance = $this->retrieverConfig->minChunkDistance();

    foreach ($chunkIds as $candidateId) {
        if (isset($taken[$candidateId]) || !isset($rows[$candidateId]['text'])) {
            continue;
        }

        $candidateDocId = $rows[$candidateId]['document_id'] ?? null;

        if (!is_string($candidateDocId) || $candidateDocId === '') {
            continue;
        }

        if (trim((string) $rows[$candidateId]['text']) === '') {
            continue;
        }

        if (($perDocCount[$candidateDocId] ?? 0) >= $maxPerDoc) {
            continue;
        }

        $candidateIndex = $rows[$candidateId]['chunk_index'] ?? null;

        if (is_int($candidateIndex)) {
            foreach ($perDocPositions[$candidateDocId] ?? [] as $usedIndex) {
                if (abs($usedIndex - $candidateIndex) < $minDistance) {
                    continue 2;
                }
            }
        }

        $result[] = (string) $candidateId;
        $taken[$candidateId] = true;
        $perDocCount[$candidateDocId] = ($perDocCount[$candidateDocId] ?? 0) + 1;

        if (is_int($candidateIndex)) {
            $perDocPositions[$candidateDocId][] = $candidateIndex;
        }

        if (count($result) >= $limit) {
            break;
        }
    }

    return $result;
}
|
|
|
|
/**
 * Default spread selection for sales-oriented queries.
 *
 * Chunks are taken in ranking order while
 * - limiting how many chunks one document may contribute
 *   (configured per-document maximum)
 * - rejecting chunks that sit closer than the configured minimum chunk
 *   distance to a chunk already taken from the same document
 * - ignoring chunks without text or without a document id
 *
 * Bookkeeping (per-document counter and occupied positions) is updated only
 * for chunks that are actually selected. A chunk with blank text therefore
 * never blocks its neighbours.
 *
 * @param array $chunkIds chunk ids in ranking order (best first)
 * @param array $rows     chunk rows keyed by chunk id
 * @param int   $limit    maximum number of chunk ids to return; values <= 0
 *                        yield an empty result
 *
 * @return string[]
 */
private function selectSalesChunkIdsSpread(array $chunkIds, array $rows, int $limit): array
{
    // The limit is checked after each pick, so guard explicitly against
    // non-positive limits which would otherwise still yield one chunk.
    if ($limit <= 0) {
        return [];
    }

    $out = [];
    $docCounter = [];
    $docChunkPositions = [];

    foreach ($chunkIds as $chunkId) {
        if (!isset($rows[$chunkId]['text'])) {
            continue;
        }

        $docId = $rows[$chunkId]['document_id'] ?? null;

        if (!is_string($docId) || $docId === '') {
            continue;
        }

        // Reject blank chunks before they can reserve a position.
        if (trim((string) $rows[$chunkId]['text']) === '') {
            continue;
        }

        if (($docCounter[$docId] ?? 0) >= $this->retrieverConfig->maxChunksPerDoc()) {
            continue;
        }

        $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;

        if (is_int($chunkIndex)) {
            foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
                if (abs($prevIdx - $chunkIndex) < $this->retrieverConfig->minChunkDistance()) {
                    continue 2;
                }
            }
        }

        $out[] = (string) $chunkId;
        $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;

        if (is_int($chunkIndex)) {
            $docChunkPositions[$docId][] = $chunkIndex;
        }

        if (count($out) >= $limit) {
            break;
        }
    }

    return $out;
}
|
|
|
|
|
|
/**
 * Resolves the title of the document a chunk row belongs to.
 *
 * Lookup order:
 * 1. $row['metadata']['document_title'], when it is a non-blank string
 * 2. the first "# Produkt Titel: ..." heading found in $row['text']
 *    (case-insensitive, optional backticks around the title)
 *
 * @param array $row chunk row
 *
 * @return string trimmed title, or an empty string when none was found
 */
private function extractDocumentTitle(array $row): string
{
    $fromMetadata = $row['metadata']['document_title'] ?? null;

    if (is_string($fromMetadata)) {
        $trimmedTitle = trim($fromMetadata);

        if ($trimmedTitle !== '') {
            return $trimmedTitle;
        }
    }

    $body = (string) ($row['text'] ?? '');

    if ($body === '') {
        return '';
    }

    $found = preg_match('/^#\s*Produkt\s+Titel:\s*`?([^`\n]+)`?/imu', $body, $captures);

    return $found === 1 ? trim((string) ($captures[1] ?? '')) : '';
}
|
|
|
|
/**
 * Brings text into a canonical form for token-based product comparisons.
 *
 * The result is lower-case, contains only letters, digits and single spaces,
 * and has no leading or trailing whitespace. Hyphens, slashes, underscores
 * and any other punctuation act as token separators.
 */
private function normalizeText(string $value): string
{
    $lowered = mb_strtolower(trim($value), 'UTF-8');
    $separated = strtr($lowered, ['-' => ' ', '/' => ' ', '_' => ' ']);

    // preg_replace() yields null on failure; keep the previous stage in that case.
    $alphanumeric = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $separated) ?? $separated;
    $collapsed = preg_replace('/\s+/u', ' ', $alphanumeric) ?? $alphanumeric;

    return trim($collapsed);
}
|
|
|
|
/**
 * Splits already normalized text into its whitespace-separated tokens.
 *
 * @return string[] tokens in original order; empty when there is nothing to split
 */
private function tokenizeText(string $value): array
{
    if ($value === '') {
        return [];
    }

    $parts = preg_split('/\s+/u', $value, flags: PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on failure; treat that as "no tokens".
    return is_array($parts) ? $parts : [];
}
|
|
|
|
/**
 * Tells whether any of the given words occurs as an exact token.
 *
 * @param array $tokens  tokens to search in
 * @param array $needles words to look for
 *
 * @return bool true on the first exact match; false when either list is empty
 */
private function containsAnyToken(array $tokens, array $needles): bool
{
    // An empty token list produces an empty lookup, an empty needle list an
    // empty loop - both end in "false" without a dedicated guard.
    $present = array_fill_keys($tokens, true);
    $matched = false;

    foreach ($needles as $word) {
        if (isset($present[$word])) {
            $matched = true;
            break;
        }
    }

    return $matched;
}
|
|
|
|
/**
 * Tells whether a token is a configured generic product word.
 *
 * Generic product words must not drive product dominance decisions.
 *
 * This method runs once per prompt token, so it uses a direct strict lookup
 * instead of building a temporary hash map on every call. That matches the
 * other token predicates of this class.
 *
 * NOTE(review): assumes the configured generic tokens are strings - confirm
 * against NdjsonHybridRetrieverConfig.
 */
private function isGenericProductToken(string $token): bool
{
    return in_array($token, $this->retrieverConfig->genericProductTokens(), true);
}
|
|
|
|
/**
 * Tells whether a token is one of the configured short model codes.
 *
 * Short technical codes such as TH or TC would normally be too short to act
 * as anchors; tokens on this list are allowed regardless of their length.
 */
private function isImportantShortModelToken(string $token): bool
{
    return in_array($token, $this->retrieverConfig->importantShortModelTokens(), true);
}
|
|
|
|
/**
 * Tells whether a token is a strong product differentiator.
 *
 * A token qualifies when it is a configured family descriptor, a configured
 * short model code, or contains at least one digit.
 */
private function isFamilyDescriptorToken(string $token): bool
{
    if (in_array($token, $this->retrieverConfig->familyDescriptorTokens(), true)) {
        return true;
    }

    if ($this->isImportantShortModelToken($token)) {
        return true;
    }

    return preg_match('/\d/u', $token) === 1;
}
|
|
|
|
/**
 * Heuristic classifier for indicator, reagent, accessory and spare-part documents.
 *
 * Returns true as soon as one configured reagent marker occurs as a substring
 * of the combined normalized title and text. The match is a plain substring
 * match, not a whole-token match.
 *
 * @param array  $row             chunk row; currently not inspected
 * @param string $titleNormalized normalized document title
 * @param string $textNormalized  normalized chunk text
 */
private function looksLikeReagentOrAccessoryDocument(array $row, string $titleNormalized, string $textNormalized): bool
{
    $searchSpace = trim($titleNormalized . ' ' . $textNormalized);

    if ($searchSpace !== '') {
        foreach ($this->retrieverConfig->looksLikeReagentTokens() as $marker) {
            if (str_contains($searchSpace, $marker)) {
                return true;
            }
        }
    }

    return false;
}
|
|
|
|
/**
 * Heuristic classifier for safety-style documents.
 *
 * Returns true as soon as one configured safety marker occurs as a substring
 * of the combined normalized title and text. The match is a plain substring
 * match, not a whole-token match.
 *
 * @param array  $row             chunk row; currently not inspected
 * @param string $titleNormalized normalized document title
 * @param string $textNormalized  normalized chunk text
 */
private function looksLikeSafetyDocument(array $row, string $titleNormalized, string $textNormalized): bool
{
    $searchSpace = trim($titleNormalized . ' ' . $textNormalized);

    if ($searchSpace !== '') {
        foreach ($this->retrieverConfig->looksLikeSafetyDocs() as $marker) {
            if (str_contains($searchSpace, $marker)) {
                return true;
            }
        }
    }

    return false;
}
|
|
|
|
/**
 * Maps selected chunk ids to their trimmed texts.
 *
 * The order of $chunkIds is preserved. Ids without a row, without text, or
 * with blank text are dropped.
 *
 * @param array $chunkIds selected chunk ids
 * @param array $rows     chunk rows keyed by chunk id
 *
 * @return string[]
 */
private function collectTextsFromIds(array $chunkIds, array $rows): array
{
    $texts = [];

    foreach ($chunkIds as $chunkId) {
        $rawText = $rows[$chunkId]['text'] ?? null;

        if ($rawText === null) {
            continue;
        }

        $cleanText = trim((string) $rawText);

        if ($cleanText === '') {
            continue;
        }

        $texts[] = $cleanText;
    }

    return $texts;
}
|
|
} |