optimize document loader

This commit is contained in:
team2
2026-03-01 10:48:23 +01:00
parent c89d404300
commit eb9ab2ec48

View File

@@ -18,15 +18,22 @@ final class NdjsonHybridRetriever implements RetrieverInterface
{ {
private const VECTOR_SCORE_THRESHOLD = 0.72; private const VECTOR_SCORE_THRESHOLD = 0.72;
// Guardrails
private const HARD_MAX_CHUNKS = 90; private const HARD_MAX_CHUNKS = 90;
private const HARD_MAX_VECTORK = 250; private const HARD_MAX_VECTORK = 250;
private const LIST_BONUS = 1.5; private const LIST_BONUS = 1.25;
// Selection / Fusion
private const MAX_CHUNKS_PER_DOC = 2; private const MAX_CHUNKS_PER_DOC = 2;
private const MIN_CHUNK_DISTANCE = 2; private const MIN_CHUNK_DISTANCE = 2;
private const RRF_K = 60; private const RRF_K = 60;
// Hardening (nur Edge-Cases; Standardverhalten bleibt gleich)
private const THRESHOLD_FLOOR = 0.65;
private const THRESHOLD_CEIL = 0.90;
private const EMPTY_RRF_FALLBACK_TOPN = 5;
public function __construct( public function __construct(
private readonly NdjsonChunkLookup $lookup, private readonly NdjsonChunkLookup $lookup,
private readonly VectorSearchClient $vectorClient, private readonly VectorSearchClient $vectorClient,
@@ -66,10 +73,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$entityLabel = $this->catalogIntent->detect($prompt); $entityLabel = $this->catalogIntent->detect($prompt);
// 2) Intent (regelbasiert) // 2) Intent (regelbasiert)
$intent = (string)($this->salesIntentLite->detect($prompt)['intent'] ?? SalesIntentLite::DISCOVERY); $salesIntent = $this->detectSalesIntent($prompt);
// 3) Route bestimmen (Intent + Entity) // 3) Route bestimmen (Intent + Entity)
$route = $this->routeResolver->resolve($intent, $entityLabel); $route = $this->routeResolver->resolve($salesIntent, $entityLabel);
// 4) Early Exit nur für catalog_list // 4) Early Exit nur für catalog_list
if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) { if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
@@ -84,37 +91,26 @@ final class NdjsonHybridRetriever implements RetrieverInterface
// NORMALER CORE // NORMALER CORE
// ------------------------------------------------------------ // ------------------------------------------------------------
$core = $this->runCore($prompt, $config, false); $core = $this->runCore($prompt, $config, false, $salesIntent);
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) { if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
return []; return [];
} }
// ✅ ORIGINAL: Normal Mode -> Sales optimized selection
if (!$core['is_list_query']) { if (!$core['is_list_query']) {
return $this->collectSalesOptimized( $selectedIds = $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
$core['ranked_chunk_ids'], return $this->collectTextsFromIds($selectedIds, $core['rows']);
$core['rows'],
$core['limit']
);
} }
// ✅ ORIGINAL: List Mode -> simple collectTexts $selectedIds = $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
return $this->collectTexts( return $this->collectTextsFromIds($selectedIds, $core['rows']);
$core['ranked_chunk_ids'],
$core['rows'],
$core['limit']
);
} }
// ========================================================= // =========================================================
// DEBUG (unverändert, kein Early-Exit damit Debug immer Core zeigt) // DEBUG (deterministisch: gleiche Intent-Bestimmung wie Prod)
// ========================================================= // =========================================================
/** /**
* Gibt genau DIE Treffer zurück, die auch in Produktion ausgewählt werden,
* plus Scores/Meta pro ausgewähltem Chunk.
*
* @return array<int, array{ * @return array<int, array{
* rank:int, * rank:int,
* chunk_id:string, * chunk_id:string,
@@ -135,15 +131,18 @@ final class NdjsonHybridRetriever implements RetrieverInterface
throw new \RuntimeException('No active ModelGenerationConfig found.'); throw new \RuntimeException('No active ModelGenerationConfig found.');
} }
$core = $this->runCore($prompt, $config, true); $salesIntent = $this->detectSalesIntent($prompt);
// Debug zeigt Core ohne Early Exit, aber mit identischem Intent-Input.
$core = $this->runCore($prompt, $config, true, $salesIntent);
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) { if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
return []; return [];
} }
$selectedChunkIds = $core['is_list_query'] $selectedChunkIds = $core['is_list_query']
? $this->selectChunkIdsListMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']) ? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
: $this->selectChunkIdsSalesMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']); : $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
if ($selectedChunkIds === []) { if ($selectedChunkIds === []) {
return []; return [];
@@ -162,7 +161,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$out[] = [ $out[] = [
'rank' => $rank, 'rank' => $rank,
'chunk_id' => $chunkId, 'chunk_id' => (string)$chunkId,
'document_id' => isset($core['rows'][$chunkId]['document_id']) ? (string)$core['rows'][$chunkId]['document_id'] : null, 'document_id' => isset($core['rows'][$chunkId]['document_id']) ? (string)$core['rows'][$chunkId]['document_id'] : null,
'raw_score' => isset($core['raw_scores'][$chunkId]) ? (float)$core['raw_scores'][$chunkId] : null, 'raw_score' => isset($core['raw_scores'][$chunkId]) ? (float)$core['raw_scores'][$chunkId] : null,
'rrf_score' => isset($core['rrf_scores'][$chunkId]) ? (float)$core['rrf_scores'][$chunkId] : null, 'rrf_score' => isset($core['rrf_scores'][$chunkId]) ? (float)$core['rrf_scores'][$chunkId] : null,
@@ -177,7 +176,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
} }
// ========================================================= // =========================================================
// CORE PIPELINE (einmalig, shared) // CORE PIPELINE
// ========================================================= // =========================================================
/** /**
@@ -193,19 +192,103 @@ final class NdjsonHybridRetriever implements RetrieverInterface
* raw_scores:array<string,float> * raw_scores:array<string,float>
* } * }
*/ */
private function runCore(string $prompt, ModelGenerationConfig $config, bool $withScores): array private function runCore(
{ string $prompt,
ModelGenerationConfig $config,
bool $withScores,
string $salesIntent
): array {
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS)); $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
$isListQuery = $this->intentLite->isListQuery($prompt); $isListQuery = $this->intentLite->isListQuery($prompt);
$salesIntent = $this->salesIntentLite->detect($prompt)['intent'];
$cleanQuery = $this->queryCleaner->clean($prompt); $cleanQuery = $this->queryCleaner->clean($prompt);
if ($cleanQuery === '') { if ($cleanQuery === '') {
$cleanQuery = $prompt; $cleanQuery = $prompt;
} }
[$threshold, $topK] = $this->computeThresholdAndTopK($salesIntent, $isListQuery, $vectorTopKBase);
// Candidate Routing (keine Set-Map nötig; scoped nur wenn IDs existieren)
$candidateDocIds = $this->tagRouting->route($cleanQuery);
$candidateDocIds = is_array($candidateDocIds) ? array_values(array_unique(array_filter($candidateDocIds, 'is_string'))) : [];
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
$scopedHits = [];
if ($candidateDocIds !== []) {
$scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
}
if ($globalHits === [] && $scopedHits === []) {
return [
'limit' => $limit,
'is_list_query' => $isListQuery,
'sales_intent' => $salesIntent,
'threshold' => $threshold,
'topk' => $topK,
'ranked_chunk_ids' => [],
'rows' => [],
'rrf_scores' => [],
'raw_scores' => [],
];
}
$fused = $this->fuseHits(
$globalHits,
$scopedHits,
$threshold,
$salesIntent === SalesIntentLite::OBJECTION,
$withScores
);
$rrfScores = $fused['rrf_scores'];
$rawScores = $fused['raw_scores'];
// 🛡 Hardening: wenn Threshold alles rausfiltert, aber globale Hits existieren,
// nehmen wir Top-N als minimalen Kontext. Greift nur in Edge-Cases.
if ($rrfScores === [] && $globalHits !== []) {
$rrfScores = $this->fallbackRrfFromHits($globalHits, self::EMPTY_RRF_FALLBACK_TOPN);
}
if ($rrfScores === []) {
return [
'limit' => $limit,
'is_list_query' => $isListQuery,
'sales_intent' => $salesIntent,
'threshold' => $threshold,
'topk' => $topK,
'ranked_chunk_ids' => [],
'rows' => [],
'rrf_scores' => [],
'raw_scores' => $rawScores,
];
}
arsort($rrfScores);
$rankedChunkIds = array_keys($rrfScores);
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
return [
'limit' => $limit,
'is_list_query' => $isListQuery,
'sales_intent' => $salesIntent,
'threshold' => $threshold,
'topk' => $topK,
'ranked_chunk_ids' => $rankedChunkIds,
'rows' => $rows,
'rrf_scores' => $rrfScores,
'raw_scores' => $rawScores,
];
}
/**
* @return array{0: float, 1: int} threshold, topK
*/
private function computeThresholdAndTopK(string $salesIntent, bool $isListQuery, int $vectorTopKBase): array
{
$threshold = self::VECTOR_SCORE_THRESHOLD; $threshold = self::VECTOR_SCORE_THRESHOLD;
$topK = $vectorTopKBase; $topK = $vectorTopKBase;
@@ -242,88 +325,29 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$topK = max(1, min($topK, self::HARD_MAX_VECTORK)); $topK = max(1, min($topK, self::HARD_MAX_VECTORK));
$candidateDocIds = $this->tagRouting->route($cleanQuery); // Enterprise clamp: verhindert Drift, ohne den aktuellen Normalfall zu ändern.
$candidateSet = null; $threshold = max(self::THRESHOLD_FLOOR, min(self::THRESHOLD_CEIL, $threshold));
if (is_array($candidateDocIds) && $candidateDocIds !== []) { return [$threshold, $topK];
$candidateSet = array_fill_keys($candidateDocIds, true);
}
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
$scopedHits = [];
if ($candidateSet !== null) {
$scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, array_keys($candidateSet));
}
if ($globalHits === [] && $scopedHits === []) {
return [
'limit' => $limit,
'is_list_query' => $isListQuery,
'sales_intent' => (string)$salesIntent,
'threshold' => $threshold,
'topk' => $topK,
'ranked_chunk_ids' => [],
'rows' => [],
'rrf_scores' => [],
'raw_scores' => [],
];
} }
/**
* @return array{
* rrf_scores: array<string,float>,
* raw_scores: array<string,float>
* }
*/
private function fuseHits(
array $globalHits,
array $scopedHits,
float $threshold,
bool $boostScoped,
bool $captureRaw
): array {
$rrfScores = []; $rrfScores = [];
$rawScores = []; $rawScores = [];
$this->applyRrfWithOptionalRaw($globalHits, $rrfScores, $rawScores, $threshold, false, $withScores); $apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void {
$this->applyRrfWithOptionalRaw(
$scopedHits,
$rrfScores,
$rawScores,
$threshold,
$salesIntent === SalesIntentLite::OBJECTION,
$withScores
);
if ($rrfScores === []) {
return [
'limit' => $limit,
'is_list_query' => $isListQuery,
'sales_intent' => (string)$salesIntent,
'threshold' => $threshold,
'topk' => $topK,
'ranked_chunk_ids' => [],
'rows' => [],
'rrf_scores' => [],
'raw_scores' => $rawScores,
];
}
arsort($rrfScores);
$rankedChunkIds = array_keys($rrfScores);
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
return [
'limit' => $limit,
'is_list_query' => $isListQuery,
'sales_intent' => (string)$salesIntent,
'threshold' => $threshold,
'topk' => $topK,
'ranked_chunk_ids' => $rankedChunkIds,
'rows' => $rows,
'rrf_scores' => $rrfScores,
'raw_scores' => $rawScores,
];
}
private function applyRrfWithOptionalRaw(
array $hits,
array &$rrfScores,
array &$rawScores,
float $threshold,
bool $boost = false,
bool $captureRaw = false
): void
{
$rank = 0; $rank = 0;
foreach ($hits as $hit) { foreach ($hits as $hit) {
@@ -339,27 +363,68 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$chunkId = (string)$hit['chunk_id']; $chunkId = (string)$hit['chunk_id'];
if ($captureRaw) { if ($captureRaw) {
if (!isset($rawScores[$chunkId]) || $raw > $rawScores[$chunkId]) { $rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw);
$rawScores[$chunkId] = $raw;
}
} }
$rank++; $rank++;
$rrf = 1 / (self::RRF_K + $rank); $rrf = 1.0 / (self::RRF_K + $rank);
if ($boost) { if ($boost) {
$rrf *= 1.2; $rrf *= 1.2;
} }
if (!isset($rrfScores[$chunkId])) { $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
$rrfScores[$chunkId] = 0.0; }
};
$apply($globalHits, false);
$apply($scopedHits, $boostScoped);
return [
'rrf_scores' => $rrfScores,
'raw_scores' => $rawScores,
];
} }
$rrfScores[$chunkId] += $rrf; /**
* Minimaler Fallback: baut RRF nur aus der Reihenfolge (ohne Threshold),
* damit Edge-Cases nicht leer laufen.
*
* @return array<string,float>
*/
private function fallbackRrfFromHits(array $hits, int $topN): array
{
$rrf = [];
$rank = 0;
foreach ($hits as $hit) {
if (!isset($hit['chunk_id'])) {
continue;
}
$rank++;
$chunkId = (string)$hit['chunk_id'];
$rrf[$chunkId] = 1.0 / (self::RRF_K + $rank);
if ($rank >= $topN) {
break;
} }
} }
private function selectChunkIdsListMode(array $chunkIds, array $rows, int $limit): array return $rrf;
}
/**
 * Resolves the sales intent label for the given prompt.
 *
 * Runs the sales-intent detector on the prompt and returns the 'intent'
 * entry of its result, cast to string. When that entry is missing or null,
 * SalesIntentLite::DISCOVERY is used instead.
 *
 * @param string $prompt prompt text handed to the detector unchanged
 *
 * @return string the detected intent, or the DISCOVERY default
 */
private function detectSalesIntent(string $prompt): string
{
    $intent = $this->salesIntentLite->detect($prompt)['intent'] ?? SalesIntentLite::DISCOVERY;

    return (string)$intent;
}
// =========================================================
// SELECTION (shared)
// =========================================================
private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
{ {
$seen = []; $seen = [];
$out = []; $out = [];
@@ -374,7 +439,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface
continue; continue;
} }
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)); // Dedupe Key (billig & stabil)
$key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)));
if (isset($seen[$key])) { if (isset($seen[$key])) {
continue; continue;
@@ -391,7 +457,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return $out; return $out;
} }
private function selectChunkIdsSalesMode(array $chunkIds, array $rows, int $limit): array private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
{ {
$out = []; $out = [];
$docCounter = []; $docCounter = [];
@@ -439,88 +505,12 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return $out; return $out;
} }
private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void // =========================================================
// COLLECT (shared)
// =========================================================
private function collectTextsFromIds(array $chunkIds, array $rows): array
{ {
$rank = 0;
foreach ($hits as $hit) {
if (!isset($hit['chunk_id'], $hit['score'])) {
continue;
}
$raw = (float)$hit['score'];
if ($raw < $threshold) {
continue;
}
$chunkId = (string)$hit['chunk_id'];
$rank++;
$rrf = 1 / (self::RRF_K + $rank);
if ($boost) {
$rrf *= 1.2;
}
if (!isset($rrfScores[$chunkId])) {
$rrfScores[$chunkId] = 0.0;
}
$rrfScores[$chunkId] += $rrf;
}
}
/**
 * Collects chunk texts for the sales (non-list) mode.
 *
 * Walks $chunkIds in the given order and keeps a chunk only when all of the
 * following hold:
 *  - its row has a 'text' entry that is non-empty after trimming,
 *  - its row has a string 'document_id',
 *  - fewer than self::MAX_CHUNKS_PER_DOC chunks of that document were kept,
 *  - if 'chunk_index' is an int, it is at least self::MIN_CHUNK_DISTANCE away
 *    from every already kept chunk index of the same document.
 *
 * @param array $chunkIds chunk IDs in the order they should be considered
 * @param array $rows     rows keyed by chunk ID; reads 'text', 'document_id' and 'chunk_index'
 * @param int   $limit    maximum number of texts to return; values below 1 yield an empty list
 *
 * @return array list of trimmed, non-empty chunk texts, at most $limit entries
 */
private function collectSalesOptimized(array $chunkIds, array $rows, int $limit): array
{
    // Without this guard the loop would still emit one text before checking the limit.
    if ($limit < 1) {
        return [];
    }

    $out = [];
    $docCounter = [];
    $docChunkPositions = [];

    foreach ($chunkIds as $chunkId) {
        $row = $rows[$chunkId] ?? null;

        if (!isset($row['text'])) {
            continue;
        }

        // Validate the text BEFORE reserving a chunk position: a chunk that is
        // never emitted must not block its neighbours via MIN_CHUNK_DISTANCE.
        $text = trim((string)$row['text']);

        if ($text === '') {
            continue;
        }

        $docId = $row['document_id'] ?? null;

        if (!is_string($docId)) {
            continue;
        }

        if (($docCounter[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) {
            continue;
        }

        $chunkIndex = $row['chunk_index'] ?? null;

        if (is_int($chunkIndex)) {
            foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
                if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) {
                    // Too close to a chunk already kept for this document: skip this chunk.
                    continue 2;
                }
            }

            $docChunkPositions[$docId][] = $chunkIndex;
        }

        $out[] = $text;
        $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;

        if (\count($out) >= $limit) {
            break;
        }
    }

    return $out;
}
private function collectTexts(array $chunkIds, array $rows, int $limit): array
{
$seen = [];
$out = []; $out = [];
foreach ($chunkIds as $id) { foreach ($chunkIds as $id) {
@@ -528,23 +518,12 @@ final class NdjsonHybridRetriever implements RetrieverInterface
continue; continue;
} }
$chunk = trim((string)$rows[$id]['text']); $text = trim((string)$rows[$id]['text']);
if ($chunk === '') { if ($text === '') {
continue; continue;
} }
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)); $out[] = $text;
if (isset($seen[$key])) {
continue;
}
$seen[$key] = true;
$out[] = $chunk;
if (\count($out) >= $limit) {
break;
}
} }
return $out; return $out;