optimize document loader
This commit is contained in:
@@ -18,15 +18,22 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
{
|
{
|
||||||
private const VECTOR_SCORE_THRESHOLD = 0.72;
|
private const VECTOR_SCORE_THRESHOLD = 0.72;
|
||||||
|
|
||||||
|
// Guardrails
|
||||||
private const HARD_MAX_CHUNKS = 90;
|
private const HARD_MAX_CHUNKS = 90;
|
||||||
private const HARD_MAX_VECTORK = 250;
|
private const HARD_MAX_VECTORK = 250;
|
||||||
|
|
||||||
private const LIST_BONUS = 1.5;
|
private const LIST_BONUS = 1.25;
|
||||||
|
|
||||||
|
// Selection / Fusion
|
||||||
private const MAX_CHUNKS_PER_DOC = 2;
|
private const MAX_CHUNKS_PER_DOC = 2;
|
||||||
private const MIN_CHUNK_DISTANCE = 2;
|
private const MIN_CHUNK_DISTANCE = 2;
|
||||||
private const RRF_K = 60;
|
private const RRF_K = 60;
|
||||||
|
|
||||||
|
// Hardening (nur Edge-Cases; Standardverhalten bleibt gleich)
|
||||||
|
private const THRESHOLD_FLOOR = 0.65;
|
||||||
|
private const THRESHOLD_CEIL = 0.90;
|
||||||
|
private const EMPTY_RRF_FALLBACK_TOPN = 5;
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly NdjsonChunkLookup $lookup,
|
private readonly NdjsonChunkLookup $lookup,
|
||||||
private readonly VectorSearchClient $vectorClient,
|
private readonly VectorSearchClient $vectorClient,
|
||||||
@@ -66,10 +73,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
$entityLabel = $this->catalogIntent->detect($prompt);
|
$entityLabel = $this->catalogIntent->detect($prompt);
|
||||||
|
|
||||||
// 2) Intent (regelbasiert)
|
// 2) Intent (regelbasiert)
|
||||||
$intent = (string)($this->salesIntentLite->detect($prompt)['intent'] ?? SalesIntentLite::DISCOVERY);
|
$salesIntent = $this->detectSalesIntent($prompt);
|
||||||
|
|
||||||
// 3) Route bestimmen (Intent + Entity)
|
// 3) Route bestimmen (Intent + Entity)
|
||||||
$route = $this->routeResolver->resolve($intent, $entityLabel);
|
$route = $this->routeResolver->resolve($salesIntent, $entityLabel);
|
||||||
|
|
||||||
// 4) Early Exit nur für catalog_list
|
// 4) Early Exit nur für catalog_list
|
||||||
if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
|
if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
|
||||||
@@ -84,41 +91,30 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
// NORMALER CORE
|
// NORMALER CORE
|
||||||
// ------------------------------------------------------------
|
// ------------------------------------------------------------
|
||||||
|
|
||||||
$core = $this->runCore($prompt, $config, false);
|
$core = $this->runCore($prompt, $config, false, $salesIntent);
|
||||||
|
|
||||||
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
|
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
// ✅ ORIGINAL: Normal Mode -> Sales optimized selection
|
|
||||||
if (!$core['is_list_query']) {
|
if (!$core['is_list_query']) {
|
||||||
return $this->collectSalesOptimized(
|
$selectedIds = $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
|
||||||
$core['ranked_chunk_ids'],
|
return $this->collectTextsFromIds($selectedIds, $core['rows']);
|
||||||
$core['rows'],
|
|
||||||
$core['limit']
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ✅ ORIGINAL: List Mode -> simple collectTexts
|
$selectedIds = $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
|
||||||
return $this->collectTexts(
|
return $this->collectTextsFromIds($selectedIds, $core['rows']);
|
||||||
$core['ranked_chunk_ids'],
|
|
||||||
$core['rows'],
|
|
||||||
$core['limit']
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// =========================================================
|
// =========================================================
|
||||||
// DEBUG (unverändert, kein Early-Exit damit Debug immer Core zeigt)
|
// DEBUG (deterministisch: gleiche Intent-Bestimmung wie Prod)
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gibt genau DIE Treffer zurück, die auch in Produktion ausgewählt werden,
|
|
||||||
* plus Scores/Meta pro ausgewähltem Chunk.
|
|
||||||
*
|
|
||||||
* @return array<int, array{
|
* @return array<int, array{
|
||||||
* rank:int,
|
* rank:int,
|
||||||
* chunk_id:string,
|
* chunk_id:string,
|
||||||
* document_id: (string|null),
|
* document_id:(string|null),
|
||||||
* raw_score:(float|null),
|
* raw_score:(float|null),
|
||||||
* rrf_score:(float|null),
|
* rrf_score:(float|null),
|
||||||
* threshold:float,
|
* threshold:float,
|
||||||
@@ -135,15 +131,18 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
throw new \RuntimeException('No active ModelGenerationConfig found.');
|
throw new \RuntimeException('No active ModelGenerationConfig found.');
|
||||||
}
|
}
|
||||||
|
|
||||||
$core = $this->runCore($prompt, $config, true);
|
$salesIntent = $this->detectSalesIntent($prompt);
|
||||||
|
|
||||||
|
// Debug zeigt Core ohne Early Exit, aber mit identischem Intent-Input.
|
||||||
|
$core = $this->runCore($prompt, $config, true, $salesIntent);
|
||||||
|
|
||||||
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
|
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
$selectedChunkIds = $core['is_list_query']
|
$selectedChunkIds = $core['is_list_query']
|
||||||
? $this->selectChunkIdsListMode($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
|
? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
|
||||||
: $this->selectChunkIdsSalesMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
|
: $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
|
||||||
|
|
||||||
if ($selectedChunkIds === []) {
|
if ($selectedChunkIds === []) {
|
||||||
return [];
|
return [];
|
||||||
@@ -162,7 +161,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
|
|
||||||
$out[] = [
|
$out[] = [
|
||||||
'rank' => $rank,
|
'rank' => $rank,
|
||||||
'chunk_id' => $chunkId,
|
'chunk_id' => (string)$chunkId,
|
||||||
'document_id' => isset($core['rows'][$chunkId]['document_id']) ? (string)$core['rows'][$chunkId]['document_id'] : null,
|
'document_id' => isset($core['rows'][$chunkId]['document_id']) ? (string)$core['rows'][$chunkId]['document_id'] : null,
|
||||||
'raw_score' => isset($core['raw_scores'][$chunkId]) ? (float)$core['raw_scores'][$chunkId] : null,
|
'raw_score' => isset($core['raw_scores'][$chunkId]) ? (float)$core['raw_scores'][$chunkId] : null,
|
||||||
'rrf_score' => isset($core['rrf_scores'][$chunkId]) ? (float)$core['rrf_scores'][$chunkId] : null,
|
'rrf_score' => isset($core['rrf_scores'][$chunkId]) ? (float)$core['rrf_scores'][$chunkId] : null,
|
||||||
@@ -177,7 +176,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
}
|
}
|
||||||
|
|
||||||
// =========================================================
|
// =========================================================
|
||||||
// CORE PIPELINE (einmalig, shared)
|
// CORE PIPELINE
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -187,25 +186,109 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
* sales_intent:string,
|
* sales_intent:string,
|
||||||
* threshold:float,
|
* threshold:float,
|
||||||
* topk:int,
|
* topk:int,
|
||||||
* ranked_chunk_ids: string[],
|
* ranked_chunk_ids:string[],
|
||||||
* rows: array<string, array<string,mixed>>,
|
* rows:array<string, array<string,mixed>>,
|
||||||
* rrf_scores: array<string,float>,
|
* rrf_scores:array<string,float>,
|
||||||
* raw_scores: array<string,float>
|
* raw_scores:array<string,float>
|
||||||
* }
|
* }
|
||||||
*/
|
*/
|
||||||
private function runCore(string $prompt, ModelGenerationConfig $config, bool $withScores): array
|
private function runCore(
|
||||||
{
|
string $prompt,
|
||||||
|
ModelGenerationConfig $config,
|
||||||
|
bool $withScores,
|
||||||
|
string $salesIntent
|
||||||
|
): array {
|
||||||
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
|
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
|
||||||
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
|
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
|
||||||
|
|
||||||
$isListQuery = $this->intentLite->isListQuery($prompt);
|
$isListQuery = $this->intentLite->isListQuery($prompt);
|
||||||
$salesIntent = $this->salesIntentLite->detect($prompt)['intent'];
|
|
||||||
|
|
||||||
$cleanQuery = $this->queryCleaner->clean($prompt);
|
$cleanQuery = $this->queryCleaner->clean($prompt);
|
||||||
if ($cleanQuery === '') {
|
if ($cleanQuery === '') {
|
||||||
$cleanQuery = $prompt;
|
$cleanQuery = $prompt;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[$threshold, $topK] = $this->computeThresholdAndTopK($salesIntent, $isListQuery, $vectorTopKBase);
|
||||||
|
|
||||||
|
// Candidate Routing (keine Set-Map nötig; scoped nur wenn IDs existieren)
|
||||||
|
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
||||||
|
$candidateDocIds = is_array($candidateDocIds) ? array_values(array_unique(array_filter($candidateDocIds, 'is_string'))) : [];
|
||||||
|
|
||||||
|
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
|
||||||
|
|
||||||
|
$scopedHits = [];
|
||||||
|
if ($candidateDocIds !== []) {
|
||||||
|
$scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($globalHits === [] && $scopedHits === []) {
|
||||||
|
return [
|
||||||
|
'limit' => $limit,
|
||||||
|
'is_list_query' => $isListQuery,
|
||||||
|
'sales_intent' => $salesIntent,
|
||||||
|
'threshold' => $threshold,
|
||||||
|
'topk' => $topK,
|
||||||
|
'ranked_chunk_ids' => [],
|
||||||
|
'rows' => [],
|
||||||
|
'rrf_scores' => [],
|
||||||
|
'raw_scores' => [],
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
$fused = $this->fuseHits(
|
||||||
|
$globalHits,
|
||||||
|
$scopedHits,
|
||||||
|
$threshold,
|
||||||
|
$salesIntent === SalesIntentLite::OBJECTION,
|
||||||
|
$withScores
|
||||||
|
);
|
||||||
|
|
||||||
|
$rrfScores = $fused['rrf_scores'];
|
||||||
|
$rawScores = $fused['raw_scores'];
|
||||||
|
|
||||||
|
// 🛡 Hardening: wenn Threshold alles rausfiltert, aber globale Hits existieren,
|
||||||
|
// nehmen wir Top-N als minimalen Kontext. Greift nur in Edge-Cases.
|
||||||
|
if ($rrfScores === [] && $globalHits !== []) {
|
||||||
|
$rrfScores = $this->fallbackRrfFromHits($globalHits, self::EMPTY_RRF_FALLBACK_TOPN);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($rrfScores === []) {
|
||||||
|
return [
|
||||||
|
'limit' => $limit,
|
||||||
|
'is_list_query' => $isListQuery,
|
||||||
|
'sales_intent' => $salesIntent,
|
||||||
|
'threshold' => $threshold,
|
||||||
|
'topk' => $topK,
|
||||||
|
'ranked_chunk_ids' => [],
|
||||||
|
'rows' => [],
|
||||||
|
'rrf_scores' => [],
|
||||||
|
'raw_scores' => $rawScores,
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
arsort($rrfScores);
|
||||||
|
$rankedChunkIds = array_keys($rrfScores);
|
||||||
|
|
||||||
|
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
|
||||||
|
|
||||||
|
return [
|
||||||
|
'limit' => $limit,
|
||||||
|
'is_list_query' => $isListQuery,
|
||||||
|
'sales_intent' => $salesIntent,
|
||||||
|
'threshold' => $threshold,
|
||||||
|
'topk' => $topK,
|
||||||
|
'ranked_chunk_ids' => $rankedChunkIds,
|
||||||
|
'rows' => $rows,
|
||||||
|
'rrf_scores' => $rrfScores,
|
||||||
|
'raw_scores' => $rawScores,
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return array{0: float, 1: int} threshold, topK
|
||||||
|
*/
|
||||||
|
private function computeThresholdAndTopK(string $salesIntent, bool $isListQuery, int $vectorTopKBase): array
|
||||||
|
{
|
||||||
$threshold = self::VECTOR_SCORE_THRESHOLD;
|
$threshold = self::VECTOR_SCORE_THRESHOLD;
|
||||||
$topK = $vectorTopKBase;
|
$topK = $vectorTopKBase;
|
||||||
|
|
||||||
@@ -242,124 +325,106 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
|
|
||||||
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
|
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
|
||||||
|
|
||||||
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
// Enterprise clamp: verhindert Drift, ohne den aktuellen Normalfall zu ändern.
|
||||||
$candidateSet = null;
|
$threshold = max(self::THRESHOLD_FLOOR, min(self::THRESHOLD_CEIL, $threshold));
|
||||||
|
|
||||||
if (is_array($candidateDocIds) && $candidateDocIds !== []) {
|
return [$threshold, $topK];
|
||||||
$candidateSet = array_fill_keys($candidateDocIds, true);
|
}
|
||||||
}
|
|
||||||
|
|
||||||
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
|
|
||||||
|
|
||||||
$scopedHits = [];
|
|
||||||
if ($candidateSet !== null) {
|
|
||||||
$scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, array_keys($candidateSet));
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($globalHits === [] && $scopedHits === []) {
|
|
||||||
return [
|
|
||||||
'limit' => $limit,
|
|
||||||
'is_list_query' => $isListQuery,
|
|
||||||
'sales_intent' => (string)$salesIntent,
|
|
||||||
'threshold' => $threshold,
|
|
||||||
'topk' => $topK,
|
|
||||||
'ranked_chunk_ids' => [],
|
|
||||||
'rows' => [],
|
|
||||||
'rrf_scores' => [],
|
|
||||||
'raw_scores' => [],
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return array{
|
||||||
|
* rrf_scores: array<string,float>,
|
||||||
|
* raw_scores: array<string,float>
|
||||||
|
* }
|
||||||
|
*/
|
||||||
|
private function fuseHits(
|
||||||
|
array $globalHits,
|
||||||
|
array $scopedHits,
|
||||||
|
float $threshold,
|
||||||
|
bool $boostScoped,
|
||||||
|
bool $captureRaw
|
||||||
|
): array {
|
||||||
$rrfScores = [];
|
$rrfScores = [];
|
||||||
$rawScores = [];
|
$rawScores = [];
|
||||||
|
|
||||||
$this->applyRrfWithOptionalRaw($globalHits, $rrfScores, $rawScores, $threshold, false, $withScores);
|
$apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void {
|
||||||
$this->applyRrfWithOptionalRaw(
|
$rank = 0;
|
||||||
$scopedHits,
|
|
||||||
$rrfScores,
|
|
||||||
$rawScores,
|
|
||||||
$threshold,
|
|
||||||
$salesIntent === SalesIntentLite::OBJECTION,
|
|
||||||
$withScores
|
|
||||||
);
|
|
||||||
|
|
||||||
if ($rrfScores === []) {
|
foreach ($hits as $hit) {
|
||||||
return [
|
if (!isset($hit['chunk_id'], $hit['score'])) {
|
||||||
'limit' => $limit,
|
continue;
|
||||||
'is_list_query' => $isListQuery,
|
}
|
||||||
'sales_intent' => (string)$salesIntent,
|
|
||||||
'threshold' => $threshold,
|
|
||||||
'topk' => $topK,
|
|
||||||
'ranked_chunk_ids' => [],
|
|
||||||
'rows' => [],
|
|
||||||
'rrf_scores' => [],
|
|
||||||
'raw_scores' => $rawScores,
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
arsort($rrfScores);
|
$raw = (float)$hit['score'];
|
||||||
$rankedChunkIds = array_keys($rrfScores);
|
if ($raw < $threshold) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
|
$chunkId = (string)$hit['chunk_id'];
|
||||||
|
|
||||||
|
if ($captureRaw) {
|
||||||
|
$rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw);
|
||||||
|
}
|
||||||
|
|
||||||
|
$rank++;
|
||||||
|
$rrf = 1.0 / (self::RRF_K + $rank);
|
||||||
|
|
||||||
|
if ($boost) {
|
||||||
|
$rrf *= 1.2;
|
||||||
|
}
|
||||||
|
|
||||||
|
$rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
$apply($globalHits, false);
|
||||||
|
$apply($scopedHits, $boostScoped);
|
||||||
|
|
||||||
return [
|
return [
|
||||||
'limit' => $limit,
|
|
||||||
'is_list_query' => $isListQuery,
|
|
||||||
'sales_intent' => (string)$salesIntent,
|
|
||||||
'threshold' => $threshold,
|
|
||||||
'topk' => $topK,
|
|
||||||
'ranked_chunk_ids' => $rankedChunkIds,
|
|
||||||
'rows' => $rows,
|
|
||||||
'rrf_scores' => $rrfScores,
|
'rrf_scores' => $rrfScores,
|
||||||
'raw_scores' => $rawScores,
|
'raw_scores' => $rawScores,
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
private function applyRrfWithOptionalRaw(
|
/**
|
||||||
array $hits,
|
* Minimaler Fallback: baut RRF nur aus der Reihenfolge (ohne Threshold),
|
||||||
array &$rrfScores,
|
* damit Edge-Cases nicht leer laufen.
|
||||||
array &$rawScores,
|
*
|
||||||
float $threshold,
|
* @return array<string,float>
|
||||||
bool $boost = false,
|
*/
|
||||||
bool $captureRaw = false
|
private function fallbackRrfFromHits(array $hits, int $topN): array
|
||||||
): void
|
|
||||||
{
|
{
|
||||||
|
$rrf = [];
|
||||||
$rank = 0;
|
$rank = 0;
|
||||||
|
|
||||||
foreach ($hits as $hit) {
|
foreach ($hits as $hit) {
|
||||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
if (!isset($hit['chunk_id'])) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$raw = (float)$hit['score'];
|
|
||||||
if ($raw < $threshold) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$chunkId = (string)$hit['chunk_id'];
|
|
||||||
|
|
||||||
if ($captureRaw) {
|
|
||||||
if (!isset($rawScores[$chunkId]) || $raw > $rawScores[$chunkId]) {
|
|
||||||
$rawScores[$chunkId] = $raw;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$rank++;
|
$rank++;
|
||||||
$rrf = 1 / (self::RRF_K + $rank);
|
$chunkId = (string)$hit['chunk_id'];
|
||||||
|
$rrf[$chunkId] = 1.0 / (self::RRF_K + $rank);
|
||||||
|
|
||||||
if ($boost) {
|
if ($rank >= $topN) {
|
||||||
$rrf *= 1.2;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!isset($rrfScores[$chunkId])) {
|
|
||||||
$rrfScores[$chunkId] = 0.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
$rrfScores[$chunkId] += $rrf;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return $rrf;
|
||||||
}
|
}
|
||||||
|
|
||||||
private function selectChunkIdsListMode(array $chunkIds, array $rows, int $limit): array
|
private function detectSalesIntent(string $prompt): string
|
||||||
|
{
|
||||||
|
$data = $this->salesIntentLite->detect($prompt);
|
||||||
|
return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY);
|
||||||
|
}
|
||||||
|
|
||||||
|
// =========================================================
|
||||||
|
// SELECTION (shared)
|
||||||
|
// =========================================================
|
||||||
|
|
||||||
|
private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
|
||||||
{
|
{
|
||||||
$seen = [];
|
$seen = [];
|
||||||
$out = [];
|
$out = [];
|
||||||
@@ -374,7 +439,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
|
// Dedupe Key (billig & stabil)
|
||||||
|
$key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)));
|
||||||
|
|
||||||
if (isset($seen[$key])) {
|
if (isset($seen[$key])) {
|
||||||
continue;
|
continue;
|
||||||
@@ -391,7 +457,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return $out;
|
return $out;
|
||||||
}
|
}
|
||||||
|
|
||||||
private function selectChunkIdsSalesMode(array $chunkIds, array $rows, int $limit): array
|
private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
|
||||||
{
|
{
|
||||||
$out = [];
|
$out = [];
|
||||||
$docCounter = [];
|
$docCounter = [];
|
||||||
@@ -439,88 +505,12 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return $out;
|
return $out;
|
||||||
}
|
}
|
||||||
|
|
||||||
private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void
|
// =========================================================
|
||||||
|
// COLLECT (shared)
|
||||||
|
// =========================================================
|
||||||
|
|
||||||
|
private function collectTextsFromIds(array $chunkIds, array $rows): array
|
||||||
{
|
{
|
||||||
$rank = 0;
|
|
||||||
|
|
||||||
foreach ($hits as $hit) {
|
|
||||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$raw = (float)$hit['score'];
|
|
||||||
if ($raw < $threshold) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$chunkId = (string)$hit['chunk_id'];
|
|
||||||
|
|
||||||
$rank++;
|
|
||||||
$rrf = 1 / (self::RRF_K + $rank);
|
|
||||||
|
|
||||||
if ($boost) {
|
|
||||||
$rrf *= 1.2;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isset($rrfScores[$chunkId])) {
|
|
||||||
$rrfScores[$chunkId] = 0.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
$rrfScores[$chunkId] += $rrf;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private function collectSalesOptimized(array $chunkIds, array $rows, int $limit): array
|
|
||||||
{
|
|
||||||
$out = [];
|
|
||||||
$docCounter = [];
|
|
||||||
$docChunkPositions = [];
|
|
||||||
|
|
||||||
foreach ($chunkIds as $chunkId) {
|
|
||||||
if (!isset($rows[$chunkId]['text'])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
|
||||||
$chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
|
|
||||||
|
|
||||||
if (!is_string($docId)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (($docCounter[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (is_int($chunkIndex)) {
|
|
||||||
$prev = $docChunkPositions[$docId] ?? [];
|
|
||||||
foreach ($prev as $prevIdx) {
|
|
||||||
if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) {
|
|
||||||
continue 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
$docChunkPositions[$docId][] = $chunkIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
$text = trim((string)$rows[$chunkId]['text']);
|
|
||||||
if ($text === '') {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$out[] = $text;
|
|
||||||
$docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
|
|
||||||
|
|
||||||
if (\count($out) >= $limit) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return $out;
|
|
||||||
}
|
|
||||||
|
|
||||||
private function collectTexts(array $chunkIds, array $rows, int $limit): array
|
|
||||||
{
|
|
||||||
$seen = [];
|
|
||||||
$out = [];
|
$out = [];
|
||||||
|
|
||||||
foreach ($chunkIds as $id) {
|
foreach ($chunkIds as $id) {
|
||||||
@@ -528,23 +518,12 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$chunk = trim((string)$rows[$id]['text']);
|
$text = trim((string)$rows[$id]['text']);
|
||||||
if ($chunk === '') {
|
if ($text === '') {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
|
$out[] = $text;
|
||||||
|
|
||||||
if (isset($seen[$key])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$seen[$key] = true;
|
|
||||||
$out[] = $chunk;
|
|
||||||
|
|
||||||
if (\count($out) >= $limit) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return $out;
|
return $out;
|
||||||
|
|||||||
Reference in New Issue
Block a user