optimize tag logic boost

This commit is contained in:
team2
2026-02-26 19:45:58 +01:00
parent 8e9f20d41f
commit 12f2a48f88
3 changed files with 143 additions and 105 deletions

View File

@@ -14,11 +14,16 @@ use App\Vector\VectorSearchClient;
final class NdjsonHybridRetriever implements RetrieverInterface
{
private const VECTOR_SCORE_THRESHOLD = 0.4;
private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 3;
private const HARD_MAX_CHUNKS = 200;
private const HARD_MAX_VECTORK = 200;
/**
* Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter).
* Enterprise Default: klein halten, sonst dominieren Tags wieder.
*/
private const TAG_SCORE_BONUS = 0.08;
public function __construct(
private readonly ChunkManager $chunkManager,
private readonly NdjsonChunkLookup $lookup,
@@ -61,7 +66,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
}
// -------------------------------------------------
// 1) Tag Routing (bereinigte Query)
// 1) Tag Routing (bereinigte Query) -> NUR Bonus
// -------------------------------------------------
$candidateDocIds = $this->tagRouting->route($cleanQuery);
$candidateSet = null;
@@ -75,87 +80,115 @@ final class NdjsonHybridRetriever implements RetrieverInterface
// -------------------------------------------------
$topK = $vectorTopKBase;
// List mode: höhere Abdeckung, um mehr Dokumente zu ranken
if ($isListQuery) {
$topK = max($vectorTopKBase * 3, 80);
}
if ($candidateSet !== null) {
$topK = min(
max($topK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $topK),
self::HARD_MAX_VECTORK
);
}
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
// -------------------------------------------------
// 3) Vector Search (bereinigte Query; scoped wenn möglich)
// 3) Vector Search (immer GLOBAL; Tags sind KEIN Filter)
// -------------------------------------------------
if ($candidateSet !== null) {
$hits = $this->vectorClient->searchScoped(
$cleanQuery,
$topK,
array_keys($candidateSet)
);
// Wenn scoped nichts liefert → global fallback
if ($hits === []) {
$hits = $this->vectorClient->search($cleanQuery, $vectorTopKBase);
}
} else {
$hits = $this->vectorClient->search($cleanQuery, $topK);
}
$hits = $this->vectorClient->search($cleanQuery, $topK);
if ($hits === []) {
return $candidateSet !== null
? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit)
: [];
// Tags dürfen NICHT als Fallback wirken (sonst wieder zu mächtig)
return [];
}
// -------------------------------------------------
// 4) ChunkIds + Lookup
// 4) ChunkIds + Scores sammeln (raw)
// -------------------------------------------------
$chunkIds = [];
/** @var array<string,float> $rawScoreByChunkId */
$rawScoreByChunkId = [];
foreach ($hits as $hit) {
if (!isset($hit['chunk_id'], $hit['score'])) {
continue;
}
if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
$raw = (float)$hit['score'];
// Threshold wird auf RAW Score angewendet (Qualitätsgate)
if ($raw < self::VECTOR_SCORE_THRESHOLD) {
continue;
}
$chunkIds[] = (string)$hit['chunk_id'];
$chunkId = (string)$hit['chunk_id'];
// Falls mehrfach: den besten raw score behalten
if (!isset($rawScoreByChunkId[$chunkId]) || $raw > $rawScoreByChunkId[$chunkId]) {
$rawScoreByChunkId[$chunkId] = $raw;
}
}
if ($chunkIds === []) {
return $candidateSet !== null
? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit)
: [];
if ($rawScoreByChunkId === []) {
return [];
}
$rows = $this->lookup->findByChunkIds($chunkIds);
// Lookup liefert docId + Text etc.
$rows = $this->lookup->findByChunkIds(array_keys($rawScoreByChunkId));
// -------------------------------------------------
// 5) Listenmodus → Dokument-Ranking
// 5) Adjusted Score (Tag Bonus) + Ranking
// -------------------------------------------------
if ($isListQuery && $candidateSet !== null) {
/** @var array<string,float> $adjScoreByChunkId */
$adjScoreByChunkId = [];
$rankedDocIds = $this->rankDocumentsFromHits($hits, $rows, $candidateSet);
foreach ($rawScoreByChunkId as $chunkId => $rawScore) {
if (!isset($rows[$chunkId])) {
continue;
}
$adj = $rawScore;
if ($candidateSet !== null) {
$docId = $rows[$chunkId]['document_id'] ?? null;
if (is_string($docId) && isset($candidateSet[$docId])) {
$adj += self::TAG_SCORE_BONUS;
}
}
$adjScoreByChunkId[$chunkId] = $adj;
}
if ($adjScoreByChunkId === []) {
return [];
}
// Sort: adjusted desc, deterministic tie-break by chunkId
uksort($adjScoreByChunkId, static function (string $a, string $b) use ($adjScoreByChunkId): int {
$sa = $adjScoreByChunkId[$a];
$sb = $adjScoreByChunkId[$b];
if ($sa === $sb) {
return $a <=> $b;
}
return ($sb <=> $sa);
});
$rankedChunkIds = array_keys($adjScoreByChunkId);
// -------------------------------------------------
// 6) Listenmodus → Dokument-Ranking (mit Tag-Bonus in Scores)
// -------------------------------------------------
if ($isListQuery) {
$rankedDocIds = $this->rankDocumentsFromAdjustedScores($adjScoreByChunkId, $rows);
if ($rankedDocIds === []) {
return $this->fallbackChunksFromCandidateDocs($candidateSet, $limit);
return [];
}
$topDocIds = array_slice($rankedDocIds, 0, $limit);
return $this->collectBestChunkPerDocument($topDocIds, $hits, $rows);
return $this->collectBestChunkPerDocumentAdjusted($topDocIds, $adjScoreByChunkId, $rows);
}
// -------------------------------------------------
// 6) Normaler Chunk-Modus
// 7) Normaler Chunk-Modus (nach adjusted Ranking)
// -------------------------------------------------
return $this->collectTexts($chunkIds, $rows, $limit);
return $this->collectTexts($rankedChunkIds, $rows, $limit);
}
// =========================================================
@@ -174,30 +207,29 @@ final class NdjsonHybridRetriever implements RetrieverInterface
}
// =========================================================
// DOCUMENT RANKING
// DOCUMENT RANKING (Adjusted Scores incl. Tag Bonus)
// =========================================================
private function rankDocumentsFromHits(
array $hits,
array $rows,
array $candidateSet
): array {
/**
* @param array<string,float> $adjScoreByChunkId
* @param array<string,array<string,mixed>> $rows
* @return string[]
*/
private function rankDocumentsFromAdjustedScores(array $adjScoreByChunkId, array $rows): array
{
$documentScores = [];
foreach ($hits as $hit) {
$chunkId = (string)($hit['chunk_id'] ?? '');
foreach ($adjScoreByChunkId as $chunkId => $score) {
if (!isset($rows[$chunkId])) {
continue;
}
$docId = $rows[$chunkId]['document_id'] ?? null;
if (!is_string($docId) || !isset($candidateSet[$docId])) {
if (!is_string($docId) || $docId === '') {
continue;
}
$documentScores[$docId][] = (float)$hit['score'];
$documentScores[$docId][] = (float)$score;
}
if ($documentScores === []) {
@@ -217,21 +249,21 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return array_keys($ranked);
}
private function collectBestChunkPerDocument(
array $docIds,
array $hits,
array $rows
): array {
/**
* @param string[] $docIds
* @param array<string,float> $adjScoreByChunkId
* @param array<string,array<string,mixed>> $rows
* @return string[]
*/
private function collectBestChunkPerDocumentAdjusted(array $docIds, array $adjScoreByChunkId, array $rows): array
{
$result = [];
foreach ($docIds as $docId) {
$bestScore = -INF;
$bestText = null;
foreach ($hits as $hit) {
$chunkId = (string)($hit['chunk_id'] ?? '');
foreach ($adjScoreByChunkId as $chunkId => $score) {
if (!isset($rows[$chunkId])) {
continue;
}
@@ -240,8 +272,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface
continue;
}
if ((float)$hit['score'] > $bestScore) {
$bestScore = (float)$hit['score'];
if ((float)$score > $bestScore) {
$bestScore = (float)$score;
$bestText = $rows[$chunkId]['text'] ?? null;
}
}
@@ -255,45 +287,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
}
// =========================================================
// FALLBACK + NORMAL MODE
// NORMAL MODE
// =========================================================
/**
 * Collects up to $limit distinct, non-empty chunk texts that belong to the candidate documents.
 *
 * Iterates over every row yielded by the chunk manager's streamAll(), keeps rows whose
 * `document_id` is found in $candidateSet via isset(), and de-duplicates the texts after
 * lower-casing them and collapsing whitespace runs into a single space.
 *
 * @param array<string,mixed> $candidateSet Document ids as keys (looked up with isset()).
 * @param int                 $limit        Maximum number of texts to return; values <= 0 yield [].
 * @return string[] Trimmed chunk texts, in the order the stream produced them.
 */
private function fallbackChunksFromCandidateDocs(array $candidateSet, int $limit): array
{
    // Without this guard a non-positive limit still returned one chunk, because the
    // limit was only checked after the first append. An empty candidate set can never
    // match a row, so the full stream scan is skipped as well.
    if ($limit <= 0 || $candidateSet === []) {
        return [];
    }

    $seen = [];
    $out = [];

    foreach ($this->chunkManager->streamAll() as $row) {
        $docId = $row['document_id'] ?? null;
        if (!is_string($docId) || !isset($candidateSet[$docId])) {
            continue;
        }

        $text = $row['text'] ?? null;
        if (!is_string($text)) {
            continue;
        }

        // Trim BEFORE the emptiness check so whitespace-only texts are dropped
        // instead of being returned as empty chunks.
        $chunk = trim($text);
        if ($chunk === '') {
            continue;
        }

        // Normalised key: case-insensitive, whitespace-insensitive duplicate detection.
        // preg_replace() may return null on a PCRE error, hence the string cast.
        $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
        if (isset($seen[$key])) {
            continue;
        }

        $seen[$key] = true;
        $out[] = $chunk;

        if (\count($out) >= $limit) {
            break;
        }
    }

    return $out;
}
private function collectTexts(array $chunkIds, array $rows, int $limit): array
{
$seen = [];
@@ -304,7 +300,11 @@ final class NdjsonHybridRetriever implements RetrieverInterface
continue;
}
$chunk = trim($rows[$id]['text']);
$chunk = trim((string)$rows[$id]['text']);
if ($chunk === '') {
continue;
}
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
if (isset($seen[$key])) {