optimize tag logic boost
This commit is contained in:
@@ -14,11 +14,16 @@ use App\Vector\VectorSearchClient;
|
||||
final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
private const VECTOR_SCORE_THRESHOLD = 0.4;
|
||||
private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 3;
|
||||
|
||||
private const HARD_MAX_CHUNKS = 200;
|
||||
private const HARD_MAX_VECTORK = 200;
|
||||
|
||||
/**
|
||||
* Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter).
|
||||
* Enterprise Default: klein halten, sonst dominieren Tags wieder.
|
||||
*/
|
||||
private const TAG_SCORE_BONUS = 0.08;
|
||||
|
||||
public function __construct(
|
||||
private readonly ChunkManager $chunkManager,
|
||||
private readonly NdjsonChunkLookup $lookup,
|
||||
@@ -61,7 +66,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// 1) Tag Routing (bereinigte Query)
|
||||
// 1) Tag Routing (bereinigte Query) -> NUR Bonus
|
||||
// -------------------------------------------------
|
||||
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
||||
$candidateSet = null;
|
||||
@@ -75,87 +80,115 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
// -------------------------------------------------
|
||||
$topK = $vectorTopKBase;
|
||||
|
||||
// List mode: höhere Abdeckung, um mehr Dokumente zu ranken
|
||||
if ($isListQuery) {
|
||||
$topK = max($vectorTopKBase * 3, 80);
|
||||
}
|
||||
|
||||
if ($candidateSet !== null) {
|
||||
$topK = min(
|
||||
max($topK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $topK),
|
||||
self::HARD_MAX_VECTORK
|
||||
);
|
||||
}
|
||||
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
|
||||
|
||||
// -------------------------------------------------
|
||||
// 3) Vector Search (bereinigte Query; scoped wenn möglich)
|
||||
// 3) Vector Search (immer GLOBAL; Tags sind KEIN Filter)
|
||||
// -------------------------------------------------
|
||||
if ($candidateSet !== null) {
|
||||
$hits = $this->vectorClient->searchScoped(
|
||||
$cleanQuery,
|
||||
$topK,
|
||||
array_keys($candidateSet)
|
||||
);
|
||||
|
||||
// Wenn scoped nichts liefert → global fallback
|
||||
if ($hits === []) {
|
||||
$hits = $this->vectorClient->search($cleanQuery, $vectorTopKBase);
|
||||
}
|
||||
|
||||
} else {
|
||||
$hits = $this->vectorClient->search($cleanQuery, $topK);
|
||||
}
|
||||
$hits = $this->vectorClient->search($cleanQuery, $topK);
|
||||
|
||||
if ($hits === []) {
|
||||
return $candidateSet !== null
|
||||
? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit)
|
||||
: [];
|
||||
// Tags dürfen NICHT als Fallback wirken (sonst wieder zu mächtig)
|
||||
return [];
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// 4) ChunkIds + Lookup
|
||||
// 4) ChunkIds + Scores sammeln (raw)
|
||||
// -------------------------------------------------
|
||||
$chunkIds = [];
|
||||
/** @var array<string,float> $rawScoreByChunkId */
|
||||
$rawScoreByChunkId = [];
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
|
||||
$raw = (float)$hit['score'];
|
||||
|
||||
// Threshold wird auf RAW Score angewendet (Qualitätsgate)
|
||||
if ($raw < self::VECTOR_SCORE_THRESHOLD) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$chunkIds[] = (string)$hit['chunk_id'];
|
||||
$chunkId = (string)$hit['chunk_id'];
|
||||
|
||||
// Falls mehrfach: den besten raw score behalten
|
||||
if (!isset($rawScoreByChunkId[$chunkId]) || $raw > $rawScoreByChunkId[$chunkId]) {
|
||||
$rawScoreByChunkId[$chunkId] = $raw;
|
||||
}
|
||||
}
|
||||
|
||||
if ($chunkIds === []) {
|
||||
return $candidateSet !== null
|
||||
? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit)
|
||||
: [];
|
||||
if ($rawScoreByChunkId === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
$rows = $this->lookup->findByChunkIds($chunkIds);
|
||||
// Lookup liefert docId + Text etc.
|
||||
$rows = $this->lookup->findByChunkIds(array_keys($rawScoreByChunkId));
|
||||
|
||||
// -------------------------------------------------
|
||||
// 5) Listenmodus → Dokument-Ranking
|
||||
// 5) Adjusted Score (Tag Bonus) + Ranking
|
||||
// -------------------------------------------------
|
||||
if ($isListQuery && $candidateSet !== null) {
|
||||
/** @var array<string,float> $adjScoreByChunkId */
|
||||
$adjScoreByChunkId = [];
|
||||
|
||||
$rankedDocIds = $this->rankDocumentsFromHits($hits, $rows, $candidateSet);
|
||||
foreach ($rawScoreByChunkId as $chunkId => $rawScore) {
|
||||
if (!isset($rows[$chunkId])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$adj = $rawScore;
|
||||
|
||||
if ($candidateSet !== null) {
|
||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||
if (is_string($docId) && isset($candidateSet[$docId])) {
|
||||
$adj += self::TAG_SCORE_BONUS;
|
||||
}
|
||||
}
|
||||
|
||||
$adjScoreByChunkId[$chunkId] = $adj;
|
||||
}
|
||||
|
||||
if ($adjScoreByChunkId === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
// Sort: adjusted desc, deterministic tie-break by chunkId
|
||||
uksort($adjScoreByChunkId, static function (string $a, string $b) use ($adjScoreByChunkId): int {
|
||||
$sa = $adjScoreByChunkId[$a];
|
||||
$sb = $adjScoreByChunkId[$b];
|
||||
|
||||
if ($sa === $sb) {
|
||||
return $a <=> $b;
|
||||
}
|
||||
return ($sb <=> $sa);
|
||||
});
|
||||
|
||||
$rankedChunkIds = array_keys($adjScoreByChunkId);
|
||||
|
||||
// -------------------------------------------------
|
||||
// 6) Listenmodus → Dokument-Ranking (mit Tag-Bonus in Scores)
|
||||
// -------------------------------------------------
|
||||
if ($isListQuery) {
|
||||
$rankedDocIds = $this->rankDocumentsFromAdjustedScores($adjScoreByChunkId, $rows);
|
||||
|
||||
if ($rankedDocIds === []) {
|
||||
return $this->fallbackChunksFromCandidateDocs($candidateSet, $limit);
|
||||
return [];
|
||||
}
|
||||
|
||||
$topDocIds = array_slice($rankedDocIds, 0, $limit);
|
||||
|
||||
return $this->collectBestChunkPerDocument($topDocIds, $hits, $rows);
|
||||
return $this->collectBestChunkPerDocumentAdjusted($topDocIds, $adjScoreByChunkId, $rows);
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// 6) Normaler Chunk-Modus
|
||||
// 7) Normaler Chunk-Modus (nach adjusted Ranking)
|
||||
// -------------------------------------------------
|
||||
return $this->collectTexts($chunkIds, $rows, $limit);
|
||||
return $this->collectTexts($rankedChunkIds, $rows, $limit);
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
@@ -174,30 +207,29 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// DOCUMENT RANKING
|
||||
// DOCUMENT RANKING (Adjusted Scores incl. Tag Bonus)
|
||||
// =========================================================
|
||||
|
||||
private function rankDocumentsFromHits(
|
||||
array $hits,
|
||||
array $rows,
|
||||
array $candidateSet
|
||||
): array {
|
||||
/**
|
||||
* @param array<string,float> $adjScoreByChunkId
|
||||
* @param array<string,array<string,mixed>> $rows
|
||||
* @return string[]
|
||||
*/
|
||||
private function rankDocumentsFromAdjustedScores(array $adjScoreByChunkId, array $rows): array
|
||||
{
|
||||
$documentScores = [];
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
$chunkId = (string)($hit['chunk_id'] ?? '');
|
||||
|
||||
foreach ($adjScoreByChunkId as $chunkId => $score) {
|
||||
if (!isset($rows[$chunkId])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||
|
||||
if (!is_string($docId) || !isset($candidateSet[$docId])) {
|
||||
if (!is_string($docId) || $docId === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$documentScores[$docId][] = (float)$hit['score'];
|
||||
$documentScores[$docId][] = (float)$score;
|
||||
}
|
||||
|
||||
if ($documentScores === []) {
|
||||
@@ -217,21 +249,21 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return array_keys($ranked);
|
||||
}
|
||||
|
||||
private function collectBestChunkPerDocument(
|
||||
array $docIds,
|
||||
array $hits,
|
||||
array $rows
|
||||
): array {
|
||||
/**
|
||||
* @param string[] $docIds
|
||||
* @param array<string,float> $adjScoreByChunkId
|
||||
* @param array<string,array<string,mixed>> $rows
|
||||
* @return string[]
|
||||
*/
|
||||
private function collectBestChunkPerDocumentAdjusted(array $docIds, array $adjScoreByChunkId, array $rows): array
|
||||
{
|
||||
$result = [];
|
||||
|
||||
foreach ($docIds as $docId) {
|
||||
|
||||
$bestScore = -INF;
|
||||
$bestText = null;
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
$chunkId = (string)($hit['chunk_id'] ?? '');
|
||||
|
||||
foreach ($adjScoreByChunkId as $chunkId => $score) {
|
||||
if (!isset($rows[$chunkId])) {
|
||||
continue;
|
||||
}
|
||||
@@ -240,8 +272,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((float)$hit['score'] > $bestScore) {
|
||||
$bestScore = (float)$hit['score'];
|
||||
if ((float)$score > $bestScore) {
|
||||
$bestScore = (float)$score;
|
||||
$bestText = $rows[$chunkId]['text'] ?? null;
|
||||
}
|
||||
}
|
||||
@@ -255,45 +287,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// FALLBACK + NORMAL MODE
|
||||
// NORMAL MODE
|
||||
// =========================================================
|
||||
|
||||
/**
 * Collects up to $limit distinct, non-empty chunk texts that belong to candidate documents.
 *
 * Streams every row from the chunk manager and keeps rows whose `document_id` is a string
 * with a set (non-null) entry in $candidateSet. Each text is trimmed; duplicates are detected
 * on a normalized form (whitespace runs collapsed to one space, then lower-cased), while the
 * returned value is the trimmed original text.
 *
 * @param array<string,mixed> $candidateSet Map keyed by document id; membership is tested with isset().
 * @param int                 $limit        Maximum number of texts to return; values <= 0 yield [].
 *
 * @return string[] Trimmed chunk texts in stream order, at most $limit entries.
 */
private function fallbackChunksFromCandidateDocs(array $candidateSet, int $limit): array
{
    // Nothing can be collected: skip the full stream scan. This also prevents the
    // "one chunk even though $limit <= 0" result, since the limit is checked after appending.
    if ($limit <= 0 || $candidateSet === []) {
        return [];
    }

    $seen = [];
    $out = [];

    foreach ($this->chunkManager->streamAll() as $row) {
        $docId = $row['document_id'] ?? null;

        if (!is_string($docId) || !isset($candidateSet[$docId])) {
            continue;
        }

        $text = $row['text'] ?? null;

        if (!is_string($text)) {
            continue;
        }

        // Trim BEFORE the emptiness check so whitespace-only texts are skipped
        // instead of being returned as empty chunks.
        $chunk = trim($text);

        if ($chunk === '') {
            continue;
        }

        // Normalized de-duplication key: collapse whitespace, ignore case.
        $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));

        if (isset($seen[$key])) {
            continue;
        }

        $seen[$key] = true;
        $out[] = $chunk;

        if (\count($out) >= $limit) {
            break;
        }
    }

    return $out;
}
|
||||
|
||||
private function collectTexts(array $chunkIds, array $rows, int $limit): array
|
||||
{
|
||||
$seen = [];
|
||||
@@ -304,7 +300,11 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
continue;
|
||||
}
|
||||
|
||||
$chunk = trim($rows[$id]['text']);
|
||||
$chunk = trim((string)$rows[$id]['text']);
|
||||
if ($chunk === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
|
||||
|
||||
if (isset($seen[$key])) {
|
||||
|
||||
@@ -12,7 +12,7 @@ use Symfony\Component\Uid\Uuid;
|
||||
final class TagRoutingService
|
||||
{
|
||||
private const DEFAULT_TOPK = 8;
|
||||
private const MIN_BEST_SCORE = 0.10;
|
||||
private const MIN_BEST_SCORE = 0.25;
|
||||
private const MAX_CANDIDATE_DOCS = 200;
|
||||
|
||||
public function __construct(
|
||||
|
||||
Reference in New Issue
Block a user