alpha new hybridretriever line

This commit is contained in:
team2
2026-02-26 13:51:54 +01:00
parent ec22f8bbbd
commit 052ff55eda
3 changed files with 129 additions and 41 deletions

View File

@@ -12,8 +12,8 @@ use App\Vector\VectorSearchClient;
final class NdjsonHybridRetriever implements RetrieverInterface
{
private const VECTOR_SCORE_THRESHOLD = 0.25;
private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10;
private const VECTOR_SCORE_THRESHOLD = 0.22;
private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 3;
private const HARD_MAX_CHUNKS = 200;
private const HARD_MAX_VECTORK = 200;
@@ -49,9 +49,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$isListQuery = $this->isListQuery($prompt);
// -------------------------------
// -------------------------------------------------
// 1) Tag Routing
// -------------------------------
// -------------------------------------------------
$candidateDocIds = $this->tagRouting->route($prompt);
$candidateSet = null;
@@ -59,19 +59,40 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$candidateSet = array_fill_keys($candidateDocIds, true);
}
// -------------------------------
// 2) Vector Search
// -------------------------------
// -------------------------------------------------
// 2) TopK bestimmen
// -------------------------------------------------
$topK = $vectorTopKBase;
if ($isListQuery) {
$topK = max($vectorTopKBase * 3, 80);
}
if ($candidateSet !== null) {
$topK = min(
max($vectorTopKBase * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $vectorTopKBase),
max($topK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $topK),
self::HARD_MAX_VECTORK
);
}
$hits = $this->vectorClient->search($prompt, $topK);
// -------------------------------------------------
// 3) Vector Search (Scoped wenn möglich)
// -------------------------------------------------
if ($candidateSet !== null) {
$hits = $this->vectorClient->searchScoped(
$prompt,
$topK,
array_keys($candidateSet)
);
// Wenn scoped nichts liefert → global fallback
if ($hits === []) {
$hits = $this->vectorClient->search($prompt, $vectorTopKBase);
}
} else {
$hits = $this->vectorClient->search($prompt, $topK);
}
if ($hits === []) {
return $candidateSet !== null
@@ -79,9 +100,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
: [];
}
// -------------------------------
// 3) Chunk-IDs + Lookup einmalig
// -------------------------------
// -------------------------------------------------
// 4) ChunkIds + Lookup
// -------------------------------------------------
$chunkIds = [];
foreach ($hits as $hit) {
@@ -104,9 +125,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$rows = $this->lookup->findByChunkIds($chunkIds);
// -------------------------------
// 4) Listen-Modus → Dokument-Ranking
// -------------------------------
// -------------------------------------------------
// 5) Listenmodus → Dokument-Ranking
// -------------------------------------------------
if ($isListQuery && $candidateSet !== null) {
$rankedDocIds = $this->rankDocumentsFromHits($hits, $rows, $candidateSet);
@@ -120,9 +141,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return $this->collectBestChunkPerDocument($topDocIds, $hits, $rows);
}
// -------------------------------
// 5) Normaler Chunk-Modus
// -------------------------------
// -------------------------------------------------
// 6) Normaler Chunk-Modus
// -------------------------------------------------
return $this->collectTexts($chunkIds, $rows, $limit);
}
@@ -153,18 +174,13 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$documentScores = [];
foreach ($hits as $hit) {
if (!isset($hit['chunk_id'], $hit['score'])) {
continue;
}
$chunkId = (string)$hit['chunk_id'];
$chunkId = (string)($hit['chunk_id'] ?? '');
if (!isset($rows[$chunkId])) {
continue;
}
$row = $rows[$chunkId];
$docId = $row['document_id'] ?? null;
$docId = $rows[$chunkId]['document_id'] ?? null;
if (!is_string($docId) || !isset($candidateSet[$docId])) {
continue;
@@ -203,25 +219,19 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$bestText = null;
foreach ($hits as $hit) {
if (!isset($hit['chunk_id'], $hit['score'])) {
continue;
}
$chunkId = (string)$hit['chunk_id'];
$chunkId = (string)($hit['chunk_id'] ?? '');
if (!isset($rows[$chunkId])) {
continue;
}
$row = $rows[$chunkId];
if (($row['document_id'] ?? null) !== $docId) {
if (($rows[$chunkId]['document_id'] ?? null) !== $docId) {
continue;
}
if ((float)$hit['score'] > $bestScore) {
$bestScore = (float)$hit['score'];
$bestText = $row['text'] ?? null;
$bestText = $rows[$chunkId]['text'] ?? null;
}
}