alpha new HybridRetriever line
This commit is contained in:
@@ -12,8 +12,8 @@ use App\Vector\VectorSearchClient;
|
||||
|
||||
final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
private const VECTOR_SCORE_THRESHOLD = 0.25;
|
||||
private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10;
|
||||
private const VECTOR_SCORE_THRESHOLD = 0.22;
|
||||
private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 3;
|
||||
|
||||
private const HARD_MAX_CHUNKS = 200;
|
||||
private const HARD_MAX_VECTORK = 200;
|
||||
@@ -49,9 +49,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$isListQuery = $this->isListQuery($prompt);
|
||||
|
||||
// -------------------------------
|
||||
// -------------------------------------------------
|
||||
// 1) Tag Routing
|
||||
// -------------------------------
|
||||
// -------------------------------------------------
|
||||
$candidateDocIds = $this->tagRouting->route($prompt);
|
||||
$candidateSet = null;
|
||||
|
||||
@@ -59,19 +59,40 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$candidateSet = array_fill_keys($candidateDocIds, true);
|
||||
}
|
||||
|
||||
// -------------------------------
|
||||
// 2) Vector Search
|
||||
// -------------------------------
|
||||
// -------------------------------------------------
|
||||
// 2) TopK bestimmen
|
||||
// -------------------------------------------------
|
||||
$topK = $vectorTopKBase;
|
||||
|
||||
if ($isListQuery) {
|
||||
$topK = max($vectorTopKBase * 3, 80);
|
||||
}
|
||||
|
||||
if ($candidateSet !== null) {
|
||||
$topK = min(
|
||||
max($vectorTopKBase * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $vectorTopKBase),
|
||||
max($topK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $topK),
|
||||
self::HARD_MAX_VECTORK
|
||||
);
|
||||
}
|
||||
|
||||
$hits = $this->vectorClient->search($prompt, $topK);
|
||||
// -------------------------------------------------
|
||||
// 3) Vector Search (Scoped wenn möglich)
|
||||
// -------------------------------------------------
|
||||
if ($candidateSet !== null) {
|
||||
$hits = $this->vectorClient->searchScoped(
|
||||
$prompt,
|
||||
$topK,
|
||||
array_keys($candidateSet)
|
||||
);
|
||||
|
||||
// Wenn scoped nichts liefert → global fallback
|
||||
if ($hits === []) {
|
||||
$hits = $this->vectorClient->search($prompt, $vectorTopKBase);
|
||||
}
|
||||
|
||||
} else {
|
||||
$hits = $this->vectorClient->search($prompt, $topK);
|
||||
}
|
||||
|
||||
if ($hits === []) {
|
||||
return $candidateSet !== null
|
||||
@@ -79,9 +100,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
: [];
|
||||
}
|
||||
|
||||
// -------------------------------
|
||||
// 3) Chunk-IDs + Lookup einmalig
|
||||
// -------------------------------
|
||||
// -------------------------------------------------
|
||||
// 4) ChunkIds + Lookup
|
||||
// -------------------------------------------------
|
||||
$chunkIds = [];
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
@@ -104,9 +125,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$rows = $this->lookup->findByChunkIds($chunkIds);
|
||||
|
||||
// -------------------------------
|
||||
// 4) Listen-Modus → Dokument-Ranking
|
||||
// -------------------------------
|
||||
// -------------------------------------------------
|
||||
// 5) Listenmodus → Dokument-Ranking
|
||||
// -------------------------------------------------
|
||||
if ($isListQuery && $candidateSet !== null) {
|
||||
|
||||
$rankedDocIds = $this->rankDocumentsFromHits($hits, $rows, $candidateSet);
|
||||
@@ -120,9 +141,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $this->collectBestChunkPerDocument($topDocIds, $hits, $rows);
|
||||
}
|
||||
|
||||
// -------------------------------
|
||||
// 5) Normaler Chunk-Modus
|
||||
// -------------------------------
|
||||
// -------------------------------------------------
|
||||
// 6) Normaler Chunk-Modus
|
||||
// -------------------------------------------------
|
||||
return $this->collectTexts($chunkIds, $rows, $limit);
|
||||
}
|
||||
|
||||
@@ -153,18 +174,13 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$documentScores = [];
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$chunkId = (string)$hit['chunk_id'];
|
||||
$chunkId = (string)($hit['chunk_id'] ?? '');
|
||||
|
||||
if (!isset($rows[$chunkId])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$row = $rows[$chunkId];
|
||||
$docId = $row['document_id'] ?? null;
|
||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||
|
||||
if (!is_string($docId) || !isset($candidateSet[$docId])) {
|
||||
continue;
|
||||
@@ -203,25 +219,19 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$bestText = null;
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$chunkId = (string)$hit['chunk_id'];
|
||||
$chunkId = (string)($hit['chunk_id'] ?? '');
|
||||
|
||||
if (!isset($rows[$chunkId])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$row = $rows[$chunkId];
|
||||
|
||||
if (($row['document_id'] ?? null) !== $docId) {
|
||||
if (($rows[$chunkId]['document_id'] ?? null) !== $docId) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((float)$hit['score'] > $bestScore) {
|
||||
$bestScore = (float)$hit['score'];
|
||||
$bestText = $row['text'] ?? null;
|
||||
$bestText = $rows[$chunkId]['text'] ?? null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user