alpha new hybridretriever line

This commit is contained in:
team2
2026-02-26 07:33:35 +01:00
parent df97f9314b
commit 8beb6e7d7e

View File

@@ -14,7 +14,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
{ {
private const VECTOR_SCORE_THRESHOLD = 0.25; private const VECTOR_SCORE_THRESHOLD = 0.25;
private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10; private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10;
private const KEYWORD_MIN_HITS = 1;
private const HARD_MAX_CHUNKS = 200; private const HARD_MAX_CHUNKS = 200;
private const HARD_MAX_VECTORK = 200; private const HARD_MAX_VECTORK = 200;
@@ -27,9 +26,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
private readonly ModelGenerationConfigRepository $configRepository, private readonly ModelGenerationConfigRepository $configRepository,
) {} ) {}
/**
* Normalbetrieb ausschließlich aktive Config.
*/
public function retrieve(string $prompt): array public function retrieve(string $prompt): array
{ {
$config = $this->configRepository->findActiveForModel(); $config = $this->configRepository->findActiveForModel();
@@ -41,37 +37,31 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return $this->retrieveInternal($prompt, $config); return $this->retrieveInternal($prompt, $config);
} }
/**
* Admin-Testbetrieb explizite Config.
* Verändert KEINEN globalen Zustand.
*/
public function retrieveForConfig(string $prompt, ModelGenerationConfig $config): array public function retrieveForConfig(string $prompt, ModelGenerationConfig $config): array
{ {
return $this->retrieveInternal($prompt, $config); return $this->retrieveInternal($prompt, $config);
} }
/**
* Zentrale Retrieval-Logik (keine Duplikation).
*/
private function retrieveInternal(string $prompt, ModelGenerationConfig $config): array private function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
{ {
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS)); $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
// --------------------------------------------------------- $isListQuery = $this->isListQuery($prompt);
// 1) Tag-Vector FIRST -> candidateSet (DocIDs)
// --------------------------------------------------------- // -------------------------------
$candidateDocIds = $this->tagRouting->route($prompt); // <= DAS muss intern auf Tag-Vector gehen // 1) Tag Routing
// -------------------------------
$candidateDocIds = $this->tagRouting->route($prompt);
$candidateSet = null; $candidateSet = null;
if (is_array($candidateDocIds) && $candidateDocIds !== []) { if (is_array($candidateDocIds) && $candidateDocIds !== []) {
$candidateSet = array_fill_keys($candidateDocIds, true); $candidateSet = array_fill_keys($candidateDocIds, true);
} }
// --------------------------------------------------------- // -------------------------------
// 2) Vector chunks (primary) // 2) Vector Search
// --------------------------------------------------------- // -------------------------------
$topK = $vectorTopKBase; $topK = $vectorTopKBase;
if ($candidateSet !== null) { if ($candidateSet !== null) {
@@ -84,85 +74,183 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$hits = $this->vectorClient->search($prompt, $topK); $hits = $this->vectorClient->search($prompt, $topK);
if ($hits === []) { if ($hits === []) {
// Tags-only System: kein Vector-Hit -> keine Chunks return $candidateSet !== null
return []; ? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit)
: [];
} }
// -------------------------------
// 3) Chunk-IDs + Lookup einmalig
// -------------------------------
$chunkIds = []; $chunkIds = [];
foreach ($hits as $hit) { foreach ($hits as $hit) {
if (!isset($hit['chunk_id'], $hit['score'])) { if (!isset($hit['chunk_id'], $hit['score'])) {
continue; continue;
} }
if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) { if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
continue; continue;
} }
$chunkIds[] = (string)$hit['chunk_id']; $chunkIds[] = (string)$hit['chunk_id'];
} }
if ($chunkIds === []) { if ($chunkIds === []) {
return []; return $candidateSet !== null
? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit)
: [];
} }
$rows = $this->lookup->findByChunkIds($chunkIds); $rows = $this->lookup->findByChunkIds($chunkIds);
$finalChunkIds = $chunkIds;
// --------------------------------------------------------- // -------------------------------
// 3) Routed filtering (wenn candidateSet vorhanden) // 4) Listen-Modus → Dokument-Ranking
// --------------------------------------------------------- // -------------------------------
if ($candidateSet !== null) { if ($isListQuery && $candidateSet !== null) {
$filtered = [];
foreach ($chunkIds as $id) { $rankedDocIds = $this->rankDocumentsFromHits($hits, $rows, $candidateSet);
$row = $rows[$id] ?? null;
if (!is_array($row)) {
continue;
}
$docId = $row['document_id'] ?? null; if ($rankedDocIds === []) {
if (!is_string($docId) || !isset($candidateSet[$docId])) { return $this->fallbackChunksFromCandidateDocs($candidateSet, $limit);
continue;
}
$filtered[] = $id;
} }
// Wenn Routing ALLES wegfiltert -> einmal global retry $topDocIds = array_slice($rankedDocIds, 0, $limit);
if ($filtered === []) {
$hits2 = $this->vectorClient->search($prompt, $vectorTopKBase); return $this->collectBestChunkPerDocument($topDocIds, $hits, $rows);
if ($hits2 === []) { }
return [];
// -------------------------------
// 5) Normaler Chunk-Modus
// -------------------------------
return $this->collectTexts($chunkIds, $rows, $limit);
}
// =========================================================
// LIST QUERY DETECTION
// =========================================================
/**
 * Heuristically decides whether a prompt asks for a listing.
 *
 * The prompt is lower-cased and then checked for a fixed set of German
 * marker fragments. Matching is by substring, so inflected and prefixed
 * forms that contain a fragment also count. A prompt that contains a
 * standalone run of digits is treated as a list query as well.
 *
 * @param string $prompt raw user prompt
 *
 * @return bool true when any marker fragment or a standalone number occurs
 */
private function isListQuery(string $prompt): bool
{
    $normalized = mb_strtolower($prompt);

    foreach (['liste', 'zeige', 'nenn', 'welche'] as $marker) {
        if (str_contains($normalized, $marker)) {
            return true;
        }
    }

    // No marker fragment found: fall back to "contains a standalone number".
    return preg_match('/\b\d+\b/', $normalized) === 1;
}
// =========================================================
// DOCUMENT RANKING
// =========================================================
/**
 * Ranks candidate documents by the mean of their best hit scores.
 *
 * Each hit is attributed to the document of its chunk row. Per document the
 * up to three highest scores are averaged, and documents are returned in
 * descending order of that average.
 *
 * @param array $hits         vector hits; entries without 'chunk_id' or 'score' are skipped
 * @param array $rows         chunk rows keyed by chunk id; 'document_id' is read from each row
 * @param array $candidateSet allowed document ids, stored as array keys
 *
 * @return list<string> document ids, best first; empty when no hit maps to a candidate document
 */
private function rankDocumentsFromHits(
    array $hits,
    array $rows,
    array $candidateSet
): array {
    $documentScores = [];

    foreach ($hits as $hit) {
        if (!isset($hit['chunk_id'], $hit['score'])) {
            continue;
        }

        $chunkId = (string)$hit['chunk_id'];

        // Hits whose chunk row was not looked up cannot be attributed to a document.
        if (!isset($rows[$chunkId])) {
            continue;
        }

        $docId = $rows[$chunkId]['document_id'] ?? null;

        if (!is_string($docId) || !isset($candidateSet[$docId])) {
            continue;
        }

        $documentScores[$docId][] = (float)$hit['score'];
    }

    if ($documentScores === []) {
        return [];
    }

    $ranked = [];

    foreach ($documentScores as $docId => $scores) {
        rsort($scores);
        $topScores = array_slice($scores, 0, 3);
        $ranked[$docId] = array_sum($topScores) / count($topScores);
    }

    arsort($ranked);

    // PHP casts integer-like string keys ("10") to int, so array_keys() would
    // hand back ints for numeric document ids. Cast back to string so that
    // strict comparisons against a row's string 'document_id' still match.
    return array_map(
        static fn (int|string $docId): string => (string)$docId,
        array_keys($ranked),
    );
}
private function collectBestChunkPerDocument(
array $docIds,
array $hits,
array $rows
): array {
$result = [];
foreach ($docIds as $docId) {
$bestScore = -INF;
$bestText = null;
foreach ($hits as $hit) {
if (!isset($hit['chunk_id'], $hit['score'])) {
continue;
} }
$chunkIds2 = []; $chunkId = (string)$hit['chunk_id'];
foreach ($hits2 as $hit) {
if (!isset($hit['chunk_id'], $hit['score'])) { if (!isset($rows[$chunkId])) {
continue; continue;
}
if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
continue;
}
$chunkIds2[] = (string)$hit['chunk_id'];
} }
if ($chunkIds2 === []) { $row = $rows[$chunkId];
return [];
if (($row['document_id'] ?? null) !== $docId) {
continue;
} }
$rows = $this->lookup->findByChunkIds($chunkIds2); if ((float)$hit['score'] > $bestScore) {
$finalChunkIds = $chunkIds2; $bestScore = (float)$hit['score'];
} else { $bestText = $row['text'] ?? null;
$finalChunkIds = $filtered; }
}
if (is_string($bestText) && $bestText !== '') {
$result[] = trim($bestText);
} }
} }
// --------------------------------------------------------- return $result;
// 4) Collect texts + Dedupe + Limit }
// ---------------------------------------------------------
// =========================================================
// FALLBACK + NORMAL MODE
// =========================================================
private function fallbackChunksFromCandidateDocs(array $candidateSet, int $limit): array
{
$seen = []; $seen = [];
$out = []; $out = [];
foreach ($finalChunkIds as $id) { foreach ($this->chunkManager->streamAll() as $row) {
$text = $rows[$id]['text'] ?? null; $docId = $row['document_id'] ?? null;
if (!is_string($docId) || !isset($candidateSet[$docId])) {
continue;
}
$text = $row['text'] ?? null;
if (!is_string($text) || $text === '') { if (!is_string($text) || $text === '') {
continue; continue;
} }
@@ -185,84 +273,29 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return $out; return $out;
} }
private function keywordSearchStreaming(array $terms, int $limit, ?array $candidateSet): array private function collectTexts(array $chunkIds, array $rows, int $limit): array
{ {
if ($terms === []) {
return [];
}
$maxScore = \count($terms);
$top = [];
foreach ($this->chunkManager->streamAll() as $row) {
$text = $row['text'] ?? null;
if (!is_string($text) || $text === '') {
continue;
}
if ($candidateSet !== null) {
$docId = $row['document_id'] ?? null;
if (!is_string($docId) || !isset($candidateSet[$docId])) {
continue;
}
}
$haystack = mb_strtolower($text);
$score = 0;
foreach ($terms as $t) {
if ($t !== '' && mb_stripos($haystack, $t) !== false) {
$score++;
}
}
if ($score < self::KEYWORD_MIN_HITS) {
continue;
}
$top[] = [
'score' => $score,
'text' => trim($text),
];
usort($top, static function (array $a, array $b): int {
$cmp = ($b['score'] <=> $a['score']);
if ($cmp !== 0) {
return $cmp;
}
return (mb_strlen($a['text']) <=> mb_strlen($b['text']));
});
if (\count($top) > $limit) {
$top = array_slice($top, 0, $limit);
}
if (\count($top) === $limit && ($top[0]['score'] ?? 0) >= $maxScore) {
break;
}
}
return array_map(static fn($item) => (string)$item['text'], $top);
}
private function extractTerms(string $text): array
{
$text = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));
$parts = array_values(array_filter(
explode(' ', $text),
static fn(string $w) => mb_strlen($w) > 2
));
$seen = []; $seen = [];
$out = []; $out = [];
foreach ($parts as $w) { foreach ($chunkIds as $id) {
if (!isset($seen[$w])) { if (!isset($rows[$id]['text'])) {
$seen[$w] = true; continue;
$out[] = $w; }
$chunk = trim($rows[$id]['text']);
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
if (isset($seen[$key])) {
continue;
}
$seen[$key] = true;
$out[] = $chunk;
if (\count($out) >= $limit) {
break;
} }
} }