Files
MtoRagSystem/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
2026-02-26 17:43:22 +01:00

324 lines
9.5 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Entity\ModelGenerationConfig;
use App\Knowledge\ChunkManager;
use App\Knowledge\QueryCleaner;
use App\Repository\ModelGenerationConfigRepository;
use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient;
/**
 * Hybrid retriever combining tag routing, vector search and a chunk lookup.
 *
 * Flow of one retrieval (see retrieveInternal()):
 *  1. the prompt is cleaned by the QueryCleaner,
 *  2. the TagRoutingService proposes candidate document ids,
 *  3. the vector client is queried (scoped to the candidates when there are any),
 *  4. hits scoring at least VECTOR_SCORE_THRESHOLD are resolved to chunk rows,
 *  5. the result is either one chunk per ranked document (list queries with
 *     candidates) or the de-duplicated chunk texts in hit order.
 */
final class NdjsonHybridRetriever implements RetrieverInterface
{
    /** Hits with a score below this value are discarded. */
    private const VECTOR_SCORE_THRESHOLD = 0.22;

    /** Factor applied to top-k when the search is scoped to routed documents. */
    private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 3;

    /** Factor applied to the configured top-k for list queries. */
    private const LIST_QUERY_TOPK_MULTIPLIER = 3;

    /** Lower bound of top-k for list queries (before the hard cap is applied). */
    private const LIST_QUERY_MIN_TOPK = 80;

    /** Upper bound for the number of returned chunks, regardless of configuration. */
    private const HARD_MAX_CHUNKS = 200;

    /** Upper bound for the top-k sent to the vector client, regardless of configuration. */
    private const HARD_MAX_VECTORK = 200;

    public function __construct(
        private readonly ChunkManager $chunkManager,
        private readonly NdjsonChunkLookup $lookup,
        private readonly VectorSearchClient $vectorClient,
        private readonly TagRoutingService $tagRouting,
        private readonly ModelGenerationConfigRepository $configRepository,
        private readonly QueryCleaner $queryCleaner,
    ) {}

    /**
     * Retrieves context chunks for a prompt using the active generation config.
     *
     * @return list<string> trimmed chunk texts
     *
     * @throws \RuntimeException when the repository has no active config
     */
    public function retrieve(string $prompt): array
    {
        $config = $this->configRepository->findActiveForModel();
        if ($config === null) {
            throw new \RuntimeException('No active ModelGenerationConfig found.');
        }

        return $this->retrieveInternal($prompt, $config);
    }

    /**
     * Retrieves context chunks for a prompt using an explicitly supplied config.
     *
     * @return list<string> trimmed chunk texts
     */
    public function retrieveForConfig(string $prompt, ModelGenerationConfig $config): array
    {
        return $this->retrieveInternal($prompt, $config);
    }

    /**
     * Runs the full retrieval pipeline.
     *
     * @return list<string> trimmed chunk texts
     */
    private function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
    {
        $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
        $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));

        // List detection deliberately runs on the ORIGINAL prompt: cleaning the
        // query could strip trigger words such as "zeige" / "liste".
        $isListQuery = $this->isListQuery($prompt);

        // The cleaned query is used for retrieval only (tag routing + vector search).
        $cleanQuery = $this->queryCleaner->clean($prompt);
        if ($cleanQuery === '') {
            $cleanQuery = $prompt;
        }

        // 1) Tag routing on the cleaned query.
        $candidateDocIds = $this->tagRouting->route($cleanQuery);
        $candidateSet = null;
        if (is_array($candidateDocIds) && $candidateDocIds !== []) {
            $candidateSet = array_fill_keys($candidateDocIds, true);
        }

        // 2) Determine top-k (always clamped to HARD_MAX_VECTORK).
        $topK = $this->resolveTopK($vectorTopKBase, $isListQuery, $candidateSet !== null);

        // 3) Vector search, scoped to the routed documents when possible.
        if ($candidateSet !== null) {
            $hits = $this->vectorClient->searchScoped(
                $cleanQuery,
                $topK,
                array_keys($candidateSet),
            );

            // Scoped search returned nothing: fall back to a global search
            // with the plain configured top-k.
            if ($hits === []) {
                $hits = $this->vectorClient->search($cleanQuery, $vectorTopKBase);
            }
        } else {
            $hits = $this->vectorClient->search($cleanQuery, $topK);
        }

        // 4) Keep well-formed hits at or above the threshold, normalised to
        //    string chunk ids and float scores.
        $acceptedHits = [];
        $chunkIds = [];
        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }

            $score = (float) $hit['score'];
            if ($score < self::VECTOR_SCORE_THRESHOLD) {
                continue;
            }

            $chunkId = (string) $hit['chunk_id'];
            $acceptedHits[] = ['chunk_id' => $chunkId, 'score' => $score];
            $chunkIds[] = $chunkId;
        }

        // Covers both "no hits at all" and "no hit survived the filter".
        if ($chunkIds === []) {
            return $candidateSet !== null
                ? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit)
                : [];
        }

        // Look each chunk up once; first-occurrence order is preserved.
        $chunkIds = array_values(array_unique($chunkIds));
        $rows = $this->lookup->findByChunkIds($chunkIds);

        // 5) List mode: rank documents and return the best chunk of each.
        if ($isListQuery && $candidateSet !== null) {
            $rankedDocIds = $this->rankDocumentsFromHits($acceptedHits, $rows, $candidateSet);
            if ($rankedDocIds === []) {
                return $this->fallbackChunksFromCandidateDocs($candidateSet, $limit);
            }

            $topDocIds = array_slice($rankedDocIds, 0, $limit);

            return $this->collectBestChunkPerDocument($topDocIds, $acceptedHits, $rows);
        }

        // 6) Normal chunk mode.
        return $this->collectTexts($chunkIds, $rows, $limit);
    }

    /**
     * Computes the top-k for the vector search and clamps it to HARD_MAX_VECTORK.
     */
    private function resolveTopK(int $vectorTopKBase, bool $isListQuery, bool $isRouted): int
    {
        $topK = $isListQuery
            ? max($vectorTopKBase * self::LIST_QUERY_TOPK_MULTIPLIER, self::LIST_QUERY_MIN_TOPK)
            : $vectorTopKBase;

        if ($isRouted) {
            $topK *= self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED;
        }

        // The cap applies to every path, including unrouted list queries.
        return min($topK, self::HARD_MAX_VECTORK);
    }

    // =========================================================
    // LIST QUERY DETECTION
    // =========================================================

    /**
     * Heuristically decides whether the prompt asks for a list of items.
     *
     * True when the lower-cased prompt contains one of the German trigger
     * substrings or a standalone run of digits. The checks are plain substring
     * matches, so a trigger also matches inside a longer word.
     * NOTE(review): any number in the prompt switches to list mode - presumably
     * meant for prompts like "nenne 5 ..."; confirm this is intended.
     */
    private function isListQuery(string $prompt): bool
    {
        $prompt = mb_strtolower($prompt);

        return str_contains($prompt, 'liste')
            || str_contains($prompt, 'zeige')
            || str_contains($prompt, 'nenn')
            || str_contains($prompt, 'welche')
            || preg_match('/\b\d+\b/', $prompt) === 1;
    }

    // =========================================================
    // DOCUMENT RANKING
    // =========================================================

    /**
     * Ranks candidate documents by the mean of their (up to) three best hit scores.
     *
     * Hits whose chunk has no row, or whose row does not belong to a candidate
     * document, are ignored.
     *
     * @param array<int, array<string, mixed>> $hits         hits with 'chunk_id' and 'score'
     * @param array<array<string, mixed>>      $rows         chunk rows keyed by chunk id
     * @param array<int|string, true>          $candidateSet candidate document ids as keys
     *
     * @return list<string> document ids, best first
     */
    private function rankDocumentsFromHits(
        array $hits,
        array $rows,
        array $candidateSet,
    ): array {
        $documentScores = [];
        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }

            $chunkId = (string) $hit['chunk_id'];
            if (!isset($rows[$chunkId])) {
                continue;
            }

            $docId = $rows[$chunkId]['document_id'] ?? null;
            if (!is_string($docId) || !isset($candidateSet[$docId])) {
                continue;
            }

            $documentScores[$docId][] = (float) $hit['score'];
        }

        if ($documentScores === []) {
            return [];
        }

        $ranked = [];
        foreach ($documentScores as $docId => $scores) {
            rsort($scores);
            $topScores = array_slice($scores, 0, 3);
            $ranked[$docId] = array_sum($topScores) / count($topScores);
        }
        arsort($ranked);

        // PHP turns numeric-string array keys into ints; hand ids back as strings.
        return array_map(
            static fn (int|string $docId): string => (string) $docId,
            array_keys($ranked),
        );
    }

    /**
     * Returns, for each given document, the text of its highest-scoring hit.
     *
     * Documents whose best hit has no usable text are skipped, so the result may
     * be shorter than $docIds. On equal scores the earlier hit wins.
     *
     * @param array<int, int|string>           $docIds document ids in output order
     * @param array<int, array<string, mixed>> $hits   hits with 'chunk_id' and 'score'
     * @param array<array<string, mixed>>      $rows   chunk rows keyed by chunk id
     *
     * @return list<string> trimmed chunk texts
     */
    private function collectBestChunkPerDocument(
        array $docIds,
        array $hits,
        array $rows,
    ): array {
        // Single pass over the hits instead of rescanning them per document.
        $bestByDoc = [];
        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }

            $chunkId = (string) $hit['chunk_id'];
            if (!isset($rows[$chunkId])) {
                continue;
            }

            $docId = $rows[$chunkId]['document_id'] ?? null;
            if (!is_string($docId)) {
                continue;
            }

            $score = (float) $hit['score'];
            if ($score > ($bestByDoc[$docId]['score'] ?? -INF)) {
                $bestByDoc[$docId] = [
                    'score' => $score,
                    'text' => $rows[$chunkId]['text'] ?? null,
                ];
            }
        }

        $result = [];
        foreach ($docIds as $docId) {
            // Array lookup applies the same key normalisation on both sides,
            // so numeric document ids match regardless of int/string form.
            $chunk = $this->cleanChunkText($bestByDoc[$docId]['text'] ?? null);
            if ($chunk !== null) {
                $result[] = $chunk;
            }
        }

        return $result;
    }

    // =========================================================
    // FALLBACK + NORMAL MODE
    // =========================================================

    /**
     * Streams all chunks and returns the first $limit distinct texts that belong
     * to one of the candidate documents.
     *
     * @param array<int|string, true> $candidateSet candidate document ids as keys
     *
     * @return list<string> trimmed chunk texts
     */
    private function fallbackChunksFromCandidateDocs(array $candidateSet, int $limit): array
    {
        $seen = [];
        $out = [];
        foreach ($this->chunkManager->streamAll() as $row) {
            $docId = $row['document_id'] ?? null;
            if (!is_string($docId) || !isset($candidateSet[$docId])) {
                continue;
            }

            $chunk = $this->cleanChunkText($row['text'] ?? null);
            if ($chunk === null) {
                continue;
            }

            $key = $this->dedupKey($chunk);
            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $chunk;
            if (count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Resolves chunk ids to their texts, in the given order, dropping duplicates
     * and unusable texts, and stops after $limit entries.
     *
     * @param list<string>                $chunkIds chunk ids in hit order
     * @param array<array<string, mixed>> $rows     chunk rows keyed by chunk id
     *
     * @return list<string> trimmed chunk texts
     */
    private function collectTexts(array $chunkIds, array $rows, int $limit): array
    {
        $seen = [];
        $out = [];
        foreach ($chunkIds as $id) {
            $chunk = $this->cleanChunkText($rows[$id]['text'] ?? null);
            if ($chunk === null) {
                continue;
            }

            $key = $this->dedupKey($chunk);
            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $chunk;
            if (count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Returns the trimmed text, or null when the value is not a string or is
     * empty after trimming.
     */
    private function cleanChunkText(mixed $text): ?string
    {
        if (!is_string($text)) {
            return null;
        }

        $chunk = trim($text);

        return $chunk === '' ? null : $chunk;
    }

    /**
     * Builds the key used to detect duplicate chunks: whitespace runs collapsed
     * to a single space, then lower-cased.
     */
    private function dedupKey(string $chunk): string
    {
        // preg_replace() returns null on failure (e.g. invalid UTF-8 with the
        // /u modifier); fall back to the raw text so such chunks do not all
        // share the same empty key.
        $collapsed = preg_replace('/\s+/u', ' ', $chunk);

        return mb_strtolower($collapsed ?? $chunk);
    }
}