alpha: new hybrid retriever line

This commit is contained in:
team2
2026-02-26 07:02:07 +01:00
parent c12ae8b45e
commit df97f9314b
9 changed files with 460 additions and 152 deletions

View File

@@ -4,43 +4,65 @@ declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Entity\ModelGenerationConfig;
use App\Knowledge\ChunkManager;
use App\Repository\ModelGenerationConfigRepository;
use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient;
final class NdjsonHybridRetriever implements RetrieverInterface
{
/**
* Score threshold for vector hits.
* NOTE(review): not referenced in the lines visible here; presumably applied
* while collecting chunk IDs from vector hits — confirm against the full file.
*/
private const VECTOR_SCORE_THRESHOLD = 0.25;
/**
* When tag routing is active, TopK is increased
* because the hits are filtered by document_id afterwards.
*/
private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10;
/**
* Keyword scan: minimum number of matching terms for a chunk to count as a candidate.
*/
private const KEYWORD_MIN_HITS = 1;
/** Upper bound applied to the configured maximum number of chunks. */
private const HARD_MAX_CHUNKS = 200;
/** Upper bound applied to the vector TopK (both the configured base value and the routed, multiplied value). */
private const HARD_MAX_VECTORK = 200;
/**
* Receives the retriever's collaborators as promoted, readonly constructor properties.
*
* NOTE(review): $chunkManager, $lookup and $tagRouting each appear twice in this
* parameter list; this looks like removed and added diff lines shown side by side.
* Confirm against the real file before relying on $maxChunks / $vectorTopK.
*/
public function __construct(
private readonly ChunkManager $chunkManager,
private readonly NdjsonChunkLookup $lookup,
private readonly ChunkManager $chunkManager,
private readonly NdjsonChunkLookup $lookup,
private readonly VectorSearchClient $vectorClient,
private readonly TagRoutingService $tagRouting,
private readonly int $maxChunks = 100,
private readonly int $vectorTopK = 100,
private readonly TagRoutingService $tagRouting,
private readonly ModelGenerationConfigRepository $configRepository,
) {}
public function retrieve(string $prompt, int $limit = null): array
/**
* Normal operation: retrieval uses only the active config.
*
* Loads the config through the repository's findActiveForModel() and
* delegates the actual work to retrieveInternal().
*
* @return string[] Plain text knowledge chunks
*
* @throws \RuntimeException When no active ModelGenerationConfig is found.
*/
public function retrieve(string $prompt): array
{
// NOTE(review): $limit is never read in this method; this assignment looks like
// a removed diff line that is still shown here — confirm against the real file.
$limit = $this->maxChunks;
$config = $this->configRepository->findActiveForModel();
if ($config === null) {
throw new \RuntimeException('No active ModelGenerationConfig found.');
}
return $this->retrieveInternal($prompt, $config);
}
/**
* Admin test mode: runs retrieval against an explicitly supplied config.
* Does NOT change any global state.
*
* @return string[] Plain text knowledge chunks
*/
public function retrieveForConfig(string $prompt, ModelGenerationConfig $config): array
{
$chunks = $this->retrieveInternal($prompt, $config);

return $chunks;
}
/**
* Zentrale Retrieval-Logik (keine Duplikation).
*/
private function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
{
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
// ---------------------------------------------------------
// 0) Tag-Routing FIRST (soft gate)
// 1) Tag-Vector FIRST -> candidateSet (DocIDs)
// ---------------------------------------------------------
$candidateDocIds = $this->tagRouting->route($prompt);
$candidateDocIds = $this->tagRouting->route($prompt); // <= DAS muss intern auf Tag-Vector gehen
$candidateSet = null;
if (is_array($candidateDocIds) && $candidateDocIds !== []) {
@@ -48,31 +70,22 @@ final class NdjsonHybridRetriever implements RetrieverInterface
}
// ---------------------------------------------------------
// 1) Keyword first (simple streaming scan)
// 2) Vector chunks (primary)
// ---------------------------------------------------------
$terms = $this->extractTerms($prompt);
$keywordChunks = $this->keywordSearchStreaming($terms, $limit, $candidateSet);
if (\count($keywordChunks) >= $limit) {
return array_slice($keywordChunks, 0, $limit);
}
// ---------------------------------------------------------
// 2) Vector fallback / enrichment
// - If routed: increase TopK, then filter by document_id
// - Soft fallback: if filtering yields nothing -> global vector once
// ---------------------------------------------------------
$topK = $this->vectorTopK;
$topK = $vectorTopKBase;
if ($candidateSet !== null) {
$topK = max($this->vectorTopK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $this->vectorTopK);
$topK = min($topK, 200); // guardrail
$topK = min(
max($vectorTopKBase * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $vectorTopKBase),
self::HARD_MAX_VECTORK
);
}
$hits = $this->vectorClient->search($prompt, $topK);
if ($hits === []) {
return $keywordChunks;
// Tags-only System: kein Vector-Hit -> keine Chunks
return [];
}
$chunkIds = [];
@@ -87,14 +100,15 @@ final class NdjsonHybridRetriever implements RetrieverInterface
}
if ($chunkIds === []) {
return $keywordChunks;
return [];
}
$rows = $this->lookup->findByChunkIds($chunkIds);
// routed filtering by document_id
$finalChunkIds = $chunkIds;
// ---------------------------------------------------------
// 3) Routed filtering (wenn candidateSet vorhanden)
// ---------------------------------------------------------
if ($candidateSet !== null) {
$filtered = [];
@@ -103,18 +117,20 @@ final class NdjsonHybridRetriever implements RetrieverInterface
if (!is_array($row)) {
continue;
}
$docId = $row['document_id'] ?? null;
if (!is_string($docId) || !isset($candidateSet[$docId])) {
continue;
}
$filtered[] = $id;
}
// Soft fallback: if routing filtered everything away, retry global vector once
// Wenn Routing ALLES wegfiltert -> einmal global retry
if ($filtered === []) {
$hits2 = $this->vectorClient->search($prompt, $this->vectorTopK);
$hits2 = $this->vectorClient->search($prompt, $vectorTopKBase);
if ($hits2 === []) {
return $keywordChunks;
return [];
}
$chunkIds2 = [];
@@ -129,7 +145,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
}
if ($chunkIds2 === []) {
return $keywordChunks;
return [];
}
$rows = $this->lookup->findByChunkIds($chunkIds2);
@@ -139,24 +155,25 @@ final class NdjsonHybridRetriever implements RetrieverInterface
}
}
foreach ($finalChunkIds as $id) {
if (!isset($rows[$id]['text']) || !is_string($rows[$id]['text'])) {
continue;
}
$keywordChunks[] = trim($rows[$id]['text']);
}
// ---------------------------------------------------------
// 3) dedupe + limit
// 4) Collect texts + Dedupe + Limit
// ---------------------------------------------------------
$seen = [];
$out = [];
foreach ($keywordChunks as $chunk) {
foreach ($finalChunkIds as $id) {
$text = $rows[$id]['text'] ?? null;
if (!is_string($text) || $text === '') {
continue;
}
$chunk = trim($text);
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
if (isset($seen[$key])) {
continue;
}
$seen[$key] = true;
$out[] = $chunk;
@@ -168,16 +185,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return $out;
}
/**
* Streaming Keyword Search über index.ndjson.
* Minimal, aber nützlich:
* - Score = Anzahl gefundener Terms
* - CandidateDocs (Tag-Routing) reduziert Scan massiv
*
* @param string[] $terms
* @param array<string,true>|null $candidateSet
* @return string[]
*/
private function keywordSearchStreaming(array $terms, int $limit, ?array $candidateSet): array
{
if ($terms === []) {
@@ -185,31 +192,28 @@ final class NdjsonHybridRetriever implements RetrieverInterface
}
$maxScore = \count($terms);
// top list: each item = ['score' => int, 'text' => string]
$top = [];
foreach ($this->chunkManager->streamAll() as $row) {
$text = $row['text'] ?? null;
if (!is_string($text) || $text === '') {
continue;
}
if ($candidateSet !== null) {
$docId = $row['document_id'] ?? null;
if (!is_string($docId) || !isset($candidateSet[$docId])) {
continue;
}
}
$haystack = mb_strtolower($text);
$score = 0;
foreach ($terms as $t) {
if ($t === '') {
continue;
}
if (mb_stripos($haystack, $t) !== false) {
if ($t !== '' && mb_stripos($haystack, $t) !== false) {
$score++;
}
}
@@ -223,14 +227,11 @@ final class NdjsonHybridRetriever implements RetrieverInterface
'text' => trim($text),
];
// keep only best N (simple sort, N is tiny)
usort($top, static function (array $a, array $b): int {
// higher score first
$cmp = ($b['score'] <=> $a['score']);
if ($cmp !== 0) {
return $cmp;
}
// shorter chunk first (often more precise)
return (mb_strlen($a['text']) <=> mb_strlen($b['text']));
});
@@ -238,25 +239,14 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$top = array_slice($top, 0, $limit);
}
// early exit: perfect matches filled
if (\count($top) === $limit && ($top[0]['score'] ?? 0) >= $maxScore) {
break;
}
}
$out = [];
foreach ($top as $item) {
$out[] = (string)$item['text'];
}
return $out;
return array_map(static fn($item) => (string)$item['text'], $top);
}
/**
* Minimal term extraction (stabiles Verhalten, wenig Magie)
*
* @return string[]
*/
private function extractTerms(string $text): array
{
$text = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));
@@ -266,15 +256,14 @@ final class NdjsonHybridRetriever implements RetrieverInterface
static fn(string $w) => mb_strlen($w) > 2
));
// unique, order preserved
$seen = [];
$out = [];
foreach ($parts as $w) {
if (isset($seen[$w])) {
continue;
if (!isset($seen[$w])) {
$seen[$w] = true;
$out[] = $w;
}
$seen[$w] = true;
$out[] = $w;
}
return $out;

View File

@@ -1,11 +1,28 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
/**
* Retrieval is fully configuration-driven.
*
* - retrievalMaxChunks comes exclusively from the active ModelGenerationConfig.
* - retrievalVectorTopK comes exclusively from the active ModelGenerationConfig.
* - There is no runtime override.
*
* Goal:
* Deterministic, auditable retrieval behavior.
*/
interface RetrieverInterface
{
/**
* Retrieves relevant knowledge chunks for a given prompt.
*
* The number of returned chunks is strictly defined by
* the active ModelGenerationConfig (retrievalMaxChunks).
*
* @return string[] Plain text knowledge chunks
*/
public function retrieve(string $prompt, int $limit = 10): array;
}
public function retrieve(string $prompt): array;
}