add tagging
This commit is contained in:
@@ -4,33 +4,75 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Knowledge\QueryCleaner;
|
||||
use App\Knowledge\ChunkManager;
|
||||
use App\Tag\TagRoutingService;
|
||||
use App\Vector\VectorSearchClient;
|
||||
|
||||
final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
private const VECTOR_SCORE_THRESHOLD = 0.25;
|
||||
private const VECTOR_SCORE_THRESHOLD = 0.65;
|
||||
|
||||
/**
|
||||
* Wenn Tag-Routing aktiv ist, erhöhen wir TopK,
|
||||
* weil wir danach per document_id filtern.
|
||||
*/
|
||||
private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10;
|
||||
|
||||
/**
|
||||
* Keyword-Scan: Mindest-Trefferanzahl an Terms, damit ein Chunk als Kandidat gilt.
|
||||
*/
|
||||
private const KEYWORD_MIN_HITS = 1;
|
||||
|
||||
public function __construct(
|
||||
private readonly ChunkManager $chunkManager,
|
||||
private readonly NdjsonChunkLookup $lookup,
|
||||
private readonly VectorSearchClient $vectorClient,
|
||||
private readonly QueryCleaner $queryCleaner,
|
||||
private readonly int $maxChunks = 25,
|
||||
private readonly int $vectorTopK = 10,
|
||||
)
|
||||
{
|
||||
}
|
||||
private readonly TagRoutingService $tagRouting,
|
||||
private readonly int $maxChunks = 3,
|
||||
private readonly int $vectorTopK = 5,
|
||||
) {}
|
||||
|
||||
public function retrieve(string $prompt, int $limit = null): array
|
||||
{
|
||||
$limit = $this->maxChunks;
|
||||
$keywordChunks = [];
|
||||
$query = $this->queryCleaner->clean($prompt);
|
||||
$limit ??= $this->maxChunks;
|
||||
|
||||
// Vector / enrichment
|
||||
$hits = $this->vectorClient->search($query, $this->vectorTopK);
|
||||
// ---------------------------------------------------------
|
||||
// 0) Tag-Routing FIRST (soft gate)
|
||||
// ---------------------------------------------------------
|
||||
$candidateDocIds = $this->tagRouting->route($prompt);
|
||||
|
||||
$candidateSet = null;
|
||||
|
||||
if (is_array($candidateDocIds) && $candidateDocIds !== []) {
|
||||
$candidateSet = array_fill_keys($candidateDocIds, true);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// 1) Keyword first (simple streaming scan)
|
||||
// ---------------------------------------------------------
|
||||
$terms = $this->extractTerms($prompt);
|
||||
|
||||
$keywordChunks = $this->keywordSearchStreaming($terms, $limit, $candidateSet);
|
||||
|
||||
if (\count($keywordChunks) >= $limit) {
|
||||
return array_slice($keywordChunks, 0, $limit);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// 2) Vector fallback / enrichment
|
||||
// - If routed: increase TopK, then filter by document_id
|
||||
// - Soft fallback: if filtering yields nothing -> global vector once
|
||||
// ---------------------------------------------------------
|
||||
$topK = $this->vectorTopK;
|
||||
|
||||
if ($candidateSet !== null) {
|
||||
$topK = max($this->vectorTopK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $this->vectorTopK);
|
||||
$topK = min($topK, 200); // guardrail
|
||||
}
|
||||
|
||||
$hits = $this->vectorClient->search($prompt, $topK);
|
||||
if ($hits === []) {
|
||||
return $this->diversifyByDevice($keywordChunks, $limit, 1);
|
||||
return $keywordChunks;
|
||||
}
|
||||
|
||||
$chunkIds = [];
|
||||
@@ -45,73 +87,78 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
if ($chunkIds === []) {
|
||||
return $this->diversifyByDevice($keywordChunks, $limit, 1);
|
||||
return $keywordChunks;
|
||||
}
|
||||
|
||||
$rows = $this->lookup->findByChunkIds($chunkIds);
|
||||
|
||||
foreach ($chunkIds as $id) {
|
||||
// routed filtering by document_id
|
||||
$finalChunkIds = $chunkIds;
|
||||
|
||||
if ($candidateSet !== null) {
|
||||
$filtered = [];
|
||||
|
||||
foreach ($chunkIds as $id) {
|
||||
$row = $rows[$id] ?? null;
|
||||
if (!is_array($row)) {
|
||||
continue;
|
||||
}
|
||||
$docId = $row['document_id'] ?? null;
|
||||
if (!is_string($docId) || !isset($candidateSet[$docId])) {
|
||||
continue;
|
||||
}
|
||||
$filtered[] = $id;
|
||||
}
|
||||
|
||||
// Soft fallback: if routing filtered everything away, retry global vector once
|
||||
if ($filtered === []) {
|
||||
$hits2 = $this->vectorClient->search($prompt, $this->vectorTopK);
|
||||
if ($hits2 === []) {
|
||||
return $keywordChunks;
|
||||
}
|
||||
|
||||
$chunkIds2 = [];
|
||||
foreach ($hits2 as $hit) {
|
||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
||||
continue;
|
||||
}
|
||||
if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
|
||||
continue;
|
||||
}
|
||||
$chunkIds2[] = (string)$hit['chunk_id'];
|
||||
}
|
||||
|
||||
if ($chunkIds2 === []) {
|
||||
return $keywordChunks;
|
||||
}
|
||||
|
||||
$rows = $this->lookup->findByChunkIds($chunkIds2);
|
||||
$finalChunkIds = $chunkIds2;
|
||||
} else {
|
||||
$finalChunkIds = $filtered;
|
||||
}
|
||||
}
|
||||
|
||||
foreach ($finalChunkIds as $id) {
|
||||
if (!isset($rows[$id]['text']) || !is_string($rows[$id]['text'])) {
|
||||
continue;
|
||||
}
|
||||
$keywordChunks[] = trim($rows[$id]['text']);
|
||||
}
|
||||
|
||||
// dedupe
|
||||
// ---------------------------------------------------------
|
||||
// 3) dedupe + limit
|
||||
// ---------------------------------------------------------
|
||||
$seen = [];
|
||||
$deduped = [];
|
||||
$out = [];
|
||||
|
||||
foreach ($keywordChunks as $chunk) {
|
||||
$key = mb_strtolower(preg_replace('/\s+/u', ' ', $chunk));
|
||||
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
|
||||
if (isset($seen[$key])) {
|
||||
continue;
|
||||
}
|
||||
$seen[$key] = true;
|
||||
$deduped[] = $chunk;
|
||||
}
|
||||
|
||||
// diversify
|
||||
return $this->diversifyByDevice($deduped, $limit, 1);
|
||||
}
|
||||
|
||||
/**
 * Splits free text into lowercase search terms.
 *
 * Every character that is not a Unicode letter, a Unicode digit or whitespace
 * is removed, the remainder is lowercased and split on whitespace. Only terms
 * longer than two characters are kept; duplicates are preserved in input order.
 *
 * @return string[]
 */
private function extractTerms(string $text): array
{
    // preg_replace() yields null on failure; the cast turns that into an
    // empty string and therefore into an empty term list.
    $normalized = mb_strtolower((string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));

    // Split on any whitespace run: the filter above keeps tabs and newlines,
    // so splitting on a single space would glue words from different lines together.
    $words = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];

    return array_values(array_filter(
        $words,
        static fn (string $word): bool => mb_strlen($word) > 2,
    ));
}
|
||||
|
||||
/**
 * Returns the trimmed first line of a chunk, i.e. the text before the first "\n".
 *
 * A chunk without any newline is treated as a single line.
 */
private function extractDevice(string $chunk): string
{
    $newlineAt = strpos($chunk, "\n");

    $headline = $newlineAt === false
        ? $chunk
        : substr($chunk, 0, $newlineAt);

    return trim($headline);
}
|
||||
|
||||
private function diversifyByDevice(array $chunks, int $limit, int $maxPerDevice = 1): array
|
||||
{
|
||||
$seenDevices = [];
|
||||
$out = [];
|
||||
|
||||
foreach ($chunks as $chunk) {
|
||||
$device = $this->extractDevice($chunk);
|
||||
|
||||
if ($device === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!isset($seenDevices[$device])) {
|
||||
$seenDevices[$device] = 0;
|
||||
}
|
||||
|
||||
if ($seenDevices[$device] >= $maxPerDevice) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$out[] = $chunk;
|
||||
$seenDevices[$device]++;
|
||||
|
||||
if (\count($out) >= $limit) {
|
||||
break;
|
||||
@@ -120,4 +167,116 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
return $out;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Streaming keyword search over index.ndjson (rows come from ChunkManager::streamAll()).
 *
 * Deliberately minimal:
 * - A row's score is the number of terms found in its text (case-insensitive substring match).
 * - Rows scoring below KEYWORD_MIN_HITS are dropped.
 * - The best $limit rows are kept, ordered by score (descending), then by text length (ascending).
 * - When $candidateSet is given (tag routing), rows whose document_id is not a key of the set
 *   are skipped before scoring, which shrinks the scan considerably.
 *
 * @param string[]                $terms        search terms; empty strings are ignored
 * @param int                     $limit        maximum number of chunk texts to return
 * @param array<string,true>|null $candidateSet allowed document ids as keys, or null for no restriction
 *
 * @return string[] trimmed chunk texts, best match first
 */
private function keywordSearchStreaming(array $terms, int $limit, ?array $candidateSet): array
{
    // Empty terms can never match; drop them once instead of re-checking per row.
    $terms = array_values(array_filter($terms, static fn (string $term): bool => $term !== ''));

    // Nothing to search for, or nothing may be returned: skip the scan entirely.
    if ($terms === [] || $limit <= 0) {
        return [];
    }

    $maxScore = \count($terms);

    // Running top list; each item = ['score' => int, 'text' => string].
    $top = [];

    foreach ($this->chunkManager->streamAll() as $row) {
        $text = $row['text'] ?? null;
        if (!is_string($text) || $text === '') {
            continue;
        }

        if ($candidateSet !== null) {
            $docId = $row['document_id'] ?? null;
            if (!is_string($docId) || !isset($candidateSet[$docId])) {
                continue;
            }
        }

        $haystack = mb_strtolower($text);

        $score = 0;
        foreach ($terms as $term) {
            if (mb_stripos($haystack, $term) !== false) {
                $score++;
            }
        }

        if ($score < self::KEYWORD_MIN_HITS) {
            continue;
        }

        $top[] = [
            'score' => $score,
            'text' => trim($text),
        ];

        // Keep only the best N (plain sort; N is tiny).
        usort($top, static function (array $a, array $b): int {
            // Higher score first.
            $cmp = ($b['score'] <=> $a['score']);
            if ($cmp !== 0) {
                return $cmp;
            }

            // Shorter chunk first (often more precise).
            return mb_strlen($a['text']) <=> mb_strlen($b['text']);
        });

        if (\count($top) > $limit) {
            $top = array_slice($top, 0, $limit);
        }

        // Early exit only once EVERY retained slot holds a perfect match. The list is sorted
        // best-first, so inspecting the last slot covers all of them; checking only the first
        // slot would stop the scan while weaker hits still occupy the remaining slots.
        if (\count($top) === $limit && $top[$limit - 1]['score'] >= $maxScore) {
            break;
        }
    }

    return array_column($top, 'text');
}
|
||||
|
||||
/**
 * Minimal term extraction (stable behaviour, little magic).
 *
 * Every character that is not a Unicode letter, a Unicode digit or whitespace
 * is removed, the remainder is lowercased and split on whitespace. Only terms
 * longer than two characters are kept, and each term appears once, in order
 * of first occurrence.
 *
 * @return string[]
 */
private function extractTerms(string $text): array
{
    // preg_replace() yields null on failure; the cast turns that into an
    // empty string and therefore into an empty term list.
    $normalized = mb_strtolower((string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));

    // Split on any whitespace run: the filter above keeps tabs and newlines,
    // so splitting on a single space would glue words from different lines together.
    $words = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];

    $longEnough = array_filter(
        $words,
        static fn (string $word): bool => mb_strlen($word) > 2,
    );

    // array_unique() keeps the first occurrence of each term, so order is preserved;
    // array_values() restores a gap-free list.
    return array_values(array_unique($longEnough));
}
|
||||
}
|
||||
Reference in New Issue
Block a user