MtoRagSystem/src/Knowledge/Retrieval/NdjsonHybridRetriever.php

<?php

declare(strict_types=1);

namespace App\Knowledge\Retrieval;

use App\Knowledge\ChunkManager;
use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient;

/**
 * Hybrid retriever: streaming keyword scan first, vector search as
 * fallback / enrichment, optionally narrowed by tag routing.
 *
 * Flow of {@see retrieve()}:
 *  0. Tag routing yields an optional set of candidate document ids (soft gate).
 *  1. A keyword scan over all streamed chunks (restricted to the candidate
 *     documents when routing produced any).
 *  2. If the keyword scan did not fill the limit, vector search results are
 *     appended (filtered by candidate documents, with one unrouted retry when
 *     that filter removes everything).
 *  3. The combined list is de-duplicated and cut to the limit.
 */
final class NdjsonHybridRetriever implements RetrieverInterface
{
    /** Vector hits scoring below this value are ignored. */
    private const VECTOR_SCORE_THRESHOLD = 0.25;

    /**
     * When tag routing is active we request more vector hits (TopK),
     * because the hits are filtered by document_id afterwards.
     */
    private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10;

    /** Upper guardrail for the enlarged TopK used when tag routing is active. */
    private const VECTOR_TOPK_CAP_WHEN_ROUTED = 200;

    /**
     * Keyword scan: minimum number of matched terms for a chunk to count as a candidate.
     */
    private const KEYWORD_MIN_HITS = 1;

    /**
     * @param ChunkManager       $chunkManager Source of the streamed chunk rows for the keyword scan.
     * @param NdjsonChunkLookup  $lookup       Resolves chunk ids (from vector hits) to chunk rows.
     * @param VectorSearchClient $vectorClient Vector search backend.
     * @param TagRoutingService  $tagRouting   Produces candidate document ids for a prompt.
     * @param int                $maxChunks    Default result limit when retrieve() gets none.
     * @param int                $vectorTopK   Base number of vector hits to request.
     */
    public function __construct(
        private readonly ChunkManager       $chunkManager,
        private readonly NdjsonChunkLookup  $lookup,
        private readonly VectorSearchClient $vectorClient,
        private readonly TagRoutingService  $tagRouting,
        private readonly int                $maxChunks = 3,
        private readonly int                $vectorTopK = 5,
    ) {}

    /**
     * Retrieves up to $limit chunk texts relevant to the prompt.
     *
     * @param string   $prompt User prompt / query text.
     * @param int|null $limit  Maximum number of chunks; null falls back to the configured maxChunks.
     *                         A non-positive limit yields an empty result.
     *
     * @return string[] Chunk texts, keyword matches first, then vector matches.
     */
    public function retrieve(string $prompt, ?int $limit = null): array
    {
        $limit ??= $this->maxChunks;

        // A non-positive limit can never be satisfied; a negative one would
        // additionally make array_slice() drop elements from the end.
        if ($limit <= 0) {
            return [];
        }

        // ---------------------------------------------------------
        // 0) Tag routing FIRST (soft gate)
        // ---------------------------------------------------------
        $candidateDocIds = $this->tagRouting->route($prompt);

        // null means "no routing": every document is eligible.
        $candidateSet = null;

        if (is_array($candidateDocIds) && $candidateDocIds !== []) {
            $candidateSet = array_fill_keys($candidateDocIds, true);
        }

        // ---------------------------------------------------------
        // 1) Keyword first (simple streaming scan)
        // ---------------------------------------------------------
        $terms = $this->extractTerms($prompt);

        $keywordChunks = $this->keywordSearchStreaming($terms, $limit, $candidateSet);

        if (\count($keywordChunks) >= $limit) {
            return array_slice($keywordChunks, 0, $limit);
        }

        // ---------------------------------------------------------
        // 2) Vector fallback / enrichment
        //    - If routed: increase TopK, then filter by document_id
        //    - Soft fallback: if filtering yields nothing -> global vector once
        // ---------------------------------------------------------
        $topK = $this->vectorTopK;

        if ($candidateSet !== null) {
            // Enlarge TopK up to the guardrail, but never below the configured base value.
            $topK = max(
                $this->vectorTopK,
                min(
                    $this->vectorTopK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED,
                    self::VECTOR_TOPK_CAP_WHEN_ROUTED,
                ),
            );
        }

        $chunkIds = $this->collectChunkIds($this->vectorClient->search($prompt, $topK));

        if ($chunkIds === []) {
            return $keywordChunks;
        }

        $rows = $this->lookup->findByChunkIds($chunkIds);

        if ($candidateSet !== null) {
            // Keep only hits whose chunk belongs to a routed candidate document.
            $routedIds = [];

            foreach ($chunkIds as $id) {
                $row = $rows[$id] ?? null;
                if (!is_array($row)) {
                    continue;
                }

                $docId = $row['document_id'] ?? null;
                if (is_string($docId) && isset($candidateSet[$docId])) {
                    $routedIds[] = $id;
                }
            }

            if ($routedIds !== []) {
                $chunkIds = $routedIds;
            } else {
                // Soft fallback: routing filtered everything away, retry an
                // unrouted vector search once with the base TopK.
                $chunkIds = $this->collectChunkIds(
                    $this->vectorClient->search($prompt, $this->vectorTopK)
                );

                if ($chunkIds === []) {
                    return $keywordChunks;
                }

                $rows = $this->lookup->findByChunkIds($chunkIds);
            }
        }

        foreach ($chunkIds as $id) {
            $text = $rows[$id]['text'] ?? null;
            if (!is_string($text)) {
                continue;
            }

            $text = trim($text);
            if ($text === '') {
                // An empty chunk carries no information; the keyword path skips these too.
                continue;
            }

            $keywordChunks[] = $text;
        }

        // ---------------------------------------------------------
        // 3) dedupe + limit
        // ---------------------------------------------------------
        return $this->dedupeAndLimit($keywordChunks, $limit);
    }

    /**
     * Extracts the chunk ids of all usable vector hits, in hit order.
     *
     * A hit is usable when it has both 'chunk_id' and 'score' and its score
     * reaches VECTOR_SCORE_THRESHOLD.
     *
     * @param iterable<mixed> $hits Raw hits as returned by the vector client.
     *
     * @return string[]
     */
    private function collectChunkIds(iterable $hits): array
    {
        $chunkIds = [];

        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }
            if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
                continue;
            }
            $chunkIds[] = (string)$hit['chunk_id'];
        }

        return $chunkIds;
    }

    /**
     * Removes duplicate chunks (compared case-insensitively with whitespace
     * runs collapsed) and returns at most $limit of them, first occurrence wins.
     *
     * @param string[] $chunks
     *
     * @return string[]
     */
    private function dedupeAndLimit(array $chunks, int $limit): array
    {
        $seen = [];
        $out = [];

        foreach ($chunks as $chunk) {
            // preg_replace() yields null on malformed UTF-8; fall back to the raw
            // text so that such chunks do not all collapse onto the same key.
            $key = mb_strtolower(preg_replace('/\s+/u', ' ', $chunk) ?? $chunk);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
            $out[] = $chunk;

            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Streaming keyword search over the rows yielded by ChunkManager::streamAll().
     * Minimal, but useful:
     * - score = number of terms found in the chunk text
     * - candidate documents (tag routing) restrict which rows are considered
     * - ties are broken in favour of the shorter chunk
     *
     * @param string[] $terms
     * @param array<string,true>|null $candidateSet null = no document restriction
     * @return string[] Up to $limit trimmed chunk texts, best match first.
     */
    private function keywordSearchStreaming(array $terms, int $limit, ?array $candidateSet): array
    {
        if ($terms === [] || $limit <= 0) {
            return [];
        }

        $maxScore = \count($terms);

        // top list, kept sorted: each item = ['score' => int, 'text' => string]
        $top = [];

        foreach ($this->chunkManager->streamAll() as $row) {
            $text = $row['text'] ?? null;
            if (!is_string($text) || $text === '') {
                continue;
            }

            if ($candidateSet !== null) {
                $docId = $row['document_id'] ?? null;
                if (!is_string($docId) || !isset($candidateSet[$docId])) {
                    continue;
                }
            }

            $haystack = mb_strtolower($text);

            $score = 0;
            foreach ($terms as $t) {
                if ($t === '') {
                    continue;
                }
                if (mb_stripos($haystack, $t) !== false) {
                    $score++;
                }
            }

            if ($score < self::KEYWORD_MIN_HITS) {
                continue;
            }

            // The list is full and this chunk scores strictly worse than the
            // worst retained one: it would be sorted last and sliced off anyway.
            if (\count($top) === $limit && $score < $top[$limit - 1]['score']) {
                continue;
            }

            $top[] = [
                'score' => $score,
                'text'  => trim($text),
            ];

            // keep only best N (simple sort, N is tiny)
            usort($top, static function (array $a, array $b): int {
                // higher score first, then shorter chunk first (often more precise)
                return [$b['score'], mb_strlen($a['text'])] <=> [$a['score'], mb_strlen($b['text'])];
            });

            if (\count($top) > $limit) {
                $top = array_slice($top, 0, $limit);
            }

            // Early exit only when EVERY slot holds a perfect match, i.e. the
            // worst retained entry (the last one) already matches all terms.
            if (\count($top) === $limit && $top[$limit - 1]['score'] >= $maxScore) {
                break;
            }
        }

        return array_column($top, 'text');
    }

    /**
     * Minimal term extraction (stable behaviour, little magic):
     * strips everything except letters, digits and whitespace, lower-cases,
     * splits on any whitespace run and keeps unique words longer than two
     * characters in order of first appearance.
     *
     * @return string[]
     */
    private function extractTerms(string $text): array
    {
        $normalized = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));

        // Split on ANY whitespace run (tabs, newlines, ...), not just single
        // spaces, otherwise words separated by a line break fuse into one term.
        $words = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        $longEnough = array_filter(
            $words,
            static fn(string $w): bool => mb_strlen($w) > 2
        );

        // unique, order preserved (array_unique keeps the first occurrence)
        return array_values(array_unique($longEnough));
    }
}