alpha new hybridretriver line

2026-02-26 13:51:54 +01:00
parent ec22f8bbbd
commit 052ff55eda
3 changed files with 129 additions and 41 deletions
--- a/python/vector/vector_service.py
+++ b/python/vector/vector_service.py
@@ -2,7 +2,7 @@
 import json
 from pathlib import Path
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Dict
 import numpy as np
 import faiss
@@ -25,6 +25,7 @@ TAG_INDEX_PATH = KNOWLEDGE_DIR / "vector_tags.index"
 TAG_MAP_PATH = KNOWLEDGE_DIR / "vector_tags.index.meta.json"
 INDEX_META_PATH = KNOWLEDGE_DIR / "index_meta.json"
 INDEX_NDJSON_PATH = KNOWLEDGE_DIR / "index.ndjson"
 # ============================================================
@@ -36,8 +37,11 @@ app = FastAPI()
 model: Optional[SentenceTransformer] = None
 chunk_index = None
 chunk_ids: Optional[List[Any]] = None
 chunk_doc_map: Dict[str, str] = {}
 tag_index = None
 tag_ids: Optional[List[Any]] = None
 loaded_embedding_model_name: Optional[str] = None
@@ -48,12 +52,35 @@ loaded_embedding_model_name: Optional[str] = None
 class SearchRequest(BaseModel):
    query: str
    limit: int = 8
    doc_ids: Optional[List[str]] = None  # NEW
 # ============================================================
 # Loader
 # ============================================================
 def load_chunk_doc_map():
    global chunk_doc_map
    chunk_doc_map = {}
    if not INDEX_NDJSON_PATH.exists():
        return
    with INDEX_NDJSON_PATH.open("r", encoding="utf-8") as f:
        for line in f:
            try:
                row = json.loads(line)
            except Exception:
                continue
            chunk_id = row.get("chunk_id")
            document_id = row.get("document_id")
            if isinstance(chunk_id, str) and isinstance(document_id, str):
                chunk_doc_map[chunk_id] = document_id
 def load_all():
    global model, chunk_index, chunk_ids, tag_index, tag_ids, loaded_embedding_model_name
@@ -81,6 +108,10 @@ def load_all():
        chunk_index = None
        chunk_ids = None
    # Load chunk → document map
    print("[Reload] Loading chunk-doc map")
    load_chunk_doc_map()
    # Reload tag index
    if TAG_INDEX_PATH.exists() and TAG_MAP_PATH.exists():
        print("[Reload] Loading tag index")
@@ -134,20 +165,37 @@ def search_chunks(req: SearchRequest):
    query_vec = model.encode([req.query], normalize_embeddings=True)
    query_vec = np.array(query_vec).astype("float32")
-    scores, indices = chunk_index.search(query_vec, req.limit)
+    # Wenn doc_ids gesetzt sind → mehr holen, dann filtern
    effective_limit = req.limit
    if req.doc_ids:
        effective_limit = max(req.limit * 5, 50)
    scores, indices = chunk_index.search(query_vec, effective_limit)
    results = []
    for score, idx in zip(scores[0], indices[0]):
        if idx == -1:
            continue
        if idx < 0 or idx >= len(chunk_ids):
            continue
        chunk_id = chunk_ids[idx]
        # NEW: doc-scoped filter
        if req.doc_ids:
            doc_id = chunk_doc_map.get(chunk_id)
            if doc_id not in req.doc_ids:
                continue
        results.append({
-            "chunk_id": chunk_ids[idx],
+            "chunk_id": chunk_id,
            "score": float(score),
        })
        if len(results) >= req.limit:
            break
    return results
--- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
+++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
@@ -12,8 +12,8 @@ use App\Vector\VectorSearchClient;
 final class NdjsonHybridRetriever implements RetrieverInterface
 {
-    private const VECTOR_SCORE_THRESHOLD = 0.25;
+    private const VECTOR_SCORE_THRESHOLD = 0.22;
-    private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10;
+    private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 3;
    private const HARD_MAX_CHUNKS = 200;
    private const HARD_MAX_VECTORK = 200;
@@ -49,9 +49,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $isListQuery = $this->isListQuery($prompt);
-        // -------------------------------
+        // -------------------------------------------------
        // 1) Tag Routing
-        // -------------------------------
+        // -------------------------------------------------
        $candidateDocIds = $this->tagRouting->route($prompt);
        $candidateSet = null;
@@ -59,19 +59,40 @@ final class NdjsonHybridRetriever implements RetrieverInterface
            $candidateSet = array_fill_keys($candidateDocIds, true);
        }
-        // -------------------------------
+        // -------------------------------------------------
-        // 2) Vector Search
+        // 2) TopK bestimmen
-        // -------------------------------
+        // -------------------------------------------------
        $topK = $vectorTopKBase;
        if ($isListQuery) {
            $topK = max($vectorTopKBase * 3, 80);
        }
        if ($candidateSet !== null) {
            $topK = min(
-                max($vectorTopKBase * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $vectorTopKBase),
+                max($topK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $topK),
                self::HARD_MAX_VECTORK
            );
        }
-        $hits = $this->vectorClient->search($prompt, $topK);
+        // -------------------------------------------------
        // 3) Vector Search (Scoped wenn möglich)
        // -------------------------------------------------
        if ($candidateSet !== null) {
            $hits = $this->vectorClient->searchScoped(
                $prompt,
                $topK,
                array_keys($candidateSet)
            );
            // Wenn scoped nichts liefert → global fallback
            if ($hits === []) {
                $hits = $this->vectorClient->search($prompt, $vectorTopKBase);
            }
        } else {
            $hits = $this->vectorClient->search($prompt, $topK);
        }
        if ($hits === []) {
            return $candidateSet !== null
@@ -79,9 +100,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
                : [];
        }
-        // -------------------------------
+        // -------------------------------------------------
-        // 3) Chunk-IDs + Lookup einmalig
+        // 4) ChunkIds + Lookup
-        // -------------------------------
+        // -------------------------------------------------
        $chunkIds = [];
        foreach ($hits as $hit) {
@@ -104,9 +125,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $rows = $this->lookup->findByChunkIds($chunkIds);
-        // -------------------------------
+        // -------------------------------------------------
-        // 4) Listen-Modus → Dokument-Ranking
+        // 5) Listenmodus → Dokument-Ranking
-        // -------------------------------
+        // -------------------------------------------------
        if ($isListQuery && $candidateSet !== null) {
            $rankedDocIds = $this->rankDocumentsFromHits($hits, $rows, $candidateSet);
@@ -120,9 +141,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
            return $this->collectBestChunkPerDocument($topDocIds, $hits, $rows);
        }
-        // -------------------------------
+        // -------------------------------------------------
-        // 5) Normaler Chunk-Modus
+        // 6) Normaler Chunk-Modus
-        // -------------------------------
+        // -------------------------------------------------
        return $this->collectTexts($chunkIds, $rows, $limit);
    }
@@ -153,18 +174,13 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $documentScores = [];
        foreach ($hits as $hit) {
-            if (!isset($hit['chunk_id'], $hit['score'])) {
+            $chunkId = (string)($hit['chunk_id'] ?? '');
                continue;
            }
            $chunkId = (string)$hit['chunk_id'];
            if (!isset($rows[$chunkId])) {
                continue;
            }
-            $row = $rows[$chunkId];
+            $docId = $rows[$chunkId]['document_id'] ?? null;
            $docId = $row['document_id'] ?? null;
            if (!is_string($docId) || !isset($candidateSet[$docId])) {
                continue;
@@ -203,25 +219,19 @@ final class NdjsonHybridRetriever implements RetrieverInterface
            $bestText = null;
            foreach ($hits as $hit) {
-                if (!isset($hit['chunk_id'], $hit['score'])) {
+                $chunkId = (string)($hit['chunk_id'] ?? '');
                    continue;
                }
                $chunkId = (string)$hit['chunk_id'];
                if (!isset($rows[$chunkId])) {
                    continue;
                }
-                $row = $rows[$chunkId];
+                if (($rows[$chunkId]['document_id'] ?? null) !== $docId) {
                if (($row['document_id'] ?? null) !== $docId) {
                    continue;
                }
                if ((float)$hit['score'] > $bestScore) {
                    $bestScore = (float)$hit['score'];
-                    $bestText = $row['text'] ?? null;
+                    $bestText = $rows[$chunkId]['text'] ?? null;
                }
            }
--- a/src/Vector/VectorSearchClient.php
+++ b/src/Vector/VectorSearchClient.php
@@ -25,17 +25,47 @@ final class VectorSearchClient
        $this->agentLogger = $agentLogger;
    }
    /**
     * Standard global search
     */
    public function search(string $query, int $limit = 5): array
    {
        return $this->executeSearch([
            'query' => $query,
            'limit' => $limit,
        ]);
    }
    /**
     * Scoped search: nur innerhalb bestimmter Dokumente
     */
    public function searchScoped(
        string $query,
        int $limit,
        array $docIds
    ): array {
        if ($docIds === []) {
            return [];
        }
        return $this->executeSearch([
            'query'   => $query,
            'limit'   => $limit,
            'doc_ids' => array_values($docIds),
        ]);
    }
    /**
     * Gemeinsame HTTP-Logik (keine Duplikation)
     */
    private function executeSearch(array $payload): array
    {
        try {
            $response = $this->http->request(
                'POST',
                $this->serviceUrl . '/search-chunks',
                [
-                    'json' => [
+                    'json' => $payload,
                        'query' => $query,
                        'limit' => $limit,
                    ],
                    'timeout' => 10,
                ]
            );