From 3a5804e44cc334503eedad84ffcc286271b3e4d2 Mon Sep 17 00:00:00 2001 From: team2 Date: Fri, 27 Feb 2026 21:03:59 +0100 Subject: [PATCH] optimize as sales rag --- python/vector/vector_service.py | 164 ++++++++-- src/Intent/SalesIntentLite.php | 160 +++++++++ .../Ingest/KnowledgeIngestService.php | 1 + .../Retrieval/NdjsonHybridRetriever.php | 308 ++++++++---------- src/Tag/TagVectorSearchClient.php | 29 +- src/Vector/VectorSearchClient.php | 92 +++++- 6 files changed, 541 insertions(+), 213 deletions(-) create mode 100644 src/Intent/SalesIntentLite.php diff --git a/python/vector/vector_service.py b/python/vector/vector_service.py index 20f4d9f..eebfc23 100644 --- a/python/vector/vector_service.py +++ b/python/vector/vector_service.py @@ -78,7 +78,12 @@ app = FastAPI() model: Optional[SentenceTransformer] = None chunk_index = None chunk_ids: Optional[List[Any]] = None + +# Sales-RAG signals derived from NDJSON (loaded on startup and reload): +# - chunk_doc_map: chunk_id -> document_id +# - chunk_pos_map: chunk_id -> chunk_index (position within document, if available) chunk_doc_map: Dict[str, str] = {} +chunk_pos_map: Dict[str, int] = {} tag_index = None tag_ids: Optional[List[Any]] = None @@ -115,10 +120,32 @@ def _safe_read_json(path: Path) -> Optional[dict]: return None -def load_chunk_doc_map() -> None: - global chunk_doc_map +def _as_key(value: Any) -> Optional[str]: + """ + Normalize IDs to string keys for maps. Returns None if unusable. + """ + if value is None: + return None + if isinstance(value, str): + v = value.strip() + return v if v else None + try: + v = str(value).strip() + return v if v else None + except Exception: + return None + + +def load_chunk_maps_from_ndjson() -> None: + """ + Builds two maps from index.ndjson: + - chunk_id -> document_id + - chunk_id -> chunk_index (position inside document, if present) + """ + global chunk_doc_map, chunk_pos_map chunk_doc_map = {} + chunk_pos_map = {} if not INDEX_NDJSON_PATH.exists(): return @@ -126,18 +153,53 @@ def load_chunk_doc_map() -> None: try: with INDEX_NDJSON_PATH.open("r", encoding="utf-8") as f: for line in f: + line = line.strip() + if not line: + continue + try: row = json.loads(line) except Exception: continue - chunk_id = row.get("chunk_id") - document_id = row.get("document_id") + chunk_id_key = _as_key(row.get("chunk_id")) + if not chunk_id_key: + continue + + document_id = row.get("document_id") + doc_id_key = _as_key(document_id) + if doc_id_key: + chunk_doc_map[chunk_id_key] = doc_id_key + + # chunk_index is optional but very useful for Sales-RAG diversity rules + # (e.g. min distance within a doc) + ci = row.get("chunk_index") + if isinstance(ci, int): + chunk_pos_map[chunk_id_key] = ci + else: + # tolerate numeric strings + if isinstance(ci, str): + s = ci.strip() + if s.isdigit(): + try: + chunk_pos_map[chunk_id_key] = int(s) + except Exception: + pass - if isinstance(chunk_id, str) and isinstance(document_id, str): - chunk_doc_map[chunk_id] = document_id except Exception as e: - logger.warning("Failed to load chunk-doc map from ndjson: %s", str(e)) + logger.warning("Failed to load chunk maps from ndjson: %s", str(e)) + + +def _sanitize_limit(limit: int, default: int = 8, max_limit: int = 200) -> int: + try: + v = int(limit) + except Exception: + return default + if v <= 0: + return default + if v > max_limit: + return max_limit + return v def load_all() -> None: @@ -175,8 +237,8 @@ def load_all() -> None: chunk_index = None chunk_ids = None - logger.info("[Reload] Loading chunk-doc map") - load_chunk_doc_map() + logger.info("[Reload] Loading chunk maps (doc_id + chunk_index)") + load_chunk_maps_from_ndjson() if TAG_INDEX_PATH.exists() and TAG_MAP_PATH.exists(): logger.info("[Reload] Loading tag index") @@ -199,7 +261,12 @@ def load_all() -> None: current_index_version = index_version if isinstance(index_version, int) else None - logger.info("[Reload] Completed (index_version=%s runtime=%s)", str(current_index_version), str(current_runtime_stamp)) + logger.info( + "[Reload] Completed (index_version=%s runtime=%s embedding_model=%s)", + str(current_index_version), + str(current_runtime_stamp), + str(loaded_embedding_model_name), + ) # ============================================================ @@ -227,12 +294,20 @@ def observer_loop() -> None: new_runtime = v if isinstance(v, str) else None if new_version != current_index_version: - logger.info("[Observer] index_version changed (%s -> %s) -> Reload", str(current_index_version), str(new_version)) + logger.info( + "[Observer] index_version changed (%s -> %s) -> Reload", + str(current_index_version), + str(new_version), + ) load_all() continue if new_runtime != current_runtime_stamp: - logger.info("[Observer] runtime changed (%s -> %s) -> Reload", str(current_runtime_stamp), str(new_runtime)) + logger.info( + "[Observer] runtime changed (%s -> %s) -> Reload", + str(current_runtime_stamp), + str(new_runtime), + ) load_all() except Exception as e: @@ -267,6 +342,7 @@ def health(): "chunk_index_loaded": chunk_index is not None, "tag_index_loaded": tag_index is not None, "model_loaded": model is not None, + "embedding_model": loaded_embedding_model_name, "index_version": current_index_version, "runtime_stamp": current_runtime_stamp, "log_file": str(LOG_FILE), @@ -287,15 +363,33 @@ def search_chunks(req: SearchRequest): if chunk_index is None or chunk_ids is None or model is None: raise HTTPException(status_code=503, detail="Chunk index not available") + # Safety: clamp limit to prevent abuse / accidental huge queries + limit = _sanitize_limit(req.limit, default=8, max_limit=200) + + query = (req.query or "").strip() + if not query: + raise HTTPException(status_code=400, detail="query must not be empty") + query_vec = model.encode( - [f"query: {req.query}"], + [f"query: {query}"], normalize_embeddings=True ) query_vec = np.array(query_vec).astype("float32") - effective_limit = req.limit + effective_limit = limit + doc_filter: Optional[List[str]] = None if req.doc_ids: - effective_limit = max(req.limit * 5, 50) + # Normalize incoming doc_ids for reliable matching + doc_filter = [] + for d in req.doc_ids: + dk = _as_key(d) + if dk: + doc_filter.append(dk) + + # When doc filtering is enabled, we fetch a wider pool and filter down. + # Keep it bounded to avoid expensive scans on huge indices. + effective_limit = max(limit * 5, 50) + effective_limit = min(effective_limit, 500) scores, indices = chunk_index.search(query_vec, effective_limit) @@ -307,19 +401,33 @@ def search_chunks(req: SearchRequest): if idx < 0 or idx >= len(chunk_ids): continue - chunk_id = chunk_ids[idx] + raw_chunk_id = chunk_ids[idx] + chunk_id_key = _as_key(raw_chunk_id) + if not chunk_id_key: + continue - if req.doc_ids: - doc_id = chunk_doc_map.get(chunk_id) - if doc_id not in req.doc_ids: + # Apply doc filter if requested + doc_id = chunk_doc_map.get(chunk_id_key) + if doc_filter is not None: + if doc_id is None or doc_id not in doc_filter: continue - results.append({ - "chunk_id": chunk_id, + # Sales-RAG signals: + # - document_id (for doc quotas / diversity rules) + # - chunk_index (position within doc for distance constraints) + payload = { + "chunk_id": raw_chunk_id, "score": float(score), - }) + "document_id": doc_id, # may be None if ndjson missing/partial + } - if len(results) >= req.limit: + ci = chunk_pos_map.get(chunk_id_key) + if isinstance(ci, int): + payload["chunk_index"] = ci + + results.append(payload) + + if len(results) >= limit: break return results @@ -330,13 +438,19 @@ def search_tags(req: SearchRequest): if tag_index is None or tag_ids is None or model is None: raise HTTPException(status_code=503, detail="Tag index not available") + limit = _sanitize_limit(req.limit, default=8, max_limit=200) + + query = (req.query or "").strip() + if not query: + raise HTTPException(status_code=400, detail="query must not be empty") + query_vec = model.encode( - [f"query: {req.query}"], + [f"query: {query}"], normalize_embeddings=True ) query_vec = np.array(query_vec).astype("float32") - scores, indices = tag_index.search(query_vec, req.limit) + scores, indices = tag_index.search(query_vec, limit) results = [] diff --git a/src/Intent/SalesIntentLite.php b/src/Intent/SalesIntentLite.php new file mode 100644 index 0000000..37d0f84 --- /dev/null +++ b/src/Intent/SalesIntentLite.php @@ -0,0 +1,160 @@ +normalize($originalPrompt); + + $scores = [ + self::PRICING => 0, + self::COMPARISON => 0, + self::OBJECTION => 0, + self::IMPLEMENTATION => 0, + self::ROI => 0, + ]; + + // ------------------------------------------------------------ + // PRICING + // ------------------------------------------------------------ + $pricingWords = [ + 'preis', 'preise', 'kosten', 'lizenz', 'lizenzmodell', + 'paket', 'pakete', 'tarif', 'tarife', + 'gebühr', 'gebuehr', 'monatlich', 'jährlich', 'jaehrlich', + 'abo', 'subscription' + ]; + + foreach ($pricingWords as $word) { + if (preg_match('/\b' . preg_quote($word, '/') . '\b/u', $p)) { + $scores[self::PRICING] += 2; + } + } + + // ------------------------------------------------------------ + // COMPARISON + // ------------------------------------------------------------ + $comparisonPatterns = [ + '/\bvergleich\b/u', + '/\bvs\b/u', + '/\boder\b/u', + '/\balternative(n)?\b/u', + '/\bunterschied(e)?\b/u', + '/\bbesser\b/u', + ]; + + foreach ($comparisonPatterns as $pattern) { + if (preg_match($pattern, $p)) { + $scores[self::COMPARISON] += 2; + } + } + + // ------------------------------------------------------------ + // OBJECTION + // ------------------------------------------------------------ + $objectionWords = [ + 'problem', 'risiko', 'nachteil', 'datenschutz', + 'dsgvo', 'sicherheit', 'compliance', + 'kritik', 'zweifel', 'unsicher' + ]; + + foreach ($objectionWords as $word) { + if (preg_match('/\b' . preg_quote($word, '/') . '\b/u', $p)) { + $scores[self::OBJECTION] += 2; + } + } + + // ------------------------------------------------------------ + // IMPLEMENTATION + // ------------------------------------------------------------ + $implementationWords = [ + 'implementierung', 'einführung', 'einfuehrung', + 'integration', 'aufwand', 'setup', + 'rollout', 'migration', 'installation', + 'technisch', 'api', 'schnittstelle' + ]; + + foreach ($implementationWords as $word) { + if (preg_match('/\b' . preg_quote($word, '/') . '\b/u', $p)) { + $scores[self::IMPLEMENTATION] += 2; + } + } + + // ------------------------------------------------------------ + // ROI / Business Case + // ------------------------------------------------------------ + $roiWords = [ + 'roi', 'rentabilität', 'rentabilitaet', + 'business case', 'nutzen', + 'effizienz', 'einsparung', 'umsatz', + 'wert', 'vorteil' + ]; + + foreach ($roiWords as $word) { + if (preg_match('/\b' . preg_quote($word, '/') . '\b/u', $p)) { + $scores[self::ROI] += 2; + } + } + + // ------------------------------------------------------------ + // Entscheidung + // ------------------------------------------------------------ + arsort($scores); + + $topIntent = array_key_first($scores); + $topScore = $scores[$topIntent] ?? 0; + + if ($topScore <= 0) { + return [ + 'intent' => self::DISCOVERY, + 'score' => 0, + ]; + } + + return [ + 'intent' => $topIntent, + 'score' => $topScore, + ]; + } + + private function normalize(string $s): string + { + $s = mb_strtolower($s); + + $replacements = [ + 'ä' => 'ae', + 'ö' => 'oe', + 'ü' => 'ue', + 'ß' => 'ss', + ]; + + foreach ($replacements as $umlaut => $alt) { + if (str_contains($s, $umlaut)) { + $s .= ' ' . str_replace($umlaut, $alt, $s); + break; + } + } + + return $s; + } +} \ No newline at end of file diff --git a/src/Knowledge/Ingest/KnowledgeIngestService.php b/src/Knowledge/Ingest/KnowledgeIngestService.php index 5a11732..d2f05ad 100644 --- a/src/Knowledge/Ingest/KnowledgeIngestService.php +++ b/src/Knowledge/Ingest/KnowledgeIngestService.php @@ -41,6 +41,7 @@ final readonly class KnowledgeIngestService foreach ($chunks as $chunkText) { if ($title !== '' && !str_starts_with($chunkText, $title)) { + //title with backticks $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText; } diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 6278a3c..41f0540 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -6,7 +6,7 @@ namespace App\Knowledge\Retrieval; use App\Entity\ModelGenerationConfig; use App\Intent\IntentLite; -use App\Knowledge\ChunkManager; +use App\Intent\SalesIntentLite; use App\Knowledge\QueryCleaner; use App\Repository\ModelGenerationConfigRepository; use App\Tag\TagRoutingService; @@ -21,11 +21,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface private const LIST_BONUS = 1.5; - /** - * Tags must only provide a small bonus (never act as a gate/filter). - * Enterprise default: keep it low, otherwise tags will dominate ranking again. - */ - private const TAG_SCORE_BONUS = 0.1 * (1 - self::VECTOR_SCORE_THRESHOLD); + private const MAX_CHUNKS_PER_DOC = 2; + private const MIN_CHUNK_DISTANCE = 2; + private const RRF_K = 60; public function __construct( private readonly NdjsonChunkLookup $lookup, @@ -33,7 +31,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface private readonly TagRoutingService $tagRouting, private readonly ModelGenerationConfigRepository $configRepository, private readonly QueryCleaner $queryCleaner, - private readonly IntentLite $intentLite + private readonly IntentLite $intentLite, + private readonly SalesIntentLite $salesIntentLite ) { } @@ -49,27 +48,63 @@ final class NdjsonHybridRetriever implements RetrieverInterface return $this->retrieveInternal($prompt, $config); } - public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array { $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK)); - // Important: list-intent detection must run on the original prompt - // (cleaning might remove "show/list" etc.). $isListQuery = $this->intentLite->isListQuery($prompt); + $salesIntent = $this->salesIntentLite->detect($prompt)['intent']; - // ------------------------------------------------- - // CLEAN QUERY (retrieval-only: tag routing + vector search) - // ------------------------------------------------- $cleanQuery = $this->queryCleaner->clean($prompt); if ($cleanQuery === '') { $cleanQuery = $prompt; } // ------------------------------------------------- - // 1) Tag routing (cleaned query) -> bonus only + // Intent-based adjustments // ------------------------------------------------- + + $threshold = self::VECTOR_SCORE_THRESHOLD; + $topK = $vectorTopKBase; + + switch ($salesIntent) { + case SalesIntentLite::PRICING: + $threshold += 0.02; // more precision + break; + + case SalesIntentLite::COMPARISON: + $topK = (int)round($vectorTopKBase * 1.4); + break; + + case SalesIntentLite::OBJECTION: + $threshold -= 0.02; + break; + + case SalesIntentLite::IMPLEMENTATION: + $topK = (int)round($vectorTopKBase * 1.3); + break; + + case SalesIntentLite::ROI: + $topK = (int)round($vectorTopKBase * 1.2); + break; + + case SalesIntentLite::DISCOVERY: + default: + $threshold -= 0.03; + break; + } + + if ($isListQuery) { + $topK = (int)round($topK * self::LIST_BONUS); + } + + $topK = max(1, min($topK, self::HARD_MAX_VECTORK)); + + // ------------------------------------------------- + // Tag routing + // ------------------------------------------------- + $candidateDocIds = $this->tagRouting->route($cleanQuery); $candidateSet = null; @@ -78,215 +113,144 @@ final class NdjsonHybridRetriever implements RetrieverInterface } // ------------------------------------------------- - // 2) Determine TopK + // Dual search // ------------------------------------------------- - $topK = $vectorTopKBase; - // List mode: increase coverage to rank more documents - if ($isListQuery) { - $topK = (int)round($vectorTopKBase * self::LIST_BONUS); + $globalHits = $this->vectorClient->search($cleanQuery, $topK); + + $scopedHits = []; + if ($candidateSet !== null) { + $scopedHits = $this->vectorClient->searchScoped( + $cleanQuery, + $topK, + array_keys($candidateSet) + ); } - $topK = max(1, min($topK, self::HARD_MAX_VECTORK)); - - // ------------------------------------------------- - // 3) Vector search (always GLOBAL; tags are NOT a filter) - // ------------------------------------------------- - $hits = $this->vectorClient->search($cleanQuery, $topK); - - if ($hits === []) { - // Tags must NOT act as a fallback (otherwise they become too powerful again). + if ($globalHits === [] && $scopedHits === []) { return []; } // ------------------------------------------------- - // 4) Collect chunkIds + scores (raw) + // RRF Fusion // ------------------------------------------------- - /** @var array $rawScoreByChunkId */ - $rawScoreByChunkId = []; + + $rrfScores = []; + + $this->applyRrf($globalHits, $rrfScores, $threshold); + $this->applyRrf($scopedHits, $rrfScores, $threshold, $salesIntent === SalesIntentLite::OBJECTION); + + if ($rrfScores === []) { + return []; + } + + arsort($rrfScores); + $rankedChunkIds = array_keys($rrfScores); + + $rows = $this->lookup->findByChunkIds($rankedChunkIds); + if ($rows === []) { + return []; + } + + if (!$isListQuery) { + return $this->collectSalesOptimized( + $rankedChunkIds, + $rows, + $limit + ); + } + + return $this->collectTexts($rankedChunkIds, $rows, $limit); + } + + private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void + { + $rank = 0; foreach ($hits as $hit) { + if (!isset($hit['chunk_id'], $hit['score'])) { continue; } $raw = (float)$hit['score']; - - // Apply the threshold to the RAW score (quality gate) - if ($raw < self::VECTOR_SCORE_THRESHOLD) { + if ($raw < $threshold) { continue; } $chunkId = (string)$hit['chunk_id']; - // If a chunk appears multiple times, keep the best raw score - if (!isset($rawScoreByChunkId[$chunkId]) || $raw > $rawScoreByChunkId[$chunkId]) { - $rawScoreByChunkId[$chunkId] = $raw; - } - } + $rank++; + $rrf = 1 / (self::RRF_K + $rank); - if ($rawScoreByChunkId === []) { - return []; - } - - // Lookup returns document_id + text etc. - $rows = $this->lookup->findByChunkIds(array_keys($rawScoreByChunkId)); - - // ------------------------------------------------- - // 5) Adjusted score (tag bonus) + ranking - // ------------------------------------------------- - /** @var array $adjScoreByChunkId */ - $adjScoreByChunkId = []; - - foreach ($rawScoreByChunkId as $chunkId => $rawScore) { - if (!isset($rows[$chunkId])) { - continue; + if ($boost) { + $rrf *= 1.2; // scoped boost for objections } - $adj = $rawScore; - - if ($candidateSet !== null) { - $docId = $rows[$chunkId]['document_id'] ?? null; - if (is_string($docId) && isset($candidateSet[$docId])) { - $adj += self::TAG_SCORE_BONUS; - } + if (!isset($rrfScores[$chunkId])) { + $rrfScores[$chunkId] = 0.0; } - $adjScoreByChunkId[$chunkId] = $adj; + $rrfScores[$chunkId] += $rrf; } - - if ($adjScoreByChunkId === []) { - return []; - } - - // Sort: adjusted desc, deterministic tie-break by chunkId - uksort($adjScoreByChunkId, static function (string $a, string $b) use ($adjScoreByChunkId): int { - $sa = $adjScoreByChunkId[$a]; - $sb = $adjScoreByChunkId[$b]; - - if ($sa === $sb) { - return $a <=> $b; - } - return ($sb <=> $sa); - }); - - $rankedChunkIds = array_keys($adjScoreByChunkId); - - // ------------------------------------------------- - // 6) List mode -> document ranking (with tag bonus in scores) - // ------------------------------------------------- - if ($isListQuery) { - $rankedDocIds = $this->rankDocumentsFromAdjustedScores($adjScoreByChunkId, $rows); - - if ($rankedDocIds === []) { - return []; - } - - $topDocIds = array_slice($rankedDocIds, 0, $limit); - - return $this->collectBestChunkPerDocumentAdjusted($topDocIds, $adjScoreByChunkId, $rows); - } - - // ------------------------------------------------- - // 7) Normal chunk mode (by adjusted ranking) - // ------------------------------------------------- - return $this->collectTexts($rankedChunkIds, $rows, $limit); } - // ========================================================= - // LIST QUERY DETECTION - // ========================================================= - - // ========================================================= - // DOCUMENT RANKING (Adjusted scores incl. tag bonus) - // ========================================================= - - /** - * @param array $adjScoreByChunkId - * @param array> $rows - * @return string[] - */ - private function rankDocumentsFromAdjustedScores(array $adjScoreByChunkId, array $rows): array + private function collectSalesOptimized(array $chunkIds, array $rows, int $limit): array { - $documentScores = []; + $out = []; + $docCounter = []; + $docChunkPositions = []; - foreach ($adjScoreByChunkId as $chunkId => $score) { - if (!isset($rows[$chunkId])) { + foreach ($chunkIds as $chunkId) { + + if (!isset($rows[$chunkId]['text'])) { continue; } $docId = $rows[$chunkId]['document_id'] ?? null; - if (!is_string($docId) || $docId === '') { + $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null; + + if (!is_string($docId)) { continue; } - $documentScores[$docId][] = (float)$score; - } - - if ($documentScores === []) { - return []; - } - - $ranked = []; - - foreach ($documentScores as $docId => $scores) { - rsort($scores); - $topScores = array_slice($scores, 0, 3); - $ranked[$docId] = array_sum($topScores) / count($topScores); - } - - arsort($ranked); - - return array_keys($ranked); - } - - /** - * @param string[] $docIds - * @param array $adjScoreByChunkId - * @param array> $rows - * @return string[] - */ - private function collectBestChunkPerDocumentAdjusted(array $docIds, array $adjScoreByChunkId, array $rows): array - { - $result = []; - - foreach ($docIds as $docId) { - $bestScore = -INF; - $bestText = null; - - foreach ($adjScoreByChunkId as $chunkId => $score) { - if (!isset($rows[$chunkId])) { - continue; - } - - if (($rows[$chunkId]['document_id'] ?? null) !== $docId) { - continue; - } - - if ((float)$score > $bestScore) { - $bestScore = (float)$score; - $bestText = $rows[$chunkId]['text'] ?? null; - } + if (($docCounter[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) { + continue; } - if (is_string($bestText) && $bestText !== '') { - $result[] = trim($bestText); + if (is_int($chunkIndex)) { + $prev = $docChunkPositions[$docId] ?? []; + foreach ($prev as $prevIdx) { + if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) { + continue 2; + } + } + $docChunkPositions[$docId][] = $chunkIndex; + } + + $text = trim((string)$rows[$chunkId]['text']); + if ($text === '') { + continue; + } + + $out[] = $text; + $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1; + + if (count($out) >= $limit) { + break; } } - return $result; + return $out; } - // ========================================================= - // NORMAL MODE - // ========================================================= - private function collectTexts(array $chunkIds, array $rows, int $limit): array { $seen = []; $out = []; foreach ($chunkIds as $id) { + if (!isset($rows[$id]['text'])) { continue; } diff --git a/src/Tag/TagVectorSearchClient.php b/src/Tag/TagVectorSearchClient.php index 0cccd3d..fe8af33 100644 --- a/src/Tag/TagVectorSearchClient.php +++ b/src/Tag/TagVectorSearchClient.php @@ -9,7 +9,16 @@ use Symfony\Contracts\HttpClient\HttpClientInterface; final readonly class TagVectorSearchClient { - private const MIN_SCORE = 0.4; // 🔥 Tag Confidence Gate + /** + * Minimum similarity score required for a tag to be considered. + * Acts as a confidence gate to avoid noisy routing. + */ + private const MIN_SCORE = 0.4; + + /** + * Hard limit to prevent excessive requests. + */ + private const MAX_LIMIT = 50; public function __construct( private HttpClientInterface $http, @@ -18,11 +27,18 @@ final readonly class TagVectorSearchClient ) {} /** + * Executes a vector search against the Python tag index. + * * @return array */ public function search(string $query, int $limit = 8): array { - $limit = max(1, min($limit, 50)); + $query = trim($query); + if ($query === '') { + return []; + } + + $limit = max(1, min($limit, self::MAX_LIMIT)); try { $response = $this->http->request( @@ -38,7 +54,10 @@ final readonly class TagVectorSearchClient ); if ($response->getStatusCode() !== 200) { - $this->agentLogger->warning('Tag vector service returned non-200'); + $this->agentLogger->warning( + 'Tag vector service returned non-200', + ['status' => $response->getStatusCode()] + ); return []; } @@ -46,12 +65,14 @@ final readonly class TagVectorSearchClient } catch (\Throwable $e) { $this->agentLogger->warning( - 'Tag vector service unreachable: ' . $e->getMessage() + 'Tag vector service unreachable', + ['error' => $e->getMessage()] ); return []; } if (!is_array($data)) { + $this->agentLogger->warning('Tag vector service returned invalid payload'); return []; } diff --git a/src/Vector/VectorSearchClient.php b/src/Vector/VectorSearchClient.php index 2d235cc..5932615 100644 --- a/src/Vector/VectorSearchClient.php +++ b/src/Vector/VectorSearchClient.php @@ -9,7 +9,16 @@ use Symfony\Contracts\HttpClient\HttpClientInterface; final class VectorSearchClient { - private const MIN_SCORE = 0.30; // 🔥 weicher als Tag-Gate + /** + * Soft minimum similarity threshold. + * Lower than tag gate to allow broader recall. + */ + private const MIN_SCORE = 0.30; + + /** + * Hard limit clamp to avoid abusive queries. + */ + private const MAX_LIMIT = 200; private HttpClientInterface $http; private string $serviceUrl; @@ -26,18 +35,34 @@ final class VectorSearchClient } /** - * Standard global search + * Standard global search. + * + * @return array */ public function search(string $query, int $limit = 5): array { return $this->executeSearch([ - 'query' => $query, - 'limit' => $limit, + 'query' => trim($query), + 'limit' => $this->clampLimit($limit), ]); } /** - * Scoped search: nur innerhalb bestimmter Dokumente + * Scoped search: only inside specific documents. + * + * @param array $docIds + * + * @return array */ public function searchScoped( string $query, @@ -49,14 +74,23 @@ final class VectorSearchClient } return $this->executeSearch([ - 'query' => $query, - 'limit' => $limit, + 'query' => trim($query), + 'limit' => $this->clampLimit($limit), 'doc_ids' => array_values($docIds), ]); } /** - * Gemeinsame HTTP-Logik (keine Duplikation) + * Shared HTTP logic. + * + * @param array $payload + * + * @return array */ private function executeSearch(array $payload): array { @@ -71,7 +105,10 @@ final class VectorSearchClient ); if ($response->getStatusCode() !== 200) { - $this->agentLogger->error('Vector service returned non-200 (chunks)'); + $this->agentLogger->error( + 'Vector service returned non-200 (chunks)', + ['status' => $response->getStatusCode()] + ); return []; } @@ -79,12 +116,14 @@ final class VectorSearchClient } catch (\Throwable $e) { $this->agentLogger->error( - 'Vector service unreachable (chunks): ' . $e->getMessage() + 'Vector service unreachable (chunks)', + ['error' => $e->getMessage()] ); return []; } if (!is_array($data)) { + $this->agentLogger->warning('Vector service returned invalid payload (chunks)'); return []; } @@ -109,12 +148,41 @@ final class VectorSearchClient continue; } + $documentId = null; + if (isset($row['document_id']) && is_string($row['document_id']) && $row['document_id'] !== '') { + $documentId = $row['document_id']; + } + + $chunkIndex = null; + if (isset($row['chunk_index'])) { + if (is_int($row['chunk_index'])) { + $chunkIndex = $row['chunk_index']; + } elseif (is_string($row['chunk_index']) && ctype_digit($row['chunk_index'])) { + $chunkIndex = (int)$row['chunk_index']; + } + } + $filtered[] = [ - 'chunk_id' => $chunkId, - 'score' => $score, + 'chunk_id' => $chunkId, + 'score' => $score, + 'document_id' => $documentId, + 'chunk_index' => $chunkIndex, ]; } return $filtered; } + + private function clampLimit(int $limit): int + { + if ($limit < 1) { + return 1; + } + + if ($limit > self::MAX_LIMIT) { + return self::MAX_LIMIT; + } + + return $limit; + } } \ No newline at end of file