maxChunks; // --------------------------------------------------------- // 0) Tag-Routing FIRST (soft gate) // --------------------------------------------------------- $candidateDocIds = $this->tagRouting->route($prompt); $candidateSet = null; if (is_array($candidateDocIds) && $candidateDocIds !== []) { $candidateSet = array_fill_keys($candidateDocIds, true); } // --------------------------------------------------------- // 1) Keyword first (simple streaming scan) // --------------------------------------------------------- $terms = $this->extractTerms($prompt); $keywordChunks = $this->keywordSearchStreaming($terms, $limit, $candidateSet); if (\count($keywordChunks) >= $limit) { return array_slice($keywordChunks, 0, $limit); } // --------------------------------------------------------- // 2) Vector fallback / enrichment // - If routed: increase TopK, then filter by document_id // - Soft fallback: if filtering yields nothing -> global vector once // --------------------------------------------------------- $topK = $this->vectorTopK; if ($candidateSet !== null) { $topK = max($this->vectorTopK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $this->vectorTopK); $topK = min($topK, 200); // guardrail } $hits = $this->vectorClient->search($prompt, $topK); if ($hits === []) { return $keywordChunks; } $chunkIds = []; foreach ($hits as $hit) { if (!isset($hit['chunk_id'], $hit['score'])) { continue; } if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) { continue; } $chunkIds[] = (string)$hit['chunk_id']; } if ($chunkIds === []) { return $keywordChunks; } $rows = $this->lookup->findByChunkIds($chunkIds); // routed filtering by document_id $finalChunkIds = $chunkIds; if ($candidateSet !== null) { $filtered = []; foreach ($chunkIds as $id) { $row = $rows[$id] ?? null; if (!is_array($row)) { continue; } $docId = $row['document_id'] ?? null; if (!is_string($docId) || !isset($candidateSet[$docId])) { continue; } $filtered[] = $id; } // Soft fallback: if routing filtered everything away, retry global vector once if ($filtered === []) { $hits2 = $this->vectorClient->search($prompt, $this->vectorTopK); if ($hits2 === []) { return $keywordChunks; } $chunkIds2 = []; foreach ($hits2 as $hit) { if (!isset($hit['chunk_id'], $hit['score'])) { continue; } if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) { continue; } $chunkIds2[] = (string)$hit['chunk_id']; } if ($chunkIds2 === []) { return $keywordChunks; } $rows = $this->lookup->findByChunkIds($chunkIds2); $finalChunkIds = $chunkIds2; } else { $finalChunkIds = $filtered; } } foreach ($finalChunkIds as $id) { if (!isset($rows[$id]['text']) || !is_string($rows[$id]['text'])) { continue; } $keywordChunks[] = trim($rows[$id]['text']); } // --------------------------------------------------------- // 3) dedupe + limit // --------------------------------------------------------- $seen = []; $out = []; foreach ($keywordChunks as $chunk) { $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)); if (isset($seen[$key])) { continue; } $seen[$key] = true; $out[] = $chunk; if (\count($out) >= $limit) { break; } } return $out; } /** * Streaming Keyword Search über index.ndjson. * Minimal, aber nützlich: * - Score = Anzahl gefundener Terms * - CandidateDocs (Tag-Routing) reduziert Scan massiv * * @param string[] $terms * @param array|null $candidateSet * @return string[] */ private function keywordSearchStreaming(array $terms, int $limit, ?array $candidateSet): array { if ($terms === []) { return []; } $maxScore = \count($terms); // top list: each item = ['score' => int, 'text' => string] $top = []; foreach ($this->chunkManager->streamAll() as $row) { $text = $row['text'] ?? null; if (!is_string($text) || $text === '') { continue; } if ($candidateSet !== null) { $docId = $row['document_id'] ?? null; if (!is_string($docId) || !isset($candidateSet[$docId])) { continue; } } $haystack = mb_strtolower($text); $score = 0; foreach ($terms as $t) { if ($t === '') { continue; } if (mb_stripos($haystack, $t) !== false) { $score++; } } if ($score < self::KEYWORD_MIN_HITS) { continue; } $top[] = [ 'score' => $score, 'text' => trim($text), ]; // keep only best N (simple sort, N is tiny) usort($top, static function (array $a, array $b): int { // higher score first $cmp = ($b['score'] <=> $a['score']); if ($cmp !== 0) { return $cmp; } // shorter chunk first (often more precise) return (mb_strlen($a['text']) <=> mb_strlen($b['text'])); }); if (\count($top) > $limit) { $top = array_slice($top, 0, $limit); } // early exit: perfect matches filled if (\count($top) === $limit && ($top[0]['score'] ?? 0) >= $maxScore) { break; } } $out = []; foreach ($top as $item) { $out[] = (string)$item['text']; } return $out; } /** * Minimal term extraction (stabiles Verhalten, wenig Magie) * * @return string[] */ private function extractTerms(string $text): array { $text = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text)); $parts = array_values(array_filter( explode(' ', $text), static fn(string $w) => mb_strlen($w) > 2 )); // unique, order preserved $seen = []; $out = []; foreach ($parts as $w) { if (isset($seen[$w])) { continue; } $seen[$w] = true; $out[] = $w; } return $out; } }