diff --git a/composer.json b/composer.json index f155307..ff6de92 100644 --- a/composer.json +++ b/composer.json @@ -28,7 +28,8 @@ "symfony/security-bundle": "7.4.*", "symfony/twig-bundle": "7.4.*", "symfony/uid": "7.4.*", - "symfony/yaml": "^7.4" + "symfony/yaml": "^7.4", + "ext-sqlite3": "*" }, "config": { "optimize-autoloader": true, diff --git a/src/Command/TestHybridRetrievalCommand.php b/src/Command/TestHybridRetrievalCommand.php new file mode 100644 index 0000000..e691664 --- /dev/null +++ b/src/Command/TestHybridRetrievalCommand.php @@ -0,0 +1,298 @@ +addArgument( + 'prompt', + InputArgument::REQUIRED, + 'Prompt to test against the real hybrid retrieval pipeline' + ) + ->addOption( + 'json', + null, + InputOption::VALUE_NONE, + 'Return the raw retrieval debug result as JSON' + ) + ->addOption( + 'show-text', + null, + InputOption::VALUE_NONE, + 'Show full chunk text instead of a shortened preview' + ); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $io = new SymfonyStyle($input, $output); + + $prompt = trim((string) $input->getArgument('prompt')); + $asJson = (bool) $input->getOption('json'); + $showText = (bool) $input->getOption('show-text'); + + if ($prompt === '') { + $io->error('Prompt must not be empty.'); + + return Command::FAILURE; + } + + $start = microtime(true); + + try { + $results = $this->retriever->retrieveDebug($prompt); + } catch (\Throwable $e) { + $io->error($e->getMessage()); + + return Command::FAILURE; + } + + $durationMs = round((microtime(true) - $start) * 1000, 2); + + if ($asJson) { + $payload = [ + 'prompt' => $prompt, + 'duration_ms' => $durationMs, + 'result_count' => count($results), + 'results' => $results, + ]; + + $json = json_encode( + $payload, + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE + ); + + if (!is_string($json)) { + $io->error('json_encode failed.'); + + return Command::FAILURE; + } + + $output->writeln($json); + + return Command::SUCCESS; + } + + $io->title('Hybrid Retrieval Test'); + $io->definitionList( + ['prompt' => $prompt], + ['duration_ms' => (string) $durationMs], + ['result_count' => (string) count($results)] + ); + + if ($results === []) { + $io->warning('No retrieval results returned.'); + + return Command::SUCCESS; + } + + $first = $results[0]; + + $io->section('Pipeline Summary'); + $io->definitionList( + ['scope_mode' => $this->stringValue($first, 'scope_mode')], + ['selection_mode' => $this->stringValue($first, 'selection_mode')], + ['intent' => $this->stringValue($first, 'intent')], + ['route' => $this->stringValue($first, 'route')], + ['entity_label' => $this->stringValue($first, 'entity_label')], + ['is_list_query' => $this->boolishValue($first, 'is_list_query')], + ['clean_query' => $this->stringValue($first, 'clean_query')], + ['semantic_query' => $this->stringValue($first, 'semantic_query')], + ['secondary_vector_query' => $this->stringValue($first, 'secondary_vector_query')], + ['lexical_query' => $this->stringValue($first, 'lexical_query')], + ['threshold' => $this->scalarValue($first, 'threshold')], + ['lexical_threshold' => $this->scalarValue($first, 'lexical_threshold')] + ); + + $io->section('Scope Candidates'); + $io->definitionList( + ['tag_candidate_doc_ids' => $this->jsonValue($first, 'tag_candidate_doc_ids')], + ['soft_document_candidate_doc_ids' => $this->jsonValue($first, 'soft_document_candidate_doc_ids')], + ['pseudo_scope_doc_ids' => $this->jsonValue($first, 'pseudo_scope_doc_ids')], + ['title_metadata_doc_boosts' => $this->jsonObjectValue($first, 'title_metadata_doc_boosts')] + ); + + $io->section('Hit Counts'); + $io->definitionList( + ['global_hit_count' => $this->scalarValue($first, 'global_hit_count')], + ['scoped_hit_count' => $this->scalarValue($first, 'scoped_hit_count')], + ['global_vector_hit_count' => $this->scalarValue($first, 'global_vector_hit_count')], + ['global_primary_vector_hit_count' => $this->scalarValue($first, 'global_primary_vector_hit_count')], + ['global_secondary_vector_hit_count' => $this->scalarValue($first, 'global_secondary_vector_hit_count')], + ['global_keyword_hit_count' => $this->scalarValue($first, 'global_keyword_hit_count')], + ['scoped_vector_hit_count' => $this->scalarValue($first, 'scoped_vector_hit_count')], + ['scoped_primary_vector_hit_count' => $this->scalarValue($first, 'scoped_primary_vector_hit_count')], + ['scoped_secondary_vector_hit_count' => $this->scalarValue($first, 'scoped_secondary_vector_hit_count')], + ['scoped_keyword_hit_count' => $this->scalarValue($first, 'scoped_keyword_hit_count')] + ); + + $io->section('Boosts'); + $io->definitionList( + ['scoped_boost_factor' => $this->scalarValue($first, 'scoped_boost_factor')], + ['scoped_vector_boost_factor' => $this->scalarValue($first, 'scoped_vector_boost_factor')], + ['secondary_scoped_vector_boost_factor' => $this->scalarValue($first, 'secondary_scoped_vector_boost_factor')], + ['scoped_keyword_boost_factor' => $this->scalarValue($first, 'scoped_keyword_boost_factor')] + ); + + $io->section('Selected Chunks'); + + foreach ($results as $row) { + $rank = $this->scalarValue($row, 'rank'); + $chunkId = $this->stringValue($row, 'chunk_id'); + $documentId = $this->stringValue($row, 'document_id'); + $chunkIndex = $this->scalarValue($row, 'chunk_index'); + $rrfScore = $this->scalarValue($row, 'rrf_score'); + $rawVectorScore = $this->scalarValue($row, 'raw_vector_score'); + $rawKeywordScore = $this->scalarValue($row, 'raw_keyword_score'); + $titleMetadataBoost = $this->scalarValue($row, 'title_metadata_boost'); + $text = (string) ($row['text'] ?? ''); + + if (!$showText) { + $text = $this->shortenText($text, 500); + } + + $io->writeln(sprintf( + '#%s chunk=%s doc=%s idx=%s rrf=%s vector=%s keyword=%s title_meta=%s', + $rank, + $chunkId, + $documentId !== '' ? $documentId : '-', + $chunkIndex !== '' ? $chunkIndex : '-', + $rrfScore !== '' ? $rrfScore : '-', + $rawVectorScore !== '' ? $rawVectorScore : '-', + $rawKeywordScore !== '' ? $rawKeywordScore : '-', + $titleMetadataBoost !== '' ? $titleMetadataBoost : '-' + )); + $io->writeln($text); + $io->writeln(''); + } + + return Command::SUCCESS; + } + + /** + * @param array $row + */ + private function stringValue(array $row, string $key): string + { + $value = $row[$key] ?? null; + + if ($value === null) { + return ''; + } + + return trim((string) $value); + } + + /** + * @param array $row + */ + private function scalarValue(array $row, string $key): string + { + $value = $row[$key] ?? null; + + if ($value === null) { + return ''; + } + + if (is_bool($value)) { + return $value ? 'true' : 'false'; + } + + if (is_scalar($value)) { + return (string) $value; + } + + return ''; + } + + /** + * @param array $row + */ + private function boolishValue(array $row, string $key): string + { + $value = $row[$key] ?? null; + + if (is_bool($value)) { + return $value ? 'true' : 'false'; + } + + if (is_scalar($value)) { + return (string) $value; + } + + return ''; + } + + /** + * @param array $row + */ + private function jsonValue(array $row, string $key): string + { + $value = $row[$key] ?? null; + + if ($value === null || !is_array($value)) { + return '[]'; + } + + $json = json_encode( + array_values($value), + JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE + ); + + return is_string($json) ? $json : '[]'; + } + + /** + * @param array $row + */ + private function jsonObjectValue(array $row, string $key): string + { + $value = $row[$key] ?? null; + + if ($value === null || !is_array($value)) { + return '{}'; + } + + $json = json_encode( + $value, + JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE + ); + + return is_string($json) ? $json : '{}'; + } + + private function shortenText(string $text, int $maxLength): string + { + $text = trim((preg_replace('/\s+/u', ' ', $text) ?? $text)); + + if (mb_strlen($text, 'UTF-8') <= $maxLength) { + return $text; + } + + return mb_substr($text, 0, $maxLength, 'UTF-8') . ' …'; + } +} \ No newline at end of file diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index 845b656..2b8d66b 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -7,62 +7,96 @@ namespace App\Config; final class NdjsonHybridRetrieverConfig { /** - * Default semantic similarity threshold for vector hits. + * Maximum number of chunks the retriever may finally hand to the model. * - * Chosen to stay selective enough for product-family-heavy data - * while not cutting off too many useful fallback hits. + * Rationale: + * - enough room for the stronger hybrid pipeline + * - still conservative enough to avoid prompt bloat */ - public const VECTOR_SCORE_THRESHOLD = 0.83; + public const HARD_MAX_CHUNKS = 6; /** - * Absolute safety caps. + * Hard upper bound for vector retrieval candidate size. * - * These limits protect the retriever from overly large candidate sets - * even if runtime config values are set too high. + * Rationale: + * - the pipeline now combines primary vector, secondary vector, + * lexical, scoped retrieval and re-ranking + * - the old limit would constrain recall too early + * - still capped to keep latency controlled */ - public const HARD_MAX_CHUNKS = 72; - public const HARD_MAX_VECTORK = 180; + public const HARD_MAX_VECTORK = 18; /** - * List-style queries benefit from a slightly wider candidate pool - * before de-duplication and final selection. + * Default semantic score threshold for vector hits. + * + * Rationale: + * - slightly relaxed compared to stricter pure-vector setups + * - the system now has more safeguards: + * lexical cross-signals, scoped retrieval, title/meta boost, selection rules */ - public const LIST_BONUS = 1.25; + public const VECTOR_SCORE_THRESHOLD = 0.81; /** - * Selection rules for cross-document semantic retrieval. + * Lower safety boundary for dynamic threshold adjustments. * - * MAX_CHUNKS_PER_DOC: - * Keeps one document from dominating the final result in normal - * semantic retrieval mode. - * - * MIN_CHUNK_DISTANCE: - * Allows nearby chunks to be selected when they are still meaningfully - * distinct, which is important for compact product sheets. + * Rationale: + * - prevents the system from getting too noisy in fallback cases + * - still allows recovery when exact signals are sparse */ - public const MAX_CHUNKS_PER_DOC = 3; - public const MIN_CHUNK_DISTANCE = 1.0; + public const THRESHOLD_FLOOR = 0.75; + + /** + * Upper safety boundary for dynamic threshold adjustments. + * + * Rationale: + * - protects objection/pricing/list adjustments from becoming too strict + * - keeps retrieval from collapsing into empty result sets too easily + */ + public const THRESHOLD_CEIL = 0.90; + + /** + * Additional candidate expansion factor for list-like prompts. + * + * Rationale: + * - list requests benefit from wider candidate recall + * - too high would create noise across multiple retrieval channels + */ + public const LIST_BONUS = 1.35; /** * Reciprocal Rank Fusion constant. * - * Slightly lower than classic defaults so top-ranked hits matter more. + * Rationale: + * - keep rank importance meaningful + * - but not so aggressive that one retrieval source dominates too hard */ public const RRF_K = 50; /** - * Dynamic threshold clamp boundaries. + * Fallback size when thresholded fusion yields no candidates. * - * The floor must stay below the default threshold, otherwise the - * configured base threshold becomes ineffective. + * Rationale: + * - slightly larger safety net for the richer hybrid stack + * - helps no-tag and low-signal cases without exploding context */ - public const THRESHOLD_FLOOR = 0.78; - public const THRESHOLD_CEIL = 0.90; + public const EMPTY_RRF_FALLBACK_TOPN = 5; /** - * Fallback breadth when strict thresholding removes all fused hits. + * Maximum number of chunks allowed from one document in spread mode. * - * More than one fallback result makes the retriever less brittle. + * Rationale: + * - preserve diversity across documents + * - still allow coherent multi-chunk retrieval from strong sources */ - public const EMPTY_RRF_FALLBACK_TOPN = 3; + public const MAX_CHUNKS_PER_DOC = 2; + + /** + * Minimum distance between chunk indices from the same document + * during spread-style selection. + * + * Rationale: + * - reduce near-duplicate neighboring chunks + * - still allow relevant continuation when needed + */ + public const MIN_CHUNK_DISTANCE = 2; } \ No newline at end of file diff --git a/src/Config/QueryEnricherConfig.php b/src/Config/QueryEnricherConfig.php index 8d79fb7..f4f2837 100644 --- a/src/Config/QueryEnricherConfig.php +++ b/src/Config/QueryEnricherConfig.php @@ -1,22 +1,180 @@ 'residual hardness', + * 'device' => 'instrument', + * ] + * + * 2) Small synonym groups: + * [ + * ['water hardness', 'residual hardness', 'hardness'], + * ['device', 'instrument', 'meter'], + * ] + * + * The public API stays intentionally simple: + * - getEnrichQueryList(): array + * + * This keeps QueryEnricher generic while the domain vocabulary + * deliberately remains inside this class for now. + * + * Replace the example entries below with your real project mappings. + * + * @var array + */ + private const ENRICH_QUERY_LIST = [ + // ----------------------------------------------------------------- + // Example mappings. + // Replace / extend these with your current real project mappings. + // ----------------------------------------------------------------- + + 'water hardness' => 'residual hardness', + 'device' => 'instrument', + 'gerät'=>'produkt', + 'rebuild'=>'reindex', + + ['measuring device', 'meter', 'instrument'], + ]; + + /** + * Returns a normalized, deduplicated mapping for the QueryEnricher. + * + * Output format: + * [ + * 'term a' => 'term b', + * 'term c' => 'term d', + * ] + * + * Rules: + * - ignore empty / invalid values + * - trim and normalize whitespace + * - ignore self-mappings + * - preserve first valid rule if duplicates normalize to the same key + * + * @return array + */ public function getEnrichQueryList(): array { - return [ - 'Wasserhärte' => 'Resthärte', - 'Gerät' => 'Modell', - 'Indikator' => 'Chemie', - 'Seminar' => 'Webinar', - 'Schulung' => 'Seminar', - 'Indikatoren' => 'Indikator', - 'Wasserhärte-Grenzwert' => 'Resthärte', - 'Resthärte-Grenzwert' => 'Wasserhärte', - 'Grenzwert' => 'Überwachungsbereich', - 'store'=>'shop' - ]; + $normalized = []; + + foreach (self::ENRICH_QUERY_LIST as $key => $value) { + if (is_array($value)) { + $this->ingestGroup($normalized, $value); + continue; + } + + $left = $this->normalizePhrase(is_string($key) ? $key : ''); + $right = $this->normalizePhrase(is_string($value) ? $value : ''); + + if (!$this->isValidPair($left, $right)) { + continue; + } + + if (!isset($normalized[$left])) { + $normalized[$left] = $right; + } + } + + return $normalized; + } + + /** + * Returns true when at least one valid enrichment rule exists. + */ + public function hasRules(): bool + { + return $this->getEnrichQueryList() !== []; + } + + /** + * @param array $normalized + * @param array $group + */ + private function ingestGroup(array &$normalized, array $group): void + { + $items = []; + + foreach ($group as $item) { + if (!is_string($item)) { + continue; + } + + $item = $this->normalizePhrase($item); + + if ($item === '') { + continue; + } + + $items[$item] = $item; + } + + $items = array_values($items); + + if (count($items) < 2) { + return; + } + + /** + * Turn a synonym group into a conservative chain: + * ['a', 'b', 'c'] => a=>b, b=>c + * + * QueryEnricher builds a bidirectional lookup later, + * so the config output stays intentionally small. + */ + for ($i = 0, $max = count($items) - 1; $i < $max; $i++) { + $left = $items[$i]; + $right = $items[$i + 1]; + + if (!$this->isValidPair($left, $right)) { + continue; + } + + if (!isset($normalized[$left])) { + $normalized[$left] = $right; + } + } + } + + private function isValidPair(string $left, string $right): bool + { + if ($left === '' || $right === '') { + return false; + } + + if ($left === $right) { + return false; + } + + return true; + } + + private function normalizePhrase(string $value): string + { + $value = trim($value); + + if ($value === '') { + return ''; + } + + $value = mb_strtolower($value, 'UTF-8'); + $value = preg_replace('/\s+/u', ' ', $value) ?? $value; + + return trim($value); } } \ No newline at end of file diff --git a/src/Ingest/VectorRebuildService.php b/src/Ingest/VectorRebuildService.php index a79498f..0975ee8 100644 --- a/src/Ingest/VectorRebuildService.php +++ b/src/Ingest/VectorRebuildService.php @@ -6,36 +6,50 @@ namespace App\Ingest; use App\Index\IndexMetaManager; use App\Knowledge\ChunkManager; +use App\Knowledge\Retrieval\NdjsonLexicalIndexBuilder; use App\Vector\VectorIndexBuilder; final readonly class VectorRebuildService { public function __construct( private VectorIndexBuilder $vectorBuilder, - private IndexMetaManager $metaManager, - private ChunkManager $chunkManager, - ) {} + private NdjsonLexicalIndexBuilder $lexicalIndexBuilder, + private IndexMetaManager $metaManager, + private ChunkManager $chunkManager, + ) { + } /** - * Führt einen vollständigen, deterministischen FAISS-Rebuild aus. + * Executes a full deterministic rebuild of all derived retrieval artifacts. * - * Ablauf: - * 1. Rebuild des Vector Index aus index.ndjson - * 2. Chunk-Zählung via ChunkManager - * 3. Runtime-Stats atomar aktualisieren + * Flow: + * 1. Ensure index_meta.json exists + * 2. Rebuild vector index from index.ndjson + * 3. Rebuild lexical index from index.ndjson + * 4. Count chunks streaming-safe + * 5. Update runtime stats atomically + * + * Important: + * - Vector and lexical index are both derived from the same NDJSON source + * - rebuilding both here prevents drift between semantic and lexical retrieval layers + * - failures in either derived artifact should fail the rebuild as a whole + * @throws \Throwable */ public function rebuild(?string $logPath = null): void { - // ✅ Stelle sicher, dass index_meta.json existiert + // Ensure metadata exists before derived index work starts. $this->metaManager->ensureExists(); - // 1️⃣ Vector Index neu bauen + // 1) Rebuild semantic vector index. $this->vectorBuilder->rebuildFromNdjson($logPath); - // 2️⃣ Chunk Count streaming-safe zählen + // 2) Rebuild generic lexical index from the same NDJSON source. + $this->lexicalIndexBuilder->build(); + + // 3) Count chunks streaming-safe. $chunkCount = $this->chunkManager->countAllChunks(); - // 3️⃣ Runtime-Stats aktualisieren (atomar) + // 4) Update runtime stats atomically. $this->metaManager->updateRuntimeStats($chunkCount); } } \ No newline at end of file diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 2b7ca25..a29e86f 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -25,7 +25,8 @@ use RuntimeException; * - optionally short-circuit to catalog list output * - resolve exact document-title matches before semantic retrieval * - run vector retrieval globally and optionally document-scoped - * - fuse both result sets with RRF-style scoring + * - run lexical retrieval globally and optionally document-scoped + * - fuse all result sets with RRF-style scoring * - apply selection rules for list queries vs. sales-style queries * - return either plain chunk texts or debug metadata */ @@ -40,35 +41,82 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface private const DOMINANT_DOC_MAX_CHUNKS = 4; private const EXACT_DOCUMENT_MAX_CHUNKS = 6; - public function __construct( - private NdjsonChunkLookup $lookup, - private VectorSearchClient $vectorClient, - private TagRoutingService $tagRouting, - private ModelGenerationConfigRepository $configRepository, - private QueryCleaner $queryCleaner, - private IntentLite $intentLite, - private SalesIntentLite $salesIntentLite, - private CatalogIntentLite $catalogIntent, - private IntentRouteResolver $routeResolver, - private EntityCatalogService $entityCatalogService, - private QueryEnricher $queryEnricher, - ) - { - } + /** + * Conservative no-tag fallback: + * derive a temporary document scope only when the top global vector hits + * show repeated evidence for the same document(s). + */ + private const PSEUDO_SCOPE_GLOBAL_WINDOW = 10; + private const PSEUDO_SCOPE_MIN_DOC_HITS = 2; + private const PSEUDO_SCOPE_MAX_DOCS = 3; - // ========================================================= - // PUBLIC API - // ========================================================= + /** + * Soft document candidates are derived from global lexical hits first. + * This stage is placed between tag-routing and vector-based pseudo scope. + */ + private const SOFT_DOC_CANDIDATE_WINDOW = 8; + private const SOFT_DOC_CANDIDATE_MIN_DOC_HITS = 2; + private const SOFT_DOC_CANDIDATE_MAX_DOCS = 3; + private const SOFT_DOC_TOP_SCORE_MIN = 0.98; + + /** + * Scoped retrieval is useful in both cases, but true tag-routing should + * stay stronger than soft candidates and pseudo-scoping. + */ + private const TAG_SCOPED_VECTOR_BOOST = 1.20; + private const SOFT_DOC_SCOPED_VECTOR_BOOST = 1.12; + private const PSEUDO_SCOPED_VECTOR_BOOST = 1.08; + + /** + * Secondary vector query should help recall/robustness, but must not + * overpower the primary enriched semantic query. + */ + private const SECONDARY_GLOBAL_VECTOR_BOOST = 0.93; + private const SECONDARY_SCOPED_VECTOR_MULTIPLIER = 0.95; + + /** + * Lexical retrieval should support precision, but not overpower vector routing. + */ + private const LEXICAL_SCORE_THRESHOLD = 0.18; + private const GLOBAL_LEXICAL_BOOST = 0.90; + private const TAG_SCOPED_LEXICAL_BOOST = 1.04; + private const SOFT_DOC_SCOPED_LEXICAL_BOOST = 1.02; + private const PSEUDO_SCOPED_LEXICAL_BOOST = 1.00; + + /** + * Conservative re-rank stage based on document title / metadata alignment. + * + * This is intentionally applied after fusion so it sharpens ranking + * without replacing the underlying retrieval sources. + */ + private const TITLE_MATCH_BASE_BOOST = 0.04; + private const TITLE_MATCH_MAX_BOOST = 0.18; + private const FILE_MATCH_BASE_BOOST = 0.02; + private const FILE_MATCH_MAX_BOOST = 0.08; + private const META_MATCH_MAX_BOOST = 0.04; + private const EXACT_TITLE_PHRASE_BOOST = 0.08; + private const EXACT_FILE_PHRASE_BOOST = 0.04; + private const MAX_TITLE_METADATA_BOOST = 0.22; + + public function __construct( + private NdjsonChunkLookup $lookup, + private VectorSearchClient $vectorClient, + private NdjsonKeywordRetriever $keywordRetriever, + private TagRoutingService $tagRouting, + private ModelGenerationConfigRepository $configRepository, + private QueryCleaner $queryCleaner, + private IntentLite $intentLite, + private SalesIntentLite $salesIntentLite, + private CatalogIntentLite $catalogIntent, + private IntentRouteResolver $routeResolver, + private EntityCatalogService $entityCatalogService, + private QueryEnricher $queryEnricher, + ) { + } /** * Returns the final retrieval payload as plain text chunks. * - * Behaviour: - * - loads active retrieval config - * - executes the full orchestration pipeline - * - if the route resolves to a catalog list, returns the catalog block only - * - otherwise returns the selected chunk texts - * * @throws Exception */ public function retrieve(string $prompt): array @@ -93,13 +141,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface /** * Returns a debug-friendly retrieval result with scoring/meta information. * - * This method is used for inspection and tuning: - * - selected chunk ids - * - raw vector scores - * - fused RRF scores - * - intent / route information - * - threshold and list-query flags - * * @throws Exception */ public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array @@ -114,13 +155,40 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'document_id' => null, 'chunk_index' => null, 'raw_score' => null, + 'raw_vector_score' => null, + 'raw_keyword_score' => null, 'rrf_score' => null, 'threshold' => 0.0, + 'lexical_threshold' => self::LEXICAL_SCORE_THRESHOLD, 'intent' => $result['intent'], 'route' => $result['route'], 'entity_label' => $result['entityLabel'], 'is_list_query' => true, 'selection_mode' => 'catalog_list', + 'scope_mode' => 'catalog_list', + 'clean_query' => null, + 'semantic_query' => null, + 'secondary_vector_query' => null, + 'lexical_query' => null, + 'tag_candidate_doc_ids' => [], + 'soft_document_candidate_doc_ids' => [], + 'pseudo_scope_doc_ids' => [], + 'global_hit_count' => 0, + 'scoped_hit_count' => 0, + 'global_vector_hit_count' => 0, + 'global_primary_vector_hit_count' => 0, + 'global_secondary_vector_hit_count' => 0, + 'global_keyword_hit_count' => 0, + 'scoped_vector_hit_count' => 0, + 'scoped_primary_vector_hit_count' => 0, + 'scoped_secondary_vector_hit_count' => 0, + 'scoped_keyword_hit_count' => 0, + 'scoped_boost_factor' => 0.0, + 'scoped_vector_boost_factor' => 0.0, + 'secondary_scoped_vector_boost_factor' => 0.0, + 'scoped_keyword_boost_factor' => 0.0, + 'title_metadata_boost' => 0.0, + 'title_metadata_doc_boosts' => [], 'text' => $result['catalogBlock'], ]]; } @@ -139,19 +207,49 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $rank++; + $rawVectorScore = $result['rawVectorScores'][$chunkId] ?? null; + $rawKeywordScore = $result['rawKeywordScores'][$chunkId] ?? null; + $out[] = [ 'rank' => $rank, 'chunk_id' => $chunkId, 'document_id' => $result['rows'][$chunkId]['document_id'] ?? null, 'chunk_index' => $result['rows'][$chunkId]['chunk_index'] ?? null, - 'raw_score' => $result['rawScores'][$chunkId] ?? null, + 'raw_score' => $this->maxNullableFloat($rawVectorScore, $rawKeywordScore), + 'raw_vector_score' => $rawVectorScore, + 'raw_keyword_score' => $rawKeywordScore, 'rrf_score' => $result['rrfScores'][$chunkId] ?? null, 'threshold' => $result['threshold'], + 'lexical_threshold' => self::LEXICAL_SCORE_THRESHOLD, 'intent' => $result['intent'], 'route' => $result['route'], 'entity_label' => $result['entityLabel'], 'is_list_query' => $result['isListQuery'], 'selection_mode' => $result['selectionMode'], + 'scope_mode' => $result['scopeMode'], + 'clean_query' => $result['cleanQuery'], + 'semantic_query' => $result['semanticQuery'], + 'secondary_vector_query' => $result['secondaryVectorQuery'], + 'lexical_query' => $result['lexicalQuery'], + 'tag_candidate_doc_ids' => $result['tagCandidateDocIds'], + 'soft_document_candidate_doc_ids' => $result['softDocumentCandidateDocIds'], + 'pseudo_scope_doc_ids' => $result['pseudoScopeDocIds'], + 'global_hit_count' => $result['globalHitCount'], + 'scoped_hit_count' => $result['scopedHitCount'], + 'global_vector_hit_count' => $result['globalVectorHitCount'], + 'global_primary_vector_hit_count' => $result['globalPrimaryVectorHitCount'], + 'global_secondary_vector_hit_count' => $result['globalSecondaryVectorHitCount'], + 'global_keyword_hit_count' => $result['globalKeywordHitCount'], + 'scoped_vector_hit_count' => $result['scopedVectorHitCount'], + 'scoped_primary_vector_hit_count' => $result['scopedPrimaryVectorHitCount'], + 'scoped_secondary_vector_hit_count' => $result['scopedSecondaryVectorHitCount'], + 'scoped_keyword_hit_count' => $result['scopedKeywordHitCount'], + 'scoped_boost_factor' => $result['scopedBoostFactor'], + 'scoped_vector_boost_factor' => $result['scopedVectorBoostFactor'], + 'secondary_scoped_vector_boost_factor' => $result['secondaryScopedVectorBoostFactor'], + 'scoped_keyword_boost_factor' => $result['scopedKeywordBoostFactor'], + 'title_metadata_boost' => $result['titleMetadataBoosts'][$chunkId] ?? 0.0, + 'title_metadata_doc_boosts' => $result['titleMetadataDocBoosts'], 'text' => trim((string)$result['rows'][$chunkId]['text']), ]; } @@ -159,29 +257,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } - // ========================================================= - // CENTRAL ORCHESTRATION - // ========================================================= - /** * Central orchestration entrypoint. * - * Pipeline: - * 1. Detect catalog entity and sales intent - * 2. Resolve route - * 3. If route is a catalog list route, try direct catalog output - * 4. If prompt matches one exact document title, use exact-document fast path - * 5. Otherwise, run the normal hybrid retrieval core - * 6. Select final chunk ids depending on query type - * * @throws Exception */ private function execute( - string $prompt, + string $prompt, ModelGenerationConfig $config, - bool $withScores - ): array - { + bool $withScores + ): array { $entityLabel = $this->catalogIntent->detect($prompt); $salesIntent = $this->detectSalesIntent($prompt); $route = $this->routeResolver->resolve($salesIntent, $entityLabel); @@ -196,10 +281,35 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'intent' => $salesIntent, 'isListQuery' => true, 'selectionMode' => 'catalog_list', + 'scopeMode' => 'catalog_list', + 'cleanQuery' => null, + 'semanticQuery' => null, + 'secondaryVectorQuery' => null, + 'lexicalQuery' => null, + 'tagCandidateDocIds' => [], + 'softDocumentCandidateDocIds' => [], + 'pseudoScopeDocIds' => [], + 'globalHitCount' => 0, + 'scopedHitCount' => 0, + 'globalVectorHitCount' => 0, + 'globalPrimaryVectorHitCount' => 0, + 'globalSecondaryVectorHitCount' => 0, + 'globalKeywordHitCount' => 0, + 'scopedVectorHitCount' => 0, + 'scopedPrimaryVectorHitCount' => 0, + 'scopedSecondaryVectorHitCount' => 0, + 'scopedKeywordHitCount' => 0, + 'scopedBoostFactor' => 0.0, + 'scopedVectorBoostFactor' => 0.0, + 'secondaryScopedVectorBoostFactor' => 0.0, + 'scopedKeywordBoostFactor' => 0.0, 'selectedChunkIds' => [], 'rows' => [], 'rrfScores' => [], - 'rawScores' => [], + 'rawVectorScores' => [], + 'rawKeywordScores' => [], + 'titleMetadataBoosts' => [], + 'titleMetadataDocBoosts' => [], 'threshold' => 0.0, 'catalogBlock' => trim($catalogBlock), ]; @@ -221,10 +331,35 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'intent' => $salesIntent, 'isListQuery' => false, 'selectionMode' => 'exact_document_title', + 'scopeMode' => 'exact_document_title', + 'cleanQuery' => null, + 'semanticQuery' => null, + 'secondaryVectorQuery' => null, + 'lexicalQuery' => null, + 'tagCandidateDocIds' => [], + 'softDocumentCandidateDocIds' => [], + 'pseudoScopeDocIds' => [], + 'globalHitCount' => 0, + 'scopedHitCount' => 0, + 'globalVectorHitCount' => 0, + 'globalPrimaryVectorHitCount' => 0, + 'globalSecondaryVectorHitCount' => 0, + 'globalKeywordHitCount' => 0, + 'scopedVectorHitCount' => 0, + 'scopedPrimaryVectorHitCount' => 0, + 'scopedSecondaryVectorHitCount' => 0, + 'scopedKeywordHitCount' => 0, + 'scopedBoostFactor' => 0.0, + 'scopedVectorBoostFactor' => 0.0, + 'secondaryScopedVectorBoostFactor' => 0.0, + 'scopedKeywordBoostFactor' => 0.0, 'selectedChunkIds' => $selectedChunkIds, 'rows' => $exactDocumentMatch['rows'], 'rrfScores' => $this->buildExactDocumentScores($selectedChunkIds), - 'rawScores' => [], + 'rawVectorScores' => [], + 'rawKeywordScores' => [], + 'titleMetadataBoosts' => [], + 'titleMetadataDocBoosts' => [], 'threshold' => 1.0, 'catalogBlock' => null, ]; @@ -240,10 +375,39 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'intent' => $salesIntent, 'isListQuery' => $core['is_list_query'], 'selectionMode' => null, + 'scopeMode' => $core['scope_mode'], + 'cleanQuery' => $core['clean_query'], + 'semanticQuery' => $core['semantic_query'], + 'secondaryVectorQuery' => $core['secondary_vector_query'], + 'lexicalQuery' => $core['lexical_query'], + 'tagCandidateDocIds' => $core['tag_candidate_doc_ids'], + 'softDocumentCandidateDocIds' => $core['soft_document_candidate_doc_ids'], + 'pseudoScopeDocIds' => $core['pseudo_scope_doc_ids'], + 'globalHitCount' => $core['global_hit_count'], + 'scopedHitCount' => $core['scoped_hit_count'], + 'globalVectorHitCount' => $core['global_vector_hit_count'], + 'globalPrimaryVectorHitCount' => $core['global_primary_vector_hit_count'], + 'globalSecondaryVectorHitCount' => $core['global_secondary_vector_hit_count'], + 'globalKeywordHitCount' => $core['global_keyword_hit_count'], + 'scopedVectorHitCount' => $core['scoped_vector_hit_count'], + 'scopedPrimaryVectorHitCount' => $core['scoped_primary_vector_hit_count'], + 'scopedSecondaryVectorHitCount' => $core['scoped_secondary_vector_hit_count'], + 'scopedKeywordHitCount' => $core['scoped_keyword_hit_count'], + 'scopedBoostFactor' => max( + $core['scoped_vector_boost_factor'], + $core['secondary_scoped_vector_boost_factor'], + $core['scoped_keyword_boost_factor'] + ), + 'scopedVectorBoostFactor' => $core['scoped_vector_boost_factor'], + 'secondaryScopedVectorBoostFactor' => $core['secondary_scoped_vector_boost_factor'], + 'scopedKeywordBoostFactor' => $core['scoped_keyword_boost_factor'], 'selectedChunkIds' => [], 'rows' => [], 'rrfScores' => [], - 'rawScores' => [], + 'rawVectorScores' => [], + 'rawKeywordScores' => [], + 'titleMetadataBoosts' => $core['title_metadata_boosts'], + 'titleMetadataDocBoosts' => $core['title_metadata_doc_boosts'], 'threshold' => $core['threshold'], 'catalogBlock' => null, ]; @@ -273,107 +437,272 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'intent' => $salesIntent, 'isListQuery' => $core['is_list_query'], 'selectionMode' => $selectionMode, + 'scopeMode' => $core['scope_mode'], + 'cleanQuery' => $core['clean_query'], + 'semanticQuery' => $core['semantic_query'], + 'secondaryVectorQuery' => $core['secondary_vector_query'], + 'lexicalQuery' => $core['lexical_query'], + 'tagCandidateDocIds' => $core['tag_candidate_doc_ids'], + 'softDocumentCandidateDocIds' => $core['soft_document_candidate_doc_ids'], + 'pseudoScopeDocIds' => $core['pseudo_scope_doc_ids'], + 'globalHitCount' => $core['global_hit_count'], + 'scopedHitCount' => $core['scoped_hit_count'], + 'globalVectorHitCount' => $core['global_vector_hit_count'], + 'globalPrimaryVectorHitCount' => $core['global_primary_vector_hit_count'], + 'globalSecondaryVectorHitCount' => $core['global_secondary_vector_hit_count'], + 'globalKeywordHitCount' => $core['global_keyword_hit_count'], + 'scopedVectorHitCount' => $core['scoped_vector_hit_count'], + 'scopedPrimaryVectorHitCount' => $core['scoped_primary_vector_hit_count'], + 'scopedSecondaryVectorHitCount' => $core['scoped_secondary_vector_hit_count'], + 'scopedKeywordHitCount' => $core['scoped_keyword_hit_count'], + 'scopedBoostFactor' => max( + $core['scoped_vector_boost_factor'], + $core['secondary_scoped_vector_boost_factor'], + $core['scoped_keyword_boost_factor'] + ), + 'scopedVectorBoostFactor' => $core['scoped_vector_boost_factor'], + 'secondaryScopedVectorBoostFactor' => $core['secondary_scoped_vector_boost_factor'], + 'scopedKeywordBoostFactor' => $core['scoped_keyword_boost_factor'], 'selectedChunkIds' => $selectedChunkIds, 'rows' => $core['rows'], 'rrfScores' => $core['rrf_scores'], - 'rawScores' => $core['raw_scores'], + 'rawVectorScores' => $core['raw_vector_scores'], + 'rawKeywordScores' => $core['raw_keyword_scores'], + 'titleMetadataBoosts' => $core['title_metadata_boosts'], + 'titleMetadataDocBoosts' => $core['title_metadata_doc_boosts'], 'threshold' => $core['threshold'], 'catalogBlock' => null, ]; } - // ========================================================= - // CORE PIPELINE - // ========================================================= - /** * Executes the actual hybrid retrieval logic. * - * Steps: - * - derive limits from config within hard safety caps - * - detect whether the prompt is a "list query" - * - clean and enrich the prompt - * - compute threshold + vector topK based on intent/query type - * - route query into candidate document ids via tag routing - * - run global and optional scoped vector search - * - fuse hits - * - resolve chunk ids to chunk rows - * * @throws Exception */ private function runCore( - string $prompt, + string $prompt, ModelGenerationConfig $config, - bool $withScores, - string $salesIntent - ): array - { + bool $withScores, + string $salesIntent + ): array { $limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK)); $isListQuery = $this->intentLite->isListQuery($prompt); $cleanQuery = $this->queryCleaner->clean($prompt); - $cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery); if ($cleanQuery === '') { return [ 'limit' => $limit, 'is_list_query' => $isListQuery, 'threshold' => NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD, + 'clean_query' => '', + 'semantic_query' => '', + 'secondary_vector_query' => '', + 'lexical_query' => '', + 'scope_mode' => 'none', + 'tag_candidate_doc_ids' => [], + 'soft_document_candidate_doc_ids' => [], + 'pseudo_scope_doc_ids' => [], + 'global_hit_count' => 0, + 'scoped_hit_count' => 0, + 'global_vector_hit_count' => 0, + 'global_primary_vector_hit_count' => 0, + 'global_secondary_vector_hit_count' => 0, + 'global_keyword_hit_count' => 0, + 'scoped_vector_hit_count' => 0, + 'scoped_primary_vector_hit_count' => 0, + 'scoped_secondary_vector_hit_count' => 0, + 'scoped_keyword_hit_count' => 0, + 'scoped_vector_boost_factor' => 0.0, + 'secondary_scoped_vector_boost_factor' => 0.0, + 'scoped_keyword_boost_factor' => 0.0, 'ranked_chunk_ids' => [], 'rows' => [], 'rrf_scores' => [], - 'raw_scores' => [], + 'raw_vector_scores' => [], + 'raw_keyword_scores' => [], + 'title_metadata_boosts' => [], + 'title_metadata_doc_boosts' => [], ]; } + $semanticQuery = $this->queryEnricher->enrichPrompt($cleanQuery); + $secondaryVectorQuery = $cleanQuery !== $semanticQuery ? $cleanQuery : ''; + $lexicalQuery = $cleanQuery; + [$threshold, $topK] = $this->computeThresholdAndTopK( $salesIntent, $isListQuery, $vectorTopKBase ); - $candidateDocIds = $this->tagRouting->route($cleanQuery); - $candidateDocIds = is_array($candidateDocIds) + $tagCandidateDocIds = $this->tagRouting->route($semanticQuery); + $tagCandidateDocIds = is_array($tagCandidateDocIds) ? array_values(array_unique(array_filter( - $candidateDocIds, + $tagCandidateDocIds, static fn(mixed $value): bool => is_string($value) && $value !== '' ))) : []; - $globalHits = $this->vectorClient->search($cleanQuery, $topK); + $globalPrimaryVectorHits = $this->vectorClient->search($semanticQuery, $topK); + $globalSecondaryVectorHits = $secondaryVectorQuery !== '' + ? $this->vectorClient->search($secondaryVectorQuery, $topK) + : []; + $globalKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK); - $scopedHits = []; - if ($candidateDocIds !== []) { - $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds); + $softDocumentCandidateDocIds = []; + $pseudoScopeDocIds = []; + $scopeMode = 'none'; + + $scopedVectorBoostFactor = 0.0; + $secondaryScopedVectorBoostFactor = 0.0; + $scopedKeywordBoostFactor = 0.0; + + $scopedPrimaryVectorHits = []; + $scopedSecondaryVectorHits = []; + $scopedKeywordHits = []; + + if ($tagCandidateDocIds !== []) { + $scopeMode = 'tag_routing'; + $scopedVectorBoostFactor = self::TAG_SCOPED_VECTOR_BOOST; + $secondaryScopedVectorBoostFactor = self::TAG_SCOPED_VECTOR_BOOST * self::SECONDARY_SCOPED_VECTOR_MULTIPLIER; + $scopedKeywordBoostFactor = self::TAG_SCOPED_LEXICAL_BOOST; + + $scopedPrimaryVectorHits = $this->vectorClient->searchScoped($semanticQuery, $topK, $tagCandidateDocIds); + $scopedSecondaryVectorHits = $secondaryVectorQuery !== '' + ? $this->vectorClient->searchScoped($secondaryVectorQuery, $topK, $tagCandidateDocIds) + : []; + $scopedKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK, $tagCandidateDocIds); + } else { + $softDocumentCandidateDocIds = $this->deriveSoftDocumentCandidateDocIds($globalKeywordHits); + + if ($softDocumentCandidateDocIds !== []) { + $scopeMode = 'soft_document_candidate'; + $scopedVectorBoostFactor = self::SOFT_DOC_SCOPED_VECTOR_BOOST; + $secondaryScopedVectorBoostFactor = self::SOFT_DOC_SCOPED_VECTOR_BOOST * self::SECONDARY_SCOPED_VECTOR_MULTIPLIER; + $scopedKeywordBoostFactor = self::SOFT_DOC_SCOPED_LEXICAL_BOOST; + + $scopedPrimaryVectorHits = $this->vectorClient->searchScoped($semanticQuery, $topK, $softDocumentCandidateDocIds); + $scopedSecondaryVectorHits = $secondaryVectorQuery !== '' + ? $this->vectorClient->searchScoped($secondaryVectorQuery, $topK, $softDocumentCandidateDocIds) + : []; + $scopedKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK, $softDocumentCandidateDocIds); + } else { + $pseudoScopeDocIds = $this->derivePseudoScopeDocumentIds($globalPrimaryVectorHits); + + if ($pseudoScopeDocIds !== []) { + $scopeMode = 'pseudo_scope'; + $scopedVectorBoostFactor = self::PSEUDO_SCOPED_VECTOR_BOOST; + $secondaryScopedVectorBoostFactor = self::PSEUDO_SCOPED_VECTOR_BOOST * self::SECONDARY_SCOPED_VECTOR_MULTIPLIER; + $scopedKeywordBoostFactor = self::PSEUDO_SCOPED_LEXICAL_BOOST; + + $scopedPrimaryVectorHits = $this->vectorClient->searchScoped($semanticQuery, $topK, $pseudoScopeDocIds); + $scopedSecondaryVectorHits = $secondaryVectorQuery !== '' + ? $this->vectorClient->searchScoped($secondaryVectorQuery, $topK, $pseudoScopeDocIds) + : []; + $scopedKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK, $pseudoScopeDocIds); + } + } } - if ($globalHits === [] && $scopedHits === []) { + if ( + $globalPrimaryVectorHits === [] + && $globalSecondaryVectorHits === [] + && $globalKeywordHits === [] + && $scopedPrimaryVectorHits === [] + && $scopedSecondaryVectorHits === [] + && $scopedKeywordHits === [] + ) { return [ 'limit' => $limit, 'is_list_query' => $isListQuery, 'threshold' => $threshold, + 'clean_query' => $cleanQuery, + 'semantic_query' => $semanticQuery, + 'secondary_vector_query' => $secondaryVectorQuery, + 'lexical_query' => $lexicalQuery, + 'scope_mode' => $scopeMode, + 'tag_candidate_doc_ids' => $tagCandidateDocIds, + 'soft_document_candidate_doc_ids' => $softDocumentCandidateDocIds, + 'pseudo_scope_doc_ids' => $pseudoScopeDocIds, + 'global_hit_count' => 0, + 'scoped_hit_count' => 0, + 'global_vector_hit_count' => 0, + 'global_primary_vector_hit_count' => 0, + 'global_secondary_vector_hit_count' => 0, + 'global_keyword_hit_count' => 0, + 'scoped_vector_hit_count' => 0, + 'scoped_primary_vector_hit_count' => 0, + 'scoped_secondary_vector_hit_count' => 0, + 'scoped_keyword_hit_count' => 0, + 'scoped_vector_boost_factor' => $scopedVectorBoostFactor, + 'secondary_scoped_vector_boost_factor' => $secondaryScopedVectorBoostFactor, + 'scoped_keyword_boost_factor' => $scopedKeywordBoostFactor, 'ranked_chunk_ids' => [], 'rows' => [], 'rrf_scores' => [], - 'raw_scores' => [], + 'raw_vector_scores' => [], + 'raw_keyword_scores' => [], + 'title_metadata_boosts' => [], + 'title_metadata_doc_boosts' => [], ]; } - $fused = $this->fuseHits( - $globalHits, - $scopedHits, - $threshold, - $scopedHits !== [], - $withScores - ); + $fused = $this->fuseHitSources([ + [ + 'hits' => $globalPrimaryVectorHits, + 'threshold' => $threshold, + 'boost' => 1.0, + 'bucket' => 'vector', + ], + [ + 'hits' => $globalSecondaryVectorHits, + 'threshold' => $threshold, + 'boost' => self::SECONDARY_GLOBAL_VECTOR_BOOST, + 'bucket' => 'vector', + ], + [ + 'hits' => $globalKeywordHits, + 'threshold' => self::LEXICAL_SCORE_THRESHOLD, + 'boost' => self::GLOBAL_LEXICAL_BOOST, + 'bucket' => 'keyword', + ], + [ + 'hits' => $scopedPrimaryVectorHits, + 'threshold' => $threshold, + 'boost' => $scopedVectorBoostFactor, + 'bucket' => 'vector', + ], + [ + 'hits' => $scopedSecondaryVectorHits, + 'threshold' => $threshold, + 'boost' => $secondaryScopedVectorBoostFactor, + 'bucket' => 'vector', + ], + [ + 'hits' => $scopedKeywordHits, + 'threshold' => self::LEXICAL_SCORE_THRESHOLD, + 'boost' => $scopedKeywordBoostFactor, + 'bucket' => 'keyword', + ], + ], $withScores); $rrfScores = $fused['rrf_scores']; - $rawScores = $fused['raw_scores']; + $rawVectorScores = $fused['raw_vector_scores']; + $rawKeywordScores = $fused['raw_keyword_scores']; - if ($rrfScores === [] && $globalHits !== []) { - $rrfScores = $this->fallbackRrfFromHits($globalHits); + if ($rrfScores === []) { + $rrfScores = $this->fallbackRrfFromSources( + $globalPrimaryVectorHits, + $globalSecondaryVectorHits, + $globalKeywordHits, + $scopedPrimaryVectorHits, + $scopedSecondaryVectorHits, + $scopedKeywordHits + ); } if ($rrfScores === []) { @@ -381,13 +710,45 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'limit' => $limit, 'is_list_query' => $isListQuery, 'threshold' => $threshold, + 'clean_query' => $cleanQuery, + 'semantic_query' => $semanticQuery, + 'secondary_vector_query' => $secondaryVectorQuery, + 'lexical_query' => $lexicalQuery, + 'scope_mode' => $scopeMode, + 'tag_candidate_doc_ids' => $tagCandidateDocIds, + 'soft_document_candidate_doc_ids' => $softDocumentCandidateDocIds, + 'pseudo_scope_doc_ids' => $pseudoScopeDocIds, + 'global_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits) + count($globalKeywordHits), + 'scoped_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits) + count($scopedKeywordHits), + 'global_vector_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits), + 'global_primary_vector_hit_count' => count($globalPrimaryVectorHits), + 'global_secondary_vector_hit_count' => count($globalSecondaryVectorHits), + 'global_keyword_hit_count' => count($globalKeywordHits), + 'scoped_vector_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits), + 'scoped_primary_vector_hit_count' => count($scopedPrimaryVectorHits), + 'scoped_secondary_vector_hit_count' => count($scopedSecondaryVectorHits), + 'scoped_keyword_hit_count' => count($scopedKeywordHits), + 'scoped_vector_boost_factor' => $scopedVectorBoostFactor, + 'secondary_scoped_vector_boost_factor' => $secondaryScopedVectorBoostFactor, + 'scoped_keyword_boost_factor' => $scopedKeywordBoostFactor, 'ranked_chunk_ids' => [], 'rows' => [], 'rrf_scores' => [], - 'raw_scores' => $rawScores, + 'raw_vector_scores' => $rawVectorScores, + 'raw_keyword_scores' => $rawKeywordScores, + 'title_metadata_boosts' => [], + 'title_metadata_doc_boosts' => [], ]; } + $rows = $this->lookup->findByChunkIds(array_keys($rrfScores)); + + [$rrfScores, $titleMetadataBoosts, $titleMetadataDocBoosts] = $this->applyTitleMetadataBoosts( + $rrfScores, + $rows, + $lexicalQuery + ); + arsort($rrfScores); $rankedChunkIds = array_keys($rrfScores); @@ -397,22 +758,38 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface 'limit' => $limit, 'is_list_query' => $isListQuery, 'threshold' => $threshold, + 'clean_query' => $cleanQuery, + 'semantic_query' => $semanticQuery, + 'secondary_vector_query' => $secondaryVectorQuery, + 'lexical_query' => $lexicalQuery, + 'scope_mode' => $scopeMode, + 'tag_candidate_doc_ids' => $tagCandidateDocIds, + 'soft_document_candidate_doc_ids' => $softDocumentCandidateDocIds, + 'pseudo_scope_doc_ids' => $pseudoScopeDocIds, + 'global_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits) + count($globalKeywordHits), + 'scoped_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits) + count($scopedKeywordHits), + 'global_vector_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits), + 'global_primary_vector_hit_count' => count($globalPrimaryVectorHits), + 'global_secondary_vector_hit_count' => count($globalSecondaryVectorHits), + 'global_keyword_hit_count' => count($globalKeywordHits), + 'scoped_vector_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits), + 'scoped_primary_vector_hit_count' => count($scopedPrimaryVectorHits), + 'scoped_secondary_vector_hit_count' => count($scopedSecondaryVectorHits), + 'scoped_keyword_hit_count' => count($scopedKeywordHits), + 'scoped_vector_boost_factor' => $scopedVectorBoostFactor, + 'secondary_scoped_vector_boost_factor' => $secondaryScopedVectorBoostFactor, + 'scoped_keyword_boost_factor' => $scopedKeywordBoostFactor, 'ranked_chunk_ids' => $rankedChunkIds, 'rows' => $rows, 'rrf_scores' => $rrfScores, - 'raw_scores' => $rawScores, + 'raw_vector_scores' => $rawVectorScores, + 'raw_keyword_scores' => $rawKeywordScores, + 'title_metadata_boosts' => $titleMetadataBoosts, + 'title_metadata_doc_boosts' => $titleMetadataDocBoosts, ]; } - // ========================================================= - // SUPPORT - // ========================================================= - /** - * Loads the active model generation config. - * - * Retrieval is not allowed to proceed without an active config. - */ private function requireConfig(): ModelGenerationConfig { $config = $this->configRepository->findActiveForModel(); @@ -424,32 +801,18 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $config; } - /** - * Extracts the normalized sales intent string from the intent detector. - * - * Falls back to DISCOVERY when the detector payload is incomplete. - */ private function detectSalesIntent(string $prompt): string { $data = $this->salesIntentLite->detect($prompt); - return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY); + return (string) ($data['intent'] ?? SalesIntentLite::DISCOVERY); } - /** - * Computes retrieval threshold and vector topK. - * - * Rules: - * - objection/pricing intents are slightly stricter - * - list queries are allowed to retrieve a wider candidate set - * - all values are clamped to global hard limits - */ private function computeThresholdAndTopK( string $salesIntent, - bool $isListQuery, - int $vectorTopKBase - ): array - { + bool $isListQuery, + int $vectorTopKBase + ): array { $threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD; $topK = $vectorTopKBase; @@ -461,7 +824,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } if ($isListQuery) { - $topK = (int)round($topK * NdjsonHybridRetrieverConfig::LIST_BONUS); + $topK = (int) round($topK * NdjsonHybridRetrieverConfig::LIST_BONUS); } $topK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK)); @@ -474,26 +837,175 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } /** - * Fuses multiple hit lists into one RRF-style score map. - * - * Notes: - * - only hits above threshold are considered - * - rank position within each hit list contributes to the final score - * - scoped hits can be boosted - * - raw scores are optionally captured for debug output + * @param array> $globalKeywordHits + * @return string[] */ - private function fuseHits( - array $globalHits, - array $scopedHits, - float $threshold, - bool $boostScoped, - bool $captureRaw - ): array + private function deriveSoftDocumentCandidateDocIds(array $globalKeywordHits): array + { + $window = array_slice($globalKeywordHits, 0, self::SOFT_DOC_CANDIDATE_WINDOW); + $stats = []; + + foreach ($window as $rank => $hit) { + $documentId = $hit['document_id'] ?? null; + + if (!is_string($documentId) || $documentId === '') { + continue; + } + + $score = isset($hit['score']) && is_numeric($hit['score']) + ? (float) $hit['score'] + : 0.0; + + if (!isset($stats[$documentId])) { + $stats[$documentId] = [ + 'document_id' => $documentId, + 'count' => 0, + 'best_rank' => $rank, + 'best_score' => $score, + ]; + } + + $stats[$documentId]['count']++; + $stats[$documentId]['best_rank'] = min($stats[$documentId]['best_rank'], $rank); + $stats[$documentId]['best_score'] = max($stats[$documentId]['best_score'], $score); + } + + if ($stats === []) { + return []; + } + + uasort($stats, static function (array $a, array $b): int { + if ($a['count'] !== $b['count']) { + return $b['count'] <=> $a['count']; + } + + if (abs((float) $a['best_score'] - (float) $b['best_score']) > 0.000001) { + return ((float) $b['best_score'] <=> (float) $a['best_score']); + } + + return $a['best_rank'] <=> $b['best_rank']; + }); + + $selected = []; + + foreach ($stats as $row) { + $count = (int) $row['count']; + $bestRank = (int) $row['best_rank']; + $bestScore = (float) $row['best_score']; + + if ( + $count < self::SOFT_DOC_CANDIDATE_MIN_DOC_HITS + && !($bestRank === 0 && $bestScore >= self::SOFT_DOC_TOP_SCORE_MIN) + ) { + continue; + } + + $selected[] = (string) $row['document_id']; + + if (count($selected) >= self::SOFT_DOC_CANDIDATE_MAX_DOCS) { + break; + } + } + + return $selected; + } + + /** + * @param array> $globalPrimaryVectorHits + * @return string[] + */ + private function derivePseudoScopeDocumentIds(array $globalPrimaryVectorHits): array + { + $window = array_slice($globalPrimaryVectorHits, 0, self::PSEUDO_SCOPE_GLOBAL_WINDOW); + $stats = []; + + foreach ($window as $rank => $hit) { + $documentId = $hit['document_id'] ?? null; + + if (!is_string($documentId) || $documentId === '') { + continue; + } + + $score = isset($hit['score']) && is_numeric($hit['score']) + ? (float) $hit['score'] + : 0.0; + + if (!isset($stats[$documentId])) { + $stats[$documentId] = [ + 'document_id' => $documentId, + 'count' => 0, + 'best_rank' => $rank, + 'best_score' => $score, + ]; + } + + $stats[$documentId]['count']++; + $stats[$documentId]['best_rank'] = min($stats[$documentId]['best_rank'], $rank); + $stats[$documentId]['best_score'] = max($stats[$documentId]['best_score'], $score); + } + + if ($stats === []) { + return []; + } + + uasort($stats, static function (array $a, array $b): int { + if ($a['count'] !== $b['count']) { + return $b['count'] <=> $a['count']; + } + + if (abs((float) $a['best_score'] - (float) $b['best_score']) > 0.000001) { + return ((float) $b['best_score'] <=> (float) $a['best_score']); + } + + return $a['best_rank'] <=> $b['best_rank']; + }); + + $selected = []; + + foreach ($stats as $row) { + if ((int) $row['count'] < self::PSEUDO_SCOPE_MIN_DOC_HITS) { + continue; + } + + $selected[] = (string) $row['document_id']; + + if (count($selected) >= self::PSEUDO_SCOPE_MAX_DOCS) { + break; + } + } + + return $selected; + } + + /** + * @param array>, + * threshold: float, + * boost: float, + * bucket: string + * }> $sources + * @return array{ + * rrf_scores: array, + * raw_vector_scores: array, + * raw_keyword_scores: array + * } + */ + private function fuseHitSources(array $sources, bool $captureRaw): array { $rrfScores = []; - $rawScores = []; + $rawVectorScores = []; + $rawKeywordScores = []; + + foreach ($sources as $source) { + $hits = $source['hits']; + $threshold = (float) $source['threshold']; + $boost = max(0.0, (float) $source['boost']); + $bucket = (string) $source['bucket']; + + if ($hits === [] || $boost <= 0.0) { + continue; + } - $apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void { $rank = 0; foreach ($hits as $hit) { @@ -501,85 +1013,83 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - $raw = (float)$hit['score']; + $raw = (float) $hit['score']; if ($raw < $threshold) { continue; } - $chunkId = (string)$hit['chunk_id']; + $chunkId = (string) $hit['chunk_id']; if ($captureRaw) { - $rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw); + if ($bucket === 'vector') { + $rawVectorScores[$chunkId] = max($rawVectorScores[$chunkId] ?? 0.0, $raw); + } elseif ($bucket === 'keyword') { + $rawKeywordScores[$chunkId] = max($rawKeywordScores[$chunkId] ?? 0.0, $raw); + } } $rank++; $rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank); - - if ($boost) { - $rrf *= 1.2; - } + $rrf *= $boost; $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf; } - }; - - $apply($globalHits, false); - $apply($scopedHits, $boostScoped); + } return [ 'rrf_scores' => $rrfScores, - 'raw_scores' => $rawScores, + 'raw_vector_scores' => $rawVectorScores, + 'raw_keyword_scores' => $rawKeywordScores, ]; } /** - * Builds a fallback RRF ranking purely from hit order. - * - * Used when thresholding removed all fused candidates but - * the global hit list itself still exists. + * @param array> ...$sourceLists + * @return array */ - private function fallbackRrfFromHits(array $hits): array + private function fallbackRrfFromSources(array ...$sourceLists): array { - $rrf = []; - $rank = 0; + foreach ($sourceLists as $hits) { + $rrf = []; + $rank = 0; - foreach ($hits as $hit) { - if (!isset($hit['chunk_id'])) { - continue; + foreach ($hits as $hit) { + if (!isset($hit['chunk_id'])) { + continue; + } + + $rank++; + $rrf[(string) $hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank); + + if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) { + break; + } } - $rank++; - $rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank); - - if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) { - break; + if ($rrf !== []) { + return $rrf; } } - return $rrf; + return []; } /** - * Selects a coherent chunk window from one exact document title match. - * - * For exact product questions we prefer a pure document slice over - * cross-document fusion to avoid mixing neighbouring product families. - * * @param array> $rows * @return string[] */ private function selectExactDocumentChunkIds(array $rows, int $limit): array { uasort($rows, static function (array $a, array $b): int { - $aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX; - $bIndex = is_int($b['chunk_index'] ?? null) ? (int)$b['chunk_index'] : PHP_INT_MAX; + $aIndex = is_int($a['chunk_index'] ?? null) ? (int) $a['chunk_index'] : PHP_INT_MAX; + $bIndex = is_int($b['chunk_index'] ?? null) ? (int) $b['chunk_index'] : PHP_INT_MAX; if ($aIndex !== $bIndex) { return $aIndex <=> $bIndex; } - return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? '')); + return strcmp((string) ($a['chunk_id'] ?? ''), (string) ($b['chunk_id'] ?? '')); }); $selected = []; @@ -587,7 +1097,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface foreach ($rows as $row) { $chunkId = $row['chunk_id'] ?? null; - $text = trim((string)($row['text'] ?? '')); + $text = trim((string) ($row['text'] ?? '')); if (!is_string($chunkId) || $chunkId === '' || $text === '') { continue; @@ -604,10 +1114,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } /** - * Builds synthetic scores for exact-title fast-path selections. - * - * These scores are only used for debug output consistency. - * * @param string[] $chunkIds * @return array */ @@ -616,20 +1122,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $scores = []; foreach (array_values($chunkIds) as $rank => $chunkId) { - $scores[(string)$chunkId] = 1.0 / (1 + $rank); + $scores[(string) $chunkId] = 1.0 / (1 + $rank); } return $scores; } - /** - * Selection strategy for list-style queries. - * - * Goal: - * - avoid near-identical chunks - * - prefer diverse list entries - * - stop once the configured limit is reached - */ private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array { $seen = []; @@ -640,19 +1138,19 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - $chunk = trim((string)$rows[$id]['text']); + $chunk = trim((string) $rows[$id]['text']); if ($chunk === '') { continue; } - $key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk))); + $key = md5(mb_strtolower((string) (preg_replace('/\s+/u', ' ', $chunk) ?? $chunk))); if (isset($seen[$key])) { continue; } $seen[$key] = true; - $out[] = (string)$id; + $out[] = (string) $id; if (count($out) >= $limit) { break; @@ -662,23 +1160,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } - /** - * Selection strategy for sales-oriented queries. - * - * Modes: - * - exact_document_title: - * used when the prompt clearly contains one exact document title - * and the answer should stay strictly within that document - * - * - sales_dominant_document: - * used when one document clearly dominates the top hit window - * and coherent neighbouring chunks from that document are more - * useful than cross-document spread - * - * - sales_spread: - * default mode that spreads chunks across documents and enforces - * distance between chunk positions of the same document - */ private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array { $dominantDocId = $this->detectDominantTopDocument($chunkIds, $rows); @@ -710,13 +1191,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ]; } - /** - * Detects whether one document clearly dominates the first ranked window. - * - * This is especially useful for product-sheet style documents where - * several adjacent chunks belong together and should be passed to the model - * as one coherent factual block. - */ private function detectDominantTopDocument(array $chunkIds, array $rows): ?string { $docWindow = []; @@ -726,7 +1200,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - $text = trim((string)$rows[$chunkId]['text']); + $text = trim((string) $rows[$chunkId]['text']); $docId = $rows[$chunkId]['document_id'] ?? null; if ($text === '' || !is_string($docId) || $docId === '') { @@ -749,7 +1223,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return null; } - $dominantCount = (int)($counts[$dominantDocId] ?? 0); + $dominantCount = (int) ($counts[$dominantDocId] ?? 0); if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) { return $dominantDocId; @@ -765,21 +1239,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return null; } - /** - * Selects a coherent chunk window from the dominant document. - * - * Strategy: - * - use the highest-ranked chunk of that document as anchor - * - prefer neighbouring chunk indices around that anchor - * - sort the final selection by chunk index for prompt coherence - */ private function selectDominantDocumentChunkIds( string $documentId, - array $chunkIds, - array $rows, - int $limit - ): array - { + array $chunkIds, + array $rows, + int $limit + ): array { $docHits = []; $anchorChunkIndex = null; @@ -788,7 +1253,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - $text = trim((string)$rows[$chunkId]['text']); + $text = trim((string) $rows[$chunkId]['text']); $docId = $rows[$chunkId]['document_id'] ?? null; if ($text === '' || $docId !== $documentId) { @@ -803,7 +1268,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } $docHits[] = [ - 'id' => (string)$chunkId, + 'id' => (string) $chunkId, 'rank' => $rank, 'chunk_index' => $chunkIndex, ]; @@ -861,19 +1326,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ); } - /** - * Fills the remaining sales slots after a dominant document selection. - * - * The already selected dominant-document chunks stay fixed. - * Remaining slots are filled with the normal spread strategy. - */ private function fillRemainingSalesChunkIds( array $seedChunkIds, array $chunkIds, array $rows, - int $limit - ): array - { + int $limit + ): array { $out = array_values(array_unique(array_map('strval', $seedChunkIds))); if (count($out) >= $limit) { @@ -925,12 +1383,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface } } - $text = trim((string)$rows[$chunkId]['text']); + $text = trim((string) $rows[$chunkId]['text']); if ($text === '') { continue; } - $out[] = (string)$chunkId; + $out[] = (string) $chunkId; $selected[$chunkId] = true; $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1; @@ -946,14 +1404,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } - /** - * Default spread selection for sales-oriented queries. - * - * Goal: - * - avoid overloading the result with chunks from the same document - * - avoid chunks that are too close to each other in the same document - * - preserve top-ranked relevance while improving contextual spread - */ private function selectSalesChunkIdsSpread(array $chunkIds, array $rows, int $limit): array { $out = []; @@ -986,12 +1436,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $docChunkPositions[$docId][] = $chunkIndex; } - $text = trim((string)$rows[$chunkId]['text']); + $text = trim((string) $rows[$chunkId]['text']); if ($text === '') { continue; } - $out[] = (string)$chunkId; + $out[] = (string) $chunkId; $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1; if (count($out) >= $limit) { @@ -1002,9 +1452,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } - /** - * Converts selected chunk ids into the final plain text result list. - */ private function collectTextsFromIds(array $chunkIds, array $rows): array { $out = []; @@ -1014,7 +1461,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } - $text = trim((string)$rows[$id]['text']); + $text = trim((string) $rows[$id]['text']); if ($text !== '') { $out[] = $text; @@ -1023,4 +1470,233 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } + + /** + * Applies a conservative document-level re-rank based on title / metadata matching. + * + * This is intentionally executed after source fusion. It should sharpen ranking + * for clearly matching documents, but never replace the underlying retrieval logic. + * + * @param array $rrfScores + * @param array> $rows + * @return array{0: array, 1: array, 2: array} + */ + private function applyTitleMetadataBoosts(array $rrfScores, array $rows, string $lexicalQuery): array + { + $normalizedQuery = $this->normalizeForMatching($lexicalQuery); + $queryTokens = $this->tokenizeNormalizedQuery($normalizedQuery); + + if ($normalizedQuery === '' || $queryTokens === [] || $rrfScores === [] || $rows === []) { + return [$rrfScores, [], []]; + } + + $documentBoosts = []; + + foreach ($rows as $row) { + $documentId = $row['document_id'] ?? null; + + if (!is_string($documentId) || $documentId === '' || isset($documentBoosts[$documentId])) { + continue; + } + + $documentBoosts[$documentId] = $this->computeDocumentMetadataBoost( + $row, + $normalizedQuery, + $queryTokens + ); + } + + if ($documentBoosts === []) { + return [$rrfScores, [], []]; + } + + $chunkBoosts = []; + + foreach ($rrfScores as $chunkId => $score) { + $row = $rows[$chunkId] ?? null; + + if (!is_array($row)) { + continue; + } + + $documentId = $row['document_id'] ?? null; + + if (!is_string($documentId) || $documentId === '') { + continue; + } + + $boost = $documentBoosts[$documentId] ?? 0.0; + + if ($boost <= 0.0) { + continue; + } + + $rrfScores[$chunkId] = $score * (1.0 + $boost); + $chunkBoosts[$chunkId] = $boost; + } + + return [$rrfScores, $chunkBoosts, $documentBoosts]; + } + + /** + * @param array $row + * @param string[] $queryTokens + */ + private function computeDocumentMetadataBoost(array $row, string $normalizedQuery, array $queryTokens): float + { + $documentTitle = $this->normalizeForMatching($this->extractMetadataString($row, [ + 'document_title', + 'title', + ])); + + $fileName = $this->normalizeForMatching($this->extractMetadataString($row, [ + 'file_name', + 'filename', + 'original_filename', + 'source_name', + 'document_name', + ])); + + $metaText = $this->normalizeForMatching($this->extractMetadataString($row, [ + 'source_path', + 'path', + 'heading', + 'section_title', + 'category', + ])); + + $boost = 0.0; + + $titleCoverage = $this->computeNormalizedTokenCoverage($queryTokens, $documentTitle); + if ($titleCoverage > 0.0) { + $boost += min( + self::TITLE_MATCH_MAX_BOOST, + self::TITLE_MATCH_BASE_BOOST + ($titleCoverage * self::TITLE_MATCH_MAX_BOOST) + ); + } + + $fileCoverage = $this->computeNormalizedTokenCoverage($queryTokens, $fileName); + if ($fileCoverage > 0.0) { + $boost += min( + self::FILE_MATCH_MAX_BOOST, + self::FILE_MATCH_BASE_BOOST + ($fileCoverage * self::FILE_MATCH_MAX_BOOST) + ); + } + + $metaCoverage = $this->computeNormalizedTokenCoverage($queryTokens, $metaText); + if ($metaCoverage > 0.0) { + $boost += min( + self::META_MATCH_MAX_BOOST, + $metaCoverage * self::META_MATCH_MAX_BOOST + ); + } + + if (str_contains($normalizedQuery, ' ')) { + if ($documentTitle !== '' && str_contains(' ' . $documentTitle . ' ', ' ' . $normalizedQuery . ' ')) { + $boost += self::EXACT_TITLE_PHRASE_BOOST; + } + + if ($fileName !== '' && str_contains(' ' . $fileName . ' ', ' ' . $normalizedQuery . ' ')) { + $boost += self::EXACT_FILE_PHRASE_BOOST; + } + } + + return min(self::MAX_TITLE_METADATA_BOOST, $boost); + } + + /** + * @param array $row + * @param string[] $preferredKeys + */ + private function extractMetadataString(array $row, array $preferredKeys): string + { + foreach ($preferredKeys as $key) { + $topLevel = $row[$key] ?? null; + if (is_string($topLevel) && trim($topLevel) !== '') { + return trim($topLevel); + } + + $metadata = $row['metadata'] ?? null; + if (is_array($metadata)) { + $value = $metadata[$key] ?? null; + if (is_string($value) && trim($value) !== '') { + return trim($value); + } + } + } + + return ''; + } + + /** + * @param string[] $queryTokens + */ + private function computeNormalizedTokenCoverage(array $queryTokens, string $normalizedHaystack): float + { + if ($queryTokens === [] || $normalizedHaystack === '') { + return 0.0; + } + + $matched = 0; + + foreach ($queryTokens as $token) { + if ($token === '') { + continue; + } + + if (str_contains(' ' . $normalizedHaystack . ' ', ' ' . $token . ' ')) { + $matched++; + } + } + + if ($matched < 1) { + return 0.0; + } + + return $matched / max(1, count($queryTokens)); + } + + /** + * @return string[] + */ + private function tokenizeNormalizedQuery(string $normalizedQuery): array + { + if ($normalizedQuery === '') { + return []; + } + + $tokens = preg_split('/\s+/u', $normalizedQuery, -1, PREG_SPLIT_NO_EMPTY) ?: []; + $tokens = array_values(array_unique(array_filter( + $tokens, + static fn (string $token): bool => mb_strlen($token, 'UTF-8') >= 2 + ))); + + return $tokens; + } + + private function normalizeForMatching(string $value): string + { + $value = mb_strtolower(trim($value), 'UTF-8'); + $value = preg_replace('/[^\p{L}\p{N}]+/u', ' ', $value) ?? $value; + $value = preg_replace('/\s+/u', ' ', $value) ?? $value; + + return trim($value); + } + + private function maxNullableFloat(?float $a, ?float $b): ?float + { + if ($a === null && $b === null) { + return null; + } + + if ($a === null) { + return $b; + } + + if ($b === null) { + return $a; + } + + return max($a, $b); + } } \ No newline at end of file diff --git a/src/Knowledge/Retrieval/NdjsonKeywordRetriever.php b/src/Knowledge/Retrieval/NdjsonKeywordRetriever.php new file mode 100644 index 0000000..1d4c16a --- /dev/null +++ b/src/Knowledge/Retrieval/NdjsonKeywordRetriever.php @@ -0,0 +1,451 @@ + + */ + public function search(string $query, int $limit = 10, array $docIds = []): array + { + $limit = $this->clampLimit($limit); + $analysis = $this->analyzeQuery($query); + + if ($analysis['tokens'] === []) { + return []; + } + + $db = $this->openReadOnlyDb(); + + if (!$db instanceof SQLite3) { + return []; + } + + try { + $totalChunks = $this->loadTotalChunks($db); + $rows = $this->loadPostings( + $db, + $analysis['tokens'], + $docIds + ); + + if ($rows === []) { + return []; + } + + return $this->scoreRows( + $rows, + $analysis['tokens'], + $analysis['numeric_tokens'], + $totalChunks, + $limit + ); + } catch (\Throwable $e) { + $this->agentLogger->error('Keyword retriever failed', [ + 'error' => $e->getMessage(), + ]); + + return []; + } finally { + $db->close(); + } + } + + /** + * @return array{ + * normalized_query:string, + * tokens:string[], + * numeric_tokens:string[] + * } + */ + private function analyzeQuery(string $query): array + { + $normalized = $this->normalizeText($query); + + if ($normalized === '') { + return [ + 'normalized_query' => '', + 'tokens' => [], + 'numeric_tokens' => [], + ]; + } + + $parts = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: []; + + $tokens = []; + $numericTokens = []; + + foreach ($parts as $token) { + if ($token === '') { + continue; + } + + if ($this->shouldIgnoreToken($token)) { + continue; + } + + $tokens[] = $token; + + if (preg_match('/\d/u', $token) === 1) { + $numericTokens[] = $token; + } + } + + $tokens = array_values(array_unique($tokens)); + $numericTokens = array_values(array_unique($numericTokens)); + + if (count($tokens) > self::MAX_QUERY_TOKENS) { + $tokens = array_slice($tokens, 0, self::MAX_QUERY_TOKENS); + } + + return [ + 'normalized_query' => $normalized, + 'tokens' => $tokens, + 'numeric_tokens' => $numericTokens, + ]; + } + + private function shouldIgnoreToken(string $token): bool + { + if ($token === '') { + return true; + } + + if (preg_match('/\d/u', $token) === 1) { + return false; + } + + if (mb_strlen($token, 'UTF-8') < 2) { + return true; + } + + return StopWords::isStopWord($token); + } + + private function normalizeText(string $value): string + { + $value = mb_strtolower(trim($value), 'UTF-8'); + $value = str_replace(['-', '/', '_'], ' ', $value); + $value = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $value) ?? $value; + $value = preg_replace('/\s+/u', ' ', $value) ?? $value; + + return trim($value); + } + + private function openReadOnlyDb(): ?SQLite3 + { + if (!class_exists(SQLite3::class)) { + $this->agentLogger->warning('Keyword retriever unavailable: sqlite3 extension missing.'); + + return null; + } + + $path = $this->getIndexPath(); + + if (!is_file($path)) { + return null; + } + + try { + $db = new SQLite3($path, SQLITE3_OPEN_READONLY); + $db->busyTimeout(1000); + + return $db; + } catch (\Throwable $e) { + $this->agentLogger->error('Unable to open lexical index', [ + 'path' => $path, + 'error' => $e->getMessage(), + ]); + + return null; + } + } + + private function getIndexPath(): string + { + return rtrim($this->projectDir, '/') . self::DEFAULT_RELATIVE_INDEX_PATH; + } + + private function loadTotalChunks(SQLite3 $db): int + { + $stmt = $db->prepare('SELECT value FROM lexical_meta WHERE key = :key'); + if (!$stmt) { + return 1; + } + + $stmt->bindValue(':key', 'total_chunks', SQLITE3_TEXT); + $result = $stmt->execute(); + + if ($result === false) { + return 1; + } + + $row = $result->fetchArray(SQLITE3_ASSOC); + $result->finalize(); + + $value = isset($row['value']) ? (int) $row['value'] : 0; + + return max(1, $value); + } + + /** + * @param string[] $tokens + * @param string[] $docIds + * @return array + */ + private function loadPostings(SQLite3 $db, array $tokens, array $docIds): array + { + if ($tokens === []) { + return []; + } + + $tokenPlaceholders = []; + foreach (array_keys($tokens) as $i) { + $tokenPlaceholders[] = ':t' . $i; + } + + $sql = ' + SELECT + p.token, + p.chunk_id, + p.document_id, + p.chunk_index, + p.tf, + p.title_tf, + lt.df + FROM lexical_postings p + INNER JOIN lexical_terms lt ON lt.token = p.token + WHERE p.token IN (' . implode(', ', $tokenPlaceholders) . ') + '; + + $docIds = array_values(array_unique(array_filter( + $docIds, + static fn (mixed $value): bool => is_string($value) && $value !== '' + ))); + + if ($docIds !== []) { + $docPlaceholders = []; + foreach (array_keys($docIds) as $i) { + $docPlaceholders[] = ':d' . $i; + } + + $sql .= ' AND p.document_id IN (' . implode(', ', $docPlaceholders) . ')'; + } + + $stmt = $db->prepare($sql); + + if ($stmt === false) { + return []; + } + + foreach ($tokens as $i => $token) { + $stmt->bindValue(':t' . $i, $token, SQLITE3_TEXT); + } + + foreach ($docIds as $i => $docId) { + $stmt->bindValue(':d' . $i, $docId, SQLITE3_TEXT); + } + + $result = $stmt->execute(); + + if ($result === false) { + return []; + } + + $rows = []; + + while (($row = $result->fetchArray(SQLITE3_ASSOC)) !== false) { + $chunkId = (string) ($row['chunk_id'] ?? ''); + $documentId = (string) ($row['document_id'] ?? ''); + $token = (string) ($row['token'] ?? ''); + + if ($chunkId === '' || $documentId === '' || $token === '') { + continue; + } + + $chunkIndex = null; + if (isset($row['chunk_index']) && is_numeric($row['chunk_index'])) { + $chunkIndex = (int) $row['chunk_index']; + } + + $rows[] = [ + 'token' => $token, + 'chunk_id' => $chunkId, + 'document_id' => $documentId, + 'chunk_index' => $chunkIndex, + 'tf' => max(1, (int) ($row['tf'] ?? 1)), + 'title_tf' => max(0, (int) ($row['title_tf'] ?? 0)), + 'df' => max(1, (int) ($row['df'] ?? 1)), + ]; + } + + $result->finalize(); + + return $rows; + } + + /** + * @param array $rows + * @param string[] $queryTokens + * @param string[] $numericTokens + * + * @return array + */ + private function scoreRows( + array $rows, + array $queryTokens, + array $numericTokens, + int $totalChunks, + int $limit + ): array { + if ($rows === []) { + return []; + } + + $numericLookup = array_fill_keys($numericTokens, true); + $queryTokenCount = max(1, count($queryTokens)); + + $scores = []; + $meta = []; + $matchedTokens = []; + + foreach ($rows as $row) { + $chunkId = $row['chunk_id']; + $token = $row['token']; + + $idf = log(1.0 + ($totalChunks / max(1.0, (float) (1 + $row['df'])))); + $tfBoost = 1.0 + (min(3, $row['tf']) * 0.20); + $numericBoost = isset($numericLookup[$token]) ? 1.60 : 1.0; + $titleBonus = $row['title_tf'] > 0 ? ($idf * 0.75) : 0.0; + + $scores[$chunkId] = ($scores[$chunkId] ?? 0.0) + + ($idf * $tfBoost * $numericBoost) + + $titleBonus; + + $matchedTokens[$chunkId][$token] = true; + + if (!isset($meta[$chunkId])) { + $meta[$chunkId] = [ + 'document_id' => $row['document_id'], + 'chunk_index' => $row['chunk_index'], + ]; + } + } + + foreach ($scores as $chunkId => $score) { + $coverage = count($matchedTokens[$chunkId] ?? []) / $queryTokenCount; + $scores[$chunkId] = $score * (0.65 + (0.35 * $coverage)); + } + + arsort($scores); + + $topScore = (float) reset($scores); + if ($topScore <= 0.0) { + return []; + } + + $out = []; + + foreach ($scores as $chunkId => $score) { + $normalizedScore = $score / $topScore; + + $out[] = [ + 'chunk_id' => $chunkId, + 'score' => round($normalizedScore, 6), + 'document_id' => $meta[$chunkId]['document_id'] ?? null, + 'chunk_index' => $meta[$chunkId]['chunk_index'] ?? null, + ]; + + if (count($out) >= $limit) { + break; + } + } + + return $out; + } + + private function clampLimit(int $limit): int + { + if ($limit < 1) { + return 1; + } + + if ($limit > self::MAX_LIMIT) { + return self::MAX_LIMIT; + } + + return $limit; + } +} \ No newline at end of file diff --git a/src/Knowledge/Retrieval/NdjsonLexicalIndexBuilder.php b/src/Knowledge/Retrieval/NdjsonLexicalIndexBuilder.php new file mode 100644 index 0000000..c83f1f7 --- /dev/null +++ b/src/Knowledge/Retrieval/NdjsonLexicalIndexBuilder.php @@ -0,0 +1,528 @@ +assertSqliteAvailable(); + + $indexNdjsonPath = $this->getIndexNdjsonPath(); + $lexicalIndexPath = $this->getLexicalIndexPath(); + $tmpPath = $lexicalIndexPath . '.tmp'; + + if (!is_file($indexNdjsonPath) || filesize($indexNdjsonPath) === 0) { + $this->removeFileIfExists($lexicalIndexPath); + $this->removeFileIfExists($tmpPath); + + $this->agentLogger->info('Lexical index skipped because index.ndjson is missing or empty.', [ + 'index_ndjson' => $indexNdjsonPath, + ]); + + return; + } + + $this->ensureTargetDirectoryExists($lexicalIndexPath); + $this->removeFileIfExists($tmpPath); + + $db = $this->openWritableDb($tmpPath); + + try { + $this->initializeSchema($db); + $this->buildFromNdjson($db, $indexNdjsonPath); + $db->close(); + + $this->atomicReplace($tmpPath, $lexicalIndexPath); + + $this->agentLogger->info('Lexical index build completed.', [ + 'path' => $lexicalIndexPath, + ]); + } catch (\Throwable $e) { + try { + $db->close(); + } catch (\Throwable) { + // Ignore close failures during cleanup. + } + + $this->removeFileIfExists($tmpPath); + + $this->agentLogger->error('Lexical index build failed.', [ + 'path' => $lexicalIndexPath, + 'error' => $e->getMessage(), + ]); + + throw $e; + } + } + + private function buildFromNdjson(SQLite3 $db, string $indexNdjsonPath): void + { + $handle = @fopen($indexNdjsonPath, 'rb'); + + if ($handle === false) { + throw new \RuntimeException('Unable to read index.ndjson: ' . $indexNdjsonPath); + } + + $db->exec('BEGIN IMMEDIATE TRANSACTION'); + + try { + $seenChunkStmt = $db->prepare( + 'INSERT OR IGNORE INTO lexical_seen_chunks (chunk_id) VALUES (:chunk_id)' + ); + $termStmt = $db->prepare( + 'INSERT INTO lexical_terms (token, df) + VALUES (:token, 1) + ON CONFLICT(token) DO UPDATE SET df = df + 1' + ); + $postingStmt = $db->prepare( + 'INSERT INTO lexical_postings ( + token, + chunk_id, + document_id, + chunk_index, + tf, + title_tf + ) VALUES ( + :token, + :chunk_id, + :document_id, + :chunk_index, + :tf, + :title_tf + )' + ); + + if (!$seenChunkStmt || !$termStmt || !$postingStmt) { + throw new \RuntimeException('Failed to prepare lexical index SQL statements.'); + } + + $totalChunks = 0; + $lineNumber = 0; + + while (($line = fgets($handle)) !== false) { + $lineNumber++; + $line = trim($line); + + if ($line === '') { + continue; + } + + $row = json_decode($line, true); + + if (!is_array($row)) { + continue; + } + + $chunkId = trim((string)($row['chunk_id'] ?? '')); + $documentId = trim((string)($row['document_id'] ?? '')); + $chunkIndex = $this->normalizeChunkIndex($row['chunk_index'] ?? null); + $text = trim((string)($row['text'] ?? '')); + + if ($chunkId === '' || $documentId === '' || $text === '') { + continue; + } + + $seenChunkStmt->reset(); + $seenChunkStmt->clear(); + $seenChunkStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT); + $seenResult = $seenChunkStmt->execute(); + + if ($seenResult !== false) { + $seenResult->finalize(); + } + + if ($db->changes() < 1) { + continue; + } + + $title = $this->extractDocumentTitle($row); + $tokenStats = $this->buildTokenStats($text, $title); + + if ($tokenStats === []) { + continue; + } + + $totalChunks++; + + foreach ($tokenStats as $token => $stats) { + $termStmt->reset(); + $termStmt->clear(); + $termStmt->bindValue(':token', $token, SQLITE3_TEXT); + $termResult = $termStmt->execute(); + + if ($termResult !== false) { + $termResult->finalize(); + } + + $postingStmt->reset(); + $postingStmt->clear(); + $postingStmt->bindValue(':token', $token, SQLITE3_TEXT); + $postingStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT); + $postingStmt->bindValue(':document_id', $documentId, SQLITE3_TEXT); + + if ($chunkIndex === null) { + $postingStmt->bindValue(':chunk_index', null, SQLITE3_NULL); + } else { + $postingStmt->bindValue(':chunk_index', $chunkIndex, SQLITE3_INTEGER); + } + + $postingStmt->bindValue(':tf', $stats['tf'], SQLITE3_INTEGER); + $postingStmt->bindValue(':title_tf', $stats['title_tf'], SQLITE3_INTEGER); + + $postingResult = $postingStmt->execute(); + + if ($postingResult === false) { + throw new \RuntimeException('Failed to insert lexical posting for token: ' . $token); + } + + $postingResult->finalize(); + } + } + + fclose($handle); + + $this->writeMeta($db, $totalChunks); + + $db->exec('COMMIT'); + + $this->agentLogger->info('Lexical index streaming pass completed.', [ + 'indexed_chunks' => $totalChunks, + 'source' => $indexNdjsonPath, + ]); + } catch (\Throwable $e) { + fclose($handle); + $db->exec('ROLLBACK'); + + throw $e; + } + } + + /** + * @return array + */ + private function buildTokenStats(string $text, string $title): array + { + $textTokens = $this->tokenize($text); + $titleTokens = $this->tokenize($title); + + if ($textTokens === [] && $titleTokens === []) { + return []; + } + + $textTf = []; + foreach ($textTokens as $token) { + $textTf[$token] = ($textTf[$token] ?? 0) + 1; + } + + $titleTf = []; + foreach ($titleTokens as $token) { + $titleTf[$token] = ($titleTf[$token] ?? 0) + 1; + } + + $tokens = array_values(array_unique(array_merge( + array_keys($textTf), + array_keys($titleTf) + ))); + + if (count($tokens) > self::MAX_UNIQUE_TOKENS_PER_CHUNK) { + $tokens = array_slice($tokens, 0, self::MAX_UNIQUE_TOKENS_PER_CHUNK); + } + + $stats = []; + + foreach ($tokens as $token) { + $stats[$token] = [ + 'tf' => $textTf[$token] ?? 0, + 'title_tf' => $titleTf[$token] ?? 0, + ]; + } + + return $stats; + } + + /** + * Generic tokenizer: + * - lowercases + * - removes punctuation + * - preserves alphanumeric codes + * - keeps numeric/code-like tokens even if short + * - drops generic stop words for non-numeric tokens + * + * @return string[] + */ + private function tokenize(string $value): array + { + $value = $this->normalizeText($value); + + if ($value === '') { + return []; + } + + $parts = preg_split('/\s+/u', $value, -1, PREG_SPLIT_NO_EMPTY) ?: []; + $tokens = []; + + foreach ($parts as $token) { + if ($token === '') { + continue; + } + + if ($this->shouldIgnoreToken($token)) { + continue; + } + + $tokens[] = $token; + } + + return $tokens; + } + + private function shouldIgnoreToken(string $token): bool + { + if ($token === '') { + return true; + } + + if (preg_match('/\d/u', $token) === 1) { + return false; + } + + if (mb_strlen($token, 'UTF-8') < 2) { + return true; + } + + return StopWords::isStopWord($token); + } + + private function normalizeText(string $value): string + { + $value = mb_strtolower(trim($value), 'UTF-8'); + $value = str_replace(['-', '/', '_'], ' ', $value); + $value = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $value) ?? $value; + $value = preg_replace('/\s+/u', ' ', $value) ?? $value; + + return trim($value); + } + + private function extractDocumentTitle(array $row): string + { + $metadata = $row['metadata'] ?? null; + + if (!is_array($metadata)) { + return ''; + } + + return trim((string)($metadata['document_title'] ?? '')); + } + + private function normalizeChunkIndex(mixed $value): ?int + { + if (is_int($value)) { + return $value; + } + + if (is_string($value) && ctype_digit($value)) { + return (int)$value; + } + + return null; + } + + private function writeMeta(SQLite3 $db, int $totalChunks): void + { + $metaStmt = $db->prepare( + 'INSERT OR REPLACE INTO lexical_meta (key, value) VALUES (:key, :value)' + ); + + if ($metaStmt === false) { + throw new \RuntimeException('Failed to prepare lexical meta statement.'); + } + + $meta = [ + 'schema_version' => '1', + 'built_at' => (new \DateTimeImmutable())->format(DATE_ATOM), + 'total_chunks' => (string)$totalChunks, + ]; + + foreach ($meta as $key => $value) { + $metaStmt->reset(); + $metaStmt->clear(); + $metaStmt->bindValue(':key', $key, SQLITE3_TEXT); + $metaStmt->bindValue(':value', $value, SQLITE3_TEXT); + + $result = $metaStmt->execute(); + + if ($result === false) { + throw new \RuntimeException('Failed to write lexical meta key: ' . $key); + } + + $result->finalize(); + } + } + + private function initializeSchema(SQLite3 $db): void + { + $db->exec('PRAGMA journal_mode = DELETE'); + $db->exec('PRAGMA synchronous = NORMAL'); + $db->exec('PRAGMA temp_store = MEMORY'); + $db->exec('PRAGMA foreign_keys = OFF'); + + $schema = <<<'SQL' +CREATE TABLE IF NOT EXISTS lexical_meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS lexical_terms ( + token TEXT PRIMARY KEY, + df INTEGER NOT NULL +); + +CREATE TABLE IF NOT EXISTS lexical_postings ( + token TEXT NOT NULL, + chunk_id TEXT NOT NULL, + document_id TEXT NOT NULL, + chunk_index INTEGER NULL, + tf INTEGER NOT NULL, + title_tf INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (token, chunk_id) +); + +CREATE INDEX IF NOT EXISTS idx_lexical_postings_document_token + ON lexical_postings (document_id, token); + +CREATE INDEX IF NOT EXISTS idx_lexical_postings_chunk + ON lexical_postings (chunk_id); + +CREATE TABLE IF NOT EXISTS lexical_seen_chunks ( + chunk_id TEXT PRIMARY KEY +); +SQL; + + if ($db->exec($schema) === false) { + throw new \RuntimeException('Failed to initialize lexical index schema.'); + } + } + + private function openWritableDb(string $path): SQLite3 + { + try { + $db = new SQLite3($path, SQLITE3_OPEN_READWRITE | SQLITE3_OPEN_CREATE); + } catch (\Throwable $e) { + throw new \RuntimeException('Unable to open lexical index DB: ' . $path, 0, $e); + } + + $db->busyTimeout(5000); + + return $db; + } + + private function getIndexNdjsonPath(): string + { + return rtrim($this->projectDir, '/') . self::DEFAULT_RELATIVE_NDJSON_PATH; + } + + private function getLexicalIndexPath(): string + { + return rtrim($this->projectDir, '/') . self::DEFAULT_RELATIVE_INDEX_PATH; + } + + private function ensureTargetDirectoryExists(string $finalIndexPath): void + { + $dir = dirname($finalIndexPath); + + if (is_dir($dir)) { + return; + } + + if (!@mkdir($dir, 0775, true) && !is_dir($dir)) { + throw new \RuntimeException('Unable to create lexical index directory: ' . $dir); + } + } + + private function atomicReplace(string $tmpPath, string $finalPath): void + { + if (is_file($finalPath)) { + @chmod($finalPath, 0664); + } + + if (!@rename($tmpPath, $finalPath)) { + if (!@copy($tmpPath, $finalPath)) { + @unlink($tmpPath); + throw new \RuntimeException('Atomic replace failed for lexical index: ' . $finalPath); + } + + @unlink($tmpPath); + } + + @chmod($finalPath, 0664); + } + + private function removeFileIfExists(string $path): void + { + if (is_file($path)) { + @unlink($path); + } + } + + private function assertSqliteAvailable(): void + { + if (!class_exists(SQLite3::class)) { + throw new \RuntimeException('The sqlite3 PHP extension is required for lexical index building.'); + } + } +} \ No newline at end of file diff --git a/src/Knowledge/Retrieval/QueryEnricher.php b/src/Knowledge/Retrieval/QueryEnricher.php index 87faf88..ffe66dc 100644 --- a/src/Knowledge/Retrieval/QueryEnricher.php +++ b/src/Knowledge/Retrieval/QueryEnricher.php @@ -8,6 +8,14 @@ use App\Config\QueryEnricherConfig; final readonly class QueryEnricher { + /** + * Keep enrichment conservative. + * + * The enriched semantic query should help vector retrieval, + * but must not become bloated enough to dilute the original user intent. + */ + private const MAX_EXPANSIONS = 4; + public function __construct( private QueryEnricherConfig $config ) { @@ -16,6 +24,12 @@ final readonly class QueryEnricher /** * Enriches the query with mapped counterpart terms. * + * Design goals: + * - preserve the original query unchanged at the front + * - only append counterpart terms that are not already present + * - prefer longer / more specific phrase matches over short generic matches + * - keep the number of appended terms intentionally small + * * Example: * - input: "water hardness device" * - output: "water hardness device residual hardness model" @@ -29,26 +43,63 @@ final readonly class QueryEnricher } $mapping = $this->config->getEnrichQueryList(); + + if ($mapping === []) { + return $originalQuery; + } + $lookup = $this->buildBidirectionalLookup($mapping); + + if ($lookup === []) { + return $originalQuery; + } + + $lookup = $this->sortLookupBySpecificity($lookup); $normalizedQuery = $this->normalizeForMatching($originalQuery); - $matches = []; + if ($normalizedQuery === '') { + return $originalQuery; + } - foreach ($lookup as $needle => $mappedValue) { - if ($needle === '') { + $matches = []; + $seenNormalizedExpansions = []; + + foreach ($lookup as $normalizedNeedle => $mappedValue) { + if ($normalizedNeedle === '') { continue; } - if ($this->containsWholePhrase($normalizedQuery, $needle)) { - $matches[] = $mappedValue; + if (!$this->containsWholePhrase($normalizedQuery, $normalizedNeedle)) { + continue; + } + + $mappedValue = trim($mappedValue); + if ($mappedValue === '') { + continue; + } + + $normalizedMappedValue = $this->normalizeForMatching($mappedValue); + if ($normalizedMappedValue === '') { + continue; + } + + // Do not re-add information that is already present in the query. + if ($this->containsWholePhrase($normalizedQuery, $normalizedMappedValue)) { + continue; + } + + if (isset($seenNormalizedExpansions[$normalizedMappedValue])) { + continue; + } + + $matches[] = $mappedValue; + $seenNormalizedExpansions[$normalizedMappedValue] = true; + + if (count($matches) >= self::MAX_EXPANSIONS) { + break; } } - $matches = array_values(array_unique(array_filter( - $matches, - static fn(string $value): bool => trim($value) !== '' - ))); - if ($matches === []) { return $originalQuery; } @@ -106,6 +157,11 @@ final readonly class QueryEnricher * 'jacket' => 'coat', * 'coat' => 'jacket', * ] + * + * Returned format: + * [ + * '' => '', + * ] */ private function buildBidirectionalLookup(array $mapping): array { @@ -122,15 +178,49 @@ final readonly class QueryEnricher $normalizedKey = $this->normalizeForMatching($key); $normalizedValue = $this->normalizeForMatching($value); - if ($normalizedKey !== '') { + if ($normalizedKey !== '' && !isset($lookup[$normalizedKey])) { $lookup[$normalizedKey] = $value; } - if ($normalizedValue !== '') { + if ($normalizedValue !== '' && !isset($lookup[$normalizedValue])) { $lookup[$normalizedValue] = $key; } } return $lookup; } + + /** + * Sorts phrase rules by specificity so longer / more precise phrases win first. + * + * Priority: + * 1. more words + * 2. longer character length + * 3. lexical order for deterministic output + * + * @param array $lookup + * @return array + */ + private function sortLookupBySpecificity(array $lookup): array + { + uksort($lookup, static function (string $a, string $b): int { + $aWordCount = substr_count($a, ' ') + 1; + $bWordCount = substr_count($b, ' ') + 1; + + if ($aWordCount !== $bWordCount) { + return $bWordCount <=> $aWordCount; + } + + $aLength = mb_strlen($a, 'UTF-8'); + $bLength = mb_strlen($b, 'UTF-8'); + + if ($aLength !== $bLength) { + return $bLength <=> $aLength; + } + + return strcmp($a, $b); + }); + + return $lookup; + } } \ No newline at end of file