diff --git a/composer.json b/composer.json
index f155307..ff6de92 100644
--- a/composer.json
+++ b/composer.json
@@ -28,7 +28,8 @@
"symfony/security-bundle": "7.4.*",
"symfony/twig-bundle": "7.4.*",
"symfony/uid": "7.4.*",
- "symfony/yaml": "^7.4"
+ "symfony/yaml": "^7.4",
+ "ext-sqlite3": "*"
},
"config": {
"optimize-autoloader": true,
diff --git a/src/Command/TestHybridRetrievalCommand.php b/src/Command/TestHybridRetrievalCommand.php
new file mode 100644
index 0000000..e691664
--- /dev/null
+++ b/src/Command/TestHybridRetrievalCommand.php
@@ -0,0 +1,298 @@
+addArgument(
+ 'prompt',
+ InputArgument::REQUIRED,
+ 'Prompt to test against the real hybrid retrieval pipeline'
+ )
+ ->addOption(
+ 'json',
+ null,
+ InputOption::VALUE_NONE,
+ 'Return the raw retrieval debug result as JSON'
+ )
+ ->addOption(
+ 'show-text',
+ null,
+ InputOption::VALUE_NONE,
+ 'Show full chunk text instead of a shortened preview'
+ );
+ }
+
+    /**
+     * Runs the prompt through the retriever's debug entry point and prints the result.
+     *
+     * With --json the debug rows are emitted as a single JSON document. Otherwise a
+     * human-readable report is printed: pipeline summary, scope candidates, hit counts,
+     * boosts and the selected chunks. Pipeline-level values are read from the first
+     * result row.
+     *
+     * Data-derived output (the JSON payload and chunk text) is written with
+     * OutputInterface::OUTPUT_RAW so that text which happens to look like a console
+     * style tag (e.g. "<info>") is printed verbatim instead of being interpreted.
+     *
+     * @return int Command::SUCCESS, or Command::FAILURE for an empty prompt,
+     *             a retriever exception or a JSON encoding error
+     */
+    protected function execute(InputInterface $input, OutputInterface $output): int
+    {
+        $io = new SymfonyStyle($input, $output);
+
+        $prompt = trim((string) $input->getArgument('prompt'));
+        $asJson = (bool) $input->getOption('json');
+        $showText = (bool) $input->getOption('show-text');
+
+        if ($prompt === '') {
+            $io->error('Prompt must not be empty.');
+
+            return Command::FAILURE;
+        }
+
+        $start = microtime(true);
+
+        try {
+            $results = $this->retriever->retrieveDebug($prompt);
+        } catch (\Throwable $e) {
+            $io->error($e->getMessage());
+
+            return Command::FAILURE;
+        }
+
+        $durationMs = round((microtime(true) - $start) * 1000, 2);
+
+        if ($asJson) {
+            $payload = [
+                'prompt' => $prompt,
+                'duration_ms' => $durationMs,
+                'result_count' => count($results),
+                'results' => $results,
+            ];
+
+            $json = json_encode(
+                $payload,
+                JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
+            );
+
+            if (!is_string($json)) {
+                // Include the encoder's reason (e.g. malformed UTF-8 in a chunk text).
+                $io->error(sprintf('json_encode failed: %s', json_last_error_msg()));
+
+                return Command::FAILURE;
+            }
+
+            // Raw output: the payload is machine-readable and must not pass through
+            // the console formatter, which would consume style-like tags in the text.
+            $output->writeln($json, OutputInterface::OUTPUT_RAW);
+
+            return Command::SUCCESS;
+        }
+
+        $io->title('Hybrid Retrieval Test');
+        $io->definitionList(
+            ['prompt' => $prompt],
+            ['duration_ms' => (string) $durationMs],
+            ['result_count' => (string) count($results)]
+        );
+
+        if ($results === []) {
+            $io->warning('No retrieval results returned.');
+
+            return Command::SUCCESS;
+        }
+
+        // Pipeline-level debug fields are read from the first row only.
+        $first = $results[0];
+
+        $io->section('Pipeline Summary');
+        $io->definitionList(
+            ['scope_mode' => $this->stringValue($first, 'scope_mode')],
+            ['selection_mode' => $this->stringValue($first, 'selection_mode')],
+            ['intent' => $this->stringValue($first, 'intent')],
+            ['route' => $this->stringValue($first, 'route')],
+            ['entity_label' => $this->stringValue($first, 'entity_label')],
+            ['is_list_query' => $this->boolishValue($first, 'is_list_query')],
+            ['clean_query' => $this->stringValue($first, 'clean_query')],
+            ['semantic_query' => $this->stringValue($first, 'semantic_query')],
+            ['secondary_vector_query' => $this->stringValue($first, 'secondary_vector_query')],
+            ['lexical_query' => $this->stringValue($first, 'lexical_query')],
+            ['threshold' => $this->scalarValue($first, 'threshold')],
+            ['lexical_threshold' => $this->scalarValue($first, 'lexical_threshold')]
+        );
+
+        $io->section('Scope Candidates');
+        $io->definitionList(
+            ['tag_candidate_doc_ids' => $this->jsonValue($first, 'tag_candidate_doc_ids')],
+            ['soft_document_candidate_doc_ids' => $this->jsonValue($first, 'soft_document_candidate_doc_ids')],
+            ['pseudo_scope_doc_ids' => $this->jsonValue($first, 'pseudo_scope_doc_ids')],
+            ['title_metadata_doc_boosts' => $this->jsonObjectValue($first, 'title_metadata_doc_boosts')]
+        );
+
+        $io->section('Hit Counts');
+        $io->definitionList(
+            ['global_hit_count' => $this->scalarValue($first, 'global_hit_count')],
+            ['scoped_hit_count' => $this->scalarValue($first, 'scoped_hit_count')],
+            ['global_vector_hit_count' => $this->scalarValue($first, 'global_vector_hit_count')],
+            ['global_primary_vector_hit_count' => $this->scalarValue($first, 'global_primary_vector_hit_count')],
+            ['global_secondary_vector_hit_count' => $this->scalarValue($first, 'global_secondary_vector_hit_count')],
+            ['global_keyword_hit_count' => $this->scalarValue($first, 'global_keyword_hit_count')],
+            ['scoped_vector_hit_count' => $this->scalarValue($first, 'scoped_vector_hit_count')],
+            ['scoped_primary_vector_hit_count' => $this->scalarValue($first, 'scoped_primary_vector_hit_count')],
+            ['scoped_secondary_vector_hit_count' => $this->scalarValue($first, 'scoped_secondary_vector_hit_count')],
+            ['scoped_keyword_hit_count' => $this->scalarValue($first, 'scoped_keyword_hit_count')]
+        );
+
+        $io->section('Boosts');
+        $io->definitionList(
+            ['scoped_boost_factor' => $this->scalarValue($first, 'scoped_boost_factor')],
+            ['scoped_vector_boost_factor' => $this->scalarValue($first, 'scoped_vector_boost_factor')],
+            ['secondary_scoped_vector_boost_factor' => $this->scalarValue($first, 'secondary_scoped_vector_boost_factor')],
+            ['scoped_keyword_boost_factor' => $this->scalarValue($first, 'scoped_keyword_boost_factor')]
+        );
+
+        $io->section('Selected Chunks');
+
+        foreach ($results as $row) {
+            $rank = $this->scalarValue($row, 'rank');
+            $chunkId = $this->stringValue($row, 'chunk_id');
+            $documentId = $this->stringValue($row, 'document_id');
+            $chunkIndex = $this->scalarValue($row, 'chunk_index');
+            $rrfScore = $this->scalarValue($row, 'rrf_score');
+            $rawVectorScore = $this->scalarValue($row, 'raw_vector_score');
+            $rawKeywordScore = $this->scalarValue($row, 'raw_keyword_score');
+            $titleMetadataBoost = $this->scalarValue($row, 'title_metadata_boost');
+            $text = (string) ($row['text'] ?? '');
+
+            if (!$showText) {
+                $text = $this->shortenText($text, 500);
+            }
+
+            // Header and chunk text carry index data only; print both verbatim so
+            // tag-like fragments inside ids or text are not treated as console styles.
+            $io->writeln(sprintf(
+                '#%s chunk=%s doc=%s idx=%s rrf=%s vector=%s keyword=%s title_meta=%s',
+                $rank,
+                $chunkId,
+                $documentId !== '' ? $documentId : '-',
+                $chunkIndex !== '' ? $chunkIndex : '-',
+                $rrfScore !== '' ? $rrfScore : '-',
+                $rawVectorScore !== '' ? $rawVectorScore : '-',
+                $rawKeywordScore !== '' ? $rawKeywordScore : '-',
+                $titleMetadataBoost !== '' ? $titleMetadataBoost : '-'
+            ), OutputInterface::OUTPUT_RAW);
+            $io->writeln($text, OutputInterface::OUTPUT_RAW);
+            $io->writeln('');
+        }
+
+        return Command::SUCCESS;
+    }
+
+    /**
+     * Reads $row[$key] and returns it as a trimmed string.
+     *
+     * A missing key or null yields ''. Scalars are cast with PHP's normal string
+     * conversion (so true becomes '1' and false becomes ''). Values that cannot be
+     * converted safely - arrays and objects without __toString() - yield '' instead
+     * of triggering an "Array to string conversion" warning or a conversion Error.
+     *
+     * @param array<string, mixed> $row debug row to read from
+     * @param string               $key field name inside the row
+     */
+    private function stringValue(array $row, string $key): string
+    {
+        $value = $row[$key] ?? null;
+
+        if ($value === null) {
+            return '';
+        }
+
+        if (!is_scalar($value) && !$value instanceof \Stringable) {
+            return '';
+        }
+
+        return trim((string) $value);
+    }
+
+    /**
+     * Renders $row[$key] as display text for scalar fields.
+     *
+     * Booleans become the words 'true' / 'false', other scalars use PHP's string
+     * cast, and everything else (missing key, null, arrays, objects) becomes ''.
+     *
+     * @param array<string, mixed> $row debug row to read from
+     * @param string               $key field name inside the row
+     */
+    private function scalarValue(array $row, string $key): string
+    {
+        $field = $row[$key] ?? null;
+
+        return match (true) {
+            is_bool($field) => $field ? 'true' : 'false',
+            is_scalar($field) => (string) $field,
+            default => '',
+        };
+    }
+
+    /**
+     * Renders a flag-like field of the row as text.
+     *
+     * Real booleans are printed as 'true' / 'false'; any other scalar is printed
+     * via string cast; non-scalars, null and missing keys produce ''.
+     *
+     * @param array<string, mixed> $row debug row to read from
+     * @param string               $key field name inside the row
+     */
+    private function boolishValue(array $row, string $key): string
+    {
+        $flag = $row[$key] ?? null;
+
+        if (!is_scalar($flag)) {
+            return '';
+        }
+
+        if (!is_bool($flag)) {
+            return (string) $flag;
+        }
+
+        return $flag ? 'true' : 'false';
+    }
+
+    /**
+     * Encodes an array field of the row as a compact JSON list.
+     *
+     * Keys are discarded (array_values) so the output is always a JSON array.
+     * A missing key, a non-array value or an encoding failure yields '[]'.
+     *
+     * @param array<string, mixed> $row debug row to read from
+     * @param string               $key field name inside the row
+     */
+    private function jsonValue(array $row, string $key): string
+    {
+        $list = $row[$key] ?? null;
+
+        if (!is_array($list)) {
+            return '[]';
+        }
+
+        $encoded = json_encode(
+            array_values($list),
+            JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
+        );
+
+        return $encoded === false ? '[]' : $encoded;
+    }
+
+    /**
+     * Encodes a map-like field of the row as a compact JSON object.
+     *
+     * The top level is cast to an object before encoding so the result is always
+     * a JSON object: an empty array becomes '{}' (not '[]') and a map whose keys
+     * happen to be 0..n-1 keeps its keys instead of collapsing into a JSON list.
+     * Nested values are encoded unchanged. A missing key, a non-array value or
+     * an encoding failure yields '{}'.
+     *
+     * @param array<string, mixed> $row debug row to read from
+     * @param string               $key field name inside the row
+     */
+    private function jsonObjectValue(array $row, string $key): string
+    {
+        $value = $row[$key] ?? null;
+
+        if (!is_array($value)) {
+            return '{}';
+        }
+
+        $json = json_encode(
+            (object) $value,
+            JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
+        );
+
+        return is_string($json) ? $json : '{}';
+    }
+
+    /**
+     * Produces a single-line preview of $text.
+     *
+     * Whitespace runs are collapsed to one space and the result is trimmed. When
+     * the collapsed text has more than $maxLength characters (counted as UTF-8
+     * characters, not bytes) it is cut at $maxLength and ' …' is appended.
+     * If the whitespace regex fails, the text is used as given.
+     */
+    private function shortenText(string $text, int $maxLength): string
+    {
+        $collapsed = preg_replace('/\s+/u', ' ', $text);
+        $preview = trim($collapsed ?? $text);
+
+        return mb_strlen($preview, 'UTF-8') > $maxLength
+            ? mb_substr($preview, 0, $maxLength, 'UTF-8') . ' …'
+            : $preview;
+    }
+}
\ No newline at end of file
diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php
index 845b656..2b8d66b 100644
--- a/src/Config/NdjsonHybridRetrieverConfig.php
+++ b/src/Config/NdjsonHybridRetrieverConfig.php
@@ -7,62 +7,96 @@ namespace App\Config;
final class NdjsonHybridRetrieverConfig
{
/**
- * Default semantic similarity threshold for vector hits.
+ * Maximum number of chunks the retriever may finally hand to the model.
*
- * Chosen to stay selective enough for product-family-heavy data
- * while not cutting off too many useful fallback hits.
+ * Rationale:
+ * - enough room for the stronger hybrid pipeline
+ * - still conservative enough to avoid prompt bloat
*/
- public const VECTOR_SCORE_THRESHOLD = 0.83;
+ public const HARD_MAX_CHUNKS = 6;
/**
- * Absolute safety caps.
+ * Hard upper bound for vector retrieval candidate size.
*
- * These limits protect the retriever from overly large candidate sets
- * even if runtime config values are set too high.
+ * Rationale:
+ * - the pipeline now combines primary vector, secondary vector,
+ * lexical, scoped retrieval and re-ranking
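+ * - the old limit would constrain recall too early
+ * (NOTE(review): this change lowers the cap from 180 to 18, which narrows
+ * rather than widens the candidate pool - confirm the intended value)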
+ * - still capped to keep latency controlled
*/
- public const HARD_MAX_CHUNKS = 72;
- public const HARD_MAX_VECTORK = 180;
+ public const HARD_MAX_VECTORK = 18;
/**
- * List-style queries benefit from a slightly wider candidate pool
- * before de-duplication and final selection.
+ * Default semantic score threshold for vector hits.
+ *
+ * Rationale:
+ * - slightly relaxed compared to stricter pure-vector setups
+ * - the system now has more safeguards:
+ * lexical cross-signals, scoped retrieval, title/meta boost, selection rules
*/
- public const LIST_BONUS = 1.25;
+ public const VECTOR_SCORE_THRESHOLD = 0.81;
/**
- * Selection rules for cross-document semantic retrieval.
+ * Lower safety boundary for dynamic threshold adjustments.
*
- * MAX_CHUNKS_PER_DOC:
- * Keeps one document from dominating the final result in normal
- * semantic retrieval mode.
- *
- * MIN_CHUNK_DISTANCE:
- * Allows nearby chunks to be selected when they are still meaningfully
- * distinct, which is important for compact product sheets.
+ * Rationale:
+ * - prevents the system from getting too noisy in fallback cases
+ * - still allows recovery when exact signals are sparse
*/
- public const MAX_CHUNKS_PER_DOC = 3;
- public const MIN_CHUNK_DISTANCE = 1.0;
+ public const THRESHOLD_FLOOR = 0.75;
+
+ /**
+ * Upper safety boundary for dynamic threshold adjustments.
+ *
+ * Rationale:
+ * - protects objection/pricing/list adjustments from becoming too strict
+ * - keeps retrieval from collapsing into empty result sets too easily
+ */
+ public const THRESHOLD_CEIL = 0.90;
+
+ /**
+ * Additional candidate expansion factor for list-like prompts.
+ *
+ * Rationale:
+ * - list requests benefit from wider candidate recall
+ * - too high would create noise across multiple retrieval channels
+ */
+ public const LIST_BONUS = 1.35;
/**
* Reciprocal Rank Fusion constant.
*
- * Slightly lower than classic defaults so top-ranked hits matter more.
+ * Rationale:
+ * - keep rank importance meaningful
+ * - but not so aggressive that one retrieval source dominates too hard
*/
public const RRF_K = 50;
/**
- * Dynamic threshold clamp boundaries.
+ * Fallback size when thresholded fusion yields no candidates.
*
- * The floor must stay below the default threshold, otherwise the
- * configured base threshold becomes ineffective.
+ * Rationale:
+ * - slightly larger safety net for the richer hybrid stack
+ * - helps no-tag and low-signal cases without exploding context
*/
- public const THRESHOLD_FLOOR = 0.78;
- public const THRESHOLD_CEIL = 0.90;
+ public const EMPTY_RRF_FALLBACK_TOPN = 5;
/**
- * Fallback breadth when strict thresholding removes all fused hits.
+ * Maximum number of chunks allowed from one document in spread mode.
*
- * More than one fallback result makes the retriever less brittle.
+ * Rationale:
+ * - preserve diversity across documents
+ * - still allow coherent multi-chunk retrieval from strong sources
*/
- public const EMPTY_RRF_FALLBACK_TOPN = 3;
+ public const MAX_CHUNKS_PER_DOC = 2;
+
+ /**
+ * Minimum distance between chunk indices from the same document
+ * during spread-style selection.
+ *
+ * Rationale:
+ * - reduce near-duplicate neighboring chunks
+ * - still allow relevant continuation when needed
+ */
+ public const MIN_CHUNK_DISTANCE = 2;
}
\ No newline at end of file
diff --git a/src/Config/QueryEnricherConfig.php b/src/Config/QueryEnricherConfig.php
index 8d79fb7..f4f2837 100644
--- a/src/Config/QueryEnricherConfig.php
+++ b/src/Config/QueryEnricherConfig.php
@@ -1,22 +1,180 @@
'residual hardness',
+ * 'device' => 'instrument',
+ * ]
+ *
+ * 2) Small synonym groups:
+ * [
+ * ['water hardness', 'residual hardness', 'hardness'],
+ * ['device', 'instrument', 'meter'],
+ * ]
+ *
+ * The public API stays intentionally simple:
+ * - getEnrichQueryList(): array
+ *
+ * This keeps QueryEnricher generic while the domain vocabulary
+ * deliberately remains inside this class for now.
+ *
+ * Replace the example entries below with your real project mappings.
+ *
+ * @var array
+ */
+ private const ENRICH_QUERY_LIST = [
+ // -----------------------------------------------------------------
+ // Example mappings.
+ // Replace / extend these with your current real project mappings.
+ // -----------------------------------------------------------------
+
+ 'water hardness' => 'residual hardness',
+ 'device' => 'instrument',
+ 'gerät'=>'produkt',
+ 'rebuild'=>'reindex',
+
+ ['measuring device', 'meter', 'instrument'],
+ ];
+
+    /**
+     * Flattens ENRICH_QUERY_LIST into the plain lookup used by the QueryEnricher.
+     *
+     * Result shape:
+     *   [
+     *     'term a' => 'term b',
+     *     'term c' => 'term d',
+     *   ]
+     *
+     * How entries are processed:
+     * - array entries are treated as synonym groups and handed to ingestGroup()
+     * - all other entries are treated as "key => value" pairs; both sides are
+     *   normalized (lower-cased, whitespace collapsed, trimmed)
+     * - pairs with an empty side, a non-string side or identical sides are dropped
+     * - when several rules normalize to the same key, the first one wins
+     *
+     * @return array<string, string>
+     */
     public function getEnrichQueryList(): array
     {
-        return [
-            'Wasserhärte' => 'Resthärte',
-            'Gerät' => 'Modell',
-            'Indikator' => 'Chemie',
-            'Seminar' => 'Webinar',
-            'Schulung' => 'Seminar',
-            'Indikatoren' => 'Indikator',
-            'Wasserhärte-Grenzwert' => 'Resthärte',
-            'Resthärte-Grenzwert' => 'Wasserhärte',
-            'Grenzwert' => 'Überwachungsbereich',
-            'store'=>'shop'
-        ];
+        $rules = [];
+
+        foreach (self::ENRICH_QUERY_LIST as $source => $target) {
+            // Nested arrays are synonym groups, not single pairs.
+            if (is_array($target)) {
+                $this->ingestGroup($rules, $target);
+                continue;
+            }
+
+            // Non-string sides collapse to '' and are rejected by isValidPair().
+            $from = $this->normalizePhrase(is_string($source) ? $source : '');
+            $to = $this->normalizePhrase(is_string($target) ? $target : '');
+
+            // First rule for a given key wins; later duplicates are ignored.
+            if ($this->isValidPair($from, $to) && !isset($rules[$from])) {
+                $rules[$from] = $to;
+            }
+        }
+
+        return $rules;
+    }
+
+    /**
+     * Tells whether getEnrichQueryList() yields at least one enrichment rule.
+     */
+    public function hasRules(): bool
+    {
+        return count($this->getEnrichQueryList()) > 0;
+    }
+
+    /**
+     * Adds the rules derived from one synonym group to $normalized (in place).
+     *
+     * Non-string and empty entries are skipped, the rest are normalized and
+     * de-duplicated in order of first appearance. Groups with fewer than two
+     * distinct phrases add nothing. The remaining phrases are linked as a chain:
+     *
+     *   ['a', 'b', 'c'] => a=>b, b=>c
+     *
+     * A link is only stored when its left-hand phrase has no rule yet, so rules
+     * that already exist in $normalized are never overwritten.
+     *
+     * @param array<string, string> $normalized rule map to extend (by reference)
+     * @param array                 $group      raw synonym group entries
+     */
+    private function ingestGroup(array &$normalized, array $group): void
+    {
+        $distinct = [];
+
+        foreach ($group as $candidate) {
+            $phrase = is_string($candidate) ? $this->normalizePhrase($candidate) : '';
+
+            if ($phrase !== '') {
+                // Keyed by phrase so repeated entries collapse to one.
+                $distinct[$phrase] = $phrase;
+            }
+        }
+
+        $chain = array_values($distinct);
+        $lastIndex = count($chain) - 1;
+
+        if ($lastIndex < 1) {
+            return;
+        }
+
+        foreach ($chain as $position => $from) {
+            if ($position === $lastIndex) {
+                // The last phrase has no successor to link to.
+                break;
+            }
+
+            $to = $chain[$position + 1];
+
+            if ($this->isValidPair($from, $to) && !isset($normalized[$from])) {
+                $normalized[$from] = $to;
+            }
+        }
+    }
+
+    /**
+     * A pair is usable when both sides are non-empty and differ from each other.
+     */
+    private function isValidPair(string $left, string $right): bool
+    {
+        return $left !== '' && $right !== '' && $left !== $right;
+    }
+
+    /**
+     * Canonical form of a phrase: trimmed, lower-cased (UTF-8 aware) and with
+     * every whitespace run collapsed to a single space.
+     *
+     * Blank input yields ''. If the whitespace regex fails, the lower-cased
+     * text is used as is.
+     */
+    private function normalizePhrase(string $value): string
+    {
+        $trimmed = trim($value);
+
+        if ($trimmed === '') {
+            return '';
+        }
+
+        $lowered = mb_strtolower($trimmed, 'UTF-8');
+
+        return trim(preg_replace('/\s+/u', ' ', $lowered) ?? $lowered);
+    }
}
\ No newline at end of file
diff --git a/src/Ingest/VectorRebuildService.php b/src/Ingest/VectorRebuildService.php
index a79498f..0975ee8 100644
--- a/src/Ingest/VectorRebuildService.php
+++ b/src/Ingest/VectorRebuildService.php
@@ -6,36 +6,50 @@ namespace App\Ingest;
use App\Index\IndexMetaManager;
use App\Knowledge\ChunkManager;
+use App\Knowledge\Retrieval\NdjsonLexicalIndexBuilder;
use App\Vector\VectorIndexBuilder;
final readonly class VectorRebuildService
{
public function __construct(
private VectorIndexBuilder $vectorBuilder,
- private IndexMetaManager $metaManager,
- private ChunkManager $chunkManager,
- ) {}
+ private NdjsonLexicalIndexBuilder $lexicalIndexBuilder,
+ private IndexMetaManager $metaManager,
+ private ChunkManager $chunkManager,
+ ) {
+ }
/**
- * Führt einen vollständigen, deterministischen FAISS-Rebuild aus.
+ * Executes a full deterministic rebuild of all derived retrieval artifacts.
*
- * Ablauf:
- * 1. Rebuild des Vector Index aus index.ndjson
- * 2. Chunk-Zählung via ChunkManager
- * 3. Runtime-Stats atomar aktualisieren
+ * Flow:
+ * 1. Ensure index_meta.json exists
+ * 2. Rebuild vector index from index.ndjson
+ * 3. Rebuild lexical index from index.ndjson
+ * 4. Count chunks streaming-safe
+ * 5. Update runtime stats atomically
+ *
+ * Important:
+ * - Vector and lexical index are both derived from the same NDJSON source
+ * - rebuilding both here prevents drift between semantic and lexical retrieval layers
+ * - failures in either derived artifact should fail the rebuild as a whole
+ * @throws \Throwable
*/
public function rebuild(?string $logPath = null): void
{
- // ✅ Stelle sicher, dass index_meta.json existiert
+ // 1) Ensure metadata exists before derived index work starts.
$this->metaManager->ensureExists();
- // 1️⃣ Vector Index neu bauen
+ // 2) Rebuild semantic vector index ($logPath is forwarded to the builder).
$this->vectorBuilder->rebuildFromNdjson($logPath);
- // 2️⃣ Chunk Count streaming-safe zählen
+ // 3) Rebuild generic lexical index from the same NDJSON source.
+ $this->lexicalIndexBuilder->build();
+
+ // 4) Count chunks streaming-safe.
$chunkCount = $this->chunkManager->countAllChunks();
- // 3️⃣ Runtime-Stats aktualisieren (atomar)
+ // 5) Update runtime stats atomically with the fresh chunk count.
$this->metaManager->updateRuntimeStats($chunkCount);
}
}
\ No newline at end of file
diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
index 2b7ca25..a29e86f 100644
--- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
+++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
@@ -25,7 +25,8 @@ use RuntimeException;
* - optionally short-circuit to catalog list output
* - resolve exact document-title matches before semantic retrieval
* - run vector retrieval globally and optionally document-scoped
- * - fuse both result sets with RRF-style scoring
+ * - run lexical retrieval globally and optionally document-scoped
+ * - fuse all result sets with RRF-style scoring
* - apply selection rules for list queries vs. sales-style queries
* - return either plain chunk texts or debug metadata
*/
@@ -40,35 +41,82 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
private const DOMINANT_DOC_MAX_CHUNKS = 4;
private const EXACT_DOCUMENT_MAX_CHUNKS = 6;
- public function __construct(
- private NdjsonChunkLookup $lookup,
- private VectorSearchClient $vectorClient,
- private TagRoutingService $tagRouting,
- private ModelGenerationConfigRepository $configRepository,
- private QueryCleaner $queryCleaner,
- private IntentLite $intentLite,
- private SalesIntentLite $salesIntentLite,
- private CatalogIntentLite $catalogIntent,
- private IntentRouteResolver $routeResolver,
- private EntityCatalogService $entityCatalogService,
- private QueryEnricher $queryEnricher,
- )
- {
- }
+ /**
+ * Conservative no-tag fallback:
+ * derive a temporary document scope only when the top global vector hits
+ * show repeated evidence for the same document(s).
+ */
+ private const PSEUDO_SCOPE_GLOBAL_WINDOW = 10;
+ private const PSEUDO_SCOPE_MIN_DOC_HITS = 2;
+ private const PSEUDO_SCOPE_MAX_DOCS = 3;
- // =========================================================
- // PUBLIC API
- // =========================================================
+ /**
+ * Soft document candidates are derived from global lexical hits first.
+ * This stage is placed between tag-routing and vector-based pseudo scope.
+ */
+ private const SOFT_DOC_CANDIDATE_WINDOW = 8;
+ private const SOFT_DOC_CANDIDATE_MIN_DOC_HITS = 2;
+ private const SOFT_DOC_CANDIDATE_MAX_DOCS = 3;
+ private const SOFT_DOC_TOP_SCORE_MIN = 0.98;
+
+ /**
+ * Scoped retrieval is useful in both cases, but true tag-routing should
+ * stay stronger than soft candidates and pseudo-scoping.
+ */
+ private const TAG_SCOPED_VECTOR_BOOST = 1.20;
+ private const SOFT_DOC_SCOPED_VECTOR_BOOST = 1.12;
+ private const PSEUDO_SCOPED_VECTOR_BOOST = 1.08;
+
+ /**
+ * Secondary vector query should help recall/robustness, but must not
+ * overpower the primary enriched semantic query.
+ */
+ private const SECONDARY_GLOBAL_VECTOR_BOOST = 0.93;
+ private const SECONDARY_SCOPED_VECTOR_MULTIPLIER = 0.95;
+
+ /**
+ * Lexical retrieval should support precision, but not overpower vector routing.
+ */
+ private const LEXICAL_SCORE_THRESHOLD = 0.18;
+ private const GLOBAL_LEXICAL_BOOST = 0.90;
+ private const TAG_SCOPED_LEXICAL_BOOST = 1.04;
+ private const SOFT_DOC_SCOPED_LEXICAL_BOOST = 1.02;
+ private const PSEUDO_SCOPED_LEXICAL_BOOST = 1.00;
+
+ /**
+ * Conservative re-rank stage based on document title / metadata alignment.
+ *
+ * This is intentionally applied after fusion so it sharpens ranking
+ * without replacing the underlying retrieval sources.
+ */
+ private const TITLE_MATCH_BASE_BOOST = 0.04;
+ private const TITLE_MATCH_MAX_BOOST = 0.18;
+ private const FILE_MATCH_BASE_BOOST = 0.02;
+ private const FILE_MATCH_MAX_BOOST = 0.08;
+ private const META_MATCH_MAX_BOOST = 0.04;
+ private const EXACT_TITLE_PHRASE_BOOST = 0.08;
+ private const EXACT_FILE_PHRASE_BOOST = 0.04;
+ private const MAX_TITLE_METADATA_BOOST = 0.22;
+
+ /**
+ * All collaborators are injected as promoted private constructor properties;
+ * the constructor body is intentionally empty.
+ *
+ * NOTE(review): $keywordRetriever is inserted as the third parameter in this
+ * change, so any caller passing arguments by position must be updated - confirm
+ * that construction happens only via the container.
+ */
+ public function __construct(
+ private NdjsonChunkLookup $lookup,
+ private VectorSearchClient $vectorClient,
+ private NdjsonKeywordRetriever $keywordRetriever,
+ private TagRoutingService $tagRouting,
+ private ModelGenerationConfigRepository $configRepository,
+ private QueryCleaner $queryCleaner,
+ private IntentLite $intentLite,
+ private SalesIntentLite $salesIntentLite,
+ private CatalogIntentLite $catalogIntent,
+ private IntentRouteResolver $routeResolver,
+ private EntityCatalogService $entityCatalogService,
+ private QueryEnricher $queryEnricher,
+ ) {
+ }
/**
* Returns the final retrieval payload as plain text chunks.
*
- * Behaviour:
- * - loads active retrieval config
- * - executes the full orchestration pipeline
- * - if the route resolves to a catalog list, returns the catalog block only
- * - otherwise returns the selected chunk texts
- *
* @throws Exception
*/
public function retrieve(string $prompt): array
@@ -93,13 +141,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
/**
* Returns a debug-friendly retrieval result with scoring/meta information.
*
- * This method is used for inspection and tuning:
- * - selected chunk ids
- * - raw vector scores
- * - fused RRF scores
- * - intent / route information
- * - threshold and list-query flags
- *
* @throws Exception
*/
public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
@@ -114,13 +155,40 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
'document_id' => null,
'chunk_index' => null,
'raw_score' => null,
+ 'raw_vector_score' => null,
+ 'raw_keyword_score' => null,
'rrf_score' => null,
'threshold' => 0.0,
+ 'lexical_threshold' => self::LEXICAL_SCORE_THRESHOLD,
'intent' => $result['intent'],
'route' => $result['route'],
'entity_label' => $result['entityLabel'],
'is_list_query' => true,
'selection_mode' => 'catalog_list',
+ 'scope_mode' => 'catalog_list',
+ 'clean_query' => null,
+ 'semantic_query' => null,
+ 'secondary_vector_query' => null,
+ 'lexical_query' => null,
+ 'tag_candidate_doc_ids' => [],
+ 'soft_document_candidate_doc_ids' => [],
+ 'pseudo_scope_doc_ids' => [],
+ 'global_hit_count' => 0,
+ 'scoped_hit_count' => 0,
+ 'global_vector_hit_count' => 0,
+ 'global_primary_vector_hit_count' => 0,
+ 'global_secondary_vector_hit_count' => 0,
+ 'global_keyword_hit_count' => 0,
+ 'scoped_vector_hit_count' => 0,
+ 'scoped_primary_vector_hit_count' => 0,
+ 'scoped_secondary_vector_hit_count' => 0,
+ 'scoped_keyword_hit_count' => 0,
+ 'scoped_boost_factor' => 0.0,
+ 'scoped_vector_boost_factor' => 0.0,
+ 'secondary_scoped_vector_boost_factor' => 0.0,
+ 'scoped_keyword_boost_factor' => 0.0,
+ 'title_metadata_boost' => 0.0,
+ 'title_metadata_doc_boosts' => [],
'text' => $result['catalogBlock'],
]];
}
@@ -139,19 +207,49 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$rank++;
+ $rawVectorScore = $result['rawVectorScores'][$chunkId] ?? null;
+ $rawKeywordScore = $result['rawKeywordScores'][$chunkId] ?? null;
+
$out[] = [
'rank' => $rank,
'chunk_id' => $chunkId,
'document_id' => $result['rows'][$chunkId]['document_id'] ?? null,
'chunk_index' => $result['rows'][$chunkId]['chunk_index'] ?? null,
- 'raw_score' => $result['rawScores'][$chunkId] ?? null,
+ 'raw_score' => $this->maxNullableFloat($rawVectorScore, $rawKeywordScore),
+ 'raw_vector_score' => $rawVectorScore,
+ 'raw_keyword_score' => $rawKeywordScore,
'rrf_score' => $result['rrfScores'][$chunkId] ?? null,
'threshold' => $result['threshold'],
+ 'lexical_threshold' => self::LEXICAL_SCORE_THRESHOLD,
'intent' => $result['intent'],
'route' => $result['route'],
'entity_label' => $result['entityLabel'],
'is_list_query' => $result['isListQuery'],
'selection_mode' => $result['selectionMode'],
+ 'scope_mode' => $result['scopeMode'],
+ 'clean_query' => $result['cleanQuery'],
+ 'semantic_query' => $result['semanticQuery'],
+ 'secondary_vector_query' => $result['secondaryVectorQuery'],
+ 'lexical_query' => $result['lexicalQuery'],
+ 'tag_candidate_doc_ids' => $result['tagCandidateDocIds'],
+ 'soft_document_candidate_doc_ids' => $result['softDocumentCandidateDocIds'],
+ 'pseudo_scope_doc_ids' => $result['pseudoScopeDocIds'],
+ 'global_hit_count' => $result['globalHitCount'],
+ 'scoped_hit_count' => $result['scopedHitCount'],
+ 'global_vector_hit_count' => $result['globalVectorHitCount'],
+ 'global_primary_vector_hit_count' => $result['globalPrimaryVectorHitCount'],
+ 'global_secondary_vector_hit_count' => $result['globalSecondaryVectorHitCount'],
+ 'global_keyword_hit_count' => $result['globalKeywordHitCount'],
+ 'scoped_vector_hit_count' => $result['scopedVectorHitCount'],
+ 'scoped_primary_vector_hit_count' => $result['scopedPrimaryVectorHitCount'],
+ 'scoped_secondary_vector_hit_count' => $result['scopedSecondaryVectorHitCount'],
+ 'scoped_keyword_hit_count' => $result['scopedKeywordHitCount'],
+ 'scoped_boost_factor' => $result['scopedBoostFactor'],
+ 'scoped_vector_boost_factor' => $result['scopedVectorBoostFactor'],
+ 'secondary_scoped_vector_boost_factor' => $result['secondaryScopedVectorBoostFactor'],
+ 'scoped_keyword_boost_factor' => $result['scopedKeywordBoostFactor'],
+ 'title_metadata_boost' => $result['titleMetadataBoosts'][$chunkId] ?? 0.0,
+ 'title_metadata_doc_boosts' => $result['titleMetadataDocBoosts'],
'text' => trim((string)$result['rows'][$chunkId]['text']),
];
}
@@ -159,29 +257,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return $out;
}
- // =========================================================
- // CENTRAL ORCHESTRATION
- // =========================================================
-
/**
* Central orchestration entrypoint.
*
- * Pipeline:
- * 1. Detect catalog entity and sales intent
- * 2. Resolve route
- * 3. If route is a catalog list route, try direct catalog output
- * 4. If prompt matches one exact document title, use exact-document fast path
- * 5. Otherwise, run the normal hybrid retrieval core
- * 6. Select final chunk ids depending on query type
- *
* @throws Exception
*/
private function execute(
- string $prompt,
+ string $prompt,
ModelGenerationConfig $config,
- bool $withScores
- ): array
- {
+ bool $withScores
+ ): array {
$entityLabel = $this->catalogIntent->detect($prompt);
$salesIntent = $this->detectSalesIntent($prompt);
$route = $this->routeResolver->resolve($salesIntent, $entityLabel);
@@ -196,10 +281,35 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
'intent' => $salesIntent,
'isListQuery' => true,
'selectionMode' => 'catalog_list',
+ 'scopeMode' => 'catalog_list',
+ 'cleanQuery' => null,
+ 'semanticQuery' => null,
+ 'secondaryVectorQuery' => null,
+ 'lexicalQuery' => null,
+ 'tagCandidateDocIds' => [],
+ 'softDocumentCandidateDocIds' => [],
+ 'pseudoScopeDocIds' => [],
+ 'globalHitCount' => 0,
+ 'scopedHitCount' => 0,
+ 'globalVectorHitCount' => 0,
+ 'globalPrimaryVectorHitCount' => 0,
+ 'globalSecondaryVectorHitCount' => 0,
+ 'globalKeywordHitCount' => 0,
+ 'scopedVectorHitCount' => 0,
+ 'scopedPrimaryVectorHitCount' => 0,
+ 'scopedSecondaryVectorHitCount' => 0,
+ 'scopedKeywordHitCount' => 0,
+ 'scopedBoostFactor' => 0.0,
+ 'scopedVectorBoostFactor' => 0.0,
+ 'secondaryScopedVectorBoostFactor' => 0.0,
+ 'scopedKeywordBoostFactor' => 0.0,
'selectedChunkIds' => [],
'rows' => [],
'rrfScores' => [],
- 'rawScores' => [],
+ 'rawVectorScores' => [],
+ 'rawKeywordScores' => [],
+ 'titleMetadataBoosts' => [],
+ 'titleMetadataDocBoosts' => [],
'threshold' => 0.0,
'catalogBlock' => trim($catalogBlock),
];
@@ -221,10 +331,35 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
'intent' => $salesIntent,
'isListQuery' => false,
'selectionMode' => 'exact_document_title',
+ 'scopeMode' => 'exact_document_title',
+ 'cleanQuery' => null,
+ 'semanticQuery' => null,
+ 'secondaryVectorQuery' => null,
+ 'lexicalQuery' => null,
+ 'tagCandidateDocIds' => [],
+ 'softDocumentCandidateDocIds' => [],
+ 'pseudoScopeDocIds' => [],
+ 'globalHitCount' => 0,
+ 'scopedHitCount' => 0,
+ 'globalVectorHitCount' => 0,
+ 'globalPrimaryVectorHitCount' => 0,
+ 'globalSecondaryVectorHitCount' => 0,
+ 'globalKeywordHitCount' => 0,
+ 'scopedVectorHitCount' => 0,
+ 'scopedPrimaryVectorHitCount' => 0,
+ 'scopedSecondaryVectorHitCount' => 0,
+ 'scopedKeywordHitCount' => 0,
+ 'scopedBoostFactor' => 0.0,
+ 'scopedVectorBoostFactor' => 0.0,
+ 'secondaryScopedVectorBoostFactor' => 0.0,
+ 'scopedKeywordBoostFactor' => 0.0,
'selectedChunkIds' => $selectedChunkIds,
'rows' => $exactDocumentMatch['rows'],
'rrfScores' => $this->buildExactDocumentScores($selectedChunkIds),
- 'rawScores' => [],
+ 'rawVectorScores' => [],
+ 'rawKeywordScores' => [],
+ 'titleMetadataBoosts' => [],
+ 'titleMetadataDocBoosts' => [],
'threshold' => 1.0,
'catalogBlock' => null,
];
@@ -240,10 +375,39 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
'intent' => $salesIntent,
'isListQuery' => $core['is_list_query'],
'selectionMode' => null,
+ 'scopeMode' => $core['scope_mode'],
+ 'cleanQuery' => $core['clean_query'],
+ 'semanticQuery' => $core['semantic_query'],
+ 'secondaryVectorQuery' => $core['secondary_vector_query'],
+ 'lexicalQuery' => $core['lexical_query'],
+ 'tagCandidateDocIds' => $core['tag_candidate_doc_ids'],
+ 'softDocumentCandidateDocIds' => $core['soft_document_candidate_doc_ids'],
+ 'pseudoScopeDocIds' => $core['pseudo_scope_doc_ids'],
+ 'globalHitCount' => $core['global_hit_count'],
+ 'scopedHitCount' => $core['scoped_hit_count'],
+ 'globalVectorHitCount' => $core['global_vector_hit_count'],
+ 'globalPrimaryVectorHitCount' => $core['global_primary_vector_hit_count'],
+ 'globalSecondaryVectorHitCount' => $core['global_secondary_vector_hit_count'],
+ 'globalKeywordHitCount' => $core['global_keyword_hit_count'],
+ 'scopedVectorHitCount' => $core['scoped_vector_hit_count'],
+ 'scopedPrimaryVectorHitCount' => $core['scoped_primary_vector_hit_count'],
+ 'scopedSecondaryVectorHitCount' => $core['scoped_secondary_vector_hit_count'],
+ 'scopedKeywordHitCount' => $core['scoped_keyword_hit_count'],
+ 'scopedBoostFactor' => max(
+ $core['scoped_vector_boost_factor'],
+ $core['secondary_scoped_vector_boost_factor'],
+ $core['scoped_keyword_boost_factor']
+ ),
+ 'scopedVectorBoostFactor' => $core['scoped_vector_boost_factor'],
+ 'secondaryScopedVectorBoostFactor' => $core['secondary_scoped_vector_boost_factor'],
+ 'scopedKeywordBoostFactor' => $core['scoped_keyword_boost_factor'],
'selectedChunkIds' => [],
'rows' => [],
'rrfScores' => [],
- 'rawScores' => [],
+ 'rawVectorScores' => [],
+ 'rawKeywordScores' => [],
+ 'titleMetadataBoosts' => $core['title_metadata_boosts'],
+ 'titleMetadataDocBoosts' => $core['title_metadata_doc_boosts'],
'threshold' => $core['threshold'],
'catalogBlock' => null,
];
@@ -273,107 +437,272 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
'intent' => $salesIntent,
'isListQuery' => $core['is_list_query'],
'selectionMode' => $selectionMode,
+ 'scopeMode' => $core['scope_mode'],
+ 'cleanQuery' => $core['clean_query'],
+ 'semanticQuery' => $core['semantic_query'],
+ 'secondaryVectorQuery' => $core['secondary_vector_query'],
+ 'lexicalQuery' => $core['lexical_query'],
+ 'tagCandidateDocIds' => $core['tag_candidate_doc_ids'],
+ 'softDocumentCandidateDocIds' => $core['soft_document_candidate_doc_ids'],
+ 'pseudoScopeDocIds' => $core['pseudo_scope_doc_ids'],
+ 'globalHitCount' => $core['global_hit_count'],
+ 'scopedHitCount' => $core['scoped_hit_count'],
+ 'globalVectorHitCount' => $core['global_vector_hit_count'],
+ 'globalPrimaryVectorHitCount' => $core['global_primary_vector_hit_count'],
+ 'globalSecondaryVectorHitCount' => $core['global_secondary_vector_hit_count'],
+ 'globalKeywordHitCount' => $core['global_keyword_hit_count'],
+ 'scopedVectorHitCount' => $core['scoped_vector_hit_count'],
+ 'scopedPrimaryVectorHitCount' => $core['scoped_primary_vector_hit_count'],
+ 'scopedSecondaryVectorHitCount' => $core['scoped_secondary_vector_hit_count'],
+ 'scopedKeywordHitCount' => $core['scoped_keyword_hit_count'],
+ 'scopedBoostFactor' => max(
+ $core['scoped_vector_boost_factor'],
+ $core['secondary_scoped_vector_boost_factor'],
+ $core['scoped_keyword_boost_factor']
+ ),
+ 'scopedVectorBoostFactor' => $core['scoped_vector_boost_factor'],
+ 'secondaryScopedVectorBoostFactor' => $core['secondary_scoped_vector_boost_factor'],
+ 'scopedKeywordBoostFactor' => $core['scoped_keyword_boost_factor'],
'selectedChunkIds' => $selectedChunkIds,
'rows' => $core['rows'],
'rrfScores' => $core['rrf_scores'],
- 'rawScores' => $core['raw_scores'],
+ 'rawVectorScores' => $core['raw_vector_scores'],
+ 'rawKeywordScores' => $core['raw_keyword_scores'],
+ 'titleMetadataBoosts' => $core['title_metadata_boosts'],
+ 'titleMetadataDocBoosts' => $core['title_metadata_doc_boosts'],
'threshold' => $core['threshold'],
'catalogBlock' => null,
];
}
- // =========================================================
- // CORE PIPELINE
- // =========================================================
-
/**
* Executes the actual hybrid retrieval logic.
*
- * Steps:
- * - derive limits from config within hard safety caps
- * - detect whether the prompt is a "list query"
- * - clean and enrich the prompt
- * - compute threshold + vector topK based on intent/query type
- * - route query into candidate document ids via tag routing
- * - run global and optional scoped vector search
- * - fuse hits
- * - resolve chunk ids to chunk rows
- *
* @throws Exception
*/
private function runCore(
- string $prompt,
+ string $prompt,
ModelGenerationConfig $config,
- bool $withScores,
- string $salesIntent
- ): array
- {
+ bool $withScores,
+ string $salesIntent
+ ): array {
$limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS));
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
$isListQuery = $this->intentLite->isListQuery($prompt);
$cleanQuery = $this->queryCleaner->clean($prompt);
- $cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery);
if ($cleanQuery === '') {
return [
'limit' => $limit,
'is_list_query' => $isListQuery,
'threshold' => NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD,
+ 'clean_query' => '',
+ 'semantic_query' => '',
+ 'secondary_vector_query' => '',
+ 'lexical_query' => '',
+ 'scope_mode' => 'none',
+ 'tag_candidate_doc_ids' => [],
+ 'soft_document_candidate_doc_ids' => [],
+ 'pseudo_scope_doc_ids' => [],
+ 'global_hit_count' => 0,
+ 'scoped_hit_count' => 0,
+ 'global_vector_hit_count' => 0,
+ 'global_primary_vector_hit_count' => 0,
+ 'global_secondary_vector_hit_count' => 0,
+ 'global_keyword_hit_count' => 0,
+ 'scoped_vector_hit_count' => 0,
+ 'scoped_primary_vector_hit_count' => 0,
+ 'scoped_secondary_vector_hit_count' => 0,
+ 'scoped_keyword_hit_count' => 0,
+ 'scoped_vector_boost_factor' => 0.0,
+ 'secondary_scoped_vector_boost_factor' => 0.0,
+ 'scoped_keyword_boost_factor' => 0.0,
'ranked_chunk_ids' => [],
'rows' => [],
'rrf_scores' => [],
- 'raw_scores' => [],
+ 'raw_vector_scores' => [],
+ 'raw_keyword_scores' => [],
+ 'title_metadata_boosts' => [],
+ 'title_metadata_doc_boosts' => [],
];
}
+ $semanticQuery = $this->queryEnricher->enrichPrompt($cleanQuery);
+ $secondaryVectorQuery = $cleanQuery !== $semanticQuery ? $cleanQuery : '';
+ $lexicalQuery = $cleanQuery;
+
[$threshold, $topK] = $this->computeThresholdAndTopK(
$salesIntent,
$isListQuery,
$vectorTopKBase
);
- $candidateDocIds = $this->tagRouting->route($cleanQuery);
- $candidateDocIds = is_array($candidateDocIds)
+ $tagCandidateDocIds = $this->tagRouting->route($semanticQuery);
+ $tagCandidateDocIds = is_array($tagCandidateDocIds)
? array_values(array_unique(array_filter(
- $candidateDocIds,
+ $tagCandidateDocIds,
static fn(mixed $value): bool => is_string($value) && $value !== ''
)))
: [];
- $globalHits = $this->vectorClient->search($cleanQuery, $topK);
+ $globalPrimaryVectorHits = $this->vectorClient->search($semanticQuery, $topK);
+ $globalSecondaryVectorHits = $secondaryVectorQuery !== ''
+ ? $this->vectorClient->search($secondaryVectorQuery, $topK)
+ : [];
+ $globalKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK);
- $scopedHits = [];
- if ($candidateDocIds !== []) {
- $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
+ $softDocumentCandidateDocIds = [];
+ $pseudoScopeDocIds = [];
+ $scopeMode = 'none';
+
+ $scopedVectorBoostFactor = 0.0;
+ $secondaryScopedVectorBoostFactor = 0.0;
+ $scopedKeywordBoostFactor = 0.0;
+
+ $scopedPrimaryVectorHits = [];
+ $scopedSecondaryVectorHits = [];
+ $scopedKeywordHits = [];
+
+ if ($tagCandidateDocIds !== []) {
+ $scopeMode = 'tag_routing';
+ $scopedVectorBoostFactor = self::TAG_SCOPED_VECTOR_BOOST;
+ $secondaryScopedVectorBoostFactor = self::TAG_SCOPED_VECTOR_BOOST * self::SECONDARY_SCOPED_VECTOR_MULTIPLIER;
+ $scopedKeywordBoostFactor = self::TAG_SCOPED_LEXICAL_BOOST;
+
+ $scopedPrimaryVectorHits = $this->vectorClient->searchScoped($semanticQuery, $topK, $tagCandidateDocIds);
+ $scopedSecondaryVectorHits = $secondaryVectorQuery !== ''
+ ? $this->vectorClient->searchScoped($secondaryVectorQuery, $topK, $tagCandidateDocIds)
+ : [];
+ $scopedKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK, $tagCandidateDocIds);
+ } else {
+ $softDocumentCandidateDocIds = $this->deriveSoftDocumentCandidateDocIds($globalKeywordHits);
+
+ if ($softDocumentCandidateDocIds !== []) {
+ $scopeMode = 'soft_document_candidate';
+ $scopedVectorBoostFactor = self::SOFT_DOC_SCOPED_VECTOR_BOOST;
+ $secondaryScopedVectorBoostFactor = self::SOFT_DOC_SCOPED_VECTOR_BOOST * self::SECONDARY_SCOPED_VECTOR_MULTIPLIER;
+ $scopedKeywordBoostFactor = self::SOFT_DOC_SCOPED_LEXICAL_BOOST;
+
+ $scopedPrimaryVectorHits = $this->vectorClient->searchScoped($semanticQuery, $topK, $softDocumentCandidateDocIds);
+ $scopedSecondaryVectorHits = $secondaryVectorQuery !== ''
+ ? $this->vectorClient->searchScoped($secondaryVectorQuery, $topK, $softDocumentCandidateDocIds)
+ : [];
+ $scopedKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK, $softDocumentCandidateDocIds);
+ } else {
+ $pseudoScopeDocIds = $this->derivePseudoScopeDocumentIds($globalPrimaryVectorHits);
+
+ if ($pseudoScopeDocIds !== []) {
+ $scopeMode = 'pseudo_scope';
+ $scopedVectorBoostFactor = self::PSEUDO_SCOPED_VECTOR_BOOST;
+ $secondaryScopedVectorBoostFactor = self::PSEUDO_SCOPED_VECTOR_BOOST * self::SECONDARY_SCOPED_VECTOR_MULTIPLIER;
+ $scopedKeywordBoostFactor = self::PSEUDO_SCOPED_LEXICAL_BOOST;
+
+ $scopedPrimaryVectorHits = $this->vectorClient->searchScoped($semanticQuery, $topK, $pseudoScopeDocIds);
+ $scopedSecondaryVectorHits = $secondaryVectorQuery !== ''
+ ? $this->vectorClient->searchScoped($secondaryVectorQuery, $topK, $pseudoScopeDocIds)
+ : [];
+ $scopedKeywordHits = $this->keywordRetriever->search($lexicalQuery, $topK, $pseudoScopeDocIds);
+ }
+ }
}
- if ($globalHits === [] && $scopedHits === []) {
+ if (
+ $globalPrimaryVectorHits === []
+ && $globalSecondaryVectorHits === []
+ && $globalKeywordHits === []
+ && $scopedPrimaryVectorHits === []
+ && $scopedSecondaryVectorHits === []
+ && $scopedKeywordHits === []
+ ) {
return [
'limit' => $limit,
'is_list_query' => $isListQuery,
'threshold' => $threshold,
+ 'clean_query' => $cleanQuery,
+ 'semantic_query' => $semanticQuery,
+ 'secondary_vector_query' => $secondaryVectorQuery,
+ 'lexical_query' => $lexicalQuery,
+ 'scope_mode' => $scopeMode,
+ 'tag_candidate_doc_ids' => $tagCandidateDocIds,
+ 'soft_document_candidate_doc_ids' => $softDocumentCandidateDocIds,
+ 'pseudo_scope_doc_ids' => $pseudoScopeDocIds,
+ 'global_hit_count' => 0,
+ 'scoped_hit_count' => 0,
+ 'global_vector_hit_count' => 0,
+ 'global_primary_vector_hit_count' => 0,
+ 'global_secondary_vector_hit_count' => 0,
+ 'global_keyword_hit_count' => 0,
+ 'scoped_vector_hit_count' => 0,
+ 'scoped_primary_vector_hit_count' => 0,
+ 'scoped_secondary_vector_hit_count' => 0,
+ 'scoped_keyword_hit_count' => 0,
+ 'scoped_vector_boost_factor' => $scopedVectorBoostFactor,
+ 'secondary_scoped_vector_boost_factor' => $secondaryScopedVectorBoostFactor,
+ 'scoped_keyword_boost_factor' => $scopedKeywordBoostFactor,
'ranked_chunk_ids' => [],
'rows' => [],
'rrf_scores' => [],
- 'raw_scores' => [],
+ 'raw_vector_scores' => [],
+ 'raw_keyword_scores' => [],
+ 'title_metadata_boosts' => [],
+ 'title_metadata_doc_boosts' => [],
];
}
- $fused = $this->fuseHits(
- $globalHits,
- $scopedHits,
- $threshold,
- $scopedHits !== [],
- $withScores
- );
+ $fused = $this->fuseHitSources([
+ [
+ 'hits' => $globalPrimaryVectorHits,
+ 'threshold' => $threshold,
+ 'boost' => 1.0,
+ 'bucket' => 'vector',
+ ],
+ [
+ 'hits' => $globalSecondaryVectorHits,
+ 'threshold' => $threshold,
+ 'boost' => self::SECONDARY_GLOBAL_VECTOR_BOOST,
+ 'bucket' => 'vector',
+ ],
+ [
+ 'hits' => $globalKeywordHits,
+ 'threshold' => self::LEXICAL_SCORE_THRESHOLD,
+ 'boost' => self::GLOBAL_LEXICAL_BOOST,
+ 'bucket' => 'keyword',
+ ],
+ [
+ 'hits' => $scopedPrimaryVectorHits,
+ 'threshold' => $threshold,
+ 'boost' => $scopedVectorBoostFactor,
+ 'bucket' => 'vector',
+ ],
+ [
+ 'hits' => $scopedSecondaryVectorHits,
+ 'threshold' => $threshold,
+ 'boost' => $secondaryScopedVectorBoostFactor,
+ 'bucket' => 'vector',
+ ],
+ [
+ 'hits' => $scopedKeywordHits,
+ 'threshold' => self::LEXICAL_SCORE_THRESHOLD,
+ 'boost' => $scopedKeywordBoostFactor,
+ 'bucket' => 'keyword',
+ ],
+ ], $withScores);
$rrfScores = $fused['rrf_scores'];
- $rawScores = $fused['raw_scores'];
+ $rawVectorScores = $fused['raw_vector_scores'];
+ $rawKeywordScores = $fused['raw_keyword_scores'];
- if ($rrfScores === [] && $globalHits !== []) {
- $rrfScores = $this->fallbackRrfFromHits($globalHits);
+ if ($rrfScores === []) {
+ $rrfScores = $this->fallbackRrfFromSources(
+ $globalPrimaryVectorHits,
+ $globalSecondaryVectorHits,
+ $globalKeywordHits,
+ $scopedPrimaryVectorHits,
+ $scopedSecondaryVectorHits,
+ $scopedKeywordHits
+ );
}
if ($rrfScores === []) {
@@ -381,13 +710,45 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
'limit' => $limit,
'is_list_query' => $isListQuery,
'threshold' => $threshold,
+ 'clean_query' => $cleanQuery,
+ 'semantic_query' => $semanticQuery,
+ 'secondary_vector_query' => $secondaryVectorQuery,
+ 'lexical_query' => $lexicalQuery,
+ 'scope_mode' => $scopeMode,
+ 'tag_candidate_doc_ids' => $tagCandidateDocIds,
+ 'soft_document_candidate_doc_ids' => $softDocumentCandidateDocIds,
+ 'pseudo_scope_doc_ids' => $pseudoScopeDocIds,
+ 'global_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits) + count($globalKeywordHits),
+ 'scoped_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits) + count($scopedKeywordHits),
+ 'global_vector_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits),
+ 'global_primary_vector_hit_count' => count($globalPrimaryVectorHits),
+ 'global_secondary_vector_hit_count' => count($globalSecondaryVectorHits),
+ 'global_keyword_hit_count' => count($globalKeywordHits),
+ 'scoped_vector_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits),
+ 'scoped_primary_vector_hit_count' => count($scopedPrimaryVectorHits),
+ 'scoped_secondary_vector_hit_count' => count($scopedSecondaryVectorHits),
+ 'scoped_keyword_hit_count' => count($scopedKeywordHits),
+ 'scoped_vector_boost_factor' => $scopedVectorBoostFactor,
+ 'secondary_scoped_vector_boost_factor' => $secondaryScopedVectorBoostFactor,
+ 'scoped_keyword_boost_factor' => $scopedKeywordBoostFactor,
'ranked_chunk_ids' => [],
'rows' => [],
'rrf_scores' => [],
- 'raw_scores' => $rawScores,
+ 'raw_vector_scores' => $rawVectorScores,
+ 'raw_keyword_scores' => $rawKeywordScores,
+ 'title_metadata_boosts' => [],
+ 'title_metadata_doc_boosts' => [],
];
}
+ $rows = $this->lookup->findByChunkIds(array_keys($rrfScores));
+
+ [$rrfScores, $titleMetadataBoosts, $titleMetadataDocBoosts] = $this->applyTitleMetadataBoosts(
+ $rrfScores,
+ $rows,
+ $lexicalQuery
+ );
+
arsort($rrfScores);
$rankedChunkIds = array_keys($rrfScores);
@@ -397,22 +758,38 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
'limit' => $limit,
'is_list_query' => $isListQuery,
'threshold' => $threshold,
+ 'clean_query' => $cleanQuery,
+ 'semantic_query' => $semanticQuery,
+ 'secondary_vector_query' => $secondaryVectorQuery,
+ 'lexical_query' => $lexicalQuery,
+ 'scope_mode' => $scopeMode,
+ 'tag_candidate_doc_ids' => $tagCandidateDocIds,
+ 'soft_document_candidate_doc_ids' => $softDocumentCandidateDocIds,
+ 'pseudo_scope_doc_ids' => $pseudoScopeDocIds,
+ 'global_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits) + count($globalKeywordHits),
+ 'scoped_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits) + count($scopedKeywordHits),
+ 'global_vector_hit_count' => count($globalPrimaryVectorHits) + count($globalSecondaryVectorHits),
+ 'global_primary_vector_hit_count' => count($globalPrimaryVectorHits),
+ 'global_secondary_vector_hit_count' => count($globalSecondaryVectorHits),
+ 'global_keyword_hit_count' => count($globalKeywordHits),
+ 'scoped_vector_hit_count' => count($scopedPrimaryVectorHits) + count($scopedSecondaryVectorHits),
+ 'scoped_primary_vector_hit_count' => count($scopedPrimaryVectorHits),
+ 'scoped_secondary_vector_hit_count' => count($scopedSecondaryVectorHits),
+ 'scoped_keyword_hit_count' => count($scopedKeywordHits),
+ 'scoped_vector_boost_factor' => $scopedVectorBoostFactor,
+ 'secondary_scoped_vector_boost_factor' => $secondaryScopedVectorBoostFactor,
+ 'scoped_keyword_boost_factor' => $scopedKeywordBoostFactor,
'ranked_chunk_ids' => $rankedChunkIds,
'rows' => $rows,
'rrf_scores' => $rrfScores,
- 'raw_scores' => $rawScores,
+ 'raw_vector_scores' => $rawVectorScores,
+ 'raw_keyword_scores' => $rawKeywordScores,
+ 'title_metadata_boosts' => $titleMetadataBoosts,
+ 'title_metadata_doc_boosts' => $titleMetadataDocBoosts,
];
}
- // =========================================================
- // SUPPORT
- // =========================================================
- /**
- * Loads the active model generation config.
- *
- * Retrieval is not allowed to proceed without an active config.
- */
private function requireConfig(): ModelGenerationConfig
{
$config = $this->configRepository->findActiveForModel();
@@ -424,32 +801,18 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return $config;
}
- /**
- * Extracts the normalized sales intent string from the intent detector.
- *
- * Falls back to DISCOVERY when the detector payload is incomplete.
- */
private function detectSalesIntent(string $prompt): string
{
$data = $this->salesIntentLite->detect($prompt);
- return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY);
+ return (string) ($data['intent'] ?? SalesIntentLite::DISCOVERY);
}
- /**
- * Computes retrieval threshold and vector topK.
- *
- * Rules:
- * - objection/pricing intents are slightly stricter
- * - list queries are allowed to retrieve a wider candidate set
- * - all values are clamped to global hard limits
- */
private function computeThresholdAndTopK(
string $salesIntent,
- bool $isListQuery,
- int $vectorTopKBase
- ): array
- {
+ bool $isListQuery,
+ int $vectorTopKBase
+ ): array {
$threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD;
$topK = $vectorTopKBase;
@@ -461,7 +824,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
}
if ($isListQuery) {
- $topK = (int)round($topK * NdjsonHybridRetrieverConfig::LIST_BONUS);
+ $topK = (int) round($topK * NdjsonHybridRetrieverConfig::LIST_BONUS);
}
$topK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
@@ -474,26 +837,175 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
}
/**
- * Fuses multiple hit lists into one RRF-style score map.
- *
- * Notes:
- * - only hits above threshold are considered
- * - rank position within each hit list contributes to the final score
- * - scoped hits can be boosted
- * - raw scores are optionally captured for debug output
+ * @param array<int, array<string, mixed>> $globalKeywordHits
+ * @return string[]
*/
- private function fuseHits(
- array $globalHits,
- array $scopedHits,
- float $threshold,
- bool $boostScoped,
- bool $captureRaw
- ): array
+    private function deriveSoftDocumentCandidateDocIds(array $globalKeywordHits): array
+    {
+        $selected = [];
+
+        $rankedDocuments = $this->rankDocumentsByHitDensity(
+            $globalKeywordHits,
+            self::SOFT_DOC_CANDIDATE_WINDOW
+        );
+
+        foreach ($rankedDocuments as $document) {
+            // A document qualifies either by hit count inside the window, or by
+            // owning the very first keyword hit with a sufficiently high score.
+            $hasEnoughHits = $document['count'] >= self::SOFT_DOC_CANDIDATE_MIN_DOC_HITS;
+            $ownsStrongTopHit = $document['best_rank'] === 0
+                && $document['best_score'] >= self::SOFT_DOC_TOP_SCORE_MIN;
+
+            if (!$hasEnoughHits && !$ownsStrongTopHit) {
+                continue;
+            }
+
+            $selected[] = $document['document_id'];
+
+            if (count($selected) >= self::SOFT_DOC_CANDIDATE_MAX_DOCS) {
+                break;
+            }
+        }
+
+        return $selected;
+    }
+
+    /**
+     * Picks the documents that occur often enough among the leading global
+     * primary vector hits to be used as a scope for the scoped searches.
+     *
+     * @param array<int, array<string, mixed>> $globalPrimaryVectorHits
+     * @return string[]
+     */
+    private function derivePseudoScopeDocumentIds(array $globalPrimaryVectorHits): array
+    {
+        $selected = [];
+
+        $rankedDocuments = $this->rankDocumentsByHitDensity(
+            $globalPrimaryVectorHits,
+            self::PSEUDO_SCOPE_GLOBAL_WINDOW
+        );
+
+        foreach ($rankedDocuments as $document) {
+            if ($document['count'] < self::PSEUDO_SCOPE_MIN_DOC_HITS) {
+                continue;
+            }
+
+            $selected[] = $document['document_id'];
+
+            if (count($selected) >= self::PSEUDO_SCOPE_MAX_DOCS) {
+                break;
+            }
+        }
+
+        return $selected;
+    }
+
+    /**
+     * Groups the first $window hits by document and orders the documents by
+     * hit count (descending), then best score (descending), then best rank
+     * (ascending).
+     *
+     * Hits without a non-empty string "document_id" are ignored; a missing or
+     * non-numeric "score" counts as 0.0.
+     *
+     * @param array<int, array<string, mixed>> $hits
+     * @return list<array{document_id: string, count: int, best_rank: int, best_score: float}>
+     */
+    private function rankDocumentsByHitDensity(array $hits, int $window): array
+    {
+        $stats = [];
+
+        // array_values() guarantees positional integer ranks even when the
+        // hit list arrives with string keys (array_slice() preserves those).
+        foreach (array_values(array_slice($hits, 0, $window)) as $rank => $hit) {
+            $documentId = $hit['document_id'] ?? null;
+
+            if (!is_string($documentId) || $documentId === '') {
+                continue;
+            }
+
+            $score = isset($hit['score']) && is_numeric($hit['score'])
+                ? (float) $hit['score']
+                : 0.0;
+
+            // Ranks only increase while iterating, so the rank recorded on the
+            // first sighting of a document is already its best rank.
+            $stats[$documentId] ??= [
+                'document_id' => $documentId,
+                'count' => 0,
+                'best_rank' => $rank,
+                'best_score' => $score,
+            ];
+
+            $stats[$documentId]['count']++;
+            $stats[$documentId]['best_score'] = max($stats[$documentId]['best_score'], $score);
+        }
+
+        uasort($stats, static function (array $a, array $b): int {
+            if ($a['count'] !== $b['count']) {
+                return $b['count'] <=> $a['count'];
+            }
+
+            // Scores closer than 1e-6 are treated as tied and fall through to rank.
+            if (abs($a['best_score'] - $b['best_score']) > 0.000001) {
+                return $b['best_score'] <=> $a['best_score'];
+            }
+
+            return $a['best_rank'] <=> $b['best_rank'];
+        });
+
+        return array_values($stats);
+    }
+
+ /**
+ * @param array<int, array{
+ * hits: array<int, array<string, mixed>>,
+ * threshold: float,
+ * boost: float,
+ * bucket: string
+ * }> $sources
+ * @return array{
+ * rrf_scores: array<string, float>,
+ * raw_vector_scores: array<string, float>,
+ * raw_keyword_scores: array<string, float>
+ * }
+ */
+ private function fuseHitSources(array $sources, bool $captureRaw): array
{
$rrfScores = [];
- $rawScores = [];
+ $rawVectorScores = [];
+ $rawKeywordScores = [];
+
+ foreach ($sources as $source) {
+ $hits = $source['hits'];
+ $threshold = (float) $source['threshold'];
+ $boost = max(0.0, (float) $source['boost']);
+ $bucket = (string) $source['bucket'];
+
+ if ($hits === [] || $boost <= 0.0) {
+ continue;
+ }
- $apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void {
$rank = 0;
foreach ($hits as $hit) {
@@ -501,85 +1013,83 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
continue;
}
- $raw = (float)$hit['score'];
+ $raw = (float) $hit['score'];
if ($raw < $threshold) {
continue;
}
- $chunkId = (string)$hit['chunk_id'];
+ $chunkId = (string) $hit['chunk_id'];
if ($captureRaw) {
- $rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw);
+ if ($bucket === 'vector') {
+ $rawVectorScores[$chunkId] = max($rawVectorScores[$chunkId] ?? 0.0, $raw);
+ } elseif ($bucket === 'keyword') {
+ $rawKeywordScores[$chunkId] = max($rawKeywordScores[$chunkId] ?? 0.0, $raw);
+ }
}
$rank++;
$rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
-
- if ($boost) {
- $rrf *= 1.2;
- }
+ $rrf *= $boost;
$rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
}
- };
-
- $apply($globalHits, false);
- $apply($scopedHits, $boostScoped);
+ }
return [
'rrf_scores' => $rrfScores,
- 'raw_scores' => $rawScores,
+ 'raw_vector_scores' => $rawVectorScores,
+ 'raw_keyword_scores' => $rawKeywordScores,
];
}
/**
- * Builds a fallback RRF ranking purely from hit order.
- *
- * Used when thresholding removed all fused candidates but
- * the global hit list itself still exists.
+ * @param array<int, array<string, mixed>> ...$sourceLists
+ * @return array<string, float>
*/
- private function fallbackRrfFromHits(array $hits): array
+ private function fallbackRrfFromSources(array ...$sourceLists): array
{
- $rrf = [];
- $rank = 0;
+ foreach ($sourceLists as $hits) {
+ $rrf = [];
+ $rank = 0;
- foreach ($hits as $hit) {
- if (!isset($hit['chunk_id'])) {
- continue;
+ foreach ($hits as $hit) {
+ if (!isset($hit['chunk_id'])) {
+ continue;
+ }
+
+ $rank++;
+ $rrf[(string) $hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
+
+ if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) {
+ break;
+ }
}
- $rank++;
- $rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
-
- if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) {
- break;
+ if ($rrf !== []) {
+ return $rrf;
}
}
- return $rrf;
+ return [];
}
/**
- * Selects a coherent chunk window from one exact document title match.
- *
- * For exact product questions we prefer a pure document slice over
- * cross-document fusion to avoid mixing neighbouring product families.
- *
* @param array<array-key, array<string, mixed>> $rows
* @return string[]
*/
private function selectExactDocumentChunkIds(array $rows, int $limit): array
{
uasort($rows, static function (array $a, array $b): int {
- $aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX;
- $bIndex = is_int($b['chunk_index'] ?? null) ? (int)$b['chunk_index'] : PHP_INT_MAX;
+ $aIndex = is_int($a['chunk_index'] ?? null) ? (int) $a['chunk_index'] : PHP_INT_MAX;
+ $bIndex = is_int($b['chunk_index'] ?? null) ? (int) $b['chunk_index'] : PHP_INT_MAX;
if ($aIndex !== $bIndex) {
return $aIndex <=> $bIndex;
}
- return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? ''));
+ return strcmp((string) ($a['chunk_id'] ?? ''), (string) ($b['chunk_id'] ?? ''));
});
$selected = [];
@@ -587,7 +1097,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
foreach ($rows as $row) {
$chunkId = $row['chunk_id'] ?? null;
- $text = trim((string)($row['text'] ?? ''));
+ $text = trim((string) ($row['text'] ?? ''));
if (!is_string($chunkId) || $chunkId === '' || $text === '') {
continue;
@@ -604,10 +1114,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
}
/**
- * Builds synthetic scores for exact-title fast-path selections.
- *
- * These scores are only used for debug output consistency.
- *
* @param string[] $chunkIds
* @return array<string, float>
*/
@@ -616,20 +1122,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$scores = [];
foreach (array_values($chunkIds) as $rank => $chunkId) {
- $scores[(string)$chunkId] = 1.0 / (1 + $rank);
+ $scores[(string) $chunkId] = 1.0 / (1 + $rank);
}
return $scores;
}
- /**
- * Selection strategy for list-style queries.
- *
- * Goal:
- * - avoid near-identical chunks
- * - prefer diverse list entries
- * - stop once the configured limit is reached
- */
private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
{
$seen = [];
@@ -640,19 +1138,19 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
continue;
}
- $chunk = trim((string)$rows[$id]['text']);
+ $chunk = trim((string) $rows[$id]['text']);
if ($chunk === '') {
continue;
}
- $key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)));
+ $key = md5(mb_strtolower((string) (preg_replace('/\s+/u', ' ', $chunk) ?? $chunk)));
if (isset($seen[$key])) {
continue;
}
$seen[$key] = true;
- $out[] = (string)$id;
+ $out[] = (string) $id;
if (count($out) >= $limit) {
break;
@@ -662,23 +1160,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return $out;
}
- /**
- * Selection strategy for sales-oriented queries.
- *
- * Modes:
- * - exact_document_title:
- * used when the prompt clearly contains one exact document title
- * and the answer should stay strictly within that document
- *
- * - sales_dominant_document:
- * used when one document clearly dominates the top hit window
- * and coherent neighbouring chunks from that document are more
- * useful than cross-document spread
- *
- * - sales_spread:
- * default mode that spreads chunks across documents and enforces
- * distance between chunk positions of the same document
- */
private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
{
$dominantDocId = $this->detectDominantTopDocument($chunkIds, $rows);
@@ -710,13 +1191,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
];
}
- /**
- * Detects whether one document clearly dominates the first ranked window.
- *
- * This is especially useful for product-sheet style documents where
- * several adjacent chunks belong together and should be passed to the model
- * as one coherent factual block.
- */
private function detectDominantTopDocument(array $chunkIds, array $rows): ?string
{
$docWindow = [];
@@ -726,7 +1200,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
continue;
}
- $text = trim((string)$rows[$chunkId]['text']);
+ $text = trim((string) $rows[$chunkId]['text']);
$docId = $rows[$chunkId]['document_id'] ?? null;
if ($text === '' || !is_string($docId) || $docId === '') {
@@ -749,7 +1223,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return null;
}
- $dominantCount = (int)($counts[$dominantDocId] ?? 0);
+ $dominantCount = (int) ($counts[$dominantDocId] ?? 0);
if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) {
return $dominantDocId;
@@ -765,21 +1239,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return null;
}
- /**
- * Selects a coherent chunk window from the dominant document.
- *
- * Strategy:
- * - use the highest-ranked chunk of that document as anchor
- * - prefer neighbouring chunk indices around that anchor
- * - sort the final selection by chunk index for prompt coherence
- */
private function selectDominantDocumentChunkIds(
string $documentId,
- array $chunkIds,
- array $rows,
- int $limit
- ): array
- {
+ array $chunkIds,
+ array $rows,
+ int $limit
+ ): array {
$docHits = [];
$anchorChunkIndex = null;
@@ -788,7 +1253,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
continue;
}
- $text = trim((string)$rows[$chunkId]['text']);
+ $text = trim((string) $rows[$chunkId]['text']);
$docId = $rows[$chunkId]['document_id'] ?? null;
if ($text === '' || $docId !== $documentId) {
@@ -803,7 +1268,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
}
$docHits[] = [
- 'id' => (string)$chunkId,
+ 'id' => (string) $chunkId,
'rank' => $rank,
'chunk_index' => $chunkIndex,
];
@@ -861,19 +1326,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
);
}
- /**
- * Fills the remaining sales slots after a dominant document selection.
- *
- * The already selected dominant-document chunks stay fixed.
- * Remaining slots are filled with the normal spread strategy.
- */
private function fillRemainingSalesChunkIds(
array $seedChunkIds,
array $chunkIds,
array $rows,
- int $limit
- ): array
- {
+ int $limit
+ ): array {
$out = array_values(array_unique(array_map('strval', $seedChunkIds)));
if (count($out) >= $limit) {
@@ -925,12 +1383,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
}
}
- $text = trim((string)$rows[$chunkId]['text']);
+ $text = trim((string) $rows[$chunkId]['text']);
if ($text === '') {
continue;
}
- $out[] = (string)$chunkId;
+ $out[] = (string) $chunkId;
$selected[$chunkId] = true;
$docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
@@ -946,14 +1404,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return $out;
}
- /**
- * Default spread selection for sales-oriented queries.
- *
- * Goal:
- * - avoid overloading the result with chunks from the same document
- * - avoid chunks that are too close to each other in the same document
- * - preserve top-ranked relevance while improving contextual spread
- */
private function selectSalesChunkIdsSpread(array $chunkIds, array $rows, int $limit): array
{
$out = [];
@@ -986,12 +1436,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$docChunkPositions[$docId][] = $chunkIndex;
}
- $text = trim((string)$rows[$chunkId]['text']);
+ $text = trim((string) $rows[$chunkId]['text']);
if ($text === '') {
continue;
}
- $out[] = (string)$chunkId;
+ $out[] = (string) $chunkId;
$docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
if (count($out) >= $limit) {
@@ -1002,9 +1452,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return $out;
}
- /**
- * Converts selected chunk ids into the final plain text result list.
- */
private function collectTextsFromIds(array $chunkIds, array $rows): array
{
$out = [];
@@ -1014,7 +1461,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
continue;
}
- $text = trim((string)$rows[$id]['text']);
+ $text = trim((string) $rows[$id]['text']);
if ($text !== '') {
$out[] = $text;
@@ -1023,4 +1470,233 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return $out;
}
+
+    /**
+     * Multiplies chunk scores by a per-document boost derived from title / metadata matches.
+     *
+     * The boost for a document is computed once, from the first row seen for that
+     * document id. Scores of chunks whose document has no positive boost are left
+     * untouched, so this only re-weights existing candidates and never adds new ones.
+     *
+     * @param array<array-key, float> $rrfScores chunk id => score
+     * @param array<array-key, array<string, mixed>> $rows chunk id => row data
+     * @return array{0: array<array-key, float>, 1: array<array-key, float>, 2: array<array-key, float>}
+     *         [adjusted scores, chunk id => applied boost, document id => computed boost]
+     */
+    private function applyTitleMetadataBoosts(array $rrfScores, array $rows, string $lexicalQuery): array
+    {
+        $unchanged = [$rrfScores, [], []];
+
+        $queryText = $this->normalizeForMatching($lexicalQuery);
+        $queryWords = $this->tokenizeNormalizedQuery($queryText);
+
+        $nothingToMatch = $queryText === '' || $queryWords === [];
+        if ($nothingToMatch || $rrfScores === [] || $rows === []) {
+            return $unchanged;
+        }
+
+        // One boost per document: only the first row of each document is evaluated.
+        $boostByDocument = [];
+        foreach ($rows as $candidate) {
+            $docId = $candidate['document_id'] ?? null;
+            $usable = is_string($docId) && $docId !== '';
+
+            if ($usable && !isset($boostByDocument[$docId])) {
+                $boostByDocument[$docId] = $this->computeDocumentMetadataBoost(
+                    $candidate,
+                    $queryText,
+                    $queryWords
+                );
+            }
+        }
+
+        if ($boostByDocument === []) {
+            return $unchanged;
+        }
+
+        $boostByChunk = [];
+        foreach ($rrfScores as $chunkId => $baseScore) {
+            $chunkRow = $rows[$chunkId] ?? null;
+            $docId = is_array($chunkRow) ? ($chunkRow['document_id'] ?? null) : null;
+
+            if (!is_string($docId) || $docId === '') {
+                continue;
+            }
+
+            $factor = $boostByDocument[$docId] ?? 0.0;
+
+            if ($factor > 0.0) {
+                $rrfScores[$chunkId] = $baseScore * (1.0 + $factor);
+                $boostByChunk[$chunkId] = $factor;
+            }
+        }
+
+        return [$rrfScores, $boostByChunk, $boostByDocument];
+    }
+
+    /**
+     * Computes the metadata boost for one row by matching the query against
+     * its title, file name and remaining descriptive metadata.
+     *
+     * Each field contributes a coverage-based amount capped by its own maximum;
+     * multi-word queries get an extra bonus when the whole phrase occurs in the
+     * title or file name. The sum is capped at MAX_TITLE_METADATA_BOOST.
+     *
+     * @param array<string, mixed> $row
+     * @param string[] $queryTokens
+     */
+    private function computeDocumentMetadataBoost(array $row, string $normalizedQuery, array $queryTokens): float
+    {
+        $field = fn (array $keys): string => $this->normalizeForMatching(
+            $this->extractMetadataString($row, $keys)
+        );
+
+        $title = $field(['document_title', 'title']);
+        $file = $field(['file_name', 'filename', 'original_filename', 'source_name', 'document_name']);
+        $meta = $field(['source_path', 'path', 'heading', 'section_title', 'category']);
+
+        $total = 0.0;
+
+        $titleShare = $this->computeNormalizedTokenCoverage($queryTokens, $title);
+        if ($titleShare > 0.0) {
+            $total += min(
+                self::TITLE_MATCH_MAX_BOOST,
+                self::TITLE_MATCH_BASE_BOOST + ($titleShare * self::TITLE_MATCH_MAX_BOOST)
+            );
+        }
+
+        $fileShare = $this->computeNormalizedTokenCoverage($queryTokens, $file);
+        if ($fileShare > 0.0) {
+            $total += min(
+                self::FILE_MATCH_MAX_BOOST,
+                self::FILE_MATCH_BASE_BOOST + ($fileShare * self::FILE_MATCH_MAX_BOOST)
+            );
+        }
+
+        $metaShare = $this->computeNormalizedTokenCoverage($queryTokens, $meta);
+        if ($metaShare > 0.0) {
+            $total += min(
+                self::META_MATCH_MAX_BOOST,
+                $metaShare * self::META_MATCH_MAX_BOOST
+            );
+        }
+
+        // Phrase bonus applies only to multi-word queries; padding with spaces
+        // forces the match onto word boundaries.
+        $isPhraseQuery = str_contains($normalizedQuery, ' ');
+        $paddedQuery = ' ' . $normalizedQuery . ' ';
+
+        if ($isPhraseQuery && $title !== '' && str_contains(' ' . $title . ' ', $paddedQuery)) {
+            $total += self::EXACT_TITLE_PHRASE_BOOST;
+        }
+
+        if ($isPhraseQuery && $file !== '' && str_contains(' ' . $file . ' ', $paddedQuery)) {
+            $total += self::EXACT_FILE_PHRASE_BOOST;
+        }
+
+        return min(self::MAX_TITLE_METADATA_BOOST, $total);
+    }
+
+    /**
+     * Returns the first non-blank string found under one of the preferred keys.
+     *
+     * Keys are tried in order; for each key the top-level row entry is checked
+     * before the nested `metadata` entry. The result is trimmed, and an empty
+     * string is returned when nothing matches.
+     *
+     * @param array<string, mixed> $row
+     * @param string[] $preferredKeys
+     */
+    private function extractMetadataString(array $row, array $preferredKeys): string
+    {
+        $nested = is_array($row['metadata'] ?? null) ? $row['metadata'] : [];
+
+        foreach ($preferredKeys as $key) {
+            foreach ([$row[$key] ?? null, $nested[$key] ?? null] as $candidate) {
+                if (!is_string($candidate)) {
+                    continue;
+                }
+
+                $trimmed = trim($candidate);
+                if ($trimmed !== '') {
+                    return $trimmed;
+                }
+            }
+        }
+
+        return '';
+    }
+
+    /**
+     * Returns the share of query tokens that occur as whole words in the haystack.
+     *
+     * The haystack is expected to be normalized (single-space separated); both
+     * sides are padded with spaces so that only whole-word hits count.
+     *
+     * @param string[] $queryTokens
+     */
+    private function computeNormalizedTokenCoverage(array $queryTokens, string $normalizedHaystack): float
+    {
+        if ($queryTokens === [] || $normalizedHaystack === '') {
+            return 0.0;
+        }
+
+        $padded = ' ' . $normalizedHaystack . ' ';
+
+        $hits = count(array_filter(
+            $queryTokens,
+            static fn ($word): bool => $word !== '' && str_contains($padded, ' ' . $word . ' ')
+        ));
+
+        return $hits < 1 ? 0.0 : $hits / max(1, count($queryTokens));
+    }
+
+    /**
+     * Splits a normalized query on whitespace into unique tokens of at least
+     * two characters, keeping first-occurrence order.
+     *
+     * @return string[]
+     */
+    private function tokenizeNormalizedQuery(string $normalizedQuery): array
+    {
+        if ($normalizedQuery === '') {
+            return [];
+        }
+
+        $unique = [];
+
+        foreach (preg_split('/\s+/u', $normalizedQuery, -1, PREG_SPLIT_NO_EMPTY) ?: [] as $word) {
+            $longEnough = mb_strlen($word, 'UTF-8') >= 2;
+
+            if ($longEnough && !in_array($word, $unique, true)) {
+                $unique[] = $word;
+            }
+        }
+
+        return $unique;
+    }
+
+    /**
+     * Lowercases the value and reduces it to letters and digits separated by
+     * single spaces, so phrases can be compared on word boundaries.
+     */
+    private function normalizeForMatching(string $value): string
+    {
+        $text = mb_strtolower(trim($value), 'UTF-8');
+
+        // First pass replaces every run of non letter/digit characters, second
+        // pass collapses whitespace; a failed regex leaves the text as it was.
+        foreach (['/[^\p{L}\p{N}]+/u', '/\s+/u'] as $pattern) {
+            $replaced = preg_replace($pattern, ' ', $text);
+
+            if ($replaced !== null) {
+                $text = $replaced;
+            }
+        }
+
+        return trim($text);
+    }
+
+    /**
+     * Returns the larger of two optional floats; null counts as "no value".
+     *
+     * @return float|null null only when both arguments are null
+     */
+    private function maxNullableFloat(?float $a, ?float $b): ?float
+    {
+        if ($a === null || $b === null) {
+            // At most one side carries a value here; pick whichever is set.
+            return $a ?? $b;
+        }
+
+        return max($a, $b);
+    }
}
\ No newline at end of file
diff --git a/src/Knowledge/Retrieval/NdjsonKeywordRetriever.php b/src/Knowledge/Retrieval/NdjsonKeywordRetriever.php
new file mode 100644
index 0000000..1d4c16a
--- /dev/null
+++ b/src/Knowledge/Retrieval/NdjsonKeywordRetriever.php
@@ -0,0 +1,451 @@
+
+ */
+    public function search(string $query, int $limit = 10, array $docIds = []): array
+    {
+        $maxResults = $this->clampLimit($limit);
+        ['tokens' => $terms, 'numeric_tokens' => $numericTerms] = $this->analyzeQuery($query);
+
+        // Nothing searchable is left after normalization / stop-word removal.
+        if ($terms === []) {
+            return [];
+        }
+
+        $connection = $this->openReadOnlyDb();
+
+        // A missing extension or index file yields no keyword hits instead of an error.
+        if (!$connection instanceof SQLite3) {
+            return [];
+        }
+
+        try {
+            $corpusSize = $this->loadTotalChunks($connection);
+            $postings = $this->loadPostings($connection, $terms, $docIds);
+
+            return $postings === []
+                ? []
+                : $this->scoreRows($postings, $terms, $numericTerms, $corpusSize, $maxResults);
+        } catch (\Throwable $failure) {
+            // Failures are logged and reported as "no hits".
+            $this->agentLogger->error('Keyword retriever failed', [
+                'error' => $failure->getMessage(),
+            ]);
+
+            return [];
+        } finally {
+            $connection->close();
+        }
+    }
+
+    /**
+     * Normalizes the query and derives the token lists used for lookup and scoring.
+     *
+     * Tokens are de-duplicated in first-occurrence order and capped at
+     * MAX_QUERY_TOKENS; numeric tokens are those containing at least one digit
+     * and are collected before the cap is applied.
+     *
+     * @return array{
+     *     normalized_query:string,
+     *     tokens:string[],
+     *     numeric_tokens:string[]
+     * }
+     */
+    private function analyzeQuery(string $query): array
+    {
+        $normalized = $this->normalizeText($query);
+
+        if ($normalized === '') {
+            return ['normalized_query' => '', 'tokens' => [], 'numeric_tokens' => []];
+        }
+
+        $words = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];
+
+        $kept = array_values(array_unique(array_filter(
+            $words,
+            fn (string $word): bool => !$this->shouldIgnoreToken($word)
+        )));
+
+        $numeric = array_values(array_filter(
+            $kept,
+            static fn (string $word): bool => preg_match('/\d/u', $word) === 1
+        ));
+
+        return [
+            'normalized_query' => $normalized,
+            'tokens' => array_slice($kept, 0, self::MAX_QUERY_TOKENS),
+            'numeric_tokens' => $numeric,
+        ];
+    }
+
+    /**
+     * Decides whether a query token is dropped before lookup.
+     *
+     * Tokens containing a digit are always kept; otherwise empty tokens,
+     * single-character tokens and stop words are ignored.
+     */
+    private function shouldIgnoreToken(string $token): bool
+    {
+        return match (true) {
+            $token === '' => true,
+            preg_match('/\d/u', $token) === 1 => false,
+            mb_strlen($token, 'UTF-8') < 2 => true,
+            default => StopWords::isStopWord($token),
+        };
+    }
+
+    /**
+     * Lowercases the value, treats hyphens, slashes and underscores as word
+     * separators, strips remaining punctuation and collapses whitespace.
+     */
+    private function normalizeText(string $value): string
+    {
+        $text = mb_strtolower(trim($value), 'UTF-8');
+        $text = str_replace(['-', '/', '_'], ' ', $text);
+
+        // A failed regex leaves the text from the previous step untouched.
+        foreach (['/[^\p{L}\p{N}\s]+/u', '/\s+/u'] as $pattern) {
+            $replaced = preg_replace($pattern, ' ', $text);
+
+            if ($replaced !== null) {
+                $text = $replaced;
+            }
+        }
+
+        return trim($text);
+    }
+
+    /**
+     * Opens the lexical index read-only.
+     *
+     * @return SQLite3|null null when the sqlite3 extension is missing, the index
+     *                      file does not exist, or opening it fails
+     */
+    private function openReadOnlyDb(): ?SQLite3
+    {
+        if (!class_exists(SQLite3::class)) {
+            $this->agentLogger->warning('Keyword retriever unavailable: sqlite3 extension missing.');
+
+            return null;
+        }
+
+        $indexFile = $this->getIndexPath();
+
+        if (!is_file($indexFile)) {
+            return null;
+        }
+
+        try {
+            $connection = new SQLite3($indexFile, SQLITE3_OPEN_READONLY);
+            $connection->busyTimeout(1000);
+        } catch (\Throwable $failure) {
+            $this->agentLogger->error('Unable to open lexical index', [
+                'path' => $indexFile,
+                'error' => $failure->getMessage(),
+            ]);
+
+            return null;
+        }
+
+        return $connection;
+    }
+
+    /**
+     * Builds the index file path from the project directory (trailing slashes
+     * removed) and the default relative index path.
+     */
+    private function getIndexPath(): string
+    {
+        $projectRoot = rtrim($this->projectDir, '/');
+
+        return $projectRoot . self::DEFAULT_RELATIVE_INDEX_PATH;
+    }
+
+    /**
+     * Reads the `total_chunks` value from lexical_meta.
+     *
+     * @return int the stored count, never less than 1; falls back to 1 when the
+     *             statement cannot be prepared or executed, or the key is absent
+     */
+    private function loadTotalChunks(SQLite3 $db): int
+    {
+        $statement = $db->prepare('SELECT value FROM lexical_meta WHERE key = :key');
+        if ($statement === false) {
+            return 1;
+        }
+
+        $statement->bindValue(':key', 'total_chunks', SQLITE3_TEXT);
+        $cursor = $statement->execute();
+
+        if ($cursor === false) {
+            return 1;
+        }
+
+        $record = $cursor->fetchArray(SQLITE3_ASSOC);
+        $cursor->finalize();
+
+        // fetchArray() yields false when no row exists; the coalesce covers that.
+        return max(1, (int) ($record['value'] ?? 0));
+    }
+
+ /**
+ * Loads all posting rows for the given tokens, optionally restricted to a set
+ * of document ids, joined with each token's document frequency.
+ *
+ * Returns an empty list when there are no tokens or when the statement cannot
+ * be prepared or executed. Rows with an empty chunk id, document id or token
+ * are dropped.
+ *
+ * @param string[] $tokens
+ * @param string[] $docIds
+ * @return array<int, array{token:string, chunk_id:string, document_id:string, chunk_index:?int, tf:int, title_tf:int, df:int}>
+ */
+ private function loadPostings(SQLite3 $db, array $tokens, array $docIds): array
+ {
+ if ($tokens === []) {
+ return [];
+ }
+
+ // One named placeholder per token (:t0, :t1, ...), keyed by the token's array key.
+ $tokenPlaceholders = [];
+ foreach (array_keys($tokens) as $i) {
+ $tokenPlaceholders[] = ':t' . $i;
+ }
+
+ $sql = '
+ SELECT
+ p.token,
+ p.chunk_id,
+ p.document_id,
+ p.chunk_index,
+ p.tf,
+ p.title_tf,
+ lt.df
+ FROM lexical_postings p
+ INNER JOIN lexical_terms lt ON lt.token = p.token
+ WHERE p.token IN (' . implode(', ', $tokenPlaceholders) . ')
+ ';
+
+ // Keep only non-empty string ids and reindex so placeholder names are :d0, :d1, ...
+ $docIds = array_values(array_unique(array_filter(
+ $docIds,
+ static fn (mixed $value): bool => is_string($value) && $value !== ''
+ )));
+
+ // The document filter is appended only when at least one valid id remains.
+ if ($docIds !== []) {
+ $docPlaceholders = [];
+ foreach (array_keys($docIds) as $i) {
+ $docPlaceholders[] = ':d' . $i;
+ }
+
+ $sql .= ' AND p.document_id IN (' . implode(', ', $docPlaceholders) . ')';
+ }
+
+ $stmt = $db->prepare($sql);
+
+ if ($stmt === false) {
+ return [];
+ }
+
+ foreach ($tokens as $i => $token) {
+ $stmt->bindValue(':t' . $i, $token, SQLITE3_TEXT);
+ }
+
+ foreach ($docIds as $i => $docId) {
+ $stmt->bindValue(':d' . $i, $docId, SQLITE3_TEXT);
+ }
+
+ $result = $stmt->execute();
+
+ if ($result === false) {
+ return [];
+ }
+
+ $rows = [];
+
+ while (($row = $result->fetchArray(SQLITE3_ASSOC)) !== false) {
+ $chunkId = (string) ($row['chunk_id'] ?? '');
+ $documentId = (string) ($row['document_id'] ?? '');
+ $token = (string) ($row['token'] ?? '');
+
+ if ($chunkId === '' || $documentId === '' || $token === '') {
+ continue;
+ }
+
+ // chunk_index is nullable in the result; only numeric values are converted.
+ $chunkIndex = null;
+ if (isset($row['chunk_index']) && is_numeric($row['chunk_index'])) {
+ $chunkIndex = (int) $row['chunk_index'];
+ }
+
+ // Clamp the counters: tf and df to at least 1, title_tf to at least 0.
+ $rows[] = [
+ 'token' => $token,
+ 'chunk_id' => $chunkId,
+ 'document_id' => $documentId,
+ 'chunk_index' => $chunkIndex,
+ 'tf' => max(1, (int) ($row['tf'] ?? 1)),
+ 'title_tf' => max(0, (int) ($row['title_tf'] ?? 0)),
+ 'df' => max(1, (int) ($row['df'] ?? 1)),
+ ];
+ }
+
+ $result->finalize();
+
+ return $rows;
+ }
+
+    /**
+     * Aggregates posting rows into per-chunk scores and returns the best chunks.
+     *
+     * Per posting, an IDF-style weight is multiplied by a capped term-frequency
+     * boost and a numeric-token boost, plus a bonus when the token also occurs in
+     * the title. Each chunk's sum is scaled by the share of query tokens it
+     * matched and then divided by the best score, so the top result scores 1.0.
+     *
+     * @param array<int, array{token:string, chunk_id:string, document_id:string, chunk_index:?int, tf:int, title_tf:int, df:int}> $rows
+     * @param string[] $queryTokens
+     * @param string[] $numericTokens
+     *
+     * @return array<int, array{chunk_id:string, score:float, document_id:?string, chunk_index:?int}>
+     *         sorted by descending score, at most $limit entries
+     */
+    private function scoreRows(
+        array $rows,
+        array $queryTokens,
+        array $numericTokens,
+        int $totalChunks,
+        int $limit
+    ): array {
+        if ($rows === []) {
+            return [];
+        }
+
+        $numericLookup = array_fill_keys($numericTokens, true);
+        $queryTokenCount = max(1, count($queryTokens));
+
+        $scores = [];
+        $meta = [];
+        $matchedTokens = [];
+
+        foreach ($rows as $row) {
+            $chunkId = $row['chunk_id'];
+            $token = $row['token'];
+
+            $idf = log(1.0 + ($totalChunks / max(1.0, (float) (1 + $row['df']))));
+            // Term frequency counts up to 3 occurrences only.
+            $tfBoost = 1.0 + (min(3, $row['tf']) * 0.20);
+            $numericBoost = isset($numericLookup[$token]) ? 1.60 : 1.0;
+            $titleBonus = $row['title_tf'] > 0 ? ($idf * 0.75) : 0.0;
+
+            $scores[$chunkId] = ($scores[$chunkId] ?? 0.0)
+                + ($idf * $tfBoost * $numericBoost)
+                + $titleBonus;
+
+            $matchedTokens[$chunkId][$token] = true;
+
+            $meta[$chunkId] ??= [
+                'document_id' => $row['document_id'],
+                'chunk_index' => $row['chunk_index'],
+            ];
+        }
+
+        // Chunks matching only part of the query keep between 65% and 100% of their score.
+        foreach ($scores as $chunkId => $score) {
+            $coverage = count($matchedTokens[$chunkId] ?? []) / $queryTokenCount;
+            $scores[$chunkId] = $score * (0.65 + (0.35 * $coverage));
+        }
+
+        arsort($scores);
+
+        $topScore = (float) reset($scores);
+        if ($topScore <= 0.0) {
+            return [];
+        }
+
+        $out = [];
+
+        foreach ($scores as $chunkId => $score) {
+            $out[] = [
+                // PHP turns numeric-string array keys such as "123" into integers;
+                // cast back so callers always receive the chunk id as a string.
+                'chunk_id' => (string) $chunkId,
+                'score' => round($score / $topScore, 6),
+                'document_id' => $meta[$chunkId]['document_id'] ?? null,
+                'chunk_index' => $meta[$chunkId]['chunk_index'] ?? null,
+            ];
+
+            if (count($out) >= $limit) {
+                break;
+            }
+        }
+
+        return $out;
+    }
+
+    /**
+     * Restricts the requested result limit to the range 1..MAX_LIMIT.
+     */
+    private function clampLimit(int $limit): int
+    {
+        return match (true) {
+            $limit < 1 => 1,
+            $limit > self::MAX_LIMIT => self::MAX_LIMIT,
+            default => $limit,
+        };
+    }
+}
\ No newline at end of file
diff --git a/src/Knowledge/Retrieval/NdjsonLexicalIndexBuilder.php b/src/Knowledge/Retrieval/NdjsonLexicalIndexBuilder.php
new file mode 100644
index 0000000..c83f1f7
--- /dev/null
+++ b/src/Knowledge/Retrieval/NdjsonLexicalIndexBuilder.php
@@ -0,0 +1,528 @@
+assertSqliteAvailable();
+
+ $indexNdjsonPath = $this->getIndexNdjsonPath();
+ $lexicalIndexPath = $this->getLexicalIndexPath();
+ $tmpPath = $lexicalIndexPath . '.tmp';
+
+ if (!is_file($indexNdjsonPath) || filesize($indexNdjsonPath) === 0) {
+ $this->removeFileIfExists($lexicalIndexPath);
+ $this->removeFileIfExists($tmpPath);
+
+ $this->agentLogger->info('Lexical index skipped because index.ndjson is missing or empty.', [
+ 'index_ndjson' => $indexNdjsonPath,
+ ]);
+
+ return;
+ }
+
+ $this->ensureTargetDirectoryExists($lexicalIndexPath);
+ $this->removeFileIfExists($tmpPath);
+
+ $db = $this->openWritableDb($tmpPath);
+
+ try {
+ $this->initializeSchema($db);
+ $this->buildFromNdjson($db, $indexNdjsonPath);
+ $db->close();
+
+ $this->atomicReplace($tmpPath, $lexicalIndexPath);
+
+ $this->agentLogger->info('Lexical index build completed.', [
+ 'path' => $lexicalIndexPath,
+ ]);
+ } catch (\Throwable $e) {
+ try {
+ $db->close();
+ } catch (\Throwable) {
+ // Ignore close failures during cleanup.
+ }
+
+ $this->removeFileIfExists($tmpPath);
+
+ $this->agentLogger->error('Lexical index build failed.', [
+ 'path' => $lexicalIndexPath,
+ 'error' => $e->getMessage(),
+ ]);
+
+ throw $e;
+ }
+ }
+
+    /**
+     * Streams index.ndjson line by line and fills the lexical tables inside one transaction.
+     *
+     * Skipped without error: blank lines, lines that do not decode to an array,
+     * rows missing chunk_id / document_id / text, chunk ids already seen, and
+     * chunks that yield no indexable tokens. Any failure rolls the transaction
+     * back and is rethrown; the source handle is closed exactly once on every path.
+     *
+     * @throws \RuntimeException when the source cannot be opened, the transaction
+     *                           cannot be started or committed, or a statement fails
+     */
+    private function buildFromNdjson(SQLite3 $db, string $indexNdjsonPath): void
+    {
+        $handle = @fopen($indexNdjsonPath, 'rb');
+
+        if ($handle === false) {
+            throw new \RuntimeException('Unable to read index.ndjson: ' . $indexNdjsonPath);
+        }
+
+        // Executes a prepared write statement and releases its result, or fails loudly.
+        $run = static function (\SQLite3Stmt $statement, string $failureMessage): void {
+            $result = $statement->execute();
+
+            if ($result === false) {
+                throw new \RuntimeException($failureMessage);
+            }
+
+            $result->finalize();
+        };
+
+        $totalChunks = 0;
+
+        try {
+            if ($db->exec('BEGIN IMMEDIATE TRANSACTION') === false) {
+                throw new \RuntimeException('Failed to begin lexical index transaction.');
+            }
+
+            try {
+                $seenChunkStmt = $db->prepare(
+                    'INSERT OR IGNORE INTO lexical_seen_chunks (chunk_id) VALUES (:chunk_id)'
+                );
+                $termStmt = $db->prepare(
+                    'INSERT INTO lexical_terms (token, df)
+                    VALUES (:token, 1)
+                    ON CONFLICT(token) DO UPDATE SET df = df + 1'
+                );
+                $postingStmt = $db->prepare(
+                    'INSERT INTO lexical_postings (
+                        token,
+                        chunk_id,
+                        document_id,
+                        chunk_index,
+                        tf,
+                        title_tf
+                    ) VALUES (
+                        :token,
+                        :chunk_id,
+                        :document_id,
+                        :chunk_index,
+                        :tf,
+                        :title_tf
+                    )'
+                );
+
+                if (!$seenChunkStmt || !$termStmt || !$postingStmt) {
+                    throw new \RuntimeException('Failed to prepare lexical index SQL statements.');
+                }
+
+                while (($line = fgets($handle)) !== false) {
+                    $line = trim($line);
+
+                    if ($line === '') {
+                        continue;
+                    }
+
+                    $row = json_decode($line, true);
+
+                    if (!is_array($row)) {
+                        continue;
+                    }
+
+                    $chunkId = trim((string) ($row['chunk_id'] ?? ''));
+                    $documentId = trim((string) ($row['document_id'] ?? ''));
+                    $chunkIndex = $this->normalizeChunkIndex($row['chunk_index'] ?? null);
+                    $text = trim((string) ($row['text'] ?? ''));
+
+                    if ($chunkId === '' || $documentId === '' || $text === '') {
+                        continue;
+                    }
+
+                    $seenChunkStmt->reset();
+                    $seenChunkStmt->clear();
+                    $seenChunkStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);
+                    $run($seenChunkStmt, 'Failed to register lexical chunk: ' . $chunkId);
+
+                    // INSERT OR IGNORE changed nothing: this chunk id was already indexed.
+                    if ($db->changes() < 1) {
+                        continue;
+                    }
+
+                    $title = $this->extractDocumentTitle($row);
+                    $tokenStats = $this->buildTokenStats($text, $title);
+
+                    if ($tokenStats === []) {
+                        continue;
+                    }
+
+                    $totalChunks++;
+
+                    foreach ($tokenStats as $token => $stats) {
+                        // A posting without its lexical_terms row would be invisible to
+                        // readers that join both tables, so a failed term upsert is fatal.
+                        $termStmt->reset();
+                        $termStmt->clear();
+                        $termStmt->bindValue(':token', $token, SQLITE3_TEXT);
+                        $run($termStmt, 'Failed to upsert lexical term: ' . $token);
+
+                        $postingStmt->reset();
+                        $postingStmt->clear();
+                        $postingStmt->bindValue(':token', $token, SQLITE3_TEXT);
+                        $postingStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);
+                        $postingStmt->bindValue(':document_id', $documentId, SQLITE3_TEXT);
+
+                        if ($chunkIndex === null) {
+                            $postingStmt->bindValue(':chunk_index', null, SQLITE3_NULL);
+                        } else {
+                            $postingStmt->bindValue(':chunk_index', $chunkIndex, SQLITE3_INTEGER);
+                        }
+
+                        $postingStmt->bindValue(':tf', $stats['tf'], SQLITE3_INTEGER);
+                        $postingStmt->bindValue(':title_tf', $stats['title_tf'], SQLITE3_INTEGER);
+                        $run($postingStmt, 'Failed to insert lexical posting for token: ' . $token);
+                    }
+                }
+
+                $this->writeMeta($db, $totalChunks);
+
+                if ($db->exec('COMMIT') === false) {
+                    throw new \RuntimeException('Failed to commit lexical index transaction.');
+                }
+            } catch (\Throwable $e) {
+                // Best-effort rollback; the original failure is the one worth reporting.
+                try {
+                    $db->exec('ROLLBACK');
+                } catch (\Throwable) {
+                    // Ignore rollback failures during cleanup.
+                }
+
+                throw $e;
+            }
+        } finally {
+            // Closing here (and only here) avoids calling fclose() twice, which
+            // throws a TypeError on PHP 8 and would hide the real error.
+            fclose($handle);
+        }
+
+        $this->agentLogger->info('Lexical index streaming pass completed.', [
+            'indexed_chunks' => $totalChunks,
+            'source' => $indexNdjsonPath,
+        ]);
+    }
+
+    /**
+     * Counts how often each token occurs in the chunk text and in the title.
+     *
+     * Text tokens come first, followed by tokens that only occur in the title;
+     * the combined list is capped at MAX_UNIQUE_TOKENS_PER_CHUNK.
+     *
+     * @return array<array-key, array{tf:int, title_tf:int}> token => counts
+     */
+    private function buildTokenStats(string $text, string $title): array
+    {
+        $textTokens = $this->tokenize($text);
+        $titleTokens = $this->tokenize($title);
+
+        if ($textTokens === [] && $titleTokens === []) {
+            return [];
+        }
+
+        $textCounts = array_count_values($textTokens);
+        $titleCounts = array_count_values($titleTokens);
+
+        // Array union keeps the text tokens' order and appends title-only tokens.
+        $vocabulary = array_slice(
+            array_keys($textCounts + $titleCounts),
+            0,
+            self::MAX_UNIQUE_TOKENS_PER_CHUNK
+        );
+
+        $stats = [];
+
+        foreach ($vocabulary as $term) {
+            $stats[$term] = [
+                'tf' => $textCounts[$term] ?? 0,
+                'title_tf' => $titleCounts[$term] ?? 0,
+            ];
+        }
+
+        return $stats;
+    }
+
+    /**
+     * Normalizes the value and splits it into indexable tokens.
+     *
+     * The text is normalized via normalizeText() and split on whitespace; tokens
+     * rejected by shouldIgnoreToken() are dropped. Duplicates are kept so callers
+     * can count frequencies.
+     *
+     * @return string[]
+     */
+    private function tokenize(string $value): array
+    {
+        $normalized = $this->normalizeText($value);
+
+        if ($normalized === '') {
+            return [];
+        }
+
+        $words = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];
+
+        return array_values(array_filter(
+            $words,
+            fn (string $word): bool => !$this->shouldIgnoreToken($word)
+        ));
+    }
+
+    /**
+     * Decides whether a token is left out of the index.
+     *
+     * Tokens containing a digit are always kept; otherwise empty tokens,
+     * single-character tokens and stop words are ignored.
+     */
+    private function shouldIgnoreToken(string $token): bool
+    {
+        return match (true) {
+            $token === '' => true,
+            preg_match('/\d/u', $token) === 1 => false,
+            mb_strlen($token, 'UTF-8') < 2 => true,
+            default => StopWords::isStopWord($token),
+        };
+    }
+
+    /**
+     * Lowercases the value, treats hyphens, slashes and underscores as word
+     * separators, strips remaining punctuation and collapses whitespace.
+     */
+    private function normalizeText(string $value): string
+    {
+        $text = mb_strtolower(trim($value), 'UTF-8');
+        $text = str_replace(['-', '/', '_'], ' ', $text);
+
+        // A failed regex leaves the text from the previous step untouched.
+        foreach (['/[^\p{L}\p{N}\s]+/u', '/\s+/u'] as $pattern) {
+            $replaced = preg_replace($pattern, ' ', $text);
+
+            if ($replaced !== null) {
+                $text = $replaced;
+            }
+        }
+
+        return trim($text);
+    }
+
+    /**
+     * Returns the trimmed `metadata.document_title` of a row.
+     *
+     * An empty string is returned when the metadata block is missing or not an
+     * array, or when the title is not something that can sensibly be rendered as
+     * text (for example a nested array, which a plain string cast would turn
+     * into the literal "Array" plus a warning).
+     *
+     * @param array<string, mixed> $row
+     */
+    private function extractDocumentTitle(array $row): string
+    {
+        $metadata = $row['metadata'] ?? null;
+
+        if (!is_array($metadata)) {
+            return '';
+        }
+
+        $title = $metadata['document_title'] ?? '';
+
+        if (!is_scalar($title) && !$title instanceof \Stringable) {
+            return '';
+        }
+
+        return trim((string) $title);
+    }
+
+    /**
+     * Converts a raw chunk index to an integer.
+     *
+     * @return int|null the value for integers and all-digit strings, null otherwise
+     */
+    private function normalizeChunkIndex(mixed $value): ?int
+    {
+        return match (true) {
+            is_int($value) => $value,
+            is_string($value) && ctype_digit($value) => (int) $value,
+            default => null,
+        };
+    }
+
+    /**
+     * Stores schema version, build timestamp and chunk count in lexical_meta.
+     *
+     * @throws \RuntimeException when the statement cannot be prepared or a key cannot be written
+     */
+    private function writeMeta(SQLite3 $db, int $totalChunks): void
+    {
+        $statement = $db->prepare(
+            'INSERT OR REPLACE INTO lexical_meta (key, value) VALUES (:key, :value)'
+        );
+
+        if ($statement === false) {
+            throw new \RuntimeException('Failed to prepare lexical meta statement.');
+        }
+
+        $entries = [
+            'schema_version' => '1',
+            'built_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
+            'total_chunks' => (string) $totalChunks,
+        ];
+
+        foreach ($entries as $name => $content) {
+            // The same prepared statement is reused for every key.
+            $statement->reset();
+            $statement->clear();
+            $statement->bindValue(':key', $name, SQLITE3_TEXT);
+            $statement->bindValue(':value', $content, SQLITE3_TEXT);
+
+            $outcome = $statement->execute();
+
+            if ($outcome === false) {
+                throw new \RuntimeException('Failed to write lexical meta key: ' . $name);
+            }
+
+            $outcome->finalize();
+        }
+    }
+
+ /**
+ * Applies the connection pragmas and creates the lexical index tables and
+ * indexes if they do not exist yet.
+ *
+ * @throws \RuntimeException when executing the schema script fails
+ */
+ private function initializeSchema(SQLite3 $db): void
+ {
+ // Pragma results are not checked; only the schema script below is verified.
+ $db->exec('PRAGMA journal_mode = DELETE');
+ $db->exec('PRAGMA synchronous = NORMAL');
+ $db->exec('PRAGMA temp_store = MEMORY');
+ $db->exec('PRAGMA foreign_keys = OFF');
+
+ // Nowdoc: the SQL is passed to SQLite verbatim, without variable interpolation.
+ $schema = <<<'SQL'
+CREATE TABLE IF NOT EXISTS lexical_meta (
+ key TEXT PRIMARY KEY,
+ value TEXT NOT NULL
+);
+
+CREATE TABLE IF NOT EXISTS lexical_terms (
+ token TEXT PRIMARY KEY,
+ df INTEGER NOT NULL
+);
+
+CREATE TABLE IF NOT EXISTS lexical_postings (
+ token TEXT NOT NULL,
+ chunk_id TEXT NOT NULL,
+ document_id TEXT NOT NULL,
+ chunk_index INTEGER NULL,
+ tf INTEGER NOT NULL,
+ title_tf INTEGER NOT NULL DEFAULT 0,
+ PRIMARY KEY (token, chunk_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_lexical_postings_document_token
+ ON lexical_postings (document_id, token);
+
+CREATE INDEX IF NOT EXISTS idx_lexical_postings_chunk
+ ON lexical_postings (chunk_id);
+
+CREATE TABLE IF NOT EXISTS lexical_seen_chunks (
+ chunk_id TEXT PRIMARY KEY
+);
+SQL;
+
+ if ($db->exec($schema) === false) {
+ throw new \RuntimeException('Failed to initialize lexical index schema.');
+ }
+ }
+
+    /**
+     * Opens (creating it if necessary) a writable SQLite database at the given path.
+     *
+     * @throws \RuntimeException when the database cannot be opened; the original
+     *                           error is attached as the previous exception
+     */
+    private function openWritableDb(string $path): SQLite3
+    {
+        $flags = SQLITE3_OPEN_READWRITE | SQLITE3_OPEN_CREATE;
+
+        try {
+            $connection = new SQLite3($path, $flags);
+        } catch (\Throwable $failure) {
+            throw new \RuntimeException('Unable to open lexical index DB: ' . $path, 0, $failure);
+        }
+
+        $connection->busyTimeout(5000);
+
+        return $connection;
+    }
+
+    /**
+     * Builds the NDJSON source path from the project directory (trailing
+     * slashes removed) and the default relative NDJSON path.
+     */
+    private function getIndexNdjsonPath(): string
+    {
+        $projectRoot = rtrim($this->projectDir, '/');
+
+        return $projectRoot . self::DEFAULT_RELATIVE_NDJSON_PATH;
+    }
+
+    /**
+     * Builds the lexical index path from the project directory (trailing
+     * slashes removed) and the default relative index path.
+     */
+    private function getLexicalIndexPath(): string
+    {
+        $projectRoot = rtrim($this->projectDir, '/');
+
+        return $projectRoot . self::DEFAULT_RELATIVE_INDEX_PATH;
+    }
+
+    /**
+     * Makes sure the directory that will hold the index file exists.
+     *
+     * @throws \RuntimeException when the directory is missing and cannot be created
+     */
+    private function ensureTargetDirectoryExists(string $finalIndexPath): void
+    {
+        $directory = dirname($finalIndexPath);
+
+        // The trailing is_dir() re-check tolerates another process creating the
+        // directory between the first check and mkdir().
+        $ready = is_dir($directory)
+            || @mkdir($directory, 0775, true)
+            || is_dir($directory);
+
+        if (!$ready) {
+            throw new \RuntimeException('Unable to create lexical index directory: ' . $directory);
+        }
+    }
+
+    /**
+     * Moves the freshly built temp file over the final index path.
+     *
+     * rename() is tried first; if it fails, the file is copied instead and the
+     * temp file is removed either way. Permissions on the final file are set to
+     * 0664 on a best-effort basis.
+     *
+     * @throws \RuntimeException when neither rename nor copy succeeds
+     */
+    private function atomicReplace(string $tmpPath, string $finalPath): void
+    {
+        if (is_file($finalPath)) {
+            @chmod($finalPath, 0664);
+        }
+
+        $moved = @rename($tmpPath, $finalPath);
+
+        if (!$moved) {
+            $copied = @copy($tmpPath, $finalPath);
+
+            // The temp file is discarded whether or not the copy worked.
+            @unlink($tmpPath);
+
+            if (!$copied) {
+                throw new \RuntimeException('Atomic replace failed for lexical index: ' . $finalPath);
+            }
+        }
+
+        @chmod($finalPath, 0664);
+    }
+
+    /**
+     * Deletes the file at the given path if it is a regular file; deletion
+     * failures are ignored.
+     */
+    private function removeFileIfExists(string $path): void
+    {
+        if (!is_file($path)) {
+            return;
+        }
+
+        @unlink($path);
+    }
+
+    /**
+     * Guards index building against a missing SQLite3 class.
+     *
+     * @throws \RuntimeException when the SQLite3 class is not available
+     */
+    private function assertSqliteAvailable(): void
+    {
+        if (class_exists(SQLite3::class)) {
+            return;
+        }
+
+        throw new \RuntimeException('The sqlite3 PHP extension is required for lexical index building.');
+    }
+}
\ No newline at end of file
diff --git a/src/Knowledge/Retrieval/QueryEnricher.php b/src/Knowledge/Retrieval/QueryEnricher.php
index 87faf88..ffe66dc 100644
--- a/src/Knowledge/Retrieval/QueryEnricher.php
+++ b/src/Knowledge/Retrieval/QueryEnricher.php
@@ -8,6 +8,14 @@ use App\Config\QueryEnricherConfig;
final readonly class QueryEnricher
{
+ /**
+ * Keep enrichment conservative.
+ *
+ * The enriched semantic query should help vector retrieval,
+ * but must not become bloated enough to dilute the original user intent.
+ */
+ private const MAX_EXPANSIONS = 4;
+
public function __construct(
private QueryEnricherConfig $config
) {
@@ -16,6 +24,12 @@ final readonly class QueryEnricher
/**
* Enriches the query with mapped counterpart terms.
*
+ * Design goals:
+ * - preserve the original query unchanged at the front
+ * - only append counterpart terms that are not already present
+ * - prefer longer / more specific phrase matches over short generic matches
+ * - keep the number of appended terms intentionally small
+ *
* Example:
* - input: "water hardness device"
* - output: "water hardness device residual hardness model"
@@ -29,26 +43,63 @@ final readonly class QueryEnricher
}
$mapping = $this->config->getEnrichQueryList();
+
+ if ($mapping === []) {
+ return $originalQuery;
+ }
+
$lookup = $this->buildBidirectionalLookup($mapping);
+
+ if ($lookup === []) {
+ return $originalQuery;
+ }
+
+ $lookup = $this->sortLookupBySpecificity($lookup);
$normalizedQuery = $this->normalizeForMatching($originalQuery);
- $matches = [];
+ if ($normalizedQuery === '') {
+ return $originalQuery;
+ }
- foreach ($lookup as $needle => $mappedValue) {
- if ($needle === '') {
+ $matches = [];
+ $seenNormalizedExpansions = [];
+
+ foreach ($lookup as $normalizedNeedle => $mappedValue) {
+ if ($normalizedNeedle === '') {
continue;
}
- if ($this->containsWholePhrase($normalizedQuery, $needle)) {
- $matches[] = $mappedValue;
+ if (!$this->containsWholePhrase($normalizedQuery, $normalizedNeedle)) {
+ continue;
+ }
+
+ $mappedValue = trim($mappedValue);
+ if ($mappedValue === '') {
+ continue;
+ }
+
+ $normalizedMappedValue = $this->normalizeForMatching($mappedValue);
+ if ($normalizedMappedValue === '') {
+ continue;
+ }
+
+ // Do not re-add information that is already present in the query.
+ if ($this->containsWholePhrase($normalizedQuery, $normalizedMappedValue)) {
+ continue;
+ }
+
+ if (isset($seenNormalizedExpansions[$normalizedMappedValue])) {
+ continue;
+ }
+
+ $matches[] = $mappedValue;
+ $seenNormalizedExpansions[$normalizedMappedValue] = true;
+
+ if (count($matches) >= self::MAX_EXPANSIONS) {
+ break;
}
}
- $matches = array_values(array_unique(array_filter(
- $matches,
- static fn(string $value): bool => trim($value) !== ''
- )));
-
if ($matches === []) {
return $originalQuery;
}
@@ -106,6 +157,11 @@ final readonly class QueryEnricher
* 'jacket' => 'coat',
* 'coat' => 'jacket',
* ]
+ *
+ * Returned format:
+ * [
+ * '' => '',
+ * ]
*/
private function buildBidirectionalLookup(array $mapping): array
{
@@ -122,15 +178,49 @@ final readonly class QueryEnricher
$normalizedKey = $this->normalizeForMatching($key);
$normalizedValue = $this->normalizeForMatching($value);
- if ($normalizedKey !== '') {
+ if ($normalizedKey !== '' && !isset($lookup[$normalizedKey])) {
$lookup[$normalizedKey] = $value;
}
- if ($normalizedValue !== '') {
+ if ($normalizedValue !== '' && !isset($lookup[$normalizedValue])) {
$lookup[$normalizedValue] = $key;
}
}
return $lookup;
}
+
+    /**
+     * Orders lookup rules so the most specific phrases are tried first.
+     *
+     * Keys are ranked by, in order:
+     * 1. word count (more words first)
+     * 2. character length (longer first)
+     * 3. byte-wise string order, to keep the result deterministic
+     *
+     * @param array<string, string> $lookup normalized phrase => mapped value
+     * @return array<string, string>
+     */
+    private function sortLookupBySpecificity(array $lookup): array
+    {
+        $specificity = static fn (string $phrase): array => [
+            substr_count($phrase, ' ') + 1,
+            mb_strlen($phrase, 'UTF-8'),
+        ];
+
+        uksort(
+            $lookup,
+            // Operands are swapped so higher specificity sorts first; list
+            // comparison checks word count before length.
+            static fn (string $a, string $b): int => ($specificity($b) <=> $specificity($a)) ?: strcmp($a, $b)
+        );
+
+        return $lookup;
+    }
}
\ No newline at end of file