lexical logic

2026-04-20 21:46:42 +02:00
parent 2587ac8b4b
commit 065f59c090
9 changed files with 2576 additions and 326 deletions
--- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
+++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
--- a/src/Knowledge/Retrieval/NdjsonKeywordRetriever.php
+++ b/src/Knowledge/Retrieval/NdjsonKeywordRetriever.php
@@ -0,0 +1,451 @@
+<?php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Retrieval;
+
+use App\Knowledge\StopWords;
+use Psr\Log\LoggerInterface;
+use SQLite3;
+
+final readonly class NdjsonKeywordRetriever
+{
+    private const DEFAULT_RELATIVE_INDEX_PATH = '/var/knowledge/lexical.index.sqlite';
+    private const MAX_LIMIT = 100;
+    private const MAX_QUERY_TOKENS = 12;
+
+    /**
+     * @param string          $projectDir  Project root; the lexical index path is resolved relative to it.
+     * @param LoggerInterface $agentLogger Receives the warnings and errors raised while opening or querying the index.
+     */
+    public function __construct(
+        private string $projectDir,
+        private LoggerInterface $agentLogger,
+    ) {
+    }
+
+    /**
+     * Runs a lexical lookup for $query against the prebuilt SQLite index.
+     *
+     * The index is expected to expose the following tables (created by the
+     * lexical index builder):
+     *
+     * lexical_meta(
+     *   key TEXT PRIMARY KEY,
+     *   value TEXT NOT NULL
+     * )
+     *
+     * lexical_terms(
+     *   token TEXT PRIMARY KEY,
+     *   df INTEGER NOT NULL
+     * )
+     *
+     * lexical_postings(
+     *   token TEXT NOT NULL,
+     *   chunk_id TEXT NOT NULL,
+     *   document_id TEXT NOT NULL,
+     *   chunk_index INTEGER,
+     *   tf INTEGER NOT NULL,
+     *   title_tf INTEGER NOT NULL DEFAULT 0,
+     *   PRIMARY KEY(token, chunk_id)
+     * )
+     *
+     * Scoring is purely generic: token overlap, rarity, title hits and extra
+     * weight for numeric/code-like tokens. No domain-specific keywords are used.
+     *
+     * An empty list is returned when the query yields no usable tokens, when the
+     * index cannot be opened, when nothing matches, or when any error occurs
+     * (errors are logged, never rethrown).
+     *
+     * @param string[] $docIds Optional document scope
+     *
+     * @return array<int, array{
+     *     chunk_id:string,
+     *     score:float,
+     *     document_id:?string,
+     *     chunk_index:?int
+     * }>
+     */
+    public function search(string $query, int $limit = 10, array $docIds = []): array
+    {
+        $limit = $this->clampLimit($limit);
+        $analysis = $this->analyzeQuery($query);
+        $tokens = $analysis['tokens'];
+
+        if ($tokens === []) {
+            return [];
+        }
+
+        $index = $this->openReadOnlyDb();
+
+        if ($index === null) {
+            return [];
+        }
+
+        try {
+            $totalChunks = $this->loadTotalChunks($index);
+            $postings = $this->loadPostings($index, $tokens, $docIds);
+
+            return $postings === []
+                ? []
+                : $this->scoreRows($postings, $tokens, $analysis['numeric_tokens'], $totalChunks, $limit);
+        } catch (\Throwable $failure) {
+            $this->agentLogger->error('Keyword retriever failed', [
+                'error' => $failure->getMessage(),
+            ]);
+
+            return [];
+        } finally {
+            // The connection is per-request; always release it.
+            $index->close();
+        }
+    }
+
+    /**
+     * Normalizes the query and splits it into distinct searchable tokens.
+     *
+     * Tokens keep their first-occurrence order. Only the token list is capped at
+     * MAX_QUERY_TOKENS; the numeric token list is deduplicated but not capped.
+     *
+     * @return array{
+     *   normalized_query:string,
+     *   tokens:string[],
+     *   numeric_tokens:string[]
+     * }
+     */
+    private function analyzeQuery(string $query): array
+    {
+        $normalized = $this->normalizeText($query);
+        $tokens = [];
+        $numericTokens = [];
+
+        if ($normalized !== '') {
+            $candidates = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];
+
+            foreach ($candidates as $candidate) {
+                // shouldIgnoreToken() also rejects the empty string.
+                if ($this->shouldIgnoreToken($candidate)) {
+                    continue;
+                }
+
+                if (!in_array($candidate, $tokens, true)) {
+                    $tokens[] = $candidate;
+                }
+
+                $hasDigit = preg_match('/\d/u', $candidate) === 1;
+
+                if ($hasDigit && !in_array($candidate, $numericTokens, true)) {
+                    $numericTokens[] = $candidate;
+                }
+            }
+
+            $tokens = array_slice($tokens, 0, self::MAX_QUERY_TOKENS);
+        }
+
+        return [
+            'normalized_query' => $normalized,
+            'tokens' => $tokens,
+            'numeric_tokens' => $numericTokens,
+        ];
+    }
+
+    /**
+     * Decides whether a token is dropped from the query.
+     *
+     * Empty tokens are dropped. Tokens containing a digit are always kept.
+     * Everything else is dropped when shorter than two characters or when
+     * StopWords reports it as a stop word.
+     */
+    private function shouldIgnoreToken(string $token): bool
+    {
+        if ($token === '') {
+            return true;
+        }
+
+        $containsDigit = preg_match('/\d/u', $token) === 1;
+
+        if ($containsDigit) {
+            return false;
+        }
+
+        return mb_strlen($token, 'UTF-8') < 2 || StopWords::isStopWord($token);
+    }
+
+    /**
+     * Lowercases the text, turns '-', '/', '_' and every other character that is
+     * neither a letter, a number nor whitespace into a space, and collapses
+     * whitespace runs into single spaces.
+     */
+    private function normalizeText(string $value): string
+    {
+        $lowered = mb_strtolower(trim($value), 'UTF-8');
+        $separated = strtr($lowered, '-/_', '   ');
+
+        // preg_replace() yields null on failure; fall back to the input of that step.
+        $lettersOnly = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $separated) ?? $separated;
+        $collapsed = preg_replace('/\s+/u', ' ', $lettersOnly) ?? $lettersOnly;
+
+        return trim($collapsed);
+    }
+
+    /**
+     * Opens the lexical index in read-only mode.
+     *
+     * Returns null when the sqlite3 extension is missing (logged as a warning),
+     * when the index file does not exist (not logged), or when opening fails
+     * (logged as an error).
+     */
+    private function openReadOnlyDb(): ?SQLite3
+    {
+        if (!class_exists(SQLite3::class)) {
+            $this->agentLogger->warning('Keyword retriever unavailable: sqlite3 extension missing.');
+
+            return null;
+        }
+
+        $indexFile = $this->getIndexPath();
+
+        if (!is_file($indexFile)) {
+            return null;
+        }
+
+        try {
+            $connection = new SQLite3($indexFile, SQLITE3_OPEN_READONLY);
+            $connection->busyTimeout(1000);
+        } catch (\Throwable $failure) {
+            $this->agentLogger->error('Unable to open lexical index', [
+                'path' => $indexFile,
+                'error' => $failure->getMessage(),
+            ]);
+
+            return null;
+        }
+
+        return $connection;
+    }
+
+    /**
+     * Absolute path of the lexical SQLite index below the project directory.
+     */
+    private function getIndexPath(): string
+    {
+        $root = rtrim($this->projectDir, '/');
+
+        return $root . self::DEFAULT_RELATIVE_INDEX_PATH;
+    }
+
+    /**
+     * Reads the 'total_chunks' entry from lexical_meta.
+     *
+     * Never returns less than 1: a missing row, a failed query or a
+     * non-positive value all yield 1.
+     */
+    private function loadTotalChunks(SQLite3 $db): int
+    {
+        $statement = $db->prepare('SELECT value FROM lexical_meta WHERE key = :key');
+
+        if (!$statement) {
+            return 1;
+        }
+
+        $statement->bindValue(':key', 'total_chunks', SQLITE3_TEXT);
+        $cursor = $statement->execute();
+
+        if ($cursor === false) {
+            return 1;
+        }
+
+        $record = $cursor->fetchArray(SQLITE3_ASSOC);
+        $cursor->finalize();
+
+        if (!is_array($record) || !isset($record['value'])) {
+            return 1;
+        }
+
+        return max(1, (int) $record['value']);
+    }
+
+    /**
+     * Fetches every posting (joined with its document frequency) for the given
+     * tokens, optionally restricted to a set of document ids.
+     *
+     * Non-string and empty entries in $docIds are discarded. Rows lacking a
+     * token, chunk id or document id are skipped. tf and df are floored at 1 and
+     * title_tf at 0. An empty list is returned when the statement cannot be
+     * prepared or executed.
+     *
+     * @param string[] $tokens
+     * @param string[] $docIds
+     * @return array<int, array{
+     *   token:string,
+     *   chunk_id:string,
+     *   document_id:string,
+     *   chunk_index:?int,
+     *   tf:int,
+     *   title_tf:int,
+     *   df:int
+     * }>
+     */
+    private function loadPostings(SQLite3 $db, array $tokens, array $docIds): array
+    {
+        if ($tokens === []) {
+            return [];
+        }
+
+        $tokenPlaceholders = array_map(
+            static fn ($position): string => ':t' . $position,
+            array_keys($tokens)
+        );
+
+        $sql = '
+            SELECT
+                p.token,
+                p.chunk_id,
+                p.document_id,
+                p.chunk_index,
+                p.tf,
+                p.title_tf,
+                lt.df
+            FROM lexical_postings p
+            INNER JOIN lexical_terms lt ON lt.token = p.token
+            WHERE p.token IN (' . implode(', ', $tokenPlaceholders) . ')
+        ';
+
+        $usableDocIds = array_filter(
+            $docIds,
+            static fn (mixed $candidate): bool => is_string($candidate) && $candidate !== ''
+        );
+        $docIds = array_values(array_unique($usableDocIds));
+
+        if ($docIds !== []) {
+            $docPlaceholders = array_map(
+                static fn ($position): string => ':d' . $position,
+                array_keys($docIds)
+            );
+
+            $sql .= ' AND p.document_id IN (' . implode(', ', $docPlaceholders) . ')';
+        }
+
+        $statement = $db->prepare($sql);
+
+        if ($statement === false) {
+            return [];
+        }
+
+        foreach ($tokens as $position => $tokenValue) {
+            $statement->bindValue(':t' . $position, $tokenValue, SQLITE3_TEXT);
+        }
+
+        foreach ($docIds as $position => $docIdValue) {
+            $statement->bindValue(':d' . $position, $docIdValue, SQLITE3_TEXT);
+        }
+
+        $cursor = $statement->execute();
+
+        if ($cursor === false) {
+            return [];
+        }
+
+        $postings = [];
+
+        while (true) {
+            $record = $cursor->fetchArray(SQLITE3_ASSOC);
+
+            if ($record === false) {
+                break;
+            }
+
+            $token = (string) ($record['token'] ?? '');
+            $chunkId = (string) ($record['chunk_id'] ?? '');
+            $documentId = (string) ($record['document_id'] ?? '');
+
+            if ($token === '' || $chunkId === '' || $documentId === '') {
+                continue;
+            }
+
+            $rawChunkIndex = $record['chunk_index'] ?? null;
+
+            $postings[] = [
+                'token' => $token,
+                'chunk_id' => $chunkId,
+                'document_id' => $documentId,
+                'chunk_index' => is_numeric($rawChunkIndex) ? (int) $rawChunkIndex : null,
+                'tf' => max(1, (int) ($record['tf'] ?? 1)),
+                'title_tf' => max(0, (int) ($record['title_tf'] ?? 0)),
+                'df' => max(1, (int) ($record['df'] ?? 1)),
+            ];
+        }
+
+        $cursor->finalize();
+
+        return $postings;
+    }
+
+    /**
+     * Aggregates posting rows into one score per chunk and returns the best
+     * $limit chunks, normalized so that the top chunk scores 1.0.
+     *
+     * Per posting: idf = ln(1 + totalChunks / (1 + df)); the contribution is
+     * idf * tfBoost * numericBoost plus a title bonus of 0.75 * idf when the
+     * token also occurs in the title. tf counts above 3 give no extra boost.
+     * Per chunk: the sum is scaled by 0.65 + 0.35 * coverage, where coverage is
+     * the share of distinct query tokens matched by that chunk.
+     *
+     * @param array<int, array{
+     *   token:string,
+     *   chunk_id:string,
+     *   document_id:string,
+     *   chunk_index:?int,
+     *   tf:int,
+     *   title_tf:int,
+     *   df:int
+     * }> $rows
+     * @param string[] $queryTokens
+     * @param string[] $numericTokens
+     *
+     * @return array<int, array{
+     *   chunk_id:string,
+     *   score:float,
+     *   document_id:?string,
+     *   chunk_index:?int
+     * }>
+     */
+    private function scoreRows(
+        array $rows,
+        array $queryTokens,
+        array $numericTokens,
+        int $totalChunks,
+        int $limit
+    ): array {
+        if ($rows === []) {
+            return [];
+        }
+
+        $numericLookup = array_fill_keys($numericTokens, true);
+        $queryTokenCount = max(1, count($queryTokens));
+
+        $scores = [];
+        $meta = [];
+        $matchedTokens = [];
+
+        foreach ($rows as $row) {
+            $chunkId = $row['chunk_id'];
+            $token = $row['token'];
+
+            $idf = log(1.0 + ($totalChunks / max(1.0, (float) (1 + $row['df']))));
+            $tfBoost = 1.0 + (min(3, $row['tf']) * 0.20);
+            $numericBoost = isset($numericLookup[$token]) ? 1.60 : 1.0;
+            $titleBonus = $row['title_tf'] > 0 ? ($idf * 0.75) : 0.0;
+
+            $scores[$chunkId] = ($scores[$chunkId] ?? 0.0)
+                + ($idf * $tfBoost * $numericBoost)
+                + $titleBonus;
+
+            $matchedTokens[$chunkId][$token] = true;
+
+            // First posting seen for a chunk supplies its document metadata.
+            $meta[$chunkId] ??= [
+                'document_id' => $row['document_id'],
+                'chunk_index' => $row['chunk_index'],
+            ];
+        }
+
+        foreach ($scores as $chunkId => $score) {
+            $coverage = count($matchedTokens[$chunkId] ?? []) / $queryTokenCount;
+            $scores[$chunkId] = $score * (0.65 + (0.35 * $coverage));
+        }
+
+        arsort($scores);
+
+        $topScore = (float) reset($scores);
+        if ($topScore <= 0.0) {
+            return [];
+        }
+
+        $out = [];
+
+        foreach ($scores as $chunkId => $score) {
+            $out[] = [
+                // PHP turns numeric-looking string keys ("123") into int keys;
+                // cast back so callers always receive the chunk id as a string.
+                'chunk_id' => (string) $chunkId,
+                'score' => round($score / $topScore, 6),
+                'document_id' => $meta[$chunkId]['document_id'] ?? null,
+                'chunk_index' => $meta[$chunkId]['chunk_index'] ?? null,
+            ];
+
+            if (count($out) >= $limit) {
+                break;
+            }
+        }
+
+        return $out;
+    }
+
+    /**
+     * Restricts the requested result count to the range 1..MAX_LIMIT.
+     */
+    private function clampLimit(int $limit): int
+    {
+        return max(1, min(self::MAX_LIMIT, $limit));
+    }
+}
--- a/src/Knowledge/Retrieval/NdjsonLexicalIndexBuilder.php
+++ b/src/Knowledge/Retrieval/NdjsonLexicalIndexBuilder.php
@@ -0,0 +1,528 @@
+<?php
+
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Retrieval;
+
+use App\Knowledge\StopWords;
+use Psr\Log\LoggerInterface;
+use SQLite3;
+
+final readonly class NdjsonLexicalIndexBuilder
+{
+    private const DEFAULT_RELATIVE_NDJSON_PATH = '/var/knowledge/index.ndjson';
+    private const DEFAULT_RELATIVE_INDEX_PATH = '/var/knowledge/lexical.index.sqlite';
+
+    /**
+     * Upper bound to avoid pathological chunks exploding the lexical index.
+     * This stays generic and does not encode any domain-specific assumption.
+     */
+    private const MAX_UNIQUE_TOKENS_PER_CHUNK = 256;
+
+    /**
+     * @param string          $projectDir  Project root; index.ndjson and the lexical index are resolved relative to it.
+     * @param LoggerInterface $agentLogger Receives build progress and failure messages.
+     */
+    public function __construct(
+        private string          $projectDir,
+        private LoggerInterface $agentLogger,
+    )
+    {
+    }
+
+    /**
+     * Build a generic lexical SQLite index from index.ndjson.
+     *
+     * The index is written to "<final path>.tmp" first and only moved over the
+     * final path once the build succeeded. When index.ndjson is missing or empty,
+     * any existing lexical index (and leftover temp file) is removed and the
+     * build is skipped.
+     *
+     * Output DB schema:
+     *
+     * lexical_meta(
+     *   key TEXT PRIMARY KEY,
+     *   value TEXT NOT NULL
+     * )
+     *
+     * lexical_terms(
+     *   token TEXT PRIMARY KEY,
+     *   df INTEGER NOT NULL
+     * )
+     *
+     * lexical_postings(
+     *   token TEXT NOT NULL,
+     *   chunk_id TEXT NOT NULL,
+     *   document_id TEXT NOT NULL,
+     *   chunk_index INTEGER,
+     *   tf INTEGER NOT NULL,
+     *   title_tf INTEGER NOT NULL DEFAULT 0,
+     *   PRIMARY KEY(token, chunk_id)
+     * )
+     *
+     * Design goals:
+     * - generic, data-driven lexical retrieval base
+     * - no domain keywords in core code
+     * - no full scan per request later
+     * - duplicate chunk_id lines in index.ndjson must not inflate the index
+     *
+     * @throws \Throwable Any failure during the build is logged, the temp file is removed, and the error is rethrown.
+     */
+    public function build(): void
+    {
+        $this->assertSqliteAvailable();
+
+        $indexNdjsonPath = $this->getIndexNdjsonPath();
+        $lexicalIndexPath = $this->getLexicalIndexPath();
+        $tmpPath = $lexicalIndexPath . '.tmp';
+
+        // No source data: drop any stale index instead of serving outdated results.
+        if (!is_file($indexNdjsonPath) || filesize($indexNdjsonPath) === 0) {
+            $this->removeFileIfExists($lexicalIndexPath);
+            $this->removeFileIfExists($tmpPath);
+
+            $this->agentLogger->info('Lexical index skipped because index.ndjson is missing or empty.', [
+                'index_ndjson' => $indexNdjsonPath,
+            ]);
+
+            return;
+        }
+
+        $this->ensureTargetDirectoryExists($lexicalIndexPath);
+        // Start from a clean temp file in case a previous build was interrupted.
+        $this->removeFileIfExists($tmpPath);
+
+        $db = $this->openWritableDb($tmpPath);
+
+        try {
+            $this->initializeSchema($db);
+            $this->buildFromNdjson($db, $indexNdjsonPath);
+            // Close before moving the file so the replace operates on a released handle.
+            $db->close();
+
+            $this->atomicReplace($tmpPath, $lexicalIndexPath);
+
+            $this->agentLogger->info('Lexical index build completed.', [
+                'path' => $lexicalIndexPath,
+            ]);
+        } catch (\Throwable $e) {
+            try {
+                $db->close();
+            } catch (\Throwable) {
+                // Ignore close failures during cleanup.
+            }
+
+            $this->removeFileIfExists($tmpPath);
+
+            $this->agentLogger->error('Lexical index build failed.', [
+                'path' => $lexicalIndexPath,
+                'error' => $e->getMessage(),
+            ]);
+
+            throw $e;
+        }
+    }
+
+    /**
+     * Streams index.ndjson line by line into the lexical tables inside a single
+     * transaction and records the number of indexed chunks in lexical_meta.
+     *
+     * Lines that are blank, are not JSON objects/arrays, or lack chunk_id,
+     * document_id or text are skipped. A chunk_id is indexed only the first time
+     * it is seen. Chunks without any indexable token are not counted.
+     *
+     * The file handle is closed exactly once, and the transaction is rolled back
+     * on any failure before the error is rethrown.
+     *
+     * @throws \RuntimeException When the source cannot be read or any SQL step fails.
+     */
+    private function buildFromNdjson(SQLite3 $db, string $indexNdjsonPath): void
+    {
+        $handle = @fopen($indexNdjsonPath, 'rb');
+
+        if ($handle === false) {
+            throw new \RuntimeException('Unable to read index.ndjson: ' . $indexNdjsonPath);
+        }
+
+        $totalChunks = 0;
+        $transactionOpen = false;
+
+        try {
+            if ($db->exec('BEGIN IMMEDIATE TRANSACTION') === false) {
+                throw new \RuntimeException('Failed to begin lexical index transaction.');
+            }
+
+            $transactionOpen = true;
+
+            $seenChunkStmt = $db->prepare(
+                'INSERT OR IGNORE INTO lexical_seen_chunks (chunk_id) VALUES (:chunk_id)'
+            );
+            $termStmt = $db->prepare(
+                'INSERT INTO lexical_terms (token, df)
+                 VALUES (:token, 1)
+                 ON CONFLICT(token) DO UPDATE SET df = df + 1'
+            );
+            $postingStmt = $db->prepare(
+                'INSERT INTO lexical_postings (
+                    token,
+                    chunk_id,
+                    document_id,
+                    chunk_index,
+                    tf,
+                    title_tf
+                 ) VALUES (
+                    :token,
+                    :chunk_id,
+                    :document_id,
+                    :chunk_index,
+                    :tf,
+                    :title_tf
+                 )'
+            );
+
+            if (!$seenChunkStmt || !$termStmt || !$postingStmt) {
+                throw new \RuntimeException('Failed to prepare lexical index SQL statements.');
+            }
+
+            while (($line = fgets($handle)) !== false) {
+                $line = trim($line);
+
+                if ($line === '') {
+                    continue;
+                }
+
+                $row = json_decode($line, true);
+
+                if (!is_array($row)) {
+                    continue;
+                }
+
+                $chunkId = trim((string)($row['chunk_id'] ?? ''));
+                $documentId = trim((string)($row['document_id'] ?? ''));
+                $chunkIndex = $this->normalizeChunkIndex($row['chunk_index'] ?? null);
+                $text = trim((string)($row['text'] ?? ''));
+
+                if ($chunkId === '' || $documentId === '' || $text === '') {
+                    continue;
+                }
+
+                $seenChunkStmt->reset();
+                $seenChunkStmt->clear();
+                $seenChunkStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);
+                $seenResult = $seenChunkStmt->execute();
+
+                // Without a successful insert, changes() below would report a stale count.
+                if ($seenResult === false) {
+                    throw new \RuntimeException('Failed to record lexical chunk id: ' . $chunkId);
+                }
+
+                $seenResult->finalize();
+
+                // INSERT OR IGNORE changed nothing: this chunk_id was already indexed.
+                if ($db->changes() < 1) {
+                    continue;
+                }
+
+                $title = $this->extractDocumentTitle($row);
+                $tokenStats = $this->buildTokenStats($text, $title);
+
+                if ($tokenStats === []) {
+                    continue;
+                }
+
+                $totalChunks++;
+
+                foreach ($tokenStats as $token => $stats) {
+                    // Numeric tokens come back as int array keys; bind and report them as text.
+                    $token = (string) $token;
+
+                    $termStmt->reset();
+                    $termStmt->clear();
+                    $termStmt->bindValue(':token', $token, SQLITE3_TEXT);
+                    $termResult = $termStmt->execute();
+
+                    // A lost upsert would leave df too low and skew scoring silently.
+                    if ($termResult === false) {
+                        throw new \RuntimeException('Failed to update lexical term for token: ' . $token);
+                    }
+
+                    $termResult->finalize();
+
+                    $postingStmt->reset();
+                    $postingStmt->clear();
+                    $postingStmt->bindValue(':token', $token, SQLITE3_TEXT);
+                    $postingStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);
+                    $postingStmt->bindValue(':document_id', $documentId, SQLITE3_TEXT);
+
+                    if ($chunkIndex === null) {
+                        $postingStmt->bindValue(':chunk_index', null, SQLITE3_NULL);
+                    } else {
+                        $postingStmt->bindValue(':chunk_index', $chunkIndex, SQLITE3_INTEGER);
+                    }
+
+                    $postingStmt->bindValue(':tf', $stats['tf'], SQLITE3_INTEGER);
+                    $postingStmt->bindValue(':title_tf', $stats['title_tf'], SQLITE3_INTEGER);
+
+                    $postingResult = $postingStmt->execute();
+
+                    if ($postingResult === false) {
+                        throw new \RuntimeException('Failed to insert lexical posting for token: ' . $token);
+                    }
+
+                    $postingResult->finalize();
+                }
+            }
+
+            $this->writeMeta($db, $totalChunks);
+
+            if ($db->exec('COMMIT') === false) {
+                throw new \RuntimeException('Failed to commit lexical index transaction.');
+            }
+
+            $transactionOpen = false;
+        } catch (\Throwable $e) {
+            if ($transactionOpen) {
+                $db->exec('ROLLBACK');
+            }
+
+            throw $e;
+        } finally {
+            // Single close point: closing an already-closed stream throws a TypeError on PHP 8.
+            fclose($handle);
+        }
+
+        // Logged after the commit so a logger failure cannot undo a finished build pass.
+        $this->agentLogger->info('Lexical index streaming pass completed.', [
+            'indexed_chunks' => $totalChunks,
+            'source' => $indexNdjsonPath,
+        ]);
+    }
+
+    /**
+     * Counts, per distinct token, its occurrences in the chunk text (tf) and in
+     * the document title (title_tf).
+     *
+     * At most MAX_UNIQUE_TOKENS_PER_CHUNK distinct tokens are kept. Text tokens
+     * are admitted first in order of first occurrence, then title-only tokens,
+     * so title-only tokens are the first to be dropped when the cap is hit.
+     * Occurrence counts of admitted tokens are not affected by the cap.
+     *
+     * @return array<string, array{tf:int, title_tf:int}>
+     */
+    private function buildTokenStats(string $text, string $title): array
+    {
+        $stats = [];
+
+        foreach ($this->tokenize($text) as $term) {
+            if (isset($stats[$term])) {
+                $stats[$term]['tf']++;
+
+                continue;
+            }
+
+            if (count($stats) < self::MAX_UNIQUE_TOKENS_PER_CHUNK) {
+                $stats[$term] = ['tf' => 1, 'title_tf' => 0];
+            }
+        }
+
+        foreach ($this->tokenize($title) as $term) {
+            if (isset($stats[$term])) {
+                $stats[$term]['title_tf']++;
+
+                continue;
+            }
+
+            if (count($stats) < self::MAX_UNIQUE_TOKENS_PER_CHUNK) {
+                $stats[$term] = ['tf' => 0, 'title_tf' => 1];
+            }
+        }
+
+        return $stats;
+    }
+
+    /**
+     * Splits a text into index tokens, in order and with repeats preserved.
+     *
+     * The text is normalized (lowercased, punctuation replaced by spaces), split
+     * on whitespace, and filtered: tokens containing a digit are always kept,
+     * other tokens are dropped when shorter than two characters or when they are
+     * stop words.
+     *
+     * @return string[]
+     */
+    private function tokenize(string $value): array
+    {
+        $normalized = $this->normalizeText($value);
+
+        if ($normalized === '') {
+            return [];
+        }
+
+        $pieces = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];
+
+        // shouldIgnoreToken() also rejects the empty string.
+        $kept = array_filter(
+            $pieces,
+            fn (string $piece): bool => !$this->shouldIgnoreToken($piece)
+        );
+
+        return array_values($kept);
+    }
+
+    /**
+     * Tells whether a token is excluded from the index: empty tokens always,
+     * digit-bearing tokens never, other tokens when they are a single character
+     * or a stop word.
+     */
+    private function shouldIgnoreToken(string $token): bool
+    {
+        if ($token === '') {
+            return true;
+        }
+
+        if (preg_match('/\d/u', $token) === 1) {
+            return false;
+        }
+
+        $tooShort = mb_strlen($token, 'UTF-8') < 2;
+
+        return $tooShort ? true : StopWords::isStopWord($token);
+    }
+
+    /**
+     * Produces the canonical form used for tokenizing: lowercase, with '-', '/',
+     * '_' and all non-letter, non-number, non-whitespace characters replaced by
+     * spaces, whitespace collapsed and the ends trimmed.
+     */
+    private function normalizeText(string $value): string
+    {
+        $text = strtr(mb_strtolower(trim($value), 'UTF-8'), '-/_', '   ');
+
+        $patterns = ['/[^\p{L}\p{N}\s]+/u', '/\s+/u'];
+
+        foreach ($patterns as $pattern) {
+            // Keep the previous text when a replacement fails and returns null.
+            $text = preg_replace($pattern, ' ', $text) ?? $text;
+        }
+
+        return trim($text);
+    }
+
+    /**
+     * Returns the trimmed metadata.document_title of an NDJSON row, or '' when
+     * the row has no metadata array or no title.
+     */
+    private function extractDocumentTitle(array $row): string
+    {
+        $metadata = $row['metadata'] ?? null;
+
+        return is_array($metadata)
+            ? trim((string)($metadata['document_title'] ?? ''))
+            : '';
+    }
+
+    /**
+     * Accepts an int or an all-digit string as chunk index; anything else
+     * (including negative number strings and floats) becomes null.
+     */
+    private function normalizeChunkIndex(mixed $value): ?int
+    {
+        return match (true) {
+            is_int($value) => $value,
+            is_string($value) && ctype_digit($value) => (int)$value,
+            default => null,
+        };
+    }
+
+    /**
+     * Stores schema_version, built_at (ISO-8601 timestamp of now) and
+     * total_chunks in lexical_meta, replacing existing entries.
+     *
+     * @throws \RuntimeException When the statement cannot be prepared or a row cannot be written.
+     */
+    private function writeMeta(SQLite3 $db, int $totalChunks): void
+    {
+        $statement = $db->prepare('INSERT OR REPLACE INTO lexical_meta (key, value) VALUES (:key, :value)');
+
+        if ($statement === false) {
+            throw new \RuntimeException('Failed to prepare lexical meta statement.');
+        }
+
+        $entries = [
+            ['schema_version', '1'],
+            ['built_at', (new \DateTimeImmutable())->format(DATE_ATOM)],
+            ['total_chunks', (string)$totalChunks],
+        ];
+
+        foreach ($entries as [$name, $content]) {
+            $statement->reset();
+            $statement->clear();
+            $statement->bindValue(':key', $name, SQLITE3_TEXT);
+            $statement->bindValue(':value', $content, SQLITE3_TEXT);
+
+            $outcome = $statement->execute();
+
+            if ($outcome === false) {
+                throw new \RuntimeException('Failed to write lexical meta key: ' . $name);
+            }
+
+            $outcome->finalize();
+        }
+    }
+
+    /**
+     * Applies connection PRAGMAs and creates the lexical tables and indexes if
+     * they do not exist yet.
+     *
+     * Besides lexical_meta, lexical_terms and lexical_postings this also creates
+     * lexical_seen_chunks, the helper table buildFromNdjson() inserts into to
+     * detect chunk ids it has already processed.
+     *
+     * @throws \RuntimeException When the schema statements fail.
+     */
+    private function initializeSchema(SQLite3 $db): void
+    {
+        // PRAGMA results are not checked; only the schema creation below is.
+        $db->exec('PRAGMA journal_mode = DELETE');
+        $db->exec('PRAGMA synchronous = NORMAL');
+        $db->exec('PRAGMA temp_store = MEMORY');
+        $db->exec('PRAGMA foreign_keys = OFF');
+
+        $schema = <<<'SQL'
+CREATE TABLE IF NOT EXISTS lexical_meta (
+    key TEXT PRIMARY KEY,
+    value TEXT NOT NULL
+);
+
+CREATE TABLE IF NOT EXISTS lexical_terms (
+    token TEXT PRIMARY KEY,
+    df INTEGER NOT NULL
+);
+
+CREATE TABLE IF NOT EXISTS lexical_postings (
+    token TEXT NOT NULL,
+    chunk_id TEXT NOT NULL,
+    document_id TEXT NOT NULL,
+    chunk_index INTEGER NULL,
+    tf INTEGER NOT NULL,
+    title_tf INTEGER NOT NULL DEFAULT 0,
+    PRIMARY KEY (token, chunk_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_lexical_postings_document_token
+    ON lexical_postings (document_id, token);
+
+CREATE INDEX IF NOT EXISTS idx_lexical_postings_chunk
+    ON lexical_postings (chunk_id);
+
+CREATE TABLE IF NOT EXISTS lexical_seen_chunks (
+    chunk_id TEXT PRIMARY KEY
+);
+SQL;
+
+        if ($db->exec($schema) === false) {
+            throw new \RuntimeException('Failed to initialize lexical index schema.');
+        }
+    }
+
+    /**
+     * Opens (creating it if necessary) a read-write SQLite database at $path
+     * with a 5000 ms busy timeout.
+     *
+     * @throws \RuntimeException When the database cannot be opened; the original error is attached as previous.
+     */
+    private function openWritableDb(string $path): SQLite3
+    {
+        $flags = SQLITE3_OPEN_READWRITE | SQLITE3_OPEN_CREATE;
+
+        try {
+            $connection = new SQLite3($path, $flags);
+        } catch (\Throwable $failure) {
+            throw new \RuntimeException('Unable to open lexical index DB: ' . $path, 0, $failure);
+        }
+
+        $connection->busyTimeout(5000);
+
+        return $connection;
+    }
+
+    /**
+     * Absolute path of the source index.ndjson below the project directory.
+     */
+    private function getIndexNdjsonPath(): string
+    {
+        $root = rtrim($this->projectDir, '/');
+
+        return $root . self::DEFAULT_RELATIVE_NDJSON_PATH;
+    }
+
+    /**
+     * Absolute path of the lexical SQLite index below the project directory.
+     */
+    private function getLexicalIndexPath(): string
+    {
+        $root = rtrim($this->projectDir, '/');
+
+        return $root . self::DEFAULT_RELATIVE_INDEX_PATH;
+    }
+
+    /**
+     * Creates the directory that will hold the index file when it is missing.
+     *
+     * @throws \RuntimeException When the directory neither exists nor can be created.
+     */
+    private function ensureTargetDirectoryExists(string $finalIndexPath): void
+    {
+        $directory = dirname($finalIndexPath);
+
+        // The trailing is_dir() tolerates another process creating the directory concurrently.
+        if (!is_dir($directory) && !@mkdir($directory, 0775, true) && !is_dir($directory)) {
+            throw new \RuntimeException('Unable to create lexical index directory: ' . $directory);
+        }
+    }
+
+    /**
+     * Moves the freshly built temp file over the final index path.
+     *
+     * rename() is tried first; when it fails the file is copied instead and the
+     * temp file is removed either way. The final file is chmod'ed to 0664
+     * (best effort) before and after the move.
+     *
+     * @throws \RuntimeException When neither rename nor copy succeeds.
+     */
+    private function atomicReplace(string $tmpPath, string $finalPath): void
+    {
+        if (is_file($finalPath)) {
+            @chmod($finalPath, 0664);
+        }
+
+        $moved = @rename($tmpPath, $finalPath);
+
+        if (!$moved) {
+            $copied = @copy($tmpPath, $finalPath);
+            @unlink($tmpPath);
+
+            if (!$copied) {
+                throw new \RuntimeException('Atomic replace failed for lexical index: ' . $finalPath);
+            }
+        }
+
+        @chmod($finalPath, 0664);
+    }
+
+    /**
+     * Deletes $path when it is a regular file; failures are ignored.
+     */
+    private function removeFileIfExists(string $path): void
+    {
+        if (!is_file($path)) {
+            return;
+        }
+
+        @unlink($path);
+    }
+
+    /**
+     * Guards the build against a missing sqlite3 extension.
+     *
+     * @throws \RuntimeException When the SQLite3 class is not available.
+     */
+    private function assertSqliteAvailable(): void
+    {
+        if (class_exists(SQLite3::class)) {
+            return;
+        }
+
+        throw new \RuntimeException('The sqlite3 PHP extension is required for lexical index building.');
+    }
+}
--- a/src/Knowledge/Retrieval/QueryEnricher.php
+++ b/src/Knowledge/Retrieval/QueryEnricher.php
@@ -8,6 +8,14 @@ use App\Config\QueryEnricherConfig;

 final readonly class QueryEnricher
 {
+    /**
+     * Keep enrichment conservative.
+     *
+     * The enriched semantic query should help vector retrieval,
+     * but must not become bloated enough to dilute the original user intent.
+     */
+    private const MAX_EXPANSIONS = 4;
+
    public function __construct(
        private QueryEnricherConfig $config
    ) {
@@ -16,6 +24,12 @@ final readonly class QueryEnricher
    /**
     * Enriches the query with mapped counterpart terms.
     *
+     * Design goals:
+     * - preserve the original query unchanged at the front
+     * - only append counterpart terms that are not already present
+     * - prefer longer / more specific phrase matches over short generic matches
+     * - keep the number of appended terms intentionally small
+     *
     * Example:
     * - input:  "water hardness device"
     * - output: "water hardness device residual hardness model"
@@ -29,26 +43,63 @@ final readonly class QueryEnricher
        }

        $mapping = $this->config->getEnrichQueryList();
+
+        if ($mapping === []) {
+            return $originalQuery;
+        }
+
        $lookup = $this->buildBidirectionalLookup($mapping);
+
+        if ($lookup === []) {
+            return $originalQuery;
+        }
+
+        $lookup = $this->sortLookupBySpecificity($lookup);
        $normalizedQuery = $this->normalizeForMatching($originalQuery);

-        $matches = [];
+        if ($normalizedQuery === '') {
+            return $originalQuery;
+        }

-        foreach ($lookup as $needle => $mappedValue) {
-            if ($needle === '') {
+        $matches = [];
+        $seenNormalizedExpansions = [];
+
+        foreach ($lookup as $normalizedNeedle => $mappedValue) {
+            if ($normalizedNeedle === '') {
                continue;
            }

-            if ($this->containsWholePhrase($normalizedQuery, $needle)) {
-                $matches[] = $mappedValue;
+            if (!$this->containsWholePhrase($normalizedQuery, $normalizedNeedle)) {
+                continue;
+            }
+
+            $mappedValue = trim($mappedValue);
+            if ($mappedValue === '') {
+                continue;
+            }
+
+            $normalizedMappedValue = $this->normalizeForMatching($mappedValue);
+            if ($normalizedMappedValue === '') {
+                continue;
+            }
+
+            // Do not re-add information that is already present in the query.
+            if ($this->containsWholePhrase($normalizedQuery, $normalizedMappedValue)) {
+                continue;
+            }
+
+            if (isset($seenNormalizedExpansions[$normalizedMappedValue])) {
+                continue;
+            }
+
+            $matches[] = $mappedValue;
+            $seenNormalizedExpansions[$normalizedMappedValue] = true;
+
+            if (count($matches) >= self::MAX_EXPANSIONS) {
+                break;
            }
        }

-        $matches = array_values(array_unique(array_filter(
-            $matches,
-            static fn(string $value): bool => trim($value) !== ''
-        )));
-
        if ($matches === []) {
            return $originalQuery;
        }
@@ -106,6 +157,11 @@ final readonly class QueryEnricher
     *     'jacket'   => 'coat',
     *     'coat'     => 'jacket',
     * ]
+     *
+     * Returned format:
+     * [
+     *     '<normalized needle>' => '<original mapped value>',
+     * ]
     */
    private function buildBidirectionalLookup(array $mapping): array
    {
@@ -122,15 +178,49 @@ final readonly class QueryEnricher
            $normalizedKey = $this->normalizeForMatching($key);
            $normalizedValue = $this->normalizeForMatching($value);

-            if ($normalizedKey !== '') {
+            if ($normalizedKey !== '' && !isset($lookup[$normalizedKey])) {
                $lookup[$normalizedKey] = $value;
            }

-            if ($normalizedValue !== '') {
+            if ($normalizedValue !== '' && !isset($lookup[$normalizedValue])) {
                $lookup[$normalizedValue] = $key;
            }
        }

        return $lookup;
    }
+
+    /**
+     * Orders the lookup so that the most specific needles are tried first.
+     *
+     * Ordering criteria, in this order:
+     * 1. needles with more words come first
+     * 2. among equal word counts, longer needles come first
+     * 3. remaining ties are broken by byte-wise string comparison (ascending)
+     *
+     * @param array<string, string> $lookup
+     * @return array<string, string>
+     */
+    private function sortLookupBySpecificity(array $lookup): array
+    {
+        // [word count, character length] of a normalized needle.
+        $rank = static fn (string $phrase): array => [
+            substr_count($phrase, ' ') + 1,
+            mb_strlen($phrase, 'UTF-8'),
+        ];
+
+        uksort(
+            $lookup,
+            // Operands are swapped so that higher ranks sort first.
+            static fn (string $left, string $right): int => ($rank($right) <=> $rank($left))
+                ?: strcmp($left, $right)
+        );
+
+        return $lookup;
+    }
 }