MtoRagSystem/src/Knowledge/Retrieval/NdjsonLexicalIndexBuilder.php

<?php


declare(strict_types=1);

namespace App\Knowledge\Retrieval;

use App\Knowledge\StopWords;
use Psr\Log\LoggerInterface;
use SQLite3;

final readonly class NdjsonLexicalIndexBuilder
{
    private const DEFAULT_RELATIVE_NDJSON_PATH = '/var/knowledge/index.ndjson';
    private const DEFAULT_RELATIVE_INDEX_PATH = '/var/knowledge/lexical.index.sqlite';

    /**
     * Upper bound to avoid pathological chunks exploding the lexical index.
     * This stays generic and does not encode any domain-specific assumption.
     */
    private const MAX_UNIQUE_TOKENS_PER_CHUNK = 256;

    /**
     * @param string          $projectDir  Base directory; the NDJSON source path and the SQLite
     *                                     index path are derived from it by appending fixed
     *                                     relative paths.
     * @param LoggerInterface $agentLogger Receives the informational and error messages emitted
     *                                     while the index is built.
     */
    public function __construct(
        private string $projectDir,
        private LoggerInterface $agentLogger,
    ) {
    }

    /**
     * Rebuild the generic lexical SQLite index from index.ndjson.
     *
     * The database is written to "<final path>.tmp" first and only moved over
     * the final path after the streaming pass has finished and the connection
     * has been closed. When index.ndjson is missing or empty, both the final
     * and the temporary file are removed and nothing is built.
     *
     * Output DB schema:
     *
     * lexical_meta(
     *   key TEXT PRIMARY KEY,
     *   value TEXT NOT NULL
     * )
     *
     * lexical_terms(
     *   token TEXT PRIMARY KEY,
     *   df INTEGER NOT NULL
     * )
     *
     * lexical_postings(
     *   token TEXT NOT NULL,
     *   chunk_id TEXT NOT NULL,
     *   document_id TEXT NOT NULL,
     *   chunk_index INTEGER,
     *   tf INTEGER NOT NULL,
     *   title_tf INTEGER NOT NULL DEFAULT 0,
     *   PRIMARY KEY(token, chunk_id)
     * )
     *
     * A helper table lexical_seen_chunks(chunk_id TEXT PRIMARY KEY) is created
     * as well; it is what de-duplicates chunk ids during the build.
     *
     * Design goals:
     * - generic, data-driven lexical retrieval base
     * - no domain keywords in core code
     * - no full scan per request later
     * - duplicate chunk_id lines in index.ndjson must not inflate the index
     *
     * @throws \Throwable Any failure is logged, the temporary file is removed,
     *                    and the original error is rethrown.
     */
    public function build(): void
    {
        $this->assertSqliteAvailable();

        $sourcePath = $this->getIndexNdjsonPath();
        $targetPath = $this->getLexicalIndexPath();
        $stagingPath = $targetPath . '.tmp';

        $hasUsableSource = is_file($sourcePath) && filesize($sourcePath) !== 0;

        if (!$hasUsableSource) {
            // Nothing to index: make sure no stale index (or half-built file) stays behind.
            foreach ([$targetPath, $stagingPath] as $stalePath) {
                $this->removeFileIfExists($stalePath);
            }

            $this->agentLogger->info(
                'Lexical index skipped because index.ndjson is missing or empty.',
                ['index_ndjson' => $sourcePath],
            );

            return;
        }

        $this->ensureTargetDirectoryExists($targetPath);
        $this->removeFileIfExists($stagingPath);

        $connection = $this->openWritableDb($stagingPath);

        try {
            $this->initializeSchema($connection);
            $this->buildFromNdjson($connection, $sourcePath);

            // Close before moving the file so the handle is released first.
            $connection->close();

            $this->atomicReplace($stagingPath, $targetPath);

            $this->agentLogger->info('Lexical index build completed.', ['path' => $targetPath]);
        } catch (\Throwable $failure) {
            try {
                $connection->close();
            } catch (\Throwable) {
                // Ignore close failures during cleanup.
            }

            $this->removeFileIfExists($stagingPath);

            $this->agentLogger->error('Lexical index build failed.', [
                'path' => $targetPath,
                'error' => $failure->getMessage(),
            ]);

            throw $failure;
        }
    }

    /**
     * Stream index.ndjson into the lexical tables inside a single transaction.
     *
     * Every non-empty line is decoded as JSON. Lines that do not decode to an
     * array, or that lack a usable scalar chunk_id, document_id or text, are
     * skipped. A chunk_id is indexed only the first time it appears, so
     * duplicate lines do not inflate df/tf values. For each indexed chunk every
     * retained token bumps its document frequency once and gets one posting row.
     *
     * The source handle is closed exactly once (in `finally`), and a failed
     * BEGIN, COMMIT or statement execution aborts the build instead of being
     * silently ignored, so a partially written database is never published.
     *
     * @param SQLite3 $db              Open, writable database whose schema already exists.
     * @param string  $indexNdjsonPath Path of the NDJSON source file.
     *
     * @throws \RuntimeException If the source cannot be opened or an SQL step fails.
     */
    private function buildFromNdjson(SQLite3 $db, string $indexNdjsonPath): void
    {
        $handle = @fopen($indexNdjsonPath, 'rb');

        if ($handle === false) {
            throw new \RuntimeException('Unable to read index.ndjson: ' . $indexNdjsonPath);
        }

        $transactionOpen = false;
        $totalChunks = 0;

        try {
            if ($db->exec('BEGIN IMMEDIATE TRANSACTION') === false) {
                throw new \RuntimeException(
                    'Failed to begin lexical index transaction: ' . $db->lastErrorMsg()
                );
            }

            $transactionOpen = true;

            $seenChunkStmt = $db->prepare(
                'INSERT OR IGNORE INTO lexical_seen_chunks (chunk_id) VALUES (:chunk_id)'
            );
            $termStmt = $db->prepare(
                'INSERT INTO lexical_terms (token, df)
                 VALUES (:token, 1)
                 ON CONFLICT(token) DO UPDATE SET df = df + 1'
            );
            $postingStmt = $db->prepare(
                'INSERT INTO lexical_postings (
                    token,
                    chunk_id,
                    document_id,
                    chunk_index,
                    tf,
                    title_tf
                 ) VALUES (
                    :token,
                    :chunk_id,
                    :document_id,
                    :chunk_index,
                    :tf,
                    :title_tf
                 )'
            );

            if (!$seenChunkStmt || !$termStmt || !$postingStmt) {
                throw new \RuntimeException('Failed to prepare lexical index SQL statements.');
            }

            while (($line = fgets($handle)) !== false) {
                $line = trim($line);

                if ($line === '') {
                    continue;
                }

                $row = json_decode($line, true);

                if (!is_array($row)) {
                    // Malformed or non-object line: best-effort skip.
                    continue;
                }

                $chunkId = $this->readScalarField($row, 'chunk_id');
                $documentId = $this->readScalarField($row, 'document_id');
                $chunkIndex = $this->normalizeChunkIndex($row['chunk_index'] ?? null);
                $text = $this->readScalarField($row, 'text');

                if ($chunkId === '' || $documentId === '' || $text === '') {
                    continue;
                }

                $seenChunkStmt->reset();
                $seenChunkStmt->clear();
                $seenChunkStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);
                $seenResult = $seenChunkStmt->execute();

                if ($seenResult === false) {
                    // Without this check changes() below would report the previous statement.
                    throw new \RuntimeException('Failed to record lexical chunk id: ' . $chunkId);
                }

                $seenResult->finalize();

                // INSERT OR IGNORE changed no row: this chunk_id was already indexed.
                if ($db->changes() < 1) {
                    continue;
                }

                $title = $this->extractDocumentTitle($row);
                $tokenStats = $this->buildTokenStats($text, $title);

                if ($tokenStats === []) {
                    continue;
                }

                $totalChunks++;

                foreach ($tokenStats as $token => $stats) {
                    // Purely numeric tokens come back as integer array keys.
                    $token = (string)$token;

                    $termStmt->reset();
                    $termStmt->clear();
                    $termStmt->bindValue(':token', $token, SQLITE3_TEXT);
                    $termResult = $termStmt->execute();

                    if ($termResult === false) {
                        throw new \RuntimeException(
                            'Failed to update lexical term frequency for token: ' . $token
                        );
                    }

                    $termResult->finalize();

                    $postingStmt->reset();
                    $postingStmt->clear();
                    $postingStmt->bindValue(':token', $token, SQLITE3_TEXT);
                    $postingStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);
                    $postingStmt->bindValue(':document_id', $documentId, SQLITE3_TEXT);

                    if ($chunkIndex === null) {
                        $postingStmt->bindValue(':chunk_index', null, SQLITE3_NULL);
                    } else {
                        $postingStmt->bindValue(':chunk_index', $chunkIndex, SQLITE3_INTEGER);
                    }

                    $postingStmt->bindValue(':tf', $stats['tf'], SQLITE3_INTEGER);
                    $postingStmt->bindValue(':title_tf', $stats['title_tf'], SQLITE3_INTEGER);

                    $postingResult = $postingStmt->execute();

                    if ($postingResult === false) {
                        throw new \RuntimeException('Failed to insert lexical posting for token: ' . $token);
                    }

                    $postingResult->finalize();
                }
            }

            $this->writeMeta($db, $totalChunks);

            if ($db->exec('COMMIT') === false) {
                throw new \RuntimeException(
                    'Failed to commit lexical index transaction: ' . $db->lastErrorMsg()
                );
            }

            $transactionOpen = false;
        } catch (\Throwable $e) {
            if ($transactionOpen) {
                try {
                    $db->exec('ROLLBACK');
                } catch (\Throwable) {
                    // Keep the original failure; the temporary database is discarded anyway.
                }
            }

            throw $e;
        } finally {
            // Single close point: closing an already closed stream raises a TypeError in PHP 8.
            fclose($handle);
        }

        $this->agentLogger->info('Lexical index streaming pass completed.', [
            'indexed_chunks' => $totalChunks,
            'source' => $indexNdjsonPath,
        ]);
    }

    /**
     * Read a row field as a trimmed string, treating missing or non-scalar values as empty.
     *
     * @param array<array-key, mixed> $row Decoded NDJSON row.
     */
    private function readScalarField(array $row, string $key): string
    {
        $value = $row[$key] ?? null;

        // Nested JSON arrays would otherwise be cast to the literal string "Array".
        return is_scalar($value) ? trim((string)$value) : '';
    }

    /**
     * Count how often each token occurs in the chunk text and in the document title.
     *
     * Text tokens come first (in first-seen order), followed by tokens that only
     * occur in the title; at most MAX_UNIQUE_TOKENS_PER_CHUNK distinct tokens are
     * kept. Note that PHP stores purely numeric tokens as integer array keys.
     *
     * @return array<string, array{tf:int, title_tf:int}>
     */
    private function buildTokenStats(string $text, string $title): array
    {
        $bodyCounts = array_count_values($this->tokenize($text));
        $titleCounts = array_count_values($this->tokenize($title));

        // Array union keeps every text token and appends title-only tokens after them.
        $vocabulary = array_keys($bodyCounts + $titleCounts);

        if ($vocabulary === []) {
            return [];
        }

        $kept = array_slice($vocabulary, 0, self::MAX_UNIQUE_TOKENS_PER_CHUNK);
        $stats = [];

        foreach ($kept as $token) {
            $stats[$token] = [
                'tf' => $bodyCounts[$token] ?? 0,
                'title_tf' => $titleCounts[$token] ?? 0,
            ];
        }

        return $stats;
    }

    /**
     * Generic tokenizer.
     *
     * The value is normalized (lowercased, punctuation turned into spaces),
     * split on whitespace, and every part rejected by shouldIgnoreToken() is
     * dropped. Tokens containing a digit are always kept, so short numeric or
     * code-like tokens survive; other tokens must be at least two characters
     * long and must not be stop words.
     *
     * @return string[]
     */
    private function tokenize(string $value): array
    {
        $normalized = $this->normalizeText($value);

        if ($normalized === '') {
            return [];
        }

        $candidates = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY);

        if ($candidates === false || $candidates === []) {
            return [];
        }

        $accepted = array_filter(
            $candidates,
            fn (string $candidate): bool => !$this->shouldIgnoreToken($candidate),
        );

        // Re-index so callers receive a plain list.
        return array_values($accepted);
    }

    /**
     * Decide whether a token is left out of the index.
     *
     * Empty tokens are ignored. Any token containing a digit is kept regardless
     * of its length. Remaining tokens are ignored when they are shorter than two
     * characters or when StopWords::isStopWord() reports them as stop words.
     */
    private function shouldIgnoreToken(string $token): bool
    {
        $containsDigit = $token !== '' && preg_match('/\d/u', $token) === 1;

        if ($containsDigit) {
            return false;
        }

        // Also covers the empty string, whose length is zero.
        if (mb_strlen($token, 'UTF-8') < 2) {
            return true;
        }

        return StopWords::isStopWord($token);
    }

    /**
     * Lowercase the value and reduce it to letters, numbers and single spaces.
     *
     * Hyphens, slashes and underscores become spaces, every other run of
     * characters that is neither a letter, a number nor whitespace becomes a
     * space, and whitespace runs collapse to one space. If a regex step fails,
     * the text from the previous step is used unchanged.
     */
    private function normalizeText(string $value): string
    {
        $lowered = mb_strtolower(trim($value), 'UTF-8');
        $spaced = strtr($lowered, ['-' => ' ', '/' => ' ', '_' => ' ']);

        $stripped = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $spaced);

        if ($stripped === null) {
            $stripped = $spaced;
        }

        $collapsed = preg_replace('/\s+/u', ' ', $stripped);

        return trim($collapsed ?? $stripped);
    }

    /**
     * Read the document title from the row's `metadata.document_title` entry.
     *
     * Returns an empty string when `metadata` is not an array, when the title is
     * missing, or when the title is not a scalar. The scalar check matters
     * because casting a nested JSON array to string raises a warning and yields
     * the literal "Array", which would then be indexed as a title token.
     *
     * @param array<array-key, mixed> $row Decoded NDJSON row.
     */
    private function extractDocumentTitle(array $row): string
    {
        $metadata = $row['metadata'] ?? null;

        if (!is_array($metadata)) {
            return '';
        }

        $title = $metadata['document_title'] ?? '';

        if (!is_scalar($title)) {
            return '';
        }

        return trim((string)$title);
    }

    /**
     * Convert a raw chunk index to an integer.
     *
     * Integers are returned unchanged and strings made only of digits are cast
     * to int; every other value (null, floats, signed or empty strings, ...)
     * yields null.
     */
    private function normalizeChunkIndex(mixed $value): ?int
    {
        return match (true) {
            is_int($value) => $value,
            is_string($value) && ctype_digit($value) => (int)$value,
            default => null,
        };
    }

    /**
     * Store the build metadata rows (schema_version, built_at, total_chunks) in lexical_meta.
     *
     * Existing keys are replaced. All values are written as text.
     *
     * @param SQLite3 $db          Database that already contains the lexical_meta table.
     * @param int     $totalChunks Number of chunks that were indexed.
     *
     * @throws \RuntimeException If the statement cannot be prepared or a row cannot be written.
     */
    private function writeMeta(SQLite3 $db, int $totalChunks): void
    {
        $upsert = $db->prepare(
            'INSERT OR REPLACE INTO lexical_meta (key, value) VALUES (:key, :value)'
        );

        if ($upsert === false) {
            throw new \RuntimeException('Failed to prepare lexical meta statement.');
        }

        $builtAt = (new \DateTimeImmutable())->format(DATE_ATOM);

        $entries = [
            'schema_version' => '1',
            'built_at' => $builtAt,
            'total_chunks' => (string)$totalChunks,
        ];

        foreach ($entries as $name => $content) {
            $upsert->reset();
            $upsert->clear();
            $upsert->bindValue(':key', $name, SQLITE3_TEXT);
            $upsert->bindValue(':value', $content, SQLITE3_TEXT);

            $outcome = $upsert->execute();

            if ($outcome === false) {
                throw new \RuntimeException('Failed to write lexical meta key: ' . $name);
            }

            $outcome->finalize();
        }
    }

    /**
     * Apply the connection pragmas and create the lexical tables and indexes.
     *
     * Besides lexical_meta, lexical_terms and lexical_postings (with its two
     * secondary indexes) this also creates lexical_seen_chunks, the table used
     * to de-duplicate chunk ids during the build. Pragma results are not
     * checked; only a failing schema script raises.
     *
     * @throws \RuntimeException If the schema script cannot be executed.
     */
    private function initializeSchema(SQLite3 $db): void
    {
        $pragmas = [
            'PRAGMA journal_mode = DELETE',
            'PRAGMA synchronous = NORMAL',
            'PRAGMA temp_store = MEMORY',
            'PRAGMA foreign_keys = OFF',
        ];

        foreach ($pragmas as $pragma) {
            $db->exec($pragma);
        }

        $ddl = <<<'SQL'
CREATE TABLE IF NOT EXISTS lexical_meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS lexical_terms (
    token TEXT PRIMARY KEY,
    df INTEGER NOT NULL
);

CREATE TABLE IF NOT EXISTS lexical_postings (
    token TEXT NOT NULL,
    chunk_id TEXT NOT NULL,
    document_id TEXT NOT NULL,
    chunk_index INTEGER NULL,
    tf INTEGER NOT NULL,
    title_tf INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (token, chunk_id)
);

CREATE INDEX IF NOT EXISTS idx_lexical_postings_document_token
    ON lexical_postings (document_id, token);

CREATE INDEX IF NOT EXISTS idx_lexical_postings_chunk
    ON lexical_postings (chunk_id);

CREATE TABLE IF NOT EXISTS lexical_seen_chunks (
    chunk_id TEXT PRIMARY KEY
);
SQL;

        $created = $db->exec($ddl);

        if ($created === false) {
            throw new \RuntimeException('Failed to initialize lexical index schema.');
        }
    }

    /**
     * Open (creating it if necessary) the SQLite database at the given path for writing.
     *
     * The connection gets a 5000 ms busy timeout before it is returned.
     *
     * @throws \RuntimeException If the database cannot be opened; the cause is attached as previous.
     */
    private function openWritableDb(string $path): SQLite3
    {
        $mode = SQLITE3_OPEN_CREATE | SQLITE3_OPEN_READWRITE;

        try {
            $connection = new SQLite3($path, $mode);
        } catch (\Throwable $cause) {
            throw new \RuntimeException('Unable to open lexical index DB: ' . $path, 0, $cause);
        }

        $connection->busyTimeout(5000);

        return $connection;
    }

    /**
     * Path of the NDJSON source: the project directory (without trailing slashes)
     * followed by DEFAULT_RELATIVE_NDJSON_PATH.
     */
    private function getIndexNdjsonPath(): string
    {
        $baseDir = rtrim($this->projectDir, '/');

        return $baseDir . self::DEFAULT_RELATIVE_NDJSON_PATH;
    }

    /**
     * Path of the SQLite index: the project directory (without trailing slashes)
     * followed by DEFAULT_RELATIVE_INDEX_PATH.
     */
    private function getLexicalIndexPath(): string
    {
        $baseDir = rtrim($this->projectDir, '/');

        return $baseDir . self::DEFAULT_RELATIVE_INDEX_PATH;
    }

    /**
     * Make sure the directory that will hold the index file exists, creating it
     * recursively (mode 0775) when needed.
     *
     * @throws \RuntimeException If the directory is missing and cannot be created.
     */
    private function ensureTargetDirectoryExists(string $finalIndexPath): void
    {
        $directory = dirname($finalIndexPath);

        if (is_dir($directory)) {
            return;
        }

        $created = @mkdir($directory, 0775, true);

        // A failed mkdir is fine if the directory exists by now (e.g. created concurrently).
        if ($created || is_dir($directory)) {
            return;
        }

        throw new \RuntimeException('Unable to create lexical index directory: ' . $directory);
    }

    /**
     * Move the freshly built temporary database over the final index path.
     *
     * rename() is tried first; if it fails the file is copied instead. In the
     * fallback path the temporary file is removed whether or not the copy
     * succeeded. The final file is chmod-ed to 0664 on a best-effort basis
     * before and after the replacement.
     *
     * @throws \RuntimeException If neither rename nor copy succeeds.
     */
    private function atomicReplace(string $tmpPath, string $finalPath): void
    {
        if (is_file($finalPath)) {
            @chmod($finalPath, 0664);
        }

        $moved = @rename($tmpPath, $finalPath);

        if (!$moved) {
            $copied = @copy($tmpPath, $finalPath);

            // The temporary file is discarded on both outcomes of the copy.
            @unlink($tmpPath);

            if (!$copied) {
                throw new \RuntimeException('Atomic replace failed for lexical index: ' . $finalPath);
            }
        }

        @chmod($finalPath, 0664);
    }

    /**
     * Delete the given path when it is a regular file; deletion failures are ignored.
     */
    private function removeFileIfExists(string $path): void
    {
        if (!is_file($path)) {
            return;
        }

        @unlink($path);
    }

    /**
     * Fail fast when the SQLite3 class is not available.
     *
     * @throws \RuntimeException If the sqlite3 extension is not loaded.
     */
    private function assertSqliteAvailable(): void
    {
        if (class_exists(SQLite3::class)) {
            return;
        }

        throw new \RuntimeException('The sqlite3 PHP extension is required for lexical index building.');
    }
}