assertSqliteAvailable();

        $indexNdjsonPath = $this->getIndexNdjsonPath();
        $lexicalIndexPath = $this->getLexicalIndexPath();
        $tmpPath = $lexicalIndexPath . '.tmp';

        if (!is_file($indexNdjsonPath) || filesize($indexNdjsonPath) === 0) {
            $this->removeFileIfExists($lexicalIndexPath);
            $this->removeFileIfExists($tmpPath);
            $this->agentLogger->info('Lexical index skipped because index.ndjson is missing or empty.', [
                'index_ndjson' => $indexNdjsonPath,
            ]);

            return;
        }

        $this->ensureTargetDirectoryExists($lexicalIndexPath);
        $this->removeFileIfExists($tmpPath);

        $db = $this->openWritableDb($tmpPath);

        try {
            $this->initializeSchema($db);
            $this->buildFromNdjson($db, $indexNdjsonPath);
            $db->close();

            $this->atomicReplace($tmpPath, $lexicalIndexPath);

            $this->agentLogger->info('Lexical index build completed.', [
                'path' => $lexicalIndexPath,
            ]);
        } catch (\Throwable $e) {
            try {
                $db->close();
            } catch (\Throwable) {
                // Ignore close failures during cleanup.
            }

            $this->removeFileIfExists($tmpPath);
            $this->agentLogger->error('Lexical index build failed.', [
                'path' => $lexicalIndexPath,
                'error' => $e->getMessage(),
            ]);

            throw $e;
        }
    }

    /**
     * Streams index.ndjson line by line and fills the lexical tables inside one transaction.
     *
     * Blank lines, lines that do not decode to a JSON array, and rows without a usable
     * chunk_id, document_id or text are skipped. A chunk_id is indexed only the first time
     * it is seen; later rows with the same chunk_id are ignored. For every token of an
     * indexed chunk the document frequency in lexical_terms is incremented and one row is
     * written to lexical_postings. On any failure the transaction is rolled back and the
     * original exception is rethrown.
     *
     * @param SQLite3 $db              Open database whose schema has already been created.
     * @param string  $indexNdjsonPath Path of the NDJSON file to read.
     *
     * @throws \RuntimeException When the file cannot be opened, the transaction cannot be
     *                           started or committed, or a statement fails.
     */
    private function buildFromNdjson(SQLite3 $db, string $indexNdjsonPath): void
    {
        $handle = @fopen($indexNdjsonPath, 'rb');
        if ($handle === false) {
            throw new \RuntimeException('Unable to read index.ndjson: ' . $indexNdjsonPath);
        }

        // JSON fields may hold arrays; casting those to string would yield the literal
        // "Array" (plus a warning), so anything non-scalar is treated as missing.
        $toTrimmedString = static fn (mixed $value): string => is_scalar($value) ? trim((string)$value) : '';

        $totalChunks = 0;
        $transactionOpen = false;

        try {
            if ($db->exec('BEGIN IMMEDIATE TRANSACTION') === false) {
                throw new \RuntimeException('Failed to begin lexical index transaction.');
            }
            $transactionOpen = true;

            $seenChunkStmt = $db->prepare(
                'INSERT OR IGNORE INTO lexical_seen_chunks (chunk_id) VALUES (:chunk_id)'
            );
            $termStmt = $db->prepare(
                'INSERT INTO lexical_terms (token, df) VALUES (:token, 1)
                 ON CONFLICT(token) DO UPDATE SET df = df + 1'
            );
            $postingStmt = $db->prepare(
                'INSERT INTO lexical_postings (
                    token, chunk_id, document_id, chunk_index, tf, title_tf
                 ) VALUES (
                    :token, :chunk_id, :document_id, :chunk_index, :tf, :title_tf
                 )'
            );

            if (!$seenChunkStmt || !$termStmt || !$postingStmt) {
                throw new \RuntimeException('Failed to prepare lexical index SQL statements.');
            }

            while (($line = fgets($handle)) !== false) {
                $line = trim($line);
                if ($line === '') {
                    continue;
                }

                $row = json_decode($line, true);
                if (!is_array($row)) {
                    continue;
                }

                $chunkId = $toTrimmedString($row['chunk_id'] ?? '');
                $documentId = $toTrimmedString($row['document_id'] ?? '');
                $chunkIndex = $this->normalizeChunkIndex($row['chunk_index'] ?? null);
                $text = $toTrimmedString($row['text'] ?? '');

                if ($chunkId === '' || $documentId === '' || $text === '') {
                    continue;
                }

                $seenChunkStmt->reset();
                $seenChunkStmt->clear();
                $seenChunkStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);
                $seenResult = $seenChunkStmt->execute();
                if ($seenResult === false) {
                    // Without this check the changes() call below would report the row
                    // count of an earlier statement.
                    throw new \RuntimeException('Failed to record lexical chunk: ' . $chunkId);
                }
                $seenResult->finalize();

                // INSERT OR IGNORE changes no row when the chunk_id is already present,
                // which marks this row as a duplicate chunk.
                if ($db->changes() < 1) {
                    continue;
                }

                $title = $this->extractDocumentTitle($row);
                $tokenStats = $this->buildTokenStats($text, $title);
                if ($tokenStats === []) {
                    continue;
                }

                $totalChunks++;

                foreach ($tokenStats as $token => $stats) {
                    // PHP turns numeric-string array keys into integers; restore the string.
                    $token = (string)$token;

                    $termStmt->reset();
                    $termStmt->clear();
                    $termStmt->bindValue(':token', $token, SQLITE3_TEXT);
                    $termResult = $termStmt->execute();
                    if ($termResult === false) {
                        throw new \RuntimeException('Failed to update lexical term for token: ' . $token);
                    }
                    $termResult->finalize();

                    $postingStmt->reset();
                    $postingStmt->clear();
                    $postingStmt->bindValue(':token', $token, SQLITE3_TEXT);
                    $postingStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);
                    $postingStmt->bindValue(':document_id', $documentId, SQLITE3_TEXT);
                    if ($chunkIndex === null) {
                        $postingStmt->bindValue(':chunk_index', null, SQLITE3_NULL);
                    } else {
                        $postingStmt->bindValue(':chunk_index', $chunkIndex, SQLITE3_INTEGER);
                    }
                    $postingStmt->bindValue(':tf', $stats['tf'], SQLITE3_INTEGER);
                    $postingStmt->bindValue(':title_tf', $stats['title_tf'], SQLITE3_INTEGER);

                    $postingResult = $postingStmt->execute();
                    if ($postingResult === false) {
                        throw new \RuntimeException('Failed to insert lexical posting for token: ' . $token);
                    }
                    $postingResult->finalize();
                }
            }

            $this->writeMeta($db, $totalChunks);

            if ($db->exec('COMMIT') === false) {
                throw new \RuntimeException('Failed to commit lexical index transaction.');
            }
            $transactionOpen = false;
        } catch (\Throwable $e) {
            if ($transactionOpen) {
                try {
                    $db->exec('ROLLBACK');
                } catch (\Throwable) {
                    // Ignore rollback failures; the original error is the one to report.
                }
            }

            throw $e;
        } finally {
            // Closed exactly once, on success and on failure alike. Closing an already
            // closed stream raises a TypeError on PHP 8, which would hide the real error.
            fclose($handle);
        }

        $this->agentLogger->info('Lexical index streaming pass completed.', [
            'indexed_chunks' => $totalChunks,
            'source' => $indexNdjsonPath,
        ]);
    }

    /**
     * Counts how often each token occurs in a chunk's text and in its document title.
     *
     * Tokens from the text come first, followed by tokens that appear only in the title;
     * the list is then capped at self::MAX_UNIQUE_TOKENS_PER_CHUNK entries. Purely numeric
     * tokens appear as integer keys because PHP converts numeric-string array keys.
     *
     * @return array<int|string, array{tf: int, title_tf: int}> Stats keyed by token; empty
     *                                                          when neither input has tokens.
     */
    private function buildTokenStats(string $text, string $title): array
    {
        $textTf = array_count_values($this->tokenize($text));
        $titleTf = array_count_values($this->tokenize($title));

        // The array union keeps every text token in first-seen order and appends the
        // title-only tokens after them.
        $tokens = array_keys($textTf + $titleTf);
        if ($tokens === []) {
            return [];
        }

        if (count($tokens) > self::MAX_UNIQUE_TOKENS_PER_CHUNK) {
            $tokens = array_slice($tokens, 0, self::MAX_UNIQUE_TOKENS_PER_CHUNK);
        }

        $stats = [];
        foreach ($tokens as $token) {
            $stats[$token] = [
                'tf' => $textTf[$token] ?? 0,
                'title_tf' => $titleTf[$token] ?? 0,
            ];
        }

        return $stats;
    }

    /**
     * Generic tokenizer:
     * - lowercases
     * - removes punctuation
     * - preserves alphanumeric codes
     * - keeps numeric/code-like tokens even if short
     * - drops generic stop words for non-numeric tokens
     *
     * @return string[]
     */
    private function tokenize(string $value): array
    {
        $value = $this->normalizeText($value);
        if ($value === '') {
            return [];
        }

        // preg_split returns false on a PCRE error (e.g. invalid UTF-8); treat as no tokens.
        $parts = preg_split('/\s+/u', $value, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return array_values(array_filter(
            $parts,
            fn (string $token): bool => !$this->shouldIgnoreToken($token),
        ));
    }

    /**
     * Decides whether a normalized token is left out of the index.
     *
     * Empty tokens are ignored. Any token containing a digit is kept. Digit-free tokens
     * are ignored when shorter than two characters or when StopWords reports them as a
     * stop word.
     */
    private function shouldIgnoreToken(string $token): bool
    {
        if ($token === '') {
            return true;
        }

        if (preg_match('/\d/u', $token) === 1) {
            return false;
        }

        if (mb_strlen($token, 'UTF-8') < 2) {
            return true;
        }

        return StopWords::isStopWord($token);
    }

    /**
     * Lowercases the text, turns '-', '/', '_' and every other character that is not a
     * letter, number or whitespace into a space, and collapses whitespace runs.
     *
     * When a regex replacement fails (preg_replace returns null) the value from the
     * previous step is kept.
     */
    private function normalizeText(string $value): string
    {
        $value = mb_strtolower(trim($value), 'UTF-8');
        $value = str_replace(['-', '/', '_'], ' ', $value);
        $value = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $value) ?? $value;
        $value = preg_replace('/\s+/u', ' ', $value) ?? $value;

        return trim($value);
    }

    /**
     * Reads metadata.document_title from a decoded NDJSON row.
     *
     * @param array $row Decoded row.
     *
     * @return string Trimmed title, or '' when metadata is not an array or the title is
     *                absent or not a scalar value.
     */
    private function extractDocumentTitle(array $row): string
    {
        $metadata = $row['metadata'] ?? null;
        if (!is_array($metadata)) {
            return '';
        }

        $title = $metadata['document_title'] ?? '';

        // An array here would otherwise be cast to the literal string "Array".
        return is_scalar($title) ? trim((string)$title) : '';
    }

    /**
     * Converts a raw chunk_index value to an integer.
     *
     * Integers are returned unchanged and strings consisting only of digits are cast to
     * int; every other value yields null.
     */
    private function normalizeChunkIndex(mixed $value): ?int
    {
        if (is_int($value)) {
            return $value;
        }

        if (is_string($value) && ctype_digit($value)) {
            return (int)$value;
        }

        return null;
    }

    /**
     * Writes the schema_version, built_at and total_chunks rows into lexical_meta.
     *
     * @throws \RuntimeException When the statement cannot be prepared or a row cannot be written.
     */
    private function writeMeta(SQLite3 $db, int $totalChunks): void
    {
        $metaStmt = $db->prepare(
            'INSERT OR REPLACE INTO lexical_meta (key, value) VALUES (:key, :value)'
        );
        if ($metaStmt === false) {
            throw new \RuntimeException('Failed to prepare lexical meta statement.');
        }

        $meta = [
            'schema_version' => '1',
            'built_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
            'total_chunks' => (string)$totalChunks,
        ];

        foreach ($meta as $key => $value) {
            $metaStmt->reset();
            $metaStmt->clear();
            $metaStmt->bindValue(':key', $key, SQLITE3_TEXT);
            $metaStmt->bindValue(':value', $value, SQLITE3_TEXT);

            $result = $metaStmt->execute();
            if ($result === false) {
                throw new \RuntimeException('Failed to write lexical meta key: ' . $key);
            }
            $result->finalize();
        }
    }

    /**
     * Applies the connection PRAGMAs and creates the lexical tables and indexes if absent.
     *
     * The PRAGMA results are not checked; only a failure of the schema script is reported.
     *
     * @throws \RuntimeException When the schema script fails.
     */
    private function initializeSchema(SQLite3 $db): void
    {
        $db->exec('PRAGMA journal_mode = DELETE');
        $db->exec('PRAGMA synchronous = NORMAL');
        $db->exec('PRAGMA temp_store = MEMORY');
        $db->exec('PRAGMA foreign_keys = OFF');

        $schema = <<<'SQL'
            CREATE TABLE IF NOT EXISTS lexical_meta (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS lexical_terms (
                token TEXT PRIMARY KEY,
                df INTEGER NOT NULL
            );

            CREATE TABLE IF NOT EXISTS lexical_postings (
                token TEXT NOT NULL,
                chunk_id TEXT NOT NULL,
                document_id TEXT NOT NULL,
                chunk_index INTEGER NULL,
                tf INTEGER NOT NULL,
                title_tf INTEGER NOT NULL DEFAULT 0,
                PRIMARY KEY (token, chunk_id)
            );

            CREATE INDEX IF NOT EXISTS idx_lexical_postings_document_token
                ON lexical_postings (document_id, token);

            CREATE INDEX IF NOT EXISTS idx_lexical_postings_chunk
                ON lexical_postings (chunk_id);

            CREATE TABLE IF NOT EXISTS lexical_seen_chunks (
                chunk_id TEXT PRIMARY KEY
            );
            SQL;

        if ($db->exec($schema) === false) {
            throw new \RuntimeException('Failed to initialize lexical index schema.');
        }
    }

    /**
     * Opens (creating it if necessary) a read/write SQLite database with a 5 second busy timeout.
     *
     * @throws \RuntimeException When the database cannot be opened; the cause is chained.
     */
    private function openWritableDb(string $path): SQLite3
    {
        try {
            $db = new SQLite3($path, SQLITE3_OPEN_READWRITE | SQLITE3_OPEN_CREATE);
        } catch (\Throwable $e) {
            throw new \RuntimeException('Unable to open lexical index DB: ' . $path, 0, $e);
        }

        $db->busyTimeout(5000);

        return $db;
    }

    /**
     * Builds the NDJSON source path from the project directory (trailing '/' stripped)
     * and self::DEFAULT_RELATIVE_NDJSON_PATH.
     */
    private function getIndexNdjsonPath(): string
    {
        return rtrim($this->projectDir, '/') . self::DEFAULT_RELATIVE_NDJSON_PATH;
    }

    /**
     * Builds the lexical index path from the project directory (trailing '/' stripped)
     * and self::DEFAULT_RELATIVE_INDEX_PATH.
     */
    private function getLexicalIndexPath(): string
    {
        return rtrim($this->projectDir, '/') . self::DEFAULT_RELATIVE_INDEX_PATH;
    }

    /**
     * Creates the parent directory of the final index file when it does not exist yet.
     *
     * @throws \RuntimeException When the directory cannot be created.
     */
    private function ensureTargetDirectoryExists(string $finalIndexPath): void
    {
        $dir = dirname($finalIndexPath);
        if (is_dir($dir)) {
            return;
        }

        // The second is_dir() check tolerates another process creating the directory
        // between the first check and mkdir().
        if (!@mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException('Unable to create lexical index directory: ' . $dir);
        }
    }

    /**
     * Moves the freshly built temp file over the final index path.
     *
     * rename() is tried first. When it fails the file is copied instead, which is no
     * longer a single-step replacement. The temp file is removed in every fallback
     * outcome and the final file is chmod-ed to 0664 on a best-effort basis.
     *
     * @throws \RuntimeException When both rename and copy fail.
     */
    private function atomicReplace(string $tmpPath, string $finalPath): void
    {
        if (is_file($finalPath)) {
            @chmod($finalPath, 0664);
        }

        if (!@rename($tmpPath, $finalPath)) {
            if (!@copy($tmpPath, $finalPath)) {
                @unlink($tmpPath);
                throw new \RuntimeException('Atomic replace failed for lexical index: ' . $finalPath);
            }

            @unlink($tmpPath);
        }

        @chmod($finalPath, 0664);
    }

    /**
     * Deletes the path when it is a regular file; unlink failures are suppressed.
     */
    private function removeFileIfExists(string $path): void
    {
        if (is_file($path)) {
            @unlink($path);
        }
    }

    /**
     * Verifies that the SQLite3 class is available.
     *
     * @throws \RuntimeException When the sqlite3 extension is not loaded.
     */
    private function assertSqliteAvailable(): void
    {
        if (!class_exists(SQLite3::class)) {
            throw new \RuntimeException('The sqlite3 PHP extension is required for lexical index building.');
        }
    }
}