528 lines
15 KiB
PHP
528 lines
15 KiB
PHP
<?php
|
|
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Knowledge\Retrieval;
|
|
|
|
use App\Knowledge\StopWords;
|
|
use Psr\Log\LoggerInterface;
|
|
use SQLite3;
|
|
|
|
/**
 * Builds a SQLite-backed lexical (token -> chunk) index from a newline-delimited
 * JSON chunk file located under the project directory.
 *
 * The index is written to a temporary file next to the final location and only
 * moved into place once the whole build has been committed.
 */
final readonly class NdjsonLexicalIndexBuilder
{
    private const DEFAULT_RELATIVE_NDJSON_PATH = '/var/knowledge/index.ndjson';
    private const DEFAULT_RELATIVE_INDEX_PATH = '/var/knowledge/lexical.index.sqlite';

    /**
     * Upper bound to avoid pathological chunks exploding the lexical index.
     * This stays generic and does not encode any domain-specific assumption.
     */
    private const MAX_UNIQUE_TOKENS_PER_CHUNK = 256;

    /**
     * @param string          $projectDir  Project root; a trailing "/" is tolerated.
     * @param LoggerInterface $agentLogger Receives build progress and failure messages.
     */
    public function __construct(
        private string $projectDir,
        private LoggerInterface $agentLogger,
    ) {
    }

    /**
     * Build a generic lexical SQLite index from index.ndjson.
     *
     * Output DB schema:
     *
     * lexical_meta(
     *   key TEXT PRIMARY KEY,
     *   value TEXT NOT NULL
     * )
     *
     * lexical_terms(
     *   token TEXT PRIMARY KEY,
     *   df INTEGER NOT NULL
     * )
     *
     * lexical_postings(
     *   token TEXT NOT NULL,
     *   chunk_id TEXT NOT NULL,
     *   document_id TEXT NOT NULL,
     *   chunk_index INTEGER,
     *   tf INTEGER NOT NULL,
     *   title_tf INTEGER NOT NULL DEFAULT 0,
     *   PRIMARY KEY(token, chunk_id)
     * )
     *
     * lexical_seen_chunks(
     *   chunk_id TEXT PRIMARY KEY
     * )
     *   Build-time de-duplication helper; it is created in the same database
     *   and is not dropped, so it is present in the output file as well.
     *
     * Design goals:
     * - generic, data-driven lexical retrieval base
     * - no domain keywords in core code
     * - no full scan per request later
     * - duplicate chunk_id lines in index.ndjson must not inflate the index
     *
     * When index.ndjson is missing or empty, any existing index (and leftover
     * temporary file) is removed and the method returns without building.
     *
     * @throws \Throwable Any failure is logged, the temporary file is removed,
     *                    and the original exception is rethrown.
     */
    public function build(): void
    {
        $this->assertSqliteAvailable();

        $indexNdjsonPath = $this->getIndexNdjsonPath();
        $lexicalIndexPath = $this->getLexicalIndexPath();
        $tmpPath = $lexicalIndexPath . '.tmp';

        if (!is_file($indexNdjsonPath) || filesize($indexNdjsonPath) === 0) {
            $this->removeFileIfExists($lexicalIndexPath);
            $this->removeFileIfExists($tmpPath);

            $this->agentLogger->info('Lexical index skipped because index.ndjson is missing or empty.', [
                'index_ndjson' => $indexNdjsonPath,
            ]);

            return;
        }

        $this->ensureTargetDirectoryExists($lexicalIndexPath);
        // A stale temp file from an aborted run would otherwise be reopened and extended.
        $this->removeFileIfExists($tmpPath);

        $db = $this->openWritableDb($tmpPath);

        try {
            $this->initializeSchema($db);
            $this->buildFromNdjson($db, $indexNdjsonPath);
            // The connection must be closed before the file is moved into place.
            $db->close();

            $this->atomicReplace($tmpPath, $lexicalIndexPath);

            $this->agentLogger->info('Lexical index build completed.', [
                'path' => $lexicalIndexPath,
            ]);
        } catch (\Throwable $e) {
            try {
                $db->close();
            } catch (\Throwable) {
                // Ignore close failures during cleanup.
            }

            $this->removeFileIfExists($tmpPath);

            $this->agentLogger->error('Lexical index build failed.', [
                'path' => $lexicalIndexPath,
                'error' => $e->getMessage(),
            ]);

            throw $e;
        }
    }

    /**
     * Stream index.ndjson into the database inside a single transaction.
     *
     * The file handle is closed exactly once (in `finally`), so a failure after
     * the read loop can no longer trigger a second fclose() that would mask the
     * original exception. BEGIN and COMMIT results are checked so a failed
     * commit is reported instead of being treated as a successful build.
     *
     * @throws \RuntimeException When the file cannot be opened or a SQL step fails.
     */
    private function buildFromNdjson(SQLite3 $db, string $indexNdjsonPath): void
    {
        $handle = @fopen($indexNdjsonPath, 'rb');

        if ($handle === false) {
            throw new \RuntimeException('Unable to read index.ndjson: ' . $indexNdjsonPath);
        }

        try {
            if ($db->exec('BEGIN IMMEDIATE TRANSACTION') === false) {
                throw new \RuntimeException(
                    'Failed to begin lexical index transaction: ' . $db->lastErrorMsg()
                );
            }

            try {
                $counters = $this->indexLines($db, $handle);
                $this->writeMeta($db, $counters['indexed']);

                if ($db->exec('COMMIT') === false) {
                    throw new \RuntimeException(
                        'Failed to commit lexical index transaction: ' . $db->lastErrorMsg()
                    );
                }
            } catch (\Throwable $e) {
                $this->rollbackQuietly($db);

                throw $e;
            }
        } finally {
            fclose($handle);
        }

        $this->agentLogger->info('Lexical index streaming pass completed.', [
            'indexed_chunks' => $counters['indexed'],
            'source' => $indexNdjsonPath,
            'lines_read' => $counters['lines'],
            'skipped_lines' => $counters['skipped'],
            'duplicate_chunks' => $counters['duplicates'],
        ]);
    }

    /**
     * Read every line from the handle and index each valid, not-yet-seen chunk.
     *
     * Blank lines are ignored. Lines that are not JSON objects/arrays, or that
     * lack a usable chunk_id, document_id or text, are counted as skipped.
     * Chunks whose chunk_id was already recorded are counted as duplicates.
     *
     * @param resource $handle Open, readable stream positioned at the start of the NDJSON data.
     *
     * @return array{lines:int, indexed:int, skipped:int, duplicates:int}
     *
     * @throws \RuntimeException When a statement cannot be prepared or executed.
     */
    private function indexLines(SQLite3 $db, $handle): array
    {
        $seenChunkStmt = $db->prepare(
            'INSERT OR IGNORE INTO lexical_seen_chunks (chunk_id) VALUES (:chunk_id)'
        );
        $termStmt = $db->prepare(
            'INSERT INTO lexical_terms (token, df)
             VALUES (:token, 1)
             ON CONFLICT(token) DO UPDATE SET df = df + 1'
        );
        $postingStmt = $db->prepare(
            'INSERT INTO lexical_postings (
                token,
                chunk_id,
                document_id,
                chunk_index,
                tf,
                title_tf
            ) VALUES (
                :token,
                :chunk_id,
                :document_id,
                :chunk_index,
                :tf,
                :title_tf
            )'
        );

        if ($seenChunkStmt === false || $termStmt === false || $postingStmt === false) {
            throw new \RuntimeException('Failed to prepare lexical index SQL statements.');
        }

        $counters = ['lines' => 0, 'indexed' => 0, 'skipped' => 0, 'duplicates' => 0];

        while (($line = fgets($handle)) !== false) {
            $counters['lines']++;
            $line = trim($line);

            if ($line === '') {
                continue;
            }

            $row = json_decode($line, true);

            if (!is_array($row)) {
                $counters['skipped']++;
                continue;
            }

            $chunkId = $this->readTrimmedString($row, 'chunk_id');
            $documentId = $this->readTrimmedString($row, 'document_id');
            $text = $this->readTrimmedString($row, 'text');

            if ($chunkId === '' || $documentId === '' || $text === '') {
                $counters['skipped']++;
                continue;
            }

            if (!$this->markChunkSeen($db, $seenChunkStmt, $chunkId)) {
                $counters['duplicates']++;
                continue;
            }

            $tokenStats = $this->buildTokenStats($text, $this->extractDocumentTitle($row));

            if ($tokenStats === []) {
                // Nothing indexable survived tokenization (e.g. stop words only).
                continue;
            }

            $counters['indexed']++;
            $chunkIndex = $this->normalizeChunkIndex($row['chunk_index'] ?? null);

            foreach ($tokenStats as $token => $stats) {
                // PHP turns all-digit array keys into ints; restore the string form.
                $token = (string)$token;

                $this->incrementDocumentFrequency($db, $termStmt, $token);
                $this->insertPosting($postingStmt, $token, $chunkId, $documentId, $chunkIndex, $stats);
            }
        }

        return $counters;
    }

    /**
     * Record a chunk id in lexical_seen_chunks.
     *
     * @return bool True when the id was newly inserted, false when it already existed.
     *
     * @throws \RuntimeException When the insert itself fails; without this check
     *                           changes() would report the previous statement's count.
     */
    private function markChunkSeen(SQLite3 $db, \SQLite3Stmt $seenChunkStmt, string $chunkId): bool
    {
        $seenChunkStmt->reset();
        $seenChunkStmt->clear();
        $seenChunkStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);

        $result = $seenChunkStmt->execute();

        if ($result === false) {
            throw new \RuntimeException(
                'Failed to record lexical chunk id: ' . $chunkId . ' (' . $db->lastErrorMsg() . ')'
            );
        }

        $result->finalize();

        // INSERT OR IGNORE changes zero rows when the chunk id is already present.
        return $db->changes() >= 1;
    }

    /**
     * Insert the token into lexical_terms with df = 1, or add one to its df.
     *
     * @throws \RuntimeException When the upsert fails, so df cannot drift from the postings.
     */
    private function incrementDocumentFrequency(SQLite3 $db, \SQLite3Stmt $termStmt, string $token): void
    {
        $termStmt->reset();
        $termStmt->clear();
        $termStmt->bindValue(':token', $token, SQLITE3_TEXT);

        $result = $termStmt->execute();

        if ($result === false) {
            throw new \RuntimeException(
                'Failed to update lexical term: ' . $token . ' (' . $db->lastErrorMsg() . ')'
            );
        }

        $result->finalize();
    }

    /**
     * Insert one (token, chunk) row into lexical_postings.
     *
     * @param array{tf:int, title_tf:int} $stats
     *
     * @throws \RuntimeException When the insert fails.
     */
    private function insertPosting(
        \SQLite3Stmt $postingStmt,
        string $token,
        string $chunkId,
        string $documentId,
        ?int $chunkIndex,
        array $stats,
    ): void {
        $postingStmt->reset();
        $postingStmt->clear();
        $postingStmt->bindValue(':token', $token, SQLITE3_TEXT);
        $postingStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);
        $postingStmt->bindValue(':document_id', $documentId, SQLITE3_TEXT);

        if ($chunkIndex === null) {
            $postingStmt->bindValue(':chunk_index', null, SQLITE3_NULL);
        } else {
            $postingStmt->bindValue(':chunk_index', $chunkIndex, SQLITE3_INTEGER);
        }

        $postingStmt->bindValue(':tf', $stats['tf'], SQLITE3_INTEGER);
        $postingStmt->bindValue(':title_tf', $stats['title_tf'], SQLITE3_INTEGER);

        $result = $postingStmt->execute();

        if ($result === false) {
            throw new \RuntimeException('Failed to insert lexical posting for token: ' . $token);
        }

        $result->finalize();
    }

    /**
     * Roll back the open transaction without letting a secondary failure
     * replace the exception that triggered the rollback.
     */
    private function rollbackQuietly(SQLite3 $db): void
    {
        try {
            $db->exec('ROLLBACK');
        } catch (\Throwable) {
            // The original failure is the one worth reporting.
        }
    }

    /**
     * Read a field as a trimmed string.
     *
     * Missing, null and non-scalar values (arrays/objects in the JSON) yield ''
     * instead of being cast to the literal string "Array".
     *
     * @param array<array-key, mixed> $row
     */
    private function readTrimmedString(array $row, string $key): string
    {
        $value = $row[$key] ?? null;

        return is_scalar($value) ? trim((string)$value) : '';
    }

    /**
     * Count term frequencies for the chunk text and the document title.
     *
     * Tokens are ordered text-first (by first occurrence), followed by tokens
     * that appear only in the title; the list is then capped at
     * MAX_UNIQUE_TOKENS_PER_CHUNK, so title-only tokens are the first to be cut.
     *
     * Keys consisting only of digits are stored by PHP as int keys.
     *
     * @return array<int|string, array{tf:int, title_tf:int}>
     */
    private function buildTokenStats(string $text, string $title): array
    {
        $textTf = array_count_values($this->tokenize($text));
        $titleTf = array_count_values($this->tokenize($title));

        // Array union keeps the left-hand keys and appends right-hand keys not yet present.
        $tokens = array_slice(
            array_keys($textTf + $titleTf),
            0,
            self::MAX_UNIQUE_TOKENS_PER_CHUNK,
        );

        $stats = [];

        foreach ($tokens as $token) {
            $stats[$token] = [
                'tf' => $textTf[$token] ?? 0,
                'title_tf' => $titleTf[$token] ?? 0,
            ];
        }

        return $stats;
    }

    /**
     * Generic tokenizer:
     * - lowercases
     * - removes punctuation
     * - preserves alphanumeric codes
     * - keeps numeric/code-like tokens even if short
     * - drops generic stop words for non-numeric tokens
     *
     * @return string[]
     */
    private function tokenize(string $value): array
    {
        $value = $this->normalizeText($value);

        if ($value === '') {
            return [];
        }

        $parts = preg_split('/\s+/u', $value, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return array_values(array_filter(
            $parts,
            fn (string $token): bool => !$this->shouldIgnoreToken($token),
        ));
    }

    /**
     * Decide whether a normalized token is left out of the index.
     *
     * Tokens containing a digit are always kept; other tokens are dropped when
     * shorter than two characters or when StopWords reports them as stop words.
     */
    private function shouldIgnoreToken(string $token): bool
    {
        if ($token === '') {
            return true;
        }

        if (preg_match('/\d/u', $token) === 1) {
            return false;
        }

        if (mb_strlen($token, 'UTF-8') < 2) {
            return true;
        }

        return StopWords::isStopWord($token);
    }

    /**
     * Lowercase the value, turn "-", "/", "_" and every other character that is
     * not a letter, number or whitespace into a space, and collapse whitespace.
     */
    private function normalizeText(string $value): string
    {
        $value = mb_strtolower(trim($value), 'UTF-8');
        $value = str_replace(['-', '/', '_'], ' ', $value);
        // preg_replace() returns null on a PCRE error; keep the previous value then.
        $value = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $value) ?? $value;
        $value = preg_replace('/\s+/u', ' ', $value) ?? $value;

        return trim($value);
    }

    /**
     * Return the trimmed metadata.document_title of a row, or '' when the
     * metadata entry is absent, not an array, or the title is not a scalar.
     *
     * @param array<array-key, mixed> $row
     */
    private function extractDocumentTitle(array $row): string
    {
        $metadata = $row['metadata'] ?? null;

        if (!is_array($metadata)) {
            return '';
        }

        return $this->readTrimmedString($metadata, 'document_title');
    }

    /**
     * Accept an int or an all-digit string as chunk index; anything else is null.
     */
    private function normalizeChunkIndex(mixed $value): ?int
    {
        return match (true) {
            is_int($value) => $value,
            is_string($value) && ctype_digit($value) => (int)$value,
            default => null,
        };
    }

    /**
     * Write schema_version, built_at and total_chunks into lexical_meta.
     *
     * @throws \RuntimeException When the statement cannot be prepared or a key cannot be written.
     */
    private function writeMeta(SQLite3 $db, int $totalChunks): void
    {
        $metaStmt = $db->prepare(
            'INSERT OR REPLACE INTO lexical_meta (key, value) VALUES (:key, :value)'
        );

        if ($metaStmt === false) {
            throw new \RuntimeException('Failed to prepare lexical meta statement.');
        }

        $meta = [
            'schema_version' => '1',
            'built_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
            'total_chunks' => (string)$totalChunks,
        ];

        foreach ($meta as $key => $value) {
            $metaStmt->reset();
            $metaStmt->clear();
            $metaStmt->bindValue(':key', $key, SQLITE3_TEXT);
            $metaStmt->bindValue(':value', $value, SQLITE3_TEXT);

            $result = $metaStmt->execute();

            if ($result === false) {
                throw new \RuntimeException('Failed to write lexical meta key: ' . $key);
            }

            $result->finalize();
        }
    }

    /**
     * Apply the build-time PRAGMAs and create all tables and indexes.
     *
     * @throws \RuntimeException When the schema script fails.
     */
    private function initializeSchema(SQLite3 $db): void
    {
        $db->exec('PRAGMA journal_mode = DELETE');
        $db->exec('PRAGMA synchronous = NORMAL');
        $db->exec('PRAGMA temp_store = MEMORY');
        $db->exec('PRAGMA foreign_keys = OFF');

        $schema = <<<'SQL'
        CREATE TABLE IF NOT EXISTS lexical_meta (
            key TEXT PRIMARY KEY,
            value TEXT NOT NULL
        );

        CREATE TABLE IF NOT EXISTS lexical_terms (
            token TEXT PRIMARY KEY,
            df INTEGER NOT NULL
        );

        CREATE TABLE IF NOT EXISTS lexical_postings (
            token TEXT NOT NULL,
            chunk_id TEXT NOT NULL,
            document_id TEXT NOT NULL,
            chunk_index INTEGER NULL,
            tf INTEGER NOT NULL,
            title_tf INTEGER NOT NULL DEFAULT 0,
            PRIMARY KEY (token, chunk_id)
        );

        CREATE INDEX IF NOT EXISTS idx_lexical_postings_document_token
            ON lexical_postings (document_id, token);

        CREATE INDEX IF NOT EXISTS idx_lexical_postings_chunk
            ON lexical_postings (chunk_id);

        CREATE TABLE IF NOT EXISTS lexical_seen_chunks (
            chunk_id TEXT PRIMARY KEY
        );
        SQL;

        if ($db->exec($schema) === false) {
            throw new \RuntimeException('Failed to initialize lexical index schema.');
        }
    }

    /**
     * Open (creating if needed) the SQLite file at the given path for writing.
     *
     * @throws \RuntimeException When the database cannot be opened.
     */
    private function openWritableDb(string $path): SQLite3
    {
        try {
            $db = new SQLite3($path, SQLITE3_OPEN_READWRITE | SQLITE3_OPEN_CREATE);
        } catch (\Throwable $e) {
            throw new \RuntimeException('Unable to open lexical index DB: ' . $path, 0, $e);
        }

        $db->busyTimeout(5000);

        return $db;
    }

    /**
     * Absolute path of the NDJSON source file below the project directory.
     */
    private function getIndexNdjsonPath(): string
    {
        return rtrim($this->projectDir, '/') . self::DEFAULT_RELATIVE_NDJSON_PATH;
    }

    /**
     * Absolute path of the final SQLite index below the project directory.
     */
    private function getLexicalIndexPath(): string
    {
        return rtrim($this->projectDir, '/') . self::DEFAULT_RELATIVE_INDEX_PATH;
    }

    /**
     * Create the directory that will hold the index file if it does not exist.
     *
     * @throws \RuntimeException When the directory cannot be created.
     */
    private function ensureTargetDirectoryExists(string $finalIndexPath): void
    {
        $dir = dirname($finalIndexPath);

        if (is_dir($dir)) {
            return;
        }

        // The second is_dir() covers another process creating the directory concurrently.
        if (!@mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException('Unable to create lexical index directory: ' . $dir);
        }
    }

    /**
     * Move the freshly built temp file over the final index path.
     *
     * rename() is tried first; if it fails, the file is copied and the temp
     * file removed (this fallback is not atomic).
     *
     * @throws \RuntimeException When neither rename nor copy succeeds.
     */
    private function atomicReplace(string $tmpPath, string $finalPath): void
    {
        if (is_file($finalPath)) {
            @chmod($finalPath, 0664);
        }

        if (!@rename($tmpPath, $finalPath)) {
            $copied = @copy($tmpPath, $finalPath);
            @unlink($tmpPath);

            if (!$copied) {
                throw new \RuntimeException('Atomic replace failed for lexical index: ' . $finalPath);
            }
        }

        @chmod($finalPath, 0664);
    }

    /**
     * Delete the file when it exists; failures are deliberately ignored.
     */
    private function removeFileIfExists(string $path): void
    {
        if (is_file($path)) {
            @unlink($path);
        }
    }

    /**
     * @throws \RuntimeException When the sqlite3 extension is not loaded.
     */
    private function assertSqliteAvailable(): void
    {
        if (!class_exists(SQLite3::class)) {
            throw new \RuntimeException('The sqlite3 PHP extension is required for lexical index building.');
        }
    }
}