lexical logic

This commit is contained in:
team2
2026-04-20 21:46:42 +02:00
parent 2587ac8b4b
commit 065f59c090
9 changed files with 2576 additions and 326 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,451 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Knowledge\StopWords;
use Psr\Log\LoggerInterface;
use SQLite3;
final readonly class NdjsonKeywordRetriever
{
private const DEFAULT_RELATIVE_INDEX_PATH = '/var/knowledge/lexical.index.sqlite';
private const MAX_LIMIT = 100;
private const MAX_QUERY_TOKENS = 12;
/**
 * @param string          $projectDir  Base directory; the lexical index path is built by appending
 *                                     DEFAULT_RELATIVE_INDEX_PATH to it (trailing "/" is stripped).
 * @param LoggerInterface $agentLogger Receives the warning/error entries emitted when the index
 *                                     cannot be opened or a search fails.
 */
public function __construct(
private string $projectDir,
private LoggerInterface $agentLogger,
) {
}
/**
* Generic lexical retrieval against a prebuilt SQLite index.
*
* Expected DB schema (to be created by the lexical index builder):
*
* lexical_meta(
* key TEXT PRIMARY KEY,
* value TEXT NOT NULL
* )
*
* lexical_terms(
* token TEXT PRIMARY KEY,
* df INTEGER NOT NULL
* )
*
* lexical_postings(
* token TEXT NOT NULL,
* chunk_id TEXT NOT NULL,
* document_id TEXT NOT NULL,
* chunk_index INTEGER,
* tf INTEGER NOT NULL,
* title_tf INTEGER NOT NULL DEFAULT 0,
* PRIMARY KEY(token, chunk_id)
* )
*
* This retriever contains no domain-specific keyword logic.
* It only uses generic token overlap, rarity, title hits, and numeric/code emphasis.
*
* @param string[] $docIds Optional document scope
*
* @return array<int, array{
* chunk_id:string,
* score:float,
* document_id:?string,
* chunk_index:?int
* }>
*/
public function search(string $query, int $limit = 10, array $docIds = []): array
{
    $maxResults = $this->clampLimit($limit);
    $parsed = $this->analyzeQuery($query);
    $queryTokens = $parsed['tokens'];

    // Nothing searchable is left once the query has been normalized and filtered.
    if ($queryTokens === []) {
        return [];
    }

    // A null handle means the index could not be opened; that is reported as "no hits".
    $index = $this->openReadOnlyDb();
    if ($index === null) {
        return [];
    }

    try {
        $chunkTotal = $this->loadTotalChunks($index);
        $postings = $this->loadPostings($index, $queryTokens, $docIds);

        return $postings === []
            ? []
            : $this->scoreRows(
                $postings,
                $queryTokens,
                $parsed['numeric_tokens'],
                $chunkTotal,
                $maxResults
            );
    } catch (\Throwable $failure) {
        // Any failure while querying is logged and degrades to an empty result set.
        $this->agentLogger->error('Keyword retriever failed', [
            'error' => $failure->getMessage(),
        ]);

        return [];
    } finally {
        // Runs on every path above, including the early returns inside the try block.
        $index->close();
    }
}
/**
 * Normalizes a raw query and extracts the distinct tokens used for the index lookup.
 *
 * Tokens rejected by shouldIgnoreToken() are dropped, duplicates are removed (first
 * occurrence wins) and the token list is capped at MAX_QUERY_TOKENS. Tokens containing
 * at least one digit are additionally listed in `numeric_tokens`; that list is collected
 * before the cap is applied.
 *
 * @return array{
 *     normalized_query:string,
 *     tokens:string[],
 *     numeric_tokens:string[]
 * }
 */
private function analyzeQuery(string $query): array
{
    $normalized = $this->normalizeText($query);
    if ($normalized === '') {
        return ['normalized_query' => '', 'tokens' => [], 'numeric_tokens' => []];
    }

    $words = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];

    $kept = array_filter(
        $words,
        fn (string $word): bool => $word !== '' && !$this->shouldIgnoreToken($word),
    );
    $distinct = array_values(array_unique($kept));

    $withDigits = array_values(array_filter(
        $distinct,
        static fn (string $word): bool => preg_match('/\d/u', $word) === 1,
    ));

    return [
        'normalized_query' => $normalized,
        'tokens' => array_slice($distinct, 0, self::MAX_QUERY_TOKENS),
        'numeric_tokens' => $withDigits,
    ];
}
/**
 * Decides whether a normalized token is excluded from the query.
 *
 * Evaluated top to bottom: empty tokens are ignored, tokens containing a digit are always
 * kept, other tokens shorter than two characters are ignored, and everything else is
 * delegated to StopWords::isStopWord().
 */
private function shouldIgnoreToken(string $token): bool
{
    return match (true) {
        $token === '' => true,
        preg_match('/\d/u', $token) === 1 => false,
        mb_strlen($token, 'UTF-8') < 2 => true,
        default => StopWords::isStopWord($token),
    };
}
/**
 * Lowercases the input and reduces it to letters, digits and single spaces.
 *
 * "-", "/" and "_" are turned into spaces first; every remaining run of characters that is
 * neither a Unicode letter, a Unicode number nor whitespace becomes a space as well.
 * Whitespace runs are collapsed and the result is trimmed. If a regex call fails (returns
 * null) the text from the previous step is carried forward unchanged.
 */
private function normalizeText(string $value): string
{
    $lowered = mb_strtolower(trim($value), 'UTF-8');
    $separated = strtr($lowered, ['-' => ' ', '/' => ' ', '_' => ' ']);
    $stripped = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $separated) ?? $separated;
    $collapsed = preg_replace('/\s+/u', ' ', $stripped) ?? $stripped;

    return trim($collapsed);
}
/**
 * Opens the lexical index in read-only mode.
 *
 * Returns null when the SQLite3 class is not available (logged as a warning), when the index
 * file does not exist (not logged), or when opening the database throws (logged as an error).
 */
private function openReadOnlyDb(): ?SQLite3
{
    if (!class_exists(SQLite3::class)) {
        $this->agentLogger->warning('Keyword retriever unavailable: sqlite3 extension missing.');

        return null;
    }

    $indexFile = $this->getIndexPath();
    if (!is_file($indexFile)) {
        return null;
    }

    try {
        $connection = new SQLite3($indexFile, SQLITE3_OPEN_READONLY);
        $connection->busyTimeout(1000);
    } catch (\Throwable $failure) {
        $this->agentLogger->error('Unable to open lexical index', [
            'path' => $indexFile,
            'error' => $failure->getMessage(),
        ]);

        return null;
    }

    return $connection;
}
/**
 * Builds the index file path: project directory without trailing "/" plus
 * DEFAULT_RELATIVE_INDEX_PATH.
 */
private function getIndexPath(): string
{
    $root = rtrim($this->projectDir, '/');

    return $root . self::DEFAULT_RELATIVE_INDEX_PATH;
}
/**
 * Reads the `total_chunks` entry from lexical_meta.
 *
 * Always returns at least 1: that is the fallback when the statement cannot be prepared or
 * executed, when the key is missing, and when the stored value is below 1.
 */
private function loadTotalChunks(SQLite3 $db): int
{
    $statement = $db->prepare('SELECT value FROM lexical_meta WHERE key = :key');
    if ($statement === false) {
        return 1;
    }

    $statement->bindValue(':key', 'total_chunks', SQLITE3_TEXT);

    $cursor = $statement->execute();
    if ($cursor === false) {
        return 1;
    }

    // fetchArray() yields false when no row exists; the ?? below covers that case too.
    $record = $cursor->fetchArray(SQLITE3_ASSOC);
    $cursor->finalize();

    $stored = (int) ($record['value'] ?? 0);

    return max(1, $stored);
}
/**
 * Fetches every posting row for the given tokens, joined with the token's document frequency.
 *
 * When $docIds contains at least one non-empty string, the result is restricted to those
 * documents. Rows with an empty token, chunk_id or document_id are skipped. `tf` and `df`
 * are raised to at least 1 and `title_tf` to at least 0. An empty list is returned when the
 * statement cannot be prepared or executed.
 *
 * @param string[] $tokens
 * @param string[] $docIds
 * @return array<int, array{
 *     token:string,
 *     chunk_id:string,
 *     document_id:string,
 *     chunk_index:?int,
 *     tf:int,
 *     title_tf:int,
 *     df:int
 * }>
 */
private function loadPostings(SQLite3 $db, array $tokens, array $docIds): array
{
    if ($tokens === []) {
        return [];
    }

    // One named placeholder per token, derived from the token's array key.
    $tokenMarkers = array_map(
        static fn (int|string $position): string => ':t' . $position,
        array_keys($tokens),
    );

    $sql = '
SELECT
p.token,
p.chunk_id,
p.document_id,
p.chunk_index,
p.tf,
p.title_tf,
lt.df
FROM lexical_postings p
INNER JOIN lexical_terms lt ON lt.token = p.token
WHERE p.token IN (' . implode(', ', $tokenMarkers) . ')
';

    // Keep only distinct, non-empty string ids and reindex them from zero.
    $scope = array_values(array_unique(array_filter(
        $docIds,
        static fn (mixed $candidate): bool => is_string($candidate) && $candidate !== '',
    )));

    if ($scope !== []) {
        $docMarkers = array_map(
            static fn (int $position): string => ':d' . $position,
            array_keys($scope),
        );
        $sql .= ' AND p.document_id IN (' . implode(', ', $docMarkers) . ')';
    }

    $statement = $db->prepare($sql);
    if ($statement === false) {
        return [];
    }

    foreach ($tokens as $position => $term) {
        $statement->bindValue(':t' . $position, $term, SQLITE3_TEXT);
    }
    foreach ($scope as $position => $documentScopeId) {
        $statement->bindValue(':d' . $position, $documentScopeId, SQLITE3_TEXT);
    }

    $cursor = $statement->execute();
    if ($cursor === false) {
        return [];
    }

    $postings = [];
    while (($record = $cursor->fetchArray(SQLITE3_ASSOC)) !== false) {
        $chunkId = (string) ($record['chunk_id'] ?? '');
        $documentId = (string) ($record['document_id'] ?? '');
        $term = (string) ($record['token'] ?? '');

        if ($chunkId === '' || $documentId === '' || $term === '') {
            continue;
        }

        $rawIndex = $record['chunk_index'] ?? null;

        $postings[] = [
            'token' => $term,
            'chunk_id' => $chunkId,
            'document_id' => $documentId,
            'chunk_index' => is_numeric($rawIndex) ? (int) $rawIndex : null,
            'tf' => max(1, (int) ($record['tf'] ?? 1)),
            'title_tf' => max(0, (int) ($record['title_tf'] ?? 0)),
            'df' => max(1, (int) ($record['df'] ?? 1)),
        ];
    }
    $cursor->finalize();

    return $postings;
}
/**
 * Aggregates posting rows into one relevance score per chunk and returns the best hits.
 *
 * Each (token, chunk) row contributes `idf * tfBoost * numericBoost + titleBonus`, where
 *   - idf          = ln(1 + totalChunks / (1 + df))
 *   - tfBoost      = 1 + 0.20 * min(3, tf)
 *   - numericBoost = 1.60 when the token is listed in $numericTokens, otherwise 1.0
 *   - titleBonus   = 0.75 * idf when title_tf > 0, otherwise 0
 *
 * The per-chunk sum is scaled by `0.65 + 0.35 * coverage`, coverage being the share of
 * query tokens the chunk matched. Scores are then divided by the best score, so the first
 * result always has score 1.0, and rounded to six decimals.
 *
 * @param array<int, array{
 *     token:string,
 *     chunk_id:string,
 *     document_id:string,
 *     chunk_index:?int,
 *     tf:int,
 *     title_tf:int,
 *     df:int
 * }> $rows
 * @param string[] $queryTokens
 * @param string[] $numericTokens
 *
 * @return array<int, array{
 *     chunk_id:string,
 *     score:float,
 *     document_id:?string,
 *     chunk_index:?int
 * }> At most $limit entries, best score first.
 */
private function scoreRows(
    array $rows,
    array $queryTokens,
    array $numericTokens,
    int $totalChunks,
    int $limit
): array {
    if ($rows === []) {
        return [];
    }

    $numericLookup = array_fill_keys($numericTokens, true);
    $queryTokenCount = max(1, count($queryTokens));

    $scores = [];
    $meta = [];
    $matchedTokens = [];

    foreach ($rows as $row) {
        $chunkId = $row['chunk_id'];
        $token = $row['token'];

        $idf = log(1.0 + ($totalChunks / max(1.0, (float) (1 + $row['df']))));
        $tfBoost = 1.0 + (min(3, $row['tf']) * 0.20);
        $numericBoost = isset($numericLookup[$token]) ? 1.60 : 1.0;
        $titleBonus = $row['title_tf'] > 0 ? ($idf * 0.75) : 0.0;

        $scores[$chunkId] = ($scores[$chunkId] ?? 0.0)
            + ($idf * $tfBoost * $numericBoost)
            + $titleBonus;

        $matchedTokens[$chunkId][$token] = true;

        // Document id and chunk index are taken from the first row seen for the chunk.
        $meta[$chunkId] ??= [
            'document_id' => $row['document_id'],
            'chunk_index' => $row['chunk_index'],
        ];
    }

    foreach ($scores as $chunkId => $score) {
        $coverage = count($matchedTokens[$chunkId] ?? []) / $queryTokenCount;
        $scores[$chunkId] = $score * (0.65 + (0.35 * $coverage));
    }

    arsort($scores);

    $topScore = (float) reset($scores);
    if ($topScore <= 0.0) {
        return [];
    }

    $out = [];
    foreach ($scores as $chunkId => $score) {
        $out[] = [
            // PHP stores integer-like string keys ("123") as int keys. Cast back so the
            // documented chunk_id:string shape holds for numeric chunk ids as well.
            'chunk_id' => (string) $chunkId,
            'score' => round($score / $topScore, 6),
            'document_id' => $meta[$chunkId]['document_id'] ?? null,
            'chunk_index' => $meta[$chunkId]['chunk_index'] ?? null,
        ];

        if (count($out) >= $limit) {
            break;
        }
    }

    return $out;
}
/**
 * Restricts the requested result count to the range 1..MAX_LIMIT.
 */
private function clampLimit(int $limit): int
{
    return max(1, min(self::MAX_LIMIT, $limit));
}
}

View File

@@ -0,0 +1,528 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Knowledge\StopWords;
use Psr\Log\LoggerInterface;
use SQLite3;
final readonly class NdjsonLexicalIndexBuilder
{
private const DEFAULT_RELATIVE_NDJSON_PATH = '/var/knowledge/index.ndjson';
private const DEFAULT_RELATIVE_INDEX_PATH = '/var/knowledge/lexical.index.sqlite';
/**
* Upper bound to avoid pathological chunks exploding the lexical index.
* This stays generic and does not encode any domain-specific assumption.
*/
private const MAX_UNIQUE_TOKENS_PER_CHUNK = 256;
/**
 * @param string          $projectDir  Base directory; the NDJSON source path and the SQLite
 *                                     target path are built by appending the DEFAULT_RELATIVE_*
 *                                     constants to it (trailing "/" is stripped).
 * @param LoggerInterface $agentLogger Receives the info/error entries written during a build.
 */
public function __construct(
private string $projectDir,
private LoggerInterface $agentLogger,
)
{
}
/**
* Build a generic lexical SQLite index from index.ndjson.
*
* Output DB schema:
*
* lexical_meta(
* key TEXT PRIMARY KEY,
* value TEXT NOT NULL
* )
*
* lexical_terms(
* token TEXT PRIMARY KEY,
* df INTEGER NOT NULL
* )
*
* lexical_postings(
* token TEXT NOT NULL,
* chunk_id TEXT NOT NULL,
* document_id TEXT NOT NULL,
* chunk_index INTEGER,
* tf INTEGER NOT NULL,
* title_tf INTEGER NOT NULL DEFAULT 0,
* PRIMARY KEY(token, chunk_id)
* )
*
* Design goals:
* - generic, data-driven lexical retrieval base
* - no domain keywords in core code
* - no full scan per request later
* - duplicate chunk_id lines in index.ndjson must not inflate the index
*/
public function build(): void
{
    $this->assertSqliteAvailable();

    $sourcePath = $this->getIndexNdjsonPath();
    $targetPath = $this->getLexicalIndexPath();
    $stagingPath = $targetPath . '.tmp';

    // Without usable source data both the published index and any leftover staging file go.
    $hasSource = is_file($sourcePath) && filesize($sourcePath) !== 0;
    if (!$hasSource) {
        $this->removeFileIfExists($targetPath);
        $this->removeFileIfExists($stagingPath);
        $this->agentLogger->info('Lexical index skipped because index.ndjson is missing or empty.', [
            'index_ndjson' => $sourcePath,
        ]);

        return;
    }

    $this->ensureTargetDirectoryExists($targetPath);
    $this->removeFileIfExists($stagingPath);

    // The index is built in a staging file and only swapped in once it is complete.
    $staging = $this->openWritableDb($stagingPath);

    try {
        $this->initializeSchema($staging);
        $this->buildFromNdjson($staging, $sourcePath);
        $staging->close();

        $this->atomicReplace($stagingPath, $targetPath);

        $this->agentLogger->info('Lexical index build completed.', [
            'path' => $targetPath,
        ]);
    } catch (\Throwable $failure) {
        try {
            $staging->close();
        } catch (\Throwable) {
            // A failing close must not hide the build error.
        }

        $this->removeFileIfExists($stagingPath);

        $this->agentLogger->error('Lexical index build failed.', [
            'path' => $targetPath,
            'error' => $failure->getMessage(),
        ]);

        throw $failure;
    }
}
/**
 * Streams index.ndjson into the lexical tables inside a single transaction.
 *
 * Every non-empty line is JSON-decoded. Lines that do not decode to an array, or that have an
 * empty chunk_id, document_id or text, are skipped. A chunk_id is indexed only the first time
 * it is registered in lexical_seen_chunks; repeated ids are skipped. For each indexed chunk
 * every token increments its document frequency in lexical_terms and gets one row in
 * lexical_postings. After the last line the meta rows are written and the transaction is
 * committed; on any failure it is rolled back and the error is rethrown.
 *
 * The source handle is closed exactly once in a finally block. Closing it both on the success
 * path and again in the error handler would raise a TypeError on PHP 8 that hides the original
 * failure and skips the rollback.
 *
 * @throws \RuntimeException When the source cannot be opened, a statement cannot be prepared or
 *                           executed, or the transaction cannot be started or committed.
 */
private function buildFromNdjson(SQLite3 $db, string $indexNdjsonPath): void
{
    $handle = @fopen($indexNdjsonPath, 'rb');
    if ($handle === false) {
        throw new \RuntimeException('Unable to read index.ndjson: ' . $indexNdjsonPath);
    }

    $totalChunks = 0;

    try {
        if ($db->exec('BEGIN IMMEDIATE TRANSACTION') === false) {
            throw new \RuntimeException('Failed to begin lexical index transaction.');
        }

        try {
            $seenChunkStmt = $db->prepare(
                'INSERT OR IGNORE INTO lexical_seen_chunks (chunk_id) VALUES (:chunk_id)'
            );
            $termStmt = $db->prepare(
                'INSERT INTO lexical_terms (token, df)
VALUES (:token, 1)
ON CONFLICT(token) DO UPDATE SET df = df + 1'
            );
            $postingStmt = $db->prepare(
                'INSERT INTO lexical_postings (
token,
chunk_id,
document_id,
chunk_index,
tf,
title_tf
) VALUES (
:token,
:chunk_id,
:document_id,
:chunk_index,
:tf,
:title_tf
)'
            );

            if (!$seenChunkStmt || !$termStmt || !$postingStmt) {
                throw new \RuntimeException('Failed to prepare lexical index SQL statements.');
            }

            while (($line = fgets($handle)) !== false) {
                $line = trim($line);
                if ($line === '') {
                    continue;
                }

                $row = json_decode($line, true);
                if (!is_array($row)) {
                    continue;
                }

                $chunkId = trim((string) ($row['chunk_id'] ?? ''));
                $documentId = trim((string) ($row['document_id'] ?? ''));
                $chunkIndex = $this->normalizeChunkIndex($row['chunk_index'] ?? null);
                $text = trim((string) ($row['text'] ?? ''));

                if ($chunkId === '' || $documentId === '' || $text === '') {
                    continue;
                }

                $seenChunkStmt->reset();
                $seenChunkStmt->clear();
                $seenChunkStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);
                // Must succeed: changes() below is only meaningful for a completed insert.
                $this->executeWrite(
                    $seenChunkStmt,
                    'Failed to register lexical chunk id: ' . $chunkId
                );

                // INSERT OR IGNORE changed nothing, so this chunk_id was already indexed.
                if ($db->changes() < 1) {
                    continue;
                }

                $title = $this->extractDocumentTitle($row);
                $tokenStats = $this->buildTokenStats($text, $title);
                if ($tokenStats === []) {
                    continue;
                }

                $totalChunks++;

                foreach ($tokenStats as $token => $stats) {
                    // Purely numeric tokens come back as int array keys.
                    $token = (string) $token;

                    $termStmt->reset();
                    $termStmt->clear();
                    $termStmt->bindValue(':token', $token, SQLITE3_TEXT);
                    // A missing lexical_terms row would make the retriever's INNER JOIN drop
                    // the postings for this token, so a failure here is fatal like below.
                    $this->executeWrite(
                        $termStmt,
                        'Failed to update lexical term frequency for token: ' . $token
                    );

                    $postingStmt->reset();
                    $postingStmt->clear();
                    $postingStmt->bindValue(':token', $token, SQLITE3_TEXT);
                    $postingStmt->bindValue(':chunk_id', $chunkId, SQLITE3_TEXT);
                    $postingStmt->bindValue(':document_id', $documentId, SQLITE3_TEXT);
                    if ($chunkIndex === null) {
                        $postingStmt->bindValue(':chunk_index', null, SQLITE3_NULL);
                    } else {
                        $postingStmt->bindValue(':chunk_index', $chunkIndex, SQLITE3_INTEGER);
                    }
                    $postingStmt->bindValue(':tf', $stats['tf'], SQLITE3_INTEGER);
                    $postingStmt->bindValue(':title_tf', $stats['title_tf'], SQLITE3_INTEGER);
                    $this->executeWrite(
                        $postingStmt,
                        'Failed to insert lexical posting for token: ' . $token
                    );
                }
            }

            $this->writeMeta($db, $totalChunks);

            if ($db->exec('COMMIT') === false) {
                throw new \RuntimeException('Failed to commit lexical index transaction.');
            }
        } catch (\Throwable $e) {
            try {
                $db->exec('ROLLBACK');
            } catch (\Throwable) {
                // Keep the original failure; a failing rollback must not replace it.
            }

            throw $e;
        }
    } finally {
        fclose($handle);
    }

    $this->agentLogger->info('Lexical index streaming pass completed.', [
        'indexed_chunks' => $totalChunks,
        'source' => $indexNdjsonPath,
    ]);
}

/**
 * Executes a prepared write statement and releases its result.
 *
 * @throws \RuntimeException With $failureMessage when execution fails.
 */
private function executeWrite(\SQLite3Stmt $statement, string $failureMessage): void
{
    $result = $statement->execute();
    if ($result === false) {
        throw new \RuntimeException($failureMessage);
    }

    $result->finalize();
}
/**
 * Counts token occurrences in the chunk text and in the document title.
 *
 * The result is keyed by token, text tokens first (in order of first appearance) followed by
 * tokens that only occur in the title. Only the first MAX_UNIQUE_TOKENS_PER_CHUNK distinct
 * tokens are kept. A token found only in the title has tf 0.
 *
 * @return array<string, array{tf:int, title_tf:int}>
 */
private function buildTokenStats(string $text, string $title): array
{
    $bodyTokens = $this->tokenize($text);
    $headingTokens = $this->tokenize($title);

    if ($bodyTokens === [] && $headingTokens === []) {
        return [];
    }

    $bodyCounts = array_count_values($bodyTokens);
    $headingCounts = array_count_values($headingTokens);

    // Array union keeps the body tokens' order and appends title-only tokens after them.
    $vocabulary = array_slice(
        array_keys($bodyCounts + $headingCounts),
        0,
        self::MAX_UNIQUE_TOKENS_PER_CHUNK
    );

    $stats = [];
    foreach ($vocabulary as $term) {
        $stats[$term] = [
            'tf' => $bodyCounts[$term] ?? 0,
            'title_tf' => $headingCounts[$term] ?? 0,
        ];
    }

    return $stats;
}
/**
 * Splits text into index tokens.
 *
 * The text is normalized with normalizeText() (lowercased, punctuation turned into spaces),
 * split on whitespace, and every part rejected by shouldIgnoreToken() is dropped. Order and
 * duplicates are preserved.
 *
 * @return string[]
 */
private function tokenize(string $value): array
{
    $normalized = $this->normalizeText($value);
    if ($normalized === '') {
        return [];
    }

    $pieces = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];

    return array_values(array_filter(
        $pieces,
        fn (string $piece): bool => $piece !== '' && !$this->shouldIgnoreToken($piece),
    ));
}
/**
 * Decides whether a normalized token is left out of the index.
 *
 * Empty tokens are ignored; tokens containing a digit are always kept; any other token is
 * ignored when it is shorter than two characters or StopWords::isStopWord() reports it.
 */
private function shouldIgnoreToken(string $token): bool
{
    if ($token === '') {
        return true;
    }

    $containsDigit = preg_match('/\d/u', $token) === 1;
    if ($containsDigit) {
        return false;
    }

    return mb_strlen($token, 'UTF-8') < 2 || StopWords::isStopWord($token);
}
/**
 * Produces the lowercase, punctuation-free form of a text that tokenization works on.
 *
 * Steps: trim and lowercase; replace "-", "/" and "_" with spaces; replace every run of
 * characters other than Unicode letters, Unicode numbers and whitespace with a space;
 * collapse whitespace runs; trim. A failing regex call (null) leaves the text of the
 * preceding step in place.
 */
private function normalizeText(string $value): string
{
    $text = mb_strtolower(trim($value), 'UTF-8');
    $text = strtr($text, ['-' => ' ', '/' => ' ', '_' => ' ']);

    foreach (['/[^\p{L}\p{N}\s]+/u', '/\s+/u'] as $pattern) {
        $text = preg_replace($pattern, ' ', $text) ?? $text;
    }

    return trim($text);
}
/**
 * Returns the trimmed `metadata.document_title` of a decoded NDJSON row, or an empty string
 * when `metadata` is not an array or the title is absent.
 */
private function extractDocumentTitle(array $row): string
{
    $metadata = $row['metadata'] ?? null;

    return is_array($metadata)
        ? trim((string) ($metadata['document_title'] ?? ''))
        : '';
}
/**
 * Converts a raw chunk_index value to int.
 *
 * Integers pass through unchanged, strings consisting only of digits are cast, and every
 * other value (including floats, signed strings and null) yields null.
 */
private function normalizeChunkIndex(mixed $value): ?int
{
    return match (true) {
        is_int($value) => $value,
        is_string($value) && ctype_digit($value) => (int) $value,
        default => null,
    };
}
/**
 * Stores the build metadata rows (`schema_version`, `built_at`, `total_chunks`) in
 * lexical_meta, replacing existing rows with the same key.
 *
 * @throws \RuntimeException When the statement cannot be prepared or a row cannot be written.
 */
private function writeMeta(SQLite3 $db, int $totalChunks): void
{
    $statement = $db->prepare(
        'INSERT OR REPLACE INTO lexical_meta (key, value) VALUES (:key, :value)'
    );
    if ($statement === false) {
        throw new \RuntimeException('Failed to prepare lexical meta statement.');
    }

    $entries = [
        'schema_version' => '1',
        'built_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
        'total_chunks' => (string) $totalChunks,
    ];

    foreach ($entries as $name => $content) {
        // The statement is reused; reset it and drop the previous bindings first.
        $statement->reset();
        $statement->clear();
        $statement->bindValue(':key', $name, SQLITE3_TEXT);
        $statement->bindValue(':value', $content, SQLITE3_TEXT);

        $outcome = $statement->execute();
        if ($outcome === false) {
            throw new \RuntimeException('Failed to write lexical meta key: ' . $name);
        }
        $outcome->finalize();
    }
}
/**
 * Applies the connection pragmas and creates the lexical tables and indexes if missing.
 *
 * Pragma results are not checked; only a failing schema script raises.
 *
 * @throws \RuntimeException When the schema script cannot be executed.
 */
private function initializeSchema(SQLite3 $db): void
{
    $pragmas = [
        'PRAGMA journal_mode = DELETE',
        'PRAGMA synchronous = NORMAL',
        'PRAGMA temp_store = MEMORY',
        'PRAGMA foreign_keys = OFF',
    ];
    foreach ($pragmas as $pragma) {
        $db->exec($pragma);
    }

    // lexical_seen_chunks is the de-duplication ledger used while streaming the NDJSON source.
    $ddl = <<<'SQL'
CREATE TABLE IF NOT EXISTS lexical_meta (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS lexical_terms (
token TEXT PRIMARY KEY,
df INTEGER NOT NULL
);
CREATE TABLE IF NOT EXISTS lexical_postings (
token TEXT NOT NULL,
chunk_id TEXT NOT NULL,
document_id TEXT NOT NULL,
chunk_index INTEGER NULL,
tf INTEGER NOT NULL,
title_tf INTEGER NOT NULL DEFAULT 0,
PRIMARY KEY (token, chunk_id)
);
CREATE INDEX IF NOT EXISTS idx_lexical_postings_document_token
ON lexical_postings (document_id, token);
CREATE INDEX IF NOT EXISTS idx_lexical_postings_chunk
ON lexical_postings (chunk_id);
CREATE TABLE IF NOT EXISTS lexical_seen_chunks (
chunk_id TEXT PRIMARY KEY
);
SQL;

    $created = $db->exec($ddl);
    if ($created === false) {
        throw new \RuntimeException('Failed to initialize lexical index schema.');
    }
}
/**
 * Opens (creating it if needed) the SQLite database at $path for writing and sets a
 * five-second busy timeout.
 *
 * @throws \RuntimeException When the database cannot be opened; the cause is chained.
 */
private function openWritableDb(string $path): SQLite3
{
    $flags = SQLITE3_OPEN_READWRITE | SQLITE3_OPEN_CREATE;

    try {
        $connection = new SQLite3($path, $flags);
    } catch (\Throwable $cause) {
        throw new \RuntimeException('Unable to open lexical index DB: ' . $path, 0, $cause);
    }

    $connection->busyTimeout(5000);

    return $connection;
}
/**
 * Builds the NDJSON source path: project directory without trailing "/" plus
 * DEFAULT_RELATIVE_NDJSON_PATH.
 */
private function getIndexNdjsonPath(): string
{
    $root = rtrim($this->projectDir, '/');

    return $root . self::DEFAULT_RELATIVE_NDJSON_PATH;
}
/**
 * Builds the SQLite target path: project directory without trailing "/" plus
 * DEFAULT_RELATIVE_INDEX_PATH.
 */
private function getLexicalIndexPath(): string
{
    $root = rtrim($this->projectDir, '/');

    return $root . self::DEFAULT_RELATIVE_INDEX_PATH;
}
/**
 * Makes sure the directory that will hold the index file exists, creating it recursively
 * with mode 0775 when necessary.
 *
 * @throws \RuntimeException When the directory is missing and cannot be created.
 */
private function ensureTargetDirectoryExists(string $finalIndexPath): void
{
    $directory = dirname($finalIndexPath);

    // The trailing is_dir() covers a directory that appeared while mkdir() was failing.
    $available = is_dir($directory)
        || @mkdir($directory, 0775, true)
        || is_dir($directory);

    if (!$available) {
        throw new \RuntimeException('Unable to create lexical index directory: ' . $directory);
    }
}
/**
 * Moves the finished staging file over the published index.
 *
 * rename() is tried first; if it fails the file is copied instead and the staging file is
 * deleted afterwards (the copy fallback is not an atomic swap). The target's mode is set to
 * 0664 before and after the move, ignoring chmod failures.
 *
 * @throws \RuntimeException When neither rename nor copy succeeds.
 */
private function atomicReplace(string $tmpPath, string $finalPath): void
{
    if (is_file($finalPath)) {
        @chmod($finalPath, 0664);
    }

    $moved = @rename($tmpPath, $finalPath);
    if (!$moved) {
        $copied = @copy($tmpPath, $finalPath);

        // The staging file is removed whether or not the copy worked.
        @unlink($tmpPath);

        if (!$copied) {
            throw new \RuntimeException('Atomic replace failed for lexical index: ' . $finalPath);
        }
    }

    @chmod($finalPath, 0664);
}
/**
 * Deletes $path when it is a regular file; unlink failures are suppressed.
 */
private function removeFileIfExists(string $path): void
{
    if (!is_file($path)) {
        return;
    }

    @unlink($path);
}
/**
 * Guards the build against a missing SQLite3 class.
 *
 * @throws \RuntimeException When the SQLite3 class does not exist.
 */
private function assertSqliteAvailable(): void
{
    if (class_exists(SQLite3::class)) {
        return;
    }

    throw new \RuntimeException('The sqlite3 PHP extension is required for lexical index building.');
}
}

View File

@@ -8,6 +8,14 @@ use App\Config\QueryEnricherConfig;
final readonly class QueryEnricher
{
/**
* Keep enrichment conservative.
*
* The enriched semantic query should help vector retrieval,
* but must not become bloated enough to dilute the original user intent.
*/
private const MAX_EXPANSIONS = 4;
public function __construct(
private QueryEnricherConfig $config
) {
@@ -16,6 +24,12 @@ final readonly class QueryEnricher
/**
* Enriches the query with mapped counterpart terms.
*
* Design goals:
* - preserve the original query unchanged at the front
* - only append counterpart terms that are not already present
* - prefer longer / more specific phrase matches over short generic matches
* - keep the number of appended terms intentionally small
*
* Example:
* - input: "water hardness device"
* - output: "water hardness device residual hardness model"
@@ -29,26 +43,63 @@ final readonly class QueryEnricher
}
$mapping = $this->config->getEnrichQueryList();
if ($mapping === []) {
return $originalQuery;
}
$lookup = $this->buildBidirectionalLookup($mapping);
if ($lookup === []) {
return $originalQuery;
}
$lookup = $this->sortLookupBySpecificity($lookup);
$normalizedQuery = $this->normalizeForMatching($originalQuery);
$matches = [];
if ($normalizedQuery === '') {
return $originalQuery;
}
foreach ($lookup as $needle => $mappedValue) {
if ($needle === '') {
$matches = [];
$seenNormalizedExpansions = [];
foreach ($lookup as $normalizedNeedle => $mappedValue) {
if ($normalizedNeedle === '') {
continue;
}
if ($this->containsWholePhrase($normalizedQuery, $needle)) {
$matches[] = $mappedValue;
if (!$this->containsWholePhrase($normalizedQuery, $normalizedNeedle)) {
continue;
}
$mappedValue = trim($mappedValue);
if ($mappedValue === '') {
continue;
}
$normalizedMappedValue = $this->normalizeForMatching($mappedValue);
if ($normalizedMappedValue === '') {
continue;
}
// Do not re-add information that is already present in the query.
if ($this->containsWholePhrase($normalizedQuery, $normalizedMappedValue)) {
continue;
}
if (isset($seenNormalizedExpansions[$normalizedMappedValue])) {
continue;
}
$matches[] = $mappedValue;
$seenNormalizedExpansions[$normalizedMappedValue] = true;
if (count($matches) >= self::MAX_EXPANSIONS) {
break;
}
}
$matches = array_values(array_unique(array_filter(
$matches,
static fn(string $value): bool => trim($value) !== ''
)));
if ($matches === []) {
return $originalQuery;
}
@@ -106,6 +157,11 @@ final readonly class QueryEnricher
* 'jacket' => 'coat',
* 'coat' => 'jacket',
* ]
*
* Returned format:
* [
* '<normalized needle>' => '<original mapped value>',
* ]
*/
private function buildBidirectionalLookup(array $mapping): array
{
@@ -122,15 +178,49 @@ final readonly class QueryEnricher
$normalizedKey = $this->normalizeForMatching($key);
$normalizedValue = $this->normalizeForMatching($value);
if ($normalizedKey !== '') {
if ($normalizedKey !== '' && !isset($lookup[$normalizedKey])) {
$lookup[$normalizedKey] = $value;
}
if ($normalizedValue !== '') {
if ($normalizedValue !== '' && !isset($lookup[$normalizedValue])) {
$lookup[$normalizedValue] = $key;
}
}
return $lookup;
}
/**
 * Orders the lookup so that the most specific needles are tried first.
 *
 * Keys are ranked by, in order:
 * 1. number of words (spaces + 1), descending
 * 2. character length, descending
 * 3. byte-wise string comparison, ascending, to keep the order deterministic
 *
 * @param array<string, string> $lookup
 * @return array<string, string>
 */
private function sortLookupBySpecificity(array $lookup): array
{
    uksort($lookup, static function (string $left, string $right): int {
        $rank = static fn (string $phrase): array => [
            substr_count($phrase, ' ') + 1,
            mb_strlen($phrase, 'UTF-8'),
        ];

        // Operands are swapped so that the higher word count / length sorts first.
        return ($rank($right) <=> $rank($left)) ?: strcmp($left, $right);
    });

    return $lookup;
}
}