lexical logic
This commit is contained in:
298
src/Command/TestHybridRetrievalCommand.php
Normal file
298
src/Command/TestHybridRetrievalCommand.php
Normal file
@@ -0,0 +1,298 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use App\Knowledge\Retrieval\NdjsonHybridRetriever;
|
||||
use Symfony\Component\Console\Attribute\AsCommand;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Input\InputArgument;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use Symfony\Component\Console\Style\SymfonyStyle;
|
||||
|
||||
#[AsCommand(
|
||||
name: 'mto:agent:retrieval:test',
|
||||
description: 'Test the real hybrid retrieval path with debug output'
|
||||
)]
|
||||
final class TestHybridRetrievalCommand extends Command
|
||||
{
|
||||
/**
 * Receives the hybrid retriever that this command exercises.
 *
 * The retriever is kept as a read-only promoted property; the parent
 * Command constructor is then called without arguments.
 */
public function __construct(
    private readonly NdjsonHybridRetriever $retriever,
) {
    parent::__construct();
}
|
||||
|
||||
/**
 * Declares the command-line contract: one required "prompt" argument and
 * the two value-less flags "--json" and "--show-text".
 */
protected function configure(): void
{
    $this->addArgument(
        'prompt',
        InputArgument::REQUIRED,
        'Prompt to test against the real hybrid retrieval pipeline',
    );

    // Both options are plain switches without a shortcut, registered in this order.
    $switches = [
        'json' => 'Return the raw retrieval debug result as JSON',
        'show-text' => 'Show full chunk text instead of a shortened preview',
    ];

    foreach ($switches as $optionName => $description) {
        $this->addOption($optionName, null, InputOption::VALUE_NONE, $description);
    }
}
|
||||
|
||||
/**
 * Runs the prompt through the hybrid retriever and prints the debug result.
 *
 * Output modes:
 * - `--json`: a single pretty-printed JSON document containing the prompt,
 *   the measured duration, the result count and the raw result rows.
 * - default: a human-readable report. The pipeline-level sections are read
 *   from the first result row only; afterwards every row is listed under
 *   "Selected Chunks" (text shortened to 500 characters unless `--show-text`).
 *
 * Retriever data (the JSON payload and the chunk text) is written with
 * OUTPUT_RAW so the console formatter does not interpret tag-like sequences
 * such as "<info>" that may occur inside indexed content.
 *
 * NOTE(review): the return shape of retrieveDebug() is not visible from this
 * file; it is assumed to be a list of associative rows. Entries that are not
 * arrays are skipped instead of causing a TypeError in the typed helpers.
 *
 * @return int Command::SUCCESS for a completed run (including an empty result
 *             set); Command::FAILURE for an empty prompt, an exception thrown
 *             by the retriever, or a JSON encoding failure.
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
    $io = new SymfonyStyle($input, $output);

    $prompt = trim((string) $input->getArgument('prompt'));
    $asJson = (bool) $input->getOption('json');
    $showText = (bool) $input->getOption('show-text');

    if ($prompt === '') {
        $io->error('Prompt must not be empty.');

        return Command::FAILURE;
    }

    $start = microtime(true);

    try {
        $results = $this->retriever->retrieveDebug($prompt);
    } catch (\Throwable $e) {
        // Diagnostic command: report the failure to the operator instead of crashing.
        $io->error($e->getMessage());

        return Command::FAILURE;
    }

    $durationMs = round((microtime(true) - $start) * 1000, 2);

    if ($asJson) {
        $json = json_encode(
            [
                'prompt' => $prompt,
                'duration_ms' => $durationMs,
                'result_count' => count($results),
                'results' => $results,
            ],
            JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
        );

        if (!is_string($json)) {
            // Include the encoder's reason (e.g. malformed UTF-8) so the failure is actionable.
            $io->error('json_encode failed: ' . json_last_error_msg());

            return Command::FAILURE;
        }

        // Raw write: the payload must reach the consumer byte-for-byte.
        $output->writeln($json, OutputInterface::OUTPUT_RAW);

        return Command::SUCCESS;
    }

    $io->title('Hybrid Retrieval Test');
    $io->definitionList(
        ['prompt' => $prompt],
        ['duration_ms' => (string) $durationMs],
        ['result_count' => (string) count($results)]
    );

    if ($results === []) {
        $io->warning('No retrieval results returned.');

        return Command::SUCCESS;
    }

    // Pipeline-level debug fields are read from the first row.
    $firstRow = $results[0] ?? null;
    $first = is_array($firstRow) ? $firstRow : [];

    // Builds definition-list entries for keys rendered through scalarValue().
    $scalarPairs = fn (string ...$keys): array => array_map(
        fn (string $key): array => [$key => $this->scalarValue($first, $key)],
        $keys
    );

    $io->section('Pipeline Summary');
    $io->definitionList(
        ['scope_mode' => $this->stringValue($first, 'scope_mode')],
        ['selection_mode' => $this->stringValue($first, 'selection_mode')],
        ['intent' => $this->stringValue($first, 'intent')],
        ['route' => $this->stringValue($first, 'route')],
        ['entity_label' => $this->stringValue($first, 'entity_label')],
        ['is_list_query' => $this->boolishValue($first, 'is_list_query')],
        ['clean_query' => $this->stringValue($first, 'clean_query')],
        ['semantic_query' => $this->stringValue($first, 'semantic_query')],
        ['secondary_vector_query' => $this->stringValue($first, 'secondary_vector_query')],
        ['lexical_query' => $this->stringValue($first, 'lexical_query')],
        ['threshold' => $this->scalarValue($first, 'threshold')],
        ['lexical_threshold' => $this->scalarValue($first, 'lexical_threshold')]
    );

    $io->section('Scope Candidates');
    $io->definitionList(
        ['tag_candidate_doc_ids' => $this->jsonValue($first, 'tag_candidate_doc_ids')],
        ['soft_document_candidate_doc_ids' => $this->jsonValue($first, 'soft_document_candidate_doc_ids')],
        ['pseudo_scope_doc_ids' => $this->jsonValue($first, 'pseudo_scope_doc_ids')],
        ['title_metadata_doc_boosts' => $this->jsonObjectValue($first, 'title_metadata_doc_boosts')]
    );

    $io->section('Hit Counts');
    $io->definitionList(...$scalarPairs(
        'global_hit_count',
        'scoped_hit_count',
        'global_vector_hit_count',
        'global_primary_vector_hit_count',
        'global_secondary_vector_hit_count',
        'global_keyword_hit_count',
        'scoped_vector_hit_count',
        'scoped_primary_vector_hit_count',
        'scoped_secondary_vector_hit_count',
        'scoped_keyword_hit_count'
    ));

    $io->section('Boosts');
    $io->definitionList(...$scalarPairs(
        'scoped_boost_factor',
        'scoped_vector_boost_factor',
        'secondary_scoped_vector_boost_factor',
        'scoped_keyword_boost_factor'
    ));

    $io->section('Selected Chunks');

    // Empty optional fields are shown as "-" to keep the summary line aligned.
    $orDash = static fn (string $value): string => $value !== '' ? $value : '-';

    foreach ($results as $row) {
        if (!is_array($row)) {
            continue;
        }

        // A non-scalar "text" cannot be cast to string without a warning; treat it as empty.
        $rawText = $row['text'] ?? '';
        $text = is_scalar($rawText) ? (string) $rawText : '';

        if (!$showText) {
            $text = $this->shortenText($text, 500);
        }

        $io->writeln(sprintf(
            '<info>#%s</info> chunk=%s doc=%s idx=%s rrf=%s vector=%s keyword=%s title_meta=%s',
            $this->scalarValue($row, 'rank'),
            $this->stringValue($row, 'chunk_id'),
            $orDash($this->stringValue($row, 'document_id')),
            $orDash($this->scalarValue($row, 'chunk_index')),
            $orDash($this->scalarValue($row, 'rrf_score')),
            $orDash($this->scalarValue($row, 'raw_vector_score')),
            $orDash($this->scalarValue($row, 'raw_keyword_score')),
            $orDash($this->scalarValue($row, 'title_metadata_boost'))
        ));

        // Chunk text is arbitrary indexed content: print it unformatted.
        $io->writeln($text, OutputInterface::OUTPUT_RAW);
        $io->writeln('');
    }

    return Command::SUCCESS;
}
|
||||
|
||||
/**
 * Reads one field of a result row as a trimmed string.
 *
 * Scalars and Stringable objects are cast to string and trimmed (so `true`
 * becomes "1" and `false` becomes ""). A missing key, null, or any value
 * without a string form (arrays, plain objects, resources) yields '' instead
 * of emitting an "Array to string conversion" warning or throwing.
 *
 * @param array<string, mixed> $row
 */
private function stringValue(array $row, string $key): string
{
    $value = $row[$key] ?? null;

    if (is_scalar($value) || $value instanceof \Stringable) {
        return trim((string) $value);
    }

    return '';
}
|
||||
|
||||
/**
 * Renders one field of a result row for display.
 *
 * Booleans become the words "true"/"false", other scalars are cast to
 * string, and everything else (missing key, null, arrays, objects) is
 * rendered as an empty string.
 *
 * @param array<string, mixed> $row
 */
private function scalarValue(array $row, string $key): string
{
    $raw = $row[$key] ?? null;

    return match (true) {
        is_bool($raw) => $raw ? 'true' : 'false',
        is_scalar($raw) => (string) $raw,
        default => '',
    };
}
|
||||
|
||||
/**
 * Renders a flag-like field of a result row for display.
 *
 * Real booleans are spelled out as "true"/"false"; any other scalar is cast
 * to string unchanged; non-scalars and missing values give ''.
 *
 * @param array<string, mixed> $row
 */
private function boolishValue(array $row, string $key): string
{
    $raw = $row[$key] ?? null;

    if (!is_scalar($raw)) {
        return '';
    }

    if (is_bool($raw)) {
        return $raw ? 'true' : 'false';
    }

    return (string) $raw;
}
|
||||
|
||||
/**
 * Renders an array field of a result row as a compact JSON list.
 *
 * Keys are discarded (array_values) so the output is always a JSON array.
 * A missing key, a non-array value, or an encoding failure yields "[]".
 *
 * @param array<string, mixed> $row
 */
private function jsonValue(array $row, string $key): string
{
    $candidate = $row[$key] ?? null;

    $encoded = is_array($candidate)
        ? json_encode(array_values($candidate), JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE)
        : false;

    return $encoded === false ? '[]' : $encoded;
}
|
||||
|
||||
/**
 * Renders a map-like field of a result row as a compact JSON object.
 *
 * The top-level array is cast to an object before encoding, so an empty map
 * is printed as "{}" (not "[]") and a list is printed with its indices as
 * keys. This keeps the output shape consistent with the "{}" fallback used
 * for a missing key, a non-array value, or an encoding failure. Nested
 * arrays are encoded normally.
 *
 * @param array<string, mixed> $row
 */
private function jsonObjectValue(array $row, string $key): string
{
    $value = $row[$key] ?? null;

    if (!is_array($value)) {
        return '{}';
    }

    $json = json_encode(
        (object) $value,
        JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
    );

    return is_string($json) ? $json : '{}';
}
|
||||
|
||||
/**
 * Produces a single-line preview of a text.
 *
 * Every run of whitespace is collapsed to one space and the result is
 * trimmed. If it is longer than $maxLength characters (counted as UTF-8),
 * the first $maxLength characters are kept and " …" is appended.
 */
private function shortenText(string $text, int $maxLength): string
{
    // preg_replace() returns null on failure; fall back to the untouched input then.
    $flattened = preg_replace('/\s+/u', ' ', $text);
    $flattened = trim($flattened ?? $text);

    return mb_strlen($flattened, 'UTF-8') > $maxLength
        ? mb_substr($flattened, 0, $maxLength, 'UTF-8') . ' …'
        : $flattened;
}
|
||||
}
|
||||
@@ -7,62 +7,96 @@ namespace App\Config;
|
||||
final class NdjsonHybridRetrieverConfig
|
||||
{
|
||||
/**
|
||||
* Default semantic similarity threshold for vector hits.
|
||||
* Maximum number of chunks the retriever may finally hand to the model.
|
||||
*
|
||||
* Chosen to stay selective enough for product-family-heavy data
|
||||
* while not cutting off too many useful fallback hits.
|
||||
* Rationale:
|
||||
* - enough room for the stronger hybrid pipeline
|
||||
* - still conservative enough to avoid prompt bloat
|
||||
*/
|
||||
public const VECTOR_SCORE_THRESHOLD = 0.83;
|
||||
public const HARD_MAX_CHUNKS = 6;
|
||||
|
||||
/**
|
||||
* Absolute safety caps.
|
||||
* Hard upper bound for vector retrieval candidate size.
|
||||
*
|
||||
* These limits protect the retriever from overly large candidate sets
|
||||
* even if runtime config values are set too high.
|
||||
* Rationale:
|
||||
* - the pipeline now combines primary vector, secondary vector,
|
||||
* lexical, scoped retrieval and re-ranking
|
||||
* - the old limit would constrain recall too early
|
||||
* - still capped to keep latency controlled
|
||||
*/
|
||||
public const HARD_MAX_CHUNKS = 72;
|
||||
public const HARD_MAX_VECTORK = 180;
|
||||
public const HARD_MAX_VECTORK = 18;
|
||||
|
||||
/**
|
||||
* List-style queries benefit from a slightly wider candidate pool
|
||||
* before de-duplication and final selection.
|
||||
* Default semantic score threshold for vector hits.
|
||||
*
|
||||
* Rationale:
|
||||
* - slightly relaxed compared to stricter pure-vector setups
|
||||
* - the system now has more safeguards:
|
||||
* lexical cross-signals, scoped retrieval, title/meta boost, selection rules
|
||||
*/
|
||||
public const LIST_BONUS = 1.25;
|
||||
public const VECTOR_SCORE_THRESHOLD = 0.81;
|
||||
|
||||
/**
|
||||
* Selection rules for cross-document semantic retrieval.
|
||||
* Lower safety boundary for dynamic threshold adjustments.
|
||||
*
|
||||
* MAX_CHUNKS_PER_DOC:
|
||||
* Keeps one document from dominating the final result in normal
|
||||
* semantic retrieval mode.
|
||||
*
|
||||
* MIN_CHUNK_DISTANCE:
|
||||
* Allows nearby chunks to be selected when they are still meaningfully
|
||||
* distinct, which is important for compact product sheets.
|
||||
* Rationale:
|
||||
* - prevents the system from getting too noisy in fallback cases
|
||||
* - still allows recovery when exact signals are sparse
|
||||
*/
|
||||
public const MAX_CHUNKS_PER_DOC = 3;
|
||||
public const MIN_CHUNK_DISTANCE = 1.0;
|
||||
public const THRESHOLD_FLOOR = 0.75;
|
||||
|
||||
/**
|
||||
* Upper safety boundary for dynamic threshold adjustments.
|
||||
*
|
||||
* Rationale:
|
||||
* - protects objection/pricing/list adjustments from becoming too strict
|
||||
* - keeps retrieval from collapsing into empty result sets too easily
|
||||
*/
|
||||
public const THRESHOLD_CEIL = 0.90;
|
||||
|
||||
/**
|
||||
* Additional candidate expansion factor for list-like prompts.
|
||||
*
|
||||
* Rationale:
|
||||
* - list requests benefit from wider candidate recall
|
||||
* - too high would create noise across multiple retrieval channels
|
||||
*/
|
||||
public const LIST_BONUS = 1.35;
|
||||
|
||||
/**
|
||||
* Reciprocal Rank Fusion constant.
|
||||
*
|
||||
* Slightly lower than classic defaults so top-ranked hits matter more.
|
||||
* Rationale:
|
||||
* - keep rank importance meaningful
|
||||
* - but not so aggressive that one retrieval source dominates too hard
|
||||
*/
|
||||
public const RRF_K = 50;
|
||||
|
||||
/**
|
||||
* Dynamic threshold clamp boundaries.
|
||||
* Fallback size when thresholded fusion yields no candidates.
|
||||
*
|
||||
* The floor must stay below the default threshold, otherwise the
|
||||
* configured base threshold becomes ineffective.
|
||||
* Rationale:
|
||||
* - slightly larger safety net for the richer hybrid stack
|
||||
* - helps no-tag and low-signal cases without exploding context
|
||||
*/
|
||||
public const THRESHOLD_FLOOR = 0.78;
|
||||
public const THRESHOLD_CEIL = 0.90;
|
||||
public const EMPTY_RRF_FALLBACK_TOPN = 5;
|
||||
|
||||
/**
|
||||
* Fallback breadth when strict thresholding removes all fused hits.
|
||||
* Maximum number of chunks allowed from one document in spread mode.
|
||||
*
|
||||
* More than one fallback result makes the retriever less brittle.
|
||||
* Rationale:
|
||||
* - preserve diversity across documents
|
||||
* - still allow coherent multi-chunk retrieval from strong sources
|
||||
*/
|
||||
public const EMPTY_RRF_FALLBACK_TOPN = 3;
|
||||
public const MAX_CHUNKS_PER_DOC = 2;
|
||||
|
||||
/**
|
||||
* Minimum distance between chunk indices from the same document
|
||||
* during spread-style selection.
|
||||
*
|
||||
* Rationale:
|
||||
* - reduce near-duplicate neighboring chunks
|
||||
* - still allow relevant continuation when needed
|
||||
*/
|
||||
public const MIN_CHUNK_DISTANCE = 2;
|
||||
}
|
||||
@@ -1,22 +1,180 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Config;
|
||||
|
||||
class QueryEnricherConfig
|
||||
final readonly class QueryEnricherConfig
|
||||
{
|
||||
/**
|
||||
* Keep the enrichment vocabulary in the class for now.
|
||||
*
|
||||
* Important:
|
||||
* - This is intentionally NOT externalized yet.
|
||||
* - Add or maintain the current project-specific mappings here.
|
||||
* - The later move to external config/files can happen separately.
|
||||
*
|
||||
* Supported shapes:
|
||||
*
|
||||
* 1) Simple mapping:
|
||||
* [
|
||||
* 'water hardness' => 'residual hardness',
|
||||
* 'device' => 'instrument',
|
||||
* ]
|
||||
*
|
||||
* 2) Small synonym groups:
|
||||
* [
|
||||
* ['water hardness', 'residual hardness', 'hardness'],
|
||||
* ['device', 'instrument', 'meter'],
|
||||
* ]
|
||||
*
|
||||
* The public API stays intentionally simple:
|
||||
* - getEnrichQueryList(): array<string,string>
|
||||
*
|
||||
* This keeps QueryEnricher generic while the domain vocabulary
|
||||
* deliberately remains inside this class for now.
|
||||
*
|
||||
* Replace the example entries below with your real project mappings.
|
||||
*
|
||||
* @var array<int|string, mixed>
|
||||
*/
|
||||
private const ENRICH_QUERY_LIST = [
|
||||
// -----------------------------------------------------------------
|
||||
// Example mappings.
|
||||
// Replace / extend these with your current real project mappings.
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
'water hardness' => 'residual hardness',
|
||||
'device' => 'instrument',
|
||||
'gerät'=>'produkt',
|
||||
'rebuild'=>'reindex',
|
||||
|
||||
['measuring device', 'meter', 'instrument'],
|
||||
];
|
||||
|
||||
/**
 * Returns a normalized, deduplicated mapping for the QueryEnricher,
 * derived from ENRICH_QUERY_LIST.
 *
 * Output format:
 *   [
 *       'term a' => 'term b',
 *       'term c' => 'term d',
 *   ]
 *
 * Rules:
 * - entries whose value is an array are treated as synonym groups and
 *   handed to ingestGroup()
 * - keys and values are normalized via normalizePhrase(); non-string keys
 *   or values normalize to '' and are therefore dropped
 * - empty sides and self-mappings are ignored (isValidPair())
 * - the first valid rule wins when several normalize to the same key
 *
 * NOTE(review): this method previously started with an unconditional
 * `return [...]` of a hard-coded mapping, which made all of the logic below
 * unreachable and left ENRICH_QUERY_LIST unused. That stale return was
 * removed; if those mappings are still needed they belong in
 * ENRICH_QUERY_LIST.
 *
 * @return array<string, string>
 */
public function getEnrichQueryList(): array
{
    $normalized = [];

    foreach (self::ENRICH_QUERY_LIST as $key => $value) {
        if (is_array($value)) {
            $this->ingestGroup($normalized, $value);

            continue;
        }

        $left = $this->normalizePhrase(is_string($key) ? $key : '');
        $right = $this->normalizePhrase(is_string($value) ? $value : '');

        if (!$this->isValidPair($left, $right)) {
            continue;
        }

        // Keep the first rule for a given left-hand term.
        $normalized[$left] ??= $right;
    }

    return $normalized;
}
|
||||
|
||||
/**
 * Tells whether getEnrichQueryList() yields at least one enrichment rule.
 */
public function hasRules(): bool
{
    $rules = $this->getEnrichQueryList();

    return count($rules) > 0;
}
|
||||
|
||||
/**
 * Adds the rules implied by one synonym group to the mapping.
 *
 * The group is reduced to its distinct, non-empty normalized phrases (first
 * occurrence order; non-string members are ignored). Neighbouring phrases
 * are then linked as a chain, e.g. ['a', 'b', 'c'] contributes a => b and
 * b => c. An existing rule for the same left-hand term is never overwritten.
 * Groups with fewer than two usable phrases contribute nothing.
 *
 * According to the original author's note, QueryEnricher later builds a
 * bidirectional lookup from this output, which is why only the forward
 * chain is stored here (not verifiable from this file).
 *
 * @param array<string, string>    $normalized Mapping that is extended in place
 * @param array<int|string, mixed> $group      Raw synonym group
 */
private function ingestGroup(array &$normalized, array $group): void
{
    $phrases = [];

    foreach ($group as $candidate) {
        $phrase = is_string($candidate) ? $this->normalizePhrase($candidate) : '';

        if ($phrase !== '' && !in_array($phrase, $phrases, true)) {
            $phrases[] = $phrase;
        }
    }

    $total = count($phrases);

    foreach ($phrases as $position => $left) {
        if ($position + 1 >= $total) {
            // The last phrase has no successor to point to.
            break;
        }

        $right = $phrases[$position + 1];

        if ($this->isValidPair($left, $right)) {
            $normalized[$left] ??= $right;
        }
    }
}
|
||||
|
||||
/**
 * A pair is usable as a rule when both sides are non-empty and differ
 * from each other.
 */
private function isValidPair(string $left, string $right): bool
{
    return $left !== '' && $right !== '' && $left !== $right;
}
|
||||
|
||||
/**
 * Canonical form of a phrase: trimmed, lower-cased (UTF-8 aware) and with
 * every whitespace run collapsed to a single space. Blank input gives ''.
 */
private function normalizePhrase(string $value): string
{
    $phrase = trim($value);

    if ($phrase !== '') {
        $lowered = mb_strtolower($phrase, 'UTF-8');
        // preg_replace() returns null on failure; keep the lower-cased text in that case.
        $phrase = trim(preg_replace('/\s+/u', ' ', $lowered) ?? $lowered);
    }

    return $phrase;
}
|
||||
}
|
||||
@@ -6,36 +6,50 @@ namespace App\Ingest;
|
||||
|
||||
use App\Index\IndexMetaManager;
|
||||
use App\Knowledge\ChunkManager;
|
||||
use App\Knowledge\Retrieval\NdjsonLexicalIndexBuilder;
|
||||
use App\Vector\VectorIndexBuilder;
|
||||
|
||||
final readonly class VectorRebuildService
|
||||
{
|
||||
/**
 * Receives the collaborators used by rebuild(): the vector index builder,
 * the lexical index builder, the index metadata manager and the chunk
 * manager.
 *
 * NOTE(review): the parameter list previously declared $metaManager and
 * $chunkManager twice around a stray `) {}`, which does not parse. Each
 * dependency is now declared exactly once, with the lexical index builder
 * placed directly after the vector builder.
 */
public function __construct(
    private VectorIndexBuilder $vectorBuilder,
    private NdjsonLexicalIndexBuilder $lexicalIndexBuilder,
    private IndexMetaManager $metaManager,
    private ChunkManager $chunkManager,
) {
}
|
||||
|
||||
/**
 * Rebuilds all derived retrieval artifacts in a fixed order.
 *
 * Steps:
 * 1. ensure the index metadata exists (IndexMetaManager::ensureExists())
 * 2. rebuild the vector index (VectorIndexBuilder::rebuildFromNdjson())
 * 3. rebuild the lexical index (NdjsonLexicalIndexBuilder::build())
 * 4. count all chunks (ChunkManager::countAllChunks())
 * 5. store that count in the runtime stats (IndexMetaManager::updateRuntimeStats())
 *
 * Both indexes are rebuilt in the same run so the semantic and lexical
 * layers do not drift apart. No exception is caught here, so a failure in
 * any step aborts the rebuild as a whole.
 *
 * NOTE(review): the original comments describe the chunk count as
 * streaming-safe and the stats update as atomic; both are properties of the
 * collaborators and cannot be confirmed from this file.
 *
 * @param string|null $logPath Forwarded unchanged to the vector index builder
 *
 * @throws \Throwable
 */
public function rebuild(?string $logPath = null): void
{
    $meta = $this->metaManager;

    // Metadata must exist before any derived index work starts.
    $meta->ensureExists();

    // Semantic index first, then the lexical index.
    $this->vectorBuilder->rebuildFromNdjson($logPath);
    $this->lexicalIndexBuilder->build();

    // Count after both rebuilds and persist the figure.
    $meta->updateRuntimeStats($this->chunkManager->countAllChunks());
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
451
src/Knowledge/Retrieval/NdjsonKeywordRetriever.php
Normal file
451
src/Knowledge/Retrieval/NdjsonKeywordRetriever.php
Normal file
@@ -0,0 +1,451 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Knowledge\StopWords;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use SQLite3;
|
||||
|
||||
final readonly class NdjsonKeywordRetriever
|
||||
{
|
||||
private const DEFAULT_RELATIVE_INDEX_PATH = '/var/knowledge/lexical.index.sqlite';
|
||||
private const MAX_LIMIT = 100;
|
||||
private const MAX_QUERY_TOKENS = 12;
|
||||
|
||||
/**
 * @param string          $projectDir  Base directory; the lexical index path is resolved relative to it
 * @param LoggerInterface $agentLogger Receives warnings and errors raised while opening or querying the index
 */
public function __construct(
    private string $projectDir,
    private LoggerInterface $agentLogger,
) {
}
|
||||
|
||||
/**
 * Generic lexical retrieval against a prebuilt SQLite index.
 *
 * The query is tokenized by analyzeQuery(), matching postings are loaded
 * (optionally restricted to $docIds) and ranked by scoreRows(). The method
 * is best-effort: an empty token list, a missing or unopenable index, no
 * matching postings, or any exception during lookup all result in an empty
 * array; exceptions are logged as errors. The database handle is always
 * closed.
 *
 * Expected DB schema (created by the lexical index builder, per the
 * original documentation):
 *
 *   lexical_meta(key TEXT PRIMARY KEY, value TEXT NOT NULL)
 *   lexical_terms(token TEXT PRIMARY KEY, df INTEGER NOT NULL)
 *   lexical_postings(
 *       token TEXT NOT NULL, chunk_id TEXT NOT NULL, document_id TEXT NOT NULL,
 *       chunk_index INTEGER, tf INTEGER NOT NULL, title_tf INTEGER NOT NULL DEFAULT 0,
 *       PRIMARY KEY(token, chunk_id)
 *   )
 *
 * No domain-specific keyword logic lives here; scoring relies only on token
 * overlap, rarity, title hits and emphasis for tokens containing digits.
 *
 * NOTE(review): clampLimit() is defined outside the visible part of this
 * file; presumably it bounds $limit (see MAX_LIMIT) - confirm.
 *
 * @param string[] $docIds Optional document scope
 *
 * @return array<int, array{
 *   chunk_id:string,
 *   score:float,
 *   document_id:?string,
 *   chunk_index:?int
 * }>
 */
public function search(string $query, int $limit = 10, array $docIds = []): array
{
    $limit = $this->clampLimit($limit);
    $analysis = $this->analyzeQuery($query);
    $tokens = $analysis['tokens'];

    if ($tokens === []) {
        return [];
    }

    $db = $this->openReadOnlyDb();

    if ($db === null) {
        return [];
    }

    try {
        $totalChunks = $this->loadTotalChunks($db);
        $postings = $this->loadPostings($db, $tokens, $docIds);

        return $postings === []
            ? []
            : $this->scoreRows($postings, $tokens, $analysis['numeric_tokens'], $totalChunks, $limit);
    } catch (\Throwable $failure) {
        $this->agentLogger->error('Keyword retriever failed', [
            'error' => $failure->getMessage(),
        ]);

        return [];
    } finally {
        $db->close();
    }
}
|
||||
|
||||
/**
 * Turns a raw query into the token sets used for lookup and scoring.
 *
 * The query is normalized with normalizeText() and split on whitespace.
 * Tokens rejected by shouldIgnoreToken() are dropped, duplicates are removed
 * (first occurrence wins) and at most MAX_QUERY_TOKENS tokens are kept.
 * "numeric_tokens" lists every distinct kept token that contains a digit;
 * it is collected before the MAX_QUERY_TOKENS cut is applied.
 *
 * @return array{
 *   normalized_query:string,
 *   tokens:string[],
 *   numeric_tokens:string[]
 * }
 */
private function analyzeQuery(string $query): array
{
    $normalized = $this->normalizeText($query);

    // An empty normalized query simply splits into no parts.
    $parts = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];

    $distinct = array_values(array_unique(array_filter(
        $parts,
        fn (string $part): bool => !$this->shouldIgnoreToken($part)
    )));

    $withDigit = array_values(array_filter(
        $distinct,
        static fn (string $part): bool => preg_match('/\d/u', $part) === 1
    ));

    return [
        'normalized_query' => $normalized,
        'tokens' => array_slice($distinct, 0, self::MAX_QUERY_TOKENS),
        'numeric_tokens' => $withDigit,
    ];
}
|
||||
|
||||
/**
 * Decides whether a token is excluded from the lexical lookup.
 *
 * Checked in this order: empty tokens are ignored; tokens containing a
 * digit are always kept; other tokens shorter than two characters are
 * ignored; everything else is ignored only if StopWords reports it as a
 * stop word.
 */
private function shouldIgnoreToken(string $token): bool
{
    return match (true) {
        $token === '' => true,
        preg_match('/\d/u', $token) === 1 => false,
        mb_strlen($token, 'UTF-8') < 2 => true,
        default => StopWords::isStopWord($token),
    };
}
|
||||
|
||||
/**
 * Normalizes free text for tokenization: trims and lower-cases it (UTF-8
 * aware), turns "-", "/" and "_" into spaces, replaces every run of
 * characters that are neither letters, digits nor whitespace with a space,
 * collapses whitespace runs and trims the result.
 */
private function normalizeText(string $value): string
{
    $text = strtr(mb_strtolower(trim($value), 'UTF-8'), '-/_', '   ');

    // Applied in order; a failing preg_replace() (null) leaves the text as it was.
    foreach (['/[^\p{L}\p{N}\s]+/u', '/\s+/u'] as $pattern) {
        $text = preg_replace($pattern, ' ', $text) ?? $text;
    }

    return trim($text);
}
|
||||
|
||||
/**
 * Opens the lexical index in read-only mode.
 *
 * Returns null when the SQLite3 class is unavailable (logged as a warning),
 * when the index file does not exist (not logged), or when opening fails
 * (logged as an error together with the path). A busy timeout of 1000 ms is
 * set on a successfully opened handle.
 */
private function openReadOnlyDb(): ?SQLite3
{
    if (!class_exists(SQLite3::class)) {
        $this->agentLogger->warning('Keyword retriever unavailable: sqlite3 extension missing.');

        return null;
    }

    $indexFile = $this->getIndexPath();

    if (!is_file($indexFile)) {
        return null;
    }

    try {
        $handle = new SQLite3($indexFile, SQLITE3_OPEN_READONLY);
        $handle->busyTimeout(1000);
    } catch (\Throwable $failure) {
        $this->agentLogger->error('Unable to open lexical index', [
            'path' => $indexFile,
            'error' => $failure->getMessage(),
        ]);

        return null;
    }

    return $handle;
}
|
||||
|
||||
/**
 * Absolute path of the lexical index: the project directory (without
 * trailing slashes) followed by DEFAULT_RELATIVE_INDEX_PATH.
 */
private function getIndexPath(): string
{
    $root = rtrim($this->projectDir, '/');

    return $root . self::DEFAULT_RELATIVE_INDEX_PATH;
}
|
||||
|
||||
/**
 * Reads the "total_chunks" entry from lexical_meta.
 *
 * The result is never smaller than 1: a failed prepare/execute, a missing
 * row, or a stored value below 1 all yield 1.
 */
private function loadTotalChunks(SQLite3 $db): int
{
    $statement = $db->prepare('SELECT value FROM lexical_meta WHERE key = :key');

    if ($statement === false) {
        return 1;
    }

    $statement->bindValue(':key', 'total_chunks', SQLITE3_TEXT);
    $cursor = $statement->execute();

    if ($cursor === false) {
        return 1;
    }

    $record = $cursor->fetchArray(SQLITE3_ASSOC);
    $cursor->finalize();

    // fetchArray() returns false when there is no row.
    $stored = is_array($record) && isset($record['value']) ? (int) $record['value'] : 0;

    return $stored > 1 ? $stored : 1;
}
|
||||
|
||||
/**
 * Loads all postings for the given tokens, joined with each token's
 * document frequency, optionally restricted to a set of document ids.
 *
 * All values are bound as named parameters (":t<key>" for tokens,
 * ":d<index>" for document ids). Document ids that are not non-empty
 * strings are discarded and duplicates removed; an empty remaining scope
 * means "no restriction". Rows with an empty token, chunk id or document id
 * are skipped. tf and df are clamped to at least 1, title_tf to at least 0,
 * and a non-numeric chunk_index becomes null. A failed prepare or execute
 * yields an empty list.
 *
 * @param string[] $tokens
 * @param string[] $docIds
 * @return array<int, array{
 *   token:string,
 *   chunk_id:string,
 *   document_id:string,
 *   chunk_index:?int,
 *   tf:int,
 *   title_tf:int,
 *   df:int
 * }>
 */
private function loadPostings(SQLite3 $db, array $tokens, array $docIds): array
{
    if ($tokens === []) {
        return [];
    }

    // Placeholder => value, in binding order: tokens first, then document ids.
    $bindings = [];

    foreach ($tokens as $position => $token) {
        $bindings[':t' . $position] = $token;
    }

    $sql = '
        SELECT
            p.token,
            p.chunk_id,
            p.document_id,
            p.chunk_index,
            p.tf,
            p.title_tf,
            lt.df
        FROM lexical_postings p
        INNER JOIN lexical_terms lt ON lt.token = p.token
        WHERE p.token IN (' . implode(', ', array_keys($bindings)) . ')
    ';

    $scope = array_values(array_unique(array_filter(
        $docIds,
        static fn (mixed $candidate): bool => is_string($candidate) && $candidate !== ''
    )));

    if ($scope !== []) {
        $scopeBindings = [];

        foreach ($scope as $position => $docId) {
            $scopeBindings[':d' . $position] = $docId;
        }

        $sql .= ' AND p.document_id IN (' . implode(', ', array_keys($scopeBindings)) . ')';
        $bindings += $scopeBindings;
    }

    $statement = $db->prepare($sql);

    if ($statement === false) {
        return [];
    }

    foreach ($bindings as $placeholder => $bound) {
        $statement->bindValue($placeholder, $bound, SQLITE3_TEXT);
    }

    $cursor = $statement->execute();

    if ($cursor === false) {
        return [];
    }

    $postings = [];

    while (($record = $cursor->fetchArray(SQLITE3_ASSOC)) !== false) {
        $token = (string) ($record['token'] ?? '');
        $chunkId = (string) ($record['chunk_id'] ?? '');
        $documentId = (string) ($record['document_id'] ?? '');

        if ($chunkId === '' || $documentId === '' || $token === '') {
            continue;
        }

        $rawIndex = $record['chunk_index'] ?? null;

        $postings[] = [
            'token' => $token,
            'chunk_id' => $chunkId,
            'document_id' => $documentId,
            'chunk_index' => is_numeric($rawIndex) ? (int) $rawIndex : null,
            'tf' => max(1, (int) ($record['tf'] ?? 1)),
            'title_tf' => max(0, (int) ($record['title_tf'] ?? 0)),
            'df' => max(1, (int) ($record['df'] ?? 1)),
        ];
    }

    $cursor->finalize();

    return $postings;
}
|
||||
|
||||
/**
 * Aggregates posting rows into ranked per-chunk scores.
 *
 * For every posting the contribution is
 *   idf * tfBoost * numericBoost + titleBonus
 * where
 *   idf          = log(1 + totalChunks / (1 + df))
 *   tfBoost      = 1 + 0.20 * min(3, tf)
 *   numericBoost = 1.60 if the token is listed in $numericTokens, else 1.0
 *   titleBonus   = 0.75 * idf if title_tf > 0, else 0
 * The per-chunk sum is then scaled by 0.65 + 0.35 * coverage, coverage being
 * the share of query tokens matched by the chunk. Scores are sorted
 * descending, divided by the top score (so the best hit is 1.0), rounded to
 * six decimals and cut off once $limit entries have been collected.
 *
 * The chunk id is cast back to string on output: PHP converts numeric
 * string array keys such as "123" to integers, which would otherwise leak
 * an int into the documented `chunk_id:string` field.
 *
 * @param array<int, array{
 *   token:string,
 *   chunk_id:string,
 *   document_id:string,
 *   chunk_index:?int,
 *   tf:int,
 *   title_tf:int,
 *   df:int
 * }> $rows
 * @param string[] $queryTokens
 * @param string[] $numericTokens
 *
 * @return array<int, array{
 *   chunk_id:string,
 *   score:float,
 *   document_id:?string,
 *   chunk_index:?int
 * }>
 */
private function scoreRows(
    array $rows,
    array $queryTokens,
    array $numericTokens,
    int $totalChunks,
    int $limit
): array {
    if ($rows === []) {
        return [];
    }

    $numericLookup = array_fill_keys($numericTokens, true);
    $queryTokenCount = max(1, count($queryTokens));

    $scores = [];
    $meta = [];
    $matchedTokens = [];

    foreach ($rows as $row) {
        $chunkId = $row['chunk_id'];
        $token = $row['token'];

        $idf = log(1.0 + ($totalChunks / max(1.0, (float) (1 + $row['df']))));
        $tfBoost = 1.0 + (min(3, $row['tf']) * 0.20);
        $numericBoost = isset($numericLookup[$token]) ? 1.60 : 1.0;
        $titleBonus = $row['title_tf'] > 0 ? ($idf * 0.75) : 0.0;

        $scores[$chunkId] = ($scores[$chunkId] ?? 0.0)
            + ($idf * $tfBoost * $numericBoost)
            + $titleBonus;

        $matchedTokens[$chunkId][$token] = true;

        // Document id and chunk index are taken from the first posting seen for a chunk.
        $meta[$chunkId] ??= [
            'document_id' => $row['document_id'],
            'chunk_index' => $row['chunk_index'],
        ];
    }

    // Reward chunks that match a larger share of the query tokens.
    foreach ($scores as $chunkId => $score) {
        $coverage = count($matchedTokens[$chunkId] ?? []) / $queryTokenCount;
        $scores[$chunkId] = $score * (0.65 + (0.35 * $coverage));
    }

    arsort($scores);

    $topScore = (float) reset($scores);

    if ($topScore <= 0.0) {
        return [];
    }

    $out = [];

    foreach ($scores as $chunkId => $score) {
        $out[] = [
            // Array keys that look like integers come back as int; restore the string type.
            'chunk_id' => (string) $chunkId,
            'score' => round($score / $topScore, 6),
            'document_id' => $meta[$chunkId]['document_id'] ?? null,
            'chunk_index' => $meta[$chunkId]['chunk_index'] ?? null,
        ];

        if (count($out) >= $limit) {
            break;
        }
    }

    return $out;
}
|
||||
|
||||
/**
 * Forces a requested result limit into the range accepted by this class:
 * values below 1 become 1, values above self::MAX_LIMIT become MAX_LIMIT.
 */
private function clampLimit(int $limit): int
{
    return match (true) {
        $limit < 1 => 1,
        $limit > self::MAX_LIMIT => self::MAX_LIMIT,
        default => $limit,
    };
}
|
||||
}
|
||||
528
src/Knowledge/Retrieval/NdjsonLexicalIndexBuilder.php
Normal file
528
src/Knowledge/Retrieval/NdjsonLexicalIndexBuilder.php
Normal file
@@ -0,0 +1,528 @@
|
||||
<?php
|
||||
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Knowledge\StopWords;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use SQLite3;
|
||||
|
||||
final readonly class NdjsonLexicalIndexBuilder
{
    private const DEFAULT_RELATIVE_NDJSON_PATH = '/var/knowledge/index.ndjson';
    private const DEFAULT_RELATIVE_INDEX_PATH = '/var/knowledge/lexical.index.sqlite';

    /**
     * Upper bound to avoid pathological chunks exploding the lexical index.
     * This stays generic and does not encode any domain-specific assumption.
     */
    private const MAX_UNIQUE_TOKENS_PER_CHUNK = 256;

    public function __construct(
        private string $projectDir,
        private LoggerInterface $agentLogger,
    ) {
    }

    /**
     * Build a generic lexical SQLite index from index.ndjson.
     *
     * The index is written to "<final path>.tmp" first and only moved over the
     * final path after the build succeeded. When index.ndjson is missing or
     * empty, any existing index (and temp file) is removed instead.
     *
     * Output DB schema:
     *
     * lexical_meta(
     *   key TEXT PRIMARY KEY,
     *   value TEXT NOT NULL
     * )
     *
     * lexical_terms(
     *   token TEXT PRIMARY KEY,
     *   df INTEGER NOT NULL
     * )
     *
     * lexical_postings(
     *   token TEXT NOT NULL,
     *   chunk_id TEXT NOT NULL,
     *   document_id TEXT NOT NULL,
     *   chunk_index INTEGER,
     *   tf INTEGER NOT NULL,
     *   title_tf INTEGER NOT NULL DEFAULT 0,
     *   PRIMARY KEY(token, chunk_id)
     * )
     *
     * lexical_seen_chunks(
     *   chunk_id TEXT PRIMARY KEY
     * )  -- build-time helper used to de-duplicate chunk ids
     *
     * Design goals:
     * - generic, data-driven lexical retrieval base
     * - no domain keywords in core code
     * - no full scan per request later
     * - duplicate chunk_id lines in index.ndjson must not inflate the index
     *
     * @throws \Throwable Any build failure is logged and re-thrown after the
     *                    temp file was removed.
     */
    public function build(): void
    {
        $this->assertSqliteAvailable();

        $indexNdjsonPath = $this->getIndexNdjsonPath();
        $lexicalIndexPath = $this->getLexicalIndexPath();
        $tmpPath = $lexicalIndexPath . '.tmp';

        if (!is_file($indexNdjsonPath) || filesize($indexNdjsonPath) === 0) {
            $this->removeFileIfExists($lexicalIndexPath);
            $this->removeFileIfExists($tmpPath);

            $this->agentLogger->info('Lexical index skipped because index.ndjson is missing or empty.', [
                'index_ndjson' => $indexNdjsonPath,
            ]);

            return;
        }

        $this->ensureTargetDirectoryExists($lexicalIndexPath);
        $this->removeFileIfExists($tmpPath);

        $db = $this->openWritableDb($tmpPath);

        try {
            $this->initializeSchema($db);
            $this->buildFromNdjson($db, $indexNdjsonPath);
            $db->close();

            $this->atomicReplace($tmpPath, $lexicalIndexPath);

            $this->agentLogger->info('Lexical index build completed.', [
                'path' => $lexicalIndexPath,
            ]);
        } catch (\Throwable $e) {
            try {
                $db->close();
            } catch (\Throwable) {
                // Ignore close failures during cleanup.
            }

            $this->removeFileIfExists($tmpPath);

            $this->agentLogger->error('Lexical index build failed.', [
                'path' => $lexicalIndexPath,
                'error' => $e->getMessage(),
            ]);

            throw $e;
        }
    }

    /**
     * Streams index.ndjson line by line and fills the lexical tables inside a
     * single transaction.
     *
     * Lines that are blank, not a JSON object, or lack chunk_id / document_id /
     * text are skipped. A chunk_id is indexed only the first time it is seen.
     *
     * @throws \RuntimeException When the file cannot be opened or any SQL step fails.
     */
    private function buildFromNdjson(SQLite3 $db, string $indexNdjsonPath): void
    {
        $handle = @fopen($indexNdjsonPath, 'rb');

        if ($handle === false) {
            throw new \RuntimeException('Unable to read index.ndjson: ' . $indexNdjsonPath);
        }

        $inTransaction = false;

        try {
            if ($db->exec('BEGIN IMMEDIATE TRANSACTION') === false) {
                throw new \RuntimeException('Failed to begin lexical index transaction.');
            }

            $inTransaction = true;

            $seenChunkStmt = $db->prepare(
                'INSERT OR IGNORE INTO lexical_seen_chunks (chunk_id) VALUES (:chunk_id)'
            );
            $termStmt = $db->prepare(
                'INSERT INTO lexical_terms (token, df)
                 VALUES (:token, 1)
                 ON CONFLICT(token) DO UPDATE SET df = df + 1'
            );
            $postingStmt = $db->prepare(
                'INSERT INTO lexical_postings (
                    token,
                    chunk_id,
                    document_id,
                    chunk_index,
                    tf,
                    title_tf
                ) VALUES (
                    :token,
                    :chunk_id,
                    :document_id,
                    :chunk_index,
                    :tf,
                    :title_tf
                )'
            );

            if ($seenChunkStmt === false || $termStmt === false || $postingStmt === false) {
                throw new \RuntimeException('Failed to prepare lexical index SQL statements.');
            }

            $totalChunks = 0;
            $lineNumber = 0;

            while (($line = fgets($handle)) !== false) {
                $lineNumber++;
                $line = trim($line);

                if ($line === '') {
                    continue;
                }

                $row = json_decode($line, true);

                if (!is_array($row)) {
                    continue;
                }

                $chunkId = $this->stringField($row, 'chunk_id');
                $documentId = $this->stringField($row, 'document_id');
                $chunkIndex = $this->normalizeChunkIndex($row['chunk_index'] ?? null);
                $text = $this->stringField($row, 'text');

                if ($chunkId === '' || $documentId === '' || $text === '') {
                    continue;
                }

                $this->executeWrite(
                    $seenChunkStmt,
                    [':chunk_id' => [$chunkId, SQLITE3_TEXT]],
                    'Failed to record lexical chunk id: ' . $chunkId . ' (index.ndjson line ' . $lineNumber . ')'
                );

                // INSERT OR IGNORE changed nothing: this chunk_id was already indexed.
                if ($db->changes() < 1) {
                    continue;
                }

                $tokenStats = $this->buildTokenStats($text, $this->extractDocumentTitle($row));

                if ($tokenStats === []) {
                    continue;
                }

                $totalChunks++;

                foreach ($tokenStats as $token => $stats) {
                    // Numeric tokens come back as int array keys; bind them as text.
                    $token = (string) $token;

                    $this->executeWrite(
                        $termStmt,
                        [':token' => [$token, SQLITE3_TEXT]],
                        'Failed to update lexical term: ' . $token . ' (index.ndjson line ' . $lineNumber . ')'
                    );

                    $this->executeWrite(
                        $postingStmt,
                        [
                            ':token' => [$token, SQLITE3_TEXT],
                            ':chunk_id' => [$chunkId, SQLITE3_TEXT],
                            ':document_id' => [$documentId, SQLITE3_TEXT],
                            ':chunk_index' => [
                                $chunkIndex,
                                $chunkIndex === null ? SQLITE3_NULL : SQLITE3_INTEGER,
                            ],
                            ':tf' => [$stats['tf'], SQLITE3_INTEGER],
                            ':title_tf' => [$stats['title_tf'], SQLITE3_INTEGER],
                        ],
                        'Failed to insert lexical posting for token: ' . $token
                            . ' (index.ndjson line ' . $lineNumber . ')'
                    );
                }
            }

            $this->writeMeta($db, $totalChunks);

            if ($db->exec('COMMIT') === false) {
                throw new \RuntimeException('Failed to commit lexical index transaction.');
            }

            $inTransaction = false;

            $this->agentLogger->info('Lexical index streaming pass completed.', [
                'indexed_chunks' => $totalChunks,
                'source' => $indexNdjsonPath,
            ]);
        } catch (\Throwable $e) {
            if ($inTransaction) {
                $db->exec('ROLLBACK');
            }

            throw $e;
        } finally {
            // Close exactly once; a second fclose() on the same handle would
            // raise a TypeError and hide the original failure.
            fclose($handle);
        }
    }

    /**
     * Resets a prepared write statement, binds the given values, executes it
     * and releases the result.
     *
     * @param array<string, array{0: mixed, 1: int}> $bindings placeholder => [value, SQLITE3_* type]
     *
     * @throws \RuntimeException With $failureMessage when execution fails.
     */
    private function executeWrite(\SQLite3Stmt $statement, array $bindings, string $failureMessage): void
    {
        $statement->reset();
        $statement->clear();

        foreach ($bindings as $placeholder => [$value, $type]) {
            $statement->bindValue($placeholder, $value, $type);
        }

        $result = $statement->execute();

        if ($result === false) {
            throw new \RuntimeException($failureMessage);
        }

        $result->finalize();
    }

    /**
     * Reads a scalar field from a decoded NDJSON row as a trimmed string.
     * Missing, null or non-scalar values (arrays) yield an empty string, which
     * callers treat as "field absent".
     */
    private function stringField(array $row, string $key): string
    {
        $value = $row[$key] ?? '';

        return is_scalar($value) ? trim((string) $value) : '';
    }

    /**
     * Counts token occurrences in the chunk text and in the document title.
     *
     * Tokens are ordered text-first, then title-only tokens; the list is cut
     * to MAX_UNIQUE_TOKENS_PER_CHUNK entries. A title-only token has tf = 0.
     * Purely numeric tokens appear as int keys because of PHP array key casting.
     *
     * @return array<array-key, array{tf:int, title_tf:int}>
     */
    private function buildTokenStats(string $text, string $title): array
    {
        $textTokens = $this->tokenize($text);
        $titleTokens = $this->tokenize($title);

        if ($textTokens === [] && $titleTokens === []) {
            return [];
        }

        $textTf = array_count_values($textTokens);
        $titleTf = array_count_values($titleTokens);

        // Array union keeps text tokens first and appends title-only tokens.
        $tokens = array_slice(
            array_keys($textTf + $titleTf),
            0,
            self::MAX_UNIQUE_TOKENS_PER_CHUNK
        );

        $stats = [];

        foreach ($tokens as $token) {
            $stats[$token] = [
                'tf' => $textTf[$token] ?? 0,
                'title_tf' => $titleTf[$token] ?? 0,
            ];
        }

        return $stats;
    }

    /**
     * Generic tokenizer:
     * - lowercases
     * - removes punctuation
     * - preserves alphanumeric codes
     * - keeps numeric/code-like tokens even if short
     * - drops generic stop words for non-numeric tokens
     *
     * @return string[]
     */
    private function tokenize(string $value): array
    {
        $value = $this->normalizeText($value);

        if ($value === '') {
            return [];
        }

        $parts = preg_split('/\s+/u', $value, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return array_values(array_filter(
            $parts,
            fn (string $token): bool => !$this->shouldIgnoreToken($token)
        ));
    }

    /**
     * Decides whether a normalized token is left out of the index.
     *
     * Tokens containing a digit are always kept; otherwise single-character
     * tokens and stop words are dropped.
     */
    private function shouldIgnoreToken(string $token): bool
    {
        if ($token === '') {
            return true;
        }

        if (preg_match('/\d/u', $token) === 1) {
            return false;
        }

        if (mb_strlen($token, 'UTF-8') < 2) {
            return true;
        }

        return StopWords::isStopWord($token);
    }

    /**
     * Lowercases the text, turns "-", "/", "_" and every other non
     * letter/number character into a space, and collapses whitespace runs.
     */
    private function normalizeText(string $value): string
    {
        $value = mb_strtolower(trim($value), 'UTF-8');
        $value = str_replace(['-', '/', '_'], ' ', $value);
        // preg_replace() returns null on PCRE errors; keep the previous value then.
        $value = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $value) ?? $value;
        $value = preg_replace('/\s+/u', ' ', $value) ?? $value;

        return trim($value);
    }

    /**
     * Returns the trimmed metadata.document_title of a row, or '' when the
     * row has no metadata array or no title.
     */
    private function extractDocumentTitle(array $row): string
    {
        $metadata = $row['metadata'] ?? null;

        if (!is_array($metadata)) {
            return '';
        }

        return $this->stringField($metadata, 'document_title');
    }

    /**
     * Accepts an int or a digits-only string as chunk index; anything else
     * (including floats and negative numeric strings) becomes null.
     */
    private function normalizeChunkIndex(mixed $value): ?int
    {
        if (is_int($value)) {
            return $value;
        }

        if (is_string($value) && ctype_digit($value)) {
            return (int) $value;
        }

        return null;
    }

    /**
     * Stores schema_version, built_at and total_chunks in lexical_meta.
     *
     * @throws \RuntimeException When the statement cannot be prepared or a key cannot be written.
     */
    private function writeMeta(SQLite3 $db, int $totalChunks): void
    {
        $metaStmt = $db->prepare(
            'INSERT OR REPLACE INTO lexical_meta (key, value) VALUES (:key, :value)'
        );

        if ($metaStmt === false) {
            throw new \RuntimeException('Failed to prepare lexical meta statement.');
        }

        $meta = [
            'schema_version' => '1',
            'built_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
            'total_chunks' => (string) $totalChunks,
        ];

        foreach ($meta as $key => $value) {
            $this->executeWrite(
                $metaStmt,
                [
                    ':key' => [$key, SQLITE3_TEXT],
                    ':value' => [$value, SQLITE3_TEXT],
                ],
                'Failed to write lexical meta key: ' . $key
            );
        }
    }

    /**
     * Applies the build-time PRAGMAs and creates all tables and indexes.
     *
     * @throws \RuntimeException When the schema script fails.
     */
    private function initializeSchema(SQLite3 $db): void
    {
        $db->exec('PRAGMA journal_mode = DELETE');
        $db->exec('PRAGMA synchronous = NORMAL');
        $db->exec('PRAGMA temp_store = MEMORY');
        $db->exec('PRAGMA foreign_keys = OFF');

        $schema = <<<'SQL'
        CREATE TABLE IF NOT EXISTS lexical_meta (
            key TEXT PRIMARY KEY,
            value TEXT NOT NULL
        );

        CREATE TABLE IF NOT EXISTS lexical_terms (
            token TEXT PRIMARY KEY,
            df INTEGER NOT NULL
        );

        CREATE TABLE IF NOT EXISTS lexical_postings (
            token TEXT NOT NULL,
            chunk_id TEXT NOT NULL,
            document_id TEXT NOT NULL,
            chunk_index INTEGER NULL,
            tf INTEGER NOT NULL,
            title_tf INTEGER NOT NULL DEFAULT 0,
            PRIMARY KEY (token, chunk_id)
        );

        CREATE INDEX IF NOT EXISTS idx_lexical_postings_document_token
            ON lexical_postings (document_id, token);

        CREATE INDEX IF NOT EXISTS idx_lexical_postings_chunk
            ON lexical_postings (chunk_id);

        CREATE TABLE IF NOT EXISTS lexical_seen_chunks (
            chunk_id TEXT PRIMARY KEY
        );
        SQL;

        if ($db->exec($schema) === false) {
            throw new \RuntimeException('Failed to initialize lexical index schema.');
        }
    }

    /**
     * Opens (creating if needed) the SQLite file at $path for writing and
     * sets a 5 second busy timeout.
     *
     * @throws \RuntimeException When the database cannot be opened.
     */
    private function openWritableDb(string $path): SQLite3
    {
        try {
            $db = new SQLite3($path, SQLITE3_OPEN_READWRITE | SQLITE3_OPEN_CREATE);
        } catch (\Throwable $e) {
            throw new \RuntimeException('Unable to open lexical index DB: ' . $path, 0, $e);
        }

        $db->busyTimeout(5000);

        return $db;
    }

    /** Absolute path of the NDJSON source below the project directory. */
    private function getIndexNdjsonPath(): string
    {
        return rtrim($this->projectDir, '/') . self::DEFAULT_RELATIVE_NDJSON_PATH;
    }

    /** Absolute path of the SQLite lexical index below the project directory. */
    private function getLexicalIndexPath(): string
    {
        return rtrim($this->projectDir, '/') . self::DEFAULT_RELATIVE_INDEX_PATH;
    }

    /**
     * Creates the parent directory of the index file when it does not exist.
     *
     * @throws \RuntimeException When the directory cannot be created.
     */
    private function ensureTargetDirectoryExists(string $finalIndexPath): void
    {
        $dir = dirname($finalIndexPath);

        if (is_dir($dir)) {
            return;
        }

        // Re-check is_dir() so a concurrent mkdir by another process is not an error.
        if (!@mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException('Unable to create lexical index directory: ' . $dir);
        }
    }

    /**
     * Moves the finished temp index over the final path.
     *
     * rename() is tried first; if it fails the file is copied instead, which
     * is a best-effort fallback and not atomic. The temp file is removed in
     * every case.
     *
     * @throws \RuntimeException When neither rename nor copy succeeds.
     */
    private function atomicReplace(string $tmpPath, string $finalPath): void
    {
        if (is_file($finalPath)) {
            @chmod($finalPath, 0664);
        }

        if (!@rename($tmpPath, $finalPath)) {
            if (!@copy($tmpPath, $finalPath)) {
                @unlink($tmpPath);

                throw new \RuntimeException('Atomic replace failed for lexical index: ' . $finalPath);
            }

            @unlink($tmpPath);
        }

        @chmod($finalPath, 0664);
    }

    /** Deletes $path when it is a regular file; failures are ignored. */
    private function removeFileIfExists(string $path): void
    {
        if (is_file($path)) {
            @unlink($path);
        }
    }

    /**
     * @throws \RuntimeException When the sqlite3 extension is not loaded.
     */
    private function assertSqliteAvailable(): void
    {
        if (!class_exists(SQLite3::class)) {
            throw new \RuntimeException('The sqlite3 PHP extension is required for lexical index building.');
        }
    }
}
|
||||
@@ -8,6 +8,14 @@ use App\Config\QueryEnricherConfig;
|
||||
|
||||
final readonly class QueryEnricher
|
||||
{
|
||||
/**
|
||||
* Keep enrichment conservative.
|
||||
*
|
||||
* The enriched semantic query should help vector retrieval,
|
||||
* but must not become bloated enough to dilute the original user intent.
|
||||
*/
|
||||
private const MAX_EXPANSIONS = 4;
|
||||
|
||||
public function __construct(
|
||||
private QueryEnricherConfig $config
|
||||
) {
|
||||
@@ -16,6 +24,12 @@ final readonly class QueryEnricher
|
||||
/**
|
||||
* Enriches the query with mapped counterpart terms.
|
||||
*
|
||||
* Design goals:
|
||||
* - preserve the original query unchanged at the front
|
||||
* - only append counterpart terms that are not already present
|
||||
* - prefer longer / more specific phrase matches over short generic matches
|
||||
* - keep the number of appended terms intentionally small
|
||||
*
|
||||
* Example:
|
||||
* - input: "water hardness device"
|
||||
* - output: "water hardness device residual hardness model"
|
||||
@@ -29,26 +43,63 @@ final readonly class QueryEnricher
|
||||
}
|
||||
|
||||
$mapping = $this->config->getEnrichQueryList();
|
||||
|
||||
if ($mapping === []) {
|
||||
return $originalQuery;
|
||||
}
|
||||
|
||||
$lookup = $this->buildBidirectionalLookup($mapping);
|
||||
|
||||
if ($lookup === []) {
|
||||
return $originalQuery;
|
||||
}
|
||||
|
||||
$lookup = $this->sortLookupBySpecificity($lookup);
|
||||
$normalizedQuery = $this->normalizeForMatching($originalQuery);
|
||||
|
||||
$matches = [];
|
||||
if ($normalizedQuery === '') {
|
||||
return $originalQuery;
|
||||
}
|
||||
|
||||
foreach ($lookup as $needle => $mappedValue) {
|
||||
if ($needle === '') {
|
||||
$matches = [];
|
||||
$seenNormalizedExpansions = [];
|
||||
|
||||
foreach ($lookup as $normalizedNeedle => $mappedValue) {
|
||||
if ($normalizedNeedle === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($this->containsWholePhrase($normalizedQuery, $needle)) {
|
||||
$matches[] = $mappedValue;
|
||||
if (!$this->containsWholePhrase($normalizedQuery, $normalizedNeedle)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$mappedValue = trim($mappedValue);
|
||||
if ($mappedValue === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$normalizedMappedValue = $this->normalizeForMatching($mappedValue);
|
||||
if ($normalizedMappedValue === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Do not re-add information that is already present in the query.
|
||||
if ($this->containsWholePhrase($normalizedQuery, $normalizedMappedValue)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isset($seenNormalizedExpansions[$normalizedMappedValue])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$matches[] = $mappedValue;
|
||||
$seenNormalizedExpansions[$normalizedMappedValue] = true;
|
||||
|
||||
if (count($matches) >= self::MAX_EXPANSIONS) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
$matches = array_values(array_unique(array_filter(
|
||||
$matches,
|
||||
static fn(string $value): bool => trim($value) !== ''
|
||||
)));
|
||||
|
||||
if ($matches === []) {
|
||||
return $originalQuery;
|
||||
}
|
||||
@@ -106,6 +157,11 @@ final readonly class QueryEnricher
|
||||
* 'jacket' => 'coat',
|
||||
* 'coat' => 'jacket',
|
||||
* ]
|
||||
*
|
||||
* Returned format:
|
||||
* [
|
||||
* '<normalized needle>' => '<original mapped value>',
|
||||
* ]
|
||||
*/
|
||||
private function buildBidirectionalLookup(array $mapping): array
|
||||
{
|
||||
@@ -122,15 +178,49 @@ final readonly class QueryEnricher
|
||||
$normalizedKey = $this->normalizeForMatching($key);
|
||||
$normalizedValue = $this->normalizeForMatching($value);
|
||||
|
||||
if ($normalizedKey !== '') {
|
||||
if ($normalizedKey !== '' && !isset($lookup[$normalizedKey])) {
|
||||
$lookup[$normalizedKey] = $value;
|
||||
}
|
||||
|
||||
if ($normalizedValue !== '') {
|
||||
if ($normalizedValue !== '' && !isset($lookup[$normalizedValue])) {
|
||||
$lookup[$normalizedValue] = $key;
|
||||
}
|
||||
}
|
||||
|
||||
return $lookup;
|
||||
}
|
||||
|
||||
/**
 * Orders lookup rules so the most specific needle is tried first.
 *
 * Ordering keys, in descending priority:
 * 1. word count (more words first)
 * 2. character length (longer first)
 * 3. byte-wise string order (ascending), which keeps the result deterministic
 *
 * @param array<string, string> $lookup
 * @return array<string, string>
 */
private function sortLookupBySpecificity(array $lookup): array
{
    uksort(
        $lookup,
        // Each step only decides when every previous step compared equal (0).
        static fn (string $a, string $b): int =>
            (substr_count($b, ' ') <=> substr_count($a, ' '))
                ?: (mb_strlen($b, 'UTF-8') <=> mb_strlen($a, 'UTF-8'))
                ?: strcmp($a, $b)
    );

    return $lookup;
}
|
||||
}
|
||||
Reference in New Issue
Block a user