From 57a35af6c8466e9fe1e2db1c617c5993cb3d06c0 Mon Sep 17 00:00:00 2001 From: team 1 Date: Mon, 16 Feb 2026 12:34:22 +0100 Subject: [PATCH] harden struct --- config/services.yaml | 83 +++++++--- src/Controller/Admin/DocumentController.php | 8 +- src/Index/IndexMetaManager.php | 105 ++++-------- src/Vector/VectorIndexBuilder.php | 175 +++++++++----------- src/Vector/VectorSearchClient.php | 96 ++++++++--- src/Vector/vector_search.py | 93 ++++++++--- 6 files changed, 308 insertions(+), 252 deletions(-) diff --git a/config/services.yaml b/config/services.yaml index 288432a..6703f52 100644 --- a/config/services.yaml +++ b/config/services.yaml @@ -3,23 +3,45 @@ # ------------------------------------------------------------ parameters: - mto.kernel.dir: '%kernel.project_dir%' + # ------------------------------------------------------------ + # Root + # ------------------------------------------------------------ + + mto.root: '%kernel.project_dir%' + mto.kernel.dir: '%mto.root%' + + # ------------------------------------------------------------ + # Knowledge Root (ZENTRAL) + # ------------------------------------------------------------ + + mto.knowledge.root: '%mto.root%/var/knowledge' + + mto.knowledge.ndjson: '%mto.knowledge.root%/index.ndjson' + mto.knowledge.index_meta: '%mto.knowledge.root%/index_meta.json' + mto.knowledge.vector_index: '%mto.knowledge.root%/vector.index' + mto.knowledge.vector_index_meta: '%mto.knowledge.root%/vector.index.meta.json' + mto.knowledge.upload: '%mto.knowledge.root%/uploads' + # Backward compatibility alias + mto.vector.data.upload.path: '%mto.knowledge.upload%' + + # ------------------------------------------------------------ + # Index Configuration (Guardrails) + # ------------------------------------------------------------ mto.index.chunk_size: 800 mto.index.chunk_overlap: 100 - mto.index.embedding_model: 'all-MiniLM-L6-v2' #nomic-embed-text + mto.index.embedding_model: 'all-MiniLM-L6-v2' mto.index.embedding_dimension: 768 mto.index.scoring_version: 1 - mto.vector.python_bin: '/var/www/html/.venv/bin/python3' - mto.vector.ingest_script: '/src/Vector/vector_ingest.py' - mto.vector.search_script: '%kernel.project_dir%/src/Vector/vector_search.py' + # ------------------------------------------------------------ + # Python / Vector Runtime + # ------------------------------------------------------------ - mto.vector.data.path: '%kernel.project_dir%/var/knowledge' - mto.vector.data.upload.path: '%mto.vector.data.path%/uploads' - mto.vector.data.ndjson.path: '%mto.vector.data.path%/index.ndjson' - mto.vector.data.vector_index.path: '%mto.vector.data.path%/vector.index' - mto.vector.data.vector_index_meta_json.path: '%mto.vector.data.path%/vector.index.meta.json' + mto.vector.python_bin: '/var/www/html/.venv/bin/python3' + + mto.vector.ingest_script: '%mto.root%/src/Vector/vector_ingest.py' + mto.vector.search_script: '%mto.root%/src/Vector/vector_search.py' mto.vector.timeout: 600 @@ -29,20 +51,13 @@ parameters: services: - # ------------------------------------------------------------ - # Default service configuration - # ------------------------------------------------------------ _defaults: autowire: true autoconfigure: true - bind: Psr\Log\LoggerInterface $agentLogger: '@monolog.logger.agent' - string $projectDir: '%kernel.project_dir%' + string $projectDir: '%mto.root%' - # ------------------------------------------------------------ - # Automatically register all services in src/ - # ------------------------------------------------------------ App\: resource: '../src/' exclude: @@ -57,6 +72,7 @@ services: # ------------------------------------------------------------ # AI Agent – Infrastructure # ------------------------------------------------------------ + App\Infrastructure\OllamaClient: arguments: $apiUrl: '%env(AI_LLM_API_URL)%' @@ -66,10 +82,11 @@ services: # ------------------------------------------------------------ # AI Agent – Context & Runner # ------------------------------------------------------------ + App\Context\ContextService: arguments: $historyDir: '%env(AI_HISTORY_DIR)%' - $projectDir: '%kernel.project_dir%' + $projectDir: '%mto.root%' App\Agent\AgentRunner: arguments: @@ -78,7 +95,7 @@ services: $logContext: '%env(bool:AI_LOG_CONTEXT)%' # ------------------------------------------------------------ - # NDJSON Retrieval Stack (FINAL ARCHITECTURE) + # NDJSON Retrieval Stack # ------------------------------------------------------------ App\Knowledge\Retrieval\NdjsonChunkLookup: ~ @@ -100,22 +117,34 @@ services: alias: App\Knowledge\Retrieval\CachedRetriever # ------------------------------------------------------------ - # Vector Search (FAISS NDJSON-based) + # Vector Search (noch unverändert – Umbau kommt in Schritt 2) # ------------------------------------------------------------ + App\Index\IndexMetaManager: + arguments: + $metaPath: '%mto.knowledge.index_meta%' + $config: '@App\Index\IndexConfiguration' App\Vector\VectorSearchClient: arguments: - $binPythonDir: '%mto.vector.python_bin%' - $vectorSearchPyPath: '%mto.vector.search_script%' + $pythonBin: '%mto.vector.python_bin%' + $scriptPath: '%mto.vector.search_script%' + $vectorIndexPath: '%mto.knowledge.vector_index%' + $vectorMetaPath: '%mto.knowledge.vector_index_meta%' + $indexMetaPath: '%mto.knowledge.index_meta%' + $agentLogger: '@monolog.logger.agent' App\Vector\VectorIndexBuilder: arguments: $pythonBin: '%mto.vector.python_bin%' - $relativeScriptPath: '%mto.vector.ingest_script%' + $scriptPath: '%mto.vector.ingest_script%' + $indexNdjsonPath: '%mto.knowledge.ndjson%' + $indexMetaPath: '%mto.knowledge.index_meta%' + $vectorIndexPath: '%mto.knowledge.vector_index%' $timeoutSeconds: '%mto.vector.timeout%' + $indexConfiguration: '@App\Index\IndexConfiguration' # ------------------------------------------------------------ - # Index Configuration (Guardrails) + # Index Configuration # ------------------------------------------------------------ App\Index\IndexConfiguration: @@ -130,5 +159,5 @@ services: App\Service\Admin\IndexNdjsonInspector: arguments: - $ndJsonPath: '%mto.vector.data.ndjson.path%' - $indexMetaPath: '%mto.vector.data.vector_index_meta_json.path%' + $ndJsonPath: '%mto.knowledge.ndjson%' + $indexMetaPath: '%mto.knowledge.index_meta%' diff --git a/src/Controller/Admin/DocumentController.php b/src/Controller/Admin/DocumentController.php index eef57c7..0a053ce 100644 --- a/src/Controller/Admin/DocumentController.php +++ b/src/Controller/Admin/DocumentController.php @@ -370,10 +370,10 @@ class DocumentController extends AbstractController return $this->redirectToRoute('admin_dashboard'); } - @unlink($params->get('mto.vector.data.ndjson.path')); - @unlink($params->get('mto.vector.data.vector_index.path')); - @unlink($params->get('mto.vector.data.vector_index_meta_json.path')); - exec('rm -rf ' . $params->get('mto.vector.data.upload.path')); + @unlink($params->get('mto.knowledge.ndjson')); + @unlink($params->get('mto.knowledge.vector_index')); + @unlink($params->get('mto.knowledge.vector_index_meta')); + exec('rm -rf ' . $params->get('mto.knowledge.upload')); $sql = ' SET FOREIGN_KEY_CHECKS = 0; diff --git a/src/Index/IndexMetaManager.php b/src/Index/IndexMetaManager.php index 2b0669f..3920bd8 100644 --- a/src/Index/IndexMetaManager.php +++ b/src/Index/IndexMetaManager.php @@ -1,6 +1,5 @@ metaPath = rtrim($projectDir, '/') . $relativeMetaPath; + string $metaPath, + IndexConfiguration $config + ) { + $this->metaPath = $metaPath; + $this->config = $config; } public function getMetaPath(): string @@ -24,8 +23,6 @@ final class IndexMetaManager } /** - * Gibt null zurück, wenn noch kein Meta existiert (frisches System). - * * @return array|null */ public function readMeta(): ?array @@ -48,43 +45,19 @@ final class IndexMetaManager } /** - * Erstellt Meta, falls nicht vorhanden (z. B. nach erstem Global Reindex). - * Überschreibt NICHT automatisch, wenn vorhanden. - * - * @return array - */ - public function createInitialMetaIfMissing(): array - { - $existing = $this->readMeta(); - if ($existing !== null) { - return $existing; - } - - $meta = $this->buildMetaPayload(indexVersion: 1); - $this->atomicWriteJson($meta); - - return $meta; - } - - /** - * Guardrail: Prüft, ob die aktuelle Config kompatibel zur gespeicherten Meta ist. - * Wenn nicht: IndexStructureChangedException -> Global Reindex erzwingen. + * Guardrail: + * - Wenn Meta fehlt → initialisieren + * - Wenn Struktur driftet → Exception */ public function validateAgainstCurrent(): void { $meta = $this->readMeta(); - // Wenn noch kein Meta existiert, lassen wir lokale Ingests NICHT einfach laufen. - // Governance: Erst Global Reindex erzeugt Meta sauber. if ($meta === null) { - throw new IndexStructureChangedException( - 'index_meta.json missing. Please run a Global Reindex to initialize index structure metadata.', - ['reason' => 'missing_meta'] - ); + $meta = $this->createInitialMeta(); } $expected = $this->config->toStructureArray(); - $diff = $this->diffStructure($meta, $expected); if ($diff !== []) { @@ -96,11 +69,7 @@ final class IndexMetaManager } /** - * Wird beim Global Reindex verwendet: - * - index_version++ (oder initialisieren) - * - Meta atomar schreiben - * - * @return array new meta + * Wird beim Global Reindex aufgerufen */ public function writeMetaForGlobalReindex(): array { @@ -122,35 +91,34 @@ final class IndexMetaManager return $this->config; } - // ------------------------- + // --------------------------------------------------------- // Internals - // ------------------------- + // --------------------------------------------------------- + + private function createInitialMeta(): array + { + $meta = $this->buildMetaPayload(1); + $this->atomicWriteJson($meta); + return $meta; + } - /** - * @return array - */ private function buildMetaPayload(int $indexVersion): array { $structure = $this->config->toStructureArray(); return [ - 'index_version' => $indexVersion, - 'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM), - 'embedding_model' => $structure['embedding_model'], + 'index_version' => $indexVersion, + 'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM), + 'embedding_model' => $structure['embedding_model'], 'embedding_dimension' => $structure['embedding_dimension'], - 'chunk_size' => $structure['chunk_size'], - 'chunk_overlap' => $structure['chunk_overlap'], - 'scoring_version' => $structure['scoring_version'], - 'index_format' => $structure['index_format'], - 'vector_backend' => $structure['vector_backend'], + 'chunk_size' => $structure['chunk_size'], + 'chunk_overlap' => $structure['chunk_overlap'], + 'scoring_version' => $structure['scoring_version'], + 'index_format' => $structure['index_format'], + 'vector_backend' => $structure['vector_backend'], ]; } - /** - * @param array $meta - * @param array $expected - * @return array diff - */ private function diffStructure(array $meta, array $expected): array { $diff = []; @@ -160,28 +128,18 @@ final class IndexMetaManager if ($actual !== $value) { $diff[$key] = [ 'expected' => $value, - 'actual' => $actual, + 'actual' => $actual, ]; } } - // index_format ist zwingend - if (($meta['index_format'] ?? null) !== 'ndjson') { - $diff['index_format'] = [ - 'expected' => 'ndjson', - 'actual' => $meta['index_format'] ?? null, - ]; - } - return $diff; } - /** - * @param array $payload - */ private function atomicWriteJson(array $payload): void { - $dir = \dirname($this->metaPath); + $dir = dirname($this->metaPath); + if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) { throw new \RuntimeException('Unable to create directory: ' . $dir); } @@ -197,7 +155,6 @@ final class IndexMetaManager throw new \RuntimeException('Unable to write temp meta file'); } - // atomarer Switch if (!rename($tmp, $this->metaPath)) { @unlink($tmp); throw new \RuntimeException('Unable to switch meta file atomically'); diff --git a/src/Vector/VectorIndexBuilder.php b/src/Vector/VectorIndexBuilder.php index 3799e79..fd21e47 100644 --- a/src/Vector/VectorIndexBuilder.php +++ b/src/Vector/VectorIndexBuilder.php @@ -1,10 +1,10 @@ pythonBin = $pythonBin; - $this->scriptPath = $base . $relativeScriptPath; - $this->indexNdjsonPath = $base . $relativeIndexNdjsonPath; - $this->vectorIndexPath = $base . $relativeVectorIndexPath; - $this->timeoutSeconds = $timeoutSeconds; + string $pythonBin, + string $scriptPath, + string $indexNdjsonPath, + string $indexMetaPath, + string $vectorIndexPath, + int $timeoutSeconds, + IndexConfiguration $indexConfiguration + ) { + $this->pythonBin = $pythonBin; + $this->scriptPath = $scriptPath; + $this->indexNdjsonPath = $indexNdjsonPath; + $this->indexMetaPath = $indexMetaPath; + $this->vectorIndexPath = $vectorIndexPath; + $this->vectorMetaPath = $vectorIndexPath . '.meta.json'; + $this->timeoutSeconds = $timeoutSeconds; + $this->indexConfiguration = $indexConfiguration; } - public function getIndexNdjsonPath(): string - { - return $this->indexNdjsonPath; - } - - public function getVectorIndexPath(): string - { - return $this->vectorIndexPath; - } - - public function getScriptPath(): string - { - return $this->scriptPath; - } - - /** - * Rebuild FAISS Index deterministisch aus index.ndjson. - * - * Erwartung: Python schreibt in $tmpVectorIndexPath, wir schalten atomar um. - * - * @param string|null $logPath Optional: stdout/stderr dorthin appenden - */ public function rebuildFromNdjson(?string $logPath = null): void { if (!is_file($this->scriptPath)) { @@ -66,97 +49,97 @@ final class VectorIndexBuilder throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath); } - $dir = \dirname($this->vectorIndexPath); - if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) { - throw new \RuntimeException('Unable to create vector index directory: ' . $dir); + if (!is_file($this->indexMetaPath)) { + $this->initializeIndexMeta(); } + $indexMeta = json_decode((string) file_get_contents($this->indexMetaPath), true); + + if (!is_array($indexMeta) || empty($indexMeta['embedding_model'])) { + throw new \RuntimeException('Invalid index_meta.json'); + } + + $embeddingModel = (string) $indexMeta['embedding_model']; + $tmpVectorIndexPath = $this->vectorIndexPath . '.tmp'; - // Vorheriges tmp entfernen (Sicherheit) - if (is_file($tmpVectorIndexPath)) { - @unlink($tmpVectorIndexPath); - } + // Wichtig: Python erzeugt meta basierend auf endgültigem Namen + $finalMetaPath = $this->vectorMetaPath; + $tmpMetaPath = dirname($this->vectorIndexPath) . '/' . basename($this->vectorIndexPath, '.index') . '.index.meta.json'; + + @unlink($tmpVectorIndexPath); + @unlink($finalMetaPath); - // ---------------------------- - // Python-Aufruf (konservativ) - // ---------------------------- - // Wir erwarten/standardisieren (ab jetzt) CLI-Args: - // --index - // --out $cmd = [ $this->pythonBin, $this->scriptPath, '--index', $this->indexNdjsonPath, - '--out', $tmpVectorIndexPath, - '--model', 'all-MiniLM-L6-v2', + '--out', $tmpVectorIndexPath, + '--model', $embeddingModel, ]; $process = new Process($cmd); $process->setTimeout($this->timeoutSeconds); + $process->mustRun(); - $this->runProcess($process, $logPath); - - // Python muss tmp erzeugt haben if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) { - throw new \RuntimeException('Vector index rebuild failed: tmp output missing or empty: ' . $tmpVectorIndexPath); + throw new \RuntimeException('Vector index tmp missing or empty'); } - // Atomarer Switch - $this->atomicSwitch($tmpVectorIndexPath, $this->vectorIndexPath); + // Python erzeugt vector.index.meta.json (nicht tmp.meta!) + if (!is_file($this->vectorMetaPath) || filesize($this->vectorMetaPath) === 0) { + throw new \RuntimeException('Vector meta missing or empty'); + } + + // Atomarer Switch für Index + if (!rename($tmpVectorIndexPath, $this->vectorIndexPath)) { + throw new \RuntimeException('Atomic switch failed for vector index'); + } } - // ------------------------- - // Internals - // ------------------------- + private function initializeIndexMeta(): void + { + $dir = dirname($this->indexMetaPath); + + if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) { + throw new \RuntimeException('Cannot create knowledge directory'); + } + + $data = [ + 'index_version' => 1, + 'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM), + 'embedding_model' => $this->indexConfiguration->getEmbeddingModel(), + 'embedding_dimension' => $this->indexConfiguration->getEmbeddingDimension(), + 'chunk_size' => $this->indexConfiguration->getChunkSize(), + 'chunk_overlap' => $this->indexConfiguration->getChunkOverlap(), + 'scoring_version' => $this->indexConfiguration->getScoringVersion(), + 'index_format' => 'ndjson', + 'vector_backend' => 'faiss', + ]; + + file_put_contents( + $this->indexMetaPath, + json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES) + ); + } private function runProcess(Process $process, ?string $logPath): void { if ($logPath !== null) { - $this->appendLog($logPath, "\n=== VectorIndexBuilder START " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n"); - $this->appendLog($logPath, "CMD: " . $process->getCommandLine() . "\n"); + @file_put_contents($logPath, "=== VectorIndexBuilder START ===\n", FILE_APPEND); } - $process->run(function (string $type, string $buffer) use ($logPath) { - if ($logPath === null) { - return; - } - - // TYPE: Process::OUT / Process::ERR - $this->appendLog($logPath, $buffer); - }); + $process->run(); if (!$process->isSuccessful()) { if ($logPath !== null) { - $this->appendLog($logPath, "\n=== VectorIndexBuilder FAILED ===\n"); - $this->appendLog($logPath, "ExitCode: " . $process->getExitCode() . "\n"); - $this->appendLog($logPath, "STDERR:\n" . $process->getErrorOutput() . "\n"); + @file_put_contents($logPath, $process->getErrorOutput(), FILE_APPEND); } - throw new ProcessFailedException($process); } if ($logPath !== null) { - $this->appendLog($logPath, "\n=== VectorIndexBuilder OK " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n"); - } - } - - private function appendLog(string $logPath, string $content): void - { - $dir = \dirname($logPath); - if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) { - // Wenn Log nicht möglich ist: nicht hart scheitern (Build ist wichtiger) - return; - } - - @file_put_contents($logPath, $content, FILE_APPEND); - } - - private function atomicSwitch(string $tmp, string $final): void - { - if (!rename($tmp, $final)) { - @unlink($tmp); - throw new \RuntimeException('Atomic switch failed for vector.index'); + @file_put_contents($logPath, "=== VectorIndexBuilder OK ===\n", FILE_APPEND); } } } diff --git a/src/Vector/VectorSearchClient.php b/src/Vector/VectorSearchClient.php index 1d5b546..58e6ec5 100644 --- a/src/Vector/VectorSearchClient.php +++ b/src/Vector/VectorSearchClient.php @@ -8,48 +8,90 @@ use Psr\Log\LoggerInterface; final class VectorSearchClient { + private string $pythonBin; + private string $scriptPath; + private string $vectorIndexPath; + private string $vectorMetaPath; + private string $indexMetaPath; + private LoggerInterface $agentLogger; + public function __construct( - private readonly string $binPythonDir, - private readonly string $vectorSearchPyPath, - private LoggerInterface $agentLogger, + string $pythonBin, + string $scriptPath, + string $vectorIndexPath, + string $vectorMetaPath, + string $indexMetaPath, + LoggerInterface $agentLogger ) { + $this->pythonBin = $pythonBin; + $this->scriptPath = $scriptPath; + $this->vectorIndexPath = $vectorIndexPath; + $this->vectorMetaPath = $vectorMetaPath; + $this->indexMetaPath = $indexMetaPath; + $this->agentLogger = $agentLogger; } public function search(string $query, int $limit = 5): array { - $script = $this->vectorSearchPyPath; - $this->agentLogger->info("Run vector search script $script"); - if (!is_file($script)) { + if (!is_file($this->scriptPath)) { + $this->agentLogger->error('vector_search.py not found: ' . $this->scriptPath); return []; } - // ------------------------------------------------- - // Determine Python interpreter (venv preferred) - // ------------------------------------------------- - $venvPython = $this->binPythonDir; - $pythonBin = is_file($venvPython) ? $venvPython : 'python3'; - - $cmd = sprintf( - '%s %s %s %d 2>&1', - escapeshellarg($pythonBin), - escapeshellarg($script), - escapeshellarg($query), - $limit - ); - - exec($cmd, $out, $exitCode); - - if ($exitCode !== 0 || empty($out)) { + if (!is_file($this->vectorIndexPath)) { + $this->agentLogger->warning('vector.index not found.'); return []; } - $json = implode("\n", $out); + if (!is_file($this->vectorMetaPath)) { + $this->agentLogger->warning('vector.index.meta.json not found.'); + return []; + } - $this->agentLogger->info($json); + if (!is_file($this->indexMetaPath)) { + $this->agentLogger->warning('index_meta.json not found.'); + return []; + } + + $indexMeta = json_decode((string) file_get_contents($this->indexMetaPath), true); + + if (!is_array($indexMeta) || empty($indexMeta['embedding_model'])) { + $this->agentLogger->error('Invalid index_meta.json.'); + return []; + } + + $embeddingModel = $indexMeta['embedding_model']; + + $cmd = [ + $this->pythonBin, + $this->scriptPath, + '--query', $query, + '--limit', (string)$limit, + '--index', $this->vectorIndexPath, + '--meta', $this->vectorMetaPath, + '--model', $embeddingModel, + ]; + + $process = new \Symfony\Component\Process\Process($cmd); + $process->setTimeout(30); + $process->run(); + + if (!$process->isSuccessful()) { + $this->agentLogger->error('Vector search failed: ' . $process->getErrorOutput()); + return []; + } + + $output = $process->getOutput(); + + if (trim($output) === '') { + return []; + } try { - return json_decode($json, true, 512, JSON_THROW_ON_ERROR); - } catch (\Throwable) { + $this->agentLogger->info('vector_search.py is done: ' . $this->scriptPath); + return json_decode($output, true, 512, JSON_THROW_ON_ERROR); + } catch (\Throwable $e) { + $this->agentLogger->error('Invalid JSON from vector_search.py'); return []; } } diff --git a/src/Vector/vector_search.py b/src/Vector/vector_search.py index 2f6c065..ce64d3a 100644 --- a/src/Vector/vector_search.py +++ b/src/Vector/vector_search.py @@ -2,71 +2,116 @@ import sys import json +import argparse from pathlib import Path -# --------------------------------------------------------- -# Argument handling -# --------------------------------------------------------- -if len(sys.argv) < 3: - print("ERROR: Missing arguments (query, limit)") - sys.exit(2) - -query = sys.argv[1] -limit = int(sys.argv[2]) - -vector_dir = Path(__file__).resolve().parent -index_path = vector_dir / "vector.index" -meta_path = vector_dir / "vector.index.meta.json" # --------------------------------------------------------- -# Dependency checks (controlled) +# Argument parsing (NEW – CLEAN CLI) +# --------------------------------------------------------- +parser = argparse.ArgumentParser(description="FAISS vector search") + +parser.add_argument("--query", required=True, help="Search query text") +parser.add_argument("--limit", required=True, type=int, help="Top-K limit") +parser.add_argument("--index", required=True, help="Path to vector.index") +parser.add_argument("--meta", required=True, help="Path to vector.index.meta.json") +parser.add_argument("--model", required=True, help="SentenceTransformer model") + +args = parser.parse_args() + +query = args.query +limit = args.limit +index_path = Path(args.index).resolve() +meta_path = Path(args.meta).resolve() +embedding_model = args.model + + +# --------------------------------------------------------- +# Dependency checks (stderr only) # --------------------------------------------------------- try: import faiss # noqa except Exception: - print("ERROR: Python module 'faiss' not found.") + print("Python module 'faiss' not found.", file=sys.stderr) sys.exit(10) try: from sentence_transformers import SentenceTransformer # noqa except Exception: - print("ERROR: Python module 'sentence-transformers' not found.") + print("Python module 'sentence-transformers' not found.", file=sys.stderr) sys.exit(11) import faiss from sentence_transformers import SentenceTransformer + # --------------------------------------------------------- # File checks # --------------------------------------------------------- -if not index_path.is_file() or not meta_path.is_file(): - print("ERROR: Vector index not found. Run vector ingest first.") +if not index_path.is_file(): + print(f"vector.index not found at {index_path}", file=sys.stderr) sys.exit(20) +if not meta_path.is_file(): + print(f"vector.index.meta.json not found at {meta_path}", file=sys.stderr) + sys.exit(21) + + # --------------------------------------------------------- # Load model and index # --------------------------------------------------------- -model = SentenceTransformer("all-MiniLM-L6-v2") -query_vec = model.encode([query], normalize_embeddings=True) +try: + model = SentenceTransformer(embedding_model) +except Exception as e: + print(f"Failed to load embedding model: {embedding_model}", file=sys.stderr) + sys.exit(30) -index = faiss.read_index(str(index_path)) +try: + query_vec = model.encode([query], normalize_embeddings=True) +except Exception: + print("Embedding encoding failed.", file=sys.stderr) + sys.exit(31) + +try: + index = faiss.read_index(str(index_path)) +except Exception: + print("Failed to read FAISS index.", file=sys.stderr) + sys.exit(32) + +try: + with open(meta_path, "r", encoding="utf-8") as f: + ids = json.load(f) +except Exception: + print("Failed to read vector meta file.", file=sys.stderr) + sys.exit(33) -with open(meta_path, "r", encoding="utf-8") as f: - ids = json.load(f) # --------------------------------------------------------- # Search # --------------------------------------------------------- -scores, indices = index.search(query_vec, limit) +try: + scores, indices = index.search(query_vec, limit) +except Exception: + print("FAISS search failed.", file=sys.stderr) + sys.exit(40) results = [] + for score, idx in zip(scores[0], indices[0]): if idx == -1: continue + if idx < 0 or idx >= len(ids): + continue + results.append({ "chunk_id": ids[idx], "score": float(score) }) + +# --------------------------------------------------------- +# STRICT JSON OUTPUT ONLY +# --------------------------------------------------------- print(json.dumps(results)) +sys.exit(0)