harden struct

This commit is contained in:
team 1
2026-02-16 12:34:22 +01:00
parent c046a83767
commit 57a35af6c8
6 changed files with 308 additions and 252 deletions

View File

@@ -3,23 +3,45 @@
# ------------------------------------------------------------ # ------------------------------------------------------------
parameters: parameters:
mto.kernel.dir: '%kernel.project_dir%' # ------------------------------------------------------------
# Root
# ------------------------------------------------------------
mto.root: '%kernel.project_dir%'
mto.kernel.dir: '%mto.root%'
# ------------------------------------------------------------
# Knowledge Root (ZENTRAL)
# ------------------------------------------------------------
mto.knowledge.root: '%mto.root%/var/knowledge'
mto.knowledge.ndjson: '%mto.knowledge.root%/index.ndjson'
mto.knowledge.index_meta: '%mto.knowledge.root%/index_meta.json'
mto.knowledge.vector_index: '%mto.knowledge.root%/vector.index'
mto.knowledge.vector_index_meta: '%mto.knowledge.root%/vector.index.meta.json'
mto.knowledge.upload: '%mto.knowledge.root%/uploads'
# Backward compatibility alias
mto.vector.data.upload.path: '%mto.knowledge.upload%'
# ------------------------------------------------------------
# Index Configuration (Guardrails)
# ------------------------------------------------------------
mto.index.chunk_size: 800 mto.index.chunk_size: 800
mto.index.chunk_overlap: 100 mto.index.chunk_overlap: 100
mto.index.embedding_model: 'all-MiniLM-L6-v2' #nomic-embed-text mto.index.embedding_model: 'all-MiniLM-L6-v2'
mto.index.embedding_dimension: 768 mto.index.embedding_dimension: 768
mto.index.scoring_version: 1 mto.index.scoring_version: 1
mto.vector.python_bin: '/var/www/html/.venv/bin/python3' # ------------------------------------------------------------
mto.vector.ingest_script: '/src/Vector/vector_ingest.py' # Python / Vector Runtime
mto.vector.search_script: '%kernel.project_dir%/src/Vector/vector_search.py' # ------------------------------------------------------------
mto.vector.data.path: '%kernel.project_dir%/var/knowledge' mto.vector.python_bin: '/var/www/html/.venv/bin/python3'
mto.vector.data.upload.path: '%mto.vector.data.path%/uploads'
mto.vector.data.ndjson.path: '%mto.vector.data.path%/index.ndjson' mto.vector.ingest_script: '%mto.root%/src/Vector/vector_ingest.py'
mto.vector.data.vector_index.path: '%mto.vector.data.path%/vector.index' mto.vector.search_script: '%mto.root%/src/Vector/vector_search.py'
mto.vector.data.vector_index_meta_json.path: '%mto.vector.data.path%/vector.index.meta.json'
mto.vector.timeout: 600 mto.vector.timeout: 600
@@ -29,20 +51,13 @@ parameters:
services: services:
# ------------------------------------------------------------
# Default service configuration
# ------------------------------------------------------------
_defaults: _defaults:
autowire: true autowire: true
autoconfigure: true autoconfigure: true
bind: bind:
Psr\Log\LoggerInterface $agentLogger: '@monolog.logger.agent' Psr\Log\LoggerInterface $agentLogger: '@monolog.logger.agent'
string $projectDir: '%kernel.project_dir%' string $projectDir: '%mto.root%'
# ------------------------------------------------------------
# Automatically register all services in src/
# ------------------------------------------------------------
App\: App\:
resource: '../src/' resource: '../src/'
exclude: exclude:
@@ -57,6 +72,7 @@ services:
# ------------------------------------------------------------ # ------------------------------------------------------------
# AI Agent Infrastructure # AI Agent Infrastructure
# ------------------------------------------------------------ # ------------------------------------------------------------
App\Infrastructure\OllamaClient: App\Infrastructure\OllamaClient:
arguments: arguments:
$apiUrl: '%env(AI_LLM_API_URL)%' $apiUrl: '%env(AI_LLM_API_URL)%'
@@ -66,10 +82,11 @@ services:
# ------------------------------------------------------------ # ------------------------------------------------------------
# AI Agent Context & Runner # AI Agent Context & Runner
# ------------------------------------------------------------ # ------------------------------------------------------------
App\Context\ContextService: App\Context\ContextService:
arguments: arguments:
$historyDir: '%env(AI_HISTORY_DIR)%' $historyDir: '%env(AI_HISTORY_DIR)%'
$projectDir: '%kernel.project_dir%' $projectDir: '%mto.root%'
App\Agent\AgentRunner: App\Agent\AgentRunner:
arguments: arguments:
@@ -78,7 +95,7 @@ services:
$logContext: '%env(bool:AI_LOG_CONTEXT)%' $logContext: '%env(bool:AI_LOG_CONTEXT)%'
# ------------------------------------------------------------ # ------------------------------------------------------------
# NDJSON Retrieval Stack (FINAL ARCHITECTURE) # NDJSON Retrieval Stack
# ------------------------------------------------------------ # ------------------------------------------------------------
App\Knowledge\Retrieval\NdjsonChunkLookup: ~ App\Knowledge\Retrieval\NdjsonChunkLookup: ~
@@ -100,22 +117,34 @@ services:
alias: App\Knowledge\Retrieval\CachedRetriever alias: App\Knowledge\Retrieval\CachedRetriever
# ------------------------------------------------------------ # ------------------------------------------------------------
# Vector Search (FAISS NDJSON-based) # Vector Search (noch unverändert Umbau kommt in Schritt 2)
# ------------------------------------------------------------ # ------------------------------------------------------------
App\Index\IndexMetaManager:
arguments:
$metaPath: '%mto.knowledge.index_meta%'
$config: '@App\Index\IndexConfiguration'
App\Vector\VectorSearchClient: App\Vector\VectorSearchClient:
arguments: arguments:
$binPythonDir: '%mto.vector.python_bin%' $pythonBin: '%mto.vector.python_bin%'
$vectorSearchPyPath: '%mto.vector.search_script%' $scriptPath: '%mto.vector.search_script%'
$vectorIndexPath: '%mto.knowledge.vector_index%'
$vectorMetaPath: '%mto.knowledge.vector_index_meta%'
$indexMetaPath: '%mto.knowledge.index_meta%'
$agentLogger: '@monolog.logger.agent'
App\Vector\VectorIndexBuilder: App\Vector\VectorIndexBuilder:
arguments: arguments:
$pythonBin: '%mto.vector.python_bin%' $pythonBin: '%mto.vector.python_bin%'
$relativeScriptPath: '%mto.vector.ingest_script%' $scriptPath: '%mto.vector.ingest_script%'
$indexNdjsonPath: '%mto.knowledge.ndjson%'
$indexMetaPath: '%mto.knowledge.index_meta%'
$vectorIndexPath: '%mto.knowledge.vector_index%'
$timeoutSeconds: '%mto.vector.timeout%' $timeoutSeconds: '%mto.vector.timeout%'
$indexConfiguration: '@App\Index\IndexConfiguration'
# ------------------------------------------------------------ # ------------------------------------------------------------
# Index Configuration (Guardrails) # Index Configuration
# ------------------------------------------------------------ # ------------------------------------------------------------
App\Index\IndexConfiguration: App\Index\IndexConfiguration:
@@ -130,5 +159,5 @@ services:
App\Service\Admin\IndexNdjsonInspector: App\Service\Admin\IndexNdjsonInspector:
arguments: arguments:
$ndJsonPath: '%mto.vector.data.ndjson.path%' $ndJsonPath: '%mto.knowledge.ndjson%'
$indexMetaPath: '%mto.vector.data.vector_index_meta_json.path%' $indexMetaPath: '%mto.knowledge.index_meta%'

View File

@@ -370,10 +370,10 @@ class DocumentController extends AbstractController
return $this->redirectToRoute('admin_dashboard'); return $this->redirectToRoute('admin_dashboard');
} }
@unlink($params->get('mto.vector.data.ndjson.path')); @unlink($params->get('mto.knowledge.ndjson'));
@unlink($params->get('mto.vector.data.vector_index.path')); @unlink($params->get('mto.knowledge.vector_index'));
@unlink($params->get('mto.vector.data.vector_index_meta_json.path')); @unlink($params->get('mto.knowledge.vector_index_meta'));
exec('rm -rf ' . $params->get('mto.vector.data.upload.path')); exec('rm -rf ' . $params->get('mto.knowledge.upload'));
$sql = ' $sql = '
SET FOREIGN_KEY_CHECKS = 0; SET FOREIGN_KEY_CHECKS = 0;

View File

@@ -1,6 +1,5 @@
<?php <?php
declare(strict_types=1); declare(strict_types=1);
namespace App\Index; namespace App\Index;
@@ -8,14 +7,14 @@ namespace App\Index;
final class IndexMetaManager final class IndexMetaManager
{ {
private string $metaPath; private string $metaPath;
private IndexConfiguration $config;
public function __construct( public function __construct(
string $projectDir, string $metaPath,
private readonly IndexConfiguration $config, IndexConfiguration $config
string $relativeMetaPath = '/var/knowledge/index_meta.json' ) {
) $this->metaPath = $metaPath;
{ $this->config = $config;
$this->metaPath = rtrim($projectDir, '/') . $relativeMetaPath;
} }
public function getMetaPath(): string public function getMetaPath(): string
@@ -24,8 +23,6 @@ final class IndexMetaManager
} }
/** /**
* Gibt null zurück, wenn noch kein Meta existiert (frisches System).
*
* @return array<string,mixed>|null * @return array<string,mixed>|null
*/ */
public function readMeta(): ?array public function readMeta(): ?array
@@ -48,43 +45,19 @@ final class IndexMetaManager
} }
/** /**
* Erstellt Meta, falls nicht vorhanden (z. B. nach erstem Global Reindex). * Guardrail:
* Überschreibt NICHT automatisch, wenn vorhanden. * - Wenn Meta fehlt → initialisieren
* * - Wenn Struktur driftet → Exception
* @return array<string,mixed>
*/
public function createInitialMetaIfMissing(): array
{
$existing = $this->readMeta();
if ($existing !== null) {
return $existing;
}
$meta = $this->buildMetaPayload(indexVersion: 1);
$this->atomicWriteJson($meta);
return $meta;
}
/**
* Guardrail: Prüft, ob die aktuelle Config kompatibel zur gespeicherten Meta ist.
* Wenn nicht: IndexStructureChangedException -> Global Reindex erzwingen.
*/ */
public function validateAgainstCurrent(): void public function validateAgainstCurrent(): void
{ {
$meta = $this->readMeta(); $meta = $this->readMeta();
// Wenn noch kein Meta existiert, lassen wir lokale Ingests NICHT einfach laufen.
// Governance: Erst Global Reindex erzeugt Meta sauber.
if ($meta === null) { if ($meta === null) {
throw new IndexStructureChangedException( $meta = $this->createInitialMeta();
'index_meta.json missing. Please run a Global Reindex to initialize index structure metadata.',
['reason' => 'missing_meta']
);
} }
$expected = $this->config->toStructureArray(); $expected = $this->config->toStructureArray();
$diff = $this->diffStructure($meta, $expected); $diff = $this->diffStructure($meta, $expected);
if ($diff !== []) { if ($diff !== []) {
@@ -96,11 +69,7 @@ final class IndexMetaManager
} }
/** /**
* Wird beim Global Reindex verwendet: * Wird beim Global Reindex aufgerufen
* - index_version++ (oder initialisieren)
* - Meta atomar schreiben
*
* @return array<string,mixed> new meta
*/ */
public function writeMetaForGlobalReindex(): array public function writeMetaForGlobalReindex(): array
{ {
@@ -122,35 +91,34 @@ final class IndexMetaManager
return $this->config; return $this->config;
} }
// ------------------------- // ---------------------------------------------------------
// Internals // Internals
// ------------------------- // ---------------------------------------------------------
/**
 * Creates the first metadata record for the index.
 *
 * Builds the payload with index version 1, persists it through
 * atomicWriteJson() and hands the same array back to the caller.
 * validateAgainstCurrent() calls this when readMeta() returns null.
 *
 * @return array<string,mixed> the payload that was written
 *
 * @throws \RuntimeException when atomicWriteJson() cannot write the meta file
 */
private function createInitialMeta(): array
{
$meta = $this->buildMetaPayload(1);
$this->atomicWriteJson($meta);
return $meta;
}
/**
* @return array<string,mixed>
*/
private function buildMetaPayload(int $indexVersion): array private function buildMetaPayload(int $indexVersion): array
{ {
$structure = $this->config->toStructureArray(); $structure = $this->config->toStructureArray();
return [ return [
'index_version' => $indexVersion, 'index_version' => $indexVersion,
'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM), 'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
'embedding_model' => $structure['embedding_model'], 'embedding_model' => $structure['embedding_model'],
'embedding_dimension' => $structure['embedding_dimension'], 'embedding_dimension' => $structure['embedding_dimension'],
'chunk_size' => $structure['chunk_size'], 'chunk_size' => $structure['chunk_size'],
'chunk_overlap' => $structure['chunk_overlap'], 'chunk_overlap' => $structure['chunk_overlap'],
'scoring_version' => $structure['scoring_version'], 'scoring_version' => $structure['scoring_version'],
'index_format' => $structure['index_format'], 'index_format' => $structure['index_format'],
'vector_backend' => $structure['vector_backend'], 'vector_backend' => $structure['vector_backend'],
]; ];
} }
/**
* @param array<string,mixed> $meta
* @param array<string,mixed> $expected
* @return array<string,mixed> diff
*/
private function diffStructure(array $meta, array $expected): array private function diffStructure(array $meta, array $expected): array
{ {
$diff = []; $diff = [];
@@ -160,28 +128,18 @@ final class IndexMetaManager
if ($actual !== $value) { if ($actual !== $value) {
$diff[$key] = [ $diff[$key] = [
'expected' => $value, 'expected' => $value,
'actual' => $actual, 'actual' => $actual,
]; ];
} }
} }
// index_format ist zwingend
if (($meta['index_format'] ?? null) !== 'ndjson') {
$diff['index_format'] = [
'expected' => 'ndjson',
'actual' => $meta['index_format'] ?? null,
];
}
return $diff; return $diff;
} }
/**
* @param array<string,mixed> $payload
*/
private function atomicWriteJson(array $payload): void private function atomicWriteJson(array $payload): void
{ {
$dir = \dirname($this->metaPath); $dir = dirname($this->metaPath);
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) { if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
throw new \RuntimeException('Unable to create directory: ' . $dir); throw new \RuntimeException('Unable to create directory: ' . $dir);
} }
@@ -197,7 +155,6 @@ final class IndexMetaManager
throw new \RuntimeException('Unable to write temp meta file'); throw new \RuntimeException('Unable to write temp meta file');
} }
// atomarer Switch
if (!rename($tmp, $this->metaPath)) { if (!rename($tmp, $this->metaPath)) {
@unlink($tmp); @unlink($tmp);
throw new \RuntimeException('Unable to switch meta file atomically'); throw new \RuntimeException('Unable to switch meta file atomically');

View File

@@ -1,10 +1,10 @@
<?php <?php
declare(strict_types=1); declare(strict_types=1);
namespace App\Vector; namespace App\Vector;
use App\Index\IndexConfiguration;
use Symfony\Component\Process\Exception\ProcessFailedException; use Symfony\Component\Process\Exception\ProcessFailedException;
use Symfony\Component\Process\Process; use Symfony\Component\Process\Process;
@@ -13,49 +13,32 @@ final class VectorIndexBuilder
private string $pythonBin; private string $pythonBin;
private string $scriptPath; private string $scriptPath;
private string $indexNdjsonPath; private string $indexNdjsonPath;
private string $indexMetaPath;
private string $vectorIndexPath; private string $vectorIndexPath;
private string $vectorMetaPath;
private int $timeoutSeconds; private int $timeoutSeconds;
private IndexConfiguration $indexConfiguration;
public function __construct( public function __construct(
string $projectDir, string $pythonBin,
string $pythonBin = 'python3', string $scriptPath,
string $relativeScriptPath = '/vector/vector_ingest.py', string $indexNdjsonPath,
string $relativeIndexNdjsonPath = '/var/knowledge/index.ndjson', string $indexMetaPath,
string $relativeVectorIndexPath = '/var/knowledge/vector.index', string $vectorIndexPath,
int $timeoutSeconds = 600 int $timeoutSeconds,
) IndexConfiguration $indexConfiguration
{ ) {
$base = rtrim($projectDir, '/'); $this->pythonBin = $pythonBin;
$this->scriptPath = $scriptPath;
$this->pythonBin = $pythonBin; $this->indexNdjsonPath = $indexNdjsonPath;
$this->scriptPath = $base . $relativeScriptPath; $this->indexMetaPath = $indexMetaPath;
$this->indexNdjsonPath = $base . $relativeIndexNdjsonPath; $this->vectorIndexPath = $vectorIndexPath;
$this->vectorIndexPath = $base . $relativeVectorIndexPath; $this->vectorMetaPath = $vectorIndexPath . '.meta.json';
$this->timeoutSeconds = $timeoutSeconds; $this->timeoutSeconds = $timeoutSeconds;
$this->indexConfiguration = $indexConfiguration;
} }
public function getIndexNdjsonPath(): string
{
return $this->indexNdjsonPath;
}
public function getVectorIndexPath(): string
{
return $this->vectorIndexPath;
}
public function getScriptPath(): string
{
return $this->scriptPath;
}
/**
* Rebuild FAISS Index deterministisch aus index.ndjson.
*
* Erwartung: Python schreibt in $tmpVectorIndexPath, wir schalten atomar um.
*
* @param string|null $logPath Optional: stdout/stderr dorthin appenden
*/
public function rebuildFromNdjson(?string $logPath = null): void public function rebuildFromNdjson(?string $logPath = null): void
{ {
if (!is_file($this->scriptPath)) { if (!is_file($this->scriptPath)) {
@@ -66,97 +49,97 @@ final class VectorIndexBuilder
throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath); throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath);
} }
$dir = \dirname($this->vectorIndexPath); if (!is_file($this->indexMetaPath)) {
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) { $this->initializeIndexMeta();
throw new \RuntimeException('Unable to create vector index directory: ' . $dir);
} }
$indexMeta = json_decode((string) file_get_contents($this->indexMetaPath), true);
if (!is_array($indexMeta) || empty($indexMeta['embedding_model'])) {
throw new \RuntimeException('Invalid index_meta.json');
}
$embeddingModel = (string) $indexMeta['embedding_model'];
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp'; $tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
// Vorheriges tmp entfernen (Sicherheit) // Wichtig: Python erzeugt meta basierend auf endgültigem Namen
if (is_file($tmpVectorIndexPath)) { $finalMetaPath = $this->vectorMetaPath;
@unlink($tmpVectorIndexPath); $tmpMetaPath = dirname($this->vectorIndexPath) . '/' . basename($this->vectorIndexPath, '.index') . '.index.meta.json';
}
@unlink($tmpVectorIndexPath);
@unlink($finalMetaPath);
// ----------------------------
// Python-Aufruf (konservativ)
// ----------------------------
// Wir erwarten/standardisieren (ab jetzt) CLI-Args:
// --index <path-to-index.ndjson>
// --out <path-to-vector.index.tmp>
$cmd = [ $cmd = [
$this->pythonBin, $this->pythonBin,
$this->scriptPath, $this->scriptPath,
'--index', $this->indexNdjsonPath, '--index', $this->indexNdjsonPath,
'--out', $tmpVectorIndexPath, '--out', $tmpVectorIndexPath,
'--model', 'all-MiniLM-L6-v2', '--model', $embeddingModel,
]; ];
$process = new Process($cmd); $process = new Process($cmd);
$process->setTimeout($this->timeoutSeconds); $process->setTimeout($this->timeoutSeconds);
$process->mustRun();
$this->runProcess($process, $logPath);
// Python muss tmp erzeugt haben
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) { if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
throw new \RuntimeException('Vector index rebuild failed: tmp output missing or empty: ' . $tmpVectorIndexPath); throw new \RuntimeException('Vector index tmp missing or empty');
} }
// Atomarer Switch // Python erzeugt vector.index.meta.json (nicht tmp.meta!)
$this->atomicSwitch($tmpVectorIndexPath, $this->vectorIndexPath); if (!is_file($this->vectorMetaPath) || filesize($this->vectorMetaPath) === 0) {
throw new \RuntimeException('Vector meta missing or empty');
}
// Atomarer Switch für Index
if (!rename($tmpVectorIndexPath, $this->vectorIndexPath)) {
throw new \RuntimeException('Atomic switch failed for vector index');
}
} }
// ------------------------- private function initializeIndexMeta(): void
// Internals {
// ------------------------- $dir = dirname($this->indexMetaPath);
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
throw new \RuntimeException('Cannot create knowledge directory');
}
$data = [
'index_version' => 1,
'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
'embedding_model' => $this->indexConfiguration->getEmbeddingModel(),
'embedding_dimension' => $this->indexConfiguration->getEmbeddingDimension(),
'chunk_size' => $this->indexConfiguration->getChunkSize(),
'chunk_overlap' => $this->indexConfiguration->getChunkOverlap(),
'scoring_version' => $this->indexConfiguration->getScoringVersion(),
'index_format' => 'ndjson',
'vector_backend' => 'faiss',
];
file_put_contents(
$this->indexMetaPath,
json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)
);
}
private function runProcess(Process $process, ?string $logPath): void private function runProcess(Process $process, ?string $logPath): void
{ {
if ($logPath !== null) { if ($logPath !== null) {
$this->appendLog($logPath, "\n=== VectorIndexBuilder START " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n"); @file_put_contents($logPath, "=== VectorIndexBuilder START ===\n", FILE_APPEND);
$this->appendLog($logPath, "CMD: " . $process->getCommandLine() . "\n");
} }
$process->run(function (string $type, string $buffer) use ($logPath) { $process->run();
if ($logPath === null) {
return;
}
// TYPE: Process::OUT / Process::ERR
$this->appendLog($logPath, $buffer);
});
if (!$process->isSuccessful()) { if (!$process->isSuccessful()) {
if ($logPath !== null) { if ($logPath !== null) {
$this->appendLog($logPath, "\n=== VectorIndexBuilder FAILED ===\n"); @file_put_contents($logPath, $process->getErrorOutput(), FILE_APPEND);
$this->appendLog($logPath, "ExitCode: " . $process->getExitCode() . "\n");
$this->appendLog($logPath, "STDERR:\n" . $process->getErrorOutput() . "\n");
} }
throw new ProcessFailedException($process); throw new ProcessFailedException($process);
} }
if ($logPath !== null) { if ($logPath !== null) {
$this->appendLog($logPath, "\n=== VectorIndexBuilder OK " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n"); @file_put_contents($logPath, "=== VectorIndexBuilder OK ===\n", FILE_APPEND);
}
}
private function appendLog(string $logPath, string $content): void
{
$dir = \dirname($logPath);
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
// Wenn Log nicht möglich ist: nicht hart scheitern (Build ist wichtiger)
return;
}
@file_put_contents($logPath, $content, FILE_APPEND);
}
private function atomicSwitch(string $tmp, string $final): void
{
if (!rename($tmp, $final)) {
@unlink($tmp);
throw new \RuntimeException('Atomic switch failed for vector.index');
} }
} }
} }

View File

@@ -8,48 +8,90 @@ use Psr\Log\LoggerInterface;
final class VectorSearchClient final class VectorSearchClient
{ {
private string $pythonBin;
private string $scriptPath;
private string $vectorIndexPath;
private string $vectorMetaPath;
private string $indexMetaPath;
private LoggerInterface $agentLogger;
public function __construct( public function __construct(
private readonly string $binPythonDir, string $pythonBin,
private readonly string $vectorSearchPyPath, string $scriptPath,
private LoggerInterface $agentLogger, string $vectorIndexPath,
string $vectorMetaPath,
string $indexMetaPath,
LoggerInterface $agentLogger
) { ) {
$this->pythonBin = $pythonBin;
$this->scriptPath = $scriptPath;
$this->vectorIndexPath = $vectorIndexPath;
$this->vectorMetaPath = $vectorMetaPath;
$this->indexMetaPath = $indexMetaPath;
$this->agentLogger = $agentLogger;
} }
public function search(string $query, int $limit = 5): array public function search(string $query, int $limit = 5): array
{ {
$script = $this->vectorSearchPyPath; if (!is_file($this->scriptPath)) {
$this->agentLogger->info("Run vector search script $script"); $this->agentLogger->error('vector_search.py not found: ' . $this->scriptPath);
if (!is_file($script)) {
return []; return [];
} }
// ------------------------------------------------- if (!is_file($this->vectorIndexPath)) {
// Determine Python interpreter (venv preferred) $this->agentLogger->warning('vector.index not found.');
// -------------------------------------------------
$venvPython = $this->binPythonDir;
$pythonBin = is_file($venvPython) ? $venvPython : 'python3';
$cmd = sprintf(
'%s %s %s %d 2>&1',
escapeshellarg($pythonBin),
escapeshellarg($script),
escapeshellarg($query),
$limit
);
exec($cmd, $out, $exitCode);
if ($exitCode !== 0 || empty($out)) {
return []; return [];
} }
$json = implode("\n", $out); if (!is_file($this->vectorMetaPath)) {
$this->agentLogger->warning('vector.index.meta.json not found.');
return [];
}
$this->agentLogger->info($json); if (!is_file($this->indexMetaPath)) {
$this->agentLogger->warning('index_meta.json not found.');
return [];
}
$indexMeta = json_decode((string) file_get_contents($this->indexMetaPath), true);
if (!is_array($indexMeta) || empty($indexMeta['embedding_model'])) {
$this->agentLogger->error('Invalid index_meta.json.');
return [];
}
$embeddingModel = $indexMeta['embedding_model'];
$cmd = [
$this->pythonBin,
$this->scriptPath,
'--query', $query,
'--limit', (string)$limit,
'--index', $this->vectorIndexPath,
'--meta', $this->vectorMetaPath,
'--model', $embeddingModel,
];
$process = new \Symfony\Component\Process\Process($cmd);
$process->setTimeout(30);
$process->run();
if (!$process->isSuccessful()) {
$this->agentLogger->error('Vector search failed: ' . $process->getErrorOutput());
return [];
}
$output = $process->getOutput();
if (trim($output) === '') {
return [];
}
try { try {
return json_decode($json, true, 512, JSON_THROW_ON_ERROR); $this->agentLogger->info('vector_search.py is done: ' . $this->scriptPath);
} catch (\Throwable) { return json_decode($output, true, 512, JSON_THROW_ON_ERROR);
} catch (\Throwable $e) {
$this->agentLogger->error('Invalid JSON from vector_search.py');
return []; return [];
} }
} }

View File

@@ -2,71 +2,116 @@
import sys import sys
import json import json
import argparse
from pathlib import Path from pathlib import Path
# ---------------------------------------------------------
# Argument handling
# ---------------------------------------------------------
if len(sys.argv) < 3:
print("ERROR: Missing arguments (query, limit)")
sys.exit(2)
query = sys.argv[1]
limit = int(sys.argv[2])
vector_dir = Path(__file__).resolve().parent
index_path = vector_dir / "vector.index"
meta_path = vector_dir / "vector.index.meta.json"
# --------------------------------------------------------- # ---------------------------------------------------------
# Dependency checks (controlled) # Argument parsing (NEW CLEAN CLI)
# ---------------------------------------------------------
# All five options are required. If one is missing, or --limit is not an
# integer, argparse prints a usage message to stderr and exits with status 2.
parser = argparse.ArgumentParser(description="FAISS vector search")
parser.add_argument("--query", required=True, help="Search query text")
parser.add_argument("--limit", required=True, type=int, help="Top-K limit")
parser.add_argument("--index", required=True, help="Path to vector.index")
parser.add_argument("--meta", required=True, help="Path to vector.index.meta.json")
parser.add_argument("--model", required=True, help="SentenceTransformer model")
args = parser.parse_args()
query = args.query
# NOTE(review): --limit is only type-checked here, not range-checked; zero or
# negative values are passed on to index.search() as-is.
limit = args.limit
# Resolve both paths to absolute form so the "not found at ..." messages
# printed later do not depend on the caller's working directory.
index_path = Path(args.index).resolve()
meta_path = Path(args.meta).resolve()
# Model name handed to SentenceTransformer(...) when the model is loaded.
embedding_model = args.model
# ---------------------------------------------------------
# Dependency checks (stderr only)
# --------------------------------------------------------- # ---------------------------------------------------------
try: try:
import faiss # noqa import faiss # noqa
except Exception: except Exception:
print("ERROR: Python module 'faiss' not found.") print("Python module 'faiss' not found.", file=sys.stderr)
sys.exit(10) sys.exit(10)
try: try:
from sentence_transformers import SentenceTransformer # noqa from sentence_transformers import SentenceTransformer # noqa
except Exception: except Exception:
print("ERROR: Python module 'sentence-transformers' not found.") print("Python module 'sentence-transformers' not found.", file=sys.stderr)
sys.exit(11) sys.exit(11)
import faiss import faiss
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
# --------------------------------------------------------- # ---------------------------------------------------------
# File checks # File checks
# --------------------------------------------------------- # ---------------------------------------------------------
if not index_path.is_file() or not meta_path.is_file(): if not index_path.is_file():
print("ERROR: Vector index not found. Run vector ingest first.") print(f"vector.index not found at {index_path}", file=sys.stderr)
sys.exit(20) sys.exit(20)
if not meta_path.is_file():
print(f"vector.index.meta.json not found at {meta_path}", file=sys.stderr)
sys.exit(21)
# --------------------------------------------------------- # ---------------------------------------------------------
# Load model and index # Load model and index
# --------------------------------------------------------- # ---------------------------------------------------------
model = SentenceTransformer("all-MiniLM-L6-v2") try:
query_vec = model.encode([query], normalize_embeddings=True) model = SentenceTransformer(embedding_model)
except Exception as e:
print(f"Failed to load embedding model: {embedding_model}", file=sys.stderr)
sys.exit(30)
index = faiss.read_index(str(index_path)) try:
query_vec = model.encode([query], normalize_embeddings=True)
except Exception:
print("Embedding encoding failed.", file=sys.stderr)
sys.exit(31)
try:
index = faiss.read_index(str(index_path))
except Exception:
print("Failed to read FAISS index.", file=sys.stderr)
sys.exit(32)
try:
with open(meta_path, "r", encoding="utf-8") as f:
ids = json.load(f)
except Exception:
print("Failed to read vector meta file.", file=sys.stderr)
sys.exit(33)
with open(meta_path, "r", encoding="utf-8") as f:
ids = json.load(f)
# --------------------------------------------------------- # ---------------------------------------------------------
# Search # Search
# --------------------------------------------------------- # ---------------------------------------------------------
scores, indices = index.search(query_vec, limit) try:
scores, indices = index.search(query_vec, limit)
except Exception:
print("FAISS search failed.", file=sys.stderr)
sys.exit(40)
results = [] results = []
for score, idx in zip(scores[0], indices[0]): for score, idx in zip(scores[0], indices[0]):
if idx == -1: if idx == -1:
continue continue
if idx < 0 or idx >= len(ids):
continue
results.append({ results.append({
"chunk_id": ids[idx], "chunk_id": ids[idx],
"score": float(score) "score": float(score)
}) })
# ---------------------------------------------------------
# STRICT JSON OUTPUT ONLY
# ---------------------------------------------------------
print(json.dumps(results)) print(json.dumps(results))
sys.exit(0)