harden struct
This commit is contained in:
@@ -3,23 +3,45 @@
|
|||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
parameters:
|
parameters:
|
||||||
mto.kernel.dir: '%kernel.project_dir%'
|
# ------------------------------------------------------------
|
||||||
|
# Root
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
mto.root: '%kernel.project_dir%'
|
||||||
|
mto.kernel.dir: '%mto.root%'
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
# Knowledge Root (ZENTRAL)
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
mto.knowledge.root: '%mto.root%/var/knowledge'
|
||||||
|
|
||||||
|
mto.knowledge.ndjson: '%mto.knowledge.root%/index.ndjson'
|
||||||
|
mto.knowledge.index_meta: '%mto.knowledge.root%/index_meta.json'
|
||||||
|
mto.knowledge.vector_index: '%mto.knowledge.root%/vector.index'
|
||||||
|
mto.knowledge.vector_index_meta: '%mto.knowledge.root%/vector.index.meta.json'
|
||||||
|
mto.knowledge.upload: '%mto.knowledge.root%/uploads'
|
||||||
|
# Backward compatibility alias
|
||||||
|
mto.vector.data.upload.path: '%mto.knowledge.upload%'
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
# Index Configuration (Guardrails)
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
mto.index.chunk_size: 800
|
mto.index.chunk_size: 800
|
||||||
mto.index.chunk_overlap: 100
|
mto.index.chunk_overlap: 100
|
||||||
mto.index.embedding_model: 'all-MiniLM-L6-v2' #nomic-embed-text
|
mto.index.embedding_model: 'all-MiniLM-L6-v2'
|
||||||
mto.index.embedding_dimension: 768
|
mto.index.embedding_dimension: 768
|
||||||
mto.index.scoring_version: 1
|
mto.index.scoring_version: 1
|
||||||
|
|
||||||
mto.vector.python_bin: '/var/www/html/.venv/bin/python3'
|
# ------------------------------------------------------------
|
||||||
mto.vector.ingest_script: '/src/Vector/vector_ingest.py'
|
# Python / Vector Runtime
|
||||||
mto.vector.search_script: '%kernel.project_dir%/src/Vector/vector_search.py'
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
mto.vector.data.path: '%kernel.project_dir%/var/knowledge'
|
mto.vector.python_bin: '/var/www/html/.venv/bin/python3'
|
||||||
mto.vector.data.upload.path: '%mto.vector.data.path%/uploads'
|
|
||||||
mto.vector.data.ndjson.path: '%mto.vector.data.path%/index.ndjson'
|
mto.vector.ingest_script: '%mto.root%/src/Vector/vector_ingest.py'
|
||||||
mto.vector.data.vector_index.path: '%mto.vector.data.path%/vector.index'
|
mto.vector.search_script: '%mto.root%/src/Vector/vector_search.py'
|
||||||
mto.vector.data.vector_index_meta_json.path: '%mto.vector.data.path%/vector.index.meta.json'
|
|
||||||
|
|
||||||
mto.vector.timeout: 600
|
mto.vector.timeout: 600
|
||||||
|
|
||||||
@@ -29,20 +51,13 @@ parameters:
|
|||||||
|
|
||||||
services:
|
services:
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
# Default service configuration
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
_defaults:
|
_defaults:
|
||||||
autowire: true
|
autowire: true
|
||||||
autoconfigure: true
|
autoconfigure: true
|
||||||
|
|
||||||
bind:
|
bind:
|
||||||
Psr\Log\LoggerInterface $agentLogger: '@monolog.logger.agent'
|
Psr\Log\LoggerInterface $agentLogger: '@monolog.logger.agent'
|
||||||
string $projectDir: '%kernel.project_dir%'
|
string $projectDir: '%mto.root%'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
# Automatically register all services in src/
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
App\:
|
App\:
|
||||||
resource: '../src/'
|
resource: '../src/'
|
||||||
exclude:
|
exclude:
|
||||||
@@ -57,6 +72,7 @@ services:
|
|||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# AI Agent – Infrastructure
|
# AI Agent – Infrastructure
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
App\Infrastructure\OllamaClient:
|
App\Infrastructure\OllamaClient:
|
||||||
arguments:
|
arguments:
|
||||||
$apiUrl: '%env(AI_LLM_API_URL)%'
|
$apiUrl: '%env(AI_LLM_API_URL)%'
|
||||||
@@ -66,10 +82,11 @@ services:
|
|||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# AI Agent – Context & Runner
|
# AI Agent – Context & Runner
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
App\Context\ContextService:
|
App\Context\ContextService:
|
||||||
arguments:
|
arguments:
|
||||||
$historyDir: '%env(AI_HISTORY_DIR)%'
|
$historyDir: '%env(AI_HISTORY_DIR)%'
|
||||||
$projectDir: '%kernel.project_dir%'
|
$projectDir: '%mto.root%'
|
||||||
|
|
||||||
App\Agent\AgentRunner:
|
App\Agent\AgentRunner:
|
||||||
arguments:
|
arguments:
|
||||||
@@ -78,7 +95,7 @@ services:
|
|||||||
$logContext: '%env(bool:AI_LOG_CONTEXT)%'
|
$logContext: '%env(bool:AI_LOG_CONTEXT)%'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# NDJSON Retrieval Stack (FINAL ARCHITECTURE)
|
# NDJSON Retrieval Stack
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
App\Knowledge\Retrieval\NdjsonChunkLookup: ~
|
App\Knowledge\Retrieval\NdjsonChunkLookup: ~
|
||||||
@@ -100,22 +117,34 @@ services:
|
|||||||
alias: App\Knowledge\Retrieval\CachedRetriever
|
alias: App\Knowledge\Retrieval\CachedRetriever
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# Vector Search (FAISS NDJSON-based)
|
# Vector Search (noch unverändert – Umbau kommt in Schritt 2)
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
App\Index\IndexMetaManager:
|
||||||
|
arguments:
|
||||||
|
$metaPath: '%mto.knowledge.index_meta%'
|
||||||
|
$config: '@App\Index\IndexConfiguration'
|
||||||
|
|
||||||
App\Vector\VectorSearchClient:
|
App\Vector\VectorSearchClient:
|
||||||
arguments:
|
arguments:
|
||||||
$binPythonDir: '%mto.vector.python_bin%'
|
$pythonBin: '%mto.vector.python_bin%'
|
||||||
$vectorSearchPyPath: '%mto.vector.search_script%'
|
$scriptPath: '%mto.vector.search_script%'
|
||||||
|
$vectorIndexPath: '%mto.knowledge.vector_index%'
|
||||||
|
$vectorMetaPath: '%mto.knowledge.vector_index_meta%'
|
||||||
|
$indexMetaPath: '%mto.knowledge.index_meta%'
|
||||||
|
$agentLogger: '@monolog.logger.agent'
|
||||||
|
|
||||||
App\Vector\VectorIndexBuilder:
|
App\Vector\VectorIndexBuilder:
|
||||||
arguments:
|
arguments:
|
||||||
$pythonBin: '%mto.vector.python_bin%'
|
$pythonBin: '%mto.vector.python_bin%'
|
||||||
$relativeScriptPath: '%mto.vector.ingest_script%'
|
$scriptPath: '%mto.vector.ingest_script%'
|
||||||
|
$indexNdjsonPath: '%mto.knowledge.ndjson%'
|
||||||
|
$indexMetaPath: '%mto.knowledge.index_meta%'
|
||||||
|
$vectorIndexPath: '%mto.knowledge.vector_index%'
|
||||||
$timeoutSeconds: '%mto.vector.timeout%'
|
$timeoutSeconds: '%mto.vector.timeout%'
|
||||||
|
$indexConfiguration: '@App\Index\IndexConfiguration'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# Index Configuration (Guardrails)
|
# Index Configuration
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
App\Index\IndexConfiguration:
|
App\Index\IndexConfiguration:
|
||||||
@@ -130,5 +159,5 @@ services:
|
|||||||
|
|
||||||
App\Service\Admin\IndexNdjsonInspector:
|
App\Service\Admin\IndexNdjsonInspector:
|
||||||
arguments:
|
arguments:
|
||||||
$ndJsonPath: '%mto.vector.data.ndjson.path%'
|
$ndJsonPath: '%mto.knowledge.ndjson%'
|
||||||
$indexMetaPath: '%mto.vector.data.vector_index_meta_json.path%'
|
$indexMetaPath: '%mto.knowledge.index_meta%'
|
||||||
|
|||||||
@@ -370,10 +370,10 @@ class DocumentController extends AbstractController
|
|||||||
return $this->redirectToRoute('admin_dashboard');
|
return $this->redirectToRoute('admin_dashboard');
|
||||||
}
|
}
|
||||||
|
|
||||||
@unlink($params->get('mto.vector.data.ndjson.path'));
|
@unlink($params->get('mto.knowledge.ndjson'));
|
||||||
@unlink($params->get('mto.vector.data.vector_index.path'));
|
@unlink($params->get('mto.knowledge.vector_index'));
|
||||||
@unlink($params->get('mto.vector.data.vector_index_meta_json.path'));
|
@unlink($params->get('mto.knowledge.vector_index_meta'));
|
||||||
exec('rm -rf ' . $params->get('mto.vector.data.upload.path'));
|
exec('rm -rf ' . $params->get('mto.knowledge.upload'));
|
||||||
|
|
||||||
$sql = '
|
$sql = '
|
||||||
SET FOREIGN_KEY_CHECKS = 0;
|
SET FOREIGN_KEY_CHECKS = 0;
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
declare(strict_types=1);
|
||||||
|
|
||||||
namespace App\Index;
|
namespace App\Index;
|
||||||
@@ -8,14 +7,14 @@ namespace App\Index;
|
|||||||
final class IndexMetaManager
|
final class IndexMetaManager
|
||||||
{
|
{
|
||||||
private string $metaPath;
|
private string $metaPath;
|
||||||
|
private IndexConfiguration $config;
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
string $projectDir,
|
string $metaPath,
|
||||||
private readonly IndexConfiguration $config,
|
IndexConfiguration $config
|
||||||
string $relativeMetaPath = '/var/knowledge/index_meta.json'
|
) {
|
||||||
)
|
$this->metaPath = $metaPath;
|
||||||
{
|
$this->config = $config;
|
||||||
$this->metaPath = rtrim($projectDir, '/') . $relativeMetaPath;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getMetaPath(): string
|
public function getMetaPath(): string
|
||||||
@@ -24,8 +23,6 @@ final class IndexMetaManager
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gibt null zurück, wenn noch kein Meta existiert (frisches System).
|
|
||||||
*
|
|
||||||
* @return array<string,mixed>|null
|
* @return array<string,mixed>|null
|
||||||
*/
|
*/
|
||||||
public function readMeta(): ?array
|
public function readMeta(): ?array
|
||||||
@@ -48,43 +45,19 @@ final class IndexMetaManager
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Erstellt Meta, falls nicht vorhanden (z. B. nach erstem Global Reindex).
|
* Guardrail:
|
||||||
* Überschreibt NICHT automatisch, wenn vorhanden.
|
* - Wenn Meta fehlt → initialisieren
|
||||||
*
|
* - Wenn Struktur driftet → Exception
|
||||||
* @return array<string,mixed>
|
|
||||||
*/
|
|
||||||
public function createInitialMetaIfMissing(): array
|
|
||||||
{
|
|
||||||
$existing = $this->readMeta();
|
|
||||||
if ($existing !== null) {
|
|
||||||
return $existing;
|
|
||||||
}
|
|
||||||
|
|
||||||
$meta = $this->buildMetaPayload(indexVersion: 1);
|
|
||||||
$this->atomicWriteJson($meta);
|
|
||||||
|
|
||||||
return $meta;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Guardrail: Prüft, ob die aktuelle Config kompatibel zur gespeicherten Meta ist.
|
|
||||||
* Wenn nicht: IndexStructureChangedException -> Global Reindex erzwingen.
|
|
||||||
*/
|
*/
|
||||||
public function validateAgainstCurrent(): void
|
public function validateAgainstCurrent(): void
|
||||||
{
|
{
|
||||||
$meta = $this->readMeta();
|
$meta = $this->readMeta();
|
||||||
|
|
||||||
// Wenn noch kein Meta existiert, lassen wir lokale Ingests NICHT einfach laufen.
|
|
||||||
// Governance: Erst Global Reindex erzeugt Meta sauber.
|
|
||||||
if ($meta === null) {
|
if ($meta === null) {
|
||||||
throw new IndexStructureChangedException(
|
$meta = $this->createInitialMeta();
|
||||||
'index_meta.json missing. Please run a Global Reindex to initialize index structure metadata.',
|
|
||||||
['reason' => 'missing_meta']
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$expected = $this->config->toStructureArray();
|
$expected = $this->config->toStructureArray();
|
||||||
|
|
||||||
$diff = $this->diffStructure($meta, $expected);
|
$diff = $this->diffStructure($meta, $expected);
|
||||||
|
|
||||||
if ($diff !== []) {
|
if ($diff !== []) {
|
||||||
@@ -96,11 +69,7 @@ final class IndexMetaManager
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wird beim Global Reindex verwendet:
|
* Wird beim Global Reindex aufgerufen
|
||||||
* - index_version++ (oder initialisieren)
|
|
||||||
* - Meta atomar schreiben
|
|
||||||
*
|
|
||||||
* @return array<string,mixed> new meta
|
|
||||||
*/
|
*/
|
||||||
public function writeMetaForGlobalReindex(): array
|
public function writeMetaForGlobalReindex(): array
|
||||||
{
|
{
|
||||||
@@ -122,35 +91,34 @@ final class IndexMetaManager
|
|||||||
return $this->config;
|
return $this->config;
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------
|
// ---------------------------------------------------------
|
||||||
// Internals
|
// Internals
|
||||||
// -------------------------
|
// ---------------------------------------------------------
|
||||||
|
|
||||||
|
private function createInitialMeta(): array
|
||||||
|
{
|
||||||
|
$meta = $this->buildMetaPayload(1);
|
||||||
|
$this->atomicWriteJson($meta);
|
||||||
|
return $meta;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @return array<string,mixed>
|
|
||||||
*/
|
|
||||||
private function buildMetaPayload(int $indexVersion): array
|
private function buildMetaPayload(int $indexVersion): array
|
||||||
{
|
{
|
||||||
$structure = $this->config->toStructureArray();
|
$structure = $this->config->toStructureArray();
|
||||||
|
|
||||||
return [
|
return [
|
||||||
'index_version' => $indexVersion,
|
'index_version' => $indexVersion,
|
||||||
'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
|
'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
|
||||||
'embedding_model' => $structure['embedding_model'],
|
'embedding_model' => $structure['embedding_model'],
|
||||||
'embedding_dimension' => $structure['embedding_dimension'],
|
'embedding_dimension' => $structure['embedding_dimension'],
|
||||||
'chunk_size' => $structure['chunk_size'],
|
'chunk_size' => $structure['chunk_size'],
|
||||||
'chunk_overlap' => $structure['chunk_overlap'],
|
'chunk_overlap' => $structure['chunk_overlap'],
|
||||||
'scoring_version' => $structure['scoring_version'],
|
'scoring_version' => $structure['scoring_version'],
|
||||||
'index_format' => $structure['index_format'],
|
'index_format' => $structure['index_format'],
|
||||||
'vector_backend' => $structure['vector_backend'],
|
'vector_backend' => $structure['vector_backend'],
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @param array<string,mixed> $meta
|
|
||||||
* @param array<string,mixed> $expected
|
|
||||||
* @return array<string,mixed> diff
|
|
||||||
*/
|
|
||||||
private function diffStructure(array $meta, array $expected): array
|
private function diffStructure(array $meta, array $expected): array
|
||||||
{
|
{
|
||||||
$diff = [];
|
$diff = [];
|
||||||
@@ -160,28 +128,18 @@ final class IndexMetaManager
|
|||||||
if ($actual !== $value) {
|
if ($actual !== $value) {
|
||||||
$diff[$key] = [
|
$diff[$key] = [
|
||||||
'expected' => $value,
|
'expected' => $value,
|
||||||
'actual' => $actual,
|
'actual' => $actual,
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// index_format ist zwingend
|
|
||||||
if (($meta['index_format'] ?? null) !== 'ndjson') {
|
|
||||||
$diff['index_format'] = [
|
|
||||||
'expected' => 'ndjson',
|
|
||||||
'actual' => $meta['index_format'] ?? null,
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
return $diff;
|
return $diff;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @param array<string,mixed> $payload
|
|
||||||
*/
|
|
||||||
private function atomicWriteJson(array $payload): void
|
private function atomicWriteJson(array $payload): void
|
||||||
{
|
{
|
||||||
$dir = \dirname($this->metaPath);
|
$dir = dirname($this->metaPath);
|
||||||
|
|
||||||
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
||||||
throw new \RuntimeException('Unable to create directory: ' . $dir);
|
throw new \RuntimeException('Unable to create directory: ' . $dir);
|
||||||
}
|
}
|
||||||
@@ -197,7 +155,6 @@ final class IndexMetaManager
|
|||||||
throw new \RuntimeException('Unable to write temp meta file');
|
throw new \RuntimeException('Unable to write temp meta file');
|
||||||
}
|
}
|
||||||
|
|
||||||
// atomarer Switch
|
|
||||||
if (!rename($tmp, $this->metaPath)) {
|
if (!rename($tmp, $this->metaPath)) {
|
||||||
@unlink($tmp);
|
@unlink($tmp);
|
||||||
throw new \RuntimeException('Unable to switch meta file atomically');
|
throw new \RuntimeException('Unable to switch meta file atomically');
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
declare(strict_types=1);
|
||||||
|
|
||||||
namespace App\Vector;
|
namespace App\Vector;
|
||||||
|
|
||||||
|
use App\Index\IndexConfiguration;
|
||||||
use Symfony\Component\Process\Exception\ProcessFailedException;
|
use Symfony\Component\Process\Exception\ProcessFailedException;
|
||||||
use Symfony\Component\Process\Process;
|
use Symfony\Component\Process\Process;
|
||||||
|
|
||||||
@@ -13,49 +13,32 @@ final class VectorIndexBuilder
|
|||||||
private string $pythonBin;
|
private string $pythonBin;
|
||||||
private string $scriptPath;
|
private string $scriptPath;
|
||||||
private string $indexNdjsonPath;
|
private string $indexNdjsonPath;
|
||||||
|
private string $indexMetaPath;
|
||||||
private string $vectorIndexPath;
|
private string $vectorIndexPath;
|
||||||
|
private string $vectorMetaPath;
|
||||||
private int $timeoutSeconds;
|
private int $timeoutSeconds;
|
||||||
|
|
||||||
|
private IndexConfiguration $indexConfiguration;
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
string $projectDir,
|
string $pythonBin,
|
||||||
string $pythonBin = 'python3',
|
string $scriptPath,
|
||||||
string $relativeScriptPath = '/vector/vector_ingest.py',
|
string $indexNdjsonPath,
|
||||||
string $relativeIndexNdjsonPath = '/var/knowledge/index.ndjson',
|
string $indexMetaPath,
|
||||||
string $relativeVectorIndexPath = '/var/knowledge/vector.index',
|
string $vectorIndexPath,
|
||||||
int $timeoutSeconds = 600
|
int $timeoutSeconds,
|
||||||
)
|
IndexConfiguration $indexConfiguration
|
||||||
{
|
) {
|
||||||
$base = rtrim($projectDir, '/');
|
$this->pythonBin = $pythonBin;
|
||||||
|
$this->scriptPath = $scriptPath;
|
||||||
$this->pythonBin = $pythonBin;
|
$this->indexNdjsonPath = $indexNdjsonPath;
|
||||||
$this->scriptPath = $base . $relativeScriptPath;
|
$this->indexMetaPath = $indexMetaPath;
|
||||||
$this->indexNdjsonPath = $base . $relativeIndexNdjsonPath;
|
$this->vectorIndexPath = $vectorIndexPath;
|
||||||
$this->vectorIndexPath = $base . $relativeVectorIndexPath;
|
$this->vectorMetaPath = $vectorIndexPath . '.meta.json';
|
||||||
$this->timeoutSeconds = $timeoutSeconds;
|
$this->timeoutSeconds = $timeoutSeconds;
|
||||||
|
$this->indexConfiguration = $indexConfiguration;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getIndexNdjsonPath(): string
|
|
||||||
{
|
|
||||||
return $this->indexNdjsonPath;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getVectorIndexPath(): string
|
|
||||||
{
|
|
||||||
return $this->vectorIndexPath;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getScriptPath(): string
|
|
||||||
{
|
|
||||||
return $this->scriptPath;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Rebuild FAISS Index deterministisch aus index.ndjson.
|
|
||||||
*
|
|
||||||
* Erwartung: Python schreibt in $tmpVectorIndexPath, wir schalten atomar um.
|
|
||||||
*
|
|
||||||
* @param string|null $logPath Optional: stdout/stderr dorthin appenden
|
|
||||||
*/
|
|
||||||
public function rebuildFromNdjson(?string $logPath = null): void
|
public function rebuildFromNdjson(?string $logPath = null): void
|
||||||
{
|
{
|
||||||
if (!is_file($this->scriptPath)) {
|
if (!is_file($this->scriptPath)) {
|
||||||
@@ -66,97 +49,97 @@ final class VectorIndexBuilder
|
|||||||
throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath);
|
throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
$dir = \dirname($this->vectorIndexPath);
|
if (!is_file($this->indexMetaPath)) {
|
||||||
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
$this->initializeIndexMeta();
|
||||||
throw new \RuntimeException('Unable to create vector index directory: ' . $dir);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$indexMeta = json_decode((string) file_get_contents($this->indexMetaPath), true);
|
||||||
|
|
||||||
|
if (!is_array($indexMeta) || empty($indexMeta['embedding_model'])) {
|
||||||
|
throw new \RuntimeException('Invalid index_meta.json');
|
||||||
|
}
|
||||||
|
|
||||||
|
$embeddingModel = (string) $indexMeta['embedding_model'];
|
||||||
|
|
||||||
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
|
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
|
||||||
|
|
||||||
// Vorheriges tmp entfernen (Sicherheit)
|
// Wichtig: Python erzeugt meta basierend auf endgültigem Namen
|
||||||
if (is_file($tmpVectorIndexPath)) {
|
$finalMetaPath = $this->vectorMetaPath;
|
||||||
@unlink($tmpVectorIndexPath);
|
$tmpMetaPath = dirname($this->vectorIndexPath) . '/' . basename($this->vectorIndexPath, '.index') . '.index.meta.json';
|
||||||
}
|
|
||||||
|
@unlink($tmpVectorIndexPath);
|
||||||
|
@unlink($finalMetaPath);
|
||||||
|
|
||||||
// ----------------------------
|
|
||||||
// Python-Aufruf (konservativ)
|
|
||||||
// ----------------------------
|
|
||||||
// Wir erwarten/standardisieren (ab jetzt) CLI-Args:
|
|
||||||
// --index <path-to-index.ndjson>
|
|
||||||
// --out <path-to-vector.index.tmp>
|
|
||||||
$cmd = [
|
$cmd = [
|
||||||
$this->pythonBin,
|
$this->pythonBin,
|
||||||
$this->scriptPath,
|
$this->scriptPath,
|
||||||
'--index', $this->indexNdjsonPath,
|
'--index', $this->indexNdjsonPath,
|
||||||
'--out', $tmpVectorIndexPath,
|
'--out', $tmpVectorIndexPath,
|
||||||
'--model', 'all-MiniLM-L6-v2',
|
'--model', $embeddingModel,
|
||||||
];
|
];
|
||||||
|
|
||||||
$process = new Process($cmd);
|
$process = new Process($cmd);
|
||||||
$process->setTimeout($this->timeoutSeconds);
|
$process->setTimeout($this->timeoutSeconds);
|
||||||
|
$process->mustRun();
|
||||||
|
|
||||||
$this->runProcess($process, $logPath);
|
|
||||||
|
|
||||||
// Python muss tmp erzeugt haben
|
|
||||||
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
|
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
|
||||||
throw new \RuntimeException('Vector index rebuild failed: tmp output missing or empty: ' . $tmpVectorIndexPath);
|
throw new \RuntimeException('Vector index tmp missing or empty');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Atomarer Switch
|
// Python erzeugt vector.index.meta.json (nicht tmp.meta!)
|
||||||
$this->atomicSwitch($tmpVectorIndexPath, $this->vectorIndexPath);
|
if (!is_file($this->vectorMetaPath) || filesize($this->vectorMetaPath) === 0) {
|
||||||
|
throw new \RuntimeException('Vector meta missing or empty');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Atomarer Switch für Index
|
||||||
|
if (!rename($tmpVectorIndexPath, $this->vectorIndexPath)) {
|
||||||
|
throw new \RuntimeException('Atomic switch failed for vector index');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------
|
private function initializeIndexMeta(): void
|
||||||
// Internals
|
{
|
||||||
// -------------------------
|
$dir = dirname($this->indexMetaPath);
|
||||||
|
|
||||||
|
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
||||||
|
throw new \RuntimeException('Cannot create knowledge directory');
|
||||||
|
}
|
||||||
|
|
||||||
|
$data = [
|
||||||
|
'index_version' => 1,
|
||||||
|
'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
|
||||||
|
'embedding_model' => $this->indexConfiguration->getEmbeddingModel(),
|
||||||
|
'embedding_dimension' => $this->indexConfiguration->getEmbeddingDimension(),
|
||||||
|
'chunk_size' => $this->indexConfiguration->getChunkSize(),
|
||||||
|
'chunk_overlap' => $this->indexConfiguration->getChunkOverlap(),
|
||||||
|
'scoring_version' => $this->indexConfiguration->getScoringVersion(),
|
||||||
|
'index_format' => 'ndjson',
|
||||||
|
'vector_backend' => 'faiss',
|
||||||
|
];
|
||||||
|
|
||||||
|
file_put_contents(
|
||||||
|
$this->indexMetaPath,
|
||||||
|
json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
private function runProcess(Process $process, ?string $logPath): void
|
private function runProcess(Process $process, ?string $logPath): void
|
||||||
{
|
{
|
||||||
if ($logPath !== null) {
|
if ($logPath !== null) {
|
||||||
$this->appendLog($logPath, "\n=== VectorIndexBuilder START " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n");
|
@file_put_contents($logPath, "=== VectorIndexBuilder START ===\n", FILE_APPEND);
|
||||||
$this->appendLog($logPath, "CMD: " . $process->getCommandLine() . "\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$process->run(function (string $type, string $buffer) use ($logPath) {
|
$process->run();
|
||||||
if ($logPath === null) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TYPE: Process::OUT / Process::ERR
|
|
||||||
$this->appendLog($logPath, $buffer);
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!$process->isSuccessful()) {
|
if (!$process->isSuccessful()) {
|
||||||
if ($logPath !== null) {
|
if ($logPath !== null) {
|
||||||
$this->appendLog($logPath, "\n=== VectorIndexBuilder FAILED ===\n");
|
@file_put_contents($logPath, $process->getErrorOutput(), FILE_APPEND);
|
||||||
$this->appendLog($logPath, "ExitCode: " . $process->getExitCode() . "\n");
|
|
||||||
$this->appendLog($logPath, "STDERR:\n" . $process->getErrorOutput() . "\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
throw new ProcessFailedException($process);
|
throw new ProcessFailedException($process);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($logPath !== null) {
|
if ($logPath !== null) {
|
||||||
$this->appendLog($logPath, "\n=== VectorIndexBuilder OK " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n");
|
@file_put_contents($logPath, "=== VectorIndexBuilder OK ===\n", FILE_APPEND);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private function appendLog(string $logPath, string $content): void
|
|
||||||
{
|
|
||||||
$dir = \dirname($logPath);
|
|
||||||
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
|
||||||
// Wenn Log nicht möglich ist: nicht hart scheitern (Build ist wichtiger)
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
@file_put_contents($logPath, $content, FILE_APPEND);
|
|
||||||
}
|
|
||||||
|
|
||||||
private function atomicSwitch(string $tmp, string $final): void
|
|
||||||
{
|
|
||||||
if (!rename($tmp, $final)) {
|
|
||||||
@unlink($tmp);
|
|
||||||
throw new \RuntimeException('Atomic switch failed for vector.index');
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,48 +8,90 @@ use Psr\Log\LoggerInterface;
|
|||||||
|
|
||||||
final class VectorSearchClient
|
final class VectorSearchClient
|
||||||
{
|
{
|
||||||
|
private string $pythonBin;
|
||||||
|
private string $scriptPath;
|
||||||
|
private string $vectorIndexPath;
|
||||||
|
private string $vectorMetaPath;
|
||||||
|
private string $indexMetaPath;
|
||||||
|
private LoggerInterface $agentLogger;
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly string $binPythonDir,
|
string $pythonBin,
|
||||||
private readonly string $vectorSearchPyPath,
|
string $scriptPath,
|
||||||
private LoggerInterface $agentLogger,
|
string $vectorIndexPath,
|
||||||
|
string $vectorMetaPath,
|
||||||
|
string $indexMetaPath,
|
||||||
|
LoggerInterface $agentLogger
|
||||||
) {
|
) {
|
||||||
|
$this->pythonBin = $pythonBin;
|
||||||
|
$this->scriptPath = $scriptPath;
|
||||||
|
$this->vectorIndexPath = $vectorIndexPath;
|
||||||
|
$this->vectorMetaPath = $vectorMetaPath;
|
||||||
|
$this->indexMetaPath = $indexMetaPath;
|
||||||
|
$this->agentLogger = $agentLogger;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function search(string $query, int $limit = 5): array
|
public function search(string $query, int $limit = 5): array
|
||||||
{
|
{
|
||||||
$script = $this->vectorSearchPyPath;
|
if (!is_file($this->scriptPath)) {
|
||||||
$this->agentLogger->info("Run vector search script $script");
|
$this->agentLogger->error('vector_search.py not found: ' . $this->scriptPath);
|
||||||
if (!is_file($script)) {
|
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------
|
if (!is_file($this->vectorIndexPath)) {
|
||||||
// Determine Python interpreter (venv preferred)
|
$this->agentLogger->warning('vector.index not found.');
|
||||||
// -------------------------------------------------
|
|
||||||
$venvPython = $this->binPythonDir;
|
|
||||||
$pythonBin = is_file($venvPython) ? $venvPython : 'python3';
|
|
||||||
|
|
||||||
$cmd = sprintf(
|
|
||||||
'%s %s %s %d 2>&1',
|
|
||||||
escapeshellarg($pythonBin),
|
|
||||||
escapeshellarg($script),
|
|
||||||
escapeshellarg($query),
|
|
||||||
$limit
|
|
||||||
);
|
|
||||||
|
|
||||||
exec($cmd, $out, $exitCode);
|
|
||||||
|
|
||||||
if ($exitCode !== 0 || empty($out)) {
|
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
$json = implode("\n", $out);
|
if (!is_file($this->vectorMetaPath)) {
|
||||||
|
$this->agentLogger->warning('vector.index.meta.json not found.');
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
$this->agentLogger->info($json);
|
if (!is_file($this->indexMetaPath)) {
|
||||||
|
$this->agentLogger->warning('index_meta.json not found.');
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
$indexMeta = json_decode((string) file_get_contents($this->indexMetaPath), true);
|
||||||
|
|
||||||
|
if (!is_array($indexMeta) || empty($indexMeta['embedding_model'])) {
|
||||||
|
$this->agentLogger->error('Invalid index_meta.json.');
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
$embeddingModel = $indexMeta['embedding_model'];
|
||||||
|
|
||||||
|
$cmd = [
|
||||||
|
$this->pythonBin,
|
||||||
|
$this->scriptPath,
|
||||||
|
'--query', $query,
|
||||||
|
'--limit', (string)$limit,
|
||||||
|
'--index', $this->vectorIndexPath,
|
||||||
|
'--meta', $this->vectorMetaPath,
|
||||||
|
'--model', $embeddingModel,
|
||||||
|
];
|
||||||
|
|
||||||
|
$process = new \Symfony\Component\Process\Process($cmd);
|
||||||
|
$process->setTimeout(30);
|
||||||
|
$process->run();
|
||||||
|
|
||||||
|
if (!$process->isSuccessful()) {
|
||||||
|
$this->agentLogger->error('Vector search failed: ' . $process->getErrorOutput());
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
$output = $process->getOutput();
|
||||||
|
|
||||||
|
if (trim($output) === '') {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return json_decode($json, true, 512, JSON_THROW_ON_ERROR);
|
$this->agentLogger->info('vector_search.py is done: ' . $this->scriptPath);
|
||||||
} catch (\Throwable) {
|
return json_decode($output, true, 512, JSON_THROW_ON_ERROR);
|
||||||
|
} catch (\Throwable $e) {
|
||||||
|
$this->agentLogger->error('Invalid JSON from vector_search.py');
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,71 +2,116 @@
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
|
import argparse
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
# ---------------------------------------------------------
|
|
||||||
# Argument handling
|
|
||||||
# ---------------------------------------------------------
|
|
||||||
if len(sys.argv) < 3:
|
|
||||||
print("ERROR: Missing arguments (query, limit)")
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
query = sys.argv[1]
|
|
||||||
limit = int(sys.argv[2])
|
|
||||||
|
|
||||||
vector_dir = Path(__file__).resolve().parent
|
|
||||||
index_path = vector_dir / "vector.index"
|
|
||||||
meta_path = vector_dir / "vector.index.meta.json"
|
|
||||||
|
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
# Dependency checks (controlled)
|
# Argument parsing (NEW – CLEAN CLI)
|
||||||
|
# ---------------------------------------------------------
|
||||||
|
parser = argparse.ArgumentParser(description="FAISS vector search")
|
||||||
|
|
||||||
|
parser.add_argument("--query", required=True, help="Search query text")
|
||||||
|
parser.add_argument("--limit", required=True, type=int, help="Top-K limit")
|
||||||
|
parser.add_argument("--index", required=True, help="Path to vector.index")
|
||||||
|
parser.add_argument("--meta", required=True, help="Path to vector.index.meta.json")
|
||||||
|
parser.add_argument("--model", required=True, help="SentenceTransformer model")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
query = args.query
|
||||||
|
limit = args.limit
|
||||||
|
index_path = Path(args.index).resolve()
|
||||||
|
meta_path = Path(args.meta).resolve()
|
||||||
|
embedding_model = args.model
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------
|
||||||
|
# Dependency checks (stderr only)
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
try:
|
try:
|
||||||
import faiss # noqa
|
import faiss # noqa
|
||||||
except Exception:
|
except Exception:
|
||||||
print("ERROR: Python module 'faiss' not found.")
|
print("Python module 'faiss' not found.", file=sys.stderr)
|
||||||
sys.exit(10)
|
sys.exit(10)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from sentence_transformers import SentenceTransformer # noqa
|
from sentence_transformers import SentenceTransformer # noqa
|
||||||
except Exception:
|
except Exception:
|
||||||
print("ERROR: Python module 'sentence-transformers' not found.")
|
print("Python module 'sentence-transformers' not found.", file=sys.stderr)
|
||||||
sys.exit(11)
|
sys.exit(11)
|
||||||
|
|
||||||
import faiss
|
import faiss
|
||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
# File checks
|
# File checks
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
if not index_path.is_file() or not meta_path.is_file():
|
if not index_path.is_file():
|
||||||
print("ERROR: Vector index not found. Run vector ingest first.")
|
print(f"vector.index not found at {index_path}", file=sys.stderr)
|
||||||
sys.exit(20)
|
sys.exit(20)
|
||||||
|
|
||||||
|
if not meta_path.is_file():
|
||||||
|
print(f"vector.index.meta.json not found at {meta_path}", file=sys.stderr)
|
||||||
|
sys.exit(21)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
# Load model and index
|
# Load model and index
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
model = SentenceTransformer("all-MiniLM-L6-v2")
|
try:
|
||||||
query_vec = model.encode([query], normalize_embeddings=True)
|
model = SentenceTransformer(embedding_model)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Failed to load embedding model: {embedding_model}", file=sys.stderr)
|
||||||
|
sys.exit(30)
|
||||||
|
|
||||||
index = faiss.read_index(str(index_path))
|
try:
|
||||||
|
query_vec = model.encode([query], normalize_embeddings=True)
|
||||||
|
except Exception:
|
||||||
|
print("Embedding encoding failed.", file=sys.stderr)
|
||||||
|
sys.exit(31)
|
||||||
|
|
||||||
|
try:
|
||||||
|
index = faiss.read_index(str(index_path))
|
||||||
|
except Exception:
|
||||||
|
print("Failed to read FAISS index.", file=sys.stderr)
|
||||||
|
sys.exit(32)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(meta_path, "r", encoding="utf-8") as f:
|
||||||
|
ids = json.load(f)
|
||||||
|
except Exception:
|
||||||
|
print("Failed to read vector meta file.", file=sys.stderr)
|
||||||
|
sys.exit(33)
|
||||||
|
|
||||||
with open(meta_path, "r", encoding="utf-8") as f:
|
|
||||||
ids = json.load(f)
|
|
||||||
|
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
# Search
|
# Search
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
scores, indices = index.search(query_vec, limit)
|
try:
|
||||||
|
scores, indices = index.search(query_vec, limit)
|
||||||
|
except Exception:
|
||||||
|
print("FAISS search failed.", file=sys.stderr)
|
||||||
|
sys.exit(40)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
for score, idx in zip(scores[0], indices[0]):
|
for score, idx in zip(scores[0], indices[0]):
|
||||||
if idx == -1:
|
if idx == -1:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if idx < 0 or idx >= len(ids):
|
||||||
|
continue
|
||||||
|
|
||||||
results.append({
|
results.append({
|
||||||
"chunk_id": ids[idx],
|
"chunk_id": ids[idx],
|
||||||
"score": float(score)
|
"score": float(score)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------
|
||||||
|
# STRICT JSON OUTPUT ONLY
|
||||||
|
# ---------------------------------------------------------
|
||||||
print(json.dumps(results))
|
print(json.dumps(results))
|
||||||
|
sys.exit(0)
|
||||||
|
|||||||
Reference in New Issue
Block a user