harden struct
This commit is contained in:
@@ -1,10 +1,10 @@
|
||||
<?php
|
||||
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Vector;
|
||||
|
||||
use App\Index\IndexConfiguration;
|
||||
use Symfony\Component\Process\Exception\ProcessFailedException;
|
||||
use Symfony\Component\Process\Process;
|
||||
|
||||
@@ -13,49 +13,32 @@ final class VectorIndexBuilder
|
||||
/** Python executable used as the first element of the ingest command. */
private string $pythonBin;
|
||||
/** Path to the Python ingest script; a rebuild checks that it exists as a file. */
private string $scriptPath;
|
||||
/** Path to the source index.ndjson, passed to the script as --index. */
private string $indexNdjsonPath;
|
||||
/** Path to the index meta JSON; created when missing and read for 'embedding_model'. */
private string $indexMetaPath;
|
||||
/** Final path of the vector index; the script output goes to this path plus '.tmp'. */
private string $vectorIndexPath;
|
||||
/** Path of the vector meta file, derived as $vectorIndexPath . '.meta.json'. */
private string $vectorMetaPath;
|
||||
/** Timeout handed to the Python process via Process::setTimeout(). */
private int $timeoutSeconds;
|
||||
|
||||
/** Source of the embedding/chunking/scoring values written into a new index meta file. */
private IndexConfiguration $indexConfiguration;
|
||||
|
||||
/**
 * Store the already-resolved paths and settings needed to rebuild the vector index.
 *
 * Every path is used exactly as given; nothing is prefixed or normalised here.
 * The vector meta path is derived from the vector index path by appending
 * '.meta.json'.
 *
 * @param string             $pythonBin          Python executable to invoke.
 * @param string             $scriptPath         Path to the Python ingest script.
 * @param string             $indexNdjsonPath    Path to the source index.ndjson.
 * @param string             $indexMetaPath      Path to the index meta JSON file.
 * @param string             $vectorIndexPath    Final path of the vector index file.
 * @param int                $timeoutSeconds     Timeout applied to the Python process.
 * @param IndexConfiguration $indexConfiguration Settings written into a newly created index meta file.
 */
public function __construct(
    string $pythonBin,
    string $scriptPath,
    string $indexNdjsonPath,
    string $indexMetaPath,
    string $vectorIndexPath,
    int $timeoutSeconds,
    IndexConfiguration $indexConfiguration
) {
    $this->pythonBin = $pythonBin;
    $this->scriptPath = $scriptPath;
    $this->indexNdjsonPath = $indexNdjsonPath;
    $this->indexMetaPath = $indexMetaPath;
    $this->vectorIndexPath = $vectorIndexPath;
    // The meta file always sits next to the index under a fixed suffix.
    $this->vectorMetaPath = $vectorIndexPath . '.meta.json';
    $this->timeoutSeconds = $timeoutSeconds;
    $this->indexConfiguration = $indexConfiguration;
}
|
||||
|
||||
/**
 * Return the configured path of the source index.ndjson file.
 *
 * @return string The path exactly as stored on this instance.
 */
public function getIndexNdjsonPath(): string
{
    return $this->indexNdjsonPath;
}
|
||||
|
||||
/**
 * Return the configured final path of the vector index file.
 *
 * @return string The path exactly as stored on this instance.
 */
public function getVectorIndexPath(): string
{
    return $this->vectorIndexPath;
}
|
||||
|
||||
/**
 * Return the configured path of the Python ingest script.
 *
 * @return string The path exactly as stored on this instance.
 */
public function getScriptPath(): string
{
    return $this->scriptPath;
}
|
||||
|
||||
/**
 * Rebuild the FAISS vector index from index.ndjson by running the Python ingest script.
 *
 * Expectation: the script writes its output to the temporary path
 * ($this->vectorIndexPath . '.tmp'), which is then renamed onto the final path.
 *
 * NOTE(review): this listing appears to interleave two revisions of the method:
 * a diff hunk marker sits inside the body, a directory guard is opened but never
 * closed, and several steps occur twice ('--out'/'--model', mustRun() plus
 * runProcess(), atomicSwitch() plus rename()). Confirm against the real file
 * before relying on the exact sequence.
 *
 * @param string|null $logPath Optional: path handed to runProcess(), which appends process output there
 *
 * @throws \RuntimeException      When index.ndjson is missing, the index meta is invalid,
 *                                the tmp output or the vector meta is missing/empty, or a rename fails.
 * @throws ProcessFailedException When the Python process does not finish successfully.
 */
|
||||
public function rebuildFromNdjson(?string $logPath = null): void
|
||||
{
|
||||
if (!is_file($this->scriptPath)) {
|
||||
@@ -66,97 +49,97 @@ final class VectorIndexBuilder
|
||||
throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath);
|
||||
}
|
||||
|
||||
$dir = \dirname($this->vectorIndexPath);
|
||||
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
||||
throw new \RuntimeException('Unable to create vector index directory: ' . $dir);
|
||||
// NOTE(review): the guard above has no closing brace in this listing — confirm.
// Create the index meta file on first use so that it can be read below.
if (!is_file($this->indexMetaPath)) {
|
||||
$this->initializeIndexMeta();
|
||||
}
|
||||
|
||||
// A failed read is cast to '' and then fails the is_array() check below.
$indexMeta = json_decode((string) file_get_contents($this->indexMetaPath), true);
|
||||
|
||||
if (!is_array($indexMeta) || empty($indexMeta['embedding_model'])) {
|
||||
throw new \RuntimeException('Invalid index_meta.json');
|
||||
}
|
||||
|
||||
// The embedding model recorded in the index meta is what gets passed as --model.
$embeddingModel = (string) $indexMeta['embedding_model'];
|
||||
|
||||
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
|
||||
|
||||
// Remove any previous tmp file (safety)
|
||||
if (is_file($tmpVectorIndexPath)) {
|
||||
@unlink($tmpVectorIndexPath);
|
||||
}
|
||||
// Important (per the original note): the Python script is expected to write the meta file under the final index name — TODO confirm against the script
|
||||
$finalMetaPath = $this->vectorMetaPath;
|
||||
// NOTE(review): $tmpMetaPath is not read anywhere in the visible part of this method.
$tmpMetaPath = dirname($this->vectorIndexPath) . '/' . basename($this->vectorIndexPath, '.index') . '.index.meta.json';
|
||||
|
||||
// Best-effort removal of stale outputs; errors are suppressed.
// NOTE(review): the existing meta file is deleted before the new build has succeeded — confirm this is intended.
@unlink($tmpVectorIndexPath);
|
||||
@unlink($finalMetaPath);
|
||||
|
||||
// ----------------------------
|
||||
// Python invocation (conservative)
|
||||
// ----------------------------
|
||||
// CLI args we expect/standardise on (from now on):
|
||||
// --index <path-to-index.ndjson>
|
||||
// --out <path-to-vector.index.tmp>
|
||||
// NOTE(review): '--out' and '--model' each appear twice below (hard-coded model first, meta-derived model second) — confirm which pair is current.
$cmd = [
|
||||
$this->pythonBin,
|
||||
$this->scriptPath,
|
||||
'--index', $this->indexNdjsonPath,
|
||||
'--out', $tmpVectorIndexPath,
|
||||
'--model', 'all-MiniLM-L6-v2',
|
||||
'--out', $tmpVectorIndexPath,
|
||||
'--model', $embeddingModel,
|
||||
];
|
||||
|
||||
$process = new Process($cmd);
|
||||
$process->setTimeout($this->timeoutSeconds);
|
||||
// NOTE(review): mustRun() already executes the command; runProcess() below calls run() again — looks like a duplicate execution, confirm.
$process->mustRun();
|
||||
|
||||
$this->runProcess($process, $logPath);
|
||||
|
||||
// Python must have produced the tmp file
|
||||
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
|
||||
throw new \RuntimeException('Vector index rebuild failed: tmp output missing or empty: ' . $tmpVectorIndexPath);
|
||||
// NOTE(review): unreachable after the throw above.
throw new \RuntimeException('Vector index tmp missing or empty');
|
||||
}
|
||||
|
||||
// Atomic switch
|
||||
$this->atomicSwitch($tmpVectorIndexPath, $this->vectorIndexPath);
|
||||
// Python is expected to write vector.index.meta.json (not tmp.meta!)
|
||||
if (!is_file($this->vectorMetaPath) || filesize($this->vectorMetaPath) === 0) {
|
||||
throw new \RuntimeException('Vector meta missing or empty');
|
||||
}
|
||||
|
||||
// Atomic switch for the index
|
||||
// NOTE(review): atomicSwitch() above has already renamed the tmp file, so this rename would find no source — confirm which switch is intended.
if (!rename($tmpVectorIndexPath, $this->vectorIndexPath)) {
|
||||
throw new \RuntimeException('Atomic switch failed for vector index');
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------
|
||||
// Internals
|
||||
// -------------------------
|
||||
/**
 * Create the index meta JSON file from the injected IndexConfiguration.
 *
 * The target directory is created when it does not exist yet. The file records
 * the index version, a creation timestamp, the embedding/chunking/scoring
 * settings and the fixed format/backend identifiers.
 *
 * @throws \RuntimeException When the directory cannot be created, the data
 *                           cannot be encoded, or the file cannot be written.
 */
private function initializeIndexMeta(): void
{
    $dir = \dirname($this->indexMetaPath);

    // The trailing is_dir() covers a concurrent creation between the check and mkdir().
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
        throw new \RuntimeException('Cannot create knowledge directory');
    }

    $data = [
        'index_version' => 1,
        'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
        'embedding_model' => $this->indexConfiguration->getEmbeddingModel(),
        'embedding_dimension' => $this->indexConfiguration->getEmbeddingDimension(),
        'chunk_size' => $this->indexConfiguration->getChunkSize(),
        'chunk_overlap' => $this->indexConfiguration->getChunkOverlap(),
        'scoring_version' => $this->indexConfiguration->getScoringVersion(),
        'index_format' => 'ndjson',
        'vector_backend' => 'faiss',
    ];

    // Fail loudly on encoding problems instead of writing an empty/false payload.
    try {
        $json = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_THROW_ON_ERROR);
    } catch (\JsonException $e) {
        throw new \RuntimeException('Cannot encode index meta: ' . $e->getMessage(), 0, $e);
    }

    // A failed write would otherwise only surface later as an unreadable meta file.
    if (file_put_contents($this->indexMetaPath, $json) === false) {
        throw new \RuntimeException('Cannot write index meta to: ' . $this->indexMetaPath);
    }
}
|
||||
|
||||
/**
 * Run the given process exactly once and optionally mirror its output into a log file.
 *
 * With a non-null $logPath the log receives, in order: a START banner with a
 * timestamp, the command line, every output chunk as it arrives, and finally
 * either an OK banner or a FAILED section with exit code and stderr.
 * All log writes go through appendLog().
 *
 * @param Process     $process Prepared (not yet started) process.
 * @param string|null $logPath Log file to append to, or null to disable logging.
 *
 * @throws ProcessFailedException When the process does not finish successfully.
 */
private function runProcess(Process $process, ?string $logPath): void
{
    if ($logPath !== null) {
        $this->appendLog($logPath, "\n=== VectorIndexBuilder START " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n");
        $this->appendLog($logPath, "CMD: " . $process->getCommandLine() . "\n");
    }

    // Stream stdout/stderr chunks into the log while the process runs.
    // $type (Process::OUT / Process::ERR) is intentionally not distinguished.
    $onOutput = $logPath === null
        ? null
        : function (string $type, string $buffer) use ($logPath): void {
            $this->appendLog($logPath, $buffer);
        };

    // Single execution: a second run() would start the command all over again.
    $process->run($onOutput);

    if (!$process->isSuccessful()) {
        if ($logPath !== null) {
            $this->appendLog($logPath, "\n=== VectorIndexBuilder FAILED ===\n");
            $this->appendLog($logPath, "ExitCode: " . $process->getExitCode() . "\n");
            $this->appendLog($logPath, "STDERR:\n" . $process->getErrorOutput() . "\n");
        }

        throw new ProcessFailedException($process);
    }

    if ($logPath !== null) {
        $this->appendLog($logPath, "\n=== VectorIndexBuilder OK " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n");
    }
}
|
||||
|
||||
/**
 * Append text to the log file, creating the log directory when necessary.
 *
 * Logging is best effort: when the directory cannot be created nothing is
 * written, and write errors are suppressed, so this method never throws.
 *
 * @param string $logPath Log file to append to.
 * @param string $content Text to append.
 */
private function appendLog(string $logPath, string $content): void
{
    $logDirectory = \dirname($logPath);

    // Short-circuits like the usual "exists, or create, or someone else created it" guard.
    $directoryReady = is_dir($logDirectory)
        || mkdir($logDirectory, 0777, true)
        || is_dir($logDirectory);

    if ($directoryReady) {
        @file_put_contents($logPath, $content, FILE_APPEND);
    }

    // Otherwise: logging is impossible; do not fail hard, the build matters more.
}
|
||||
|
||||
/**
 * Move the freshly built temporary file onto its final path.
 *
 * When the rename fails the temporary file is removed (best effort, errors
 * suppressed) and the failure is reported to the caller.
 *
 * NOTE(review): rename() only replaces the target in one step when both paths
 * are on the same filesystem — confirm for the deployment.
 *
 * @param string $tmp   Path of the temporary file to move.
 * @param string $final Destination path that $tmp replaces.
 *
 * @throws \RuntimeException When the rename fails.
 */
private function atomicSwitch(string $tmp, string $final): void
{
    if (rename($tmp, $final)) {
        return;
    }

    // Do not leave a half-finished artefact behind.
    @unlink($tmp);

    throw new \RuntimeException('Atomic switch failed for vector.index');
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user