phase a audit
This commit is contained in:
@@ -13,7 +13,6 @@ final class VectorIndexBuilder
|
||||
private string $pythonBin;
|
||||
private string $scriptPath;
|
||||
private string $indexNdjsonPath;
|
||||
private string $indexMetaPath;
|
||||
private string $vectorIndexPath;
|
||||
private string $vectorMetaPath;
|
||||
private int $timeoutSeconds;
|
||||
@@ -24,7 +23,6 @@ final class VectorIndexBuilder
|
||||
string $pythonBin,
|
||||
string $scriptPath,
|
||||
string $indexNdjsonPath,
|
||||
string $indexMetaPath,
|
||||
string $vectorIndexPath,
|
||||
int $timeoutSeconds,
|
||||
IndexConfigurationProvider $configurationProvider
|
||||
@@ -32,54 +30,30 @@ final class VectorIndexBuilder
|
||||
$this->pythonBin = $pythonBin;
|
||||
$this->scriptPath = $scriptPath;
|
||||
$this->indexNdjsonPath = $indexNdjsonPath;
|
||||
$this->indexMetaPath = $indexMetaPath;
|
||||
$this->vectorIndexPath = $vectorIndexPath;
|
||||
$this->vectorMetaPath = $vectorIndexPath . '.meta.json';
|
||||
$this->timeoutSeconds = $timeoutSeconds;
|
||||
$this->configurationProvider = $configurationProvider;
|
||||
}
|
||||
|
||||
/**
|
||||
* Rebuild FAISS Index deterministisch aus index.ndjson.
|
||||
*/
|
||||
public function rebuildFromNdjson(?string $logPath = null): void
|
||||
{
|
||||
$this->assertPreconditions();
|
||||
|
||||
// --------------------------------------------
|
||||
// 🔵 FALL: NDJSON ist leer → kein Vector Index
|
||||
// --------------------------------------------
|
||||
if (!is_file($this->indexNdjsonPath) || filesize($this->indexNdjsonPath) === 0) {
|
||||
|
||||
@unlink($this->vectorIndexPath);
|
||||
@unlink($this->vectorMetaPath);
|
||||
|
||||
if ($logPath !== null) {
|
||||
@file_put_contents(
|
||||
$logPath,
|
||||
"NDJSON empty → Vector index removed\n",
|
||||
FILE_APPEND
|
||||
);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// --------------------------------------------
|
||||
// 🟢 FALL: NDJSON enthält Chunks
|
||||
// --------------------------------------------
|
||||
|
||||
if (!is_file($this->indexMetaPath)) {
|
||||
$this->initializeIndexMeta();
|
||||
}
|
||||
|
||||
$indexMeta = $this->readIndexMeta();
|
||||
$embeddingModel = $indexMeta['embedding_model'];
|
||||
$config = $this->configurationProvider->getConfiguration();
|
||||
$embeddingModel = $config->getEmbeddingModel();
|
||||
|
||||
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
|
||||
$tmpVectorMetaPath = $tmpVectorIndexPath . '.meta.json';
|
||||
|
||||
@unlink($tmpVectorIndexPath);
|
||||
@unlink($this->vectorMetaPath);
|
||||
@unlink($tmpVectorMetaPath);
|
||||
|
||||
$cmd = [
|
||||
$this->pythonBin,
|
||||
@@ -94,107 +68,51 @@ final class VectorIndexBuilder
|
||||
|
||||
$this->runProcess($process, $logPath);
|
||||
|
||||
$this->validatePythonOutputs($tmpVectorIndexPath);
|
||||
$this->validateOutputs($tmpVectorIndexPath, $tmpVectorMetaPath);
|
||||
|
||||
$this->atomicSwitch($tmpVectorIndexPath);
|
||||
$this->atomicSwitchPair(
|
||||
$tmpVectorIndexPath,
|
||||
$tmpVectorMetaPath
|
||||
);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------
|
||||
// Internals
|
||||
// -----------------------------------------------------
|
||||
|
||||
/**
 * Verifies that the files a rebuild depends on exist before any work starts.
 *
 * Checks the build script first, then index.ndjson; the first missing file aborts.
 *
 * @throws \RuntimeException when the build script or index.ndjson is not a regular file
 */
private function assertPreconditions(): void
{
    if (!is_file($this->scriptPath)) {
        // Include the configured path so a misconfiguration is visible from the message alone.
        throw new \RuntimeException(
            'Vector build script not found at: ' . $this->scriptPath
        );
    }

    if (!is_file($this->indexNdjsonPath)) {
        throw new \RuntimeException(
            'index.ndjson not found at: ' . $this->indexNdjsonPath
        );
    }
}
|
||||
|
||||
private function readIndexMeta(): array
|
||||
private function validateOutputs(string $tmpIndex, string $tmpMeta): void
|
||||
{
|
||||
$meta = json_decode(
|
||||
(string) file_get_contents($this->indexMetaPath),
|
||||
true
|
||||
);
|
||||
|
||||
if (!is_array($meta) || empty($meta['embedding_model'])) {
|
||||
throw new \RuntimeException('Invalid index_meta.json');
|
||||
}
|
||||
|
||||
return $meta;
|
||||
}
|
||||
|
||||
/**
 * Writes a fresh index metadata file to $this->indexMetaPath, populated from the
 * current configuration, creating the parent directory when it does not exist.
 *
 * @throws \RuntimeException when the directory cannot be created, or the metadata
 *                           cannot be encoded or written
 */
private function initializeIndexMeta(): void
{
    $dir = dirname($this->indexMetaPath);

    // Re-check is_dir() after a failed mkdir(): the directory may have appeared in the meantime.
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
        throw new \RuntimeException('Cannot create knowledge directory');
    }

    $config = $this->configurationProvider->getConfiguration();

    $data = [
        'index_version' => 1,
        'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
        'embedding_model' => $config->getEmbeddingModel(),
        'embedding_dimension' => $config->getEmbeddingDimension(),
        'chunk_size' => $config->getChunkSize(),
        'chunk_overlap' => $config->getChunkOverlap(),
        'scoring_version' => $config->getScoringVersion(),
        'index_format' => 'ndjson',
        'vector_backend' => 'faiss',
    ];

    // json_encode() returns false on failure; writing that would produce an empty file.
    $json = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);
    if ($json === false) {
        throw new \RuntimeException('Cannot encode index meta: ' . json_last_error_msg());
    }

    // Fail here rather than leaving a missing or unwritten meta file for later code to find.
    if (file_put_contents($this->indexMetaPath, $json) === false) {
        throw new \RuntimeException('Cannot write index meta to: ' . $this->indexMetaPath);
    }
}
|
||||
|
||||
private function validatePythonOutputs(string $tmpVectorIndexPath): void
|
||||
{
|
||||
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
|
||||
if (!is_file($tmpIndex) || filesize($tmpIndex) === 0) {
|
||||
throw new \RuntimeException('Vector index tmp missing or empty');
|
||||
}
|
||||
|
||||
if (!is_file($this->vectorMetaPath) || filesize($this->vectorMetaPath) === 0) {
|
||||
throw new \RuntimeException('Vector meta missing or empty');
|
||||
if (!is_file($tmpMeta) || filesize($tmpMeta) === 0) {
|
||||
throw new \RuntimeException('Vector meta tmp missing or empty');
|
||||
}
|
||||
}
|
||||
|
||||
private function atomicSwitch(string $tmpVectorIndexPath): void
|
||||
private function atomicSwitchPair(string $tmpIndex, string $tmpMeta): void
|
||||
{
|
||||
if (!rename($tmpVectorIndexPath, $this->vectorIndexPath)) {
|
||||
if (!rename($tmpIndex, $this->vectorIndexPath)) {
|
||||
throw new \RuntimeException('Atomic switch failed for vector index');
|
||||
}
|
||||
|
||||
if (!rename($tmpMeta, $this->vectorMetaPath)) {
|
||||
throw new \RuntimeException('Atomic switch failed for vector meta');
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs the given process to completion and, when a log path is supplied,
 * appends start/result markers to that file.
 *
 * Log writes are best-effort: warnings are suppressed and write results are ignored.
 *
 * @param Process     $process process to execute
 * @param string|null $logPath file to append log lines to, or null to skip logging
 *
 * @throws ProcessFailedException when the process does not finish successfully
 */
private function runProcess(Process $process, ?string $logPath): void
{
    $shouldLog = $logPath !== null;

    if ($shouldLog) {
        @file_put_contents($logPath, "=== VectorIndexBuilder START ===\n", FILE_APPEND);
    }

    $process->run();

    // Success path: write the OK marker (if logging) and finish.
    if ($process->isSuccessful()) {
        if ($shouldLog) {
            @file_put_contents($logPath, "=== VectorIndexBuilder OK ===\n", FILE_APPEND);
        }

        return;
    }

    // Failure path: the error output is only fetched when it will actually be logged.
    if ($shouldLog) {
        @file_put_contents($logPath, $process->getErrorOutput(), FILE_APPEND);
    }

    throw new ProcessFailedException($process);
}
|
||||
}
|
||||
Reference in New Issue
Block a user