phase a audit

This commit is contained in:
team2
2026-02-22 18:04:53 +01:00
parent b3e9110dd1
commit 3b2e1bc772
10 changed files with 608 additions and 516 deletions

View File

@@ -13,7 +13,6 @@ final class VectorIndexBuilder
private string $pythonBin;
private string $scriptPath;
private string $indexNdjsonPath;
private string $indexMetaPath;
private string $vectorIndexPath;
private string $vectorMetaPath;
private int $timeoutSeconds;
@@ -24,7 +23,6 @@ final class VectorIndexBuilder
string $pythonBin,
string $scriptPath,
string $indexNdjsonPath,
string $indexMetaPath,
string $vectorIndexPath,
int $timeoutSeconds,
IndexConfigurationProvider $configurationProvider
@@ -32,54 +30,30 @@ final class VectorIndexBuilder
$this->pythonBin = $pythonBin;
$this->scriptPath = $scriptPath;
$this->indexNdjsonPath = $indexNdjsonPath;
$this->indexMetaPath = $indexMetaPath;
$this->vectorIndexPath = $vectorIndexPath;
$this->vectorMetaPath = $vectorIndexPath . '.meta.json';
$this->timeoutSeconds = $timeoutSeconds;
$this->configurationProvider = $configurationProvider;
}
/**
* Rebuild FAISS Index deterministisch aus index.ndjson.
*/
public function rebuildFromNdjson(?string $logPath = null): void
{
$this->assertPreconditions();
// --------------------------------------------
// 🔵 FALL: NDJSON ist leer → kein Vector Index
// --------------------------------------------
if (!is_file($this->indexNdjsonPath) || filesize($this->indexNdjsonPath) === 0) {
@unlink($this->vectorIndexPath);
@unlink($this->vectorMetaPath);
if ($logPath !== null) {
@file_put_contents(
$logPath,
"NDJSON empty → Vector index removed\n",
FILE_APPEND
);
}
return;
}
// --------------------------------------------
// 🟢 FALL: NDJSON enthält Chunks
// --------------------------------------------
if (!is_file($this->indexMetaPath)) {
$this->initializeIndexMeta();
}
$indexMeta = $this->readIndexMeta();
$embeddingModel = $indexMeta['embedding_model'];
$config = $this->configurationProvider->getConfiguration();
$embeddingModel = $config->getEmbeddingModel();
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
$tmpVectorMetaPath = $tmpVectorIndexPath . '.meta.json';
@unlink($tmpVectorIndexPath);
@unlink($this->vectorMetaPath);
@unlink($tmpVectorMetaPath);
$cmd = [
$this->pythonBin,
@@ -94,107 +68,51 @@ final class VectorIndexBuilder
$this->runProcess($process, $logPath);
$this->validatePythonOutputs($tmpVectorIndexPath);
$this->validateOutputs($tmpVectorIndexPath, $tmpVectorMetaPath);
$this->atomicSwitch($tmpVectorIndexPath);
$this->atomicSwitchPair(
$tmpVectorIndexPath,
$tmpVectorMetaPath
);
}
// -----------------------------------------------------
// Internals
// -----------------------------------------------------
/**
 * Verifies that the files required for a vector index build exist.
 *
 * Checks that $this->scriptPath and $this->indexNdjsonPath both point to
 * regular files.
 *
 * NOTE(review): each guard below contains two consecutive throw statements.
 * As written only the first one (the message that includes the path) can
 * execute; the second is unreachable. This looks like the pre- and
 * post-change lines of a diff merged together — confirm which message is
 * the intended one and drop the other.
 *
 * @throws \RuntimeException If the build script or index.ndjson is not a file.
 */
private function assertPreconditions(): void
{
// Guard 1: the build script must exist.
if (!is_file($this->scriptPath)) {
throw new \RuntimeException(
'Vector build script not found at: ' . $this->scriptPath
);
// Unreachable after the throw above (see NOTE in the docblock).
throw new \RuntimeException('Vector build script not found.');
}
// Guard 2: the NDJSON source the index is built from must exist.
if (!is_file($this->indexNdjsonPath)) {
throw new \RuntimeException(
'index.ndjson not found at: ' . $this->indexNdjsonPath
);
// Unreachable after the throw above (see NOTE in the docblock).
throw new \RuntimeException('index.ndjson not found.');
}
}
private function readIndexMeta(): array
private function validateOutputs(string $tmpIndex, string $tmpMeta): void
{
$meta = json_decode(
(string) file_get_contents($this->indexMetaPath),
true
);
if (!is_array($meta) || empty($meta['embedding_model'])) {
throw new \RuntimeException('Invalid index_meta.json');
}
return $meta;
}
/**
 * Writes a fresh index meta file to $this->indexMetaPath.
 *
 * Creates the parent directory when it is missing, then serialises the
 * values read from the configuration provider (embedding model and
 * dimension, chunk size and overlap, scoring version) together with fixed
 * format/backend markers and a creation timestamp as pretty-printed JSON.
 *
 * @throws \RuntimeException If the directory cannot be created, the
 *                           metadata cannot be JSON-encoded, or the file
 *                           cannot be written.
 */
private function initializeIndexMeta(): void
{
    $dir = dirname($this->indexMetaPath);

    // The trailing is_dir() re-check covers the case where the directory
    // appeared between the first check and the mkdir() call.
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
        throw new \RuntimeException('Cannot create knowledge directory');
    }

    $config = $this->configurationProvider->getConfiguration();

    $data = [
        'index_version' => 1,
        'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
        'embedding_model' => $config->getEmbeddingModel(),
        'embedding_dimension' => $config->getEmbeddingDimension(),
        'chunk_size' => $config->getChunkSize(),
        'chunk_overlap' => $config->getChunkOverlap(),
        'scoring_version' => $config->getScoringVersion(),
        'index_format' => 'ndjson',
        'vector_backend' => 'faiss',
    ];

    // json_encode() returns false on failure; passing that straight to
    // file_put_contents() would silently write an empty meta file.
    $json = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);
    if ($json === false) {
        throw new \RuntimeException(
            'Cannot encode index meta: ' . json_last_error_msg()
        );
    }

    // Surface write failures instead of continuing without a meta file.
    if (file_put_contents($this->indexMetaPath, $json) === false) {
        throw new \RuntimeException(
            'Cannot write index meta at: ' . $this->indexMetaPath
        );
    }
}
private function validatePythonOutputs(string $tmpVectorIndexPath): void
{
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
if (!is_file($tmpIndex) || filesize($tmpIndex) === 0) {
throw new \RuntimeException('Vector index tmp missing or empty');
}
if (!is_file($this->vectorMetaPath) || filesize($this->vectorMetaPath) === 0) {
throw new \RuntimeException('Vector meta missing or empty');
if (!is_file($tmpMeta) || filesize($tmpMeta) === 0) {
throw new \RuntimeException('Vector meta tmp missing or empty');
}
}
private function atomicSwitch(string $tmpVectorIndexPath): void
private function atomicSwitchPair(string $tmpIndex, string $tmpMeta): void
{
if (!rename($tmpVectorIndexPath, $this->vectorIndexPath)) {
if (!rename($tmpIndex, $this->vectorIndexPath)) {
throw new \RuntimeException('Atomic switch failed for vector index');
}
if (!rename($tmpMeta, $this->vectorMetaPath)) {
throw new \RuntimeException('Atomic switch failed for vector meta');
}
}
/**
 * Runs the given process and, when a log path is supplied, appends
 * start/result markers to that file.
 *
 * Log writes are best-effort: failures to append are suppressed and never
 * interrupt the build.
 *
 * @param Process     $process Process to execute synchronously.
 * @param string|null $logPath File to append log lines to, or null to skip logging.
 *
 * @throws ProcessFailedException If the process does not finish successfully.
 */
private function runProcess(Process $process, ?string $logPath): void
{
    $loggingEnabled = $logPath !== null;

    if ($loggingEnabled) {
        @file_put_contents($logPath, "=== VectorIndexBuilder START ===\n", FILE_APPEND);
    }

    $process->run();

    // Success path first: write the OK marker and leave.
    if ($process->isSuccessful()) {
        if ($loggingEnabled) {
            @file_put_contents($logPath, "=== VectorIndexBuilder OK ===\n", FILE_APPEND);
        }

        return;
    }

    // Failure path: record the process's error output before raising.
    if ($loggingEnabled) {
        @file_put_contents($logPath, $process->getErrorOutput(), FILE_APPEND);
    }

    throw new ProcessFailedException($process);
}
}