New ingest and profile settings
This commit is contained in:
@@ -4,7 +4,7 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Vector;
|
||||
|
||||
use App\Index\IndexConfiguration;
|
||||
use App\Index\IndexConfigurationProvider;
|
||||
use Symfony\Component\Process\Exception\ProcessFailedException;
|
||||
use Symfony\Component\Process\Process;
|
||||
|
||||
@@ -18,7 +18,7 @@ final class VectorIndexBuilder
|
||||
private string $vectorMetaPath;
|
||||
private int $timeoutSeconds;
|
||||
|
||||
private IndexConfiguration $indexConfiguration;
|
||||
private IndexConfigurationProvider $configurationProvider;
|
||||
|
||||
public function __construct(
|
||||
string $pythonBin,
|
||||
@@ -27,7 +27,7 @@ final class VectorIndexBuilder
|
||||
string $indexMetaPath,
|
||||
string $vectorIndexPath,
|
||||
int $timeoutSeconds,
|
||||
IndexConfiguration $indexConfiguration
|
||||
IndexConfigurationProvider $configurationProvider
|
||||
) {
|
||||
$this->pythonBin = $pythonBin;
|
||||
$this->scriptPath = $scriptPath;
|
||||
@@ -36,39 +36,29 @@ final class VectorIndexBuilder
|
||||
$this->vectorIndexPath = $vectorIndexPath;
|
||||
$this->vectorMetaPath = $vectorIndexPath . '.meta.json';
|
||||
$this->timeoutSeconds = $timeoutSeconds;
|
||||
$this->indexConfiguration = $indexConfiguration;
|
||||
$this->configurationProvider = $configurationProvider;
|
||||
}
|
||||
|
||||
/**
|
||||
* Rebuild FAISS Index deterministisch aus index.ndjson.
|
||||
*/
|
||||
public function rebuildFromNdjson(?string $logPath = null): void
|
||||
{
|
||||
if (!is_file($this->scriptPath)) {
|
||||
throw new \RuntimeException('vector_ingest.py not found at: ' . $this->scriptPath);
|
||||
}
|
||||
|
||||
if (!is_file($this->indexNdjsonPath)) {
|
||||
throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath);
|
||||
}
|
||||
$this->assertPreconditions();
|
||||
|
||||
if (!is_file($this->indexMetaPath)) {
|
||||
$this->initializeIndexMeta();
|
||||
}
|
||||
|
||||
$indexMeta = json_decode((string) file_get_contents($this->indexMetaPath), true);
|
||||
$indexMeta = $this->readIndexMeta();
|
||||
|
||||
if (!is_array($indexMeta) || empty($indexMeta['embedding_model'])) {
|
||||
throw new \RuntimeException('Invalid index_meta.json');
|
||||
}
|
||||
|
||||
$embeddingModel = (string) $indexMeta['embedding_model'];
|
||||
$embeddingModel = $indexMeta['embedding_model'];
|
||||
|
||||
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
|
||||
|
||||
// Wichtig: Python erzeugt meta basierend auf endgültigem Namen
|
||||
$finalMetaPath = $this->vectorMetaPath;
|
||||
$tmpMetaPath = dirname($this->vectorIndexPath) . '/' . basename($this->vectorIndexPath, '.index') . '.index.meta.json';
|
||||
|
||||
// Clean leftovers
|
||||
@unlink($tmpVectorIndexPath);
|
||||
@unlink($finalMetaPath);
|
||||
@unlink($this->vectorMetaPath);
|
||||
|
||||
$cmd = [
|
||||
$this->pythonBin,
|
||||
@@ -80,21 +70,41 @@ final class VectorIndexBuilder
|
||||
|
||||
$process = new Process($cmd);
|
||||
$process->setTimeout($this->timeoutSeconds);
|
||||
$process->mustRun();
|
||||
|
||||
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
|
||||
throw new \RuntimeException('Vector index tmp missing or empty');
|
||||
$this->runProcess($process, $logPath);
|
||||
|
||||
$this->validatePythonOutputs($tmpVectorIndexPath);
|
||||
|
||||
$this->atomicSwitch($tmpVectorIndexPath);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------
|
||||
// Internals
|
||||
// -----------------------------------------------------
|
||||
|
||||
private function assertPreconditions(): void
|
||||
{
|
||||
if (!is_file($this->scriptPath)) {
|
||||
throw new \RuntimeException('vector_ingest.py not found at: ' . $this->scriptPath);
|
||||
}
|
||||
|
||||
// Python erzeugt vector.index.meta.json (nicht tmp.meta!)
|
||||
if (!is_file($this->vectorMetaPath) || filesize($this->vectorMetaPath) === 0) {
|
||||
throw new \RuntimeException('Vector meta missing or empty');
|
||||
if (!is_file($this->indexNdjsonPath)) {
|
||||
throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath);
|
||||
}
|
||||
}
|
||||
|
||||
private function readIndexMeta(): array
|
||||
{
|
||||
$meta = json_decode(
|
||||
(string) file_get_contents($this->indexMetaPath),
|
||||
true
|
||||
);
|
||||
|
||||
if (!is_array($meta) || empty($meta['embedding_model'])) {
|
||||
throw new \RuntimeException('Invalid index_meta.json');
|
||||
}
|
||||
|
||||
// Atomarer Switch für Index
|
||||
if (!rename($tmpVectorIndexPath, $this->vectorIndexPath)) {
|
||||
throw new \RuntimeException('Atomic switch failed for vector index');
|
||||
}
|
||||
return $meta;
|
||||
}
|
||||
|
||||
private function initializeIndexMeta(): void
|
||||
@@ -105,14 +115,16 @@ final class VectorIndexBuilder
|
||||
throw new \RuntimeException('Cannot create knowledge directory');
|
||||
}
|
||||
|
||||
$config = $this->configurationProvider->getConfiguration();
|
||||
|
||||
$data = [
|
||||
'index_version' => 1,
|
||||
'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
|
||||
'embedding_model' => $this->indexConfiguration->getEmbeddingModel(),
|
||||
'embedding_dimension' => $this->indexConfiguration->getEmbeddingDimension(),
|
||||
'chunk_size' => $this->indexConfiguration->getChunkSize(),
|
||||
'chunk_overlap' => $this->indexConfiguration->getChunkOverlap(),
|
||||
'scoring_version' => $this->indexConfiguration->getScoringVersion(),
|
||||
'embedding_model' => $config->getEmbeddingModel(),
|
||||
'embedding_dimension' => $config->getEmbeddingDimension(),
|
||||
'chunk_size' => $config->getChunkSize(),
|
||||
'chunk_overlap' => $config->getChunkOverlap(),
|
||||
'scoring_version' => $config->getScoringVersion(),
|
||||
'index_format' => 'ndjson',
|
||||
'vector_backend' => 'faiss',
|
||||
];
|
||||
@@ -123,6 +135,24 @@ final class VectorIndexBuilder
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Verifies that both expected output files exist and are non-empty.
 *
 * The temporary vector index is checked first, then the vector meta file
 * stored in $this->vectorMetaPath. The first failing check aborts.
 *
 * @param string $tmpVectorIndexPath Path of the temporary vector index file.
 *
 * @throws \RuntimeException If either file is missing or has a size of zero bytes.
 */
private function validatePythonOutputs(string $tmpVectorIndexPath): void
{
    // Ordered list of [path, error message]; the order decides which error surfaces first.
    $requiredOutputs = [
        [$tmpVectorIndexPath, 'Vector index tmp missing or empty'],
        [$this->vectorMetaPath, 'Vector meta missing or empty'],
    ];

    foreach ($requiredOutputs as [$path, $failureMessage]) {
        // filesize() is only consulted when the path is a regular file.
        $isUsable = is_file($path) && filesize($path) !== 0;

        if (!$isUsable) {
            throw new \RuntimeException($failureMessage);
        }
    }
}
|
||||
|
||||
/**
 * Moves the temporary vector index onto the final index path.
 *
 * Uses rename() to replace $this->vectorIndexPath with the temporary file.
 * NOTE(review): the method name implies the move is atomic; that presumably
 * requires both paths to be on the same filesystem — confirm.
 *
 * @param string $tmpVectorIndexPath Path of the temporary vector index file.
 *
 * @throws \RuntimeException If rename() reports failure.
 */
private function atomicSwitch(string $tmpVectorIndexPath): void
{
    $switched = rename($tmpVectorIndexPath, $this->vectorIndexPath);

    if ($switched) {
        return;
    }

    throw new \RuntimeException('Atomic switch failed for vector index');
}
|
||||
|
||||
private function runProcess(Process $process, ?string $logPath): void
|
||||
{
|
||||
if ($logPath !== null) {
|
||||
|
||||
Reference in New Issue
Block a user