stash light
This commit is contained in:
164
src/Vector/VectorIndexBuilder.php
Normal file
164
src/Vector/VectorIndexBuilder.php
Normal file
@@ -0,0 +1,164 @@
|
||||
<?php
|
||||
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Vector;
|
||||
|
||||
use Symfony\Component\Process\Exception\ProcessFailedException;
|
||||
use Symfony\Component\Process\Process;
|
||||
|
||||
/**
 * Rebuilds the FAISS vector index by running the Python ingest script.
 *
 * The script is asked to write into a temporary file next to the final index;
 * only after it has finished successfully is that file renamed over the
 * final index path.
 */
final class VectorIndexBuilder
{
    /** Executable used to launch the ingest script. */
    private string $pythonBin;

    /** Path of the ingest script below the project directory. */
    private string $scriptPath;

    /** Path of the NDJSON input file below the project directory. */
    private string $indexNdjsonPath;

    /** Path of the final vector index below the project directory. */
    private string $vectorIndexPath;

    /** Timeout handed to the Symfony process, in seconds. */
    private int $timeoutSeconds;

    /**
     * @param string $projectDir              Project root; a trailing "/" is ignored
     * @param string $pythonBin               Python executable to run
     * @param string $relativeScriptPath      Script path relative to the project root
     * @param string $relativeIndexNdjsonPath NDJSON input path relative to the project root
     * @param string $relativeVectorIndexPath Vector index path relative to the project root
     * @param int    $timeoutSeconds          Process timeout in seconds
     */
    public function __construct(
        string $projectDir,
        string $pythonBin = 'python3',
        string $relativeScriptPath = '/vector/vector_ingest.py',
        string $relativeIndexNdjsonPath = '/var/knowledge/index.ndjson',
        string $relativeVectorIndexPath = '/var/knowledge/vector.index',
        int $timeoutSeconds = 600
    ) {
        $base = rtrim($projectDir, '/');

        $this->pythonBin = $pythonBin;
        $this->scriptPath = self::joinPath($base, $relativeScriptPath);
        $this->indexNdjsonPath = self::joinPath($base, $relativeIndexNdjsonPath);
        $this->vectorIndexPath = self::joinPath($base, $relativeVectorIndexPath);
        $this->timeoutSeconds = $timeoutSeconds;
    }

    public function getIndexNdjsonPath(): string
    {
        return $this->indexNdjsonPath;
    }

    public function getVectorIndexPath(): string
    {
        return $this->vectorIndexPath;
    }

    public function getScriptPath(): string
    {
        return $this->scriptPath;
    }

    /**
     * Rebuild the FAISS index from index.ndjson.
     *
     * Expectation: the Python script writes to "<vector index path>.tmp";
     * this method then renames that file over the final index path.
     * The temporary file is removed again whenever the rebuild does not complete.
     *
     * @param string|null $logPath Optional: file to which stdout/stderr are appended
     *
     * @throws \RuntimeException       When an input file is missing, the target directory
     *                                 cannot be created, the output is missing/empty,
     *                                 or the final rename fails
     * @throws ProcessFailedException When the Python process exits unsuccessfully
     */
    public function rebuildFromNdjson(?string $logPath = null): void
    {
        if (!is_file($this->scriptPath)) {
            throw new \RuntimeException('vector_ingest.py not found at: ' . $this->scriptPath);
        }

        if (!is_file($this->indexNdjsonPath)) {
            throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath);
        }

        $dir = \dirname($this->vectorIndexPath);
        // The trailing is_dir() covers a concurrent creation between check and mkdir().
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            throw new \RuntimeException('Unable to create vector index directory: ' . $dir);
        }

        $tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';

        // Remove a leftover temporary file from an earlier run (safety).
        $this->removeIfPresent($tmpVectorIndexPath);

        // Standardised CLI arguments of the ingest script:
        //   --index <path-to-index.ndjson>
        //   --out   <path-to-vector.index.tmp>
        $process = new Process([
            $this->pythonBin,
            $this->scriptPath,
            '--index', $this->indexNdjsonPath,
            '--out', $tmpVectorIndexPath,
        ]);
        $process->setTimeout($this->timeoutSeconds);

        try {
            $this->runProcess($process, $logPath);

            // The file was written by another process: drop any cached stat data first.
            clearstatcache(true, $tmpVectorIndexPath);
            $size = is_file($tmpVectorIndexPath) ? filesize($tmpVectorIndexPath) : false;

            // Python must have produced a non-empty temporary file.
            if ($size === false || $size === 0) {
                throw new \RuntimeException(
                    'Vector index rebuild failed: tmp output missing or empty: ' . $tmpVectorIndexPath
                );
            }

            // Switch the new index into place.
            $this->atomicSwitch($tmpVectorIndexPath, $this->vectorIndexPath);
        } finally {
            // After a successful rename the file is gone and this is a no-op;
            // on any failure it keeps a partial index from lingering on disk.
            $this->removeIfPresent($tmpVectorIndexPath);
        }
    }

    // -------------------------
    // Internals
    // -------------------------

    /**
     * Join the project base directory and a relative path with exactly one "/".
     *
     * Accepts relative paths with or without a leading slash.
     */
    private static function joinPath(string $base, string $relative): string
    {
        return $base . '/' . ltrim($relative, '/');
    }

    /**
     * Best-effort removal of a file; a failure to delete is deliberately ignored.
     */
    private function removeIfPresent(string $path): void
    {
        clearstatcache(true, $path);

        if (is_file($path)) {
            // "@": the file may vanish between the check and the unlink.
            @unlink($path);
        }
    }

    /**
     * Run the process, optionally streaming its output into the log file.
     *
     * @throws ProcessFailedException When the process does not exit successfully
     */
    private function runProcess(Process $process, ?string $logPath): void
    {
        $logging = $logPath !== null;

        if ($logging) {
            $startedAt = (new \DateTimeImmutable())->format(DATE_ATOM);
            $this->appendLog($logPath, "\n=== VectorIndexBuilder START " . $startedAt . " ===\n");
            $this->appendLog($logPath, "CMD: " . $process->getCommandLine() . "\n");
        }

        // $type is Process::OUT or Process::ERR; both streams go to the same log.
        $streamToLog = $logging
            ? function (string $type, string $buffer) use ($logPath): void {
                $this->appendLog($logPath, $buffer);
            }
            : null;

        $process->run($streamToLog);

        if (!$process->isSuccessful()) {
            if ($logging) {
                $this->appendLog($logPath, "\n=== VectorIndexBuilder FAILED ===\n");
                $this->appendLog($logPath, "ExitCode: " . $process->getExitCode() . "\n");
                $this->appendLog($logPath, "STDERR:\n" . $process->getErrorOutput() . "\n");
            }

            throw new ProcessFailedException($process);
        }

        if ($logging) {
            $finishedAt = (new \DateTimeImmutable())->format(DATE_ATOM);
            $this->appendLog($logPath, "\n=== VectorIndexBuilder OK " . $finishedAt . " ===\n");
        }
    }

    /**
     * Append content to the log file, creating its directory when needed.
     *
     * Logging is best effort: if the log cannot be written the build must not
     * fail because of it, so errors are swallowed on purpose.
     */
    private function appendLog(string $logPath, string $content): void
    {
        $dir = \dirname($logPath);

        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            return;
        }

        @file_put_contents($logPath, $content, FILE_APPEND);
    }

    /**
     * Rename the temporary file over the final path.
     *
     * @throws \RuntimeException When the rename fails; the temporary file is removed first
     */
    private function atomicSwitch(string $tmp, string $final): void
    {
        if (rename($tmp, $final)) {
            return;
        }

        @unlink($tmp);

        throw new \RuntimeException('Atomic switch failed for vector.index');
    }
}
|
||||
@@ -2,88 +2,125 @@
|
||||
|
||||
import sys
import json
import argparse
from pathlib import Path

# ---------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------
parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
parser.add_argument("--index", required=True, help="Path to index.ndjson")
parser.add_argument("--out", required=True, help="Path to output vector.index")
parser.add_argument("--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model")
args = parser.parse_args()

index_path = Path(args.index).resolve()
out_path = Path(args.out).resolve()

# ---------------------------------------------------------
# Dependency checks
# ---------------------------------------------------------
# Each optional dependency gets its own exit code so the caller can tell
# them apart. Diagnostics go to stderr, leaving stdout for progress output.
try:
    import faiss
except Exception:
    print("ERROR: Python module 'faiss' not found.", file=sys.stderr)
    sys.exit(10)

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    print("ERROR: Python module 'sentence-transformers' not found.", file=sys.stderr)
    sys.exit(11)

import numpy as np

# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
if not index_path.is_file():
    print(f"ERROR: index.ndjson not found at {index_path}", file=sys.stderr)
    sys.exit(20)

# ---------------------------------------------------------
# Load model
# ---------------------------------------------------------
print(f"Loading embedding model: {args.model}")
model = SentenceTransformer(args.model)

# ---------------------------------------------------------
# Streaming read NDJSON
# ---------------------------------------------------------
texts = []
ids = []
skipped = 0  # non-blank lines that could not be used

print("Reading NDJSON...")

with open(index_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        try:
            entry = json.loads(line)
        except ValueError:
            # Malformed JSON: skip the line, but keep count so it is not silent.
            skipped += 1
            continue

        # A line may be valid JSON without being an object (e.g. a list or a
        # number); such values have no .get() and cannot describe a chunk.
        if not isinstance(entry, dict):
            skipped += 1
            continue

        text = entry.get("text")
        chunk_id = entry.get("chunk_id")

        if not isinstance(text, str) or not text or not chunk_id:
            skipped += 1
            continue

        texts.append(text)
        ids.append(chunk_id)

if skipped:
    print(f"WARNING: Skipped {skipped} invalid or incomplete line(s).", file=sys.stderr)

if not texts:
    print("ERROR: No valid chunks found in index.ndjson", file=sys.stderr)
    sys.exit(21)

print(f"Loaded {len(texts)} chunks.")

# ---------------------------------------------------------
# Build embeddings
# ---------------------------------------------------------
print("Encoding embeddings...")
embeddings = model.encode(
    texts,
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=64,
)

embeddings = np.array(embeddings).astype("float32")

dim = embeddings.shape[1]
print(f"Embedding dimension: {dim}")

# ---------------------------------------------------------
# Build FAISS index
# ---------------------------------------------------------
print("Building FAISS index...")
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Ensure output directory exists
out_path.parent.mkdir(parents=True, exist_ok=True)

print(f"Writing FAISS index to {out_path}")
faiss.write_index(index, str(out_path))

# ---------------------------------------------------------
# Write ID mapping meta
# ---------------------------------------------------------
# with_suffix() replaces only the last suffix of the file name:
#   --out vector.index.tmp -> vector.index.meta.json
#   --out vector.index     -> vector.meta.json
# NOTE(review): the metadata file is written straight to that name, so a
# caller that renames --out afterwards does not move it - confirm readers
# expect this location.
meta_path = out_path.with_suffix(".meta.json")

# ids[i] is the chunk_id of the i-th vector added to the index above.
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(ids, f)

print(f"Indexed {len(ids)} chunks successfully.")
sys.exit(0)
|
||||
|
||||
Reference in New Issue
Block a user