Files
MtoRagSystem/python/vector/vector_ingest.py
2026-02-22 18:04:53 +01:00

132 lines
3.6 KiB
Python

#!/usr/bin/env python3
import sys
import json
import argparse
from pathlib import Path
# ---------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------
# Each entry is (flag, keyword arguments for add_argument). --index and --out
# are mandatory; --model falls back to "all-MiniLM-L6-v2" when omitted.
_CLI_OPTIONS = (
    ("--index", {"required": True, "help": "Path to index.ndjson"}),
    ("--out", {"required": True, "help": "Path to output vector.index (tmp)"}),
    (
        "--model",
        {"default": "all-MiniLM-L6-v2", "help": "SentenceTransformer model"},
    ),
)
parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
for _flag, _spec in _CLI_OPTIONS:
    parser.add_argument(_flag, **_spec)
args = parser.parse_args()
# Resolve both paths once so every later message and file operation works
# with the same absolute location.
index_path, out_path = (Path(raw).resolve() for raw in (args.index, args.out))
# ---------------------------------------------------------
# Dependency checks
# ---------------------------------------------------------
# The two heavy third-party packages are imported one at a time so that each
# failure maps to its own exit code: 10 for faiss, 11 for
# sentence-transformers. The catch is deliberately broad: any error raised
# while importing (not only a missing package) keeps the same stdout message
# and exit code. The underlying cause is additionally written to stderr,
# because "not found" alone is misleading when the package is installed but
# fails to load.
try:
    import faiss
except Exception as exc:
    print("ERROR: Python module 'faiss' not found.")
    print(f"Import failure detail: {exc!r}", file=sys.stderr)
    sys.exit(10)
try:
    from sentence_transformers import SentenceTransformer
except Exception as exc:
    print("ERROR: Python module 'sentence-transformers' not found.")
    print(f"Import failure detail: {exc!r}", file=sys.stderr)
    sys.exit(11)
# numpy is imported without a guard, exactly as before; the names bound by
# the guarded imports above are reused, so no second import is needed.
import numpy as np
# ---------------------------------------------------------
# Input file check
# ---------------------------------------------------------
# Stop with exit code 20 unless the resolved --index path is an existing file.
index_file_present = index_path.is_file()
if not index_file_present:
    print(f"ERROR: index.ndjson not found at {index_path}")
    sys.exit(20)
# ---------------------------------------------------------
# Embedding model
# ---------------------------------------------------------
# Announce the model name first, then construct the SentenceTransformer
# instance that is used for encoding later in the script.
model_name = args.model
print(f"Loading embedding model: {model_name}")
model = SentenceTransformer(model_name)
# ---------------------------------------------------------
# Streaming read NDJSON
# ---------------------------------------------------------
# texts and ids are filled in lockstep: texts[i] and ids[i] always come from
# the same NDJSON record. Unusable lines are skipped and counted instead of
# aborting the run.
texts = []
ids = []
skipped_lines = 0
print("Reading NDJSON...")
with open(index_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines are not records and are not counted as skipped.
            continue
        try:
            entry = json.loads(line)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError subclass; RecursionError
            # covers pathologically nested input.
            skipped_lines += 1
            continue
        if not isinstance(entry, dict):
            # Valid JSON that is not an object (list, string, number, ...)
            # has no .get() and would otherwise raise AttributeError.
            skipped_lines += 1
            continue
        text = entry.get("text")
        chunk_id = entry.get("chunk_id")
        # Only non-empty string texts are handed to the encoder.
        # NOTE(review): any falsy chunk_id is rejected, which includes an
        # integer id of 0 -- confirm chunk ids can never be 0.
        if not isinstance(text, str) or not text or not chunk_id:
            skipped_lines += 1
            continue
        texts.append(text)
        ids.append(chunk_id)
if skipped_lines:
    # Reported on stderr so the stdout output stays as it was.
    print(
        f"WARNING: skipped {skipped_lines} unusable NDJSON line(s).",
        file=sys.stderr,
    )
if not texts:
    print("No chunks found. Removing vector index.")
    # Nothing is deleted here: the final index is removed later, atomically,
    # on the PHP side.
    sys.exit(0)
print(f"Loaded {len(texts)} chunks.")
# ---------------------------------------------------------
# Build embeddings
# ---------------------------------------------------------
print("Encoding embeddings...")
# Options forwarded to model.encode(): request normalized vectors, show a
# progress bar, and encode 64 texts per batch.
encode_options = {
    "normalize_embeddings": True,
    "show_progress_bar": True,
    "batch_size": 64,
}
# Convert the encoder output to a float32 NumPy array in one expression so
# no extra reference to the intermediate result is kept alive.
embeddings = np.array(model.encode(texts, **encode_options)).astype("float32")
# The second axis length is used as the vector dimension.
dim = embeddings.shape[1]
print(f"Embedding dimension: {dim}")
# ---------------------------------------------------------
# Build FAISS index
# ---------------------------------------------------------
print("Building FAISS index...")
# Flat inner-product index sized to the embedding dimension; all vectors are
# added in a single call.
index = faiss.IndexFlatIP(dim)
index.add(embeddings)
# Create the output directory (including missing parents) if necessary.
target_dir = out_path.parent
target_dir.mkdir(parents=True, exist_ok=True)
# ---------------------------------------------------------
# Write FAISS index (tmp)
# ---------------------------------------------------------
# faiss.write_index is given the path as a plain string.
target_file = str(out_path)
print(f"Writing FAISS index to {target_file}")
faiss.write_index(index, target_file)
# ---------------------------------------------------------
# Write ID mapping meta (tmp)
# ---------------------------------------------------------
# The chunk id list is stored as a JSON array next to the index file, under
# the index path with ".meta.json" appended.
meta_tmp_path = Path(f"{out_path}.meta.json")
with open(meta_tmp_path, "w", encoding="utf-8") as meta_file:
    json.dump(ids, meta_file)
chunk_total = len(ids)
print(f"Indexed {chunk_total} chunks successfully.")
sys.exit(0)