# MtoRagSystem/python/vector/vector_ingest.py

#!/usr/bin/env python3

import sys
import json
import argparse
from pathlib import Path

# ---------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------
# Command-line interface: --index and --out are mandatory, --model is optional.
parser = argparse.ArgumentParser(
    description="Build FAISS index from NDJSON",
)
parser.add_argument(
    "--index",
    required=True,
    help="Path to index.ndjson",
)
parser.add_argument(
    "--out",
    required=True,
    help="Path to output vector.index (tmp)",
)
parser.add_argument(
    "--model",
    default="all-MiniLM-L6-v2",
    help="SentenceTransformer model",
)

args = parser.parse_args()

# Resolve both paths to absolute form once; the later sections read these names.
index_path, out_path = (Path(raw).resolve() for raw in (args.index, args.out))

# ---------------------------------------------------------
# Dependency checks
# ---------------------------------------------------------
# Import the heavy third-party packages one at a time so that a failing import
# ends the run with its own message and exit code (10 = faiss,
# 11 = sentence-transformers) instead of a traceback.
# NOTE(review): the broad `except Exception` is kept on purpose, so any
# import-time failure (not only a missing module) is reported as "not found".
try:
    import faiss
except Exception:
    print("ERROR: Python module 'faiss' not found.")
    sys.exit(10)

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    print("ERROR: Python module 'sentence-transformers' not found.")
    sys.exit(11)

# faiss and SentenceTransformer are already bound by the guarded imports above,
# so only numpy still needs importing here (unguarded).
import numpy as np

# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
# Stop with exit code 20 when --index does not point to an existing file.
index_is_file = index_path.is_file()
if not index_is_file:
    print(f"ERROR: index.ndjson not found at {index_path}")
    raise SystemExit(20)

# ---------------------------------------------------------
# Load model
# ---------------------------------------------------------
# Instantiate the SentenceTransformer selected with --model.
model_name = args.model
print(f"Loading embedding model: {model_name}")
model = SentenceTransformer(model_name)

# ---------------------------------------------------------
# Streaming read NDJSON
# ---------------------------------------------------------
# Parallel lists: texts[i] is the text to embed and ids[i] is its chunk_id.
texts = []
ids = []
skipped = 0  # non-blank lines that could not be turned into a chunk

print("Reading NDJSON...")

# Iterate the file line by line instead of loading it into memory at once.
with open(index_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        try:
            entry = json.loads(line)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # absurdly nested input. Skip the line, but keep count of it.
            skipped += 1
            continue

        # A line can be valid JSON without being an object (list, string,
        # number, null); such values have no .get() and would crash below.
        if not isinstance(entry, dict):
            skipped += 1
            continue

        text = entry.get("text")
        chunk_id = entry.get("chunk_id")

        # NOTE(review): any falsy value is rejected, so a numeric chunk_id
        # of 0 would be dropped as well - confirm ids never take that value.
        if not text or not chunk_id:
            skipped += 1
            continue

        texts.append(text)
        ids.append(chunk_id)

if skipped:
    # Surface dropped lines so a partially broken input does not go unnoticed.
    print(f"Skipped {skipped} unusable line(s).")

if not texts:
    print("No chunks found. Removing vector index.")

    # Nothing is deleted here; presumably the PHP caller removes the final
    # index later, atomically - verify against the caller.
    sys.exit(0)

print(f"Loaded {len(texts)} chunks.")

# ---------------------------------------------------------
# Build embeddings
# ---------------------------------------------------------
# Encode every chunk text in batches of 64. normalize_embeddings=True asks the
# model for unit-length vectors.
print("Encoding embeddings...")
embeddings = model.encode(
    texts,
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=64
)

# Hand FAISS a C-contiguous float32 matrix. ascontiguousarray only copies when
# the dtype or memory layout actually has to change, whereas
# np.array(...).astype(...) always duplicated the whole matrix twice.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Width of one embedding vector; the index is created with this dimension.
dim = embeddings.shape[1]
print(f"Embedding dimension: {dim}")

# ---------------------------------------------------------
# Build FAISS index
# ---------------------------------------------------------
# Build an IndexFlatIP (inner-product) index as wide as one embedding row and
# add all vectors in a single call.
print("Building FAISS index...")
vector_dim = embeddings.shape[1]
index = faiss.IndexFlatIP(vector_dim)
index.add(embeddings)

# Create the destination directory (and any missing parents) before writing.
target_dir = out_path.parent
target_dir.mkdir(exist_ok=True, parents=True)

# ---------------------------------------------------------
# Write FAISS index (tmp)
# ---------------------------------------------------------
# Serialise the in-memory index to the (temporary) output path.
index_file = str(out_path)
print(f"Writing FAISS index to {index_file}")
faiss.write_index(index, index_file)

# ---------------------------------------------------------
# Write ID mapping meta (tmp)
# ---------------------------------------------------------
# Sidecar file: the output path with ".meta.json" appended. It stores the
# chunk ids as one JSON array, in the order the chunks were read.
meta_tmp_path = Path(f"{out_path}.meta.json")
meta_tmp_path.write_text(json.dumps(ids), encoding="utf-8")

chunk_total = len(ids)
print(f"Indexed {chunk_total} chunks successfully.")
raise SystemExit(0)