132 lines
3.6 KiB
Python
132 lines
3.6 KiB
Python
#!/usr/bin/env python3
"""Build a FAISS inner-product index from an NDJSON chunk file.

Each non-empty line of the input is expected to be a JSON object carrying a
``text`` and a ``chunk_id`` field.  The texts are embedded with a
SentenceTransformer model (embeddings are normalised, and the index is a
``faiss.IndexFlatIP``), the index is written to ``--out`` and the chunk ids
are written, in the same order as the vectors, to ``<out>.meta.json``.

Exit codes:
    0   success, or nothing to index (no files are written in that case)
    10  the ``faiss`` module could not be imported
    11  the ``sentence-transformers`` module could not be imported
    20  the NDJSON input file does not exist
"""

import sys
import json
import argparse
from pathlib import Path

DEFAULT_MODEL = "all-MiniLM-L6-v2"
ENCODE_BATCH_SIZE = 64

EXIT_OK = 0
EXIT_NO_FAISS = 10
EXIT_NO_SENTENCE_TRANSFORMERS = 11
EXIT_INDEX_MISSING = 20


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list without the program name; ``None`` means
            ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with ``index``, ``out`` and ``model``.
    """
    parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
    parser.add_argument("--index", required=True, help="Path to index.ndjson")
    parser.add_argument("--out", required=True, help="Path to output vector.index (tmp)")
    parser.add_argument("--model", default=DEFAULT_MODEL, help="SentenceTransformer model")
    return parser.parse_args(argv)


def load_chunks(index_path):
    """Stream an NDJSON file and collect the usable chunks.

    Blank lines are ignored.  A line is skipped (and counted) when it is not
    valid JSON, is not a JSON object, has no non-empty string ``text``, or has
    a missing/empty ``chunk_id``.  A ``chunk_id`` of ``0`` is a valid id and
    is kept.

    Args:
        index_path: Path of the NDJSON file (UTF-8).

    Returns:
        A tuple ``(texts, ids, skipped)`` where ``texts[i]`` belongs to
        ``ids[i]`` and ``skipped`` is the number of non-blank lines that were
        dropped.
    """
    texts = []
    ids = []
    skipped = 0

    with open(index_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            try:
                entry = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                skipped += 1
                continue

            # Valid JSON that is not an object (list, number, ...) has no fields.
            if not isinstance(entry, dict):
                skipped += 1
                continue

            text = entry.get("text")
            chunk_id = entry.get("chunk_id")

            # Compare the id explicitly instead of by truthiness so that a
            # numeric id of 0 is not mistaken for "missing".
            if not isinstance(text, str) or not text or chunk_id is None or chunk_id == "":
                skipped += 1
                continue

            texts.append(text)
            ids.append(chunk_id)

    return texts, ids, skipped


def main(argv=None):
    """Run the index build and return the process exit code.

    Args:
        argv: Argument list without the program name; ``None`` means
            ``sys.argv[1:]``.

    Returns:
        One of the exit codes listed in the module docstring.
    """
    args = parse_args(argv)

    index_path = Path(args.index).resolve()
    out_path = Path(args.out).resolve()

    # -----------------------------------------------------
    # Dependency checks
    # -----------------------------------------------------
    # Deliberately broad: the import of a native extension can fail with more
    # than ImportError, and the caller relies on the dedicated exit codes.
    try:
        import faiss
    except Exception:
        print("ERROR: Python module 'faiss' not found.")
        return EXIT_NO_FAISS

    try:
        from sentence_transformers import SentenceTransformer
    except Exception:
        print("ERROR: Python module 'sentence-transformers' not found.")
        return EXIT_NO_SENTENCE_TRANSFORMERS

    import numpy as np

    # -----------------------------------------------------
    # File checks
    # -----------------------------------------------------
    if not index_path.is_file():
        print(f"ERROR: index.ndjson not found at {index_path}")
        return EXIT_INDEX_MISSING

    # -----------------------------------------------------
    # Load model
    # -----------------------------------------------------
    print(f"Loading embedding model: {args.model}")
    model = SentenceTransformer(args.model)

    # -----------------------------------------------------
    # Streaming read NDJSON
    # -----------------------------------------------------
    print("Reading NDJSON...")
    texts, ids, skipped = load_chunks(index_path)

    if skipped:
        print(f"Skipped {skipped} malformed or incomplete line(s).")

    if not texts:
        print("No chunks found. Removing vector index.")
        # Nothing is deleted here: the final index is only removed later,
        # atomically, on the PHP side.
        return EXIT_OK

    print(f"Loaded {len(texts)} chunks.")

    # -----------------------------------------------------
    # Build embeddings
    # -----------------------------------------------------
    print("Encoding embeddings...")
    embeddings = model.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=ENCODE_BATCH_SIZE,
    )

    # Single conversion to a contiguous float32 matrix before handing it to FAISS.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    dim = embeddings.shape[1]
    print(f"Embedding dimension: {dim}")

    # -----------------------------------------------------
    # Build FAISS index
    # -----------------------------------------------------
    print("Building FAISS index...")
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)

    # Ensure output directory exists
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # -----------------------------------------------------
    # Write FAISS index (tmp)
    # -----------------------------------------------------
    print(f"Writing FAISS index to {out_path}")
    faiss.write_index(index, str(out_path))

    # -----------------------------------------------------
    # Write ID mapping meta (tmp)
    # -----------------------------------------------------
    # Position i in this list is the chunk id of vector i in the index.
    meta_tmp_path = Path(str(out_path) + ".meta.json")

    with open(meta_tmp_path, "w", encoding="utf-8") as f:
        json.dump(ids, f)

    print(f"Indexed {len(ids)} chunks successfully.")
    return EXIT_OK


if __name__ == "__main__":
    sys.exit(main())