#!/usr/bin/env python3
"""Build a FAISS vector index from an NDJSON file of text chunks.

Each non-blank input line is expected to be a JSON object with a ``text``
and a ``chunk_id`` field.  The texts are embedded with a SentenceTransformer
model, stored in a flat inner-product FAISS index, and the chunk ids are
written (in index row order) to a ``.meta.json`` file next to the index.

Exit codes:
    0   success (including the "no chunks" case, which removes old outputs)
    10  the ``faiss`` module could not be imported
    11  the ``sentence_transformers`` module could not be imported
    20  the NDJSON input file does not exist
"""

import argparse
import json
import sys
from pathlib import Path
from typing import Any, List, Optional, Tuple


def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command line (``sys.argv[1:]`` when *argv* is None)."""
    parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
    parser.add_argument("--index", required=True, help="Path to index.ndjson")
    parser.add_argument("--out", required=True, help="Path to output vector.index")
    parser.add_argument(
        "--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model"
    )
    return parser.parse_args(argv)


def meta_path_for(out_path: Path) -> Path:
    """Return the id-mapping file path that accompanies *out_path*."""
    return out_path.with_suffix(".meta.json")


def read_chunks(index_path: Path) -> Tuple[List[Any], List[Any], int]:
    """Stream the NDJSON file and collect usable chunks.

    Blank lines are ignored.  A line is skipped (and counted) when it is not
    valid JSON, is valid JSON but not an object, or has a missing/empty
    ``text`` or ``chunk_id``.

    Returns:
        ``(texts, ids, skipped)`` where ``texts[i]`` belongs to ``ids[i]`` and
        ``skipped`` is the number of non-blank lines that were not used.
    """
    texts: List[Any] = []
    ids: List[Any] = []
    skipped = 0

    # "utf-8-sig" decodes plain UTF-8 unchanged but strips a leading BOM, which
    # would otherwise make the first record fail to parse.
    with open(index_path, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            try:
                entry = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                skipped += 1
                continue

            # Valid JSON that is not an object (list, string, number, ...)
            # has no .get(); treat it like any other unusable line.
            if not isinstance(entry, dict):
                skipped += 1
                continue

            text = entry.get("text")
            chunk_id = entry.get("chunk_id")
            if not text or not chunk_id:
                skipped += 1
                continue

            texts.append(text)
            ids.append(chunk_id)

    return texts, ids, skipped


def remove_outputs(out_path: Path) -> None:
    """Delete the index file and its meta file if they exist."""
    for path in (out_path, meta_path_for(out_path)):
        if path.exists():
            path.unlink()


def main(argv: Optional[List[str]] = None) -> int:
    """Run the index build and return the process exit code."""
    args = parse_args(argv)
    index_path = Path(args.index).resolve()
    out_path = Path(args.out).resolve()

    # -----------------------------------------------------------------
    # Dependency checks (imported lazily so missing packages produce a
    # dedicated exit code instead of a traceback)
    # -----------------------------------------------------------------
    try:
        import faiss
    except Exception as exc:  # a broken native build may raise non-ImportError
        print("ERROR: Python module 'faiss' not found.")
        print(f"  import failed with: {exc!r}", file=sys.stderr)
        return 10

    try:
        from sentence_transformers import SentenceTransformer
    except Exception as exc:
        print("ERROR: Python module 'sentence-transformers' not found.")
        print(f"  import failed with: {exc!r}", file=sys.stderr)
        return 11

    import numpy as np

    # -----------------------------------------------------------------
    # File checks
    # -----------------------------------------------------------------
    if not index_path.is_file():
        print(f"ERROR: index.ndjson not found at {index_path}")
        return 20

    # -----------------------------------------------------------------
    # Streaming read NDJSON
    # -----------------------------------------------------------------
    print("Reading NDJSON...")
    texts, ids, skipped = read_chunks(index_path)
    if skipped:
        print(
            f"WARNING: skipped {skipped} malformed or incomplete line(s).",
            file=sys.stderr,
        )

    if not texts:
        print("No chunks found. Removing vector index.")
        remove_outputs(out_path)
        return 0

    print(f"Loaded {len(texts)} chunks.")

    # -----------------------------------------------------------------
    # Load model (only once we know there is something to embed, so an
    # empty input never pays for loading the model)
    # -----------------------------------------------------------------
    print(f"Loading embedding model: {args.model}")
    model = SentenceTransformer(args.model)

    # -----------------------------------------------------------------
    # Build embeddings
    # -----------------------------------------------------------------
    print("Encoding embeddings...")
    embeddings = model.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=64,
    )
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    dim = embeddings.shape[1]
    print(f"Embedding dimension: {dim}")

    # -----------------------------------------------------------------
    # Build FAISS index (inner product over normalized vectors)
    # -----------------------------------------------------------------
    print("Building FAISS index...")
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)

    # Ensure output directory exists
    out_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Writing FAISS index to {out_path}")
    faiss.write_index(index, str(out_path))

    # -----------------------------------------------------------------
    # Write ID mapping meta: position i in this list is the chunk id of
    # the i-th vector added to the index.
    # -----------------------------------------------------------------
    with open(meta_path_for(out_path), "w", encoding="utf-8") as f:
        json.dump(ids, f)

    print(f"Indexed {len(ids)} chunks successfully.")
    return 0


if __name__ == "__main__":
    sys.exit(main())