phase a audit
This commit is contained in:
134
python/vector/vector_ingest.py
Normal file
134
python/vector/vector_ingest.py
Normal file
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
"""Build a FAISS inner-product index from an NDJSON file of text chunks.

Every non-blank input line is expected to be a JSON object carrying a
``text`` string and a ``chunk_id``.  The texts are embedded with a
SentenceTransformer model (normalized embeddings), added to a
``faiss.IndexFlatIP`` and written to ``--out``.  The chunk ids are written,
in the same order as the vectors, to a sibling ``.meta.json`` file.

Exit codes:
    0   success (also when no chunks were found; stale outputs are removed)
    10  the ``faiss`` module could not be imported
    11  the ``sentence_transformers`` module could not be imported
    20  the NDJSON input file does not exist
"""

import argparse
import json
import sys
from pathlib import Path

EXIT_OK = 0
EXIT_NO_FAISS = 10
EXIT_NO_SENTENCE_TRANSFORMERS = 11
EXIT_NO_INPUT = 20


def build_parser():
    """Return the command-line parser for this script."""
    parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
    parser.add_argument("--index", required=True, help="Path to index.ndjson")
    parser.add_argument("--out", required=True, help="Path to output vector.index")
    parser.add_argument("--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model")
    return parser


def meta_path_for(out_path):
    """Return the id-mapping path that belongs to the index at *out_path*.

    The final suffix of *out_path* is replaced by ``.meta.json``
    (``vector.index`` -> ``vector.meta.json``).
    """
    return Path(out_path).with_suffix(".meta.json")


def parse_chunks(lines):
    """Extract ``(texts, ids, skipped)`` from an iterable of NDJSON lines.

    Blank lines are ignored.  A line is skipped (and counted in *skipped*)
    when it is not valid JSON, is not a JSON object, has a missing, empty
    or non-string ``text``, or has a missing/falsy ``chunk_id``.

    Args:
        lines: Iterable of strings, e.g. an open text file.

    Returns:
        A tuple ``(texts, ids, skipped)`` where ``texts[i]`` belongs to
        ``ids[i]`` and *skipped* is the number of rejected non-blank lines.
    """
    texts = []
    ids = []
    skipped = 0

    for raw in lines:
        line = raw.strip()
        if not line:
            continue

        try:
            entry = json.loads(line)
        except ValueError:  # json.JSONDecodeError is a ValueError
            skipped += 1
            continue

        # Valid JSON that is not an object (list, string, number, ...) has
        # no .get(); skip it instead of crashing.
        if not isinstance(entry, dict):
            skipped += 1
            continue

        text = entry.get("text")
        chunk_id = entry.get("chunk_id")

        # Only non-empty strings are handed on to the encoder.
        if not isinstance(text, str) or not text or not chunk_id:
            skipped += 1
            continue

        texts.append(text)
        ids.append(chunk_id)

    return texts, ids, skipped


def _unlink_if_present(path):
    """Delete *path*, doing nothing when it does not exist."""
    try:
        path.unlink()
    except FileNotFoundError:
        pass


def remove_outputs(out_path):
    """Delete the index at *out_path* and its ``.meta.json`` companion."""
    _unlink_if_present(out_path)
    _unlink_if_present(meta_path_for(out_path))


def write_outputs(index, ids, out_path):
    """Write *index* to *out_path* and *ids* to the matching meta file.

    Both files are first written next to their destination under a
    ``.tmp`` name and only moved into place once both writes succeeded, so
    a failure part-way through does not leave a new index paired with an
    old id mapping.
    """
    import faiss  # already verified importable by main()

    meta_path = meta_path_for(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    tmp_index = out_path.with_name(out_path.name + ".tmp")
    tmp_meta = meta_path.with_name(meta_path.name + ".tmp")

    try:
        faiss.write_index(index, str(tmp_index))
        with open(tmp_meta, "w", encoding="utf-8") as f:
            json.dump(ids, f)

        tmp_index.replace(out_path)
        tmp_meta.replace(meta_path)
    finally:
        # After a successful replace the temp names are gone; on failure
        # this removes whatever partial file was left behind.
        _unlink_if_present(tmp_index)
        _unlink_if_present(tmp_meta)


def main(argv=None):
    """Run the ingest and return the process exit code.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    index_path = Path(args.index).resolve()
    out_path = Path(args.out).resolve()

    # Dependency checks.  The broad ``except Exception`` is deliberate:
    # any failure while importing these packages must map to the
    # documented exit code rather than a traceback.
    try:
        import faiss
    except Exception:
        print("ERROR: Python module 'faiss' not found.")
        return EXIT_NO_FAISS

    try:
        from sentence_transformers import SentenceTransformer
    except Exception:
        print("ERROR: Python module 'sentence-transformers' not found.")
        return EXIT_NO_SENTENCE_TRANSFORMERS

    import numpy as np

    if not index_path.is_file():
        print(f"ERROR: index.ndjson not found at {index_path}")
        return EXIT_NO_INPUT

    print("Reading NDJSON...")
    with open(index_path, "r", encoding="utf-8") as f:
        texts, ids, skipped = parse_chunks(f)

    if skipped:
        print(f"WARNING: skipped {skipped} invalid NDJSON line(s).")

    if not texts:
        print("No chunks found. Removing vector index.")
        remove_outputs(out_path)
        return EXIT_OK

    print(f"Loaded {len(texts)} chunks.")

    # The model is loaded only once there is something to encode; loading
    # is expensive and pointless when the input holds no chunks.
    print(f"Loading embedding model: {args.model}")
    model = SentenceTransformer(args.model)

    print("Encoding embeddings...")
    embeddings = model.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=64,
    )
    # FAISS takes float32 data; this converts without a redundant copy.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    dim = embeddings.shape[1]
    print(f"Embedding dimension: {dim}")

    print("Building FAISS index...")
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)

    print(f"Writing FAISS index to {out_path}")
    write_outputs(index, ids, out_path)

    print(f"Indexed {len(ids)} chunks successfully.")
    return EXIT_OK


if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user