add tagging
This commit is contained in:
126
src/Vector/vector_ingest_tags.py
Normal file
126
src/Vector/vector_ingest_tags.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
"""Build a FAISS inner-product index over tag texts read from an NDJSON file.

Writes two files: the FAISS index at ``out_index_path`` and a JSON list of
tag ids at ``out_index_path + ".meta.json"``. Position ``i`` in the JSON list
is the tag id of vector ``i`` in the index.
"""

import sys
import json
from pathlib import Path

# ---------------------------------------------------------
# Positional args (aligned with PHP builder exec call)
# ---------------------------------------------------------
# 1 tags.ndjson
# 2 out_index_path (can be .tmp)
# 3 model
# Example:
# python vector_ingest_tags.py /var/knowledge/tags.ndjson /var/knowledge/vector_tags.index.tmp all-MiniLM-L6-v2
# ---------------------------------------------------------

# Texts longer than this many characters are truncated before embedding.
MAX_TEXT_CHARS = 4000

# Batch size handed to SentenceTransformer.encode().
ENCODE_BATCH_SIZE = 64


def read_tag_entries(tags_path):
    """Stream an NDJSON file and collect the usable (text, tag_id) pairs.

    A line is skipped when it is blank, is not valid JSON, is valid JSON but
    not an object, or has a falsy ``text`` or ``tag_id`` value.

    Args:
        tags_path: Path of the NDJSON file (UTF-8).

    Returns:
        A ``(texts, ids)`` tuple of two equally long lists of ``str``.
        Each text is truncated to ``MAX_TEXT_CHARS`` characters.
    """
    texts = []
    ids = []

    with open(tags_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            try:
                entry = json.loads(line)
            except (ValueError, RecursionError):
                # Malformed line: best-effort ingest, keep going.
                continue

            # A line such as `[1, 2]` or `"foo"` is valid JSON but has no
            # .get(); without this guard it would abort the whole ingest.
            if not isinstance(entry, dict):
                continue

            text = entry.get("text")
            tag_id = entry.get("tag_id")
            if not text or not tag_id:
                continue

            texts.append(str(text)[:MAX_TEXT_CHARS])
            ids.append(str(tag_id))

    return texts, ids


def main(argv):
    """Run the ingest and return the process exit code.

    Args:
        argv: Full argument vector, ``argv[0]`` being the program name.

    Returns:
        0 on success (including "nothing to index"), 2 on bad usage,
        10 if ``faiss`` cannot be imported, 11 if ``sentence_transformers``
        cannot be imported, 20 if the tags file does not exist.
    """
    if len(argv) < 4:
        print("ERROR: usage: vector_ingest_tags.py <tags.ndjson> <out.index> <model>", file=sys.stderr)
        return 2

    tags_path = Path(argv[1]).resolve()
    out_path = Path(argv[2]).resolve()
    model_name = argv[3]

    meta_path = Path(str(out_path) + ".meta.json")  # vector_tags.index(.tmp).meta.json

    # ---------------------------------------------------------
    # Dependency checks
    # ---------------------------------------------------------
    # Deliberately broad: any import-time failure maps to the dedicated
    # exit code instead of a bare traceback.
    try:
        import faiss
    except Exception:
        print("ERROR: Python module 'faiss' not found.", file=sys.stderr)
        return 10

    try:
        from sentence_transformers import SentenceTransformer
    except Exception:
        print("ERROR: Python module 'sentence-transformers' not found.", file=sys.stderr)
        return 11

    import numpy as np

    # ---------------------------------------------------------
    # File checks
    # ---------------------------------------------------------
    if not tags_path.is_file():
        print(f"ERROR: tags.ndjson not found at {tags_path}", file=sys.stderr)
        return 20

    # Ensure output directory exists
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # ---------------------------------------------------------
    # Streaming read NDJSON
    # ---------------------------------------------------------
    texts, ids = read_tag_entries(tags_path)

    # If empty: remove outputs (tmp) and exit success
    if not texts:
        for stale in (out_path, meta_path):
            if stale.exists():
                stale.unlink()
        return 0

    # ---------------------------------------------------------
    # Load model (only once we know there is something to embed)
    # ---------------------------------------------------------
    model = SentenceTransformer(model_name)

    # ---------------------------------------------------------
    # Build embeddings
    # ---------------------------------------------------------
    embeddings = model.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=False,
        batch_size=ENCODE_BATCH_SIZE,
    )
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")
    dim = embeddings.shape[1]

    # ---------------------------------------------------------
    # Build FAISS index
    # ---------------------------------------------------------
    # Embeddings are normalized above, so inner product ranks by cosine
    # similarity.
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)
    faiss.write_index(index, str(out_path))

    # ---------------------------------------------------------
    # Write ID mapping meta
    # ---------------------------------------------------------
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(ids, f)

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
||||
103
src/Vector/vector_search_tags.py
Normal file
103
src/Vector/vector_search_tags.py
Normal file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python3
"""Query the tag FAISS index and print the best matches as JSON.

Stdout always receives exactly one JSON list (possibly ``[]``) and the exit
code is always 0, so the caller can decode the output unconditionally.
"""

import sys
import json
from pathlib import Path

# ---------------------------------------------------------
# Positional args (aligned with PHP client exec call)
# ---------------------------------------------------------
# 1 query
# 2 limit
# 3 index_path
# 4 meta_path
# 5 model
#
# Example:
# python vector_search_tags.py "foo" 8 /path/vector_tags.index /path/vector_tags.index.meta.json all-MiniLM-L6-v2
# ---------------------------------------------------------

# Used when the limit argument is not an integer.
DEFAULT_LIMIT = 5


def parse_limit(raw, default=DEFAULT_LIMIT):
    """Return ``raw`` as an int, or ``default`` if it is not an integer."""
    try:
        return int(raw)
    except (TypeError, ValueError):
        return default


def load_ids(meta_path):
    """Load the id list written next to the index.

    Args:
        meta_path: Path of the JSON file holding the list of tag ids.

    Returns:
        The decoded list, or ``[]`` when the file cannot be read, is not
        valid JSON, or does not contain a JSON list.
    """
    try:
        with open(meta_path, "r", encoding="utf-8") as f:
            ids = json.load(f)
    except (OSError, ValueError, RecursionError):
        return []
    return ids if isinstance(ids, list) else []


def collect_hits(scores, idxs, ids):
    """Pair search scores with tag ids, dropping out-of-range positions.

    Args:
        scores: Sequence of similarity scores for one query.
        idxs: Sequence of index positions, parallel to ``scores``.
        ids: List mapping index position to tag id.

    Returns:
        A list of ``{"tag_id": str, "score": float}`` dicts in input order.
        Positions below 0 or beyond ``len(ids)`` are skipped.
    """
    hits = []
    for score, idx in zip(scores, idxs):
        pos = int(idx)
        if pos < 0 or pos >= len(ids):
            continue
        hits.append({
            "tag_id": str(ids[pos]),
            "score": float(score),
        })
    return hits


def search(argv):
    """Run one query and return the list of hits.

    Every failure path returns ``[]`` rather than raising, so the caller
    always has a list to serialize.

    Args:
        argv: Full argument vector, ``argv[0]`` being the program name.

    Returns:
        A list of ``{"tag_id": str, "score": float}`` dicts.
    """
    if len(argv) < 6:
        return []

    query = argv[1]
    limit = parse_limit(argv[2])
    index_path = Path(argv[3]).resolve()
    meta_path = Path(argv[4]).resolve()
    model_name = argv[5]

    # ---------------------------------------------------------
    # Cheap checks first, before any heavy import or model load
    # ---------------------------------------------------------
    if limit <= 0:
        return []

    if not index_path.is_file() or not meta_path.is_file():
        # No tag index available => no routing
        return []

    ids = load_ids(meta_path)
    if not ids:
        return []

    # ---------------------------------------------------------
    # Dependency checks
    # ---------------------------------------------------------
    try:
        import faiss
        from sentence_transformers import SentenceTransformer
    except Exception:
        # keep stdout clean for caller
        return []

    # ---------------------------------------------------------
    # Load index + model, embed & search
    # ---------------------------------------------------------
    # A corrupt index, an unknown model name or a model/index dimension
    # mismatch must not leave the caller with empty stdout, so everything
    # here is guarded and reported on stderr only.
    try:
        index = faiss.read_index(str(index_path))

        # Asking for more neighbours than stored vectors only yields
        # padding entries, so clamp.
        k = min(limit, int(index.ntotal))
        if k <= 0:
            return []

        model = SentenceTransformer(model_name)
        qvec = model.encode([query], normalize_embeddings=True)
        scores, idxs = index.search(qvec, k)
    except Exception as exc:
        print(f"ERROR: tag search failed: {exc}", file=sys.stderr)
        return []

    return collect_hits(scores[0], idxs[0], ids)


def main(argv):
    """Print the search result as JSON on stdout and return exit code 0."""
    print(json.dumps(search(argv)))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
||||
Reference in New Issue
Block a user