add tagging

This commit is contained in:
team 1
2026-02-21 16:23:34 +01:00
parent 5a3852db12
commit cf5b473034
23 changed files with 1984 additions and 85 deletions

View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python3
import sys
import json
from pathlib import Path
# ---------------------------------------------------------
# Build a FAISS inner-product index over tag texts.
#
# Positional args (aligned with PHP builder exec call)
# ---------------------------------------------------------
#   1  tags.ndjson
#   2  out_index_path (can be .tmp)
#   3  model
# Example:
#   python vector_ingest_tags.py /var/knowledge/tags.ndjson /var/knowledge/vector_tags.index.tmp all-MiniLM-L6-v2
#
# Exit codes:
#    0  success (also when the input contains no usable tags)
#    2  wrong number of arguments
#   10  faiss could not be imported
#   11  sentence-transformers could not be imported
#   20  tags.ndjson does not exist
# ---------------------------------------------------------

# Texts longer than this many characters are truncated before embedding.
MAX_TEXT_CHARS = 4000


def read_tag_entries(tags_path, max_chars: int = MAX_TEXT_CHARS):
    """Stream an NDJSON file and collect the usable tag records.

    Each non-blank line is parsed as JSON. A line is skipped (best effort,
    never fatal) when it is not valid JSON, when it is valid JSON but not an
    object, or when its ``text`` or ``tag_id`` value is missing or falsy.

    Args:
        tags_path: Path of the NDJSON file, read as UTF-8.
        max_chars: Maximum number of characters kept from each text.

    Returns:
        A ``(texts, ids)`` pair of equally long lists of ``str``; ``ids[i]``
        is the tag id belonging to ``texts[i]``.
    """
    texts = []
    ids = []
    with open(tags_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except (ValueError, RecursionError):
                # Malformed line: skip it instead of aborting the ingest.
                continue
            # A valid JSON scalar/array has no .get(); treat it like any
            # other unusable line rather than crashing on AttributeError.
            if not isinstance(entry, dict):
                continue
            text = entry.get("text")
            tag_id = entry.get("tag_id")
            if not text or not tag_id:
                continue
            texts.append(str(text)[:max_chars])
            ids.append(str(tag_id))
    return texts, ids


def remove_outputs(*paths) -> None:
    """Delete each of the given ``Path`` objects, ignoring missing files."""
    for path in paths:
        try:
            path.unlink()
        except FileNotFoundError:
            pass


def main(argv=None) -> int:
    """Run the ingest and return the process exit code.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        One of the exit codes listed in the header comment.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 4:
        print("ERROR: usage: vector_ingest_tags.py <tags.ndjson> <out.index> <model>", file=sys.stderr)
        return 2

    tags_path = Path(args[1]).resolve()
    out_path = Path(args[2]).resolve()
    model_name = args[3]
    meta_path = Path(str(out_path) + ".meta.json")  # vector_tags.index(.tmp).meta.json

    # Dependency checks: imported lazily so that a missing package maps to a
    # dedicated exit code instead of a traceback. Any import-time failure is
    # reported the same way, hence the broad except at this boundary.
    try:
        import faiss
    except Exception:
        print("ERROR: Python module 'faiss' not found.", file=sys.stderr)
        return 10
    try:
        from sentence_transformers import SentenceTransformer
    except Exception:
        print("ERROR: Python module 'sentence-transformers' not found.", file=sys.stderr)
        return 11
    import numpy as np

    # File checks
    if not tags_path.is_file():
        print(f"ERROR: tags.ndjson not found at {tags_path}", file=sys.stderr)
        return 20
    # Ensure output directory exists
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Read the tags before loading the model: when there is nothing to embed
    # the (expensive) model load is skipped entirely.
    texts, ids = read_tag_entries(tags_path)

    # If empty: remove outputs (tmp) and exit success
    if not texts:
        remove_outputs(out_path, meta_path)
        return 0

    # Build embeddings. They are L2-normalised, so the inner product used by
    # the index below equals cosine similarity.
    model = SentenceTransformer(model_name)
    embeddings = model.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=False,
        batch_size=64,
    )
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")
    dim = embeddings.shape[1]

    # Build FAISS index
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)
    faiss.write_index(index, str(out_path))

    # Write ID mapping meta: position i in this list is the tag id of
    # vector i in the index.
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(ids, f)
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
import sys
import json
from pathlib import Path
# ---------------------------------------------------------
# Look up the tags most similar to a query in a FAISS index.
#
# Positional args (aligned with PHP client exec call)
# ---------------------------------------------------------
#   1  query
#   2  limit
#   3  index_path
#   4  meta_path
#   5  model
#
# Example:
#   python vector_search_tags.py "foo" 8 /path/vector_tags.index /path/vector_tags.index.meta.json all-MiniLM-L6-v2
#
# Output: one JSON array on stdout, each item {"tag_id": str, "score": float}.
# Every handled failure (bad arguments, missing dependency, missing or
# unreadable index/meta) prints "[]" and exits 0 to keep stdout clean for
# the caller.
# ---------------------------------------------------------

# Used when the limit argument is not an integer.
DEFAULT_LIMIT = 5


def parse_limit(raw, default: int = DEFAULT_LIMIT) -> int:
    """Return ``raw`` converted to ``int``, or ``default`` if it is not one."""
    try:
        return int(raw)
    except (TypeError, ValueError):
        return default


def load_tag_ids(meta_path):
    """Load the id list written next to the index.

    Args:
        meta_path: Path of a UTF-8 JSON file expected to hold a JSON array.

    Returns:
        The decoded list, or ``[]`` when the file cannot be read, is not
        valid JSON, or does not contain a list.
    """
    try:
        with open(meta_path, "r", encoding="utf-8") as f:
            ids = json.load(f)
    except (OSError, ValueError):
        return []
    return ids if isinstance(ids, list) else []


def collect_hits(scores, idxs, ids):
    """Pair search results with their tag ids.

    Args:
        scores: Similarity scores of one query, best first.
        idxs: Index positions matching ``scores``; negative values mark
            "no result" slots.
        ids: List mapping an index position to its tag id.

    Returns:
        A list of ``{"tag_id": str, "score": float}`` dicts, skipping any
        position that is negative or outside ``ids``.
    """
    hits = []
    for score, idx in zip(scores, idxs):
        pos = int(idx)
        if pos < 0 or pos >= len(ids):
            continue
        hits.append({
            "tag_id": str(ids[pos]),
            "score": float(score),
        })
    return hits


def search_tags(args):
    """Run one search described by the argument vector ``args``.

    Args:
        args: Full argument vector including the program name (see the
            header comment for the positional layout).

    Returns:
        The list of hits, or ``[]`` for every handled failure.
    """
    if len(args) < 6:
        return []

    query = args[1]
    limit = parse_limit(args[2])
    index_path = Path(args[3]).resolve()
    meta_path = Path(args[4]).resolve()
    model_name = args[5]

    # Cheap checks first, so the heavy imports and the model load only
    # happen when a search can actually run.
    if limit <= 0:
        return []
    if not index_path.is_file() or not meta_path.is_file():
        # No tag index available => no routing
        return []
    ids = load_tag_ids(meta_path)
    if not ids:
        return []

    # Dependency checks: a missing package yields an empty result, not a
    # traceback.
    try:
        import faiss
        from sentence_transformers import SentenceTransformer
    except Exception:
        return []

    # A corrupt or partially written index is handled like a missing one.
    try:
        index = faiss.read_index(str(index_path))
    except Exception:
        return []

    # Asking for more neighbours than the index holds only produces padding
    # slots, so clamp the request.
    k = min(limit, int(index.ntotal))
    if k <= 0:
        return []

    model = SentenceTransformer(model_name)
    qvec = model.encode([query], normalize_embeddings=True)
    scores, idxs = index.search(qvec, k)
    return collect_hits(scores[0], idxs[0], ids)


def main(argv=None) -> int:
    """Print the search result as JSON and return the exit code (always 0)."""
    args = sys.argv if argv is None else argv
    # Single print point: stdout carries exactly one JSON document.
    print(json.dumps(search_tags(args)))
    return 0


if __name__ == "__main__":
    sys.exit(main())