phase a audit

This commit is contained in:
team2
2026-02-22 13:51:45 +01:00
parent 5656a10930
commit b3e9110dd1
14 changed files with 222 additions and 463 deletions

View File

@@ -49,9 +49,8 @@ final class VectorIndexBuilder
// --------------------------------------------
// 🔵 FALL: NDJSON ist leer → kein Vector Index
// --------------------------------------------
if (filesize($this->indexNdjsonPath) === 0) {
if (!is_file($this->indexNdjsonPath) || filesize($this->indexNdjsonPath) === 0) {
// Alten Index entfernen
@unlink($this->vectorIndexPath);
@unlink($this->vectorMetaPath);
@@ -63,7 +62,7 @@ final class VectorIndexBuilder
);
}
return; // WICHTIG: kein Python, kein tmp, kein Fehler
return;
}
// --------------------------------------------
@@ -79,7 +78,6 @@ final class VectorIndexBuilder
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
// Clean leftovers
@unlink($tmpVectorIndexPath);
@unlink($this->vectorMetaPath);
@@ -108,11 +106,15 @@ final class VectorIndexBuilder
private function assertPreconditions(): void
{
if (!is_file($this->scriptPath)) {
throw new \RuntimeException('vector_ingest.py not found at: ' . $this->scriptPath);
throw new \RuntimeException(
'Vector build script not found at: ' . $this->scriptPath
);
}
if (!is_file($this->indexNdjsonPath)) {
throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath);
throw new \RuntimeException(
'index.ndjson not found at: ' . $this->indexNdjsonPath
);
}
}
@@ -195,4 +197,4 @@ final class VectorIndexBuilder
@file_put_contents($logPath, "=== VectorIndexBuilder OK ===\n", FILE_APPEND);
}
}
}
}

View File

@@ -1,295 +0,0 @@
#!/usr/bin/env python3
import argparse
import importlib
import json
import os
import signal
import socket
import subprocess
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
# ============================================================
# Paths
# ============================================================
# Project root: the third ancestor (parents[2]) of this file's resolved path.
BASE_PATH = Path(__file__).resolve().parents[2]
# Virtualenv directory under the project root and the executables inside it.
VENV_DIR = BASE_PATH / ".venv"
VENV_PY = VENV_DIR / "bin" / "python"
VENV_PIP = VENV_DIR / "bin" / "pip"
UVICORN_BIN = VENV_DIR / "bin" / "uvicorn"
# Location of the PID file that start_service() writes and status/stop read.
PID_DIR = BASE_PATH / "var" / "run"
PID_FILE = PID_DIR / "vector_service.pid"
# Defaults for the --host / --port command line options.
# NOTE(review): "0.0.0.0" makes uvicorn listen on all interfaces — confirm intended.
DEFAULT_HOST = "0.0.0.0"
DEFAULT_PORT = 8090
# URL templates for the service endpoints; they are always queried via loopback.
DEFAULT_HEALTH_URL = "http://127.0.0.1:{port}/health"
DEFAULT_RELOAD_URL = "http://127.0.0.1:{port}/reload"
# Module names that check_modules() tries to import.
REQUIRED_MODULES = [
"fastapi",
"uvicorn",
"faiss",
"sentence_transformers",
"numpy",
]
# ============================================================
# Utilities
# ============================================================
def _now_ms() -> int:
    """Return the current wall-clock time in whole milliseconds since the epoch."""
    seconds = time.time()
    return int(seconds * 1000)
def _read_pid() -> Optional[int]:
    """Return the PID stored in PID_FILE, or None.

    None is returned when the file does not exist, when its stripped content
    is not purely digits, or when reading/parsing raises any exception.
    """
    try:
        if not PID_FILE.exists():
            return None
        raw = PID_FILE.read_text(encoding="utf-8").strip()
        return int(raw) if raw.isdigit() else None
    except Exception:
        # Best effort: an unreadable PID file is treated like a missing one.
        return None
def _write_pid(pid: int) -> None:
    """Write *pid* to PID_FILE, creating PID_DIR (and parents) if needed."""
    PID_DIR.mkdir(exist_ok=True, parents=True)
    PID_FILE.write_text(f"{pid}", encoding="utf-8")
def _remove_pid() -> None:
    """Delete PID_FILE if it exists; every error is deliberately ignored."""
    try:
        pid_file_present = PID_FILE.exists()
        if pid_file_present:
            PID_FILE.unlink()
    except Exception:
        # Best effort cleanup: a failure to remove the file is not reported.
        return
def _pid_is_running(pid: int) -> bool:
    """Return True if a process with the given PID currently exists.

    The check sends signal 0, which performs the existence/permission check
    without delivering a signal.

    Args:
        pid: Process id to probe.

    Returns:
        True when the process exists (even if it belongs to another user and
        may not be signalled), False otherwise or when *pid* is not a
        positive integer.
    """
    # Zero and negative values address process groups / "all processes" in
    # kill(2); they never identify a single process, so reject them up front.
    if not isinstance(pid, int) or pid <= 0:
        return False
    try:
        os.kill(pid, 0)
    except PermissionError:
        # The process exists but is owned by someone else.
        return True
    except (OSError, OverflowError):
        # ProcessLookupError (no such process) or a PID outside the C range.
        return False
    return True
def _is_port_open(host: str, port: int, timeout: float = 0.5) -> bool:
    """Return True if a TCP connection to (host, port) succeeds within *timeout*.

    Any exception raised while connecting or closing yields False.
    """
    address = (host, port)
    try:
        connection = socket.create_connection(address, timeout=timeout)
        connection.close()
    except Exception:
        return False
    return True
def _split_curl_output(raw: str) -> Tuple[int, str]:
    """Split curl output of the form body + newline + HTTP status code."""
    trimmed = raw.rstrip("\n")
    body, separator, code = trimmed.rpartition("\n")
    if not separator:
        # No status line present at all: report "no HTTP status".
        return 0, trimmed
    try:
        return int(code), body
    except ValueError:
        return 0, body


def _curl(url: str, method: str = "GET", timeout_seconds: int = 3) -> Tuple[int, str]:
    """Perform an HTTP request by invoking the ``curl`` executable.

    Args:
        url: Target URL.
        method: HTTP method passed to ``curl -X``.
        timeout_seconds: Maximum total time passed to ``curl -m``.

    Returns:
        ``(status_code, body)``. ``status_code`` is 0 when no HTTP status
        could be obtained: the status line is missing or not numeric, or
        curl itself could not be executed. In the latter case ``body``
        carries a short description of the failure.
    """
    cmd = [
        "curl",
        "-s",
        "-X",
        method,
        "-m",
        str(timeout_seconds),
        # Append the status code on its own line after the response body.
        "-w",
        "\n%{http_code}",
        url,
    ]
    try:
        # errors="replace" keeps undecodable response bytes from raising.
        p = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    except OSError as exc:
        # curl missing or not executable: degrade to "no status" instead of
        # crashing the calling command.
        return 0, f"curl could not be executed: {exc}"
    return _split_curl_output(p.stdout or "")
# ============================================================
# Dependency Handling
# ============================================================
def check_modules() -> List[str]:
    """Return the entries of REQUIRED_MODULES that cannot be imported.

    A module counts as missing when importing it raises any exception.
    The result preserves the order of REQUIRED_MODULES.
    """

    def _importable(name: str) -> bool:
        try:
            importlib.import_module(name)
        except Exception:
            return False
        return True

    return [name for name in REQUIRED_MODULES if not _importable(name)]
def install_missing_modules(missing: List[str]) -> Dict[str, str]:
    """Install the pip packages that provide the given missing modules.

    Args:
        missing: Importable module names (as reported by check_modules()).

    Returns:
        A dict with ``status`` ("ok" or "error") and a human-readable
        ``detail``. An empty *missing* list is a successful no-op.
    """
    if not missing:
        # Without this guard pip would be invoked with no packages and fail.
        return {"status": "ok", "detail": "nothing to install"}

    # Module name -> pip distribution name, where the two differ.
    mod_to_pkg = {
        "fastapi": "fastapi",
        "uvicorn": "uvicorn",
        "numpy": "numpy",
        "sentence_transformers": "sentence-transformers",
        "faiss": "faiss-cpu",
    }
    # Unknown module names are passed to pip unchanged.
    pkgs = [mod_to_pkg.get(m, m) for m in missing]

    if not VENV_PIP.exists():
        return {"status": "error", "detail": "pip not found in .venv"}

    cmd = [str(VENV_PIP), "install", *pkgs]
    p = subprocess.run(cmd, capture_output=True, text=True)
    if p.returncode != 0:
        return {"status": "error", "detail": (p.stderr or p.stdout).strip()}
    return {"status": "ok", "detail": "installed: " + " ".join(pkgs)}
# ============================================================
# Service Control
# ============================================================
def service_status(port: int) -> Dict:
    """Collect PID and health information for the service on *port*.

    A PID file that points at a process which is no longer running is
    removed and reported as ``pid: None``. Health is determined by a GET on
    the loopback health URL; ``healthy`` is True only for HTTP 200.
    """
    pid = _read_pid()
    alive = bool(pid and _pid_is_running(pid))
    if pid and not alive:
        # Stale PID file: clean it up and do not report the dead PID.
        _remove_pid()
        pid = None

    health_url = DEFAULT_HEALTH_URL.format(port=port)
    status_code, payload = _curl(health_url, method="GET")

    report: Dict = {}
    report["pid"] = pid
    report["pid_running"] = alive
    report["health_code"] = status_code
    report["healthy"] = status_code == 200
    report["health_body"] = payload
    report["port"] = port
    return report
def start_service(host: str, port: int) -> Dict:
    """Start the uvicorn vector service as a detached background process.

    Args:
        host: Interface passed to ``uvicorn --host``.
        port: TCP port passed to ``uvicorn --port``.

    Returns:
        A dict with ``status`` and ``detail``; ``pid`` is included once a
        process was spawned. ``status`` is "error" when uvicorn is missing
        from the virtualenv, when the port already accepts connections on
        loopback, or when the spawned process exits during the startup wait.
    """
    if not UVICORN_BIN.exists():
        return {"status": "error", "detail": "uvicorn not found in .venv"}
    if _is_port_open("127.0.0.1", port):
        return {"status": "error", "detail": f"port {port} already in use"}

    cmd = [
        str(UVICORN_BIN),
        "src.Vector.vector_service:app",
        "--host", host,
        "--port", str(port),
    ]
    p = subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        cwd=str(BASE_PATH),
        # Own session so the service is not tied to this process' session.
        start_new_session=True,
    )
    _write_pid(p.pid)

    # Give the process a moment to start (or to fail fast).
    time.sleep(2)

    exit_code = p.poll()
    if exit_code is not None:
        # The process already terminated: do not leave a stale PID file
        # behind and do not claim success.
        _remove_pid()
        return {
            "status": "error",
            "detail": f"service exited during startup (exit code {exit_code})",
            "pid": p.pid,
        }
    return {"status": "ok", "detail": "service started", "pid": p.pid}
def stop_service(port: int, force: bool = False) -> Dict:
    """Stop the process recorded in the PID file.

    SIGTERM is sent first; after a two second wait the process is checked
    again. If it is still alive it is killed with SIGKILL when *force* is
    set, otherwise an error is returned. *port* is accepted but not used by
    this function.
    """

    def _outcome(status: str, detail: str) -> Dict:
        return {"status": status, "detail": detail}

    pid = _read_pid()
    if not pid:
        return _outcome("ok", "not running")
    if not _pid_is_running(pid):
        _remove_pid()
        return _outcome("ok", "stale pid removed")

    try:
        os.kill(pid, signal.SIGTERM)
        time.sleep(2)
        still_alive = _pid_is_running(pid)
        if still_alive and not force:
            return _outcome("error", "stop timeout (use --force)")
        if still_alive:
            os.kill(pid, signal.SIGKILL)
        _remove_pid()
        return _outcome("ok", "force stopped" if still_alive else "stopped")
    except Exception as e:
        return _outcome("error", str(e))
def reload_service(port: int) -> Dict:
    """POST to the service's reload endpoint and translate the HTTP result.

    HTTP 200 yields status "ok" with the response body as detail; 404 and
    every other code yield status "error" with a descriptive detail.
    """
    reload_url = DEFAULT_RELOAD_URL.format(port=port)
    code, body = _curl(reload_url, method="POST")
    if code == 200:
        return {"status": "ok", "detail": body}

    if code == 404:
        detail = "reload endpoint not found"
    else:
        detail = f"reload failed (http {code}): {body}"
    return {"status": "error", "detail": detail}
# ============================================================
# Main
# ============================================================
def main() -> int:
    """Parse the command line, run the requested actions and print a JSON report.

    Actions run in a fixed order: install (only if modules are missing),
    stop, start, reload, status. Status is also run when none of
    install/start/stop/reload was requested. The function always returns 0.
    """
    parser = argparse.ArgumentParser(description="Vector service control")
    for flag in ("--install", "--start", "--stop", "--force", "--reload", "--status"):
        parser.add_argument(flag, action="store_true")
    parser.add_argument("--port", type=int, default=DEFAULT_PORT)
    parser.add_argument("--host", type=str, default=DEFAULT_HOST)
    args = parser.parse_args()

    started_ms = _now_ms()
    actions = []
    results = {}
    report = {
        "ts_ms": started_ms,
        "actions": actions,
        "results": results,
    }

    missing = check_modules()
    results["modules_missing"] = missing

    no_action_requested = not (args.install or args.start or args.stop or args.reload)

    # (action name, whether to run it, callable producing its result)
    steps = (
        ("install", missing and args.install, lambda: install_missing_modules(missing)),
        ("stop", args.stop, lambda: stop_service(args.port, args.force)),
        ("start", args.start, lambda: start_service(args.host, args.port)),
        ("reload", args.reload, lambda: reload_service(args.port)),
        ("status", args.status or no_action_requested, lambda: service_status(args.port)),
    )
    for action, wanted, run in steps:
        if wanted:
            actions.append(action)
            results[action] = run()

    report["duration_ms"] = _now_ms() - started_ms
    print(json.dumps(report, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -1,134 +0,0 @@
#!/usr/bin/env python3
import sys
import json
import argparse
from pathlib import Path
# ---------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------
# Builds a FAISS inner-product index plus an id list from an NDJSON file.
# Exit codes: 0 success (including "no chunks"), 10 faiss missing,
# 11 sentence-transformers missing, 20 input file missing.
parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
parser.add_argument("--index", required=True, help="Path to index.ndjson")
parser.add_argument("--out", required=True, help="Path to output vector.index")
parser.add_argument("--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model")
args = parser.parse_args()

index_path = Path(args.index).resolve()
out_path = Path(args.out).resolve()

# ---------------------------------------------------------
# Dependency checks
# ---------------------------------------------------------
try:
    import faiss
except Exception:
    print("ERROR: Python module 'faiss' not found.")
    sys.exit(10)

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    print("ERROR: Python module 'sentence-transformers' not found.")
    sys.exit(11)

import numpy as np

# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
if not index_path.is_file():
    print(f"ERROR: index.ndjson not found at {index_path}")
    sys.exit(20)

# ---------------------------------------------------------
# Load model
# ---------------------------------------------------------
print(f"Loading embedding model: {args.model}")
model = SentenceTransformer(args.model)

# ---------------------------------------------------------
# Streaming read NDJSON
# ---------------------------------------------------------
texts = []
ids = []

print("Reading NDJSON...")
with open(index_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            entry = json.loads(line)
        except (ValueError, RecursionError):
            # Malformed line: skip it rather than abort the whole build.
            continue
        if not isinstance(entry, dict):
            # Valid JSON that is not an object (list, number, string, ...)
            # has no "text"/"chunk_id" fields and would crash on .get().
            continue
        text = entry.get("text")
        chunk_id = entry.get("chunk_id")
        if not text or not chunk_id:
            continue
        texts.append(text)
        ids.append(chunk_id)

if not texts:
    print("No chunks found. Removing vector index.")
    if out_path.exists():
        out_path.unlink()
    # with_suffix() replaces only the last suffix of the output file name.
    meta_path = out_path.with_suffix(".meta.json")
    if meta_path.exists():
        meta_path.unlink()
    sys.exit(0)

print(f"Loaded {len(texts)} chunks.")

# ---------------------------------------------------------
# Build embeddings
# ---------------------------------------------------------
print("Encoding embeddings...")
embeddings = model.encode(
    texts,
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=64,
)
embeddings = np.array(embeddings).astype("float32")
dim = embeddings.shape[1]
print(f"Embedding dimension: {dim}")

# ---------------------------------------------------------
# Build FAISS index
# ---------------------------------------------------------
print("Building FAISS index...")
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Ensure output directory exists
out_path.parent.mkdir(parents=True, exist_ok=True)

print(f"Writing FAISS index to {out_path}")
faiss.write_index(index, str(out_path))

# ---------------------------------------------------------
# Write ID mapping meta
# ---------------------------------------------------------
# Position i in this list is the chunk_id of vector i in the index.
# NOTE(review): with_suffix() swaps the last suffix, so "x.index.tmp" maps to
# "x.index.meta.json" but "x.index" maps to "x.meta.json" — confirm callers
# always pass the expected output name.
meta_path = out_path.with_suffix(".meta.json")
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(ids, f)

print(f"Indexed {len(ids)} chunks successfully.")
sys.exit(0)

View File

@@ -1,126 +0,0 @@
#!/usr/bin/env python3
import sys
import json
from pathlib import Path
# ---------------------------------------------------------
# Positional args (aligned with PHP builder exec call)
# ---------------------------------------------------------
# 1 tags.ndjson
# 2 out_index_path (can be .tmp)
# 3 model
# Example:
# python vector_ingest_tags.py /var/knowledge/tags.ndjson /var/knowledge/vector_tags.index.tmp all-MiniLM-L6-v2
# ---------------------------------------------------------
# Builds a FAISS inner-product index plus an id list from tags.ndjson.
# Exit codes: 0 success (including "no tags"), 2 usage error, 10 faiss
# missing, 11 sentence-transformers missing, 20 input file missing.
if len(sys.argv) < 4:
    print("ERROR: usage: vector_ingest_tags.py <tags.ndjson> <out.index> <model>", file=sys.stderr)
    sys.exit(2)

tags_path = Path(sys.argv[1]).resolve()
out_path = Path(sys.argv[2]).resolve()
model_name = sys.argv[3]
# The meta file name is the full output name with ".meta.json" appended.
meta_path = Path(str(out_path) + ".meta.json")  # vector_tags.index(.tmp).meta.json

# ---------------------------------------------------------
# Dependency checks
# ---------------------------------------------------------
try:
    import faiss
except Exception:
    print("ERROR: Python module 'faiss' not found.", file=sys.stderr)
    sys.exit(10)

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    print("ERROR: Python module 'sentence-transformers' not found.", file=sys.stderr)
    sys.exit(11)

import numpy as np

# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
if not tags_path.is_file():
    print(f"ERROR: tags.ndjson not found at {tags_path}", file=sys.stderr)
    sys.exit(20)

# Ensure output directory exists
out_path.parent.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------
# Load model
# ---------------------------------------------------------
model = SentenceTransformer(model_name)

# ---------------------------------------------------------
# Streaming read NDJSON
# ---------------------------------------------------------
MAX_TEXT_CHARS = 4000  # longer tag texts are truncated before embedding

texts = []
ids = []

with open(tags_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            entry = json.loads(line)
        except (ValueError, RecursionError):
            # Malformed line: skip it rather than abort the whole build.
            continue
        if not isinstance(entry, dict):
            # Valid JSON that is not an object (list, number, string, ...)
            # has no "text"/"tag_id" fields and would crash on .get().
            continue
        text = entry.get("text")
        tag_id = entry.get("tag_id")
        if not text or not tag_id:
            continue
        texts.append(str(text)[:MAX_TEXT_CHARS])
        ids.append(str(tag_id))

# If empty: remove outputs (tmp) and exit success
if not texts:
    if out_path.exists():
        out_path.unlink()
    if meta_path.exists():
        meta_path.unlink()
    sys.exit(0)

# ---------------------------------------------------------
# Build embeddings
# ---------------------------------------------------------
embeddings = model.encode(
    texts,
    normalize_embeddings=True,
    show_progress_bar=False,
    batch_size=64,
)
embeddings = np.array(embeddings).astype("float32")
dim = embeddings.shape[1]

# ---------------------------------------------------------
# Build FAISS index
# ---------------------------------------------------------
index = faiss.IndexFlatIP(dim)
index.add(embeddings)
faiss.write_index(index, str(out_path))

# ---------------------------------------------------------
# Write ID mapping meta
# ---------------------------------------------------------
# Position i in this list is the tag_id of vector i in the index.
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(ids, f)

sys.exit(0)

View File

@@ -1,117 +0,0 @@
#!/usr/bin/env python3
import sys
import json
import argparse
from pathlib import Path
# ---------------------------------------------------------
# Argument parsing (NEW CLEAN CLI)
# ---------------------------------------------------------
# Command line FAISS search: prints a JSON list of {"chunk_id", "score"}
# objects on stdout; every diagnostic goes to stderr with a distinct exit code.


def _fail(message, exit_code):
    """Print *message* to stderr and terminate with *exit_code*."""
    print(message, file=sys.stderr)
    sys.exit(exit_code)


cli = argparse.ArgumentParser(description="FAISS vector search")
cli.add_argument("--query", required=True, help="Search query text")
cli.add_argument("--limit", required=True, type=int, help="Top-K limit")
cli.add_argument("--index", required=True, help="Path to vector.index")
cli.add_argument("--meta", required=True, help="Path to vector.index.meta.json")
cli.add_argument("--model", required=True, help="SentenceTransformer model")
args = cli.parse_args()

query = args.query
limit = args.limit
index_path = Path(args.index).resolve()
meta_path = Path(args.meta).resolve()
embedding_model = args.model

# ---------------------------------------------------------
# Dependency checks (stderr only)
# ---------------------------------------------------------
try:
    import faiss  # noqa
except Exception:
    _fail("Python module 'faiss' not found.", 10)

try:
    from sentence_transformers import SentenceTransformer  # noqa
except Exception:
    _fail("Python module 'sentence-transformers' not found.", 11)

# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
if not index_path.is_file():
    _fail(f"vector.index not found at {index_path}", 20)

if not meta_path.is_file():
    _fail(f"vector.index.meta.json not found at {meta_path}", 21)

# ---------------------------------------------------------
# Load model and index
# ---------------------------------------------------------
try:
    model = SentenceTransformer(embedding_model)
except Exception:
    _fail(f"Failed to load embedding model: {embedding_model}", 30)

try:
    query_vec = model.encode([query], normalize_embeddings=True)
except Exception:
    _fail("Embedding encoding failed.", 31)

try:
    index = faiss.read_index(str(index_path))
except Exception:
    _fail("Failed to read FAISS index.", 32)

try:
    with open(meta_path, "r", encoding="utf-8") as meta_file:
        ids = json.load(meta_file)
except Exception:
    _fail("Failed to read vector meta file.", 33)

# ---------------------------------------------------------
# Search
# ---------------------------------------------------------
try:
    scores, indices = index.search(query_vec, limit)
except Exception:
    _fail("FAISS search failed.", 40)

# Keep only positions that map to an entry of the id list; this also drops
# the -1 placeholders.
results = [
    {"chunk_id": ids[position], "score": float(similarity)}
    for similarity, position in zip(scores[0], indices[0])
    if 0 <= position < len(ids)
]

# ---------------------------------------------------------
# STRICT JSON OUTPUT ONLY
# ---------------------------------------------------------
print(json.dumps(results))
sys.exit(0)

View File

@@ -1,103 +0,0 @@
#!/usr/bin/env python3
import sys
import json
from pathlib import Path
# ---------------------------------------------------------
# Positional args (aligned with PHP client exec call)
# ---------------------------------------------------------
# 1 query
# 2 limit
# 3 index_path
# 4 meta_path
# 5 model
#
# Example:
# python vector_search_tags.py "foo" 8 /path/vector_tags.index /path/vector_tags.index.meta.json all-MiniLM-L6-v2
# ---------------------------------------------------------
# Command line tag search. Contract visible in this script: stdout always
# carries a JSON list and the exit code is 0; "no result" is reported as [].


def _emit_empty(reason=None):
    """Print an empty JSON list on stdout and exit 0.

    When *reason* is given it is written to stderr so that stdout stays
    valid JSON for the caller.
    """
    if reason:
        print(reason, file=sys.stderr)
    print("[]")
    sys.exit(0)


if len(sys.argv) < 6:
    _emit_empty()

query = sys.argv[1]
try:
    limit = int(sys.argv[2])
except ValueError:
    # Unparseable limit falls back to a default of 5.
    limit = 5

index_path = Path(sys.argv[3]).resolve()
meta_path = Path(sys.argv[4]).resolve()
model_name = sys.argv[5]

# ---------------------------------------------------------
# Dependency checks
# ---------------------------------------------------------
try:
    import faiss
except Exception:
    # keep stdout clean for caller
    _emit_empty()

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    _emit_empty()

# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
if limit <= 0:
    _emit_empty()

if not index_path.is_file() or not meta_path.is_file():
    # No tag index available => no routing
    _emit_empty()

# ---------------------------------------------------------
# Load model
# ---------------------------------------------------------
try:
    model = SentenceTransformer(model_name)
except Exception as exc:
    _emit_empty(f"Failed to load embedding model {model_name}: {exc}")

# ---------------------------------------------------------
# Load index + meta
# ---------------------------------------------------------
try:
    index = faiss.read_index(str(index_path))
except Exception as exc:
    _emit_empty(f"Failed to read FAISS tag index: {exc}")

try:
    with open(meta_path, "r", encoding="utf-8") as f:
        ids = json.load(f)
except Exception:
    _emit_empty()

if not isinstance(ids, list) or len(ids) == 0:
    _emit_empty()

# ---------------------------------------------------------
# Embed & search
# ---------------------------------------------------------
try:
    qvec = model.encode([query], normalize_embeddings=True)
    scores, idxs = index.search(qvec, limit)
except Exception as exc:
    _emit_empty(f"Tag search failed: {exc}")

# Drop positions that do not map to an entry of the id list (this includes
# the -1 placeholders).
out = [
    {"tag_id": str(ids[idx]), "score": float(score)}
    for score, idx in zip(scores[0], idxs[0])
    if 0 <= idx < len(ids)
]

print(json.dumps(out))
sys.exit(0)

View File

@@ -1,176 +0,0 @@
#!/usr/bin/env python3
import json
from pathlib import Path
from typing import Any, List, Optional
import numpy as np
import faiss
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
# ============================================================
# Paths
# ============================================================
# Project root: the third ancestor (parents[2]) of this file's resolved path.
BASE_PATH = Path(__file__).resolve().parents[2]
# Directory holding the index files read by load_all().
KNOWLEDGE_DIR = BASE_PATH / "var" / "knowledge"
# FAISS index and JSON id list for chunk search.
CHUNK_INDEX_PATH = KNOWLEDGE_DIR / "vector.index"
CHUNK_MAP_PATH = KNOWLEDGE_DIR / "vector.index.meta.json"
# FAISS index and JSON id list for tag search.
TAG_INDEX_PATH = KNOWLEDGE_DIR / "vector_tags.index"
TAG_MAP_PATH = KNOWLEDGE_DIR / "vector_tags.index.meta.json"
# JSON file from which load_all() reads the "embedding_model" name.
INDEX_META_PATH = KNOWLEDGE_DIR / "index_meta.json"
# ============================================================
# FastAPI
# ============================================================
# ASGI application object; the endpoints below are registered on it.
app = FastAPI()
# Module-level state populated by load_all(); everything is None until then.
model: Optional[SentenceTransformer] = None
# Chunk FAISS index and the id list loaded from CHUNK_MAP_PATH.
chunk_index = None
chunk_ids: Optional[List[Any]] = None
# Tag FAISS index and the id list loaded from TAG_MAP_PATH.
tag_index = None
tag_ids: Optional[List[Any]] = None
# Name of the model currently held in `model`; used to skip needless reloads.
loaded_embedding_model_name: Optional[str] = None
# ============================================================
# Models
# ============================================================
class SearchRequest(BaseModel):
    """Request body accepted by the /search-chunks and /search-tags endpoints."""

    # Text that is embedded and searched for.
    query: str
    # Number of nearest neighbours requested from the index; defaults to 8.
    limit: int = 8
# ============================================================
# Loader
# ============================================================
def load_all():
    """(Re)load the embedding model and both FAISS indexes from disk.

    Everything is loaded into local variables first and only published to
    the module globals once all steps succeeded. A failure part-way through
    (for example an unreadable id list) therefore leaves the previously
    loaded state untouched instead of pairing a new index with a stale id
    list.

    Raises:
        RuntimeError: if index_meta.json is missing or does not name an
            embedding model.
        Exception: whatever reading/parsing the files or loading the model
            raises; the globals are not modified in that case.
    """
    global model, chunk_index, chunk_ids, tag_index, tag_ids, loaded_embedding_model_name

    if not INDEX_META_PATH.exists():
        raise RuntimeError("index_meta.json not found")

    meta = json.loads(INDEX_META_PATH.read_text(encoding="utf-8"))
    embedding_model_name = meta.get("embedding_model")
    if not embedding_model_name:
        raise RuntimeError("embedding_model missing in index_meta.json")

    # Reload model only if changed
    new_model = model
    if model is None or embedding_model_name != loaded_embedding_model_name:
        print(f"[Reload] Loading embedding model: {embedding_model_name}")
        new_model = SentenceTransformer(embedding_model_name)

    # Chunk index: both the index and its id list must exist, else neither
    # is kept.
    new_chunk_index = None
    new_chunk_ids = None
    if CHUNK_INDEX_PATH.exists() and CHUNK_MAP_PATH.exists():
        print("[Reload] Loading chunk index")
        new_chunk_index = faiss.read_index(str(CHUNK_INDEX_PATH))
        new_chunk_ids = json.loads(CHUNK_MAP_PATH.read_text(encoding="utf-8"))

    # Tag index: same rule as for the chunk index.
    new_tag_index = None
    new_tag_ids = None
    if TAG_INDEX_PATH.exists() and TAG_MAP_PATH.exists():
        print("[Reload] Loading tag index")
        new_tag_index = faiss.read_index(str(TAG_INDEX_PATH))
        new_tag_ids = json.loads(TAG_MAP_PATH.read_text(encoding="utf-8"))

    # Publish the new state only after every load succeeded.
    model = new_model
    loaded_embedding_model_name = embedding_model_name
    chunk_index, chunk_ids = new_chunk_index, new_chunk_ids
    tag_index, tag_ids = new_tag_index, new_tag_ids

    print("[Reload] Completed")
# ============================================================
# Startup
# ============================================================
@app.on_event("startup")
def startup_event():
    """Load the model and indexes when the application starts up."""
    load_all()
    print("[VectorService] Ready")
# ============================================================
# Endpoints
# ============================================================
@app.get("/health")
def health():
    """Report which parts of the module state are currently loaded."""
    report = {"status": "ok"}
    report["chunk_index_loaded"] = chunk_index is not None
    report["tag_index_loaded"] = tag_index is not None
    report["model_loaded"] = model is not None
    return report
@app.post("/reload")
def reload():
    """Re-run load_all(); any failure is returned as HTTP 500 with its message."""
    try:
        load_all()
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return {"status": "reloaded"}
@app.post("/search-chunks")
def search_chunks(req: SearchRequest):
    """Return the chunks most similar to the query.

    Args:
        req: Query text and the number of neighbours to return.

    Returns:
        A list of ``{"chunk_id", "score"}`` dicts in the order FAISS
        returned them.

    Raises:
        HTTPException: 503 when no chunk index is loaded, 400 when
            ``req.limit`` is not positive.
    """
    # Take one snapshot of the pair so the index and its id list are read
    # together even if a reload rebinds the globals meanwhile.
    index, ids = chunk_index, chunk_ids
    if index is None or ids is None:
        raise HTTPException(status_code=503, detail="Chunk index not available")
    if req.limit <= 0:
        # A non-positive k is rejected inside the FAISS wrapper and would
        # otherwise surface as an opaque 500.
        raise HTTPException(status_code=400, detail="limit must be a positive integer")

    query_vec = model.encode([req.query], normalize_embeddings=True)
    query_vec = np.array(query_vec).astype("float32")
    scores, indices = index.search(query_vec, req.limit)

    # Skip positions without a matching id (this includes -1 placeholders).
    return [
        {"chunk_id": ids[int(idx)], "score": float(score)}
        for score, idx in zip(scores[0], indices[0])
        if 0 <= idx < len(ids)
    ]
@app.post("/search-tags")
def search_tags(req: SearchRequest):
    """Return the tags most similar to the query.

    Args:
        req: Query text and the number of neighbours to return.

    Returns:
        A list of ``{"chunk_id", "score"}`` dicts in the order FAISS
        returned them.

    Raises:
        HTTPException: 503 when no tag index is loaded, 400 when
            ``req.limit`` is not positive.
    """
    # Take one snapshot of the pair so the index and its id list are read
    # together even if a reload rebinds the globals meanwhile.
    index, ids = tag_index, tag_ids
    if index is None or ids is None:
        raise HTTPException(status_code=503, detail="Tag index not available")
    if req.limit <= 0:
        # A non-positive k is rejected inside the FAISS wrapper and would
        # otherwise surface as an opaque 500.
        raise HTTPException(status_code=400, detail="limit must be a positive integer")

    query_vec = model.encode([req.query], normalize_embeddings=True)
    query_vec = np.array(query_vec).astype("float32")
    scores, indices = index.search(query_vec, req.limit)

    # NOTE(review): tag ids are returned under the key "chunk_id"; kept as is
    # because clients may depend on it — confirm whether "tag_id" was meant.
    # Skip positions without a matching id (this includes -1 placeholders).
    return [
        {"chunk_id": ids[int(idx)], "score": float(score)}
        for score, idx in zip(scores[0], indices[0])
        if 0 <= idx < len(ids)
    ]