add catalog mode
This commit is contained in:
@@ -31,7 +31,7 @@ parameters:
|
||||
|
||||
mto.index.chunk_size: 800
|
||||
mto.index.chunk_overlap: 100
|
||||
mto.index.embedding_model: 'all-MiniLM-L6-v2'
|
||||
mto.index.embedding_model: 'intfloat/multilingual-e5-base'
|
||||
mto.index.embedding_dimension: 768
|
||||
mto.index.scoring_version: 1
|
||||
|
||||
|
||||
@@ -5,24 +5,19 @@ import json
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Positional args (aligned with PHP builder exec call)
|
||||
# ---------------------------------------------------------
|
||||
# Positional args
|
||||
# 1 tags.ndjson
|
||||
# 2 out_index_path (can be .tmp)
|
||||
# 3 model
|
||||
# Example:
|
||||
# python vector_ingest_tags.py /var/knowledge/tags.ndjson /var/knowledge/vector_tags.index.tmp all-MiniLM-L6-v2
|
||||
# ---------------------------------------------------------
|
||||
|
||||
if len(sys.argv) < 4:
|
||||
print("ERROR: usage: vector_ingest_tags.py <tags.ndjson> <out.index> <model>", file=sys.stderr)
|
||||
if len(sys.argv) < 3:
|
||||
print("ERROR: usage: vector_ingest_tags.py <tags.ndjson> <out.index>", file=sys.stderr)
|
||||
sys.exit(2)
|
||||
|
||||
tags_path = Path(sys.argv[1]).resolve()
|
||||
out_path = Path(sys.argv[2]).resolve()
|
||||
model_name = sys.argv[3]
|
||||
|
||||
meta_path = Path(str(out_path) + ".meta.json") # vector_tags.index(.tmp).meta.json
|
||||
meta_path = Path(str(out_path) + ".meta.json")
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Dependency checks
|
||||
@@ -43,6 +38,25 @@ import numpy as np
|
||||
import faiss
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Load embedding model from index_meta.json (Single Source of Truth)
|
||||
# ---------------------------------------------------------
|
||||
BASE_PATH = Path(__file__).resolve().parents[2]
|
||||
INDEX_META_PATH = BASE_PATH / "var" / "knowledge" / "index_meta.json"
|
||||
|
||||
if not INDEX_META_PATH.exists():
|
||||
print("ERROR: index_meta.json not found", file=sys.stderr)
|
||||
sys.exit(30)
|
||||
|
||||
meta = json.loads(INDEX_META_PATH.read_text(encoding="utf-8"))
|
||||
embedding_model = meta.get("embedding_model")
|
||||
|
||||
if not embedding_model:
|
||||
print("ERROR: embedding_model missing in index_meta.json", file=sys.stderr)
|
||||
sys.exit(31)
|
||||
|
||||
model = SentenceTransformer(embedding_model)
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# File checks
|
||||
# ---------------------------------------------------------
|
||||
@@ -50,14 +64,8 @@ if not tags_path.is_file():
|
||||
print(f"ERROR: tags.ndjson not found at {tags_path}", file=sys.stderr)
|
||||
sys.exit(20)
|
||||
|
||||
# Ensure output directory exists
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Load model
|
||||
# ---------------------------------------------------------
|
||||
model = SentenceTransformer(model_name)
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Streaming read NDJSON
|
||||
# ---------------------------------------------------------
|
||||
@@ -85,13 +93,9 @@ with open(tags_path, "r", encoding="utf-8") as f:
|
||||
if len(text) > 4000:
|
||||
text = text[:4000]
|
||||
|
||||
# -------------------------------------------------
|
||||
# E5 requires "passage:" prefix for indexed texts
|
||||
# -------------------------------------------------
|
||||
texts.append(f"passage: {text}")
|
||||
ids.append(str(tag_id))
|
||||
|
||||
# If empty: remove outputs (tmp) and exit success
|
||||
if not texts:
|
||||
if out_path.exists():
|
||||
out_path.unlink()
|
||||
@@ -112,17 +116,11 @@ embeddings = model.encode(
|
||||
embeddings = np.array(embeddings).astype("float32")
|
||||
dim = embeddings.shape[1]
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Build FAISS index
|
||||
# ---------------------------------------------------------
|
||||
index = faiss.IndexFlatIP(dim)
|
||||
index.add(embeddings)
|
||||
|
||||
faiss.write_index(index, str(out_path))
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Write ID mapping meta
|
||||
# ---------------------------------------------------------
|
||||
with open(meta_path, "w", encoding="utf-8") as f:
|
||||
json.dump(ids, f)
|
||||
|
||||
|
||||
@@ -10,11 +10,19 @@ from typing import Any, List, Optional, Dict
|
||||
|
||||
import numpy as np
|
||||
import faiss
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Service Stamp (to verify you are running THIS file)
|
||||
# ============================================================
|
||||
|
||||
SERVICE_STAMP = "vector_service.py@2026-02-28T10:20+01:00"
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Paths
|
||||
# ============================================================
|
||||
@@ -42,6 +50,7 @@ INDEX_NDJSON_PATH = KNOWLEDGE_DIR / "index.ndjson"
|
||||
logger = logging.getLogger("vector_service")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def setup_logging() -> None:
|
||||
LOG_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@@ -68,6 +77,23 @@ def setup_logging() -> None:
|
||||
if not any(isinstance(h, logging.StreamHandler) for h in logger.handlers):
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
# Capture uvicorn logs in the same file as well (critical for hidden 500s)
|
||||
uvicorn_error = logging.getLogger("uvicorn.error")
|
||||
uvicorn_access = logging.getLogger("uvicorn.access")
|
||||
|
||||
uvicorn_error.setLevel(logging.INFO)
|
||||
uvicorn_access.setLevel(logging.INFO)
|
||||
|
||||
if not any(isinstance(h, RotatingFileHandler) for h in uvicorn_error.handlers):
|
||||
uvicorn_error.addHandler(file_handler)
|
||||
if not any(isinstance(h, logging.StreamHandler) for h in uvicorn_error.handlers):
|
||||
uvicorn_error.addHandler(stream_handler)
|
||||
|
||||
if not any(isinstance(h, RotatingFileHandler) for h in uvicorn_access.handlers):
|
||||
uvicorn_access.addHandler(file_handler)
|
||||
if not any(isinstance(h, logging.StreamHandler) for h in uvicorn_access.handlers):
|
||||
uvicorn_access.addHandler(stream_handler)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# FastAPI
|
||||
@@ -79,9 +105,6 @@ model: Optional[SentenceTransformer] = None
|
||||
chunk_index = None
|
||||
chunk_ids: Optional[List[Any]] = None
|
||||
|
||||
# Sales-RAG signals derived from NDJSON (loaded on startup and reload):
|
||||
# - chunk_doc_map: chunk_id -> document_id
|
||||
# - chunk_pos_map: chunk_id -> chunk_index (position within document, if available)
|
||||
chunk_doc_map: Dict[str, str] = {}
|
||||
chunk_pos_map: Dict[str, int] = {}
|
||||
|
||||
@@ -89,7 +112,6 @@ tag_index = None
|
||||
tag_ids: Optional[List[Any]] = None
|
||||
|
||||
loaded_embedding_model_name: Optional[str] = None
|
||||
|
||||
current_index_version: Optional[int] = None
|
||||
current_runtime_stamp: Optional[str] = None
|
||||
|
||||
@@ -107,10 +129,10 @@ class SearchRequest(BaseModel):
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Loader
|
||||
# Loader Helpers
|
||||
# ============================================================
|
||||
|
||||
def _safe_read_json(path: Path) -> Optional[dict]:
|
||||
def _safe_read_json(path: Path) -> Optional[Any]:
|
||||
try:
|
||||
if not path.exists():
|
||||
return None
|
||||
@@ -121,9 +143,6 @@ def _safe_read_json(path: Path) -> Optional[dict]:
|
||||
|
||||
|
||||
def _as_key(value: Any) -> Optional[str]:
|
||||
"""
|
||||
Normalize IDs to string keys for maps. Returns None if unusable.
|
||||
"""
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, str):
|
||||
@@ -136,12 +155,19 @@ def _as_key(value: Any) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def _sanitize_limit(limit: int, default: int = 8, max_limit: int = 200) -> int:
|
||||
try:
|
||||
v = int(limit)
|
||||
except Exception:
|
||||
return default
|
||||
if v <= 0:
|
||||
return default
|
||||
if v > max_limit:
|
||||
return max_limit
|
||||
return v
|
||||
|
||||
|
||||
def load_chunk_maps_from_ndjson() -> None:
|
||||
"""
|
||||
Builds two maps from index.ndjson:
|
||||
- chunk_id -> document_id
|
||||
- chunk_id -> chunk_index (position inside document, if present)
|
||||
"""
|
||||
global chunk_doc_map, chunk_pos_map
|
||||
|
||||
chunk_doc_map = {}
|
||||
@@ -156,7 +182,6 @@ def load_chunk_maps_from_ndjson() -> None:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
try:
|
||||
row = json.loads(line)
|
||||
except Exception:
|
||||
@@ -166,40 +191,43 @@ def load_chunk_maps_from_ndjson() -> None:
|
||||
if not chunk_id_key:
|
||||
continue
|
||||
|
||||
document_id = row.get("document_id")
|
||||
doc_id_key = _as_key(document_id)
|
||||
doc_id_key = _as_key(row.get("document_id"))
|
||||
if doc_id_key:
|
||||
chunk_doc_map[chunk_id_key] = doc_id_key
|
||||
|
||||
# chunk_index is optional but very useful for Sales-RAG diversity rules
|
||||
# (e.g. min distance within a doc)
|
||||
ci = row.get("chunk_index")
|
||||
if isinstance(ci, int):
|
||||
chunk_pos_map[chunk_id_key] = ci
|
||||
else:
|
||||
# tolerate numeric strings
|
||||
if isinstance(ci, str):
|
||||
s = ci.strip()
|
||||
if s.isdigit():
|
||||
try:
|
||||
chunk_pos_map[chunk_id_key] = int(s)
|
||||
except Exception:
|
||||
pass
|
||||
elif isinstance(ci, str):
|
||||
s = ci.strip()
|
||||
if s.isdigit():
|
||||
try:
|
||||
chunk_pos_map[chunk_id_key] = int(s)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Failed to load chunk maps from ndjson: %s", str(e))
|
||||
|
||||
|
||||
def _sanitize_limit(limit: int, default: int = 8, max_limit: int = 200) -> int:
|
||||
try:
|
||||
v = int(limit)
|
||||
except Exception:
|
||||
return default
|
||||
if v <= 0:
|
||||
return default
|
||||
if v > max_limit:
|
||||
return max_limit
|
||||
return v
|
||||
def _normalize_meta_list(value: Any) -> Optional[List[Any]]:
|
||||
"""
|
||||
Accepts:
|
||||
- list: ok
|
||||
- dict like {"0": "...", "1": "..."}: convert to list sorted by numeric key
|
||||
Returns None if invalid.
|
||||
"""
|
||||
if isinstance(value, list):
|
||||
return value
|
||||
|
||||
if isinstance(value, dict):
|
||||
try:
|
||||
keys = sorted(int(k) for k in value.keys())
|
||||
return [value[str(i)] for i in keys]
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def load_all() -> None:
|
||||
@@ -225,13 +253,14 @@ def load_all() -> None:
|
||||
model = SentenceTransformer(embedding_model_name)
|
||||
loaded_embedding_model_name = embedding_model_name
|
||||
|
||||
# Chunks
|
||||
if CHUNK_INDEX_PATH.exists() and CHUNK_MAP_PATH.exists():
|
||||
logger.info("[Reload] Loading chunk index")
|
||||
chunk_index = faiss.read_index(str(CHUNK_INDEX_PATH))
|
||||
chunk_ids = _safe_read_json(CHUNK_MAP_PATH) or None
|
||||
if not isinstance(chunk_ids, list):
|
||||
raw = _safe_read_json(CHUNK_MAP_PATH)
|
||||
chunk_ids = _normalize_meta_list(raw)
|
||||
if chunk_ids is None:
|
||||
chunk_index = None
|
||||
chunk_ids = None
|
||||
logger.warning("[Reload] chunk_ids meta invalid -> chunk index disabled")
|
||||
else:
|
||||
chunk_index = None
|
||||
@@ -240,13 +269,14 @@ def load_all() -> None:
|
||||
logger.info("[Reload] Loading chunk maps (doc_id + chunk_index)")
|
||||
load_chunk_maps_from_ndjson()
|
||||
|
||||
# Tags
|
||||
if TAG_INDEX_PATH.exists() and TAG_MAP_PATH.exists():
|
||||
logger.info("[Reload] Loading tag index")
|
||||
tag_index = faiss.read_index(str(TAG_INDEX_PATH))
|
||||
tag_ids = _safe_read_json(TAG_MAP_PATH) or None
|
||||
if not isinstance(tag_ids, list):
|
||||
raw = _safe_read_json(TAG_MAP_PATH)
|
||||
tag_ids = _normalize_meta_list(raw)
|
||||
if tag_ids is None:
|
||||
tag_index = None
|
||||
tag_ids = None
|
||||
logger.warning("[Reload] tag_ids meta invalid -> tag index disabled")
|
||||
else:
|
||||
tag_index = None
|
||||
@@ -262,15 +292,17 @@ def load_all() -> None:
|
||||
current_index_version = index_version if isinstance(index_version, int) else None
|
||||
|
||||
logger.info(
|
||||
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s)",
|
||||
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s stamp=%s file=%s)",
|
||||
str(current_index_version),
|
||||
str(current_runtime_stamp),
|
||||
str(loaded_embedding_model_name),
|
||||
SERVICE_STAMP,
|
||||
str(Path(__file__).resolve()),
|
||||
)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Observer (Enterprise Auto Reload)
|
||||
# Observer
|
||||
# ============================================================
|
||||
|
||||
def observer_loop() -> None:
|
||||
@@ -294,24 +326,34 @@ def observer_loop() -> None:
|
||||
new_runtime = v if isinstance(v, str) else None
|
||||
|
||||
if new_version != current_index_version:
|
||||
logger.info(
|
||||
"[Observer] index_version changed (%s -> %s) -> Reload",
|
||||
str(current_index_version),
|
||||
str(new_version),
|
||||
)
|
||||
logger.info("[Observer] index_version changed (%s -> %s) -> Reload", str(current_index_version), str(new_version))
|
||||
load_all()
|
||||
continue
|
||||
|
||||
if new_runtime != current_runtime_stamp:
|
||||
logger.info(
|
||||
"[Observer] runtime changed (%s -> %s) -> Reload",
|
||||
str(current_runtime_stamp),
|
||||
str(new_runtime),
|
||||
)
|
||||
logger.info("[Observer] runtime changed (%s -> %s) -> Reload", str(current_runtime_stamp), str(new_runtime))
|
||||
load_all()
|
||||
|
||||
except Exception as e:
|
||||
logger.error("[Observer ERROR] %s", str(e))
|
||||
logger.exception("[Observer ERROR] %s", str(e))
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Global Exception Handler (forces JSON + logs)
|
||||
# ============================================================
|
||||
|
||||
@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception):
    """Log an exception that reached the app-level handler and answer with a JSON 500.

    The response body carries the exception text, the request path and the
    service stamp.
    """
    request_path = request.url.path
    logger.exception("UNHANDLED_EXCEPTION path=%s method=%s", request_path, request.method)

    body = {
        "error": "Internal Server Error",
        "detail": str(exc),
        "path": request_path,
        "stamp": SERVICE_STAMP,
    }
    return JSONResponse(status_code=500, content=body)
|
||||
|
||||
|
||||
# ============================================================
|
||||
@@ -321,13 +363,10 @@ def observer_loop() -> None:
|
||||
@app.on_event("startup")
|
||||
def startup_event():
|
||||
setup_logging()
|
||||
logger.info("[VectorService] Startup")
|
||||
|
||||
logger.info("[VectorService] Startup stamp=%s file=%s", SERVICE_STAMP, str(Path(__file__).resolve()))
|
||||
load_all()
|
||||
|
||||
t = threading.Thread(target=observer_loop, daemon=True)
|
||||
t.start()
|
||||
|
||||
logger.info("[VectorService] Ready (log=%s)", str(LOG_FILE))
|
||||
|
||||
|
||||
@@ -339,12 +378,18 @@ def startup_event():
|
||||
def health():
    """Return a status snapshot: service identity, loaded indexes/model and ID-map metadata."""

    def _describe(meta):
        # Type name is reported for any non-None value; length only for lists.
        kind = type(meta).__name__ if meta is not None else None
        size = len(meta) if isinstance(meta, list) else None
        return kind, size

    tag_meta_type, tag_meta_len = _describe(tag_ids)
    chunk_meta_type, chunk_meta_len = _describe(chunk_ids)

    return {
        "status": "ok",
        "stamp": SERVICE_STAMP,
        "file": str(Path(__file__).resolve()),
        "chunk_index_loaded": chunk_index is not None,
        "tag_index_loaded": tag_index is not None,
        "model_loaded": model is not None,
        "embedding_model": loaded_embedding_model_name,
        "index_version": current_index_version,
        "runtime_stamp": current_runtime_stamp,
        "tag_meta_type": tag_meta_type,
        "tag_meta_len": tag_meta_len,
        "chunk_meta_type": chunk_meta_type,
        "chunk_meta_len": chunk_meta_len,
        "log_file": str(LOG_FILE),
    }
|
||||
|
||||
@@ -353,8 +398,9 @@ def health():
|
||||
def reload():
|
||||
try:
|
||||
load_all()
|
||||
return {"status": "reloaded"}
|
||||
return {"status": "reloaded", "stamp": SERVICE_STAMP}
|
||||
except Exception as e:
|
||||
logger.exception("reload failed")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@@ -363,74 +409,68 @@ def search_chunks(req: SearchRequest):
|
||||
if chunk_index is None or chunk_ids is None or model is None:
|
||||
raise HTTPException(status_code=503, detail="Chunk index not available")
|
||||
|
||||
# Safety: clamp limit to prevent abuse / accidental huge queries
|
||||
limit = _sanitize_limit(req.limit, default=8, max_limit=200)
|
||||
try:
|
||||
limit = _sanitize_limit(req.limit, default=8, max_limit=200)
|
||||
|
||||
query = (req.query or "").strip()
|
||||
if not query:
|
||||
raise HTTPException(status_code=400, detail="query must not be empty")
|
||||
query = (req.query or "").strip()
|
||||
if not query:
|
||||
raise HTTPException(status_code=400, detail="query must not be empty")
|
||||
|
||||
query_vec = model.encode(
|
||||
[f"query: {query}"],
|
||||
normalize_embeddings=True
|
||||
)
|
||||
query_vec = np.array(query_vec).astype("float32")
|
||||
query_vec = model.encode([f"query: {query}"], normalize_embeddings=True)
|
||||
query_vec = np.array(query_vec).astype("float32")
|
||||
|
||||
effective_limit = limit
|
||||
doc_filter: Optional[List[str]] = None
|
||||
if req.doc_ids:
|
||||
# Normalize incoming doc_ids for reliable matching
|
||||
doc_filter = []
|
||||
for d in req.doc_ids:
|
||||
dk = _as_key(d)
|
||||
if dk:
|
||||
doc_filter.append(dk)
|
||||
effective_limit = limit
|
||||
doc_filter: Optional[List[str]] = None
|
||||
if req.doc_ids:
|
||||
doc_filter = []
|
||||
for d in req.doc_ids:
|
||||
dk = _as_key(d)
|
||||
if dk:
|
||||
doc_filter.append(dk)
|
||||
effective_limit = max(limit * 5, 50)
|
||||
effective_limit = min(effective_limit, 500)
|
||||
|
||||
# When doc filtering is enabled, we fetch a wider pool and filter down.
|
||||
# Keep it bounded to avoid expensive scans on huge indices.
|
||||
effective_limit = max(limit * 5, 50)
|
||||
effective_limit = min(effective_limit, 500)
|
||||
scores, indices = chunk_index.search(query_vec, effective_limit)
|
||||
|
||||
scores, indices = chunk_index.search(query_vec, effective_limit)
|
||||
|
||||
results = []
|
||||
|
||||
for score, idx in zip(scores[0], indices[0]):
|
||||
if idx == -1:
|
||||
continue
|
||||
if idx < 0 or idx >= len(chunk_ids):
|
||||
continue
|
||||
|
||||
raw_chunk_id = chunk_ids[idx]
|
||||
chunk_id_key = _as_key(raw_chunk_id)
|
||||
if not chunk_id_key:
|
||||
continue
|
||||
|
||||
# Apply doc filter if requested
|
||||
doc_id = chunk_doc_map.get(chunk_id_key)
|
||||
if doc_filter is not None:
|
||||
if doc_id is None or doc_id not in doc_filter:
|
||||
results = []
|
||||
for score, idx in zip(scores[0], indices[0]):
|
||||
if idx == -1:
|
||||
continue
|
||||
if idx < 0 or idx >= len(chunk_ids):
|
||||
continue
|
||||
|
||||
# Sales-RAG signals:
|
||||
# - document_id (for doc quotas / diversity rules)
|
||||
# - chunk_index (position within doc for distance constraints)
|
||||
payload = {
|
||||
"chunk_id": raw_chunk_id,
|
||||
"score": float(score),
|
||||
"document_id": doc_id, # may be None if ndjson missing/partial
|
||||
}
|
||||
raw_chunk_id = chunk_ids[idx]
|
||||
chunk_id_key = _as_key(raw_chunk_id)
|
||||
if not chunk_id_key:
|
||||
continue
|
||||
|
||||
ci = chunk_pos_map.get(chunk_id_key)
|
||||
if isinstance(ci, int):
|
||||
payload["chunk_index"] = ci
|
||||
doc_id = chunk_doc_map.get(chunk_id_key)
|
||||
if doc_filter is not None:
|
||||
if doc_id is None or doc_id not in doc_filter:
|
||||
continue
|
||||
|
||||
results.append(payload)
|
||||
payload = {
|
||||
"chunk_id": raw_chunk_id,
|
||||
"score": float(score),
|
||||
"document_id": doc_id,
|
||||
}
|
||||
|
||||
if len(results) >= limit:
|
||||
break
|
||||
ci = chunk_pos_map.get(chunk_id_key)
|
||||
if isinstance(ci, int):
|
||||
payload["chunk_index"] = ci
|
||||
|
||||
return results
|
||||
results.append(payload)
|
||||
|
||||
if len(results) >= limit:
|
||||
break
|
||||
|
||||
return results
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.exception("search-chunks failure")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/search-tags")
|
||||
@@ -438,31 +478,36 @@ def search_tags(req: SearchRequest):
|
||||
if tag_index is None or tag_ids is None or model is None:
|
||||
raise HTTPException(status_code=503, detail="Tag index not available")
|
||||
|
||||
limit = _sanitize_limit(req.limit, default=8, max_limit=200)
|
||||
try:
|
||||
limit = _sanitize_limit(req.limit, default=8, max_limit=200)
|
||||
|
||||
query = (req.query or "").strip()
|
||||
if not query:
|
||||
raise HTTPException(status_code=400, detail="query must not be empty")
|
||||
query = (req.query or "").strip()
|
||||
if not query:
|
||||
raise HTTPException(status_code=400, detail="query must not be empty")
|
||||
|
||||
query_vec = model.encode(
|
||||
[f"query: {query}"],
|
||||
normalize_embeddings=True
|
||||
)
|
||||
query_vec = np.array(query_vec).astype("float32")
|
||||
query_vec = model.encode([f"query: {query}"], normalize_embeddings=True)
|
||||
query_vec = np.array(query_vec).astype("float32")
|
||||
|
||||
scores, indices = tag_index.search(query_vec, limit)
|
||||
if query_vec.ndim != 2:
|
||||
raise RuntimeError(f"Invalid embedding shape: {query_vec.shape}")
|
||||
|
||||
results = []
|
||||
if query_vec.shape[1] != tag_index.d:
|
||||
raise RuntimeError(f"Embedding dimension mismatch (vec={query_vec.shape[1]}, index={tag_index.d})")
|
||||
|
||||
for score, idx in zip(scores[0], indices[0]):
|
||||
if idx == -1:
|
||||
continue
|
||||
if idx < 0 or idx >= len(tag_ids):
|
||||
continue
|
||||
scores, indices = tag_index.search(query_vec, limit)
|
||||
|
||||
results.append({
|
||||
"tag_id": tag_ids[idx],
|
||||
"score": float(score),
|
||||
})
|
||||
results = []
|
||||
for score, idx in zip(scores[0], indices[0]):
|
||||
if idx == -1:
|
||||
continue
|
||||
if idx < 0 or idx >= len(tag_ids):
|
||||
continue
|
||||
results.append({"tag_id": tag_ids[idx], "score": float(score)})
|
||||
|
||||
return results
|
||||
return results
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.exception("search-tags failure")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
118
src/Catalog/EntityCatalogService.php
Normal file
118
src/Catalog/EntityCatalogService.php
Normal file
@@ -0,0 +1,118 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Catalog;
|
||||
|
||||
use App\Tag\TagVectorSearchClient;
|
||||
use Doctrine\DBAL\Connection;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
/**
 * EntityCatalogService
 *
 * Builds deterministic catalog listings for an entity term:
 * - tag vector search (score gate + ambiguity check)
 * - DB query on document_tag + document (status ACTIVE)
 * - returns ONE text block (string), or null so the caller can fall back to regular retrieval
 */
final class EntityCatalogService
{
    /** Minimum score the best tag hit must reach before a catalog is built. */
    private const MIN_SCORE = 0.55;

    /** If the top two hits are closer than this, the match is treated as ambiguous. */
    private const AMBIGUITY_DELTA = 0.05;

    public function __construct(
        private readonly TagVectorSearchClient $tagVectorClient,
        private readonly Connection $connection,
    ) {}

    /**
     * Resolve the entity term to a single tag and list the titles of its ACTIVE documents.
     *
     * @param string $entityTerm term to look up; surrounding whitespace is ignored
     *
     * @return string|null text block, or null when no safe catalog can be built
     *                     (empty term, no/weak/ambiguous tag hit, unusable tag id,
     *                     no matching documents or only empty titles)
     */
    public function listByTerm(string $entityTerm): ?string
    {
        $entityTerm = trim($entityTerm);
        if ($entityTerm === '') {
            return null;
        }

        // 1) Tag vector search (top 3, so the ambiguity check has a runner-up to compare against)
        $hits = $this->tagVectorClient->search($entityTerm, 3);

        if ($hits === []) {
            return null;
        }

        $best = $hits[0];

        $bestScore = isset($best['score']) ? (float)$best['score'] : 0.0;
        if ($bestScore < self::MIN_SCORE) {
            return null;
        }

        // 2) Ambiguity: if the second hit is too close, bail out conservatively
        if (isset($hits[1])) {
            $secondScore = isset($hits[1]['score']) ? (float)$hits[1]['score'] : 0.0;
            if (abs($bestScore - $secondScore) < self::AMBIGUITY_DELTA) {
                return null;
            }
        }

        $tagHex = (string)($best['tag_id'] ?? '');
        if ($tagHex === '') {
            return null;
        }

        // A malformed tag id must not abort retrieval: treat it like "no catalog"
        // so the caller falls back to the normal path.
        $tagIdBinary = $this->tagIdToBinary($tagHex);
        if ($tagIdBinary === null) {
            return null;
        }

        // 3) DB query: all ACTIVE documents carrying this tag
        $rows = $this->connection->fetchAllAssociative(
            '
            SELECT d.title
            FROM document d
            INNER JOIN document_tag dt ON dt.document_id = d.id
            WHERE dt.tag_id = :tagId
            AND d.status = :status
            ORDER BY d.title ASC
            ',
            [
                'tagId' => $tagIdBinary,
                'status' => 'ACTIVE',
            ]
        );

        if ($rows === []) {
            return null;
        }

        $titles = [];
        foreach ($rows as $row) {
            $t = trim((string)($row['title'] ?? ''));
            if ($t !== '') {
                $titles[] = $t;
            }
        }

        if ($titles === []) {
            return null;
        }

        return $this->buildTextBlock($entityTerm, $titles);
    }

    /**
     * Convert a tag id as delivered by the vector search into its 16-byte binary form.
     *
     * @return string|null binary id, or null when the value is not a usable UUID
     */
    private function tagIdToBinary(string $tagId): ?string
    {
        // Dash-less 32-character hex is converted directly.
        // NOTE(review): Uuid::fromString() appears not to accept this form -
        // confirm against the installed symfony/uid version.
        if (strlen($tagId) === 32 && ctype_xdigit($tagId)) {
            $binary = hex2bin($tagId);

            return $binary === false ? null : $binary;
        }

        try {
            return Uuid::fromString($tagId)->toBinary();
        } catch (\InvalidArgumentException) {
            return null;
        }
    }

    /**
     * Render the headline for the entity term followed by one "- title" line per entry.
     *
     * @param string   $entityTerm canonical term; unknown terms get a generic headline
     * @param string[] $titles     non-empty document titles, already in output order
     */
    private function buildTextBlock(string $entityTerm, array $titles): string
    {
        $headline = match ($entityTerm) {
            'geräte' => 'Folgende Geräte sind verfügbar:',
            'indikatoren' => 'Folgende Indikatoren sind verfügbar:',
            'funktionen' => 'Folgende Funktionen sind verfügbar:',
            'zubehör' => 'Folgendes Zubehör ist verfügbar:',
            default => 'Folgende Einträge sind verfügbar:',
        };

        $lines = [];
        foreach ($titles as $title) {
            $lines[] = '- ' . $title;
        }

        return $headline . "\n\n" . implode("\n", $lines);
    }
}
|
||||
138
src/Intent/CatalogIntentLite.php
Normal file
138
src/Intent/CatalogIntentLite.php
Normal file
@@ -0,0 +1,138 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Intent;
|
||||
|
||||
/**
 * CatalogIntentLite
 *
 * Minimal, deterministic detection of catalog / entity-list requests.
 *
 * Goal:
 * - "Liste aller Geräte" / "Welche Indikatoren gibt es?" / "Zeig mir alle Funktionen"
 *
 * Guardrails:
 * - No catalog mode for sales / pricing / comparison / ROI / implementation / objection intents.
 * - No catalog mode without an explicit entity term.
 *
 * IMPORTANT:
 * - Always call with the ORIGINAL prompt.
 * - No LLM, no ML.
 */
final class CatalogIntentLite
{
    /**
     * List signals (lightweight) - IntentLite remains responsible for "general" list detection.
     * Matched as plain substrings of the normalized prompt.
     */
    private const LIST_SIGNALS = [
        'liste',
        'auflisten',
        'aufzaehl',
        'aufzähl',
        'übersicht',
        'uebersicht',
        'welche gibt es',
        'welche sind',
        'zeig mir alle',
        'zeige mir alle',
        'alle',
    ];

    /**
     * "welche <up to three words> gibt es": the substring signal 'welche gibt es'
     * only matches when the words are adjacent, which misses the documented
     * target phrase "Welche Indikatoren gibt es?".
     */
    private const WELCHE_GIBT_ES_PATTERN = '/\bwelche\s+(?:\S+\s+){0,3}gibt\s+es\b/u';

    /**
     * Entity terms supported as catalog types.
     *
     * Left side: canonical term (used for the tag search)
     * Right side: synonyms that may appear in the prompt.
     */
    private const ENTITY_TERMS = [
        'geräte' => ['gerät', 'geräte', 'geraet', 'geraete', 'device', 'devices'],
        'indikatoren' => ['indikator', 'indikatoren', 'indicator', 'indicators'],
        'funktionen' => ['funktion', 'funktionen', 'feature', 'features', 'funktionalität', 'funktionalitaet'],
        'zubehör' => ['zubehör', 'zubehoer', 'accessory', 'accessories', 'zubehor'],
    ];

    /** ASCII transliterations appended by normalize(). */
    private const UMLAUT_MAP = [
        'ä' => 'ae',
        'ö' => 'oe',
        'ü' => 'ue',
        'ß' => 'ss',
    ];

    public function __construct(
        private readonly SalesIntentLite $salesIntentLite,
    ) {}

    /**
     * Detect a catalog request and return the entity type it asks for.
     *
     * @param string $originalPrompt the user's prompt, unmodified
     *
     * @return string|null canonical entity term (e.g. "geräte"), or null when there is no catalog intent
     */
    public function detect(string $originalPrompt): ?string
    {
        $p = $this->normalize($originalPrompt);

        // 1) Must contain a list signal
        if (!$this->hasListSignal($p)) {
            return null;
        }

        // 2) Guardrail: no catalog mode unless the sales intent is DISCOVERY
        $sales = $this->salesIntentLite->detect($originalPrompt);
        $intent = (string)($sales['intent'] ?? SalesIntentLite::DISCOVERY);

        if ($intent !== SalesIntentLite::DISCOVERY) {
            return null;
        }

        // 3) Extract an explicit entity term (otherwise no catalog);
        //    the first canonical term with a matching synonym wins.
        foreach (self::ENTITY_TERMS as $canonical => $synonyms) {
            foreach ($synonyms as $syn) {
                if ($this->containsWord($p, $syn)) {
                    return $canonical;
                }
            }
        }

        return null;
    }

    // ------------------------------------------------------------
    // Helpers
    // ------------------------------------------------------------

    /**
     * True when the normalized prompt contains a substring signal or the
     * "welche ... gibt es" phrase with a few words in between.
     */
    private function hasListSignal(string $normalizedPrompt): bool
    {
        if ($this->containsAny($normalizedPrompt, self::LIST_SIGNALS)) {
            return true;
        }

        return preg_match(self::WELCHE_GIBT_ES_PATTERN, $normalizedPrompt) === 1;
    }

    /**
     * True when any non-empty needle occurs as a substring of the haystack.
     *
     * @param string[] $needles
     */
    private function containsAny(string $haystack, array $needles): bool
    {
        foreach ($needles as $needle) {
            if ($needle === '') {
                continue;
            }
            if (str_contains($haystack, $needle)) {
                return true;
            }
        }

        return false;
    }

    /**
     * True when $word occurs in the haystack delimited by word boundaries.
     */
    private function containsWord(string $haystack, string $word): bool
    {
        $word = trim($word);
        if ($word === '') {
            return false;
        }

        return preg_match('/\b' . preg_quote($word, '/') . '\b/u', $haystack) === 1;
    }

    /**
     * Lower-case the prompt and, if it contains umlauts or ß, append a second
     * copy with ALL of them transliterated (ä->ae, ö->oe, ü->ue, ß->ss).
     *
     * Both spellings are therefore searchable in the returned string. The
     * previous implementation stopped after the first umlaut kind it found,
     * leaving the appended copy only partially transliterated.
     */
    private function normalize(string $s): string
    {
        $s = mb_strtolower($s);

        $ascii = strtr($s, self::UMLAUT_MAP);
        if ($ascii === $s) {
            // Nothing to transliterate: keep the prompt as is.
            return $s;
        }

        return $s . ' ' . $ascii;
    }
}
|
||||
@@ -4,7 +4,9 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Catalog\EntityCatalogService;
|
||||
use App\Entity\ModelGenerationConfig;
|
||||
use App\Intent\CatalogIntentLite;
|
||||
use App\Intent\IntentLite;
|
||||
use App\Intent\SalesIntentLite;
|
||||
use App\Knowledge\QueryCleaner;
|
||||
@@ -32,7 +34,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
private readonly ModelGenerationConfigRepository $configRepository,
|
||||
private readonly QueryCleaner $queryCleaner,
|
||||
private readonly IntentLite $intentLite,
|
||||
private readonly SalesIntentLite $salesIntentLite
|
||||
private readonly SalesIntentLite $salesIntentLite,
|
||||
private readonly CatalogIntentLite $catalogIntent,
|
||||
private readonly EntityCatalogService $entityCatalogService
|
||||
)
|
||||
{
|
||||
}
|
||||
@@ -54,6 +58,17 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
|
||||
{
|
||||
// 🔵 ENTITY CATALOG EARLY EXIT (jetzt auch im Admin-Test aktiv)
|
||||
$entityTerm = $this->catalogIntent->detect($prompt);
|
||||
|
||||
if ($entityTerm !== null) {
|
||||
$catalogBlock = $this->entityCatalogService->listByTerm($entityTerm);
|
||||
|
||||
if ($catalogBlock !== null) {
|
||||
return [$catalogBlock];
|
||||
}
|
||||
}
|
||||
|
||||
$core = $this->runCore($prompt, $config, false);
|
||||
|
||||
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
|
||||
@@ -111,8 +126,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return [];
|
||||
}
|
||||
|
||||
// 1) Production-like selection: wir selektieren Texte,
|
||||
// aber in Debug brauchen wir die ChunkIds dazu.
|
||||
$selectedChunkIds = $core['is_list_query']
|
||||
? $this->selectChunkIdsListMode($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
|
||||
: $this->selectChunkIdsSalesMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
|
||||
@@ -121,7 +134,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return [];
|
||||
}
|
||||
|
||||
// 2) Ausgabe inklusive Scores
|
||||
$out = [];
|
||||
$rank = 0;
|
||||
|
||||
@@ -179,7 +191,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$cleanQuery = $prompt;
|
||||
}
|
||||
|
||||
// Intent-based adjustments (identisch zur Produktionslogik)
|
||||
$threshold = self::VECTOR_SCORE_THRESHOLD;
|
||||
$topK = $vectorTopKBase;
|
||||
|
||||
@@ -216,7 +227,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
|
||||
|
||||
// Tag routing (identisch)
|
||||
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
||||
$candidateSet = null;
|
||||
|
||||
@@ -224,7 +234,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$candidateSet = array_fill_keys($candidateDocIds, true);
|
||||
}
|
||||
|
||||
// Dual search (identisch)
|
||||
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
|
||||
|
||||
$scopedHits = [];
|
||||
@@ -249,7 +258,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$rrfScores = [];
|
||||
$rawScores = [];
|
||||
|
||||
// RRF (identisch) + optional raw capture
|
||||
$this->applyRrfWithOptionalRaw($globalHits, $rrfScores, $rawScores, $threshold, false, $withScores);
|
||||
$this->applyRrfWithOptionalRaw(
|
||||
$scopedHits,
|
||||
@@ -292,13 +300,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* Gleiche Logik wie applyRrf(), aber optional mit raw-score capture.
|
||||
*
|
||||
* @param array<int, array{chunk_id:string, score:float}> $hits
|
||||
* @param array<string,float> $rrfScores
|
||||
* @param array<string,float> $rawScores
|
||||
*/
|
||||
private function applyRrfWithOptionalRaw(
|
||||
array $hits,
|
||||
array &$rrfScores,
|
||||
@@ -322,7 +323,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$chunkId = (string)$hit['chunk_id'];
|
||||
|
||||
if ($captureRaw) {
|
||||
// wenn global+scoped vorkommt: bestes raw behalten
|
||||
if (!isset($rawScores[$chunkId]) || $raw > $rawScores[$chunkId]) {
|
||||
$rawScores[$chunkId] = $raw;
|
||||
}
|
||||
@@ -343,15 +343,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// DEBUG SELECTION HELPERS (identisch zu Produktionsregeln)
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* List-Mode nutzt exakt collectTexts() Regeln, aber gibt ChunkIds zurück.
|
||||
*
|
||||
* @return string[]
|
||||
*/
|
||||
private function selectChunkIdsListMode(array $chunkIds, array $rows, int $limit): array
|
||||
{
|
||||
$seen = [];
|
||||
@@ -384,11 +375,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normal-Mode nutzt exakt collectSalesOptimized() Regeln, aber gibt ChunkIds zurück.
|
||||
*
|
||||
* @return string[]
|
||||
*/
|
||||
private function selectChunkIdsSalesMode(array $chunkIds, array $rows, int $limit): array
|
||||
{
|
||||
$out = [];
|
||||
@@ -437,10 +423,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $out;
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// ORIGINAL METHODS (UNVERÄNDERT)
|
||||
// =========================================================
|
||||
|
||||
private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void
|
||||
{
|
||||
$rank = 0;
|
||||
|
||||
@@ -58,6 +58,7 @@ final readonly class TagVectorSearchClient
|
||||
'Tag vector service returned non-200',
|
||||
['status' => $response->getStatusCode()]
|
||||
);
|
||||
|
||||
return [];
|
||||
}
|
||||
|
||||
@@ -68,6 +69,7 @@ final readonly class TagVectorSearchClient
|
||||
'Tag vector service unreachable',
|
||||
['error' => $e->getMessage()]
|
||||
);
|
||||
|
||||
return [];
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user