first commit
This commit is contained in:
@@ -1,42 +1,44 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
|
||||
def fail(message: str, code: int) -> None:
|
||||
print(f"ERROR: {message}", file=sys.stderr)
|
||||
sys.exit(code)
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Positional args
|
||||
# 1 tags.ndjson
|
||||
# 2 out_index_path (can be .tmp)
|
||||
# ---------------------------------------------------------
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("ERROR: usage: vector_ingest_tags.py <tags.ndjson> <out.index>", file=sys.stderr)
|
||||
sys.exit(2)
|
||||
fail("usage: vector_ingest_tags.py <tags.ndjson> <out.index>", 2)
|
||||
|
||||
tags_path = Path(sys.argv[1]).resolve()
|
||||
out_path = Path(sys.argv[2]).resolve()
|
||||
|
||||
out_path = Path(sys.argv[2]).resolve()
|
||||
meta_path = Path(str(out_path) + ".meta.json")
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Dependency checks
|
||||
# ---------------------------------------------------------
|
||||
try:
|
||||
import faiss
|
||||
except Exception:
|
||||
print("ERROR: Python module 'faiss' not found.", file=sys.stderr)
|
||||
sys.exit(10)
|
||||
fail("Python module 'faiss' not found.", 10)
|
||||
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
except Exception:
|
||||
print("ERROR: Python module 'sentence-transformers' not found.", file=sys.stderr)
|
||||
sys.exit(11)
|
||||
fail("Python module 'sentence-transformers' not found.", 11)
|
||||
|
||||
import numpy as np
|
||||
import faiss
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Load embedding model from index_meta.json (Single Source of Truth)
|
||||
@@ -45,64 +47,122 @@ BASE_PATH = Path(__file__).resolve().parents[2]
|
||||
INDEX_META_PATH = BASE_PATH / "var" / "knowledge" / "index_meta.json"
|
||||
|
||||
if not INDEX_META_PATH.exists():
|
||||
print("ERROR: index_meta.json not found", file=sys.stderr)
|
||||
sys.exit(30)
|
||||
fail("index_meta.json not found", 30)
|
||||
|
||||
try:
|
||||
meta = json.loads(INDEX_META_PATH.read_text(encoding="utf-8"))
|
||||
except Exception:
|
||||
fail("index_meta.json is invalid", 30)
|
||||
|
||||
meta = json.loads(INDEX_META_PATH.read_text(encoding="utf-8"))
|
||||
embedding_model = meta.get("embedding_model")
|
||||
if not isinstance(embedding_model, str) or embedding_model.strip() == "":
|
||||
fail("embedding_model missing in index_meta.json", 31)
|
||||
|
||||
if not embedding_model:
|
||||
print("ERROR: embedding_model missing in index_meta.json", file=sys.stderr)
|
||||
sys.exit(31)
|
||||
model = SentenceTransformer(embedding_model.strip())
|
||||
|
||||
model = SentenceTransformer(embedding_model)
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# File checks
|
||||
# ---------------------------------------------------------
|
||||
if not tags_path.is_file():
|
||||
print(f"ERROR: tags.ndjson not found at {tags_path}", file=sys.stderr)
|
||||
sys.exit(20)
|
||||
fail(f"tags.ndjson not found at {tags_path}", 20)
|
||||
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Streaming read NDJSON
|
||||
# Helpers
|
||||
# ---------------------------------------------------------
|
||||
texts = []
|
||||
ids = []
|
||||
|
||||
with open(tags_path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
try:
|
||||
entry = json.loads(line)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
text = entry.get("text")
|
||||
tag_id = entry.get("tag_id")
|
||||
|
||||
if not text or not tag_id:
|
||||
continue
|
||||
|
||||
text = str(text)
|
||||
if len(text) > 4000:
|
||||
text = text[:4000]
|
||||
|
||||
texts.append(f"passage: {text}")
|
||||
ids.append(str(tag_id))
|
||||
|
||||
if not texts:
|
||||
def cleanup_outputs() -> None:
|
||||
if out_path.exists():
|
||||
out_path.unlink()
|
||||
if meta_path.exists():
|
||||
meta_path.unlink()
|
||||
|
||||
|
||||
def normalize_text(value: Any) -> str:
|
||||
text = str(value).strip()
|
||||
text = " ".join(text.split())
|
||||
|
||||
if len(text) > 4000:
|
||||
text = text[:4000].rstrip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Streaming read NDJSON
|
||||
# ---------------------------------------------------------
|
||||
def load_rows(path: Path) -> Tuple[List[str], List[str], Dict[str, int]]:
|
||||
texts: List[str] = []
|
||||
ids: List[str] = []
|
||||
seen_ids = set()
|
||||
|
||||
stats = {
|
||||
"lines_total": 0,
|
||||
"lines_empty": 0,
|
||||
"lines_invalid_json": 0,
|
||||
"rows_missing_fields": 0,
|
||||
"rows_duplicate_tag_id": 0,
|
||||
"rows_accepted": 0,
|
||||
}
|
||||
|
||||
with path.open("r", encoding="utf-8") as handle:
|
||||
for line in handle:
|
||||
stats["lines_total"] += 1
|
||||
line = line.strip()
|
||||
|
||||
if line == "":
|
||||
stats["lines_empty"] += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
entry = json.loads(line)
|
||||
except Exception:
|
||||
stats["lines_invalid_json"] += 1
|
||||
continue
|
||||
|
||||
if not isinstance(entry, dict):
|
||||
stats["rows_missing_fields"] += 1
|
||||
continue
|
||||
|
||||
tag_id = str(entry.get("tag_id", "")).strip()
|
||||
text = normalize_text(entry.get("text", ""))
|
||||
|
||||
if tag_id == "" or text == "":
|
||||
stats["rows_missing_fields"] += 1
|
||||
continue
|
||||
|
||||
if tag_id in seen_ids:
|
||||
stats["rows_duplicate_tag_id"] += 1
|
||||
continue
|
||||
|
||||
seen_ids.add(tag_id)
|
||||
ids.append(tag_id)
|
||||
texts.append(f"passage: {text}")
|
||||
stats["rows_accepted"] += 1
|
||||
|
||||
return texts, ids, stats
|
||||
|
||||
|
||||
texts, ids, stats = load_rows(tags_path)
|
||||
|
||||
print(
|
||||
json.dumps(
|
||||
{
|
||||
"event": "tag_rows_loaded",
|
||||
**stats,
|
||||
},
|
||||
ensure_ascii=False,
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
if not texts:
|
||||
cleanup_outputs()
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Build embeddings
|
||||
# ---------------------------------------------------------
|
||||
@@ -110,18 +170,32 @@ embeddings = model.encode(
|
||||
texts,
|
||||
normalize_embeddings=True,
|
||||
show_progress_bar=True,
|
||||
batch_size=128
|
||||
batch_size=128,
|
||||
)
|
||||
|
||||
embeddings = np.array(embeddings).astype("float32")
|
||||
dim = embeddings.shape[1]
|
||||
embeddings = np.array(embeddings, dtype="float32")
|
||||
|
||||
if embeddings.ndim != 2 or embeddings.shape[0] != len(ids) or embeddings.shape[0] == 0:
|
||||
cleanup_outputs()
|
||||
fail("tag embeddings have invalid shape", 40)
|
||||
|
||||
if embeddings.shape[1] <= 0:
|
||||
cleanup_outputs()
|
||||
fail("tag embeddings have invalid dimension", 41)
|
||||
|
||||
dim = int(embeddings.shape[1])
|
||||
|
||||
index = faiss.IndexFlatIP(dim)
|
||||
index.add(embeddings)
|
||||
|
||||
faiss.write_index(index, str(out_path))
|
||||
if int(index.ntotal) != len(ids):
|
||||
cleanup_outputs()
|
||||
fail("FAISS tag index count does not match meta ids", 42)
|
||||
|
||||
with open(meta_path, "w", encoding="utf-8") as f:
|
||||
json.dump(ids, f)
|
||||
faiss.write_index(index, str(out_path))
|
||||
meta_path.write_text(
|
||||
json.dumps(ids, ensure_ascii=False),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
sys.exit(0)
|
||||
@@ -6,10 +6,10 @@ from logging.handlers import RotatingFileHandler
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional, Dict
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import faiss
|
||||
import numpy as np
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel
|
||||
@@ -20,7 +20,7 @@ from sentence_transformers import SentenceTransformer
|
||||
# Service Stamp (to verify you are running THIS file)
|
||||
# ============================================================
|
||||
|
||||
SERVICE_STAMP = "vector_service.py@2026-02-28T10:20+01:00"
|
||||
SERVICE_STAMP = "vector_service.py@2026-04-20T00:00+02:00"
|
||||
|
||||
|
||||
# ============================================================
|
||||
@@ -41,8 +41,6 @@ TAG_MAP_PATH = KNOWLEDGE_DIR / "vector_tags.index.meta.json"
|
||||
INDEX_META_PATH = KNOWLEDGE_DIR / "index_meta.json"
|
||||
INDEX_RUNTIME_PATH = KNOWLEDGE_DIR / "index_runtime.json"
|
||||
INDEX_NDJSON_PATH = KNOWLEDGE_DIR / "index.ndjson"
|
||||
|
||||
# NEW: Tags NDJSON (exported by PHP) used to enrich /search-tags responses
|
||||
TAGS_NDJSON_PATH = KNOWLEDGE_DIR / "tags.ndjson"
|
||||
|
||||
|
||||
@@ -54,6 +52,48 @@ logger = logging.getLogger("vector_service")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# App State
|
||||
# ============================================================
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
model: Optional[SentenceTransformer] = None
|
||||
chunk_index = None
|
||||
chunk_ids: Optional[List[Any]] = None
|
||||
|
||||
chunk_doc_map: Dict[str, str] = {}
|
||||
chunk_pos_map: Dict[str, int] = {}
|
||||
|
||||
tag_index = None
|
||||
tag_ids: Optional[List[Any]] = None
|
||||
|
||||
# tag_id -> {"label": "...", "tag_type": "..."}
|
||||
tag_meta_map: Dict[str, Dict[str, str]] = {}
|
||||
|
||||
loaded_embedding_model_name: Optional[str] = None
|
||||
current_index_version: Optional[int] = None
|
||||
current_chunk_runtime_stamp: Optional[str] = None
|
||||
current_tags_runtime_stamp: Optional[str] = None
|
||||
current_tags_index_present: Optional[bool] = None
|
||||
|
||||
reload_lock = threading.Lock()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Models
|
||||
# ============================================================
|
||||
|
||||
class SearchRequest(BaseModel):
|
||||
query: str
|
||||
limit: int = 8
|
||||
doc_ids: Optional[List[str]] = None
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Helpers
|
||||
# ============================================================
|
||||
|
||||
def setup_logging() -> None:
|
||||
LOG_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@@ -77,10 +117,9 @@ def setup_logging() -> None:
|
||||
|
||||
if not any(isinstance(h, RotatingFileHandler) for h in logger.handlers):
|
||||
logger.addHandler(file_handler)
|
||||
if not any(isinstance(h, logging.StreamHandler) for h in logger.handlers):
|
||||
if not any(type(h) is logging.StreamHandler for h in logger.handlers):
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
# Capture uvicorn logs in the same file as well (critical for hidden 500s)
|
||||
uvicorn_error = logging.getLogger("uvicorn.error")
|
||||
uvicorn_access = logging.getLogger("uvicorn.access")
|
||||
|
||||
@@ -89,62 +128,22 @@ def setup_logging() -> None:
|
||||
|
||||
if not any(isinstance(h, RotatingFileHandler) for h in uvicorn_error.handlers):
|
||||
uvicorn_error.addHandler(file_handler)
|
||||
if not any(isinstance(h, logging.StreamHandler) for h in uvicorn_error.handlers):
|
||||
if not any(type(h) is logging.StreamHandler for h in uvicorn_error.handlers):
|
||||
uvicorn_error.addHandler(stream_handler)
|
||||
|
||||
if not any(isinstance(h, RotatingFileHandler) for h in uvicorn_access.handlers):
|
||||
uvicorn_access.addHandler(file_handler)
|
||||
if not any(isinstance(h, logging.StreamHandler) for h in uvicorn_access.handlers):
|
||||
if not any(type(h) is logging.StreamHandler for h in uvicorn_access.handlers):
|
||||
uvicorn_access.addHandler(stream_handler)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# FastAPI
|
||||
# ============================================================
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
model: Optional[SentenceTransformer] = None
|
||||
chunk_index = None
|
||||
chunk_ids: Optional[List[Any]] = None
|
||||
|
||||
chunk_doc_map: Dict[str, str] = {}
|
||||
chunk_pos_map: Dict[str, int] = {}
|
||||
|
||||
tag_index = None
|
||||
tag_ids: Optional[List[Any]] = None
|
||||
|
||||
# NEW: tag_id -> {"label": "...", "tag_type": "..."}
|
||||
tag_meta_map: Dict[str, Dict[str, str]] = {}
|
||||
|
||||
loaded_embedding_model_name: Optional[str] = None
|
||||
current_index_version: Optional[int] = None
|
||||
current_runtime_stamp: Optional[str] = None
|
||||
|
||||
reload_lock = threading.Lock()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Models
|
||||
# ============================================================
|
||||
|
||||
class SearchRequest(BaseModel):
|
||||
query: str
|
||||
limit: int = 8
|
||||
doc_ids: Optional[List[str]] = None
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Loader Helpers
|
||||
# ============================================================
|
||||
|
||||
def _safe_read_json(path: Path) -> Optional[Any]:
|
||||
try:
|
||||
if not path.exists():
|
||||
return None
|
||||
return json.loads(path.read_text(encoding="utf-8"))
|
||||
except Exception as e:
|
||||
logger.warning("Failed to read json %s: %s", str(path), str(e))
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to read json %s: %s", str(path), str(exc))
|
||||
return None
|
||||
|
||||
|
||||
@@ -152,25 +151,97 @@ def _as_key(value: Any) -> Optional[str]:
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, str):
|
||||
v = value.strip()
|
||||
return v if v else None
|
||||
value = value.strip()
|
||||
return value or None
|
||||
try:
|
||||
v = str(value).strip()
|
||||
return v if v else None
|
||||
value = str(value).strip()
|
||||
return value or None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _sanitize_limit(limit: int, default: int = 8, max_limit: int = 200) -> int:
|
||||
try:
|
||||
v = int(limit)
|
||||
value = int(limit)
|
||||
except Exception:
|
||||
return default
|
||||
if v <= 0:
|
||||
if value <= 0:
|
||||
return default
|
||||
if v > max_limit:
|
||||
if value > max_limit:
|
||||
return max_limit
|
||||
return v
|
||||
return value
|
||||
|
||||
|
||||
def _normalize_meta_list(value: Any) -> Optional[List[Any]]:
|
||||
"""
|
||||
Accepts:
|
||||
- list: ok
|
||||
- dict like {"0": "...", "1": "..."}: convert to list sorted by numeric key
|
||||
Returns None if invalid.
|
||||
"""
|
||||
if isinstance(value, list):
|
||||
return value
|
||||
|
||||
if isinstance(value, dict):
|
||||
try:
|
||||
keys = sorted(int(key) for key in value.keys())
|
||||
return [value[str(i)] for i in keys]
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _normalize_tag_type(value: Any) -> str:
|
||||
normalized = _as_key(value)
|
||||
if normalized is None:
|
||||
return "generic"
|
||||
|
||||
normalized = normalized.lower()
|
||||
if normalized in {"generic", "catalog_entity", "sales_signal"}:
|
||||
return normalized
|
||||
|
||||
return "generic"
|
||||
|
||||
|
||||
def _extract_runtime_state(runtime: Any) -> Tuple[Optional[str], Optional[str], Optional[bool]]:
|
||||
if not isinstance(runtime, dict):
|
||||
return None, None, None
|
||||
|
||||
chunk_runtime = runtime.get("last_rebuild_at")
|
||||
tags_runtime = runtime.get("last_tags_rebuild_at")
|
||||
tags_index_present = runtime.get("tags_index_present")
|
||||
|
||||
if not isinstance(chunk_runtime, str):
|
||||
chunk_runtime = None
|
||||
if not isinstance(tags_runtime, str):
|
||||
tags_runtime = None
|
||||
if not isinstance(tags_index_present, bool):
|
||||
tags_index_present = None
|
||||
|
||||
return chunk_runtime, tags_runtime, tags_index_present
|
||||
|
||||
|
||||
def _validate_index_alignment(index_obj: Any, ids: Optional[List[Any]], label: str) -> Tuple[Any, Optional[List[Any]]]:
|
||||
if index_obj is None or ids is None:
|
||||
return None, None
|
||||
|
||||
try:
|
||||
index_count = int(index_obj.ntotal)
|
||||
except Exception:
|
||||
logger.warning("[Reload] %s index has no ntotal -> disabled", label)
|
||||
return None, None
|
||||
|
||||
if index_count != len(ids):
|
||||
logger.warning(
|
||||
"[Reload] %s meta/index mismatch (ids=%s index=%s) -> disabled",
|
||||
label,
|
||||
len(ids),
|
||||
index_count,
|
||||
)
|
||||
return None, None
|
||||
|
||||
return index_obj, ids
|
||||
|
||||
|
||||
def load_chunk_maps_from_ndjson() -> None:
|
||||
@@ -183,8 +254,8 @@ def load_chunk_maps_from_ndjson() -> None:
|
||||
return
|
||||
|
||||
try:
|
||||
with INDEX_NDJSON_PATH.open("r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
with INDEX_NDJSON_PATH.open("r", encoding="utf-8") as handle:
|
||||
for line in handle:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
@@ -201,29 +272,32 @@ def load_chunk_maps_from_ndjson() -> None:
|
||||
if doc_id_key:
|
||||
chunk_doc_map[chunk_id_key] = doc_id_key
|
||||
|
||||
ci = row.get("chunk_index")
|
||||
if isinstance(ci, int):
|
||||
chunk_pos_map[chunk_id_key] = ci
|
||||
elif isinstance(ci, str):
|
||||
s = ci.strip()
|
||||
if s.isdigit():
|
||||
chunk_index_value = row.get("chunk_index")
|
||||
if isinstance(chunk_index_value, int):
|
||||
chunk_pos_map[chunk_id_key] = chunk_index_value
|
||||
elif isinstance(chunk_index_value, str):
|
||||
stripped = chunk_index_value.strip()
|
||||
if stripped.isdigit():
|
||||
try:
|
||||
chunk_pos_map[chunk_id_key] = int(s)
|
||||
chunk_pos_map[chunk_id_key] = int(stripped)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Failed to load chunk maps from ndjson: %s", str(e))
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to load chunk maps from ndjson: %s", str(exc))
|
||||
|
||||
|
||||
def load_tag_meta_from_tags_ndjson() -> None:
|
||||
"""
|
||||
Loads minimal tag metadata from tags.ndjson to enrich /search-tags results.
|
||||
Expected line format (from PHP exporter / ingester pipeline):
|
||||
{"tag_id":"...","text":"LABEL\\nSLUG\\noptional description", ...}
|
||||
We extract:
|
||||
label = first line of "text" (fallback: "")
|
||||
tag_type = "type" if present (preferred), else "generic"
|
||||
Expected line format:
|
||||
{
|
||||
"tag_id": "...",
|
||||
"text": "LABEL\\nSLUG\\noptional description",
|
||||
"type": "catalog_entity|generic|sales_signal",
|
||||
"document_ids": ["..."]
|
||||
}
|
||||
|
||||
Only tags with at least one exported document id are kept.
|
||||
"""
|
||||
global tag_meta_map
|
||||
|
||||
@@ -234,11 +308,12 @@ def load_tag_meta_from_tags_ndjson() -> None:
|
||||
return
|
||||
|
||||
try:
|
||||
with TAGS_NDJSON_PATH.open("r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
with TAGS_NDJSON_PATH.open("r", encoding="utf-8") as handle:
|
||||
for line in handle:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
try:
|
||||
row = json.loads(line)
|
||||
except Exception:
|
||||
@@ -248,55 +323,33 @@ def load_tag_meta_from_tags_ndjson() -> None:
|
||||
if not tag_id:
|
||||
continue
|
||||
|
||||
# Prefer explicit fields if present
|
||||
ttype = row.get("type")
|
||||
if isinstance(ttype, str) and ttype.strip():
|
||||
tag_type = ttype.strip()
|
||||
else:
|
||||
tag_type = "generic"
|
||||
document_ids = row.get("document_ids")
|
||||
if isinstance(document_ids, list) and len(document_ids) == 0:
|
||||
continue
|
||||
|
||||
tag_type = _normalize_tag_type(row.get("type"))
|
||||
label = ""
|
||||
txt = row.get("text")
|
||||
if isinstance(txt, str) and txt.strip():
|
||||
first = txt.splitlines()[0].strip() if txt.splitlines() else ""
|
||||
label = first
|
||||
|
||||
if label:
|
||||
tag_meta_map[tag_id] = {"label": label, "tag_type": tag_type}
|
||||
else:
|
||||
tag_meta_map[tag_id] = {"label": "", "tag_type": tag_type}
|
||||
text_value = row.get("text")
|
||||
if isinstance(text_value, str) and text_value.strip():
|
||||
first_line = text_value.splitlines()[0].strip() if text_value.splitlines() else ""
|
||||
label = first_line
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Failed to load tag meta from tags.ndjson: %s", str(e))
|
||||
tag_meta_map[tag_id] = {
|
||||
"label": label,
|
||||
"tag_type": tag_type,
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to load tag meta from tags.ndjson: %s", str(exc))
|
||||
tag_meta_map = {}
|
||||
|
||||
|
||||
def _normalize_meta_list(value: Any) -> Optional[List[Any]]:
|
||||
"""
|
||||
Accepts:
|
||||
- list: ok
|
||||
- dict like {"0": "...", "1": "..."}: convert to list sorted by numeric key
|
||||
Returns None if invalid.
|
||||
"""
|
||||
if isinstance(value, list):
|
||||
return value
|
||||
|
||||
if isinstance(value, dict):
|
||||
try:
|
||||
keys = sorted(int(k) for k in value.keys())
|
||||
return [value[str(i)] for i in keys]
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def load_all() -> None:
|
||||
global model, chunk_index, chunk_ids
|
||||
global tag_index, tag_ids
|
||||
global loaded_embedding_model_name
|
||||
global current_index_version
|
||||
global current_runtime_stamp
|
||||
global current_chunk_runtime_stamp, current_tags_runtime_stamp, current_tags_index_present
|
||||
|
||||
with reload_lock:
|
||||
meta = _safe_read_json(INDEX_META_PATH)
|
||||
@@ -314,15 +367,21 @@ def load_all() -> None:
|
||||
model = SentenceTransformer(embedding_model_name)
|
||||
loaded_embedding_model_name = embedding_model_name
|
||||
|
||||
runtime = _safe_read_json(INDEX_RUNTIME_PATH)
|
||||
chunk_runtime_stamp, tags_runtime_stamp, tags_index_present = _extract_runtime_state(runtime)
|
||||
|
||||
# Chunks
|
||||
if CHUNK_INDEX_PATH.exists() and CHUNK_MAP_PATH.exists():
|
||||
logger.info("[Reload] Loading chunk index")
|
||||
chunk_index = faiss.read_index(str(CHUNK_INDEX_PATH))
|
||||
raw = _safe_read_json(CHUNK_MAP_PATH)
|
||||
chunk_ids = _normalize_meta_list(raw)
|
||||
if chunk_ids is None:
|
||||
loaded_chunk_index = faiss.read_index(str(CHUNK_INDEX_PATH))
|
||||
raw_chunk_meta = _safe_read_json(CHUNK_MAP_PATH)
|
||||
loaded_chunk_ids = _normalize_meta_list(raw_chunk_meta)
|
||||
if loaded_chunk_ids is None:
|
||||
chunk_index = None
|
||||
chunk_ids = None
|
||||
logger.warning("[Reload] chunk_ids meta invalid -> chunk index disabled")
|
||||
else:
|
||||
chunk_index, chunk_ids = _validate_index_alignment(loaded_chunk_index, loaded_chunk_ids, "chunk")
|
||||
else:
|
||||
chunk_index = None
|
||||
chunk_ids = None
|
||||
@@ -331,35 +390,38 @@ def load_all() -> None:
|
||||
load_chunk_maps_from_ndjson()
|
||||
|
||||
# Tags
|
||||
if TAG_INDEX_PATH.exists() and TAG_MAP_PATH.exists():
|
||||
should_load_tag_index = tags_index_present is not False
|
||||
if should_load_tag_index and TAG_INDEX_PATH.exists() and TAG_MAP_PATH.exists():
|
||||
logger.info("[Reload] Loading tag index")
|
||||
tag_index = faiss.read_index(str(TAG_INDEX_PATH))
|
||||
raw = _safe_read_json(TAG_MAP_PATH)
|
||||
tag_ids = _normalize_meta_list(raw)
|
||||
if tag_ids is None:
|
||||
loaded_tag_index = faiss.read_index(str(TAG_INDEX_PATH))
|
||||
raw_tag_meta = _safe_read_json(TAG_MAP_PATH)
|
||||
loaded_tag_ids = _normalize_meta_list(raw_tag_meta)
|
||||
if loaded_tag_ids is None:
|
||||
tag_index = None
|
||||
tag_ids = None
|
||||
logger.warning("[Reload] tag_ids meta invalid -> tag index disabled")
|
||||
else:
|
||||
tag_index, tag_ids = _validate_index_alignment(loaded_tag_index, loaded_tag_ids, "tag")
|
||||
else:
|
||||
tag_index = None
|
||||
tag_ids = None
|
||||
if tags_index_present is False:
|
||||
logger.info("[Reload] Runtime marks tags index as absent -> tag index disabled")
|
||||
|
||||
# NEW: load tag meta for enrichment
|
||||
logger.info("[Reload] Loading tag meta from tags.ndjson")
|
||||
load_tag_meta_from_tags_ndjson()
|
||||
|
||||
runtime = _safe_read_json(INDEX_RUNTIME_PATH)
|
||||
if isinstance(runtime, dict):
|
||||
v = runtime.get("last_rebuild_at")
|
||||
current_runtime_stamp = v if isinstance(v, str) else None
|
||||
else:
|
||||
current_runtime_stamp = None
|
||||
|
||||
current_index_version = index_version if isinstance(index_version, int) else None
|
||||
current_chunk_runtime_stamp = chunk_runtime_stamp
|
||||
current_tags_runtime_stamp = tags_runtime_stamp
|
||||
current_tags_index_present = tags_index_present
|
||||
|
||||
logger.info(
|
||||
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s tag_meta=%s stamp=%s file=%s)",
|
||||
"[Reload] Completed (index_version=%s chunk_runtime=%s tags_runtime=%s tags_index_present=%s embedding_model=%s tag_meta=%s stamp=%s file=%s)",
|
||||
str(current_index_version),
|
||||
str(current_runtime_stamp),
|
||||
str(current_chunk_runtime_stamp),
|
||||
str(current_tags_runtime_stamp),
|
||||
str(current_tags_index_present),
|
||||
str(loaded_embedding_model_name),
|
||||
str(len(tag_meta_map)),
|
||||
SERVICE_STAMP,
|
||||
@@ -373,7 +435,7 @@ def load_all() -> None:
|
||||
|
||||
def observer_loop() -> None:
|
||||
global current_index_version
|
||||
global current_runtime_stamp
|
||||
global current_chunk_runtime_stamp, current_tags_runtime_stamp, current_tags_index_present
|
||||
|
||||
while True:
|
||||
time.sleep(2)
|
||||
@@ -384,28 +446,50 @@ def observer_loop() -> None:
|
||||
continue
|
||||
|
||||
new_version = meta.get("index_version") if isinstance(meta.get("index_version"), int) else None
|
||||
|
||||
runtime = _safe_read_json(INDEX_RUNTIME_PATH)
|
||||
new_runtime = None
|
||||
if isinstance(runtime, dict):
|
||||
v = runtime.get("last_rebuild_at")
|
||||
new_runtime = v if isinstance(v, str) else None
|
||||
new_chunk_runtime, new_tags_runtime, new_tags_index_present = _extract_runtime_state(runtime)
|
||||
|
||||
if new_version != current_index_version:
|
||||
logger.info("[Observer] index_version changed (%s -> %s) -> Reload", str(current_index_version), str(new_version))
|
||||
logger.info(
|
||||
"[Observer] index_version changed (%s -> %s) -> Reload",
|
||||
str(current_index_version),
|
||||
str(new_version),
|
||||
)
|
||||
load_all()
|
||||
continue
|
||||
|
||||
if new_runtime != current_runtime_stamp:
|
||||
logger.info("[Observer] runtime changed (%s -> %s) -> Reload", str(current_runtime_stamp), str(new_runtime))
|
||||
if new_chunk_runtime != current_chunk_runtime_stamp:
|
||||
logger.info(
|
||||
"[Observer] chunk runtime changed (%s -> %s) -> Reload",
|
||||
str(current_chunk_runtime_stamp),
|
||||
str(new_chunk_runtime),
|
||||
)
|
||||
load_all()
|
||||
continue
|
||||
|
||||
if new_tags_runtime != current_tags_runtime_stamp:
|
||||
logger.info(
|
||||
"[Observer] tags runtime changed (%s -> %s) -> Reload",
|
||||
str(current_tags_runtime_stamp),
|
||||
str(new_tags_runtime),
|
||||
)
|
||||
load_all()
|
||||
continue
|
||||
|
||||
if new_tags_index_present != current_tags_index_present:
|
||||
logger.info(
|
||||
"[Observer] tags_index_present changed (%s -> %s) -> Reload",
|
||||
str(current_tags_index_present),
|
||||
str(new_tags_index_present),
|
||||
)
|
||||
load_all()
|
||||
|
||||
except Exception as e:
|
||||
logger.exception("[Observer ERROR] %s", str(e))
|
||||
except Exception as exc:
|
||||
logger.exception("[Observer ERROR] %s", str(exc))
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Global Exception Handler (forces JSON + logs)
|
||||
# Global Exception Handler
|
||||
# ============================================================
|
||||
|
||||
@app.exception_handler(Exception)
|
||||
@@ -427,12 +511,12 @@ async def unhandled_exception_handler(request: Request, exc: Exception):
|
||||
# ============================================================
|
||||
|
||||
@app.on_event("startup")
|
||||
def startup_event():
|
||||
def startup_event() -> None:
|
||||
setup_logging()
|
||||
logger.info("[VectorService] Startup stamp=%s file=%s", SERVICE_STAMP, str(Path(__file__).resolve()))
|
||||
load_all()
|
||||
t = threading.Thread(target=observer_loop, daemon=True)
|
||||
t.start()
|
||||
observer = threading.Thread(target=observer_loop, daemon=True)
|
||||
observer.start()
|
||||
logger.info("[VectorService] Ready (log=%s)", str(LOG_FILE))
|
||||
|
||||
|
||||
@@ -441,7 +525,7 @@ def startup_event():
|
||||
# ============================================================
|
||||
|
||||
@app.get("/health")
|
||||
def health():
|
||||
def health() -> Dict[str, Any]:
|
||||
return {
|
||||
"status": "ok",
|
||||
"stamp": SERVICE_STAMP,
|
||||
@@ -451,7 +535,9 @@ def health():
|
||||
"model_loaded": model is not None,
|
||||
"embedding_model": loaded_embedding_model_name,
|
||||
"index_version": current_index_version,
|
||||
"runtime_stamp": current_runtime_stamp,
|
||||
"chunk_runtime_stamp": current_chunk_runtime_stamp,
|
||||
"tags_runtime_stamp": current_tags_runtime_stamp,
|
||||
"tags_index_present": current_tags_index_present,
|
||||
"tag_meta_type": type(tag_ids).__name__ if tag_ids is not None else None,
|
||||
"tag_meta_len": len(tag_ids) if isinstance(tag_ids, list) else None,
|
||||
"chunk_meta_type": type(chunk_ids).__name__ if chunk_ids is not None else None,
|
||||
@@ -463,17 +549,17 @@ def health():
|
||||
|
||||
|
||||
@app.post("/reload")
|
||||
def reload():
|
||||
def reload() -> Dict[str, str]:
|
||||
try:
|
||||
load_all()
|
||||
return {"status": "reloaded", "stamp": SERVICE_STAMP}
|
||||
except Exception as e:
|
||||
except Exception as exc:
|
||||
logger.exception("reload failed")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
|
||||
|
||||
@app.post("/search-chunks")
|
||||
def search_chunks(req: SearchRequest):
|
||||
def search_chunks(req: SearchRequest) -> List[Dict[str, Any]]:
|
||||
if chunk_index is None or chunk_ids is None or model is None:
|
||||
raise HTTPException(status_code=503, detail="Chunk index not available")
|
||||
|
||||
@@ -491,16 +577,16 @@ def search_chunks(req: SearchRequest):
|
||||
doc_filter: Optional[List[str]] = None
|
||||
if req.doc_ids:
|
||||
doc_filter = []
|
||||
for d in req.doc_ids:
|
||||
dk = _as_key(d)
|
||||
if dk:
|
||||
doc_filter.append(dk)
|
||||
for document_id in req.doc_ids:
|
||||
document_key = _as_key(document_id)
|
||||
if document_key:
|
||||
doc_filter.append(document_key)
|
||||
effective_limit = max(limit * 5, 50)
|
||||
effective_limit = min(effective_limit, 500)
|
||||
|
||||
scores, indices = chunk_index.search(query_vec, effective_limit)
|
||||
|
||||
results = []
|
||||
results: List[Dict[str, Any]] = []
|
||||
for score, idx in zip(scores[0], indices[0]):
|
||||
if idx == -1:
|
||||
continue
|
||||
@@ -512,20 +598,20 @@ def search_chunks(req: SearchRequest):
|
||||
if not chunk_id_key:
|
||||
continue
|
||||
|
||||
doc_id = chunk_doc_map.get(chunk_id_key)
|
||||
document_id = chunk_doc_map.get(chunk_id_key)
|
||||
if doc_filter is not None:
|
||||
if doc_id is None or doc_id not in doc_filter:
|
||||
if document_id is None or document_id not in doc_filter:
|
||||
continue
|
||||
|
||||
payload = {
|
||||
payload: Dict[str, Any] = {
|
||||
"chunk_id": raw_chunk_id,
|
||||
"score": float(score),
|
||||
"document_id": doc_id,
|
||||
"document_id": document_id,
|
||||
}
|
||||
|
||||
ci = chunk_pos_map.get(chunk_id_key)
|
||||
if isinstance(ci, int):
|
||||
payload["chunk_index"] = ci
|
||||
chunk_position = chunk_pos_map.get(chunk_id_key)
|
||||
if isinstance(chunk_position, int):
|
||||
payload["chunk_index"] = chunk_position
|
||||
|
||||
results.append(payload)
|
||||
|
||||
@@ -536,13 +622,13 @@ def search_chunks(req: SearchRequest):
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
except Exception as exc:
|
||||
logger.exception("search-chunks failure")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
|
||||
|
||||
@app.post("/search-tags")
|
||||
def search_tags(req: SearchRequest):
|
||||
def search_tags(req: SearchRequest) -> List[Dict[str, Any]]:
|
||||
if tag_index is None or tag_ids is None or model is None:
|
||||
raise HTTPException(status_code=503, detail="Tag index not available")
|
||||
|
||||
@@ -564,37 +650,47 @@ def search_tags(req: SearchRequest):
|
||||
|
||||
scores, indices = tag_index.search(query_vec, limit)
|
||||
|
||||
results = []
|
||||
results: List[Dict[str, Any]] = []
|
||||
seen_tag_ids = set()
|
||||
|
||||
for score, idx in zip(scores[0], indices[0]):
|
||||
if idx == -1:
|
||||
continue
|
||||
if idx < 0 or idx >= len(tag_ids):
|
||||
continue
|
||||
|
||||
tag_id = tag_ids[idx]
|
||||
tag_id_key = _as_key(tag_id) or ""
|
||||
raw_tag_id = tag_ids[idx]
|
||||
tag_id_key = _as_key(raw_tag_id)
|
||||
if not tag_id_key or tag_id_key in seen_tag_ids:
|
||||
continue
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"tag_id": tag_id,
|
||||
"tag_id": raw_tag_id,
|
||||
"score": float(score),
|
||||
}
|
||||
|
||||
meta = tag_meta_map.get(tag_id_key)
|
||||
if isinstance(meta, dict):
|
||||
label = meta.get("label")
|
||||
ttype = meta.get("tag_type")
|
||||
tag_type = meta.get("tag_type")
|
||||
|
||||
if isinstance(label, str) and label.strip():
|
||||
payload["label"] = label
|
||||
if isinstance(ttype, str) and ttype.strip():
|
||||
payload["tag_type"] = ttype
|
||||
if isinstance(label, str):
|
||||
payload["label"] = label.strip()
|
||||
payload["tag_type"] = _normalize_tag_type(tag_type)
|
||||
else:
|
||||
payload["label"] = ""
|
||||
payload["tag_type"] = "generic"
|
||||
|
||||
results.append(payload)
|
||||
seen_tag_ids.add(tag_id_key)
|
||||
|
||||
if len(results) >= limit:
|
||||
break
|
||||
|
||||
return results
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
except Exception as exc:
|
||||
logger.exception("search-tags failure")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
raise HTTPException(status_code=500, detail=str(exc))
|
||||
Reference in New Issue
Block a user