optimize catalog semantic matches by tags

This commit is contained in:
team2
2026-02-28 16:10:47 +01:00
parent d3294464ea
commit 0d3f6e21d6
13 changed files with 329 additions and 151 deletions

View File

@@ -42,6 +42,9 @@ INDEX_META_PATH = KNOWLEDGE_DIR / "index_meta.json"
# Runtime metadata JSON; load_all() reads its "last_rebuild_at" field.
INDEX_RUNTIME_PATH = KNOWLEDGE_DIR / "index_runtime.json"
# NDJSON index file.
# NOTE(review): presumably the input of load_chunk_maps_from_ndjson() - confirm.
INDEX_NDJSON_PATH = KNOWLEDGE_DIR / "index.ndjson"
# Tags NDJSON (exported by PHP) used to enrich /search-tags responses.
# Read line by line by load_tag_meta_from_tags_ndjson(); a missing file is
# tolerated there and simply yields an empty tag_meta_map.
TAGS_NDJSON_PATH = KNOWLEDGE_DIR / "tags.ndjson"
# ============================================================
# Logging
@@ -111,6 +114,9 @@ chunk_pos_map: Dict[str, int] = {}
# Tag search index; reset to None by load_all().
# NOTE(review): concrete index type is not visible in this part of the file.
tag_index = None
# Position -> tag_id lookup: search_tags() uses tag_ids[idx] to turn an index
# hit into a tag_id, after bounds-checking idx against len(tag_ids).
tag_ids: Optional[List[Any]] = None
# tag_id -> {"label": "...", "tag_type": "..."}
# Rebuilt from scratch by load_tag_meta_from_tags_ndjson(); keys are the
# _as_key()-normalized tag ids, and "label" may be an empty string.
tag_meta_map: Dict[str, Dict[str, str]] = {}
# Reported in the "[Reload] Completed" log line emitted by load_all().
loaded_embedding_model_name: Optional[str] = None
# Set by load_all() to the index version when it is an int, otherwise None.
current_index_version: Optional[int] = None
# Reported in the "[Reload] Completed" log line.
# NOTE(review): presumably derived from "last_rebuild_at" in index_runtime.json - confirm.
current_runtime_stamp: Optional[str] = None
@@ -210,6 +216,61 @@ def load_chunk_maps_from_ndjson() -> None:
logger.warning("Failed to load chunk maps from ndjson: %s", str(e))
def load_tag_meta_from_tags_ndjson() -> None:
    """
    Load minimal tag metadata from tags.ndjson to enrich /search-tags results.

    Expected line format (from PHP exporter / ingester pipeline):
        {"tag_id":"...","text":"LABEL\\nSLUG\\noptional description", ...}

    For every usable row, ``tag_meta_map[key]`` is set to a dict with:
        label    = first line of "text", stripped (fallback: "")
        tag_type = stripped "type" if it is a non-blank string, else "generic"
    where ``key`` is ``_as_key(row["tag_id"])``.

    Skipped individually (the rest of the file is still loaded):
        - blank lines
        - lines that are not valid JSON
        - lines that are valid JSON but not an object (list, string, number, null)
        - rows for which ``_as_key(tag_id)`` is falsy

    The global ``tag_meta_map`` is always reset first. If the file does not
    exist it stays empty (logged at INFO). If reading the file fails for any
    other reason, a warning is logged and the map is reset to empty again so
    that no partially loaded state is left behind.
    """
    global tag_meta_map
    tag_meta_map = {}

    if not TAGS_NDJSON_PATH.exists():
        logger.info("[Reload] tags.ndjson missing -> tag_meta_map empty (%s)", str(TAGS_NDJSON_PATH))
        return

    try:
        with TAGS_NDJSON_PATH.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                # Best effort per line: one corrupt line must not abort the load.
                try:
                    row = json.loads(line)
                except Exception:
                    continue

                # json.loads() can legitimately return a list/str/number/None.
                # Those have no .get(); without this guard the AttributeError
                # would reach the outer handler and wipe the whole map.
                if not isinstance(row, dict):
                    continue

                tag_id = _as_key(row.get("tag_id"))
                if not tag_id:
                    continue

                # Prefer the explicit "type" field when it is a non-blank string.
                raw_type = row.get("type")
                if isinstance(raw_type, str) and raw_type.strip():
                    tag_type = raw_type.strip()
                else:
                    tag_type = "generic"

                # Label is the first line of "text". It may still be empty when
                # the text starts with a newline; the entry is stored anyway so
                # that tag_type remains available.
                label = ""
                txt = row.get("text")
                if isinstance(txt, str) and txt.strip():
                    # txt is non-empty here, so splitlines() has at least one item.
                    label = txt.splitlines()[0].strip()

                tag_meta_map[tag_id] = {"label": label, "tag_type": tag_type}

    except Exception as e:
        logger.warning("Failed to load tag meta from tags.ndjson: %s", str(e))
        tag_meta_map = {}
def _normalize_meta_list(value: Any) -> Optional[List[Any]]:
"""
Accepts:
@@ -282,6 +343,10 @@ def load_all() -> None:
tag_index = None
tag_ids = None
# NEW: load tag meta for enrichment
logger.info("[Reload] Loading tag meta from tags.ndjson")
load_tag_meta_from_tags_ndjson()
runtime = _safe_read_json(INDEX_RUNTIME_PATH)
if isinstance(runtime, dict):
v = runtime.get("last_rebuild_at")
@@ -292,10 +357,11 @@ def load_all() -> None:
current_index_version = index_version if isinstance(index_version, int) else None
logger.info(
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s stamp=%s file=%s)",
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s tag_meta=%s stamp=%s file=%s)",
str(current_index_version),
str(current_runtime_stamp),
str(loaded_embedding_model_name),
str(len(tag_meta_map)),
SERVICE_STAMP,
str(Path(__file__).resolve()),
)
@@ -390,6 +456,8 @@ def health():
"tag_meta_len": len(tag_ids) if isinstance(tag_ids, list) else None,
"chunk_meta_type": type(chunk_ids).__name__ if chunk_ids is not None else None,
"chunk_meta_len": len(chunk_ids) if isinstance(chunk_ids, list) else None,
"tag_meta_map_len": len(tag_meta_map),
"tags_ndjson_path": str(TAGS_NDJSON_PATH),
"log_file": str(LOG_FILE),
}
@@ -502,7 +570,26 @@ def search_tags(req: SearchRequest):
continue
if idx < 0 or idx >= len(tag_ids):
continue
results.append({"tag_id": tag_ids[idx], "score": float(score)})
tag_id = tag_ids[idx]
tag_id_key = _as_key(tag_id) or ""
payload: Dict[str, Any] = {
"tag_id": tag_id,
"score": float(score),
}
meta = tag_meta_map.get(tag_id_key)
if isinstance(meta, dict):
label = meta.get("label")
ttype = meta.get("tag_type")
if isinstance(label, str) and label.strip():
payload["label"] = label
if isinstance(ttype, str) and ttype.strip():
payload["tag_type"] = ttype
results.append(payload)
return results