optimize catalog semantic matches by tags
This commit is contained in:
@@ -42,6 +42,9 @@ INDEX_META_PATH = KNOWLEDGE_DIR / "index_meta.json"
|
||||
INDEX_RUNTIME_PATH = KNOWLEDGE_DIR / "index_runtime.json"
|
||||
INDEX_NDJSON_PATH = KNOWLEDGE_DIR / "index.ndjson"
|
||||
|
||||
# NEW: Tags NDJSON (exported by PHP) used to enrich /search-tags responses
|
||||
TAGS_NDJSON_PATH = KNOWLEDGE_DIR / "tags.ndjson"
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Logging
|
||||
@@ -111,6 +114,9 @@ chunk_pos_map: Dict[str, int] = {}
|
||||
tag_index = None
|
||||
tag_ids: Optional[List[Any]] = None
|
||||
|
||||
# NEW: tag_id -> {"label": "...", "tag_type": "..."}
|
||||
tag_meta_map: Dict[str, Dict[str, str]] = {}
|
||||
|
||||
loaded_embedding_model_name: Optional[str] = None
|
||||
current_index_version: Optional[int] = None
|
||||
current_runtime_stamp: Optional[str] = None
|
||||
@@ -210,6 +216,61 @@ def load_chunk_maps_from_ndjson() -> None:
|
||||
logger.warning("Failed to load chunk maps from ndjson: %s", str(e))
|
||||
|
||||
|
||||
def load_tag_meta_from_tags_ndjson() -> None:
    """
    Load minimal tag metadata from tags.ndjson to enrich /search-tags results.

    Expected line format (from PHP exporter / ingester pipeline):
      {"tag_id":"...","text":"LABEL\\nSLUG\\noptional description", ...}

    For every usable row, an entry is stored in the module-level
    ``tag_meta_map`` under the key returned by ``_as_key(row["tag_id"])``:
      label    = first line of "text", stripped (fallback: "")
      tag_type = "type" if it is a non-blank string (preferred), else "generic"

    Blank lines are ignored. Lines that are not valid JSON, rows that are not
    JSON objects, and rows without a usable "tag_id" are skipped individually
    (and counted in a single warning) so that one bad line cannot discard the
    metadata of every other tag.

    If the file is missing, or reading it fails as a whole, ``tag_meta_map``
    is left empty. The function never raises; it is a best-effort enrichment.
    """
    global tag_meta_map

    # Always start from a clean slate so stale entries never survive a reload.
    tag_meta_map = {}

    if not TAGS_NDJSON_PATH.exists():
        logger.info("[Reload] tags.ndjson missing -> tag_meta_map empty (%s)", str(TAGS_NDJSON_PATH))
        return

    skipped = 0

    try:
        with TAGS_NDJSON_PATH.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                try:
                    row = json.loads(line)
                except (ValueError, RecursionError):
                    # json.JSONDecodeError is a ValueError; skip only this line.
                    skipped += 1
                    continue

                # A line such as `[]` or `"x"` is valid JSON but has no .get();
                # skip it here instead of letting it abort the whole load.
                if not isinstance(row, dict):
                    skipped += 1
                    continue

                tag_id = _as_key(row.get("tag_id"))
                if not tag_id:
                    skipped += 1
                    continue

                # Prefer the explicit "type" field if present.
                ttype = row.get("type")
                if isinstance(ttype, str) and ttype.strip():
                    tag_type = ttype.strip()
                else:
                    tag_type = "generic"

                # The label is the first line of "text"; it may legitimately
                # end up empty (e.g. when the text starts with a newline).
                label = ""
                txt = row.get("text")
                if isinstance(txt, str) and txt.strip():
                    text_lines = txt.splitlines()
                    label = text_lines[0].strip() if text_lines else ""

                tag_meta_map[tag_id] = {"label": label, "tag_type": tag_type}

    except Exception as e:
        # Best-effort boundary: I/O or decoding problems leave the map empty
        # rather than half-populated.
        logger.warning("Failed to load tag meta from tags.ndjson: %s", str(e))
        tag_meta_map = {}
        return

    if skipped:
        logger.warning("[Reload] tags.ndjson: skipped %d unusable line(s)", skipped)
|
||||
|
||||
|
||||
def _normalize_meta_list(value: Any) -> Optional[List[Any]]:
|
||||
"""
|
||||
Accepts:
|
||||
@@ -282,6 +343,10 @@ def load_all() -> None:
|
||||
tag_index = None
|
||||
tag_ids = None
|
||||
|
||||
# NEW: load tag meta for enrichment
|
||||
logger.info("[Reload] Loading tag meta from tags.ndjson")
|
||||
load_tag_meta_from_tags_ndjson()
|
||||
|
||||
runtime = _safe_read_json(INDEX_RUNTIME_PATH)
|
||||
if isinstance(runtime, dict):
|
||||
v = runtime.get("last_rebuild_at")
|
||||
@@ -292,10 +357,11 @@ def load_all() -> None:
|
||||
current_index_version = index_version if isinstance(index_version, int) else None
|
||||
|
||||
logger.info(
|
||||
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s stamp=%s file=%s)",
|
||||
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s tag_meta=%s stamp=%s file=%s)",
|
||||
str(current_index_version),
|
||||
str(current_runtime_stamp),
|
||||
str(loaded_embedding_model_name),
|
||||
str(len(tag_meta_map)),
|
||||
SERVICE_STAMP,
|
||||
str(Path(__file__).resolve()),
|
||||
)
|
||||
@@ -390,6 +456,8 @@ def health():
|
||||
"tag_meta_len": len(tag_ids) if isinstance(tag_ids, list) else None,
|
||||
"chunk_meta_type": type(chunk_ids).__name__ if chunk_ids is not None else None,
|
||||
"chunk_meta_len": len(chunk_ids) if isinstance(chunk_ids, list) else None,
|
||||
"tag_meta_map_len": len(tag_meta_map),
|
||||
"tags_ndjson_path": str(TAGS_NDJSON_PATH),
|
||||
"log_file": str(LOG_FILE),
|
||||
}
|
||||
|
||||
@@ -502,7 +570,26 @@ def search_tags(req: SearchRequest):
|
||||
continue
|
||||
if idx < 0 or idx >= len(tag_ids):
|
||||
continue
|
||||
results.append({"tag_id": tag_ids[idx], "score": float(score)})
|
||||
|
||||
tag_id = tag_ids[idx]
|
||||
tag_id_key = _as_key(tag_id) or ""
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"tag_id": tag_id,
|
||||
"score": float(score),
|
||||
}
|
||||
|
||||
meta = tag_meta_map.get(tag_id_key)
|
||||
if isinstance(meta, dict):
|
||||
label = meta.get("label")
|
||||
ttype = meta.get("tag_type")
|
||||
|
||||
if isinstance(label, str) and label.strip():
|
||||
payload["label"] = label
|
||||
if isinstance(ttype, str) and ttype.strip():
|
||||
payload["tag_type"] = ttype
|
||||
|
||||
results.append(payload)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
Reference in New Issue
Block a user