From 2587ac8b4b369022c0be6e442e270d2f7ad302ae Mon Sep 17 00:00:00 2001 From: team 1 Date: Mon, 20 Apr 2026 16:36:28 +0200 Subject: [PATCH] first commit --- python/vector/vector_ingest_tags.py | 186 ++++-- python/vector/vector_service.py | 458 ++++++++----- src/Catalog/EntityCatalogService.php | 96 +-- src/Command/SystemRebuildCommand.php | 303 +++++---- src/Command/TagHealthCheckCommand.php | 83 ++- src/Command/TagRebuildRunJobCommand.php | 195 +++--- src/Command/TagsExportCommand.php | 42 +- src/Command/TagsRebuildCommand.php | 57 +- src/Config/CatalogIntentConfig.php | 56 +- src/Controller/Admin/DashboardController.php | 24 +- src/Controller/Admin/DocumentController.php | 628 ++++++++++++------ .../Admin/DocumentTagController.php | 87 ++- src/Controller/Admin/IngestJobController.php | 238 +++++-- src/Controller/Admin/TagController.php | 132 ++-- .../Admin/TagRebuildStreamController.php | 67 +- src/Entity/DocumentTag.php | 21 +- src/Entity/Tag.php | 78 ++- src/Entity/TagRebuildJob.php | 77 ++- src/Intent/CatalogIntentLite.php | 126 ++-- src/Service/Admin/DocumentTagAdminService.php | 86 ++- src/Service/Admin/TagAdminService.php | 101 ++- src/Service/DocumentService.php | 93 ++- src/Service/TagRebuildJobService.php | 214 ++++-- src/Service/TagRebuildStatusProvider.php | 69 +- src/Tag/TagNdjsonExporter.php | 246 ++++--- src/Tag/TagRoutingService.php | 209 +++++- src/Tag/TagService.php | 138 ++-- src/Tag/TagTypes.php | 75 ++- src/Tag/TagVectorIndexBuilder.php | 281 ++++++-- src/Tag/TagVectorIndexHealthService.php | 229 +++++-- src/Tag/TagVectorSearchClient.php | 116 +++- templates/admin/dashboard/index.html.twig | 291 ++++---- templates/admin/document/index.html.twig | 305 +++++---- templates/admin/document/new.html.twig | 99 ++- .../admin/document/new_version.html.twig | 134 +++- templates/admin/document/show.html.twig | 389 +++++++---- templates/admin/document_tags/edit.html.twig | 315 ++++++--- templates/admin/job/index.html.twig | 298 ++++++--- templates/admin/job/show.html.twig | 264 +++++--- templates/admin/tag/assign.html.twig | 249 ++++--- templates/admin/tag/index.html.twig | 251 ++++--- 41 files changed, 5126 insertions(+), 2280 deletions(-) diff --git a/python/vector/vector_ingest_tags.py b/python/vector/vector_ingest_tags.py index 2048dc9..bb88919 100644 --- a/python/vector/vector_ingest_tags.py +++ b/python/vector/vector_ingest_tags.py @@ -1,42 +1,44 @@ #!/usr/bin/env python3 -import sys import json +import sys from pathlib import Path +from typing import Any, Dict, List, Tuple + + +def fail(message: str, code: int) -> None: + print(f"ERROR: {message}", file=sys.stderr) + sys.exit(code) + # --------------------------------------------------------- # Positional args # 1 tags.ndjson # 2 out_index_path (can be .tmp) # --------------------------------------------------------- - if len(sys.argv) < 3: - print("ERROR: usage: vector_ingest_tags.py ", file=sys.stderr) - sys.exit(2) + fail("usage: vector_ingest_tags.py ", 2) tags_path = Path(sys.argv[1]).resolve() -out_path = Path(sys.argv[2]).resolve() - +out_path = Path(sys.argv[2]).resolve() meta_path = Path(str(out_path) + ".meta.json") + # --------------------------------------------------------- # Dependency checks # --------------------------------------------------------- try: import faiss except Exception: - print("ERROR: Python module 'faiss' not found.", file=sys.stderr) - sys.exit(10) + fail("Python module 'faiss' not found.", 10) try: from sentence_transformers import SentenceTransformer except Exception: - print("ERROR: Python module 'sentence-transformers' not found.", file=sys.stderr) - sys.exit(11) + fail("Python module 'sentence-transformers' not found.", 11) import numpy as np -import faiss -from sentence_transformers import SentenceTransformer + # --------------------------------------------------------- # Load embedding model from index_meta.json (Single Source of Truth) @@ -45,64 +47,122 @@ BASE_PATH = Path(__file__).resolve().parents[2] INDEX_META_PATH = BASE_PATH / "var" / "knowledge" / "index_meta.json" if not INDEX_META_PATH.exists(): - print("ERROR: index_meta.json not found", file=sys.stderr) - sys.exit(30) + fail("index_meta.json not found", 30) + +try: + meta = json.loads(INDEX_META_PATH.read_text(encoding="utf-8")) +except Exception: + fail("index_meta.json is invalid", 30) -meta = json.loads(INDEX_META_PATH.read_text(encoding="utf-8")) embedding_model = meta.get("embedding_model") +if not isinstance(embedding_model, str) or embedding_model.strip() == "": + fail("embedding_model missing in index_meta.json", 31) -if not embedding_model: - print("ERROR: embedding_model missing in index_meta.json", file=sys.stderr) - sys.exit(31) +model = SentenceTransformer(embedding_model.strip()) -model = SentenceTransformer(embedding_model) # --------------------------------------------------------- # File checks # --------------------------------------------------------- if not tags_path.is_file(): - print(f"ERROR: tags.ndjson not found at {tags_path}", file=sys.stderr) - sys.exit(20) + fail(f"tags.ndjson not found at {tags_path}", 20) out_path.parent.mkdir(parents=True, exist_ok=True) + # --------------------------------------------------------- -# Streaming read NDJSON +# Helpers # --------------------------------------------------------- -texts = [] -ids = [] - -with open(tags_path, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - - try: - entry = json.loads(line) - except Exception: - continue - - text = entry.get("text") - tag_id = entry.get("tag_id") - - if not text or not tag_id: - continue - - text = str(text) - if len(text) > 4000: - text = text[:4000] - - texts.append(f"passage: {text}") - ids.append(str(tag_id)) - -if not texts: +def cleanup_outputs() -> None: if out_path.exists(): out_path.unlink() if meta_path.exists(): meta_path.unlink() + + +def normalize_text(value: Any) -> str: + text = str(value).strip() + text = " ".join(text.split()) + + if len(text) > 4000: + text = text[:4000].rstrip() + + return text + + +# --------------------------------------------------------- +# Streaming read NDJSON +# --------------------------------------------------------- +def load_rows(path: Path) -> Tuple[List[str], List[str], Dict[str, int]]: + texts: List[str] = [] + ids: List[str] = [] + seen_ids = set() + + stats = { + "lines_total": 0, + "lines_empty": 0, + "lines_invalid_json": 0, + "rows_missing_fields": 0, + "rows_duplicate_tag_id": 0, + "rows_accepted": 0, + } + + with path.open("r", encoding="utf-8") as handle: + for line in handle: + stats["lines_total"] += 1 + line = line.strip() + + if line == "": + stats["lines_empty"] += 1 + continue + + try: + entry = json.loads(line) + except Exception: + stats["lines_invalid_json"] += 1 + continue + + if not isinstance(entry, dict): + stats["rows_missing_fields"] += 1 + continue + + tag_id = str(entry.get("tag_id", "")).strip() + text = normalize_text(entry.get("text", "")) + + if tag_id == "" or text == "": + stats["rows_missing_fields"] += 1 + continue + + if tag_id in seen_ids: + stats["rows_duplicate_tag_id"] += 1 + continue + + seen_ids.add(tag_id) + ids.append(tag_id) + texts.append(f"passage: {text}") + stats["rows_accepted"] += 1 + + return texts, ids, stats + + +texts, ids, stats = load_rows(tags_path) + +print( + json.dumps( + { + "event": "tag_rows_loaded", + **stats, + }, + ensure_ascii=False, + ), + file=sys.stderr, +) + +if not texts: + cleanup_outputs() sys.exit(0) + # --------------------------------------------------------- # Build embeddings # --------------------------------------------------------- @@ -110,18 +170,32 @@ embeddings = model.encode( texts, normalize_embeddings=True, show_progress_bar=True, - batch_size=128 + batch_size=128, ) -embeddings = np.array(embeddings).astype("float32") -dim = embeddings.shape[1] +embeddings = np.array(embeddings, dtype="float32") + +if embeddings.ndim != 2 or embeddings.shape[0] != len(ids) or embeddings.shape[0] == 0: + cleanup_outputs() + fail("tag embeddings have invalid shape", 40) + +if embeddings.shape[1] <= 0: + cleanup_outputs() + fail("tag embeddings have invalid dimension", 41) + +dim = int(embeddings.shape[1]) index = faiss.IndexFlatIP(dim) index.add(embeddings) -faiss.write_index(index, str(out_path)) +if int(index.ntotal) != len(ids): + cleanup_outputs() + fail("FAISS tag index count does not match meta ids", 42) -with open(meta_path, "w", encoding="utf-8") as f: - json.dump(ids, f) +faiss.write_index(index, str(out_path)) +meta_path.write_text( + json.dumps(ids, ensure_ascii=False), + encoding="utf-8", +) sys.exit(0) \ No newline at end of file diff --git a/python/vector/vector_service.py b/python/vector/vector_service.py index 4507741..97f45bf 100644 --- a/python/vector/vector_service.py +++ b/python/vector/vector_service.py @@ -6,10 +6,10 @@ from logging.handlers import RotatingFileHandler import threading import time from pathlib import Path -from typing import Any, List, Optional, Dict +from typing import Any, Dict, List, Optional, Tuple -import numpy as np import faiss +import numpy as np from fastapi import FastAPI, HTTPException, Request from fastapi.responses import JSONResponse from pydantic import BaseModel @@ -20,7 +20,7 @@ from sentence_transformers import SentenceTransformer # Service Stamp (to verify you are running THIS file) # ============================================================ -SERVICE_STAMP = "vector_service.py@2026-02-28T10:20+01:00" +SERVICE_STAMP = "vector_service.py@2026-04-20T00:00+02:00" # ============================================================ @@ -41,8 +41,6 @@ TAG_MAP_PATH = KNOWLEDGE_DIR / "vector_tags.index.meta.json" INDEX_META_PATH = KNOWLEDGE_DIR / "index_meta.json" INDEX_RUNTIME_PATH = KNOWLEDGE_DIR / "index_runtime.json" INDEX_NDJSON_PATH = KNOWLEDGE_DIR / "index.ndjson" - -# NEW: Tags NDJSON (exported by PHP) used to enrich /search-tags responses TAGS_NDJSON_PATH = KNOWLEDGE_DIR / "tags.ndjson" @@ -54,6 +52,48 @@ logger = logging.getLogger("vector_service") logger.setLevel(logging.INFO) +# ============================================================ +# App State +# ============================================================ + +app = FastAPI() + +model: Optional[SentenceTransformer] = None +chunk_index = None +chunk_ids: Optional[List[Any]] = None + +chunk_doc_map: Dict[str, str] = {} +chunk_pos_map: Dict[str, int] = {} + +tag_index = None +tag_ids: Optional[List[Any]] = None + +# tag_id -> {"label": "...", "tag_type": "..."} +tag_meta_map: Dict[str, Dict[str, str]] = {} + +loaded_embedding_model_name: Optional[str] = None +current_index_version: Optional[int] = None +current_chunk_runtime_stamp: Optional[str] = None +current_tags_runtime_stamp: Optional[str] = None +current_tags_index_present: Optional[bool] = None + +reload_lock = threading.Lock() + + +# ============================================================ +# Models +# ============================================================ + +class SearchRequest(BaseModel): + query: str + limit: int = 8 + doc_ids: Optional[List[str]] = None + + +# ============================================================ +# Helpers +# ============================================================ + def setup_logging() -> None: LOG_DIR.mkdir(parents=True, exist_ok=True) @@ -77,10 +117,9 @@ def setup_logging() -> None: if not any(isinstance(h, RotatingFileHandler) for h in logger.handlers): logger.addHandler(file_handler) - if not any(isinstance(h, logging.StreamHandler) for h in logger.handlers): + if not any(type(h) is logging.StreamHandler for h in logger.handlers): logger.addHandler(stream_handler) - # Capture uvicorn logs in the same file as well (critical for hidden 500s) uvicorn_error = logging.getLogger("uvicorn.error") uvicorn_access = logging.getLogger("uvicorn.access") @@ -89,62 +128,22 @@ def setup_logging() -> None: if not any(isinstance(h, RotatingFileHandler) for h in uvicorn_error.handlers): uvicorn_error.addHandler(file_handler) - if not any(isinstance(h, logging.StreamHandler) for h in uvicorn_error.handlers): + if not any(type(h) is logging.StreamHandler for h in uvicorn_error.handlers): uvicorn_error.addHandler(stream_handler) if not any(isinstance(h, RotatingFileHandler) for h in uvicorn_access.handlers): uvicorn_access.addHandler(file_handler) - if not any(isinstance(h, logging.StreamHandler) for h in uvicorn_access.handlers): + if not any(type(h) is logging.StreamHandler for h in uvicorn_access.handlers): uvicorn_access.addHandler(stream_handler) -# ============================================================ -# FastAPI -# ============================================================ - -app = FastAPI() - -model: Optional[SentenceTransformer] = None -chunk_index = None -chunk_ids: Optional[List[Any]] = None - -chunk_doc_map: Dict[str, str] = {} -chunk_pos_map: Dict[str, int] = {} - -tag_index = None -tag_ids: Optional[List[Any]] = None - -# NEW: tag_id -> {"label": "...", "tag_type": "..."} -tag_meta_map: Dict[str, Dict[str, str]] = {} - -loaded_embedding_model_name: Optional[str] = None -current_index_version: Optional[int] = None -current_runtime_stamp: Optional[str] = None - -reload_lock = threading.Lock() - - -# ============================================================ -# Models -# ============================================================ - -class SearchRequest(BaseModel): - query: str - limit: int = 8 - doc_ids: Optional[List[str]] = None - - -# ============================================================ -# Loader Helpers -# ============================================================ - def _safe_read_json(path: Path) -> Optional[Any]: try: if not path.exists(): return None return json.loads(path.read_text(encoding="utf-8")) - except Exception as e: - logger.warning("Failed to read json %s: %s", str(path), str(e)) + except Exception as exc: + logger.warning("Failed to read json %s: %s", str(path), str(exc)) return None @@ -152,25 +151,97 @@ def _as_key(value: Any) -> Optional[str]: if value is None: return None if isinstance(value, str): - v = value.strip() - return v if v else None + value = value.strip() + return value or None try: - v = str(value).strip() - return v if v else None + value = str(value).strip() + return value or None except Exception: return None def _sanitize_limit(limit: int, default: int = 8, max_limit: int = 200) -> int: try: - v = int(limit) + value = int(limit) except Exception: return default - if v <= 0: + if value <= 0: return default - if v > max_limit: + if value > max_limit: return max_limit - return v + return value + + +def _normalize_meta_list(value: Any) -> Optional[List[Any]]: + """ + Accepts: + - list: ok + - dict like {"0": "...", "1": "..."}: convert to list sorted by numeric key + Returns None if invalid. + """ + if isinstance(value, list): + return value + + if isinstance(value, dict): + try: + keys = sorted(int(key) for key in value.keys()) + return [value[str(i)] for i in keys] + except Exception: + return None + + return None + + +def _normalize_tag_type(value: Any) -> str: + normalized = _as_key(value) + if normalized is None: + return "generic" + + normalized = normalized.lower() + if normalized in {"generic", "catalog_entity", "sales_signal"}: + return normalized + + return "generic" + + +def _extract_runtime_state(runtime: Any) -> Tuple[Optional[str], Optional[str], Optional[bool]]: + if not isinstance(runtime, dict): + return None, None, None + + chunk_runtime = runtime.get("last_rebuild_at") + tags_runtime = runtime.get("last_tags_rebuild_at") + tags_index_present = runtime.get("tags_index_present") + + if not isinstance(chunk_runtime, str): + chunk_runtime = None + if not isinstance(tags_runtime, str): + tags_runtime = None + if not isinstance(tags_index_present, bool): + tags_index_present = None + + return chunk_runtime, tags_runtime, tags_index_present + + +def _validate_index_alignment(index_obj: Any, ids: Optional[List[Any]], label: str) -> Tuple[Any, Optional[List[Any]]]: + if index_obj is None or ids is None: + return None, None + + try: + index_count = int(index_obj.ntotal) + except Exception: + logger.warning("[Reload] %s index has no ntotal -> disabled", label) + return None, None + + if index_count != len(ids): + logger.warning( + "[Reload] %s meta/index mismatch (ids=%s index=%s) -> disabled", + label, + len(ids), + index_count, + ) + return None, None + + return index_obj, ids def load_chunk_maps_from_ndjson() -> None: @@ -183,8 +254,8 @@ def load_chunk_maps_from_ndjson() -> None: return try: - with INDEX_NDJSON_PATH.open("r", encoding="utf-8") as f: - for line in f: + with INDEX_NDJSON_PATH.open("r", encoding="utf-8") as handle: + for line in handle: line = line.strip() if not line: continue @@ -201,29 +272,32 @@ def load_chunk_maps_from_ndjson() -> None: if doc_id_key: chunk_doc_map[chunk_id_key] = doc_id_key - ci = row.get("chunk_index") - if isinstance(ci, int): - chunk_pos_map[chunk_id_key] = ci - elif isinstance(ci, str): - s = ci.strip() - if s.isdigit(): + chunk_index_value = row.get("chunk_index") + if isinstance(chunk_index_value, int): + chunk_pos_map[chunk_id_key] = chunk_index_value + elif isinstance(chunk_index_value, str): + stripped = chunk_index_value.strip() + if stripped.isdigit(): try: - chunk_pos_map[chunk_id_key] = int(s) + chunk_pos_map[chunk_id_key] = int(stripped) except Exception: pass - - except Exception as e: - logger.warning("Failed to load chunk maps from ndjson: %s", str(e)) + except Exception as exc: + logger.warning("Failed to load chunk maps from ndjson: %s", str(exc)) def load_tag_meta_from_tags_ndjson() -> None: """ Loads minimal tag metadata from tags.ndjson to enrich /search-tags results. - Expected line format (from PHP exporter / ingester pipeline): - {"tag_id":"...","text":"LABEL\\nSLUG\\noptional description", ...} - We extract: - label = first line of "text" (fallback: "") - tag_type = "type" if present (preferred), else "generic" + Expected line format: + { + "tag_id": "...", + "text": "LABEL\\nSLUG\\noptional description", + "type": "catalog_entity|generic|sales_signal", + "document_ids": ["..."] + } + + Only tags with at least one exported document id are kept. """ global tag_meta_map @@ -234,11 +308,12 @@ def load_tag_meta_from_tags_ndjson() -> None: return try: - with TAGS_NDJSON_PATH.open("r", encoding="utf-8") as f: - for line in f: + with TAGS_NDJSON_PATH.open("r", encoding="utf-8") as handle: + for line in handle: line = line.strip() if not line: continue + try: row = json.loads(line) except Exception: @@ -248,55 +323,33 @@ def load_tag_meta_from_tags_ndjson() -> None: if not tag_id: continue - # Prefer explicit fields if present - ttype = row.get("type") - if isinstance(ttype, str) and ttype.strip(): - tag_type = ttype.strip() - else: - tag_type = "generic" + document_ids = row.get("document_ids") + if isinstance(document_ids, list) and len(document_ids) == 0: + continue + tag_type = _normalize_tag_type(row.get("type")) label = "" - txt = row.get("text") - if isinstance(txt, str) and txt.strip(): - first = txt.splitlines()[0].strip() if txt.splitlines() else "" - label = first - if label: - tag_meta_map[tag_id] = {"label": label, "tag_type": tag_type} - else: - tag_meta_map[tag_id] = {"label": "", "tag_type": tag_type} + text_value = row.get("text") + if isinstance(text_value, str) and text_value.strip(): + first_line = text_value.splitlines()[0].strip() if text_value.splitlines() else "" + label = first_line - except Exception as e: - logger.warning("Failed to load tag meta from tags.ndjson: %s", str(e)) + tag_meta_map[tag_id] = { + "label": label, + "tag_type": tag_type, + } + except Exception as exc: + logger.warning("Failed to load tag meta from tags.ndjson: %s", str(exc)) tag_meta_map = {} -def _normalize_meta_list(value: Any) -> Optional[List[Any]]: - """ - Accepts: - - list: ok - - dict like {"0": "...", "1": "..."}: convert to list sorted by numeric key - Returns None if invalid. - """ - if isinstance(value, list): - return value - - if isinstance(value, dict): - try: - keys = sorted(int(k) for k in value.keys()) - return [value[str(i)] for i in keys] - except Exception: - return None - - return None - - def load_all() -> None: global model, chunk_index, chunk_ids global tag_index, tag_ids global loaded_embedding_model_name global current_index_version - global current_runtime_stamp + global current_chunk_runtime_stamp, current_tags_runtime_stamp, current_tags_index_present with reload_lock: meta = _safe_read_json(INDEX_META_PATH) @@ -314,15 +367,21 @@ def load_all() -> None: model = SentenceTransformer(embedding_model_name) loaded_embedding_model_name = embedding_model_name + runtime = _safe_read_json(INDEX_RUNTIME_PATH) + chunk_runtime_stamp, tags_runtime_stamp, tags_index_present = _extract_runtime_state(runtime) + # Chunks if CHUNK_INDEX_PATH.exists() and CHUNK_MAP_PATH.exists(): logger.info("[Reload] Loading chunk index") - chunk_index = faiss.read_index(str(CHUNK_INDEX_PATH)) - raw = _safe_read_json(CHUNK_MAP_PATH) - chunk_ids = _normalize_meta_list(raw) - if chunk_ids is None: + loaded_chunk_index = faiss.read_index(str(CHUNK_INDEX_PATH)) + raw_chunk_meta = _safe_read_json(CHUNK_MAP_PATH) + loaded_chunk_ids = _normalize_meta_list(raw_chunk_meta) + if loaded_chunk_ids is None: chunk_index = None + chunk_ids = None logger.warning("[Reload] chunk_ids meta invalid -> chunk index disabled") + else: + chunk_index, chunk_ids = _validate_index_alignment(loaded_chunk_index, loaded_chunk_ids, "chunk") else: chunk_index = None chunk_ids = None @@ -331,35 +390,38 @@ def load_all() -> None: load_chunk_maps_from_ndjson() # Tags - if TAG_INDEX_PATH.exists() and TAG_MAP_PATH.exists(): + should_load_tag_index = tags_index_present is not False + if should_load_tag_index and TAG_INDEX_PATH.exists() and TAG_MAP_PATH.exists(): logger.info("[Reload] Loading tag index") - tag_index = faiss.read_index(str(TAG_INDEX_PATH)) - raw = _safe_read_json(TAG_MAP_PATH) - tag_ids = _normalize_meta_list(raw) - if tag_ids is None: + loaded_tag_index = faiss.read_index(str(TAG_INDEX_PATH)) + raw_tag_meta = _safe_read_json(TAG_MAP_PATH) + loaded_tag_ids = _normalize_meta_list(raw_tag_meta) + if loaded_tag_ids is None: tag_index = None + tag_ids = None logger.warning("[Reload] tag_ids meta invalid -> tag index disabled") + else: + tag_index, tag_ids = _validate_index_alignment(loaded_tag_index, loaded_tag_ids, "tag") else: tag_index = None tag_ids = None + if tags_index_present is False: + logger.info("[Reload] Runtime marks tags index as absent -> tag index disabled") - # NEW: load tag meta for enrichment logger.info("[Reload] Loading tag meta from tags.ndjson") load_tag_meta_from_tags_ndjson() - runtime = _safe_read_json(INDEX_RUNTIME_PATH) - if isinstance(runtime, dict): - v = runtime.get("last_rebuild_at") - current_runtime_stamp = v if isinstance(v, str) else None - else: - current_runtime_stamp = None - current_index_version = index_version if isinstance(index_version, int) else None + current_chunk_runtime_stamp = chunk_runtime_stamp + current_tags_runtime_stamp = tags_runtime_stamp + current_tags_index_present = tags_index_present logger.info( - "[Reload] Completed (index_version=%s runtime=%s embedding_model=%s tag_meta=%s stamp=%s file=%s)", + "[Reload] Completed (index_version=%s chunk_runtime=%s tags_runtime=%s tags_index_present=%s embedding_model=%s tag_meta=%s stamp=%s file=%s)", str(current_index_version), - str(current_runtime_stamp), + str(current_chunk_runtime_stamp), + str(current_tags_runtime_stamp), + str(current_tags_index_present), str(loaded_embedding_model_name), str(len(tag_meta_map)), SERVICE_STAMP, @@ -373,7 +435,7 @@ def load_all() -> None: def observer_loop() -> None: global current_index_version - global current_runtime_stamp + global current_chunk_runtime_stamp, current_tags_runtime_stamp, current_tags_index_present while True: time.sleep(2) @@ -384,28 +446,50 @@ def observer_loop() -> None: continue new_version = meta.get("index_version") if isinstance(meta.get("index_version"), int) else None - runtime = _safe_read_json(INDEX_RUNTIME_PATH) - new_runtime = None - if isinstance(runtime, dict): - v = runtime.get("last_rebuild_at") - new_runtime = v if isinstance(v, str) else None + new_chunk_runtime, new_tags_runtime, new_tags_index_present = _extract_runtime_state(runtime) if new_version != current_index_version: - logger.info("[Observer] index_version changed (%s -> %s) -> Reload", str(current_index_version), str(new_version)) + logger.info( + "[Observer] index_version changed (%s -> %s) -> Reload", + str(current_index_version), + str(new_version), + ) load_all() continue - if new_runtime != current_runtime_stamp: - logger.info("[Observer] runtime changed (%s -> %s) -> Reload", str(current_runtime_stamp), str(new_runtime)) + if new_chunk_runtime != current_chunk_runtime_stamp: + logger.info( + "[Observer] chunk runtime changed (%s -> %s) -> Reload", + str(current_chunk_runtime_stamp), + str(new_chunk_runtime), + ) + load_all() + continue + + if new_tags_runtime != current_tags_runtime_stamp: + logger.info( + "[Observer] tags runtime changed (%s -> %s) -> Reload", + str(current_tags_runtime_stamp), + str(new_tags_runtime), + ) + load_all() + continue + + if new_tags_index_present != current_tags_index_present: + logger.info( + "[Observer] tags_index_present changed (%s -> %s) -> Reload", + str(current_tags_index_present), + str(new_tags_index_present), + ) load_all() - except Exception as e: - logger.exception("[Observer ERROR] %s", str(e)) + except Exception as exc: + logger.exception("[Observer ERROR] %s", str(exc)) # ============================================================ -# Global Exception Handler (forces JSON + logs) +# Global Exception Handler # ============================================================ @app.exception_handler(Exception) @@ -427,12 +511,12 @@ async def unhandled_exception_handler(request: Request, exc: Exception): # ============================================================ @app.on_event("startup") -def startup_event(): +def startup_event() -> None: setup_logging() logger.info("[VectorService] Startup stamp=%s file=%s", SERVICE_STAMP, str(Path(__file__).resolve())) load_all() - t = threading.Thread(target=observer_loop, daemon=True) - t.start() + observer = threading.Thread(target=observer_loop, daemon=True) + observer.start() logger.info("[VectorService] Ready (log=%s)", str(LOG_FILE)) @@ -441,7 +525,7 @@ def startup_event(): # ============================================================ @app.get("/health") -def health(): +def health() -> Dict[str, Any]: return { "status": "ok", "stamp": SERVICE_STAMP, @@ -451,7 +535,9 @@ def health(): "model_loaded": model is not None, "embedding_model": loaded_embedding_model_name, "index_version": current_index_version, - "runtime_stamp": current_runtime_stamp, + "chunk_runtime_stamp": current_chunk_runtime_stamp, + "tags_runtime_stamp": current_tags_runtime_stamp, + "tags_index_present": current_tags_index_present, "tag_meta_type": type(tag_ids).__name__ if tag_ids is not None else None, "tag_meta_len": len(tag_ids) if isinstance(tag_ids, list) else None, "chunk_meta_type": type(chunk_ids).__name__ if chunk_ids is not None else None, @@ -463,17 +549,17 @@ def health(): @app.post("/reload") -def reload(): +def reload() -> Dict[str, str]: try: load_all() return {"status": "reloaded", "stamp": SERVICE_STAMP} - except Exception as e: + except Exception as exc: logger.exception("reload failed") - raise HTTPException(status_code=500, detail=str(e)) + raise HTTPException(status_code=500, detail=str(exc)) @app.post("/search-chunks") -def search_chunks(req: SearchRequest): +def search_chunks(req: SearchRequest) -> List[Dict[str, Any]]: if chunk_index is None or chunk_ids is None or model is None: raise HTTPException(status_code=503, detail="Chunk index not available") @@ -491,16 +577,16 @@ def search_chunks(req: SearchRequest): doc_filter: Optional[List[str]] = None if req.doc_ids: doc_filter = [] - for d in req.doc_ids: - dk = _as_key(d) - if dk: - doc_filter.append(dk) + for document_id in req.doc_ids: + document_key = _as_key(document_id) + if document_key: + doc_filter.append(document_key) effective_limit = max(limit * 5, 50) effective_limit = min(effective_limit, 500) scores, indices = chunk_index.search(query_vec, effective_limit) - results = [] + results: List[Dict[str, Any]] = [] for score, idx in zip(scores[0], indices[0]): if idx == -1: continue @@ -512,20 +598,20 @@ def search_chunks(req: SearchRequest): if not chunk_id_key: continue - doc_id = chunk_doc_map.get(chunk_id_key) + document_id = chunk_doc_map.get(chunk_id_key) if doc_filter is not None: - if doc_id is None or doc_id not in doc_filter: + if document_id is None or document_id not in doc_filter: continue - payload = { + payload: Dict[str, Any] = { "chunk_id": raw_chunk_id, "score": float(score), - "document_id": doc_id, + "document_id": document_id, } - ci = chunk_pos_map.get(chunk_id_key) - if isinstance(ci, int): - payload["chunk_index"] = ci + chunk_position = chunk_pos_map.get(chunk_id_key) + if isinstance(chunk_position, int): + payload["chunk_index"] = chunk_position results.append(payload) @@ -536,13 +622,13 @@ def search_chunks(req: SearchRequest): except HTTPException: raise - except Exception as e: + except Exception as exc: logger.exception("search-chunks failure") - raise HTTPException(status_code=500, detail=str(e)) + raise HTTPException(status_code=500, detail=str(exc)) @app.post("/search-tags") -def search_tags(req: SearchRequest): +def search_tags(req: SearchRequest) -> List[Dict[str, Any]]: if tag_index is None or tag_ids is None or model is None: raise HTTPException(status_code=503, detail="Tag index not available") @@ -564,37 +650,47 @@ def search_tags(req: SearchRequest): scores, indices = tag_index.search(query_vec, limit) - results = [] + results: List[Dict[str, Any]] = [] + seen_tag_ids = set() + for score, idx in zip(scores[0], indices[0]): if idx == -1: continue if idx < 0 or idx >= len(tag_ids): continue - tag_id = tag_ids[idx] - tag_id_key = _as_key(tag_id) or "" + raw_tag_id = tag_ids[idx] + tag_id_key = _as_key(raw_tag_id) + if not tag_id_key or tag_id_key in seen_tag_ids: + continue payload: Dict[str, Any] = { - "tag_id": tag_id, + "tag_id": raw_tag_id, "score": float(score), } meta = tag_meta_map.get(tag_id_key) if isinstance(meta, dict): label = meta.get("label") - ttype = meta.get("tag_type") + tag_type = meta.get("tag_type") - if isinstance(label, str) and label.strip(): - payload["label"] = label - if isinstance(ttype, str) and ttype.strip(): - payload["tag_type"] = ttype + if isinstance(label, str): + payload["label"] = label.strip() + payload["tag_type"] = _normalize_tag_type(tag_type) + else: + payload["label"] = "" + payload["tag_type"] = "generic" results.append(payload) + seen_tag_ids.add(tag_id_key) + + if len(results) >= limit: + break return results except HTTPException: raise - except Exception as e: + except Exception as exc: logger.exception("search-tags failure") - raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file + raise HTTPException(status_code=500, detail=str(exc)) \ No newline at end of file diff --git a/src/Catalog/EntityCatalogService.php b/src/Catalog/EntityCatalogService.php index a3962c8..4b06ab1 100644 --- a/src/Catalog/EntityCatalogService.php +++ b/src/Catalog/EntityCatalogService.php @@ -4,77 +4,84 @@ declare(strict_types=1); namespace App\Catalog; +use App\Config\CatalogIntentConfig; +use App\Entity\Document; +use App\Tag\TagTypes; use App\Tag\TagVectorSearchClient; use Doctrine\DBAL\Connection; use Symfony\Component\Uid\Uuid; /** - * EntityCatalogService + * Builds deterministic catalog lists from a validated catalog entity term. * - * Deterministische Katalog-Listen auf Basis eines Entity-Terms: - * - TagVectorSearch (Score-Gate + Ambiguity-Check) - * - DB Query auf document_tag + document (ACTIVE) - * - Rückgabe als EIN Textblock (string) oder null (Fallback auf normalen Retrieval) - * - * Schritt-3 Änderung: - * - Headline ist NICHT mehr hardcoded - * - Headline basiert dynamisch auf dem gefundenen Tag + * This service is intentionally conservative: + * - only strong catalog_entity matches may open the catalog path + * - ambiguous matches fall back to normal retrieval + * - only ACTIVE documents are listed */ final class EntityCatalogService { - private const MIN_SCORE = 0.55; - private const AMBIGUITY_DELTA = 0.05; + private const SEARCH_LIMIT = 3; public function __construct( private readonly TagVectorSearchClient $tagVectorClient, - private readonly Connection $connection, - ) {} + private readonly Connection $connection, + ) { + } /** - * @return string|null Textblock oder null (wenn kein sicherer Catalog möglich ist) + * Returns a catalog text block or null when no safe catalog path exists. */ public function listByTerm(string $entityTerm): ?string { $entityTerm = trim($entityTerm); + if ($entityTerm === '') { return null; } - // 1) Tag-Vektorsuche (Top 3 für Ambiguity-Prüfung) - $hits = $this->tagVectorClient->search($entityTerm, 3); + $hits = $this->tagVectorClient->search($entityTerm, self::SEARCH_LIMIT); if ($hits === []) { return null; } $best = $hits[0]; + $bestScore = (float) ($best['score'] ?? 0.0); - $bestScore = isset($best['score']) ? (float)$best['score'] : 0.0; - if ($bestScore < self::MIN_SCORE) { + if ($bestScore < CatalogIntentConfig::MIN_SCORE) { + return null; + } + + if (($best['tag_type'] ?? null) !== TagTypes::CATALOG_ENTITY) { return null; } - // 2) Ambiguity: wenn Top2 zu nah ist → konservativ abbrechen if (isset($hits[1])) { - $secondScore = isset($hits[1]['score']) ? (float)$hits[1]['score'] : 0.0; - if (abs($bestScore - $secondScore) < self::AMBIGUITY_DELTA) { + $secondScore = (float) ($hits[1]['score'] ?? 0.0); + + if (abs($bestScore - $secondScore) < CatalogIntentConfig::AMBIGUITY_DELTA) { return null; } } - $tagHex = (string)($best['tag_id'] ?? ''); - if ($tagHex === '') { + $tagId = trim((string) ($best['tag_id'] ?? '')); + + if ($tagId === '') { return null; } - // OPTIONAL: Falls TagVectorSearchClient künftig tag_label zurückliefert, - // kann das hier direkt verwendet werden. - $tagLabel = isset($best['tag_label']) ? (string)$best['tag_label'] : null; + try { + $tagBinaryId = Uuid::fromString($tagId)->toBinary(); + } catch (\Throwable) { + return null; + } + + $tagLabel = trim((string) ($best['label'] ?? '')); - // 3) DB Query: alle ACTIVE Dokumente zu diesem Tag $rows = $this->connection->fetchAllAssociative( ' - SELECT d.title + SELECT DISTINCT d.title FROM document d INNER JOIN document_tag dt ON dt.document_id = d.id WHERE dt.tag_id = :tagId @@ -82,8 +89,8 @@ final class EntityCatalogService ORDER BY d.title ASC ', [ - 'tagId' => Uuid::fromString($tagHex)->toBinary(), - 'status' => 'ACTIVE', + 'tagId' => $tagBinaryId, + 'status' => Document::STATUS_ACTIVE, ] ); @@ -92,37 +99,42 @@ final class EntityCatalogService } $titles = []; + foreach ($rows as $row) { - $t = trim((string)($row['title'] ?? '')); - if ($t !== '') { - $titles[] = $t; + $title = trim((string) ($row['title'] ?? '')); + + if ($title === '') { + continue; } + + $titles[$title] = $title; } if ($titles === []) { return null; } - return $this->buildTextBlock($tagLabel, $titles); + return $this->buildTextBlock( + $tagLabel !== '' ? $tagLabel : null, + array_values($titles) + ); } /** - * Dynamische Headline: - * - Wenn Tag-Label vorhanden → verwenden - * - Sonst generischer Fallback + * Builds a stable human-readable list block for the prompt. + * + * @param list $titles */ private function buildTextBlock(?string $tagLabel, array $titles): string { $headline = 'Folgende Einträge sind verfügbar:'; - if (\is_string($tagLabel) && \trim($tagLabel) !== '') { - $headline = sprintf( - 'Folgende %s sind verfügbar:', - $tagLabel - ); + if ($tagLabel !== null && trim($tagLabel) !== '') { + $headline = sprintf('Folgende %s sind verfügbar:', trim($tagLabel)); } $lines = []; + foreach ($titles as $title) { $lines[] = '- ' . $title; } diff --git a/src/Command/SystemRebuildCommand.php b/src/Command/SystemRebuildCommand.php index 4dbf736..54f141f 100644 --- a/src/Command/SystemRebuildCommand.php +++ b/src/Command/SystemRebuildCommand.php @@ -1,6 +1,5 @@ getOption('hard')) { $io->error('Safety switch missing: you must pass --hard to run this command.'); $io->writeln('Example: bin/console mto:agent:system:rebuild --hard'); + return Command::FAILURE; } - $dryRun = (bool)$input->getOption('dry-run'); + $dryRun = (bool) $input->getOption('dry-run'); $io->title('mto:agent:system:rebuild --hard'); - // --------------------------------------------------------- - // 1) GLOBAL REINDEX (chunks rewrite + vector rebuild) - // --------------------------------------------------------- + if (!$this->runGlobalReindex($io, $dryRun)) { + return Command::FAILURE; + } + + if (!$this->runTagRebuild($io, $input, $dryRun)) { + return Command::FAILURE; + } + + if (!$this->runVectorServiceReload($io, $input, $dryRun)) { + return Command::FAILURE; + } + + if (!$this->runHealthChecks($io, $input)) { + return Command::FAILURE; + } + + $io->success('System rebuild finished.'); + + return Command::SUCCESS; + } + + private function runGlobalReindex(SymfonyStyle $io, bool $dryRun): bool + { $io->section('1/4 Global reindex (chunks + vector index)'); $job = $this->jobService->startJob( @@ -82,141 +101,181 @@ final class SystemRebuildCommand extends Command try { $this->orchestrator->runExistingJob($job, $dryRun); $io->success('Global reindex completed.'); + + return true; } catch (\Throwable $e) { $io->error('Global reindex failed: ' . $e->getMessage()); - return Command::FAILURE; + + return false; } + } - // --------------------------------------------------------- - // 2) TAG REBUILD (tags.ndjson + vector_tags.index) - // --------------------------------------------------------- - if (!$input->getOption('no-tags')) { - $io->section('2/4 Tag rebuild (tags.ndjson + vector_tags.index)'); - - if ($dryRun) { - $io->note('dry-run enabled: tag rebuild skipped (would export + build tag index).'); - } else { - try { - $export = $this->tagExporter->export(); - - $io->writeln('Exported tags.ndjson'); - $io->writeln('Path: ' . $export['path']); - $io->writeln('Tags: ' . $export['tags']); - $io->writeln('Lines: ' . $export['lines']); - $io->writeln('Bytes: ' . $export['bytes']); - - $this->tagIndexBuilder->build(); - $io->writeln('Built vector_tags.index'); - - $this->metaManager->touchRuntime([ - 'last_tags_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM), - ]); - $io->success('Tag rebuild completed.'); - } catch (\Throwable $e) { - $io->error('Tag rebuild failed: ' . $e->getMessage()); - return Command::FAILURE; - } - } - } else { + private function runTagRebuild(SymfonyStyle $io, InputInterface $input, bool $dryRun): bool + { + if ((bool) $input->getOption('no-tags')) { $io->section('2/4 Tag rebuild'); $io->note('Skipped due to --no-tags.'); + + return true; } - // --------------------------------------------------------- - // 3) VECTOR SERVICE (install deps + start + reload) - // --------------------------------------------------------- - if (!$input->getOption('no-reload')) { - $io->section('3/4 Vector service reload (uvicorn)'); + $io->section('2/4 Tag rebuild (tags.ndjson + vector_tags.index)'); - if ($dryRun) { - $io->note('dry-run enabled: service reload skipped.'); - } else { - $cmd = [ - '.venv/bin/python', - 'python/vector/vector_control.py', - '--install', - '--start', - '--reload', - '--port', '8090', - '--host', '0.0.0.0' - ]; + if ($dryRun) { + $io->note('dry-run enabled: tag rebuild skipped (would export + build tag index).'); - $process = new Process($cmd, $this->projectDir); - $process->setTimeout(600); - $process->run(); + return true; + } - $out = trim($process->getOutput()); - $err = trim($process->getErrorOutput()); + try { + $export = $this->tagExporter->export(); - if ($out !== '') { - $io->writeln($out); - } - if ($err !== '') { - $io->writeln('' . $err . ''); - } + $io->writeln('Exported tags.ndjson'); + $io->writeln('Path: ' . (string) $export['path']); + $io->writeln('Tags: ' . (string) $export['tags']); + $io->writeln('Lines: ' . (string) $export['lines']); + $io->writeln('Bytes: ' . (string) $export['bytes']); - if (!$process->isSuccessful()) { - $io->error('Vector service reload failed (non-zero exit code).'); - return Command::FAILURE; - } + $this->tagIndexBuilder->build(); - $io->success('Vector service reloaded.'); - } - } else { + $io->success('Tag rebuild completed.'); + + return true; + } catch (\Throwable $e) { + $io->error('Tag rebuild failed: ' . $e->getMessage()); + + return false; + } + } + + private function runVectorServiceReload(SymfonyStyle $io, InputInterface $input, bool $dryRun): bool + { + if ((bool) $input->getOption('no-reload')) { $io->section('3/4 Vector service reload'); $io->note('Skipped due to --no-reload.'); + + return true; } - // --------------------------------------------------------- - // 4) HEALTH CHECK (NDJSON vs vector meta) - // --------------------------------------------------------- - if (!$input->getOption('no-health')) { - $io->section('4/4 Health check'); + $io->section('3/4 Vector service reload (uvicorn)'); - try { - $report = $this->health->check(); - } catch (\Throwable $e) { - $io->error('Health check failed: ' . $e->getMessage()); - return Command::FAILURE; - } + if ($dryRun) { + $io->note('dry-run enabled: service reload skipped.'); - try { - $reportTag = $this->tagHealth->check(); - } catch (\Throwable $e) { - $io->error('Tag health check failed: ' . $e->getMessage()); - return Command::FAILURE; - } + return true; + } - $io->definitionList( - ['ndjson_exists' => $report['ndjson_exists'] ? 'yes' : 'no'], - ['ndjson_chunk_count' => (string)$report['ndjson_chunk_count']], - ['vector_exists' => $report['vector_exists'] ? 'yes' : 'no'], - ['meta_exists' => $report['meta_exists'] ? 'yes' : 'no'], - ['vector_chunk_count' => (string)$report['vector_chunk_count']], - ['status' => (string)$report['status']], - ); + $cmd = [ + '.venv/bin/python', + 'python/vector/vector_control.py', + '--install', + '--start', + '--reload', + '--port', '8090', + '--host', '0.0.0.0', + ]; - $io->definitionList( - ['tags_ndjson_exists' => $reportTag['tags_ndjson_exists'] ? 'yes' : 'no'], - ['tags_ndjson_count' => (string)$reportTag['tags_ndjson_count']], - ['tag_vector_exists' => $reportTag['vector_exists'] ? 'yes' : 'no'], - ['tag_meta_exists' => $reportTag['meta_exists'] ? 'yes' : 'no'], - ['vector_tag_count' => (string)$reportTag['vector_tag_count']], - ['status' => (string)$reportTag['status']], - ); + $process = new Process($cmd, $this->projectDir); + $process->setTimeout(600); + $process->run(); - if (!in_array($report['status'], ['OK', 'OK_EMPTY'], true)) { - $io->error('Health check not OK: ' . $report['status']); - return Command::FAILURE; - } + $stdout = trim($process->getOutput()); + $stderr = trim($process->getErrorOutput()); - $io->success('Health check OK.'); - } else { + if ($stdout !== '') { + $io->writeln($stdout); + } + + if ($stderr !== '') { + $io->writeln('' . $stderr . ''); + } + + if (!$process->isSuccessful()) { + $io->error('Vector service reload failed (non-zero exit code).'); + + return false; + } + + $io->success('Vector service reloaded.'); + + return true; + } + + private function runHealthChecks(SymfonyStyle $io, InputInterface $input): bool + { + if ((bool) $input->getOption('no-health')) { $io->section('4/4 Health check'); $io->note('Skipped due to --no-health.'); + + return true; } - $io->success('System rebuild finished.'); - return Command::SUCCESS; + $io->section('4/4 Health check'); + + try { + $chunkReport = $this->health->check(); + } catch (\Throwable $e) { + $io->error('Health check failed: ' . $e->getMessage()); + + return false; + } + + try { + $tagReport = $this->tagHealth->check(); + } catch (\Throwable $e) { + $io->error('Tag health check failed: ' . $e->getMessage()); + + return false; + } + + $this->renderChunkHealth($io, $chunkReport); + $this->renderTagHealth($io, $tagReport); + + if (!$this->isHealthOk((string) ($chunkReport['status'] ?? 'UNKNOWN'))) { + $io->error('Chunk health check not OK: ' . (string) ($chunkReport['status'] ?? 'UNKNOWN')); + + return false; + } + + if (!$this->isHealthOk((string) ($tagReport['status'] ?? 'UNKNOWN'))) { + $io->error('Tag health check not OK: ' . (string) ($tagReport['status'] ?? 'UNKNOWN')); + + return false; + } + + $io->success('Health check OK.'); + + return true; + } + + private function renderChunkHealth(SymfonyStyle $io, array $report): void + { + $io->definitionList( + ['ndjson_exists' => !empty($report['ndjson_exists']) ? 'yes' : 'no'], + ['ndjson_chunk_count' => (string) ($report['ndjson_chunk_count'] ?? 0)], + ['vector_exists' => !empty($report['vector_exists']) ? 'yes' : 'no'], + ['meta_exists' => !empty($report['meta_exists']) ? 'yes' : 'no'], + ['vector_chunk_count' => (string) ($report['vector_chunk_count'] ?? 0)], + ['status' => (string) ($report['status'] ?? 'UNKNOWN')], + ); + } + + private function renderTagHealth(SymfonyStyle $io, array $report): void + { + $io->definitionList( + ['tags_ndjson_exists' => !empty($report['tags_ndjson_exists']) ? 'yes' : 'no'], + ['tags_ndjson_count' => (string) ($report['tags_ndjson_count'] ?? 0)], + ['tag_vector_exists' => !empty($report['vector_exists']) ? 'yes' : 'no'], + ['tag_meta_exists' => !empty($report['meta_exists']) ? 'yes' : 'no'], + ['vector_tag_count' => (string) ($report['vector_tag_count'] ?? 0)], + ['tags_with_active_document_ids' => (string) ($report['tags_with_active_document_ids'] ?? 0)], + ['meta_valid' => !empty($report['meta_valid']) ? 'yes' : 'no'], + ['status' => (string) ($report['status'] ?? 'UNKNOWN')], + ); + } + + private function isHealthOk(string $status): bool + { + return in_array($status, ['OK', 'OK_EMPTY'], true); } } \ No newline at end of file diff --git a/src/Command/TagHealthCheckCommand.php b/src/Command/TagHealthCheckCommand.php index d3737f5..090f765 100644 --- a/src/Command/TagHealthCheckCommand.php +++ b/src/Command/TagHealthCheckCommand.php @@ -8,11 +8,13 @@ use App\Tag\TagVectorIndexHealthService; use Symfony\Component\Console\Attribute\AsCommand; use Symfony\Component\Console\Command\Command; use Symfony\Component\Console\Input\InputInterface; +use Symfony\Component\Console\Input\InputOption; use Symfony\Component\Console\Output\OutputInterface; +use Symfony\Component\Console\Style\SymfonyStyle; #[AsCommand( name: 'mto:agent:tag:health', - description: 'Health-Check für TAG/FAISS Konsistenz' + description: 'Health-Check für Tag-/FAISS-Konsistenz' )] final class TagHealthCheckCommand extends Command { @@ -22,14 +24,87 @@ final class TagHealthCheckCommand extends Command parent::__construct(); } + protected function configure(): void + { + $this->addOption( + 'summary', + null, + InputOption::VALUE_NONE, + 'Gibt eine lesbare Zusammenfassung statt JSON aus.' + ); + } + protected function execute(InputInterface $input, OutputInterface $output): int { $result = $this->health->check(); + $status = trim((string) ($result['status'] ?? '')); - $output->writeln(json_encode($result, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)); + if ($status === '') { + $status = 'UNKNOWN'; + $result['status'] = $status; + $result['error'] = 'Health service returned no status.'; + } - return str_starts_with($result['status'], 'OK') + if ((bool) $input->getOption('summary')) { + $this->renderSummary(new SymfonyStyle($input, $output), $result); + } else { + $this->renderJson($output, $result); + } + + return $this->isHealthy($status) ? Command::SUCCESS : Command::FAILURE; } -} + + /** + * @param array $result + */ + private function renderSummary(SymfonyStyle $io, array $result): void + { + $io->title('Tag Vector Health'); + + $io->definitionList( + ['status' => (string) ($result['status'] ?? 'UNKNOWN')], + ['tags_ndjson_exists' => !empty($result['tags_ndjson_exists']) ? 'yes' : 'no'], + ['tags_ndjson_count' => (string) ($result['tags_ndjson_count'] ?? 0)], + ['vector_exists' => !empty($result['vector_exists']) ? 'yes' : 'no'], + ['meta_exists' => !empty($result['meta_exists']) ? 'yes' : 'no'], + ['vector_tag_count' => (string) ($result['vector_tag_count'] ?? 0)], + ['meta_valid' => !empty($result['meta_valid']) ? 'yes' : 'no'], + ['tags_with_active_document_ids' => (string) ($result['tags_with_active_document_ids'] ?? 0)], + ); + + if (!empty($result['error'])) { + $io->warning((string) $result['error']); + } + } + + /** + * @param array $result + */ + private function renderJson(OutputInterface $output, array $result): void + { + $json = json_encode( + $result, + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE + ); + + if (!is_string($json)) { + $json = json_encode([ + 'status' => 'UNKNOWN', + 'error' => 'json_encode_failed', + ], JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE); + + if (!is_string($json)) { + $json = "{\"status\":\"UNKNOWN\",\"error\":\"json_encode_failed\"}"; + } + } + + $output->writeln($json); + } + + private function isHealthy(string $status): bool + { + return in_array($status, ['OK', 'OK_EMPTY'], true); + } +} \ No newline at end of file diff --git a/src/Command/TagRebuildRunJobCommand.php b/src/Command/TagRebuildRunJobCommand.php index 9e1592b..97c9b99 100644 --- a/src/Command/TagRebuildRunJobCommand.php +++ b/src/Command/TagRebuildRunJobCommand.php @@ -14,6 +14,7 @@ use Symfony\Component\Console\Input\InputArgument; use Symfony\Component\Console\Input\InputInterface; use Symfony\Component\Console\Input\InputOption; use Symfony\Component\Console\Output\OutputInterface; +use Symfony\Component\Console\Style\SymfonyStyle; #[AsCommand( name: 'mto:agent:tags:job:run', @@ -39,112 +40,152 @@ final class TagRebuildRunJobCommand extends Command protected function execute(InputInterface $input, OutputInterface $output): int { - $jobId = $input->getArgument('jobId'); + $io = new SymfonyStyle($input, $output); + + $jobId = trim((string) $input->getArgument('jobId')); $create = (bool) $input->getOption('create'); - if (!$create && !$jobId) { - $output->writeln('You must provide either a jobId or use --create.'); + if (!$create && $jobId === '') { + $io->error('You must provide either a jobId or use --create.'); + return Command::FAILURE; } - if ($create && $jobId) { - $output->writeln('Use either jobId OR --create, not both.'); + if ($create && $jobId !== '') { + $io->error('Use either jobId OR --create, not both.'); + return Command::FAILURE; } - if ($create) { - $job = new TagRebuildJob(); - $this->em->persist($job); - $this->em->flush(); - $jobId = $job->getId(); - $output->writeln('Created new TagRebuildJob: ' . $jobId . ''); - } else { - /** @var TagRebuildJob|null $job */ - $job = $this->em->getRepository(TagRebuildJob::class)->find($jobId); - - if (!$job instanceof TagRebuildJob) { - $output->writeln('Job not found.'); - return Command::FAILURE; - } - } - - $fh = null; + $job = null; + $lockHandle = null; try { - // --------------------------------------------------------- - // LOCK INITIALIZATION - // --------------------------------------------------------- - $lockDir = \dirname($this->lockFilePath); + $job = $create ? $this->createJob($io) : $this->findJob($jobId); + $lockHandle = $this->acquireLock(); - if (!\is_dir($lockDir) && !@\mkdir($lockDir, 0775, true) && !\is_dir($lockDir)) { - throw new \RuntimeException('Cannot create lock directory.'); - } - - $fh = @\fopen($this->lockFilePath, 'c+'); - if (!$fh) { - throw new \RuntimeException('Cannot open lock file: ' . $this->lockFilePath); - } - - if (!@\flock($fh, LOCK_EX | LOCK_NB)) { - throw new \RuntimeException('Another tag rebuild is currently running (lock busy).'); - } - - // --------------------------------------------------------- - // MARK RUNNING - // --------------------------------------------------------- $job->markRunning(); $this->em->flush(); - // --------------------------------------------------------- - // EXPORT TAGS (NDJSON) - // --------------------------------------------------------- $export = $this->exporter->export(); + $this->assertValidExport($export); - if ( - !isset($export['path']) || - !\is_string($export['path']) || - !\file_exists($export['path']) - ) { - throw new \RuntimeException('Export failed: NDJSON file missing.'); - } + $io->writeln('tags.ndjson exported'); + $io->writeln('Path: ' . (string) $export['path']); + $io->writeln('Tags: ' . (string) ($export['tags'] ?? 0)); + $io->writeln('Lines: ' . (string) ($export['lines'] ?? 0)); + $io->writeln('Bytes: ' . (string) ($export['bytes'] ?? 0)); - if (isset($export['count']) && (int) $export['count'] === 0) { - throw new \RuntimeException('Export produced zero tags.'); - } - - // --------------------------------------------------------- - // BUILD VECTOR INDEX - // --------------------------------------------------------- $this->builder->build(); - // --------------------------------------------------------- - // MARK COMPLETED - // --------------------------------------------------------- $job->markCompleted(); $this->em->flush(); - $output->writeln('Tag rebuild successful.'); - $output->writeln('NDJSON: ' . $export['path']); + $io->success('Tag rebuild successful.'); return Command::SUCCESS; - } - catch (\Throwable $e) { - - if (isset($job)) { - $job->markFailed($e->getMessage()); + } catch (\Throwable $e) { + if ($job instanceof TagRebuildJob) { + $job->markFailed($this->buildSafeErrorMessage($e)); $this->em->flush(); } - $output->writeln('FAILED: ' . $e->getMessage() . ''); + $io->error('FAILED: ' . $e->getMessage()); return Command::FAILURE; - } - finally { - - if ($fh) { - @\flock($fh, LOCK_UN); - @\fclose($fh); - } + } finally { + $this->releaseLock($lockHandle); } } + + private function createJob(SymfonyStyle $io): TagRebuildJob + { + $job = new TagRebuildJob(); + $this->em->persist($job); + $this->em->flush(); + + $io->writeln('Created new TagRebuildJob: ' . (string) $job->getId() . ''); + + return $job; + } + + private function findJob(string $jobId): TagRebuildJob + { + /** @var TagRebuildJob|null $job */ + $job = $this->em->getRepository(TagRebuildJob::class)->find($jobId); + + if (!$job instanceof TagRebuildJob) { + throw new \RuntimeException('Job not found.'); + } + + return $job; + } + + /** + * @return resource + */ + private function acquireLock() + { + $lockDir = \dirname($this->lockFilePath); + + if (!\is_dir($lockDir) && !@\mkdir($lockDir, 0775, true) && !\is_dir($lockDir)) { + throw new \RuntimeException('Cannot create lock directory.'); + } + + $handle = @\fopen($this->lockFilePath, 'c+'); + + if ($handle === false) { + throw new \RuntimeException('Cannot open lock file: ' . $this->lockFilePath); + } + + if (!@\flock($handle, LOCK_EX | LOCK_NB)) { + @\fclose($handle); + throw new \RuntimeException('Another tag rebuild is currently running (lock busy).'); + } + + return $handle; + } + + /** + * @param resource|null $handle + */ + private function releaseLock($handle): void + { + if (!is_resource($handle)) { + return; + } + + @\flock($handle, LOCK_UN); + @\fclose($handle); + } + + /** + * @param array $export + */ + private function assertValidExport(array $export): void + { + $path = trim((string) ($export['path'] ?? '')); + + if ($path === '' || !\is_file($path)) { + throw new \RuntimeException('Export failed: NDJSON file missing.'); + } + + $tags = (int) ($export['tags'] ?? 0); + $lines = (int) ($export['lines'] ?? 0); + + if ($tags < 0 || $lines < 0) { + throw new \RuntimeException('Export returned invalid statistics.'); + } + } + + private function buildSafeErrorMessage(\Throwable $e): string + { + $message = trim($e->getMessage()); + + if ($message === '') { + return 'Unknown tag rebuild failure.'; + } + + return mb_substr($message, 0, 4000); + } } \ No newline at end of file diff --git a/src/Command/TagsExportCommand.php b/src/Command/TagsExportCommand.php index 5d3bd27..613e1a9 100644 --- a/src/Command/TagsExportCommand.php +++ b/src/Command/TagsExportCommand.php @@ -9,6 +9,7 @@ use Symfony\Component\Console\Attribute\AsCommand; use Symfony\Component\Console\Command\Command; use Symfony\Component\Console\Input\InputInterface; use Symfony\Component\Console\Output\OutputInterface; +use Symfony\Component\Console\Style\SymfonyStyle; #[AsCommand( name: 'mto:agent:tags:export', @@ -17,26 +18,51 @@ use Symfony\Component\Console\Output\OutputInterface; final class TagsExportCommand extends Command { public function __construct( - private TagNdjsonExporter $exporter, + private readonly TagNdjsonExporter $exporter, ) { parent::__construct(); } protected function execute(InputInterface $input, OutputInterface $output): int { + $io = new SymfonyStyle($input, $output); + try { $result = $this->exporter->export(); + $this->assertValidExport($result); + + $io->writeln('Tags NDJSON exported'); + $io->writeln('Path: ' . (string) ($result['path'] ?? '')); + $io->writeln('Tags: ' . (string) ($result['tags'] ?? 0)); + $io->writeln('Lines: ' . (string) ($result['lines'] ?? 0)); + $io->writeln('Bytes: ' . (string) ($result['bytes'] ?? 0)); + $io->success('Tag export completed.'); + + return Command::SUCCESS; } catch (\Throwable $e) { - $output->writeln('ERROR: ' . $e->getMessage() . ''); + $io->error($e->getMessage()); + return Command::FAILURE; } + } - $output->writeln('Tags NDJSON exported'); - $output->writeln('Path: ' . $result['path']); - $output->writeln('Tags: ' . $result['tags']); - $output->writeln('Lines: ' . $result['lines']); - $output->writeln('Bytes: ' . $result['bytes']); + /** + * @param array $result + */ + private function assertValidExport(array $result): void + { + $path = trim((string) ($result['path'] ?? '')); - return Command::SUCCESS; + if ($path === '' || !is_file($path)) { + throw new \RuntimeException('Tag export failed: tags.ndjson is missing.'); + } + + $tags = (int) ($result['tags'] ?? 0); + $lines = (int) ($result['lines'] ?? 0); + $bytes = (int) ($result['bytes'] ?? 0); + + if ($tags < 0 || $lines < 0 || $bytes < 0) { + throw new \RuntimeException('Tag export returned invalid statistics.'); + } } } \ No newline at end of file diff --git a/src/Command/TagsRebuildCommand.php b/src/Command/TagsRebuildCommand.php index 15fb8fb..b42d1a4 100644 --- a/src/Command/TagsRebuildCommand.php +++ b/src/Command/TagsRebuildCommand.php @@ -4,13 +4,13 @@ declare(strict_types=1); namespace App\Command; -use App\Index\IndexMetaManager; use App\Tag\TagNdjsonExporter; use App\Tag\TagVectorIndexBuilder; use Symfony\Component\Console\Attribute\AsCommand; use Symfony\Component\Console\Command\Command; use Symfony\Component\Console\Input\InputInterface; use Symfony\Component\Console\Output\OutputInterface; +use Symfony\Component\Console\Style\SymfonyStyle; #[AsCommand( name: 'mto:agent:tags:rebuild', @@ -21,45 +21,54 @@ final class TagsRebuildCommand extends Command public function __construct( private readonly TagNdjsonExporter $exporter, private readonly TagVectorIndexBuilder $builder, - private readonly IndexMetaManager $metaManager, ) { parent::__construct(); } protected function execute(InputInterface $input, OutputInterface $output): int { + $io = new SymfonyStyle($input, $output); + try { - // ----------------------------------------- - // 1) Export tags.ndjson - // ----------------------------------------- $export = $this->exporter->export(); + $this->assertValidExport($export); - $output->writeln('1/3 Exported tags.ndjson'); - $output->writeln('Path: ' . $export['path']); - $output->writeln('Tags: ' . $export['tags']); - $output->writeln('Lines: ' . $export['lines']); - $output->writeln('Bytes: ' . $export['bytes']); + $io->writeln('1/2 Exported tags.ndjson'); + $io->writeln('Path: ' . (string) ($export['path'] ?? '')); + $io->writeln('Tags: ' . (string) ($export['tags'] ?? 0)); + $io->writeln('Lines: ' . (string) ($export['lines'] ?? 0)); + $io->writeln('Bytes: ' . (string) ($export['bytes'] ?? 0)); - // ----------------------------------------- - // 2) Build FAISS tag index - // ----------------------------------------- $this->builder->build(); - $output->writeln('2/3 Built vector_tags.index'); + $io->writeln('2/2 Built vector_tags.index'); + $io->success('Tag rebuild completed.'); - // ----------------------------------------- - // 3) Enterprise Commit Marker - // ----------------------------------------- - $this->metaManager->touchRuntime([ - 'last_tags_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM), - ]); - - $output->writeln('3/3 Runtime commit marker updated'); + return Command::SUCCESS; } catch (\Throwable $e) { - $output->writeln('ERROR: ' . $e->getMessage() . ''); + $io->error($e->getMessage()); + return Command::FAILURE; } + } - return Command::SUCCESS; + /** + * @param array $export + */ + private function assertValidExport(array $export): void + { + $path = trim((string) ($export['path'] ?? '')); + + if ($path === '' || !is_file($path)) { + throw new \RuntimeException('Tag export failed: tags.ndjson is missing.'); + } + + $tags = (int) ($export['tags'] ?? 0); + $lines = (int) ($export['lines'] ?? 0); + $bytes = (int) ($export['bytes'] ?? 0); + + if ($tags < 0 || $lines < 0 || $bytes < 0) { + throw new \RuntimeException('Tag export returned invalid statistics.'); + } } } \ No newline at end of file diff --git a/src/Config/CatalogIntentConfig.php b/src/Config/CatalogIntentConfig.php index 2eed8c0..7a5b2ea 100644 --- a/src/Config/CatalogIntentConfig.php +++ b/src/Config/CatalogIntentConfig.php @@ -1,12 +1,62 @@ = self::MIN_SCORE; + } + + public static function isAmbiguous(float $bestScore, float $secondScore): bool + { + return abs($bestScore - $secondScore) < self::AMBIGUITY_DELTA; + } + + public static function clampScore(float $score): float + { + return max(self::MIN_ALLOWED_SCORE, min(self::MAX_ALLOWED_SCORE, $score)); + } + + private function __construct() + { + } } \ No newline at end of file diff --git a/src/Controller/Admin/DashboardController.php b/src/Controller/Admin/DashboardController.php index bd11017..7a18355 100644 --- a/src/Controller/Admin/DashboardController.php +++ b/src/Controller/Admin/DashboardController.php @@ -1,5 +1,6 @@ redirectToRoute('admin_dashboard'); } - - #[Route('/admin/dashboard', name: 'admin_dashboard')] - public function dashboard(IndexMetaManager $metaManager,VectorIndexHealthService $health,TagVectorIndexHealthService $tagHealth): Response - { - $chunkCount = $metaManager->getRuntimeChunkCount(); - $limit = IngestFlow::CHUNK_LIMIT_HARD; - + #[Route('/admin/dashboard', name: 'admin_dashboard', methods: ['GET'])] + public function dashboard( + IndexMetaManager $metaManager, + VectorIndexHealthService $health, + TagVectorIndexHealthService $tagHealth + ): Response { return $this->render('admin/dashboard/index.html.twig', [ - 'chunkCount' => $chunkCount, - 'chunkLimit' => $limit, + 'chunkCount' => $metaManager->getRuntimeChunkCount(), + 'chunkLimit' => IngestFlow::CHUNK_LIMIT_HARD, 'vectorHealth' => $health->check(), 'tagVectorHealth' => $tagHealth->check(), ]); } - - -} +} \ No newline at end of file diff --git a/src/Controller/Admin/DocumentController.php b/src/Controller/Admin/DocumentController.php index 80aedee..60d2cf1 100644 --- a/src/Controller/Admin/DocumentController.php +++ b/src/Controller/Admin/DocumentController.php @@ -1,10 +1,13 @@ getRepository(Document::class) @@ -46,115 +51,106 @@ class DocumentController extends AbstractController #[Route( '/{id}', name: 'admin_document_show', - requirements: ['id' => '[0-9a-fA-F\-]{36}'] + requirements: ['id' => '[0-9a-fA-F\-]{36}'], + methods: ['GET'] )] public function show(string $id, EntityManagerInterface $em): Response { - try { - $uuid = Uuid::fromString($id); - } catch (\Exception) { - throw new NotFoundHttpException(); - } - - $document = $em->getRepository(Document::class)->find($uuid); - - if (!$document) { - $this->addFlash('danger', 'Das Dokument existiert nicht mehr.'); - } - return $this->render('admin/document/show.html.twig', [ - 'document' => $document, + 'document' => $this->findDocument($id, $em), ]); } - #[Route('/new', name: 'admin_document_new')] + #[Route('/new', name: 'admin_document_new', methods: ['GET', 'POST'])] public function new( - Request $request, - DocumentService $documentService, - FormatText $formatText, - IngestJobService $jobService, - ParameterBagInterface $params + Request $request, + DocumentService $documentService, + FormatText $formatText, + IngestJobService $jobService, + ParameterBagInterface $params, + EntityManagerInterface $em, ): Response { if (!$request->isMethod('POST')) { return $this->render('admin/document/new.html.twig'); } - /** @var UploadedFile|null $file */ - $file = $request->files->get('file'); - if (!$file instanceof UploadedFile) { - throw new \InvalidArgumentException('No valid file uploaded.'); - } + if (!$this->isCsrfTokenValid('create_document', (string) $request->request->get('_token'))) { + $this->addFlash('danger', 'Ungültiges CSRF-Token.'); - $rawTitle = $request->request->get('title'); - $title = is_string($rawTitle) && $rawTitle !== '' - ? $rawTitle - : $formatText->slugify($file->getClientOriginalName()); - - if (!$title) { - $this->addFlash('error', 'Titel ist erforderlich.'); return $this->redirectToRoute('admin_document_new'); } - $uploadDir = (string)$params->get('mto.vector.data.upload.path'); - $this->ensureDir($uploadDir); + /** @var UploadedFile|null $file */ + $file = $request->files->get('file'); + if (!$file instanceof UploadedFile) { + $this->addFlash('danger', 'Keine gültige Datei hochgeladen.'); - $newFilename = uniqid('', true) . '_' . $file->getClientOriginalName(); + return $this->redirectToRoute('admin_document_new'); + } + + $title = $this->resolveDocumentTitle($request, $file, $formatText); + if ($title === '') { + $this->addFlash('danger', 'Titel ist erforderlich.'); + + return $this->redirectToRoute('admin_document_new'); + } + + $user = $this->requireUser(); + $uploadDir = trim((string) $params->get('mto.vector.data.upload.path')); try { - $file->move($uploadDir, $newFilename); - } catch (FileException) { - throw new \RuntimeException('File upload failed.'); + $this->ensureDir($uploadDir); + $filePath = $this->moveUploadedFile($file, $uploadDir, $formatText); + + $document = $documentService->createDocument($title, $filePath, $user); + $version = $document->getCurrentVersion(); + + if (!$version instanceof DocumentVersion) { + throw new \RuntimeException('Dokument erstellt, aber keine aktuelle Version vorhanden.'); + } + + $job = $jobService->startJob( + IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE, + $user, + $version->getDocument()->getId(), + $version->getId(), + null, + IngestJob::STATUS_QUEUED + ); + + $logFile = $this->prepareJobLogFile((string) $job->getId()); + $job->setLogPath($logFile); + $em->flush(); + + if (!$this->canExec()) { + $jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).'); + $this->addFlash('danger', 'Dokument erstellt, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).'); + + return $this->redirectToRoute('admin_documents'); + } + + $this->startIngestJob((string) $job->getId(), $logFile); + + return $this->redirectToRoute('admin_job_show', [ + 'id' => (string) $job->getId(), + ]); + } catch (\Throwable $e) { + $this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Dokument konnte nicht erstellt werden.')); + + return $this->redirectToRoute('admin_document_new'); } - - $filePath = $uploadDir . '/' . $newFilename; - - $document = $documentService->createDocument( - $title, - $filePath, - $this->getUser() - ); - - $version = $document->getCurrentVersion(); - if (!$version instanceof DocumentVersion) { - $this->addFlash('danger', 'Dokument erstellt, aber es wurde keine aktuelle Version erzeugt.'); - return $this->redirectToRoute('admin_documents'); - } - - $job = $jobService->startJob( - IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE, - $this->getUser(), - $version->getDocument()->getId(), - $version->getId(), - null, - IngestJob::STATUS_QUEUED - ); - - if (!$this->canExec()) { - $jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).'); - $this->addFlash('danger', 'Dokument erstellt, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).'); - return $this->redirectToRoute('admin_documents'); - } - - $this->startIngestJob((string)$job->getId()); - - return $this->redirectToRoute('admin_job_show', [ - 'id' => (string)$job->getId(), - ]); } - #[Route('/{id}/version/new', name: 'admin_document_version_new', requirements: ['id' => '[0-9a-fA-F\-]{36}'])] + #[Route('/{id}/version/new', name: 'admin_document_version_new', requirements: ['id' => '[0-9a-fA-F\-]{36}'], methods: ['GET', 'POST'])] public function newVersion( - string $id, - Request $request, + string $id, + Request $request, EntityManagerInterface $em, - DocumentService $documentService, - ParameterBagInterface $params + DocumentService $documentService, + ParameterBagInterface $params, + FormatText $formatText, ): Response { - $document = $em->getRepository(Document::class)->find($id); - - if (!$document) { - throw $this->createNotFoundException(); - } + $document = $this->findDocument($id, $em); if (!$request->isMethod('POST')) { return $this->render('admin/document/new_version.html.twig', [ @@ -162,31 +158,33 @@ class DocumentController extends AbstractController ]); } - /** @var UploadedFile|null $file */ - $file = $request->files->get('file'); - if (!$file instanceof UploadedFile) { - $this->addFlash('error', 'Datei ist erforderlich.'); + if (!$this->isCsrfTokenValid('create_document_version_' . $id, (string) $request->request->get('_token'))) { + $this->addFlash('danger', 'Ungültiges CSRF-Token.'); + return $this->redirectToRoute('admin_document_version_new', ['id' => $id]); } - $uploadDir = (string)$params->get('mto.vector.data.upload.path'); - $this->ensureDir($uploadDir); + /** @var UploadedFile|null $file */ + $file = $request->files->get('file'); + if (!$file instanceof UploadedFile) { + $this->addFlash('danger', 'Datei ist erforderlich.'); - $newFilename = uniqid('', true) . '_' . $file->getClientOriginalName(); - - try { - $file->move($uploadDir, $newFilename); - } catch (FileException) { - throw new \RuntimeException('File upload failed.'); + return $this->redirectToRoute('admin_document_version_new', ['id' => $id]); } - $filePath = $uploadDir . '/' . $newFilename; + try { + $user = $this->requireUser(); + $uploadDir = trim((string) $params->get('mto.vector.data.upload.path')); + $this->ensureDir($uploadDir); + $filePath = $this->moveUploadedFile($file, $uploadDir, $formatText); - $documentService->addVersion( - $document, - $filePath, - $this->getUser() - ); + $documentService->addVersion($document, $filePath, $user); + $this->addFlash('success', 'Neue Dokumentversion wurde hochgeladen.'); + } catch (\Throwable $e) { + $this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Neue Dokumentversion konnte nicht erstellt werden.')); + + return $this->redirectToRoute('admin_document_version_new', ['id' => $id]); + } return $this->redirectToRoute('admin_document_show', ['id' => $id]); } @@ -198,54 +196,55 @@ class DocumentController extends AbstractController methods: ['POST'] )] public function activateVersion( - string $versionId, - Request $request, + string $versionId, + Request $request, EntityManagerInterface $em, - DocumentService $documentService, - IngestJobService $jobService, + DocumentService $documentService, + IngestJobService $jobService, ): RedirectResponse { - if (!$this->isCsrfTokenValid('activate_version_' . $versionId, (string)$request->request->get('_token'))) { + if (!$this->isCsrfTokenValid('activate_version_' . $versionId, (string) $request->request->get('_token'))) { throw $this->createAccessDeniedException(); } - $version = $em->getRepository(DocumentVersion::class)->find($versionId); - if (!$version) { - throw $this->createNotFoundException(); - } + $version = $this->findDocumentVersion($versionId, $em); try { $documentService->activateVersion($version); $job = $jobService->startJob( IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE, - $this->getUser(), + $this->requireUser(), $version->getDocument()->getId(), $version->getId(), null, IngestJob::STATUS_QUEUED ); + $logFile = $this->prepareJobLogFile((string) $job->getId()); + $job->setLogPath($logFile); + $em->flush(); + if (!$this->canExec()) { $jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).'); $this->addFlash('danger', 'Aktivierung ok, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).'); + return $this->redirectToRoute('admin_document_show', [ - 'id' => $version->getDocument()->getId(), + 'id' => (string) $version->getDocument()->getId(), ]); } - $this->startIngestJob((string)$job->getId()); - + $this->startIngestJob((string) $job->getId(), $logFile); $this->addFlash('success', 'Version aktiviert. Ingest-Job wurde erstellt und gestartet.'); return $this->redirectToRoute('admin_job_show', [ - 'id' => (string)$job->getId(), + 'id' => (string) $job->getId(), ]); } catch (\Throwable $e) { - $this->addFlash('danger', 'Aktivierung/Re-Ingest fehlgeschlagen: ' . $e->getMessage()); + $this->addFlash('danger', 'Aktivierung/Re-Ingest fehlgeschlagen: ' . $this->buildSafeErrorMessage($e, 'Unbekannter Fehler.')); } return $this->redirectToRoute('admin_document_show', [ - 'id' => $version->getDocument()->getId(), + 'id' => (string) $version->getDocument()->getId(), ]); } @@ -256,115 +255,135 @@ class DocumentController extends AbstractController methods: ['POST'] )] public function ingestVersion( - string $versionId, - Request $request, + string $versionId, + Request $request, EntityManagerInterface $em, - IngestJobService $jobService, - ): ?RedirectResponse { - if (!$this->isCsrfTokenValid('ingest_version_' . $versionId, (string)$request->request->get('_token'))) { + IngestJobService $jobService, + ): RedirectResponse { + if (!$this->isCsrfTokenValid('ingest_version_' . $versionId, (string) $request->request->get('_token'))) { throw $this->createAccessDeniedException(); } - $version = $em->getRepository(DocumentVersion::class)->find($versionId); - if (!$version) { - throw $this->createNotFoundException(); - } + $version = $this->findDocumentVersion($versionId, $em); /** @var IngestJob|null $existing */ $existing = $em->getRepository(IngestJob::class) ->findOneBy( ['documentVersionId' => $version->getId()], - ['startedAt' => 'DESC'] + ['startedAt' => 'DESC', 'id' => 'DESC'] ); - if ($existing && $existing->getStartedAt() > new \DateTimeImmutable('-3 seconds')) { - return null; + if ( + $existing instanceof IngestJob + && $existing->getStartedAt() > new \DateTimeImmutable('-' . self::INGEST_DUPLICATE_WINDOW_SECONDS . ' seconds') + && in_array($existing->getStatus(), [IngestJob::STATUS_QUEUED, IngestJob::STATUS_RUNNING], true) + ) { + $this->addFlash('info', 'Für diese Version läuft bereits ein aktueller Ingest-Job.'); + + return $this->redirectToRoute('admin_job_show', [ + 'id' => (string) $existing->getId(), + ]); } $job = $jobService->startJob( IngestJob::TYPE_DOCUMENT, - $this->getUser(), + $this->requireUser(), $version->getDocument()->getId(), $version->getId(), null, IngestJob::STATUS_QUEUED ); + $logFile = $this->prepareJobLogFile((string) $job->getId()); + $job->setLogPath($logFile); + $em->flush(); + if (!$this->canExec()) { $jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).'); - $this->addFlash('error', 'Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).'); + $this->addFlash('danger', 'Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).'); + return $this->redirectToRoute('admin_document_show', [ - 'id' => $version->getDocument()->getId(), + 'id' => (string) $version->getDocument()->getId(), ]); } - $this->startIngestJob((string)$job->getId()); + try { + $this->startIngestJob((string) $job->getId(), $logFile); + } catch (\Throwable $e) { + $jobService->markFailed($job, 'Ingest async start failed: ' . $e->getMessage()); + $this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Ingest konnte nicht gestartet werden.')); + + return $this->redirectToRoute('admin_document_show', [ + 'id' => (string) $version->getDocument()->getId(), + ]); + } return $this->redirectToRoute('admin_job_show', [ - 'id' => (string)$job->getId(), + 'id' => (string) $job->getId(), ]); } - #[Route( - '/reset', - name: 'admin_document_reset', - methods: ['POST'] - )] - public function resetCompleteSystem(ParameterBagInterface $params, Connection $connection): ?RedirectResponse - { - if (!$this->canExec()) { - $this->addFlash('danger', 'Der Reset konnte nicht gestartet werden (exec deaktiviert).'); + #[Route('/reset', name: 'admin_document_reset', methods: ['POST'])] + public function resetCompleteSystem( + Request $request, + ParameterBagInterface $params, + Connection $connection, + ): RedirectResponse { + $this->denyAccessUnlessGranted('ROLE_SUPER_ADMIN'); + + if (!$this->isCsrfTokenValid('system_reset', (string) $request->request->get('_token'))) { + $this->addFlash('danger', 'Ungültiges CSRF-Token.'); + return $this->redirectToRoute('admin_dashboard'); } - @unlink((string)$params->get('mto.knowledge.ndjson')); - @unlink((string)$params->get('mto.knowledge.vector_index')); - @unlink((string)$params->get('mto.knowledge.vector_index_meta')); - @unlink((string)$params->get('mto.knowledge.index_meta')); - @unlink((string)$params->get('mto.runtime.meta')); + if (!$this->canExec()) { + $this->addFlash('danger', 'Der Reset konnte nicht gestartet werden (exec deaktiviert).'); - @unlink((string)$params->get('mto.knowledge.tags_ndjson')); - @unlink((string)$params->get('mto.knowledge.vector_tags_index')); - @unlink((string)$params->get('mto.knowledge.vector_tags_index_meta')); + return $this->redirectToRoute('admin_dashboard'); + } - $uploadDir = (string)$params->get('mto.knowledge.upload'); + foreach ([ + 'mto.knowledge.ndjson', + 'mto.knowledge.vector_index', + 'mto.knowledge.vector_index_meta', + 'mto.knowledge.index_meta', + 'mto.runtime.meta', + 'mto.knowledge.tags_ndjson', + 'mto.knowledge.vector_tags_index', + 'mto.knowledge.vector_tags_index_meta', + ] as $parameterName) { + $path = trim((string) $params->get($parameterName)); + if ($path !== '' && is_file($path)) { + @unlink($path); + } + } + + $uploadDir = trim((string) $params->get('mto.knowledge.upload')); if ($uploadDir !== '' && is_dir($uploadDir)) { exec('rm -rf ' . escapeshellarg($uploadDir)); } - $lockDir = (string)$params->get('mto.locks.dir'); + $lockDir = trim((string) $params->get('mto.locks.dir')); if ($lockDir !== '' && is_dir($lockDir)) { exec('rm -rf ' . escapeshellarg($lockDir)); } - $sql = ' - SET FOREIGN_KEY_CHECKS = 0; - TRUNCATE TABLE db.document; - SET FOREIGN_KEY_CHECKS = 1; - - SET FOREIGN_KEY_CHECKS = 0; - TRUNCATE TABLE db.document_version; - SET FOREIGN_KEY_CHECKS = 1; - - SET FOREIGN_KEY_CHECKS = 0; - TRUNCATE TABLE db.ingest_job; - SET FOREIGN_KEY_CHECKS = 1; - - SET FOREIGN_KEY_CHECKS = 0; - TRUNCATE TABLE db.knowledge_tag; - SET FOREIGN_KEY_CHECKS = 1; - - SET FOREIGN_KEY_CHECKS = 0; - TRUNCATE TABLE db.tag_rebuild_job; - SET FOREIGN_KEY_CHECKS = 1; - - SET FOREIGN_KEY_CHECKS = 0; - TRUNCATE TABLE db.document_tag; - SET FOREIGN_KEY_CHECKS = 1; - '; - $connection->executeQuery($sql); + $sql = <<<'SQL' +SET FOREIGN_KEY_CHECKS = 0; +TRUNCATE TABLE db.document_tag; +TRUNCATE TABLE db.tag_rebuild_job; +TRUNCATE TABLE db.knowledge_tag; +TRUNCATE TABLE db.ingest_job; +TRUNCATE TABLE db.document_version; +TRUNCATE TABLE db.document; +SET FOREIGN_KEY_CHECKS = 1; +SQL; + + $connection->executeStatement($sql); $this->addFlash('success', 'Das System wurde erfolgreich zurückgesetzt.'); + return $this->redirectToRoute('admin_dashboard'); } @@ -375,62 +394,63 @@ class DocumentController extends AbstractController methods: ['POST'] )] public function deleteDocument( - string $id, - Request $request, + string $id, + Request $request, EntityManagerInterface $em, - IngestJobService $jobService, - LockService $lockService, + IngestJobService $jobService, + LockService $lockService, ): RedirectResponse { - if (!$this->isCsrfTokenValid('delete_document_' . $id, (string)$request->request->get('_token'))) { + $this->denyAccessUnlessGranted('ROLE_SUPER_ADMIN'); + + if (!$this->isCsrfTokenValid('delete_document_' . $id, (string) $request->request->get('_token'))) { throw $this->createAccessDeniedException(); } - try { - $uuid = Uuid::fromString($id); - } catch (\Exception) { - throw $this->createNotFoundException(); - } - - /** @var Document|null $document */ - $document = $em->getRepository(Document::class)->find($uuid); - if (!$document) { - throw $this->createNotFoundException(); - } + $document = $this->findDocument($id, $em); if (!$lockService->acquire()) { $this->addFlash('danger', 'Ein Ingest-Job läuft bereits. Löschen derzeit nicht möglich.'); + return $this->redirectToRoute('admin_documents'); } $lockService->release(); $job = $jobService->startJob( IngestJob::TYPE_DOCUMENT_DELETE, - $this->getUser(), + $this->requireUser(), $document->getId(), null, null, IngestJob::STATUS_QUEUED ); + $logFile = $this->prepareJobLogFile((string) $job->getId()); + $job->setLogPath($logFile); + $em->flush(); + if (!$this->canExec()) { $jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).'); $this->addFlash('danger', 'Löschen konnte nicht gestartet werden (exec deaktiviert).'); + return $this->redirectToRoute('admin_documents'); } - $this->startIngestJob((string)$job->getId()); + try { + $this->startIngestJob((string) $job->getId(), $logFile); + } catch (\Throwable $e) { + $jobService->markFailed($job, 'Delete async start failed: ' . $e->getMessage()); + $this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Löschvorgang konnte nicht gestartet werden.')); + + return $this->redirectToRoute('admin_documents'); + } $this->addFlash('success', 'Löschvorgang gestartet. Dokument wird nach Index-Rebuild entfernt.'); return $this->redirectToRoute('admin_job_show', [ - 'id' => (string)$job->getId(), + 'id' => (string) $job->getId(), ]); } - // ========================================================= - // Helpers - // ========================================================= - private function canExec(): bool { if (!function_exists('exec')) { @@ -443,6 +463,7 @@ class DocumentController extends AbstractController } $list = array_map('trim', explode(',', $disabled)); + return !in_array('exec', $list, true); } @@ -452,34 +473,209 @@ class DocumentController extends AbstractController throw new \RuntimeException('Upload directory not configured.'); } - if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) { + if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) { throw new \RuntimeException('Unable to create upload directory.'); } } - private function startIngestJob(string $jobId): void + private function moveUploadedFile(UploadedFile $file, string $uploadDir, FormatText $formatText): string { - $projectDir = (string)$this->getParameter('kernel.project_dir'); + $originalName = trim((string) $file->getClientOriginalName()); + $baseName = pathinfo($originalName !== '' ? $originalName : 'document', PATHINFO_FILENAME); + $extension = strtolower((string) $file->getClientOriginalExtension()); + + $safeBaseName = $formatText->slugify($baseName !== '' ? $baseName : 'document'); + if ($safeBaseName === '') { + $safeBaseName = 'document'; + } + + $newFilename = uniqid('', true) . '_' . $safeBaseName; + if ($extension !== '') { + $newFilename .= '.' . $extension; + } + + try { + $file->move($uploadDir, $newFilename); + } catch (FileException) { + throw new \RuntimeException('File upload failed.'); + } + + return rtrim($uploadDir, '/') . '/' . $newFilename; + } + + private function resolveDocumentTitle(Request $request, UploadedFile $file, FormatText $formatText): string + { + $rawTitle = trim((string) $request->request->get('title', '')); + if ($rawTitle !== '') { + return $rawTitle; + } + + $originalName = trim((string) $file->getClientOriginalName()); + $baseName = pathinfo($originalName, PATHINFO_FILENAME); + + return trim((string) $formatText->slugify($baseName !== '' ? $baseName : $originalName)); + } + + private function startIngestJob(string $jobId, string $logFile): void + { + $projectDir = $this->resolveProjectDir(); $console = $projectDir . '/bin/console'; - $logDir = $projectDir . '/var/log/ingest'; - if (!is_dir($logDir)) { - @mkdir($logDir, 0777, true); + if (!is_file($console)) { + throw new \RuntimeException('bin/console not found: ' . $console); } - $logFile = $logDir . '/job_' . $jobId . '.log'; - // Wichtig: CLI-PHP verwenden, nicht PHP_BINARY aus FPM - $php = 'php'; + $php = $this->resolvePhpBinary(); $cmd = sprintf( - '%s %s --no-interaction %s %s >> %s 2>&1 &', - escapeshellcmd($php), + 'cd %s && nohup %s %s %s %s --no-interaction >> %s 2>&1 & echo $!', + escapeshellarg($projectDir), + escapeshellarg($php), escapeshellarg($console), escapeshellarg('mto:agent:ingest:run'), escapeshellarg($jobId), escapeshellarg($logFile), ); - exec($cmd); + $output = []; + $exitCode = 0; + @exec($cmd, $output, $exitCode); + + if ($exitCode !== 0) { + throw new \RuntimeException('Background ingest bootstrap failed with exit code ' . $exitCode . '.'); + } + } + + private function prepareJobLogFile(string $jobId): string + { + $projectDir = $this->resolveProjectDir(); + $logDir = $projectDir . '/var/log/ingest'; + $this->ensureDir($logDir); + + return $logDir . '/job_' . $jobId . '.log'; + } + + private function resolveProjectDir(): string + { + $projectDir = trim((string) $this->getParameter('kernel.project_dir')); + + if ($projectDir === '' || !is_dir($projectDir)) { + throw new \RuntimeException('Project directory is invalid.'); + } + + return rtrim($projectDir, '/'); + } + + private function resolvePhpBinary(): string + { + $envCandidates = [ + trim((string) ($_SERVER['PHP_CLI_BINARY'] ?? '')), + trim((string) ($_ENV['PHP_CLI_BINARY'] ?? '')), + trim((string) getenv('PHP_CLI_BINARY')), + ]; + + foreach ($envCandidates as $candidate) { + if ($this->isValidCliPhpBinary($candidate)) { + return $candidate; + } + } + + $phpBinary = defined('PHP_BINARY') ? trim((string) PHP_BINARY) : ''; + if ($this->isValidCliPhpBinary($phpBinary)) { + return $phpBinary; + } + + $fallbackCandidates = [ + '/usr/bin/php', + '/usr/local/bin/php', + '/bin/php', + '/opt/homebrew/bin/php', + ]; + + foreach ($fallbackCandidates as $candidate) { + if ($this->isValidCliPhpBinary($candidate)) { + return $candidate; + } + } + + $whichPhp = trim((string) @shell_exec('command -v php 2>/dev/null')); + if ($this->isValidCliPhpBinary($whichPhp)) { + return $whichPhp; + } + + throw new \RuntimeException( + 'Could not resolve a CLI PHP binary. Set PHP_CLI_BINARY explicitly, e.g. /usr/bin/php.' + ); + } + + private function isValidCliPhpBinary(string $path): bool + { + $path = trim($path); + + if ($path === '' || !is_file($path) || !is_executable($path)) { + return false; + } + + $basename = strtolower(basename($path)); + + if (str_contains($basename, 'fpm') || str_contains($basename, 'cgi')) { + return false; + } + + return true; + } + + private function findDocument(string $id, EntityManagerInterface $em): Document + { + try { + $uuid = Uuid::fromString(trim($id)); + } catch (\Throwable) { + throw new NotFoundHttpException(); + } + + /** @var Document|null $document */ + $document = $em->getRepository(Document::class)->find($uuid); + + if (!$document instanceof Document) { + throw new NotFoundHttpException(); + } + + return $document; + } + + private function findDocumentVersion(string $versionId, EntityManagerInterface $em): DocumentVersion + { + try { + $uuid = Uuid::fromString(trim($versionId)); + } catch (\Throwable) { + throw new NotFoundHttpException(); + } + + /** @var DocumentVersion|null $version */ + $version = $em->getRepository(DocumentVersion::class)->find($uuid); + + if (!$version instanceof DocumentVersion) { + throw new NotFoundHttpException(); + } + + return $version; + } + + private function requireUser(): User + { + $user = $this->getUser(); + + if (!$user instanceof User) { + throw new \RuntimeException('No authenticated user available.'); + } + + return $user; + } + + private function buildSafeErrorMessage(\Throwable $e, string $fallback): string + { + $message = trim($e->getMessage()); + + return $message !== '' ? $message : $fallback; } } \ No newline at end of file diff --git a/src/Controller/Admin/DocumentTagController.php b/src/Controller/Admin/DocumentTagController.php index c50d3bb..bc5710a 100644 --- a/src/Controller/Admin/DocumentTagController.php +++ b/src/Controller/Admin/DocumentTagController.php @@ -19,44 +19,97 @@ final class DocumentTagController extends AbstractController #[Route('/{id}/tags', name: 'admin_document_tags_edit', methods: ['GET'])] public function edit(string $id, DocumentTagAdminService $svc): Response { - $data = $svc->getEditData($id); + $id = trim($id); + + try { + $data = $svc->getEditData($id); + } catch (\Throwable $e) { + $this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Dokument-Tags konnten nicht geladen werden.')); + + return $this->redirectToRoute('admin_documents'); + } return $this->render('admin/document_tags/edit.html.twig', [ - 'document' => $data['document'], - 'allTags' => $data['allTags'], - 'latestJob' => $data['latestJob'], - - 'statusRunning' => TagRebuildJob::STATUS_RUNNING, - 'statusQueued' => TagRebuildJob::STATUS_QUEUED, - 'statusCompleted' => TagRebuildJob::STATUS_COMPLETED, - 'statusFailed' => TagRebuildJob::STATUS_FAILED, + ...$data, + ...$this->buildJobStatusViewData(), ]); } #[Route('/{id}/tags/save', name: 'admin_document_tags_save', methods: ['POST'])] public function save(string $id, Request $request, DocumentTagAdminService $svc): RedirectResponse { - $selected = $request->request->all('tag_ids') ?? []; + $id = trim($id); + + if (!$this->isCsrfTokenValid('admin_document_tags_save_' . $id, (string) $request->request->get('_token'))) { + $this->addFlash('danger', 'Ungültiges CSRF-Token.'); + + return $this->redirectToRoute('admin_document_tags_edit', ['id' => $id]); + } try { - $svc->saveTags($id, $selected); + $svc->saveTags($id, $this->normalizeStringList($request->request->all('tag_ids'))); $this->addFlash('success', 'Tags wurden aktualisiert. Rebuild läuft im Hintergrund.'); } catch (\Throwable $e) { - $this->addFlash('danger', $e->getMessage()); + $this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Tags konnten nicht aktualisiert werden.')); } return $this->redirectToRoute('admin_document_tags_edit', ['id' => $id]); } - /** - * Wichtig: Ohne extra "admin/" im Pfad, weil Prefix schon /admin/documents ist. - * Ergebnis: /admin/documents/tags/status - */ #[Route('/tags/status', name: 'admin_tags_status', methods: ['GET'])] public function status(DocumentTagAdminService $svc): JsonResponse { + $status = $svc->getLatestRebuildStatus(); + return $this->json([ - 'status' => $svc->getLatestRebuildStatus(), + 'status' => $status, + 'hasActiveJob' => $status === TagRebuildJob::STATUS_RUNNING + || $status === TagRebuildJob::STATUS_QUEUED, ]); } + + /** + * @param mixed $values + * @return list + */ + private function normalizeStringList(mixed $values): array + { + if (!is_array($values)) { + return []; + } + + $normalized = []; + + foreach ($values as $value) { + $value = trim((string) $value); + + if ($value === '') { + continue; + } + + $normalized[] = $value; + } + + return array_values(array_unique($normalized)); + } + + /** + * @return array + */ + private function buildJobStatusViewData(): array + { + return [ + 'statusRunning' => TagRebuildJob::STATUS_RUNNING, + 'statusQueued' => TagRebuildJob::STATUS_QUEUED, + 'statusCompleted' => TagRebuildJob::STATUS_COMPLETED, + 'statusFailed' => TagRebuildJob::STATUS_FAILED, + ]; + } + + private function buildSafeErrorMessage(\Throwable $e, string $fallback): string + { + $message = trim($e->getMessage()); + + return $message !== '' ? $message : $fallback; + } } \ No newline at end of file diff --git a/src/Controller/Admin/IngestJobController.php b/src/Controller/Admin/IngestJobController.php index f17b477..256268f 100644 --- a/src/Controller/Admin/IngestJobController.php +++ b/src/Controller/Admin/IngestJobController.php @@ -1,46 +1,44 @@ getRepository(IngestJob::class) - ->findBy([], ['startedAt' => 'DESC']); + ->findBy([], ['startedAt' => 'DESC', 'id' => 'DESC']); return $this->render('admin/job/index.html.twig', [ - 'jobs' => $jobs + 'jobs' => $jobs, ]); } #[Route( '/{id}', name: 'admin_job_show', - requirements: ['id' => '[0-9a-fA-F\-]{36}'] + requirements: ['id' => '[0-9a-fA-F\-]{36}'], + methods: ['GET'] )] public function show(string $id, EntityManagerInterface $em): Response { - $job = $em->getRepository(IngestJob::class)->find($id); - - if (!$job) { - throw new NotFoundHttpException(); - } - return $this->render('admin/job/show.html.twig', [ - 'job' => $job + 'job' => $this->findJob($id, $em), ]); } @@ -54,12 +52,7 @@ class IngestJobController extends AbstractController { $this->denyAccessUnlessGranted('ROLE_USER'); - /** @var IngestJob|null $job */ - $job = $em->getRepository(IngestJob::class)->find($id); - - if (!$job) { - throw new NotFoundHttpException(); - } + $job = $this->findJob($id, $em); return $this->json([ 'id' => (string) $job->getId(), @@ -68,58 +61,185 @@ class IngestJobController extends AbstractController 'startedAt' => $job->getStartedAt()->format(DATE_ATOM), 'finishedAt' => $job->getFinishedAt()?->format(DATE_ATOM), 'errorMessage' => $job->getErrorMessage(), + 'logPath' => $job->getLogPath(), ]); } #[Route('/global-reindex', name: 'admin_global_reindex', methods: ['POST'])] public function globalReindex( + Request $request, IngestJobService $jobService, + EntityManagerInterface $em, ): RedirectResponse { - $this->denyAccessUnlessGranted('ROLE_SUPER_ADMIN'); - // --------------------------------------------------------- - // 1) Job anlegen (QUEUED) - // --------------------------------------------------------- - $job = $jobService->startJob( - IngestJob::TYPE_GLOBAL_REINDEX, - $this->getUser(), - null, - null, - null, - IngestJob::STATUS_QUEUED - ); + if (!$this->isCsrfTokenValid('global_reindex', (string) $request->request->get('_token'))) { + $this->addFlash('danger', 'Ungültiges CSRF-Token.'); - // --------------------------------------------------------- - // 2) CLI im Hintergrund starten - // --------------------------------------------------------- - $projectDir = (string)$this->getParameter('kernel.project_dir'); - $console = $projectDir . '/bin/console'; - - $logDir = $projectDir . '/var/log/ingest'; - if (!is_dir($logDir)) { - @mkdir($logDir, 0777, true); + return $this->redirectToRoute('admin_jobs'); } - $logFile = $logDir . '/job_' . (string)$job->getId() . '.log'; - $php = 'php'; + try { + $projectDir = $this->resolveProjectDir(); + $console = $projectDir . '/bin/console'; - $cmd = sprintf( - '%s %s --no-interaction %s %s >> %s 2>&1 &', - escapeshellcmd($php), - escapeshellarg($console), - escapeshellarg('mto:agent:ingest:run'), - escapeshellarg((string)$job->getId()), - escapeshellarg($logFile), - ); + if (!is_file($console)) { + throw new \RuntimeException('bin/console not found: ' . $console); + } - exec($cmd); + $logDir = $projectDir . '/var/log/ingest'; + $this->ensureDirectoryExists($logDir); - // --------------------------------------------------------- - // 3) Redirect auf Job-Detailseite (Loader) - // --------------------------------------------------------- - return $this->redirectToRoute('admin_job_show', [ - 'id' => (string)$job->getId(), - ]); + $job = $jobService->startJob( + IngestJob::TYPE_GLOBAL_REINDEX, + $this->getUser(), + null, + null, + null, + IngestJob::STATUS_QUEUED + ); + + $logFile = $logDir . '/job_' . (string) $job->getId() . '.log'; + $job->setLogPath($logFile); + $em->flush(); + + $phpBinary = $this->resolvePhpBinary(); + $cmd = sprintf( + 'cd %s && nohup %s %s %s %s --no-interaction >> %s 2>&1 & echo $!', + escapeshellarg($projectDir), + escapeshellarg($phpBinary), + escapeshellarg($console), + escapeshellarg('mto:agent:ingest:run'), + escapeshellarg((string) $job->getId()), + escapeshellarg($logFile), + ); + + $output = []; + $exitCode = 0; + @exec($cmd, $output, $exitCode); + + if ($exitCode !== 0) { + $job->markFailed('Global reindex async bootstrap failed with exit code ' . $exitCode . '.'); + $em->flush(); + + $this->addFlash('danger', 'Global Reindex konnte nicht im Hintergrund gestartet werden.'); + + return $this->redirectToRoute('admin_job_show', [ + 'id' => (string) $job->getId(), + ]); + } + + $this->addFlash('success', 'Global Reindex wurde gestartet.'); + + return $this->redirectToRoute('admin_job_show', [ + 'id' => (string) $job->getId(), + ]); + } catch (\Throwable $e) { + $this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Global Reindex konnte nicht gestartet werden.')); + + return $this->redirectToRoute('admin_jobs'); + } } -} + + private function findJob(string $id, EntityManagerInterface $em): IngestJob + { + $id = trim($id); + + /** @var IngestJob|null $job */ + $job = $em->getRepository(IngestJob::class)->find($id); + + if (!$job instanceof IngestJob) { + throw new NotFoundHttpException(); + } + + return $job; + } + + private function resolveProjectDir(): string + { + $projectDir = trim((string) $this->getParameter('kernel.project_dir')); + + if ($projectDir === '' || !is_dir($projectDir)) { + throw new \RuntimeException('Project directory is invalid.'); + } + + return rtrim($projectDir, '/'); + } + + private function resolvePhpBinary(): string + { + $envCandidates = [ + trim((string) ($_SERVER['PHP_CLI_BINARY'] ?? '')), + trim((string) ($_ENV['PHP_CLI_BINARY'] ?? '')), + trim((string) getenv('PHP_CLI_BINARY')), + ]; + + foreach ($envCandidates as $candidate) { + if ($this->isValidCliPhpBinary($candidate)) { + return $candidate; + } + } + + $phpBinary = defined('PHP_BINARY') ? trim((string) PHP_BINARY) : ''; + if ($this->isValidCliPhpBinary($phpBinary)) { + return $phpBinary; + } + + $fallbackCandidates = [ + '/usr/bin/php', + '/usr/local/bin/php', + '/bin/php', + '/opt/homebrew/bin/php', + ]; + + foreach ($fallbackCandidates as $candidate) { + if ($this->isValidCliPhpBinary($candidate)) { + return $candidate; + } + } + + $whichPhp = trim((string) @shell_exec('command -v php 2>/dev/null')); + if ($this->isValidCliPhpBinary($whichPhp)) { + return $whichPhp; + } + + throw new \RuntimeException( + 'Could not resolve a CLI PHP binary. Set PHP_CLI_BINARY explicitly, e.g. /usr/bin/php.' + ); + } + + private function isValidCliPhpBinary(string $path): bool + { + $path = trim($path); + + if ($path === '' || !is_file($path) || !is_executable($path)) { + return false; + } + + $basename = strtolower(basename($path)); + + if (str_contains($basename, 'fpm') || str_contains($basename, 'cgi')) { + return false; + } + + return true; + } + + private function ensureDirectoryExists(string $dir): void + { + if (is_dir($dir)) { + return; + } + + if (!@mkdir($dir, 0775, true) && !is_dir($dir)) { + throw new \RuntimeException('Could not create ingest log directory.'); + } + } + + private function buildSafeErrorMessage(\Throwable $e, string $fallback): string + { + $message = trim($e->getMessage()); + + return $message !== '' ? $message : $fallback; + } +} \ No newline at end of file diff --git a/src/Controller/Admin/TagController.php b/src/Controller/Admin/TagController.php index 4fe9763..317793d 100644 --- a/src/Controller/Admin/TagController.php +++ b/src/Controller/Admin/TagController.php @@ -6,6 +6,7 @@ namespace App\Controller\Admin; use App\Entity\TagRebuildJob; use App\Service\Admin\TagAdminService; +use App\Tag\TagTypes; use Symfony\Bundle\FrameworkBundle\Controller\AbstractController; use Symfony\Component\HttpFoundation\RedirectResponse; use Symfony\Component\HttpFoundation\Request; @@ -18,41 +19,32 @@ final class TagController extends AbstractController #[Route('', name: 'admin_tags_index', methods: ['GET'])] public function index(TagAdminService $svc): Response { - $data = $svc->getIndexData(); - return $this->render('admin/tag/index.html.twig', [ - ...$data, - 'statusRunning' => TagRebuildJob::STATUS_RUNNING, - 'statusQueued' => TagRebuildJob::STATUS_QUEUED, - 'statusCompleted' => TagRebuildJob::STATUS_COMPLETED, - 'statusFailed' => TagRebuildJob::STATUS_FAILED, + ...$svc->getIndexData(), + ...$this->buildJobStatusViewData(), ]); } #[Route('/create', name: 'admin_tags_create', methods: ['POST'])] public function create(Request $request, TagAdminService $svc): RedirectResponse { - if (!$this->isCsrfTokenValid( - 'admin_tag_create', - $request->request->get('_token') - )) { - $this->addFlash('danger', 'Ungültiges CSRF Token.'); + if (!$this->isCsrfTokenValid('admin_tag_create', (string) $request->request->get('_token'))) { + $this->addFlash('danger', 'Ungültiges CSRF-Token.'); + return $this->redirectToRoute('admin_tags_index'); } try { $svc->create( - (string)$request->request->get('slug', ''), - (string)$request->request->get('label', ''), - $request->request->get('description') - ? (string)$request->request->get('description') - : null, - (string)$request->request->get('type', 'generic') // NEU + (string) $request->request->get('slug', ''), + (string) $request->request->get('label', ''), + $this->normalizeNullableString($request->request->get('description')), + TagTypes::normalize((string) $request->request->get('type', TagTypes::GENERIC)) ); $this->addFlash('success', 'Tag wurde erstellt.'); } catch (\Throwable $e) { - $this->addFlash('danger', $e->getMessage()); + $this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Tag konnte nicht erstellt werden.')); } return $this->redirectToRoute('admin_tags_index'); @@ -61,58 +53,110 @@ final class TagController extends AbstractController #[Route('/{id}/delete', name: 'admin_tags_delete', methods: ['POST'])] public function delete(string $id, Request $request, TagAdminService $svc): RedirectResponse { - if (!$this->isCsrfTokenValid( - 'admin_tag_delete_' . $id, - $request->request->get('_token') - )) { - $this->addFlash('danger', 'Ungültiges CSRF Token.'); + if (!$this->isCsrfTokenValid('admin_tag_delete_' . $id, (string) $request->request->get('_token'))) { + $this->addFlash('danger', 'Ungültiges CSRF-Token.'); + return $this->redirectToRoute('admin_tags_index'); } try { - $svc->delete($id); + $svc->delete(trim($id)); $this->addFlash('success', 'Tag wurde gelöscht.'); } catch (\Throwable $e) { - $this->addFlash('danger', $e->getMessage()); + $this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Tag konnte nicht gelöscht werden.')); } return $this->redirectToRoute('admin_tags_index'); } #[Route('/{id}/assign', name: 'admin_tags_assign', methods: ['GET', 'POST'])] - public function assign( - string $id, - Request $request, - TagAdminService $svc - ): Response { + public function assign(string $id, Request $request, TagAdminService $svc): Response + { + $id = trim($id); if ($request->isMethod('POST')) { + if (!$this->isCsrfTokenValid('assign_tag_' . $id, (string) $request->request->get('_token'))) { + $this->addFlash('danger', 'Ungültiges CSRF-Token.'); - if (!$this->isCsrfTokenValid( - 'assign_tag_' . $id, - $request->request->get('_token') - )) { - throw $this->createAccessDeniedException(); + return $this->redirectToRoute('admin_tags_assign', ['id' => $id]); } - $svc->syncAssignments( - $id, - $request->request->all('documents') ?? [] - ); - - $this->addFlash('success', 'Zuweisungen aktualisiert.'); + try { + $svc->syncAssignments($id, $this->normalizeStringList($request->request->all('documents'))); + $this->addFlash('success', 'Zuweisungen aktualisiert.'); + } catch (\Throwable $e) { + $this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Zuweisungen konnten nicht aktualisiert werden.')); + } return $this->redirectToRoute('admin_tags_assign', ['id' => $id]); } - $data = $svc->getAssignData($id); + try { + $data = $svc->getAssignData($id); + } catch (\Throwable $e) { + $this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Tag konnte nicht geladen werden.')); + + return $this->redirectToRoute('admin_tags_index'); + } return $this->render('admin/tag/assign.html.twig', [ ...$data, + ...$this->buildJobStatusViewData(), + ]); + } + + /** + * @param mixed $value + */ + private function normalizeNullableString(mixed $value): ?string + { + $value = trim((string) $value); + + return $value !== '' ? $value : null; + } + + /** + * @param mixed $values + * @return list + */ + private function normalizeStringList(mixed $values): array + { + if (!is_array($values)) { + return []; + } + + $normalized = []; + + foreach ($values as $value) { + $value = trim((string) $value); + + if ($value === '') { + continue; + } + + $normalized[] = $value; + } + + return array_values(array_unique($normalized)); + } + + /** + * @return array + */ + private function buildJobStatusViewData(): array + { + return [ 'statusRunning' => TagRebuildJob::STATUS_RUNNING, 'statusQueued' => TagRebuildJob::STATUS_QUEUED, 'statusCompleted' => TagRebuildJob::STATUS_COMPLETED, 'statusFailed' => TagRebuildJob::STATUS_FAILED, - ]); + ]; + } + + private function buildSafeErrorMessage(\Throwable $e, string $fallback): string + { + $message = trim($e->getMessage()); + + return $message !== '' ? $message : $fallback; } } \ No newline at end of file diff --git a/src/Controller/Admin/TagRebuildStreamController.php b/src/Controller/Admin/TagRebuildStreamController.php index 2d686b7..37b2434 100644 --- a/src/Controller/Admin/TagRebuildStreamController.php +++ b/src/Controller/Admin/TagRebuildStreamController.php @@ -10,38 +10,79 @@ use Symfony\Component\Routing\Attribute\Route; final class TagRebuildStreamController { - #[Route('/admin/tags/rebuild/stream', name: 'admin_tags_rebuild_stream')] + private const POLL_INTERVAL_SECONDS = 2; + private const KEEPALIVE_INTERVAL_SECONDS = 10; + + #[Route('/admin/tags/rebuild/stream', name: 'admin_tags_rebuild_stream', methods: ['GET'])] public function stream(TagRebuildStatusProvider $provider): StreamedResponse { - $response = new StreamedResponse(function () use ($provider) { + $response = new StreamedResponse(function () use ($provider): void { + self::disableOutputBuffering(); - echo "event: ping\n"; - echo "data: " . json_encode(['init' => true]) . "\n\n"; + echo "retry: 3000\n"; + self::sendEvent('ping', ['init' => true]); - @ob_flush(); - @flush(); + $lastPayloadHash = null; + $lastKeepaliveAt = time(); while (!connection_aborted()) { - $data = $provider->getLatestStatus(); if ($data !== null) { - echo "event: message\n"; - echo "data: " . json_encode($data) . "\n\n"; + $payloadHash = md5( + json_encode($data, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES) ?: 'null' + ); - @ob_flush(); - @flush(); + if ($payloadHash !== $lastPayloadHash) { + self::sendEvent('message', $data); + $lastPayloadHash = $payloadHash; + $lastKeepaliveAt = time(); + } } - sleep(2); + if ((time() - $lastKeepaliveAt) >= self::KEEPALIVE_INTERVAL_SECONDS) { + self::sendEvent('ping', [ + 'ts' => (new \DateTimeImmutable())->format(DATE_ATOM), + ]); + $lastKeepaliveAt = time(); + } + + sleep(self::POLL_INTERVAL_SECONDS); } }); $response->headers->set('Content-Type', 'text/event-stream'); - $response->headers->set('Cache-Control', 'no-cache'); + $response->headers->set('Cache-Control', 'no-cache, no-store, must-revalidate'); + $response->headers->set('Pragma', 'no-cache'); + $response->headers->set('Expires', '0'); $response->headers->set('Connection', 'keep-alive'); $response->headers->set('X-Accel-Buffering', 'no'); return $response; } + + private static function disableOutputBuffering(): void + { + while (ob_get_level() > 0) { + @ob_end_flush(); + } + } + + /** + * @param array $data + */ + private static function sendEvent(string $event, array $data): void + { + $json = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES); + + if (!is_string($json)) { + $json = '{"error":"json_encode_failed"}'; + } + + echo 'event: ' . $event . "\n"; + echo 'data: ' . $json . "\n\n"; + + @ob_flush(); + @flush(); + } } \ No newline at end of file diff --git a/src/Entity/DocumentTag.php b/src/Entity/DocumentTag.php index 169b69f..7973d9a 100644 --- a/src/Entity/DocumentTag.php +++ b/src/Entity/DocumentTag.php @@ -8,6 +8,7 @@ use Doctrine\ORM\Mapping as ORM; #[ORM\Entity] #[ORM\Table(name: 'document_tag')] +#[ORM\Index(name: 'idx_document_tag_tag_id', columns: ['tag_id'])] class DocumentTag { #[ORM\Id] @@ -22,8 +23,8 @@ class DocumentTag public function __construct(Document $document, Tag $tag) { - $this->document = $document; - $this->tag = $tag; + $this->setDocument($document); + $this->setTag($tag); } public function getDocument(): Document @@ -35,4 +36,20 @@ class DocumentTag { return $this->tag; } + + public function isSameRelation(Document $document, Tag $tag): bool + { + return $this->document->getId()->equals($document->getId()) + && $this->tag->getId()->equals($tag->getId()); + } + + private function setDocument(Document $document): void + { + $this->document = $document; + } + + private function setTag(Tag $tag): void + { + $this->tag = $tag; + } } \ No newline at end of file diff --git a/src/Entity/Tag.php b/src/Entity/Tag.php index 08b967a..a9625ad 100644 --- a/src/Entity/Tag.php +++ b/src/Entity/Tag.php @@ -1,8 +1,12 @@ id = Uuid::v4(); $this->createdAt = new \DateTimeImmutable(); - $this->slug = $slug; - $this->label = $label; - $this->description = $description; + $this->setSlug($slug); + $this->setLabel($label); + $this->setDescription($description); + $this->setType($type); } public function getId(): Uuid @@ -57,7 +61,14 @@ class Tag public function setSlug(string $slug): static { + $slug = $this->normalizeSlug($slug); + + if ($slug === '') { + throw new InvalidArgumentException('Tag slug must not be empty.'); + } + $this->slug = $slug; + return $this; } @@ -68,7 +79,14 @@ class Tag public function setLabel(string $label): static { + $label = trim($label); + + if ($label === '') { + throw new InvalidArgumentException('Tag label must not be empty.'); + } + $this->label = $label; + return $this; } @@ -79,7 +97,9 @@ class Tag public function setDescription(?string $description): static { - $this->description = $description; + $description = trim((string) $description); + $this->description = $description !== '' ? $description : null; + return $this; } @@ -90,13 +110,43 @@ class Tag public function setType(string $type): static { - $type = trim($type); - $this->type = $type !== '' ? $type : 'generic'; + $normalizedType = TagTypes::normalize($type); + + if (!TagTypes::isValid($normalizedType)) { + throw new InvalidArgumentException(sprintf('Unsupported tag type "%s".', $type)); + } + + $this->type = $normalizedType; + return $this; } + public function isGeneric(): bool + { + return $this->type === TagTypes::GENERIC; + } + + public function isCatalogEntity(): bool + { + return $this->type === TagTypes::CATALOG_ENTITY; + } + + public function isSalesSignal(): bool + { + return $this->type === TagTypes::SALES_SIGNAL; + } + public function getCreatedAt(): \DateTimeImmutable { return $this->createdAt; } + + private function normalizeSlug(string $slug): string + { + $slug = mb_strtolower(trim($slug)); + $slug = preg_replace('/\s+/u', '-', $slug) ?? $slug; + $slug = preg_replace('/-+/u', '-', $slug) ?? $slug; + + return trim($slug, '-'); + } } \ No newline at end of file diff --git a/src/Entity/TagRebuildJob.php b/src/Entity/TagRebuildJob.php index 0cadabf..198c1b3 100644 --- a/src/Entity/TagRebuildJob.php +++ b/src/Entity/TagRebuildJob.php @@ -9,14 +9,16 @@ use Symfony\Component\Uid\Uuid; #[ORM\Entity] #[ORM\Table(name: 'tag_rebuild_job')] -#[ORM\Index(columns: ['status'], name: 'idx_tag_rebuild_job_status')] -#[ORM\Index(columns: ['created_at'], name: 'idx_tag_rebuild_job_created_at')] +#[ORM\Index(name: 'idx_tag_rebuild_job_status', columns: ['status'])] +#[ORM\Index(name: 'idx_tag_rebuild_job_created_at', columns: ['created_at'])] class TagRebuildJob { - public const STATUS_QUEUED = 'QUEUED'; - public const STATUS_RUNNING = 'RUNNING'; + public const STATUS_QUEUED = 'QUEUED'; + public const STATUS_RUNNING = 'RUNNING'; public const STATUS_COMPLETED = 'COMPLETED'; - public const STATUS_FAILED = 'FAILED'; + public const STATUS_FAILED = 'FAILED'; + + private const ERROR_MESSAGE_MAX_LENGTH = 4000; #[ORM\Id] #[ORM\Column(type: 'uuid', unique: true)] @@ -44,6 +46,19 @@ class TagRebuildJob $this->status = self::STATUS_QUEUED; } + /** + * @return list + */ + public static function statuses(): array + { + return [ + self::STATUS_QUEUED, + self::STATUS_RUNNING, + self::STATUS_COMPLETED, + self::STATUS_FAILED, + ]; + } + public function getId(): Uuid { return $this->id; @@ -54,24 +69,59 @@ class TagRebuildJob return $this->status; } + public function isQueued(): bool + { + return $this->status === self::STATUS_QUEUED; + } + + public function isRunning(): bool + { + return $this->status === self::STATUS_RUNNING; + } + + public function isCompleted(): bool + { + return $this->status === self::STATUS_COMPLETED; + } + + public function isFailed(): bool + { + return $this->status === self::STATUS_FAILED; + } + + public function isActive(): bool + { + return $this->isQueued() || $this->isRunning(); + } + public function markRunning(): void { $this->status = self::STATUS_RUNNING; $this->startedAt = new \DateTimeImmutable(); + $this->finishedAt = null; $this->errorMessage = null; } public function markCompleted(): void { + if ($this->startedAt === null) { + $this->startedAt = new \DateTimeImmutable(); + } + $this->status = self::STATUS_COMPLETED; $this->finishedAt = new \DateTimeImmutable(); + $this->errorMessage = null; } public function markFailed(string $message): void { + if ($this->startedAt === null) { + $this->startedAt = new \DateTimeImmutable(); + } + $this->status = self::STATUS_FAILED; $this->finishedAt = new \DateTimeImmutable(); - $this->errorMessage = $message; + $this->errorMessage = $this->normalizeErrorMessage($message); } public function getCreatedAt(): \DateTimeImmutable @@ -93,4 +143,19 @@ class TagRebuildJob { return $this->errorMessage; } + + private function normalizeErrorMessage(string $message): ?string + { + $message = trim($message); + + if ($message === '') { + return 'Unknown tag rebuild failure.'; + } + + if (mb_strlen($message) > self::ERROR_MESSAGE_MAX_LENGTH) { + $message = mb_substr($message, 0, self::ERROR_MESSAGE_MAX_LENGTH); + } + + return $message; + } } \ No newline at end of file diff --git a/src/Intent/CatalogIntentLite.php b/src/Intent/CatalogIntentLite.php index bd5e73f..d8c5760 100644 --- a/src/Intent/CatalogIntentLite.php +++ b/src/Intent/CatalogIntentLite.php @@ -6,82 +6,132 @@ namespace App\Intent; use App\Config\CatalogIntentConfig; use App\Knowledge\Retrieval\QueryCleaner; -use App\Tag\TagVectorSearchClient; use App\Tag\TagTypes; +use App\Tag\TagVectorSearchClient; /** - * CatalogIntentLite + * Lightweight catalog entity detector. * - * Reiner Entity-Detector. - * - * Verantwortlich nur für: - * - Vector-Tag-Erkennung - * - Score-Gate - * - Ambiguity-Check - * - Sicherstellen, dass TagType = catalog_entity - * - * KEIN: - * - Listen-Signal - * - SalesIntent - * - Routing + * Responsibilities: + * - clean the user query for tag lookup + * - query the tag vector index + * - keep only catalog_entity hits + * - apply confidence and ambiguity gates + * - return one canonical entity label or null */ final readonly class CatalogIntentLite { + /** + * Slightly wider than the old top-3 search so generic tags do not crowd out + * relevant catalog_entity hits too easily. + */ + private const SEARCH_LIMIT = 6; public function __construct( private TagVectorSearchClient $tagVectorClient, - private QueryCleaner $queryCleaner - ) {} + private QueryCleaner $queryCleaner, + ) { + } /** - * Gibt das canonical Label der erkannten catalog_entity zurück - * oder null, wenn kein sauberer Treffer. + * Returns the canonical normalized label of the detected catalog entity, + * or null when no safe entity match exists. */ public function detect(string $prompt): ?string { $prompt = trim($prompt); + if ($prompt === '') { return null; } - $promptTag = $this->queryCleaner->clean($prompt); + $cleanQuery = trim($this->queryCleaner->clean($prompt)); - // 1) Tag-Vector-Suche - $hits = $this->tagVectorClient->search($promptTag, 3); - - if ($hits === []) { + if ($cleanQuery === '') { return null; } - $best = $hits[0]; - $bestScore = (float)($best['score'] ?? 0.0); + $catalogHits = $this->filterCatalogEntityHits( + $this->tagVectorClient->search($cleanQuery, self::SEARCH_LIMIT) + ); + + if ($catalogHits === []) { + return null; + } + + $best = $catalogHits[0]; + $bestScore = (float) ($best['score'] ?? 0.0); - // 2) Score-Tags if ($bestScore < CatalogIntentConfig::MIN_SCORE) { return null; } - // 3) Ambiguity-Check - if (isset($hits[1])) { - $secondScore = (float)($hits[1]['score'] ?? 0.0); + if (isset($catalogHits[1])) { + $secondScore = (float) ($catalogHits[1]['score'] ?? 0.0); if (abs($bestScore - $secondScore) < CatalogIntentConfig::AMBIGUITY_DELTA) { return null; } } - // 4) Nur catalog_entity zulassen - if (($best['tag_type'] ?? null) !== TagTypes::CATALOG_ENTITY) { - return null; + $label = $this->normalizeLabel((string) ($best['label'] ?? '')); + + return $label !== '' ? $label : null; + } + + /** + * @param array $hits + * + * @return list + */ + private function filterCatalogEntityHits(array $hits): array + { + $filtered = []; + + foreach ($hits as $hit) { + $tagId = trim((string) ($hit['tag_id'] ?? '')); + $score = (float) ($hit['score'] ?? 0.0); + $tagType = TagTypes::normalize((string) ($hit['tag_type'] ?? TagTypes::GENERIC)); + + if ($tagId === '') { + continue; + } + + if ($tagType !== TagTypes::CATALOG_ENTITY) { + continue; + } + + $filtered[] = [ + 'tag_id' => $tagId, + 'score' => $score, + 'label' => isset($hit['label']) ? (string) $hit['label'] : null, + 'tag_type' => $tagType, + ]; } - // 5) Canonical Label - $label = trim((string)($best['label'] ?? '')); + usort( + $filtered, + static fn (array $left, array $right): int => ($right['score'] <=> $left['score']) + ); - if ($label === '') { - return null; - } + return $filtered; + } - return mb_strtolower($label); + private function normalizeLabel(string $label): string + { + $label = mb_strtolower(trim($label)); + $label = preg_replace('/\s+/u', ' ', $label) ?? $label; + + return trim($label); } } \ No newline at end of file diff --git a/src/Service/Admin/DocumentTagAdminService.php b/src/Service/Admin/DocumentTagAdminService.php index 32e5d26..f5313ba 100644 --- a/src/Service/Admin/DocumentTagAdminService.php +++ b/src/Service/Admin/DocumentTagAdminService.php @@ -8,65 +8,99 @@ use App\Entity\Document; use App\Entity\Tag; use App\Service\TagRebuildJobService; use App\Tag\TagService; +use App\Tag\TagTypes; use Doctrine\ORM\EntityManagerInterface; +use RuntimeException; -final class DocumentTagAdminService +final readonly class DocumentTagAdminService { public function __construct( - private readonly EntityManagerInterface $em, - private readonly TagService $tagService, - private readonly TagRebuildJobService $jobs, - ) {} + private EntityManagerInterface $em, + private TagService $tagService, + private TagRebuildJobService $jobs, + ) { + } /** * @return array{ * document: Document, * allTags: list, - * latestJob: mixed + * latestJob: mixed, + * hasActiveJob: bool * } */ public function getEditData(string $documentId): array { - $document = $this->em->getRepository(Document::class)->find($documentId); - if (!$document instanceof Document) { - throw new \RuntimeException('Document not found'); - } + $document = $this->findDocumentById($documentId); /** @var list $allTags */ - $allTags = $this->em->createQueryBuilder() - ->select('t') - ->from(Tag::class, 't') - ->orderBy('t.label', 'ASC') - ->getQuery() - ->getResult(); + $allTags = $this->em->getRepository(Tag::class)->findAll(); - $latestJob = $this->jobs->getLatestJob(); + usort( + $allTags, + static function (Tag $left, Tag $right): int { + $typeOrder = [ + TagTypes::CATALOG_ENTITY => 10, + TagTypes::GENERIC => 20, + TagTypes::SALES_SIGNAL => 30, + ]; + + $leftTypeRank = $typeOrder[$left->getType()] ?? 999; + $rightTypeRank = $typeOrder[$right->getType()] ?? 999; + + if ($leftTypeRank !== $rightTypeRank) { + return $leftTypeRank <=> $rightTypeRank; + } + + $labelComparison = strcasecmp($left->getLabel(), $right->getLabel()); + + if ($labelComparison !== 0) { + return $labelComparison; + } + + return strcmp($left->getSlug(), $right->getSlug()); + } + ); return [ 'document' => $document, 'allTags' => $allTags, - 'latestJob' => $latestJob, + 'latestJob' => $this->jobs->getLatestJob(), + 'hasActiveJob' => $this->jobs->hasActiveJob(), ]; } /** - * Speichert die Tag-Auswahl für ein Dokument (inkl. Sync-Logik). + * Persists the selected tag set for a document via the central domain service. + * + * @param array $selectedTagIds */ public function saveTags(string $documentId, array $selectedTagIds): void { - $document = $this->em->getRepository(Document::class)->find($documentId); - if (!$document instanceof Document) { - throw new \RuntimeException('Document not found'); - } + $document = $this->findDocumentById($documentId); - // Delegation an deine Domain-Logik (bleibt dort, wo sie hingehört) $this->tagService->syncDocumentTags($document, $selectedTagIds); } public function getLatestRebuildStatus(): ?string { - $job = $this->jobs->getLatestJob(); + return $this->jobs->getLatestJob()?->getStatus(); + } - return $job?->getStatus(); + private function findDocumentById(string $documentId): Document + { + $documentId = trim($documentId); + + if ($documentId === '') { + throw new RuntimeException('Document not found.'); + } + + $document = $this->em->getRepository(Document::class)->find($documentId); + + if (!$document instanceof Document) { + throw new RuntimeException('Document not found.'); + } + + return $document; } } \ No newline at end of file diff --git a/src/Service/Admin/TagAdminService.php b/src/Service/Admin/TagAdminService.php index b0a701f..0fbe27e 100644 --- a/src/Service/Admin/TagAdminService.php +++ b/src/Service/Admin/TagAdminService.php @@ -9,23 +9,29 @@ use App\Entity\DocumentTag; use App\Entity\Tag; use App\Service\TagRebuildJobService; use App\Tag\TagService; +use App\Tag\TagTypes; use Doctrine\ORM\EntityManagerInterface; +use RuntimeException; final readonly class TagAdminService { public function __construct( private EntityManagerInterface $em, - private TagService $tagService, - private TagRebuildJobService $jobs, - ) {} + private TagService $tagService, + private TagRebuildJobService $jobs, + ) { + } public function getIndexData(): array { + /** @var list $tags */ $tags = $this->em->getRepository(Tag::class) - ->findBy([], ['label' => 'ASC']); + ->findBy([], ['type' => 'ASC', 'label' => 'ASC']); return [ 'tags' => $tags, + 'tagTypeChoices' => TagTypes::choices(), + 'documentCountByTagId' => $this->buildDocumentCountByTagId(), 'latestJob' => $this->jobs->getLatestJob(), 'hasActiveJob' => $this->jobs->hasActiveJob(), ]; @@ -35,7 +41,7 @@ final readonly class TagAdminService string $slug, string $label, ?string $description, - string $type = 'generic' // NEU + string $type = TagTypes::GENERIC, ): void { $this->tagService->create($slug, $label, $description, $type); } @@ -47,35 +53,47 @@ final readonly class TagAdminService public function getAssignData(string $tagId): array { - $tag = $this->em->getRepository(Tag::class)->find($tagId); + $tag = $this->findTagById($tagId); - if (!$tag instanceof Tag) { - throw new \RuntimeException('Tag nicht gefunden.'); - } - - $documents = $this->em->getRepository(Document::class)->findAll(); + /** @var list $documents */ + $documents = $this->em->getRepository(Document::class)->findBy( + ['status' => Document::STATUS_ACTIVE], + ['title' => 'ASC'] + ); $documentsData = array_map( - fn(Document $d) => [ - 'id' => (string)$d->getId(), - 'title' => $d->getTitle(), + static fn (Document $document): array => [ + 'id' => (string) $document->getId(), + 'title' => $document->getTitle(), ], $documents ); + /** @var list $existingRelations */ $existingRelations = $this->em ->getRepository(DocumentTag::class) ->findBy(['tag' => $tag]); - $assignedDocIds = array_map( - fn(DocumentTag $dt) => (string)$dt->getDocument()->getId(), - $existingRelations + $activeDocumentIds = array_map( + static fn (Document $document): string => (string) $document->getId(), + $documents ); + $assignedDocIds = []; + + foreach ($existingRelations as $relation) { + $documentId = (string) $relation->getDocument()->getId(); + + if (in_array($documentId, $activeDocumentIds, true)) { + $assignedDocIds[] = $documentId; + } + } + return [ 'tag' => $tag, 'documents' => $documentsData, - 'assignedDocIds' => $assignedDocIds, + 'assignedDocIds' => array_values(array_unique($assignedDocIds)), + 'tagTypeChoices' => TagTypes::choices(), 'latestJob' => $this->jobs->getLatestJob(), 'hasActiveJob' => $this->jobs->hasActiveJob(), ]; @@ -83,12 +101,55 @@ final readonly class TagAdminService public function syncAssignments(string $tagId, array $selectedDocIds): void { + $tag = $this->findTagById($tagId); + $this->tagService->syncTagDocuments($tag, $selectedDocIds); + } + + private function findTagById(string $tagId): Tag + { + $tagId = trim($tagId); + + if ($tagId === '') { + throw new RuntimeException('Tag nicht gefunden.'); + } + $tag = $this->em->getRepository(Tag::class)->find($tagId); if (!$tag instanceof Tag) { - throw new \RuntimeException('Tag nicht gefunden.'); + throw new RuntimeException('Tag nicht gefunden.'); } - $this->tagService->syncTagDocuments($tag, $selectedDocIds); + return $tag; + } + + /** + * @return array + */ + private function buildDocumentCountByTagId(): array + { + $rows = $this->em->createQueryBuilder() + ->select('t AS tag', 'COUNT(d.id) AS documentCount') + ->from(Tag::class, 't') + ->leftJoin(DocumentTag::class, 'dt', 'WITH', 'dt.tag = t') + ->leftJoin('dt.document', 'd', 'WITH', 'd.status = :status') + ->groupBy('t.id') + ->setParameter('status', Document::STATUS_ACTIVE) + ->getQuery() + ->getResult(); + + $counts = []; + + foreach ($rows as $row) { + $tag = $row[0] ?? $row['tag'] ?? null; + $documentCount = (int) ($row['documentCount'] ?? 0); + + if (!$tag instanceof Tag) { + continue; + } + + $counts[$tag->getId()->toRfc4122()] = $documentCount; + } + + return $counts; } } \ No newline at end of file diff --git a/src/Service/DocumentService.php b/src/Service/DocumentService.php index 9f1a86a..69b3958 100644 --- a/src/Service/DocumentService.php +++ b/src/Service/DocumentService.php @@ -1,29 +1,33 @@ setTitle($title); + $document->setTitle(trim($title)); $document->setCreatedBy($user); $version = new DocumentVersion(); @@ -44,14 +48,13 @@ class DocumentService } /** - * Fügt neue Version hinzu (immutable) + * Adds a new immutable version to an existing document. */ public function addVersion( Document $document, string $filePath, User $user ): DocumentVersion { - $nextVersionNumber = $this->getNextVersionNumber($document); $version = new DocumentVersion(); @@ -70,7 +73,7 @@ class DocumentService } /** - * Aktiviert eine Version + * Activates a document version and marks it for re-ingest. */ public function activateVersion(DocumentVersion $version): void { @@ -82,41 +85,77 @@ class DocumentService $version->setActive(true); $document->setCurrentVersion($version); - $version->setIngestStatus(DocumentVersion::INGEST_PENDING); $this->em->flush(); } /** - * Archiviert Dokument + * Archives a document. + * + * If the document had tag assignments, the tag index is rebuilt so the + * routing layer no longer works with an outdated active document set. */ public function archive(Document $document): void { + if ($document->getStatus() === Document::STATUS_ARCHIVED) { + return; + } + + $shouldRebuildTags = $this->hasTagAssignments($document); + $document->archive(); $this->em->flush(); - } - public function delete(Document $document): void - { - $this->em->remove($document); - $this->em->flush(); + if ($shouldRebuildTags) { + $this->triggerTagRebuildIfIdle(); + } } /** - * Berechnet SHA256 Checksum + * Deletes a document. + * + * If the document had tag assignments, the tag index is rebuilt after the + * removal so stale document references disappear from tag-based routing. + */ + public function delete(Document $document): void + { + $shouldRebuildTags = $this->hasTagAssignments($document); + + $this->em->remove($document); + $this->em->flush(); + + if ($shouldRebuildTags) { + $this->triggerTagRebuildIfIdle(); + } + } + + /** + * Calculates the SHA256 checksum for a file path. */ private function calculateChecksum(string $filePath): string { - if (!file_exists($filePath)) { - throw new \RuntimeException('File not found for checksum.'); + $filePath = trim($filePath); + + if ($filePath === '') { + throw new RuntimeException('File path must not be empty.'); } - return hash_file('sha256', $filePath); + if (!is_file($filePath)) { + throw new RuntimeException('File not found for checksum.'); + } + + $checksum = hash_file('sha256', $filePath); + + if ($checksum === false) { + throw new RuntimeException('Could not calculate file checksum.'); + } + + return $checksum; } /** - * Ermittelt nächste Versionsnummer + * Determines the next version number for a document. */ private function getNextVersionNumber(Document $document): int { @@ -128,4 +167,16 @@ class DocumentService return $max + 1; } -} + + private function hasTagAssignments(Document $document): bool + { + return $document->getDocumentTags()->count() > 0; + } + + private function triggerTagRebuildIfIdle(): void + { + if (!$this->tagRebuildJobService->hasActiveJob()) { + $this->tagRebuildJobService->enqueueAndStartAsync(); + } + } +} \ No newline at end of file diff --git a/src/Service/TagRebuildJobService.php b/src/Service/TagRebuildJobService.php index e07dec5..d02e7fe 100644 --- a/src/Service/TagRebuildJobService.php +++ b/src/Service/TagRebuildJobService.php @@ -11,16 +11,24 @@ use Psr\Log\LoggerInterface; final readonly class TagRebuildJobService { /** - * Wenn ein QUEUED-Job länger nicht startet, gilt er als "stale" und wird auf FAILED gesetzt, - * damit das System nicht dauerhaft blockiert. + * If a QUEUED job does not transition into RUNNING in time, + * it is treated as stale so the system does not stay blocked forever. */ - private const STALE_QUEUED_AFTER_SECONDS = 300; // 5 Minuten + private const STALE_QUEUED_AFTER_SECONDS = 300; + + /** + * The background runner should switch the job from QUEUED to RUNNING almost + * immediately because markRunning() happens at the top of the command. + */ + private const ASYNC_START_TIMEOUT_SECONDS = 3; + private const ASYNC_START_POLL_INTERVAL_MICROSECONDS = 250000; public function __construct( private EntityManagerInterface $em, - private LoggerInterface $agentLogger, - private string $projectDir, - ) {} + private LoggerInterface $agentLogger, + private string $projectDir, + ) { + } public function enqueueAndStartAsync(): TagRebuildJob { @@ -29,14 +37,25 @@ final readonly class TagRebuildJobService $this->em->persist($job); $this->em->flush(); - $this->startAsync($job); + try { + $this->startAsync($job); + } catch (\Throwable $e) { + $job->markFailed('Async tag rebuild start failed: ' . $e->getMessage()); + $this->em->flush(); + + $this->agentLogger->error('[tags] async job start failed', [ + 'job' => (string) $job->getId(), + 'error' => $e->getMessage(), + ]); + + throw $e; + } return $job; } public function enqueueIfIdle(): ?TagRebuildJob { - // Coalescing: Wenn ein Job läuft oder queued ist -> nichts tun if ($this->hasActiveJob()) { return null; } @@ -44,23 +63,18 @@ final readonly class TagRebuildJobService return $this->enqueueAndStartAsync(); } - /** - * Letzter Job (egal welcher Status). - */ public function getLatestJob(): ?TagRebuildJob { return $this->em->createQueryBuilder() ->select('j') ->from(TagRebuildJob::class, 'j') ->orderBy('j.createdAt', 'DESC') + ->addOrderBy('j.id', 'DESC') ->setMaxResults(1) ->getQuery() ->getOneOrNullResult(); } - /** - * Letzter Job mit Status COMPLETED. - */ public function getLatestCompletedJob(): ?TagRebuildJob { return $this->em->createQueryBuilder() @@ -69,18 +83,12 @@ final readonly class TagRebuildJobService ->where('j.status = :status') ->setParameter('status', TagRebuildJob::STATUS_COMPLETED) ->orderBy('j.createdAt', 'DESC') + ->addOrderBy('j.id', 'DESC') ->setMaxResults(1) ->getQuery() ->getOneOrNullResult(); } - /** - * Ob gerade ein Job aktiv ist: - * - RUNNING ist immer aktiv - * - QUEUED ist nur aktiv, wenn er nicht stale ist - * - * Zusätzlich: stale QUEUED Jobs werden auf FAILED gesetzt (Recovery). - */ public function hasActiveJob(): bool { $this->markStaleQueuedJobsFailed(); @@ -106,31 +114,33 @@ final readonly class TagRebuildJobService return (int) $qb->getQuery()->getSingleScalarResult() > 0; } - /** - * Startet den Job async über bin/console. - * Wichtige Fixes: - * - php explizit verwenden - * - --no-interaction - * - Logfile statt /dev/null - */ private function startAsync(TagRebuildJob $job): void { - $projectDir = rtrim($this->projectDir, '/'); - $console = $projectDir . '/bin/console'; + $projectDir = rtrim(trim($this->projectDir), '/'); + $console = $projectDir . '/bin/console'; + if ($projectDir === '' || !is_dir($projectDir)) { + throw new \RuntimeException('Project directory is invalid.'); + } + + if (!is_file($console)) { + throw new \RuntimeException('bin/console not found: ' . $console); + } + + $phpBinary = $this->resolvePhpBinary(); $jobId = (string) $job->getId(); $logDir = $projectDir . '/var/log/tags'; - if (!is_dir($logDir)) { - @mkdir($logDir, 0777, true); + if (!is_dir($logDir) && !@mkdir($logDir, 0775, true) && !is_dir($logDir)) { + throw new \RuntimeException('Could not create tag job log directory.'); } + $logFile = $logDir . '/job_' . $jobId . '.log'; - // Robust: cd ins Projekt, dann nohup php bin/console ... $cmd = sprintf( - 'cd %s && nohup %s %s %s %s --no-interaction >> %s 2>&1 &', + 'cd %s && nohup %s %s %s %s --no-interaction >> %s 2>&1 & echo $!', escapeshellarg($projectDir), - escapeshellcmd('php'), + escapeshellarg($phpBinary), escapeshellarg($console), escapeshellarg('mto:agent:tags:job:run'), escapeshellarg($jobId), @@ -141,15 +151,92 @@ final readonly class TagRebuildJobService 'job' => $jobId, 'cmd' => $cmd, 'log' => $logFile, + 'php_binary' => $phpBinary, ]); - @exec($cmd); + $output = []; + $exitCode = 0; + @exec($cmd, $output, $exitCode); + + $pid = isset($output[0]) ? trim((string) $output[0]) : ''; + + if ($exitCode !== 0) { + throw new \RuntimeException('Async process bootstrap failed with exit code ' . $exitCode . '.'); + } + + if ($pid === '' || !ctype_digit($pid)) { + throw new \RuntimeException('Async process bootstrap did not return a valid PID.'); + } + + $this->agentLogger->info('[tags] async job process started', [ + 'job' => $jobId, + 'pid' => $pid, + 'log' => $logFile, + 'php_binary' => $phpBinary, + ]); + + $this->waitForAsyncJobTransition($job, $logFile); + } + + private function resolvePhpBinary(): string + { + $envCandidates = [ + trim((string) ($_SERVER['PHP_CLI_BINARY'] ?? '')), + trim((string) ($_ENV['PHP_CLI_BINARY'] ?? '')), + trim((string) getenv('PHP_CLI_BINARY')), + ]; + + foreach ($envCandidates as $candidate) { + if ($this->isValidCliPhpBinary($candidate)) { + return $candidate; + } + } + + $phpBinary = defined('PHP_BINARY') ? trim((string) PHP_BINARY) : ''; + if ($this->isValidCliPhpBinary($phpBinary)) { + return $phpBinary; + } + + $fallbackCandidates = [ + '/usr/bin/php', + '/usr/local/bin/php', + '/bin/php', + '/opt/homebrew/bin/php', + ]; + + foreach ($fallbackCandidates as $candidate) { + if ($this->isValidCliPhpBinary($candidate)) { + return $candidate; + } + } + + $whichPhp = trim((string) @shell_exec('command -v php 2>/dev/null')); + if ($this->isValidCliPhpBinary($whichPhp)) { + return $whichPhp; + } + + throw new \RuntimeException( + 'Could not resolve a CLI PHP binary. Set PHP_CLI_BINARY explicitly, e.g. /usr/bin/php.' + ); + } + + private function isValidCliPhpBinary(string $path): bool + { + $path = trim($path); + + if ($path === '' || !is_file($path) || !is_executable($path)) { + return false; + } + + $basename = strtolower(basename($path)); + + if (str_contains($basename, 'fpm') || str_contains($basename, 'cgi')) { + return false; + } + + return true; } - /** - * Recovery gegen "ewig QUEUED": - * Setzt alte QUEUED Jobs auf FAILED, damit enqueueIfIdle() nicht dauerhaft blockiert. - */ private function markStaleQueuedJobsFailed(): void { $cutoff = new \DateTimeImmutable('-' . self::STALE_QUEUED_AFTER_SECONDS . ' seconds'); @@ -161,12 +248,13 @@ final readonly class TagRebuildJobService ->andWhere('j.createdAt < :cutoff') ->setParameter('queued', TagRebuildJob::STATUS_QUEUED) ->setParameter('cutoff', $cutoff) + ->orderBy('j.createdAt', 'ASC') ->setMaxResults(25); - /** @var TagRebuildJob[] $stale */ + /** @var list $stale */ $stale = $qb->getQuery()->getResult(); - if (!$stale) { + if ($stale === []) { return; } @@ -183,4 +271,46 @@ final readonly class TagRebuildJobService $this->em->flush(); } + + private function waitForAsyncJobTransition(TagRebuildJob $job, string $logFile): void + { + $deadline = microtime(true) + self::ASYNC_START_TIMEOUT_SECONDS; + + while (microtime(true) < $deadline) { + usleep(self::ASYNC_START_POLL_INTERVAL_MICROSECONDS); + $this->em->refresh($job); + + if (!$job->isQueued()) { + return; + } + } + + $logHint = $this->readLogTail($logFile); + + throw new \RuntimeException( + 'Async tag rebuild runner did not transition from QUEUED to RUNNING within ' + . self::ASYNC_START_TIMEOUT_SECONDS + . ' seconds.' + . ($logHint !== null ? ' Log tail: ' . $logHint : '') + ); + } + + private function readLogTail(string $logFile): ?string + { + if (!is_file($logFile) || !is_readable($logFile)) { + return null; + } + + $content = @file_get_contents($logFile); + + if (!is_string($content) || trim($content) === '') { + return null; + } + + $content = trim($content); + $tail = mb_substr($content, -800); + $tail = preg_replace('/\s+/u', ' ', $tail) ?? $tail; + + return trim($tail) !== '' ? trim($tail) : null; + } } \ No newline at end of file diff --git a/src/Service/TagRebuildStatusProvider.php b/src/Service/TagRebuildStatusProvider.php index 6154191..7cab7bc 100644 --- a/src/Service/TagRebuildStatusProvider.php +++ b/src/Service/TagRebuildStatusProvider.php @@ -11,29 +11,76 @@ final readonly class TagRebuildStatusProvider { public function __construct( private EntityManagerInterface $em - ) {} + ) { + } public function getLatestStatus(): ?array { - $this->em->clear(); - - $job = $this->em->createQueryBuilder() - ->select('j') + $row = $this->em->createQueryBuilder() + ->select( + 'j.status AS status', + 'j.createdAt AS createdAt', + 'j.startedAt AS startedAt', + 'j.finishedAt AS finishedAt', + 'j.errorMessage AS errorMessage' + ) ->from(TagRebuildJob::class, 'j') ->orderBy('j.createdAt', 'DESC') + ->addOrderBy('j.id', 'DESC') ->setMaxResults(1) ->getQuery() - ->getOneOrNullResult(); + ->getOneOrNullResult(\Doctrine\ORM\Query::HYDRATE_ARRAY); - if (!$job instanceof TagRebuildJob) { + if (!is_array($row)) { + return null; + } + + $status = trim((string) ($row['status'] ?? '')); + + if ($status === '') { return null; } return [ - 'status' => $job->getStatus(), - 'startedAt' => $job->getStartedAt()?->format(DATE_ATOM), - 'finishedAt' => $job->getFinishedAt()?->format(DATE_ATOM), - 'error' => $job->getErrorMessage(), + 'status' => $status, + 'createdAt' => $this->formatDateValue($row['createdAt'] ?? null), + 'startedAt' => $this->formatDateValue($row['startedAt'] ?? null), + 'finishedAt' => $this->formatDateValue($row['finishedAt'] ?? null), + 'error' => $this->normalizeNullableString($row['errorMessage'] ?? null), + 'hasActiveJob' => in_array($status, [ + TagRebuildJob::STATUS_QUEUED, + TagRebuildJob::STATUS_RUNNING, + ], true), ]; } + + private function formatDateValue(mixed $value): ?string + { + if ($value instanceof \DateTimeInterface) { + return $value->format(DATE_ATOM); + } + + if (is_string($value)) { + $value = trim($value); + + if ($value === '') { + return null; + } + + try { + return (new \DateTimeImmutable($value))->format(DATE_ATOM); + } catch (\Throwable) { + return null; + } + } + + return null; + } + + private function normalizeNullableString(mixed $value): ?string + { + $value = trim((string) $value); + + return $value !== '' ? $value : null; + } } \ No newline at end of file diff --git a/src/Tag/TagNdjsonExporter.php b/src/Tag/TagNdjsonExporter.php index 93cfcd0..5bd1aa3 100644 --- a/src/Tag/TagNdjsonExporter.php +++ b/src/Tag/TagNdjsonExporter.php @@ -4,6 +4,7 @@ declare(strict_types=1); namespace App\Tag; +use App\Entity\Document; use App\Entity\DocumentTag; use App\Entity\Tag; use Doctrine\ORM\EntityManagerInterface; @@ -12,148 +13,199 @@ final readonly class TagNdjsonExporter { public function __construct( private EntityManagerInterface $em, - private string $tagsNdjsonPath, - ) {} + private string $tagsNdjsonPath, + ) { + } /** - * Export all tags into NDJSON (streaming) with atomic switch (.tmp + rename()). + * Export all relevant tags into NDJSON (streaming) with atomic switch (.tmp + rename()). * * Line format: * { * "tag_id":"...", * "text":"label\nslug\noptional description", - * "type":"catalog_entity|generic|...", + * "type":"catalog_entity|generic|sales_signal", * "document_ids":["...","..."] * } * + * Only ACTIVE document assignments are exported. Tags without active document + * assignments are intentionally skipped so they do not influence retrieval. + * * @return array{tags:int, lines:int, bytes:int, path:string} */ public function export(): array { - $dir = \dirname($this->tagsNdjsonPath); - if (!\is_dir($dir)) { - @\mkdir($dir, 0775, true); - } + $this->ensureTargetDirectoryExists(); $tmpPath = $this->tagsNdjsonPath . '.tmp'; + $this->cleanupTemporaryFile($tmpPath); - $fh = @\fopen($tmpPath, 'wb'); - if (!$fh) { + $fh = @fopen($tmpPath, 'wb'); + + if ($fh === false) { throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath); } - // 1) Load all tags - $tags = $this->em->createQueryBuilder() - ->select('t') - ->from(Tag::class, 't') - ->orderBy('t.label', 'ASC') - ->getQuery() - ->getResult(); + try { + /** @var list $tags */ + $tags = $this->em->createQueryBuilder() + ->select('t') + ->from(Tag::class, 't') + ->orderBy('t.type', 'ASC') + ->addOrderBy('t.label', 'ASC') + ->getQuery() + ->getResult(); - if (!\is_array($tags) || $tags === []) { - \fclose($fh); + if ($tags === []) { + fclose($fh); + $this->atomicReplace($tmpPath, $this->tagsNdjsonPath); + + return [ + 'tags' => 0, + 'lines' => 0, + 'bytes' => (int) @filesize($this->tagsNdjsonPath), + 'path' => $this->tagsNdjsonPath, + ]; + } + + $tagToActiveDocs = $this->buildActiveDocumentMap(); + $lines = 0; + + foreach ($tags as $tag) { + $tagId = $tag->getId()->toRfc4122(); + $docIds = $tagToActiveDocs[$tagId] ?? []; + + if ($docIds === []) { + continue; + } + + $line = [ + 'tag_id' => $tagId, + 'text' => $this->buildEmbeddingText($tag), + 'type' => TagTypes::normalize($tag->getType()), + 'document_ids' => $docIds, + ]; + + $json = json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES); + + if (!is_string($json)) { + continue; + } + + fwrite($fh, $json . "\n"); + $lines++; + } + + fclose($fh); $this->atomicReplace($tmpPath, $this->tagsNdjsonPath); return [ - 'tags' => 0, - 'lines' => 0, - 'bytes' => (int) @\filesize($this->tagsNdjsonPath), - 'path' => $this->tagsNdjsonPath, + 'tags' => count($tags), + 'lines' => $lines, + 'bytes' => (int) @filesize($this->tagsNdjsonPath), + 'path' => $this->tagsNdjsonPath, ]; - } + } catch (\Throwable $e) { + fclose($fh); + $this->cleanupTemporaryFile($tmpPath); - // 2) Build tagId => docIds map - $rows = $this->em->createQueryBuilder() - ->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId') + throw $e; + } + } + + /** + * @return array> + */ + private function buildActiveDocumentMap(): array + { + /** @var list $relations */ + $relations = $this->em->createQueryBuilder() + ->select('dt') + ->addSelect('t', 'd') ->from(DocumentTag::class, 'dt') + ->innerJoin('dt.tag', 't') + ->innerJoin('dt.document', 'd') + ->where('d.status = :status') + ->setParameter('status', Document::STATUS_ACTIVE) ->getQuery() - ->getArrayResult(); + ->getResult(); $tagToDocs = []; - foreach ($rows as $r) { - $tagId = (string) ($r['tagId'] ?? ''); - $docId = (string) ($r['docId'] ?? ''); - if ($tagId === '' || $docId === '') { - continue; - } - $tagToDocs[$tagId][] = $docId; + + foreach ($relations as $relation) { + $tag = $relation->getTag(); + $document = $relation->getDocument(); + + $tagId = $tag->getId()->toRfc4122(); + $docId = $document->getId()->toRfc4122(); + + $tagToDocs[$tagId][$docId] = $docId; } - // 3) Stream NDJSON - $lines = 0; - - foreach ($tags as $tag) { - if (!$tag instanceof Tag) { - continue; - } - - $tagId = (string) $tag->getId(); - $docIds = $tagToDocs[$tagId] ?? []; - - if ($docIds !== []) { - $docIds = \array_values(\array_unique($docIds)); - } - - // Embedding source - $textParts = [ - $tag->getLabel(), - $tag->getSlug(), - ]; - - $desc = $tag->getDescription(); - if (\is_string($desc) && \trim($desc) !== '') { - $textParts[] = \trim($desc); - } - - $type = method_exists($tag, 'getType') - ? (string) $tag->getType() - : 'generic'; - - if ($type === '') { - $type = 'generic'; - } - - $line = [ - 'tag_id' => $tagId, - 'text' => \implode("\n", $textParts), - 'type' => $type, // 🔥 NEW - 'document_ids' => $docIds, - ]; - - $json = \json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES); - if (!\is_string($json)) { - continue; - } - - \fwrite($fh, $json . "\n"); - $lines++; + foreach ($tagToDocs as $tagId => $docIds) { + ksort($docIds); + $tagToDocs[$tagId] = array_values($docIds); } - \fclose($fh); - $this->atomicReplace($tmpPath, $this->tagsNdjsonPath); + return $tagToDocs; + } - return [ - 'tags' => \count($tags), - 'lines' => $lines, - 'bytes' => (int) @\filesize($this->tagsNdjsonPath), - 'path' => $this->tagsNdjsonPath, + private function buildEmbeddingText(Tag $tag): string + { + $parts = [ + trim($tag->getLabel()), + trim($tag->getSlug()), ]; + + $description = trim((string) $tag->getDescription()); + + if ($description !== '') { + $parts[] = preg_replace('/\s+/u', ' ', $description) ?? $description; + } + + $parts = array_values(array_filter( + array_unique($parts), + static fn (string $part): bool => $part !== '' + )); + + return implode("\n", $parts); + } + + private function ensureTargetDirectoryExists(): void + { + $dir = dirname($this->tagsNdjsonPath); + + if (is_dir($dir)) { + return; + } + + if (!@mkdir($dir, 0775, true) && !is_dir($dir)) { + throw new \RuntimeException('Cannot create tags NDJSON directory: ' . $dir); + } + } + + private function cleanupTemporaryFile(string $tmpPath): void + { + if (is_file($tmpPath)) { + @unlink($tmpPath); + } } private function atomicReplace(string $tmpPath, string $finalPath): void { - if (\is_file($finalPath)) { - @\chmod($finalPath, 0664); + if (is_file($finalPath)) { + @chmod($finalPath, 0664); } - if (!@\rename($tmpPath, $finalPath)) { - if (!@\copy($tmpPath, $finalPath)) { - @\unlink($tmpPath); + if (!@rename($tmpPath, $finalPath)) { + if (!@copy($tmpPath, $finalPath)) { + @unlink($tmpPath); throw new \RuntimeException('Atomic replace failed for: ' . $finalPath); } - @\unlink($tmpPath); + + @unlink($tmpPath); } - @\chmod($finalPath, 0664); + @chmod($finalPath, 0664); } } \ No newline at end of file diff --git a/src/Tag/TagRoutingService.php b/src/Tag/TagRoutingService.php index f8e3a89..5a497e3 100644 --- a/src/Tag/TagRoutingService.php +++ b/src/Tag/TagRoutingService.php @@ -4,6 +4,7 @@ declare(strict_types=1); namespace App\Tag; +use App\Entity\Document; use Doctrine\DBAL\ArrayParameterType; use Doctrine\DBAL\Exception; use Doctrine\ORM\EntityManagerInterface; @@ -11,91 +12,239 @@ use Symfony\Component\Uid\Uuid; final class TagRoutingService { + /** + * Number of raw tag hits requested from the vector service. + */ private const DEFAULT_TOPK = 8; - private const MIN_BEST_SCORE = 0.25; - private const MAX_CANDIDATE_DOCS = 200; + + /** + * Hard minimum confidence required to activate tag-based document routing. + * + * This intentionally aligns with the tag vector client gate to avoid + * misleading secondary thresholds in this class. + */ + private const MIN_BEST_SCORE = 0.72; + + /** + * Only keep tag hits that stay reasonably close to the best hit. + * This reduces semantic spillover into weakly related document spaces. + */ + private const MAX_SCORE_DROP_FROM_BEST = 0.08; + + /** + * Maximum number of tag hits that may influence routing. + */ + private const MAX_ROUTING_TAGS = 5; + + /** + * Maximum number of candidate documents passed into scoped chunk search. + */ + private const MAX_CANDIDATE_DOCS = 80; + + /** + * Small bonus for documents matched by multiple routed tags. + */ + private const MULTI_TAG_BONUS_PER_EXTRA_TAG = 0.05; + private const MAX_MULTI_TAG_BONUS = 0.15; public function __construct( private readonly TagVectorSearchClient $tagSearch, private readonly EntityManagerInterface $em, - ) {} + ) { + } /** - * @return string[]|null + * Returns ordered active document ids for tag-scoped retrieval. + * + * The method intentionally returns only document ids so the current + * retriever pipeline can stay unchanged. + * + * @return list|null * @throws Exception */ public function route(string $query): ?array { $query = trim($query); + if ($query === '') { return null; } - $hits = $this->tagSearch->search($query, self::DEFAULT_TOPK); + $hits = $this->filterRoutingHits( + $this->tagSearch->search($query, self::DEFAULT_TOPK) + ); - if (!is_array($hits) || $hits === []) { + if ($hits === []) { return null; } - $bestScore = (float)($hits[0]['score'] ?? 0.0); - if ($bestScore < self::MIN_BEST_SCORE) { - return null; - } - - // Convert tag UUID strings to binary(16) $tagBinaryIds = []; + $tagMetaById = []; foreach ($hits as $hit) { - $id = (string)($hit['tag_id'] ?? ''); - if ($id === '') { + $tagId = (string) ($hit['tag_id'] ?? ''); + + if ($tagId === '') { continue; } try { - $tagBinaryIds[] = Uuid::fromString($id)->toBinary(); + $tagBinaryIds[] = Uuid::fromString($tagId)->toBinary(); } catch (\Throwable) { continue; } + + $tagMetaById[$tagId] = [ + 'score' => (float) $hit['score'], + 'weight' => $this->resolveTypeWeight((string) $hit['tag_type']), + ]; } if ($tagBinaryIds === []) { return null; } - // Direct DBAL query (binary-safe) - $conn = $this->em->getConnection(); - - $rows = $conn->executeQuery( - 'SELECT document_id - FROM document_tag - WHERE tag_id IN (:tagIds)', - ['tagIds' => $tagBinaryIds], - ['tagIds' => ArrayParameterType::BINARY] + $rows = $this->em->getConnection()->executeQuery( + 'SELECT dt.document_id, dt.tag_id + FROM document_tag dt + INNER JOIN document d ON d.id = dt.document_id + WHERE dt.tag_id IN (:tagIds) + AND d.status = :status', + [ + 'tagIds' => $tagBinaryIds, + 'status' => Document::STATUS_ACTIVE, + ], + [ + 'tagIds' => ArrayParameterType::BINARY, + ] )->fetchAllAssociative(); if ($rows === []) { return null; } - $docIds = []; + $documentScores = []; + $documentMatchedTags = []; foreach ($rows as $row) { - if (!isset($row['document_id'])) { + if (!isset($row['document_id'], $row['tag_id'])) { continue; } try { - $uuid = Uuid::fromBinary($row['document_id']); - $docIds[(string)$uuid] = true; + $documentId = (string) Uuid::fromBinary($row['document_id']); + $tagId = (string) Uuid::fromBinary($row['tag_id']); } catch (\Throwable) { continue; } - if (count($docIds) >= self::MAX_CANDIDATE_DOCS) { + if (!isset($tagMetaById[$tagId])) { + continue; + } + + $documentScores[$documentId] = ($documentScores[$documentId] ?? 0.0) + + ($tagMetaById[$tagId]['score'] * $tagMetaById[$tagId]['weight']); + + $documentMatchedTags[$documentId][$tagId] = true; + } + + if ($documentScores === []) { + return null; + } + + foreach ($documentScores as $documentId => $score) { + $matchedTagCount = isset($documentMatchedTags[$documentId]) + ? count($documentMatchedTags[$documentId]) + : 0; + + if ($matchedTagCount > 1) { + $documentScores[$documentId] += min( + self::MAX_MULTI_TAG_BONUS, + ($matchedTagCount - 1) * self::MULTI_TAG_BONUS_PER_EXTRA_TAG + ); + } + } + + arsort($documentScores, SORT_NUMERIC); + + return array_slice( + array_keys($documentScores), + 0, + self::MAX_CANDIDATE_DOCS + ); + } + + /** + * @param array $hits + * + * @return list + */ + private function filterRoutingHits(array $hits): array + { + if ($hits === []) { + return []; + } + + $bestScore = (float) ($hits[0]['score'] ?? 0.0); + + if ($bestScore < self::MIN_BEST_SCORE) { + return []; + } + + $minimumAcceptedScore = max( + self::MIN_BEST_SCORE, + $bestScore - self::MAX_SCORE_DROP_FROM_BEST + ); + + $filtered = []; + + foreach ($hits as $hit) { + $tagId = (string) ($hit['tag_id'] ?? ''); + $score = (float) ($hit['score'] ?? 0.0); + $tagType = TagTypes::normalize( + (string) ($hit['tag_type'] ?? TagTypes::GENERIC) + ); + + if ($tagId === '' || $score < $minimumAcceptedScore) { + continue; + } + + // Sales signals may still be useful elsewhere, but they should not + // expand the document scope for semantic retrieval. + if ($tagType === TagTypes::SALES_SIGNAL) { + continue; + } + + $filtered[] = [ + 'tag_id' => $tagId, + 'score' => $score, + 'tag_type' => $tagType, + ]; + + if (count($filtered) >= self::MAX_ROUTING_TAGS) { break; } } - return array_keys($docIds); + return $filtered; + } + + private function resolveTypeWeight(string $tagType): float + { + return match (TagTypes::normalize($tagType)) { + TagTypes::CATALOG_ENTITY => 1.20, + TagTypes::GENERIC => 1.00, + TagTypes::SALES_SIGNAL => 0.00, + default => 1.00, + }; } } \ No newline at end of file diff --git a/src/Tag/TagService.php b/src/Tag/TagService.php index 65e8117..dffc27c 100644 --- a/src/Tag/TagService.php +++ b/src/Tag/TagService.php @@ -4,42 +4,45 @@ declare(strict_types=1); namespace App\Tag; -use App\Entity\Tag; use App\Entity\Document; use App\Entity\DocumentTag; +use App\Entity\Tag; use App\Service\TagRebuildJobService; use Doctrine\ORM\EntityManagerInterface; +use InvalidArgumentException; +use RuntimeException; final readonly class TagService { public function __construct( private EntityManagerInterface $em, - private TagRebuildJobService $jobs, - ) {} - - // ========================================================= - // TAG CREATE - // ========================================================= + private TagRebuildJobService $jobs, + ) { + } public function create( string $slug, string $label, ?string $description = null, - string $type = 'generic' // NEU + string $type = TagTypes::GENERIC, ): Tag { - $slug = trim($slug); + $normalizedSlug = $this->normalizeSlug($slug); $label = trim($label); - if ($label === '' || $slug === '') { - throw new \InvalidArgumentException('Label und Slug sind Pflichtfelder.'); + if ($normalizedSlug === '' || $label === '') { + throw new InvalidArgumentException('Tag label and slug are required.'); } - if ($this->slugExists($slug)) { - throw new \RuntimeException('Slug existiert bereits.'); + if ($this->slugExists($normalizedSlug)) { + throw new RuntimeException(sprintf('Tag slug "%s" already exists.', $normalizedSlug)); } - $tag = new Tag($slug, $label, $description); - $tag->setType($type); // NEU + $tag = new Tag( + $normalizedSlug, + $label, + $description, + TagTypes::normalize($type) + ); $this->em->persist($tag); $this->em->flush(); @@ -49,18 +52,9 @@ final readonly class TagService return $tag; } - // ========================================================= - // TAG DELETE - // ========================================================= - public function deleteById(string $tagId): void { - $tag = $this->em->getRepository(Tag::class)->find($tagId); - - if (!$tag instanceof Tag) { - throw new \RuntimeException('Tag nicht gefunden.'); - } - + $tag = $this->findTagById($tagId); $this->delete($tag); } @@ -72,87 +66,103 @@ final readonly class TagService $this->triggerRebuildIfIdle(); } - // ========================================================= - // DOCUMENT TAG SYNC - // ========================================================= - public function syncDocumentTags(Document $document, array $newTagIds): void { - $newTagIds = array_unique($newTagIds); + $normalizedTagIds = $this->normalizeIdList($newTagIds); + /** @var list $currentRelations */ $currentRelations = $this->em ->getRepository(DocumentTag::class) ->findBy(['document' => $document]); $currentTagIds = array_map( - fn(DocumentTag $dt) => (string) $dt->getTag()->getId(), + static fn (DocumentTag $relation): string => (string) $relation->getTag()->getId(), $currentRelations ); - $toAdd = array_diff($newTagIds, $currentTagIds); - $toRemove = array_diff($currentTagIds, $newTagIds); + $toAdd = array_values(array_diff($normalizedTagIds, $currentTagIds)); + $toRemove = array_values(array_diff($currentTagIds, $normalizedTagIds)); foreach ($toAdd as $tagId) { $tag = $this->em->getRepository(Tag::class)->find($tagId); + if ($tag instanceof Tag) { $this->em->persist(new DocumentTag($document, $tag)); } } foreach ($currentRelations as $relation) { - if (in_array((string) $relation->getTag()->getId(), $toRemove, true)) { + $relationTagId = (string) $relation->getTag()->getId(); + + if (in_array($relationTagId, $toRemove, true)) { $this->em->remove($relation); } } - if ($toAdd || $toRemove) { + if ($toAdd !== [] || $toRemove !== []) { $this->em->flush(); $this->triggerRebuildIfIdle(); } } - // ========================================================= - // TAG → DOCUMENT SYNC (Bulk Assign) - // ========================================================= - public function syncTagDocuments(Tag $tag, array $newDocumentIds): void { - $newDocumentIds = array_unique($newDocumentIds); + $normalizedDocumentIds = $this->normalizeIdList($newDocumentIds); + /** @var list $currentRelations */ $currentRelations = $this->em ->getRepository(DocumentTag::class) ->findBy(['tag' => $tag]); $currentDocumentIds = array_map( - fn(DocumentTag $dt) => (string) $dt->getDocument()->getId(), + static fn (DocumentTag $relation): string => (string) $relation->getDocument()->getId(), $currentRelations ); - $toAdd = array_diff($newDocumentIds, $currentDocumentIds); - $toRemove = array_diff($currentDocumentIds, $newDocumentIds); + $toAdd = array_values(array_diff($normalizedDocumentIds, $currentDocumentIds)); + $toRemove = array_values(array_diff($currentDocumentIds, $normalizedDocumentIds)); foreach ($toAdd as $documentId) { $document = $this->em->getRepository(Document::class)->find($documentId); - if ($document instanceof Document) { + + if ( + $document instanceof Document + && $document->getStatus() === Document::STATUS_ACTIVE + ) { $this->em->persist(new DocumentTag($document, $tag)); } } foreach ($currentRelations as $relation) { - if (in_array((string) $relation->getDocument()->getId(), $toRemove, true)) { + $relationDocumentId = (string) $relation->getDocument()->getId(); + + if (in_array($relationDocumentId, $toRemove, true)) { $this->em->remove($relation); } } - if ($toAdd || $toRemove) { + if ($toAdd !== [] || $toRemove !== []) { $this->em->flush(); $this->triggerRebuildIfIdle(); } } - // ========================================================= - // INTERNAL HELPERS - // ========================================================= + private function findTagById(string $tagId): Tag + { + $tagId = trim($tagId); + + if ($tagId === '') { + throw new InvalidArgumentException('Tag id must not be empty.'); + } + + $tag = $this->em->getRepository(Tag::class)->find($tagId); + + if (!$tag instanceof Tag) { + throw new RuntimeException('Tag not found.'); + } + + return $tag; + } private function slugExists(string $slug): bool { @@ -165,6 +175,36 @@ final readonly class TagService ->getSingleScalarResult() > 0; } + /** + * @param array $ids + * @return list + */ + private function normalizeIdList(array $ids): array + { + $normalized = []; + + foreach ($ids as $id) { + $id = trim((string) $id); + + if ($id === '') { + continue; + } + + $normalized[] = $id; + } + + return array_values(array_unique($normalized)); + } + + private function normalizeSlug(string $slug): string + { + $slug = mb_strtolower(trim($slug)); + $slug = preg_replace('/\s+/u', '-', $slug) ?? $slug; + $slug = preg_replace('/-+/u', '-', $slug) ?? $slug; + + return trim($slug, '-'); + } + private function triggerRebuildIfIdle(): void { if (!$this->jobs->hasActiveJob()) { diff --git a/src/Tag/TagTypes.php b/src/Tag/TagTypes.php index b27c6f3..89894b7 100644 --- a/src/Tag/TagTypes.php +++ b/src/Tag/TagTypes.php @@ -5,8 +5,10 @@ declare(strict_types=1); namespace App\Tag; /** - * Zentrale Definition aller erlaubten Tag-Typen. - * Verhindert Magic Strings im Code. + * Central definition of all supported tag types. + * + * This class is intentionally tiny and dependency-free because it is the + * foundation for entity validation, admin forms, routing, and catalog logic. */ final class TagTypes { @@ -14,6 +16,25 @@ final class TagTypes public const CATALOG_ENTITY = 'catalog_entity'; public const SALES_SIGNAL = 'sales_signal'; + /** + * Returns the canonical list of allowed type values. + * + * @return list + */ + public static function all(): array + { + return [ + self::GENERIC, + self::CATALOG_ENTITY, + self::SALES_SIGNAL, + ]; + } + + /** + * Returns UI choices for forms and admin screens. + * + * @return array + */ public static function choices(): array { return [ @@ -23,5 +44,53 @@ final class TagTypes ]; } - private function __construct() {} + /** + * Returns true if the given value is an allowed tag type. + */ + public static function isValid(?string $type): bool + { + if ($type === null) { + return false; + } + + return in_array(self::normalize($type), self::all(), true); + } + + /** + * Normalizes external input into a canonical internal value. + * + * Empty or unknown input falls back to the provided default. + */ + public static function normalize(?string $type, string $default = self::GENERIC): string + { + $type = mb_strtolower(trim((string) $type)); + $default = mb_strtolower(trim($default)); + + if ($type === '') { + return self::isKnownDefault($default) ? $default : self::GENERIC; + } + + if (in_array($type, self::all(), true)) { + return $type; + } + + return self::isKnownDefault($default) ? $default : self::GENERIC; + } + + /** + * Returns a human-readable label for a canonical type. + */ + public static function labelFor(string $type): string + { + return array_flip(self::choices())[self::normalize($type)] ?? 'Generic'; + } + + private static function isKnownDefault(string $type): bool + { + return in_array($type, self::all(), true); + } + + private function __construct() + { + } } \ No newline at end of file diff --git a/src/Tag/TagVectorIndexBuilder.php b/src/Tag/TagVectorIndexBuilder.php index b0f6d64..d074a64 100644 --- a/src/Tag/TagVectorIndexBuilder.php +++ b/src/Tag/TagVectorIndexBuilder.php @@ -9,18 +9,81 @@ use Psr\Log\LoggerInterface; final readonly class TagVectorIndexBuilder { + private const GRACEFUL_TERMINATION_SECONDS = 2; + public function __construct( - private string $pythonBin, - private string $scriptPath, - private string $tagsNdjsonPath, - private string $vectorTagsIndexPath, - private string $embeddingModel, - private int $timeoutSeconds, - private LoggerInterface $agentLogger, - private IndexMetaManager $metaManager, // ✅ NEU - ) {} + private string $pythonBin, + private string $scriptPath, + private string $tagsNdjsonPath, + private string $vectorTagsIndexPath, + private string $embeddingModel, + private int $timeoutSeconds, + private LoggerInterface $agentLogger, + private IndexMetaManager $metaManager, + ) { + } public function build(): void + { + $this->assertPreconditions(); + + $tmpIndex = $this->vectorTagsIndexPath . '.tmp'; + $tmpMeta = $tmpIndex . '.meta.json'; + $finalIndex = $this->vectorTagsIndexPath; + $finalMeta = $finalIndex . '.meta.json'; + + $this->ensureTargetDirectoryExists($finalIndex); + $this->cleanupTemporaryArtifacts($tmpIndex, $tmpMeta); + + if (!$this->hasEmbeddableTags()) { + $this->agentLogger->info('[tags] no embeddable tags found, removing stale tag index artifacts.'); + $this->removeFileIfExists($finalIndex); + $this->removeFileIfExists($finalMeta); + $this->commitRuntime(false); + + return; + } + + $cmd = $this->buildCommand($tmpIndex); + + $this->agentLogger->info('[tags] build tag vector index', [ + 'cmd' => $cmd, + 'timeout' => $this->timeoutSeconds, + 'embedding_model' => $this->embeddingModel, + ]); + + try { + $result = $this->runCommand($cmd); + + if ($result['exit'] !== 0) { + $this->agentLogger->error('[tags] tag vector ingest failed', [ + 'exit' => $result['exit'], + 'stdout' => $result['stdout'], + 'stderr' => $result['stderr'], + ]); + + throw new \RuntimeException('Tag vector ingest failed (exit=' . $result['exit'] . ')'); + } + + if (!$this->isUsableArtifact($tmpIndex) || !$this->isUsableArtifact($tmpMeta)) { + throw new \RuntimeException('Tag vector ingest produced incomplete artifacts.'); + } + + $this->atomicReplace($tmpIndex, $finalIndex); + $this->atomicReplace($tmpMeta, $finalMeta); + $this->commitRuntime(true); + + $this->agentLogger->info('[tags] tag vector index build completed + runtime committed', [ + 'index' => $finalIndex, + 'meta' => $finalMeta, + ]); + } catch (\Throwable $e) { + $this->cleanupTemporaryArtifacts($tmpIndex, $tmpMeta); + throw $e; + } + } + + private function assertPreconditions(): void { if (!is_file($this->tagsNdjsonPath)) { throw new \RuntimeException('tags.ndjson missing: ' . $this->tagsNdjsonPath); @@ -30,65 +93,178 @@ final readonly class TagVectorIndexBuilder throw new \RuntimeException('Tag ingest script missing: ' . $this->scriptPath); } - $tmpIndex = $this->vectorTagsIndexPath . '.tmp'; - $tmpMeta = $tmpIndex . '.meta.json'; - - $finalIndex = $this->vectorTagsIndexPath; - $finalMeta = $finalIndex . '.meta.json'; - - $dir = \dirname($finalIndex); - if (!\is_dir($dir)) { - @\mkdir($dir, 0775, true); + if (trim($this->pythonBin) === '') { + throw new \RuntimeException('Python binary must not be empty.'); } - @\unlink($tmpIndex); - @\unlink($tmpMeta); + if ($this->timeoutSeconds < 1) { + throw new \RuntimeException('Tag vector timeout must be >= 1 second.'); + } + } - $cmd = sprintf( - '%s %s %s %s %s 2>&1', + private function buildCommand(string $tmpIndex): string + { + return sprintf( + '%s %s %s %s 2>&1', escapeshellarg($this->pythonBin), escapeshellarg($this->scriptPath), escapeshellarg($this->tagsNdjsonPath), escapeshellarg($tmpIndex), - escapeshellarg($this->embeddingModel), ); + } - $this->agentLogger->info('[tags] build tag vector index', [ - 'cmd' => $cmd, - 'timeout' => $this->timeoutSeconds, - ]); + private function ensureTargetDirectoryExists(string $finalIndexPath): void + { + $dir = dirname($finalIndexPath); - $out = []; - $exit = 0; - - exec($cmd, $out, $exit); - - if ($exit !== 0) { - $this->agentLogger->error('[tags] tag vector ingest failed', [ - 'exit' => $exit, - 'out' => $out, - ]); - throw new \RuntimeException('Tag vector ingest failed (exit=' . $exit . ')'); - } - - if (!is_file($tmpIndex) || !is_file($tmpMeta)) { - @\unlink($tmpIndex); - @\unlink($tmpMeta); - $this->agentLogger->warning('[tags] no tag index produced (maybe 0 tags).'); + if (is_dir($dir)) { return; } - $this->atomicReplace($tmpIndex, $finalIndex); - $this->atomicReplace($tmpMeta, $finalMeta); + if (!@mkdir($dir, 0775, true) && !is_dir($dir)) { + throw new \RuntimeException('Unable to create tag vector directory: ' . $dir); + } + } - // ✅ ENTERPRISE COMMIT MARKER + private function hasEmbeddableTags(): bool + { + $fh = @fopen($this->tagsNdjsonPath, 'rb'); + + if ($fh === false) { + throw new \RuntimeException('Unable to read tags NDJSON: ' . $this->tagsNdjsonPath); + } + + try { + while (($line = fgets($fh)) !== false) { + $line = trim($line); + + if ($line === '') { + continue; + } + + $decoded = json_decode($line, true); + + if (!is_array($decoded)) { + continue; + } + + $tagId = trim((string) ($decoded['tag_id'] ?? '')); + $text = trim((string) ($decoded['text'] ?? '')); + + if ($tagId !== '' && $text !== '') { + return true; + } + } + } finally { + fclose($fh); + } + + return false; + } + + /** + * @return array{exit:int, stdout:string, stderr:string} + */ + private function runCommand(string $cmd): array + { + $descriptorSpec = [ + 0 => ['pipe', 'r'], + 1 => ['pipe', 'w'], + 2 => ['pipe', 'w'], + ]; + + $process = @proc_open($cmd, $descriptorSpec, $pipes); + + if (!is_resource($process)) { + throw new \RuntimeException('Could not start tag vector ingest process.'); + } + + fclose($pipes[0]); + stream_set_blocking($pipes[1], false); + stream_set_blocking($pipes[2], false); + + $stdout = ''; + $stderr = ''; + $startedAt = microtime(true); + $timedOut = false; + + try { + while (true) { + $stdout .= stream_get_contents($pipes[1]) ?: ''; + $stderr .= stream_get_contents($pipes[2]) ?: ''; + + $status = proc_get_status($process); + + if (!is_array($status) || ($status['running'] ?? false) !== true) { + break; + } + + if ((microtime(true) - $startedAt) > $this->timeoutSeconds) { + $timedOut = true; + proc_terminate($process); + usleep(self::GRACEFUL_TERMINATION_SECONDS * 1000000); + + $status = proc_get_status($process); + if (is_array($status) && ($status['running'] ?? false) === true) { + proc_terminate($process, 9); + } + + break; + } + + usleep(100000); + } + + $stdout .= stream_get_contents($pipes[1]) ?: ''; + $stderr .= stream_get_contents($pipes[2]) ?: ''; + } finally { + fclose($pipes[1]); + fclose($pipes[2]); + } + + $exitCode = proc_close($process); + + if ($timedOut) { + $this->agentLogger->error('[tags] tag vector ingest timed out', [ + 'timeout' => $this->timeoutSeconds, + 'stdout' => $stdout, + 'stderr' => $stderr, + ]); + + throw new \RuntimeException('Tag vector ingest timed out after ' . $this->timeoutSeconds . ' seconds.'); + } + + return [ + 'exit' => is_int($exitCode) ? $exitCode : 1, + 'stdout' => trim($stdout), + 'stderr' => trim($stderr), + ]; + } + + private function isUsableArtifact(string $path): bool + { + return is_file($path) && filesize($path) > 0; + } + + private function cleanupTemporaryArtifacts(string ...$paths): void + { + foreach ($paths as $path) { + $this->removeFileIfExists($path); + } + } + + private function removeFileIfExists(string $path): void + { + if (is_file($path)) { + @unlink($path); + } + } + + private function commitRuntime(bool $indexPresent): void + { $this->metaManager->touchRuntime([ 'last_tags_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM), - ]); - - $this->agentLogger->info('[tags] tag vector index build completed + runtime committed', [ - 'index' => $finalIndex, - 'meta' => $finalMeta, + 'tags_index_present' => $indexPresent, ]); } @@ -99,6 +275,7 @@ final readonly class TagVectorIndexBuilder @unlink($tmp); throw new \RuntimeException('Atomic replace failed for: ' . $final); } + @unlink($tmp); } diff --git a/src/Tag/TagVectorIndexHealthService.php b/src/Tag/TagVectorIndexHealthService.php index 79977dc..456798b 100644 --- a/src/Tag/TagVectorIndexHealthService.php +++ b/src/Tag/TagVectorIndexHealthService.php @@ -6,63 +6,210 @@ namespace App\Tag; final readonly class TagVectorIndexHealthService { + private const STATUS_OK = 'OK'; + private const STATUS_OK_EMPTY = 'OK_EMPTY'; + private const STATUS_INCONSISTENT_STALE_VECTOR = 'INCONSISTENT_STALE_VECTOR'; + private const STATUS_INCONSISTENT_MISSING_VECTOR = 'INCONSISTENT_MISSING_VECTOR'; + private const STATUS_INCONSISTENT_COUNT_MISMATCH = 'INCONSISTENT_COUNT_MISMATCH'; + private const STATUS_INCONSISTENT_INVALID_META = 'INCONSISTENT_INVALID_META'; + private const STATUS_UNKNOWN = 'UNKNOWN'; + public function __construct( private string $tagsNdjsonPath, private string $vectorTagsIndexPath, - private string $vectorTagsMetaPath - ) {} + private string $vectorTagsMetaPath, + ) { + } public function check(): array { $ndjsonExists = is_file($this->tagsNdjsonPath); $vectorExists = is_file($this->vectorTagsIndexPath); - $metaExists = is_file($this->vectorTagsMetaPath); + $metaExists = is_file($this->vectorTagsMetaPath); - $ndjsonTagCount = 0; + $ndjsonStats = $this->readNdjsonStats(); + $metaStats = $this->readMetaStats(); - if ($ndjsonExists) { - $h = @fopen($this->tagsNdjsonPath, 'r'); - if ($h !== false) { - while (($line = fgets($h)) !== false) { - $line = trim($line); - if ($line === '') continue; - - $data = json_decode($line, true); - if (is_array($data) && !empty($data['tag_id']) && !empty($data['text'])) { - $ndjsonTagCount++; - } - } - fclose($h); - } - } - - $vectorTagCount = 0; - if ($metaExists) { - $meta = json_decode((string) file_get_contents($this->vectorTagsMetaPath), true); - if (is_array($meta)) { - $vectorTagCount = count($meta); - } - } - - $status = $this->determineStatus($ndjsonTagCount, $vectorExists, $metaExists, $vectorTagCount); + $status = $this->determineStatus( + $ndjsonStats['exported_tag_count'], + $vectorExists, + $metaExists, + $metaStats['vector_tag_count'], + $metaStats['meta_valid'] + ); return [ 'tags_ndjson_exists' => $ndjsonExists, - 'tags_ndjson_count' => $ndjsonTagCount, - 'vector_exists' => $vectorExists, - 'meta_exists' => $metaExists, - 'vector_tag_count' => $vectorTagCount, - 'status' => $status, + 'tags_ndjson_count' => $ndjsonStats['exported_tag_count'], + 'vector_exists' => $vectorExists, + 'meta_exists' => $metaExists, + 'vector_tag_count' => $metaStats['vector_tag_count'], + 'status' => $status, + + // Extra diagnostics for admin/CLI. + 'tags_ndjson_lines_total' => $ndjsonStats['lines_total'], + 'tags_ndjson_invalid_lines' => $ndjsonStats['invalid_lines'], + 'tags_ndjson_empty_lines' => $ndjsonStats['empty_lines'], + 'tags_with_active_document_ids' => $ndjsonStats['tags_with_document_ids'], + 'meta_valid' => $metaStats['meta_valid'], + 'paths' => [ + 'tags_ndjson' => $this->tagsNdjsonPath, + 'vector_index' => $this->vectorTagsIndexPath, + 'vector_meta' => $this->vectorTagsMetaPath, + ], ]; } - private function determineStatus(int $ndjsonTagCount, bool $vectorExists, bool $metaExists, int $vectorTagCount): string + /** + * @return array{ + * lines_total:int, + * empty_lines:int, + * invalid_lines:int, + * exported_tag_count:int, + * tags_with_document_ids:int + * } + */ + private function readNdjsonStats(): array { - if ($ndjsonTagCount === 0 && !$vectorExists && !$metaExists) return 'OK_EMPTY'; - if ($ndjsonTagCount > 0 && $vectorExists && $metaExists && $vectorTagCount === $ndjsonTagCount) return 'OK'; - if ($ndjsonTagCount === 0 && ($vectorExists || $metaExists)) return 'INCONSISTENT_STALE_VECTOR'; - if ($ndjsonTagCount > 0 && (!$vectorExists || !$metaExists)) return 'INCONSISTENT_MISSING_VECTOR'; - if ($ndjsonTagCount !== $vectorTagCount) return 'INCONSISTENT_COUNT_MISMATCH'; - return 'UNKNOWN'; + $stats = [ + 'lines_total' => 0, + 'empty_lines' => 0, + 'invalid_lines' => 0, + 'exported_tag_count' => 0, + 'tags_with_document_ids' => 0, + ]; + + if (!is_file($this->tagsNdjsonPath)) { + return $stats; + } + + $handle = @fopen($this->tagsNdjsonPath, 'rb'); + + if ($handle === false) { + return $stats; + } + + try { + while (($line = fgets($handle)) !== false) { + $stats['lines_total']++; + $line = trim($line); + + if ($line === '') { + $stats['empty_lines']++; + continue; + } + + $data = json_decode($line, true); + + if (!is_array($data)) { + $stats['invalid_lines']++; + continue; + } + + $tagId = trim((string) ($data['tag_id'] ?? '')); + $text = trim((string) ($data['text'] ?? '')); + $documentIds = $data['document_ids'] ?? null; + $hasDocumentIds = is_array($documentIds) && $documentIds !== []; + + if ($tagId === '' || $text === '') { + $stats['invalid_lines']++; + continue; + } + + $stats['exported_tag_count']++; + + if ($hasDocumentIds) { + $stats['tags_with_document_ids']++; + } + } + } finally { + fclose($handle); + } + + return $stats; + } + + /** + * @return array{vector_tag_count:int, meta_valid:bool} + */ + private function readMetaStats(): array + { + if (!is_file($this->vectorTagsMetaPath)) { + return [ + 'vector_tag_count' => 0, + 'meta_valid' => false, + ]; + } + + $raw = file_get_contents($this->vectorTagsMetaPath); + + if (!is_string($raw) || trim($raw) === '') { + return [ + 'vector_tag_count' => 0, + 'meta_valid' => false, + ]; + } + + $decoded = json_decode($raw, true); + + if (is_array($decoded)) { + if (array_is_list($decoded)) { + return [ + 'vector_tag_count' => count($decoded), + 'meta_valid' => true, + ]; + } + + $numericKeys = array_filter( + array_keys($decoded), + static fn (string|int $key): bool => is_string($key) && ctype_digit($key) + ); + + if ($numericKeys !== [] && count($numericKeys) === count($decoded)) { + return [ + 'vector_tag_count' => count($decoded), + 'meta_valid' => true, + ]; + } + } + + return [ + 'vector_tag_count' => 0, + 'meta_valid' => false, + ]; + } + + private function determineStatus( + int $ndjsonTagCount, + bool $vectorExists, + bool $metaExists, + int $vectorTagCount, + bool $metaValid + ): string { + if ($ndjsonTagCount === 0 && !$vectorExists && !$metaExists) { + return self::STATUS_OK_EMPTY; + } + + if ($ndjsonTagCount === 0 && ($vectorExists || $metaExists)) { + return self::STATUS_INCONSISTENT_STALE_VECTOR; + } + + if ($ndjsonTagCount > 0 && (!$vectorExists || !$metaExists)) { + return self::STATUS_INCONSISTENT_MISSING_VECTOR; + } + + if ($metaExists && !$metaValid) { + return self::STATUS_INCONSISTENT_INVALID_META; + } + + if ($ndjsonTagCount > 0 && $vectorExists && $metaExists && $metaValid && $vectorTagCount === $ndjsonTagCount) { + return self::STATUS_OK; + } + + if ($ndjsonTagCount !== $vectorTagCount) { + return self::STATUS_INCONSISTENT_COUNT_MISMATCH; + } + + return self::STATUS_UNKNOWN; } } \ No newline at end of file diff --git a/src/Tag/TagVectorSearchClient.php b/src/Tag/TagVectorSearchClient.php index 50b34c9..cec3537 100644 --- a/src/Tag/TagVectorSearchClient.php +++ b/src/Tag/TagVectorSearchClient.php @@ -12,18 +12,29 @@ final readonly class TagVectorSearchClient /** * Minimum similarity score required for a tag to be considered. */ - private const MIN_SCORE = 0.72; + public const MIN_SCORE = 0.72; + + /** + * Default result size when callers do not specify a limit. + */ + private const DEFAULT_LIMIT = 8; /** * Hard limit to prevent excessive requests. */ private const MAX_LIMIT = 50; + /** + * HTTP timeout for the Python vector service. + */ + private const TIMEOUT_SECONDS = 10; + public function __construct( private HttpClientInterface $http, - private string $serviceUrl, - private LoggerInterface $agentLogger, - ) {} + private string $serviceUrl, + private LoggerInterface $agentLogger, + ) { + } /** * Executes a vector search against the Python tag index. @@ -33,43 +44,51 @@ final readonly class TagVectorSearchClient * { * "tag_id": "...", * "score": 0.73, - * "label": "Geräte", // optional (new) - * "tag_type": "catalog_entity" // optional (new) + * "label": "Geräte", + * "tag_type": "catalog_entity" * } * ] * - * @return array */ - public function search(string $query, int $limit = 8): array + public function search(string $query, int $limit = self::DEFAULT_LIMIT): array { $query = trim($query); + if ($query === '') { return []; } $limit = max(1, min($limit, self::MAX_LIMIT)); + $serviceUrl = rtrim(trim($this->serviceUrl), '/'); + + if ($serviceUrl === '') { + $this->agentLogger->warning('Tag vector service URL is empty.'); + + return []; + } try { $response = $this->http->request( 'POST', - rtrim($this->serviceUrl, '/') . '/search-tags', + $serviceUrl . '/search-tags', [ 'json' => [ 'query' => $query, 'limit' => $limit, ], - 'timeout' => 10, + 'timeout' => self::TIMEOUT_SECONDS, ] ); if ($response->getStatusCode() !== 200) { $this->agentLogger->warning( - 'Tag vector service returned non-200', + 'Tag vector service returned non-200.', ['status' => $response->getStatusCode()] ); @@ -77,10 +96,9 @@ final readonly class TagVectorSearchClient } $data = $response->toArray(false); - } catch (\Throwable $e) { $this->agentLogger->warning( - 'Tag vector service unreachable', + 'Tag vector service unreachable.', ['error' => $e->getMessage()] ); @@ -88,18 +106,33 @@ final readonly class TagVectorSearchClient } if (!is_array($data)) { - $this->agentLogger->warning('Tag vector service returned invalid payload'); + $this->agentLogger->warning('Tag vector service returned invalid payload.'); + return []; } - $hits = []; + return $this->normalizeHits($data, $limit); + } - foreach ($data as $row) { + /** + * @param array $rows + * @return list + */ + private function normalizeHits(array $rows, int $limit): array + { + $hitsByTagId = []; + + foreach ($rows as $row) { if (!is_array($row)) { continue; } - $tagId = (string)($row['tag_id'] ?? ''); + $tagId = trim((string) ($row['tag_id'] ?? '')); $score = $row['score'] ?? null; if ($tagId === '' || !is_numeric($score)) { @@ -112,24 +145,45 @@ final readonly class TagVectorSearchClient continue; } - $hit = [ + $normalizedHit = [ 'tag_id' => $tagId, - 'score' => $score, + 'score' => $score, + 'label' => trim((string) ($row['label'] ?? '')), + 'tag_type' => TagTypes::normalize((string) ($row['tag_type'] ?? TagTypes::GENERIC)), ]; - // Optional: label - if (isset($row['label']) && is_string($row['label'])) { - $hit['label'] = $row['label']; - } + $existingHit = $hitsByTagId[$tagId] ?? null; - // Optional: tag_type - if (isset($row['tag_type']) && is_string($row['tag_type'])) { - $hit['tag_type'] = $row['tag_type']; + if ($existingHit === null || $normalizedHit['score'] > $existingHit['score']) { + $hitsByTagId[$tagId] = $normalizedHit; } - - $hits[] = $hit; } - return $hits; + if ($hitsByTagId === []) { + return []; + } + + $hits = array_values($hitsByTagId); + + usort( + $hits, + static function (array $left, array $right): int { + $scoreComparison = $right['score'] <=> $left['score']; + + if ($scoreComparison !== 0) { + return $scoreComparison; + } + + $typeComparison = strcmp($left['tag_type'], $right['tag_type']); + + if ($typeComparison !== 0) { + return $typeComparison; + } + + return strcmp($left['tag_id'], $right['tag_id']); + } + ); + + return array_slice($hits, 0, $limit); } } \ No newline at end of file diff --git a/templates/admin/dashboard/index.html.twig b/templates/admin/dashboard/index.html.twig index cf63378..823d226 100644 --- a/templates/admin/dashboard/index.html.twig +++ b/templates/admin/dashboard/index.html.twig @@ -5,107 +5,108 @@ {% block body %}
- - - + {% set chunkStatus = vectorHealth.status|default('UNKNOWN') %} + {% set chunkBadgeClass = + chunkStatus starts with 'OK' + ? 'bg-success' + : (chunkStatus == 'INCONSISTENT_MISSING_VECTOR' + ? 'bg-warning text-dark' + : 'bg-danger') + %} -
-

Systemübersicht

- RAG Enterprise + {% set tagStatus = tagVectorHealth.status|default('UNKNOWN') %} + {% set tagBadgeClass = + tagStatus starts with 'OK' + ? 'bg-success' + : (tagStatus == 'INCONSISTENT_MISSING_VECTOR' + ? 'bg-warning text-dark' + : 'bg-danger') + %} + + {% set percent = chunkLimit > 0 ? (chunkCount / chunkLimit * 100)|round(3) : 0 %} + {% set percentClass = + percent >= 95 + ? 'bg-danger' + : (percent >= 85 ? 'bg-warning text-dark' : 'bg-success') + %} + + {% set chunkHealthy = chunkStatus in ['OK', 'OK_EMPTY'] %} + {% set tagHealthy = tagStatus in ['OK', 'OK_EMPTY'] %} + {% set anyHealthIssue = not chunkHealthy or not tagHealthy %} + +
+

+ Systemübersicht +

+ RetrieX Admin
- - - + {% if anyHealthIssue %} +
+ Achtung: + Mindestens ein Index-Zustand ist nicht konsistent. + Prüfe die Detailkarten unten und führe bei Bedarf einen Global Reindex aus. +
+ {% endif %}
- {# ================= CHUNK VECTOR STATUS ================= #} - {% if vectorHealth is defined %} - {% set status = vectorHealth.status %} - {% set badgeClass = - status starts with 'OK' - ? 'bg-success' - : (status == 'INCONSISTENT_MISSING_VECTOR' - ? 'bg-warning text-dark' - : 'bg-danger') %} - {% endif %} -
-
+
-
Chunk-Vektor
+
+ Chunk-Vektor +
- {% if vectorHealth is defined %} -

- - {{ vectorHealth.status }} +

+ + {{ chunkStatus }} -

- {% else %} -
- Keine Daten verfügbar. -
- {% endif %} +

+ +
+ Keyword-/Chunk-Retrieval-Grundlage des Systems +
- {# ================= TAG VECTOR STATUS ================= #} - {% if tagVectorHealth is defined %} - {% set tagStatus = tagVectorHealth.status %} - {% set tagBadgeClass = - tagStatus starts with 'OK' - ? 'bg-success' - : (tagStatus == 'INCONSISTENT_MISSING_VECTOR' - ? 'bg-warning text-dark' - : 'bg-danger') %} - {% endif %} -
-
+
-
Tag-Vektor
+
+ Tag-Vektor +
- {% if tagVectorHealth is defined %} -

+

- {{ tagVectorHealth.status }} + {{ tagStatus }} -

- {% else %} -
- Keine Daten verfügbar. -
- {% endif %} + + +
+ Semantisches Tag-Routing für Dokumenträume und Entity-Erkennung +
- {# ================= KNOWLEDGE CAPACITY ================= #} - {% set percent = chunkLimit > 0 ? (chunkCount / chunkLimit * 100)|round(3) : 0 %} -
-
+
-
Wissenskapazität
+
+ Wissenskapazität +

{{ chunkCount|number_format(0, ',', '.') }} - / {{ chunkLimit|number_format(0, ',', '.') }} Chunks - + / {{ chunkLimit|number_format(0, ',', '.') }} Chunks +

-
@@ -117,20 +118,21 @@
- {# ================= GOVERNANCE ================= #}
-
+
-
System-Governance
+
+ System-Governance +
Benutzer
- {{ app.user.userIdentifier }} + {{ app.user ? app.user.userIdentifier : '-' }}
Rollen
- {{ app.user.roles|join(', ') }} + {{ app.user ? app.user.roles|join(', ') : '-' }}
@@ -138,65 +140,94 @@
- - - -
- {% if vectorHealth is defined %} -
-
-
-
Chunk-Vektor-Details
- -
NDJSON-Chunks
-
- {{ vectorHealth.ndjson_chunk_count|number_format(0, ',', '.') }} -
- -
Vektor-Index-Chunks
-
- {{ vectorHealth.vector_chunk_count|number_format(0, ',', '.') }} -
-
-
-
- {% endif %} - - {% if tagVectorHealth is defined %} -
-
-
-
Tag-Vektor-Details
- -
NDJSON-Tags
-
- {{ tagVectorHealth.tags_ndjson_count|number_format(0, ',', '.') }} -
- -
Vektor-Index-Tags
-
- {{ tagVectorHealth.vector_tag_count|number_format(0, ',', '.') }} -
-
-
-
- {% endif %} - -
-
+
-
Indexierung (Ingest Jobs)
+
+ Chunk-Vektor-Details +
+ +
NDJSON-Chunks
+
+ {{ vectorHealth.ndjson_chunk_count|default(0)|number_format(0, ',', '.') }} +
+ +
Vektor-Index-Chunks
+
+ {{ vectorHealth.vector_chunk_count|default(0)|number_format(0, ',', '.') }} +
+ +
+ + NDJSON {{ vectorHealth.ndjson_exists|default(false) ? 'vorhanden' : 'fehlt' }} + + + Index {{ vectorHealth.vector_exists|default(false) ? 'vorhanden' : 'fehlt' }} + + + Meta {{ vectorHealth.meta_exists|default(false) ? 'vorhanden' : 'fehlt' }} + +
+
+
+
+ +
+
+
+
+ Tag-Vektor-Details +
+ +
Exportierte Tags (NDJSON)
+
+ {{ tagVectorHealth.tags_ndjson_count|default(0)|number_format(0, ',', '.') }} +
+ +
Vektor-Index-Tags
+
+ {{ tagVectorHealth.vector_tag_count|default(0)|number_format(0, ',', '.') }} +
+ +
Tags mit aktiven Dokumenten
+
+ {{ tagVectorHealth.tags_with_active_document_ids|default(0)|number_format(0, ',', '.') }} +
+ +
+ + NDJSON {{ tagVectorHealth.tags_ndjson_exists|default(false) ? 'vorhanden' : 'fehlt' }} + + + Index {{ tagVectorHealth.vector_exists|default(false) ? 'vorhanden' : 'fehlt' }} + + + Meta {{ tagVectorHealth.meta_exists|default(false) ? 'vorhanden' : 'fehlt' }} + + + Meta {{ tagVectorHealth.meta_valid|default(false) ? 'gültig' : 'ungültig' }} + +
+
+
+
+ +
+
+
+
+ Indexierung (Ingest Jobs) +
- Erstellt den kompletten Wissensindex neu. - Kann je nach Datenmenge mehrere Minuten dauern. + Erstellt den kompletten Wissensindex neu und zieht dabei auch die + physischen Retrieval-Artefakte wieder gerade.
+ + {% if anyHealthIssue %} +
+ Empfohlen bei inkonsistentem Chunk- oder Tag-Zustand. +
+ {% endif %}
{% if is_granted('ROLE_SUPER_ADMIN') %}
-
+
-
Kritische Systemoperationen
+
+ Kritische Systemoperationen +
Entfernt alle Dokumente, Versionen, Indizes und Jobs. diff --git a/templates/admin/document/index.html.twig b/templates/admin/document/index.html.twig index d9999f0..bf326c9 100644 --- a/templates/admin/document/index.html.twig +++ b/templates/admin/document/index.html.twig @@ -4,8 +4,15 @@ {% block body %} -
-

Dokumente

+
+
+

+ Dokumente +

+
+ Übersicht über Dokumente, aktive Versionen, Ingest-Zustände und Tag-Zuordnungen. +
+
@@ -13,154 +20,222 @@
+ {% for message in app.flashes('success') %} +
+ {{ message }} +
+ {% endfor %} + + {% for message in app.flashes('danger') %} +
+ {{ message }} +
+ {% endfor %} + + {% for message in app.flashes('info') %} +
+ {{ message }} +
+ {% endfor %} + +
+
+
+
Worauf achten?
+
    +
  • INDEXED bedeutet: aktive Version ist sauber im Wissensindex.
  • +
  • PENDING oder FAILED bedeuten: Dokument prüfen und ggf. Ingest erneut anstoßen.
  • +
  • Tags sollten fachlich präzise sein und nicht nur generische Oberbegriffe abbilden.
  • +
  • Die aktive Version ist die fachlich führende Version des Dokuments.
  • +
+
+ +
+
Schnellzugriff
+
+ Über Tags gelangst du direkt in die Tag-Pflege des Dokuments. + Über Details steuerst du Versionen, Aktivierung, Re-Ingest und Löschung. +
+
+
+
+ {% if documents is empty %} -
+
Keine Dokumente vorhanden.
{% else %} -
+
- - - - - - - - - - - - - - - +
+
+ Vorhandene Dokumente + {{ documents|length }} Einträge +
- {% for document in documents %} +
+ Neueste Dokumente stehen oben. +
+
+ +
+
TitelIDTypStatusIndexierungVersionenAktive VersionErstelltAktionen
+ + + + + + + + + + + + + + - {# Titel #} - + {% for document in documents %} + + + {% if document.currentVersion and document.currentVersion.filePath %} +
+ Aktive Datei vorhanden +
+ {% endif %} + - {# Typ #} - + - {# Dokument Status #} - - - {# Ingest Status #} - + - {# Version Count #} - + - {# Aktive Version #} - + - {# Created At #} - + - {# Aktionen #} - - - Tags - + - - Details - + - {% if is_granted('ROLE_SUPER_ADMIN') %} - + + + + {% endif %} + + + + {% endfor %} - - {% endfor %} - - -
TitelIDTypStatusIndexierungVersionenAktive VersionTagsErstelltAktionen
- - {{ document.title }} - -
+ - {# ID #} - - {{ document.id }} - - {% if document.currentVersion %} - - {{ document.currentVersion.fileTypeLabel }} - - {% else %} - - - - - {% endif %} - + {{ document.id }} + - {% if document.status == 'ACTIVE' %} - Aktiv - {% else %} - Archiviert - {% endif %} - - {% if document.currentVersion %} - {% if document.currentVersion.ingestStatus == 'INDEXED' %} - INDEXED - {% elseif document.currentVersion.ingestStatus == 'PENDING' %} - PENDING - {% elseif document.currentVersion.ingestStatus == 'FAILED' %} - FAILED + + {% if document.currentVersion %} + + {{ document.currentVersion.fileTypeLabel }} + {% else %} - {{ document.currentVersion.ingestStatus }} - + - + {% endif %} - {% else %} - - - {% endif %} - - {{ document.versions|length }} - + {% if document.status == 'ACTIVE' %} + Aktiv + {% else %} + Archiviert + {% endif %} + - {% if document.currentVersion %} - v{{ document.currentVersion.versionNumber }} - {% else %} - - - {% endif %} - + {% if document.currentVersion %} + {% if document.currentVersion.ingestStatus == 'INDEXED' %} + INDEXED + {% elseif document.currentVersion.ingestStatus == 'PENDING' %} + PENDING + {% elseif document.currentVersion.ingestStatus == 'RUNNING' %} + RUNNING + {% elseif document.currentVersion.ingestStatus == 'FAILED' %} + FAILED + {% else %} + + {{ document.currentVersion.ingestStatus ?: '-' }} + + {% endif %} + {% else %} + - + {% endif %} + - {{ document.createdAt|date('d.m.Y H:i') }} - + + {{ document.versions|length }} + + + + {% if document.currentVersion %} + + v{{ document.currentVersion.versionNumber }} + + {% else %} + - + {% endif %} + + + {{ document.tags|length }} + + + {{ document.createdAt|date('d.m.Y H:i') }} +
+
+ + Tags + - + + Details + - - - {% endif %} + {% if is_granted('ROLE_SUPER_ADMIN') %} +
+ -
+ + +
{% endif %} -
- Hinweis: Das Löschen eines Dokuments entfernt alle Versionen und - erfordert eine Aktualisierung des NDJSON-Indexes. +
+
+
Hinweis zum Dokument-Lifecycle
+
+ Änderungen an aktiven Versionen und Löschvorgänge wirken sich direkt auf den Wissensindex aus. + Zugewiesene Tags beeinflussen zusätzlich die semantische Routing-Ebene des Systems. + Dokumente mit schwachen oder fehlenden Tags sind oft ein guter Kandidat für fachliche Nachpflege. +
+
{% endblock %} \ No newline at end of file diff --git a/templates/admin/document/new.html.twig b/templates/admin/document/new.html.twig index 0ef3751..2d0a712 100644 --- a/templates/admin/document/new.html.twig +++ b/templates/admin/document/new.html.twig @@ -4,8 +4,13 @@ {% block body %} -
-

Neues Dokument

+
+
+

Neues Dokument

+
+ Neuer Upload mit initialer Version und anschließendem asynchronen Ingest. +
+
@@ -13,7 +18,49 @@
-
+ {% for message in app.flashes('success') %} +
+ {{ message }} +
+ {% endfor %} + + {% for message in app.flashes('danger') %} +
+ {{ message }} +
+ {% endfor %} + + {% for message in app.flashes('info') %} +
+ {{ message }} +
+ {% endfor %} + +
+
+
+
Warum ist der Titel wichtig?
+ +
    +
  • Der Titel wird später Teil des fachlichen Kontexts des Dokuments.
  • +
  • Ein präziser Titel verbessert Retrieval, Chunk-Einordnung und spätere Tag-Pflege.
  • +
  • Generische Titel wie Dokument 1 oder nur Dateinamen sind deutlich schwächer.
  • +
+
+ +
+
Gute Beispiele
+ +
    +
  • Testomat 808 – Technisches Datenblatt
  • +
  • Resthärte-Messung – Produktübersicht
  • +
  • Indikator 300 – Anwendung und Dosierung
  • +
+
+
+
+ +
@@ -22,31 +69,24 @@ name="_token" value="{{ csrf_token('create_document') }}"> - {# ============================= #} - {# Titel #} - {# ============================= #} -
Hinweis zur Qualität:
- Der Titel ist entscheidend für die semantische Einordnung - der erzeugten Chunks. Jeder Chunk erhält den Titel als Kontext, - wodurch Retrieval und Antwortqualität signifikant verbessert werden.

- - Wird kein Titel angegeben, wird automatisch der Dateiname - verwendet (nicht empfohlen). + Verwende einen fachlich präzisen Titel, der Produkt, Thema oder Dokumenttyp klar beschreibt. + Wenn kein Titel angegeben wird, wird automatisch der Dateiname verwendet.
-
+ value="{{ app.request.get('title') }}" + placeholder="z. B. Testomat 808 – Technisches Datenblatt"> - {# ============================= #} - {# Datei Upload #} - {# ============================= #} +
+ Der Titel muss nicht lang sein, aber fachlich eindeutig. +
+
@@ -58,14 +98,22 @@
Unterstützte Formate: PDF, DOCX, TXT, MD. - Das Dokument wird versioniert gespeichert und anschließend - indexiert. + Nach dem Upload wird automatisch Version 1 erstellt und ein Ingest-Job gestartet.
- {# ============================= #} - {# Submit #} - {# ============================= #} +
+
+
Was passiert nach dem Speichern?
+ +
    +
  • Das Dokument wird versioniert gespeichert.
  • +
  • Die erste Version wird als aktuelle Version gesetzt.
  • +
  • Ein asynchroner Ingest-Job verarbeitet das Dokument für den Wissensindex.
  • +
  • Später können dem Dokument gezielt Tags zugewiesen werden.
  • +
+
+
- Hinweis: Nach dem Upload wird automatisch eine neue Dokumentversion erstellt. - Die Indexierung erfolgt asynchron über einen Ingest-Job. + Hinweis: Ein sauber benanntes Dokument ist die beste Grundlage für gutes Retrieval und späteres präzises Tagging.
-{% endblock %} +{% endblock %} \ No newline at end of file diff --git a/templates/admin/document/new_version.html.twig b/templates/admin/document/new_version.html.twig index 2f5a883..660f42f 100644 --- a/templates/admin/document/new_version.html.twig +++ b/templates/admin/document/new_version.html.twig @@ -4,10 +4,13 @@ {% block body %} -
-

- Neue Version -

+
+
+

Neue Version

+
+ Neue unveränderliche Version für ein bestehendes Dokument hochladen. +
+
@@ -15,36 +18,99 @@
-
-
+ {% for message in app.flashes('success') %} +
+ {{ message }} +
+ {% endfor %} -
- Dokument: - {{ document.title }} + {% for message in app.flashes('danger') %} +
+ {{ message }} +
+ {% endfor %} + + {% for message in app.flashes('info') %} +
+ {{ message }} +
+ {% endfor %} + +
+
+
+
Dokumentkontext
+ +
+ Dokument: + {{ document.title }} +
+ +
+ Eine neue Version erzeugt eine zusätzliche, unveränderliche Dokumentversion. + Die bestehende aktive Version bleibt zunächst unverändert. +
-
- Das Hochladen einer neuen Version erzeugt eine zusätzliche - unveränderliche Dokumentversion. Die Aktivierung erfolgt separat - und löst einen deterministischen Re-Ingest aus. -
+
+
Aktueller Stand
+
+ Aktive Version: + {% if document.currentVersion %} + + v{{ document.currentVersion.versionNumber }} + + {% else %} + - + {% endif %} +
+ +
+ Vorhandene Versionen: + {{ document.versions|length }} +
+ +
+ Zugewiesene Tags: + {{ document.tags|length }} +
+
-
+
+
+
+
Wichtig für den Lifecycle
+ +
    +
  • Der Upload erzeugt nur eine neue Version, aber aktiviert sie nicht automatisch.
  • +
  • Erst die spätere Aktivierung löst den deterministischen Re-Ingest aus.
  • +
  • Tags bleiben auf Dokumentebene bestehen und gelten weiterhin für das Dokument als Ganzes.
  • +
+
+ +
+
Gute Praxis
+ +
    +
  • Nur fachlich wirklich passende Nachfolgeversionen hochladen.
  • +
  • Kein anderes Thema oder anderes Produkt in dieselbe Dokumentlinie mischen.
  • +
  • Bei stark verändertem Fachinhalt später Tagging mitprüfen.
  • +
+
+
+
+ +
- - {# ============================= #} - {# Datei Upload #} - {# ============================= #} -
@@ -54,15 +120,23 @@ required>
- Unterstützte Formate: PDF, DOCX, TXT, MD.
- Die Datei wird versioniert gespeichert und mit einer - eindeutigen Checksum versehen. + Unterstützte Formate: PDF, DOCX, TXT, MD. + Die Datei wird versioniert gespeichert und mit einer eindeutigen Checksum versehen.
- {# ============================= #} - {# Submit #} - {# ============================= #} +
+
+
Was passiert nach dem Upload?
+ +
    +
  • Es wird eine neue, unveränderliche Dokumentversion angelegt.
  • +
  • Die aktive Version bleibt zunächst unverändert.
  • +
  • Ein Re-Ingest erfolgt erst nach späterer Aktivierung dieser Version.
  • +
  • Danach wird der Wissensindex deterministisch neu aufgebaut.
  • +
+
+
{% if is_granted('ROLE_SUPER_ADMIN') %}
@@ -71,16 +145,14 @@
{% endif %} -
- Hinweis: Eine neue Version ersetzt nicht automatisch die aktive Version. - Erst nach Aktivierung wird ein Re-Ingest durchgeführt und der Index - neu aufgebaut. + Hinweis: Eine neue Version verbessert den Dokument-Lifecycle nur dann sauber, wenn sie fachlich wirklich zu diesem Dokument gehört. + Bei stark verändertem Inhalt sollten nach der späteren Aktivierung auch die Tags geprüft werden.
-{% endblock %} +{% endblock %} \ No newline at end of file diff --git a/templates/admin/document/show.html.twig b/templates/admin/document/show.html.twig index 086b8c2..e093ea8 100644 --- a/templates/admin/document/show.html.twig +++ b/templates/admin/document/show.html.twig @@ -4,116 +4,225 @@ {% block body %} -
-

{{ document.title ?? 'Ein Fehler trat auf' }}

+
+
+

{{ document.title }}

+
+ Detailansicht für Dokument, Versionen und Tag-Zuordnung. +
+
- - Zurück zur Übersicht - +
- {% if document %} - - {# ============================= #} - {# Dokument-Meta #} - {# ============================= #} - -
-
- -
- Status: - {% if document.status == 'ACTIVE' %} - Aktiv - {% else %} - Archiviert - {% endif %} -
- -
- Erstellt von: - {{ document.createdBy ? document.createdBy.email : '-' }} -
- -
- Erstellt am: - {{ document.createdAt|date('d.m.Y H:i') }} -
- -
- Aktive Version: - {% if document.currentVersion %} - - v{{ document.currentVersion.versionNumber }} - - {% else %} - - - {% endif %} -
- -
+ {% for message in app.flashes('success') %} +
+ {{ message }}
+ {% endfor %} - {# ============================= #} - {# Versionen #} - {# ============================= #} - -
-

Versionen

- - {% if is_granted('ROLE_SUPER_ADMIN') %} - - Neue Version - - {% endif %} + {% for message in app.flashes('danger') %} +
+ {{ message }}
+ {% endfor %} - {% if document.versions is empty %} + {% for message in app.flashes('info') %} +
+ {{ message }} +
+ {% endfor %} -
- Keine Versionen vorhanden. -
+
- {% else %} - -
+
+
+
Dokument-Metadaten
+
+
+
Status
+
+ {% if document.status == 'ACTIVE' %} + Aktiv + {% else %} + Archiviert + {% endif %} +
+
+ +
+
Aktive Version
+
+ {% if document.currentVersion %} + + v{{ document.currentVersion.versionNumber }} + + {% else %} + - + {% endif %} +
+
+ +
+
Erstellt von
+
{{ document.createdBy ? document.createdBy.email : '-' }}
+
+ +
+
Erstellt am
+
{{ document.createdAt|date('d.m.Y H:i:s') }}
+
+ +
+
Anzahl Versionen
+
{{ document.versions|length }}
+
+ +
+
Zugewiesene Tags
+
{{ document.tags|length }}
+
+
+ + {% if is_granted('ROLE_SUPER_ADMIN') %} +
+ +
+ + Neue Version + + +
+ + +
+
+ {% endif %} +
+
+
+ +
+
+
+
+
Tags
+ + + Bearbeiten + +
+ + {% if document.tags is empty %} +
+ Diesem Dokument sind noch keine Tags zugewiesen. +
+ {% else %} +
+ {% for tag in document.tags %} + + {{ tag.label }} + + {% endfor %} +
+ +
+ Tags steuern die semantische Routing-Ebene. Weise nur fachlich wirklich passende Tags zu. +
+ {% endif %} +
+
+
+ +
+ +
+
+

Versionen

+
+ Beim Aktivieren einer Version wird automatisch ein Re-Ingest ausgelöst. +
+
+ + {% if is_granted('ROLE_SUPER_ADMIN') %} + + Neue Version + + {% endif %} +
+ + {% if document.versions is empty %} + +
+ Keine Versionen vorhanden. +
+ + {% else %} + +
+
+ +
- - - - - - - + + + + + + + {% for version in document.versions %} - - {# Aktivstatus #} - {# Ingest Status #} - {# Checksum #} - {# Created by #} - {# Date #} - {# Aktionen #} - - {% endfor %}
VersionStatusIngestChecksumErstellt vonDatumAktionenVersionAktivIngestChecksumErstellt vonDatumAktionen
v{{ version.versionNumber }} + {% if document.currentVersion and version.id == document.currentVersion.id %} +
Current
+ {% endif %}
{% if version.isActive %} Aktiv {% else %} - - Inaktiv - + Inaktiv {% endif %} {% if version.ingestStatus == 'INDEXED' %} INDEXED @@ -125,99 +234,85 @@ PENDING {% else %} - {{ version.ingestStatus }} - + {{ version.ingestStatus ?: '-' }} + {% endif %} - {{ version.checksum ? version.checksum[:10] ~ '…' : '-' }} + {% if version.checksum %} + {{ version.checksum[:12] ~ '…' }} + {% else %} + - + {% endif %} {{ version.createdBy ? version.createdBy.email : '-' }} - {{ version.createdAt|date('d.m.Y H:i') }} + {{ version.createdAt|date('d.m.Y H:i:s') }} - - {% if version.isActive %} - - {% if version.ingestStatus in ['PENDING', 'FAILED'] and is_granted('ROLE_SUPER_ADMIN') %} - -
- - - - -
- +
+ {% if version.isActive %} + {% if version.ingestStatus in ['PENDING', 'FAILED'] and is_granted('ROLE_SUPER_ADMIN') %} +
+ + +
+ {% else %} + + Keine Aktion nötig + + {% endif %} {% else %} - - Bereits indexiert - + {% if is_granted('ROLE_SUPER_ADMIN') %} +
+ + +
+ {% endif %} {% endif %} - - {% else %} - - {% if is_granted('ROLE_SUPER_ADMIN') %} -
- - - - -
- {% endif %} - - {% endif %} - +
-
+
- - {% endif %} - -
- Hinweis: Beim Aktivieren einer Version wird automatisch ein Re-Ingest - durchgeführt. Der NDJSON-Index und der FAISS-Index werden deterministisch - neu aufgebaut. -
- - {% else %} - -
- Dokument nicht gefunden.
{% endif %} -{% endblock %} +
+
+
Hinweis zum Lifecycle
+
+ Beim Aktivieren einer Version wird automatisch ein Re-Ingest durchgeführt. + Der NDJSON-Bestand und der Vektorindex werden deterministisch neu aufgebaut. + Wenn Tags zugewiesen sind, beeinflusst dieses Dokument zusätzlich die semantische Routing-Ebene. +
+
+
+ +{% endblock %} \ No newline at end of file diff --git a/templates/admin/document_tags/edit.html.twig b/templates/admin/document_tags/edit.html.twig index b42b849..8ed17d8 100644 --- a/templates/admin/document_tags/edit.html.twig +++ b/templates/admin/document_tags/edit.html.twig @@ -4,81 +4,87 @@ {% block body %} - {# ============================================= #} - {# Tag-Rebuild Status (Echte Live-Anzeige) #} - {# ============================================= #} - -
+
+ {% if latestJob %} +
+ Status wird geladen… +
+ {% endif %} +
+ {% for message in app.flashes('success') %} +
+ {{ message }} +
+ {% endfor %} + + {% for message in app.flashes('danger') %} +
+ {{ message }} +
+ {% endfor %} +
-

- Tags für Dokument - {{ document.title }} -

+
+

+ Tags für Dokument + {{ document.title }} +

+
+ Weise nur Tags zu, die den fachlichen Kern des Dokuments wirklich beschreiben. +
+
@@ -86,14 +92,40 @@
- {# ============================================= #} - {# Bereits zugewiesene Tags #} - {# ============================================= #} +
+
+
+
Hinweis für gutes Tagging
-
+
    +
  • Präzise statt breit: lieber produkt- oder themenscharfe Tags als allgemeine Oberbegriffe.
  • +
  • Catalog Entity nur bei echten Produktfamilien, Katalogbegriffen oder klaren Entitäten.
  • +
  • Generic nur als unterstützende Zusatzsemantik.
  • +
  • Sales Signal sparsam und bewusst einsetzen, nicht als Ersatz für Fach-Tags.
  • +
+
+ +
+
Aktueller Stand
+ +
+ + Zugewiesen: {{ document.tags|length }} + + + Verfügbar: {{ allTags|length }} + + + Nicht zugewiesen: {{ allTags|length - document.tags|length }} + +
+
+
+
+ +
- -
Zugewiesene Tags für: {{ document.title }}
+
Bereits zugewiesene Tags
{% if document.tags is empty %}
@@ -101,22 +133,26 @@
{% else %}
- {% for tag in document.tags %} - - {{ tag.label }} - + {% for tag in allTags %} + {% if tag in document.tags %} + + {{ tag.label }} + + {% endif %} {% endfor %}
{% endif %} -
- {# ============================================= #} - {# Tag-Zuweisung Formular #} - {# ============================================= #} - -
+
Tags zuweisen
@@ -128,38 +164,125 @@ name="_token" value="{{ csrf_token('admin_document_tags_save_' ~ document.id) }}"> -
- {% for tag in allTags %} -
- -
- - - - - +
+
+
+
+ Zugewiesene Tags
+
+
+ {% set hasAssigned = false %} + {% for tag in allTags %} + {% if tag in document.tags %} + {% set hasAssigned = true %} +
+
+ + +
+
+ {% endif %} + {% endfor %} + + {% if not hasAssigned %} +
+
+ Noch keine Tags zugewiesen. +
+
+ {% endif %} +
+
- {% endfor %} +
+ +
+
+
+ Verfügbare Tags +
+ +
+
+ {% set hasAvailable = false %} + {% for tag in allTags %} + {% if tag not in document.tags %} + {% set hasAvailable = true %} +
+
+ + +
+
+ {% endif %} + {% endfor %} + + {% if not hasAvailable %} +
+
+ Keine weiteren Tags verfügbar. +
+
+ {% endif %} +
+
+
+

- +
+ +
diff --git a/templates/admin/job/index.html.twig b/templates/admin/job/index.html.twig index 70055ee..5a333a4 100644 --- a/templates/admin/job/index.html.twig +++ b/templates/admin/job/index.html.twig @@ -4,8 +4,17 @@ {% block body %} -
-

Indexierung (Ingest Jobs)

+ {% set latestJob = jobs is not empty ? jobs|first : null %} + +
+
+

+ Indexierung (Ingest Jobs) +

+
+ Übersicht über Reindex-, Dokument- und Aktivierungsjobs des Systems. +
+
{% if is_granted('ROLE_SUPER_ADMIN') %}
+
+
+
+
Was sieht man hier?
+
    +
  • DOCUMENT verarbeitet ein einzelnes Dokument neu.
  • +
  • DOCUMENT_VERSION_ACTIVATE zieht eine aktivierte Version deterministisch neu in den Index.
  • +
  • DOCUMENT_DELETE entfernt Dokumentinhalt wieder sauber aus den Index-Artefakten.
  • +
  • GLOBAL_REINDEX baut den Wissensindex vollständig neu auf und ist der stärkste Reparaturpfad.
  • +
+
+ +
+
Worauf achten?
+
    +
  • RUNNING und QUEUED bedeuten: keine unnötigen parallelen Rebuilds starten.
  • +
  • FAILED oder ABORTED direkt prüfen.
  • +
  • Bei inkonsistentem Indexzustand ist meist ein Global Reindex der richtige Reparaturschritt.
  • +
+
+
+
+ + {% if latestJob %} +
+
+
+
+
Letzter Job
+ +
+ +
+
Typ
+ {{ latestJob.type }} +
+ +
+
Status
+ {% if latestJob.status == 'COMPLETED' %} + COMPLETED + {% elseif latestJob.status == 'QUEUED' %} + QUEUED + {% elseif latestJob.status == 'RUNNING' %} + RUNNING + {% elseif latestJob.status == 'FAILED' %} + FAILED + {% elseif latestJob.status == 'ABORTED' %} + ABORTED + {% else %} + {{ latestJob.status }} + {% endif %} +
+ +
+
Gestartet
+
+ {{ latestJob.startedAt ? latestJob.startedAt|date('d.m.Y H:i:s') : '-' }} +
+
+ +
+
Beendet
+
+ {{ latestJob.finishedAt ? latestJob.finishedAt|date('d.m.Y H:i:s') : 'läuft noch / offen' }} +
+
+
+ + {% if latestJob.errorMessage %} +
+ Fehler: + {{ latestJob.errorMessage|slice(0, 250) }}{% if latestJob.errorMessage|length > 250 %}…{% endif %} +
+ {% endif %} +
+
+ {% endif %} + {% if jobs is empty %} -
+
Keine Ingest Jobs vorhanden.
{% else %} -
+
- - - - - - - - - - - - - - +
+
+ Vorhandene Jobs + {{ jobs|length }} Einträge +
- {% for job in jobs %} +
+ Neueste Jobs stehen oben. +
+
+ +
+
Job-IDTypStatusDokumentVersionGestartetBeendetBenutzer
+ - - - - - - - - - - - - - - - - - + + + + + + + - {% else %} - - - - {% endfor %} + + - -
- - {{ job.id }} - - - - {{ job.type }} - - - {% if job.status == 'COMPLETED' %} - COMPLETED - {% elseif job.status == 'QUEUED' %} - QUEUED - {% elseif job.status == 'RUNNING' %} - RUNNING - {% elseif job.status == 'FAILED' %} - FAILED - {% else %} - - {{ job.status }} - - {% endif %} - - {% if job.documentId %} - - {{ job.documentId }} - - {% else %} - - - {% endif %} - - {{ job.documentVersionId ?? '-' }} - - {{ job.startedAt ? job.startedAt|date('d.m.Y H:i:s') : '-' }} - - {{ job.finishedAt ? job.finishedAt|date('d.m.Y H:i:s') : '-' }} - - {{ job.startedBy ? job.startedBy.email : '-' }} - JobTypStatusBezugGestartetBeendetBenutzer
- Keine Jobs gefunden. -
+ {% for job in jobs %} + + + + + {% if job.errorMessage %} +
+ {{ job.errorMessage|slice(0, 120) }}{% if job.errorMessage|length > 120 %}…{% endif %} +
+ {% endif %} + + + + + {{ job.type }} + + + + + {% if job.status == 'COMPLETED' %} + COMPLETED + {% elseif job.status == 'QUEUED' %} + QUEUED + {% elseif job.status == 'RUNNING' %} + RUNNING + {% elseif job.status == 'FAILED' %} + FAILED + {% elseif job.status == 'ABORTED' %} + ABORTED + {% else %} + + {{ job.status }} + + {% endif %} + + + + {% if job.documentId %} + + {% endif %} + + {% if job.documentVersionId %} +
+ Version: + {{ job.documentVersionId }} +
+ {% endif %} + + {% if not job.documentId and not job.documentVersionId %} + - + {% endif %} + + + + {{ job.startedAt ? job.startedAt|date('d.m.Y H:i:s') : '-' }} + + + + {{ job.finishedAt ? job.finishedAt|date('d.m.Y H:i:s') : 'offen' }} + + + + {{ job.startedBy ? job.startedBy.email : '-' }} + + + {% else %} + + + Keine Jobs gefunden. + + + {% endfor %} + + + +
@@ -128,8 +250,8 @@ {% endif %}
- Hinweis: Während laufender Jobs (Status RUNNING) sollten keine - parallelen Reindex-Prozesse gestartet werden. + Hinweis: Während laufender Jobs (Status RUNNING) oder wartender Jobs (QUEUED) + sollten keine unnötigen parallelen Reindex-Prozesse gestartet werden.
-{% endblock %} +{% endblock %} \ No newline at end of file diff --git a/templates/admin/job/show.html.twig b/templates/admin/job/show.html.twig index 8659bab..dd6c0ad 100644 --- a/templates/admin/job/show.html.twig +++ b/templates/admin/job/show.html.twig @@ -4,8 +4,18 @@ {% block body %} -
-

Ingest Job

+ {% set jobStatus = job.status|upper %} + {% set isActiveJob = jobStatus in ['QUEUED', 'RUNNING'] %} + +
+
+

+ Ingest Job +

+
+ Detailansicht für einen einzelnen Indexierungs- oder Rebuild-Job. +
+
@@ -13,61 +23,134 @@
-
+
+
+
+
Einordnung
+
    +
  • DOCUMENT verarbeitet ein einzelnes Dokument neu.
  • +
  • DOCUMENT_VERSION_ACTIVATE aktiviert eine Version und zieht sie deterministisch neu in den Index.
  • +
  • DOCUMENT_DELETE entfernt Dokumentinhalt wieder sauber aus dem Wissensbestand.
  • +
  • GLOBAL_REINDEX baut den Gesamtindex vollständig neu auf.
  • +
+
+ +
+
Aktueller Zustand
+
+ {% if jobStatus == 'COMPLETED' %} + COMPLETED + {% elseif jobStatus == 'QUEUED' %} + QUEUED + {% elseif jobStatus == 'RUNNING' %} + RUNNING + {% elseif jobStatus == 'FAILED' %} + FAILED + {% elseif jobStatus == 'ABORTED' %} + ABORTED + {% else %} + {{ jobStatus }} + {% endif %} + + {% if isActiveJob %} + Polling aktiv + {% endif %} +
+
+
+
+ +
-
- ID: - {{ job.id }} +
+
+
+
Job-ID
+
{{ job.id }}
+
+ +
+
Typ
+
+ {{ job.type }} +
+
+ +
+
Status
+
+ {% if jobStatus == 'COMPLETED' %} + COMPLETED + {% elseif jobStatus == 'QUEUED' %} + QUEUED + {% elseif jobStatus == 'RUNNING' %} + RUNNING + {% elseif jobStatus == 'FAILED' %} + FAILED + {% elseif jobStatus == 'ABORTED' %} + ABORTED + {% else %} + {{ jobStatus }} + {% endif %} +
+
+ +
+
Dokument
+
+ {% if job.documentId %} + + {{ job.documentId }} + + {% else %} + - + {% endif %} +
+
+ +
+
Dokumentversion
+
{{ job.documentVersionId ?? '-' }}
+
+
+ +
+
+
Gestartet
+
+ {{ job.startedAt ? job.startedAt|date('d.m.Y H:i:s') : '-' }} +
+
+ +
+
Beendet
+
+ {{ job.finishedAt ? job.finishedAt|date('d.m.Y H:i:s') : '-' }} +
+
+ +
+
Gestartet von
+
{{ job.startedBy ? job.startedBy.email : '-' }}
+
+ +
+
Polling
+
+ {% if isActiveJob %} + Status wird automatisch aktualisiert. + {% else %} + Kein Live-Polling nötig. + {% endif %} +
+
+
-
- Typ: - {{ job.type }} -
- -
- Status: - -
- -
- Dokument: - {% if job.documentId %} - - {{ job.documentId }} - - {% else %} - - - {% endif %} -
- -
- Version: - {{ job.documentVersionId ?? '-' }} -
- -
- Gestartet: - {{ job.startedAt|date('d.m.Y H:i:s') }} -
- -
- Beendet: - - {{ job.finishedAt ? job.finishedAt|date('d.m.Y H:i:s') : '-' }} - -
- -
- Gestartet von: - {{ job.startedBy ? job.startedBy.email : '-' }} -
- - {# Loader #}
+ class="mt-4 {% if not isActiveJob %}d-none{% endif %}">
@@ -79,10 +162,10 @@
- {# Fehlerbereich #}
+ class="alert alert-danger mt-4 {% if not job.errorMessage %}d-none{% endif %}"> {% if job.errorMessage %} + Fehler:
{{ job.errorMessage }} {% endif %}
@@ -91,13 +174,13 @@
- Hinweis: Bei DOCUMENT_VERSION_ACTIVATE-Jobs wird ein vollständiger - NDJSON-Rebuild und FAISS-Reindex durchgeführt. + Hinweis: Bei DOCUMENT_VERSION_ACTIVATE-Jobs wird ein vollständiger + NDJSON-Rebuild und FAISS-Reindex durchgeführt. Bei GLOBAL_REINDEX + wird der gesamte Wissensindex neu aufgebaut.
-{% endblock %} +{% endblock %} \ No newline at end of file diff --git a/templates/admin/tag/assign.html.twig b/templates/admin/tag/assign.html.twig index 481e941..03efc17 100644 --- a/templates/admin/tag/assign.html.twig +++ b/templates/admin/tag/assign.html.twig @@ -4,19 +4,24 @@ {% block body %} - {# ========================================================= #} - {# LIVE REBUILD STATUS (SSE) #} - {# ========================================================= #} - -
-
- Status wird geladen… -
+
+ {% if latestJob %} +
+ Status wird geladen… +
+ {% endif %}
+
-

- Tag: {{ tag.label }} -

+
+

+ Tag: {{ tag.label }} +

+ +
+ Slug: {{ tag.slug }} +
+
@@ -24,7 +29,6 @@
- - {# ============================= #} - {# Flash Messages #} - {# ============================= #} - {% for message in app.flashes('success') %} -
+
{{ message }}
{% endfor %} {% for message in app.flashes('danger') %} -
+
{{ message }}
{% endfor %} - {# ============================= #} - {# Tag → Dokumente #} - {# ============================= #} +
+
+
+
Einordnung des Tags
+ +
+ {% if tag.type == 'catalog_entity' %} + Catalog Entity + {% elseif tag.type == 'sales_signal' %} + Sales Signal + {% else %} + Generic + {% endif %} +
+ +

+ {{ tag.description ?: 'Keine Beschreibung hinterlegt.' }} +

+ +

+ Weise diesen Tag nur Dokumenten zu, die fachlich wirklich denselben Gegenstand, + dieselbe Produktfamilie oder denselben Anwendungsfall abbilden. + Zu breite Zuweisungen machen das Routing weicher. +

+
+ +
+
Aktueller Stand
+ +
+ + Zugewiesen: {{ assignedDocIds|length }} + + + Verfügbar: {{ documents|length }} + + + Nicht zugewiesen: {{ documents|length - assignedDocIds|length }} + +
+
+
+
- -
-
-
- - - - - - - +
+
+
+
+ Zugewiesene Dokumente +
-
- {% for doc in documents %} - {% if doc.id in assignedDocIds %} +
+
+
Zugewiesene Dokumente
+ - - + + - {% endif %} - {% endfor %} + + + {% set hasAssigned = false %} + {% for doc in documents %} + {% if doc.id in assignedDocIds %} + {% set hasAssigned = true %} + + + + + {% endif %} + {% endfor %} - -
- - - {{ doc.title }} - + + Dokument
+ + + {{ doc.title }} +
+ {% if not hasAssigned %} + + + Noch keine Dokumente zugewiesen. + + + {% endif %} + + +
+
-
- - - - - - - + - - {% for doc in documents %} - {% if doc.id not in assignedDocIds %} +
+
+
+ Verfügbare Dokumente +
+ +
+
+
Nicht zugewiesene Dokumente
+ - - + + - {% endif %} - {% endfor %} - -
- - - {{ doc.title }} - + + Dokument
+ + + {% set hasUnassigned = false %} + {% for doc in documents %} + {% if doc.id not in assignedDocIds %} + {% set hasUnassigned = true %} + + + + + + {{ doc.title }} + + + {% endif %} + {% endfor %} + + {% if not hasUnassigned %} + + + Keine weiteren aktiven Dokumente verfügbar. + + + {% endif %} + + +
+
- - +
+ +
{% endblock %} \ No newline at end of file diff --git a/templates/admin/tag/index.html.twig b/templates/admin/tag/index.html.twig index d887cf6..50baff3 100644 --- a/templates/admin/tag/index.html.twig +++ b/templates/admin/tag/index.html.twig @@ -4,77 +4,52 @@ {% block body %} - {# ========================================================= #} - {# LIVE REBUILD STATUS (SSE) #} - {# ========================================================= #} - -
+
{% if latestJob %} -
+
Status wird geladen…
{% endif %}
-

Tag-Management

+

+ Tag-Management +

- {# ========================================================= #} - {# TAG SYSTEM DESCRIPTION #} - {# ========================================================= #}
-
+
Was machen Tags im System?

- Tags dienen als semantische Routing-Ebene innerhalb des RAG-Systems. - Sie strukturieren Dokumente thematisch und beeinflussen, - welche Inhalte bei einer Nutzeranfrage priorisiert werden. + Tags sind die semantische Routing-Ebene innerhalb des Systems. + Sie helfen dabei, thematisch passende Dokumenträume schneller zu erkennen + und gute Retrieval-Kandidaten zu priorisieren.

-
    -
  • - Tags werden Dokumenten manuell zugewiesen. -
  • -
  • - Beim Rebuild wird aus allen Tags eine eigene - tags.ndjson erzeugt. -
  • -
  • - Zusätzlich wird ein separater Vektorindex - (vector_tags.index) aufgebaut. -
  • -
  • - Bei einer Anfrage erfolgt zunächst ein Tag-Matching, - danach wird das Chunk-Retrieval entsprechend gewichtet. -
  • +
      +
    • Tags werden Dokumenten manuell zugewiesen.
    • +
    • Beim Rebuild wird aus den aktiven Tag-Zuordnungen eine tags.ndjson erzeugt.
    • +
    • Zusätzlich wird ein eigener Tag-Vektorindex (vector_tags.index) gebaut.
    • +
    • Bei Anfragen erfolgt zunächst ein semantisches Tag-Matching, danach das eigentliche Chunk-Retrieval.
+
-
Wie werden Tags bewertet?
+
Was ist gutes Tagging?
-

- Die Bewertung erfolgt über einen eigenen Vektor-Similarity-Score - im Tag-Index. Das System berechnet: -

- -
    -
  • - Ähnlichkeit zwischen Nutzeranfrage und Tag-Embedding -
  • -
  • - Top-K Treffer im Tag-Index -
  • -
  • - Gewichtete Übergabe an das Chunk-Retrieval -
  • +
      +
    • Präzise statt generisch: lieber Produktnamen als Gerät.
    • +
    • Fachlich sauber: Tags sollen echte Produktfamilien, Anwendungsfälle oder Entitäten abbilden.
    • +
    • Wenig Überschneidung: keine unnötig breiten oder doppeldeutigen Tags.
    • +
    • Bewusst typisieren: catalog_entity für echte Katalog-/Entity-Tags, generic nur für allgemeine Zusatzsemantik.
    -

    - Tags wirken somit als semantischer Verstärker. - Sie ersetzen kein Chunk-Retrieval, sondern steuern dessen Priorisierung. +

    + Zu breite Tags wie „Produkt“, „System“ oder „Gerät“ machen das Routing weicher + und bringen meist weniger Nutzen als präzise fachliche Tags.

@@ -90,9 +65,9 @@ if (data.status === '{{ statusRunning }}') { html = ` -
+
- Tag-Rebuild läuft
+ Tag-Rebuild läuft
${data.startedAt ? 'Gestartet: ' + new Date(data.startedAt).toLocaleString() : ''}
@@ -100,20 +75,20 @@ `; } else if (data.status === '{{ statusQueued }}') { html = ` -
- Tag-Rebuild in Warteschlange +
+ Tag-Rebuild in Warteschlange
`; } else if (data.status === '{{ statusCompleted }}') { html = ` -
+
Tag-Rebuild erfolgreich abgeschlossen
`; } else if (data.status === '{{ statusFailed }}') { html = ` -
- Tag-Rebuild fehlgeschlagen
+
+ Tag-Rebuild fehlgeschlagen
${data.error ? '' + data.error + '' : ''}
`; @@ -125,11 +100,12 @@ source.onerror = function () { console.warn('SSE Verbindung verloren'); }; + + window.addEventListener('beforeunload', function () { + source.close(); + }); - {# ========================================================= #} - {# Create Tag Card #} - {# ========================================================= #}
Neuen Tag hinzufügen
@@ -153,24 +129,26 @@ required/>
-
- - -
- -
- - + {% for choiceLabel, choiceValue in tagTypeChoices %} + + {% endfor %}
-
+
+ + +
+ +
@@ -179,66 +157,85 @@
- {# ========================================================= #} - {# Tag Table #} - {# ========================================================= #}
-
- Vorhandene Tags: - - {{ tags|length }} Einträge - +
+
+ Vorhandene Tags: + + {{ tags|length }} Einträge + +
+ +
+ Dokumentanzahl bezieht sich auf aktive Dokumente. +
- - - - - - - - - - - {% for tag in tags %} +
+
LabelSlugBeschreibungAktion
+ - - - - + + + + + + - {% else %} - - - - {% endfor %} - -
{{ tag.label }}{{ tag.slug }}{{ tag.description ?: '-' }} - - - Zuweisen - - -
- - - - -
- -
LabelSlugTypAktive DokumenteBeschreibungAktion
- Noch keine Tags vorhanden. -
+ + + {% for tag in tags %} + {% set activeDocumentCount = documentCountByTagId[tag.id.toRfc4122] ?? 0 %} + + {{ tag.label }} + {{ tag.slug }} + + {% if tag.type == 'catalog_entity' %} + Catalog Entity + {% elseif tag.type == 'sales_signal' %} + Sales Signal + {% else %} + Generic + {% endif %} + + + + {{ activeDocumentCount }} + + + {{ tag.description ?: '-' }} + + + Zuweisen + + +
+ + + +
+ + + {% else %} + + + Noch keine Tags vorhanden. + + + {% endfor %} + + +