optimize catalog semantic matches by tags
This commit is contained in:
32
migrations/Version20260228000100.php
Normal file
32
migrations/Version20260228000100.php
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace DoctrineMigrations;
|
||||||
|
|
||||||
|
use Doctrine\DBAL\Schema\Schema;
|
||||||
|
use Doctrine\Migrations\AbstractMigration;
|
||||||
|
|
||||||
|
/**
 * Doctrine migration that introduces the `type` column on `knowledge_tag`.
 *
 * The column is created NOT NULL with the default 'generic'.
 */
final class Version20260228000100 extends AbstractMigration
{
    /**
     * Short human-readable summary of this migration.
     */
    public function getDescription(): string
    {
        return 'Add type column to knowledge_tag table for catalog entity support';
    }

    /**
     * Forward step: adds the `type` column (VARCHAR(50), NOT NULL, default 'generic').
     */
    public function up(Schema $schema): void
    {
        $this->addSql("
            ALTER TABLE knowledge_tag
            ADD type VARCHAR(50) NOT NULL DEFAULT 'generic'
        ");
    }

    /**
     * Reverse step: removes the `type` column again.
     */
    public function down(Schema $schema): void
    {
        $this->addSql("
            ALTER TABLE knowledge_tag
            DROP type
        ");
    }
}
|
||||||
@@ -42,6 +42,9 @@ INDEX_META_PATH = KNOWLEDGE_DIR / "index_meta.json"
|
|||||||
INDEX_RUNTIME_PATH = KNOWLEDGE_DIR / "index_runtime.json"
|
INDEX_RUNTIME_PATH = KNOWLEDGE_DIR / "index_runtime.json"
|
||||||
INDEX_NDJSON_PATH = KNOWLEDGE_DIR / "index.ndjson"
|
INDEX_NDJSON_PATH = KNOWLEDGE_DIR / "index.ndjson"
|
||||||
|
|
||||||
|
# NEW: Tags NDJSON (exported by PHP) used to enrich /search-tags responses
|
||||||
|
TAGS_NDJSON_PATH = KNOWLEDGE_DIR / "tags.ndjson"
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# Logging
|
# Logging
|
||||||
@@ -111,6 +114,9 @@ chunk_pos_map: Dict[str, int] = {}
|
|||||||
tag_index = None
|
tag_index = None
|
||||||
tag_ids: Optional[List[Any]] = None
|
tag_ids: Optional[List[Any]] = None
|
||||||
|
|
||||||
|
# NEW: tag_id -> {"label": "...", "tag_type": "..."}
|
||||||
|
tag_meta_map: Dict[str, Dict[str, str]] = {}
|
||||||
|
|
||||||
loaded_embedding_model_name: Optional[str] = None
|
loaded_embedding_model_name: Optional[str] = None
|
||||||
current_index_version: Optional[int] = None
|
current_index_version: Optional[int] = None
|
||||||
current_runtime_stamp: Optional[str] = None
|
current_runtime_stamp: Optional[str] = None
|
||||||
@@ -210,6 +216,61 @@ def load_chunk_maps_from_ndjson() -> None:
|
|||||||
logger.warning("Failed to load chunk maps from ndjson: %s", str(e))
|
logger.warning("Failed to load chunk maps from ndjson: %s", str(e))
|
||||||
|
|
||||||
|
|
||||||
|
def load_tag_meta_from_tags_ndjson() -> None:
    """
    Load minimal tag metadata from tags.ndjson to enrich /search-tags results.

    Each non-empty line is expected to hold one JSON object, e.g.:
      {"tag_id":"...","text":"LABEL\\nSLUG\\noptional description","type":"..."}

    For every row whose ``_as_key(tag_id)`` is truthy, the global
    ``tag_meta_map`` receives:
      label    = first line of "text", stripped ("" if "text" is missing/blank)
      tag_type = stripped "type" if it is a non-blank string, else "generic"

    Lines that are not valid JSON, or whose JSON value is not an object, are
    skipped one by one. If the file is missing or reading it raises, the map
    is left empty.
    """
    global tag_meta_map

    tag_meta_map = {}

    if not TAGS_NDJSON_PATH.exists():
        logger.info("[Reload] tags.ndjson missing -> tag_meta_map empty (%s)", str(TAGS_NDJSON_PATH))
        return

    # Collect into a local dict and assign once at the end, so a failure
    # half-way through never leaves a partially filled global behind.
    loaded: Dict[str, Dict[str, str]] = {}

    try:
        with TAGS_NDJSON_PATH.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                try:
                    row = json.loads(line)
                except Exception:
                    continue

                # A line such as `[]` or `42` is valid JSON but has no .get();
                # skip it here instead of letting the outer handler throw away
                # every row that was already loaded.
                if not isinstance(row, dict):
                    continue

                tag_id = _as_key(row.get("tag_id"))
                if not tag_id:
                    continue

                # Prefer the explicit "type" field when it is a non-blank string.
                ttype = row.get("type")
                if isinstance(ttype, str) and ttype.strip():
                    tag_type = ttype.strip()
                else:
                    tag_type = "generic"

                label = ""
                txt = row.get("text")
                if isinstance(txt, str) and txt.strip():
                    text_lines = txt.splitlines()
                    label = text_lines[0].strip() if text_lines else ""

                loaded[tag_id] = {"label": label, "tag_type": tag_type}

    except Exception as e:
        logger.warning("Failed to load tag meta from tags.ndjson: %s", str(e))
        loaded = {}

    tag_meta_map = loaded
|
||||||
|
|
||||||
|
|
||||||
def _normalize_meta_list(value: Any) -> Optional[List[Any]]:
|
def _normalize_meta_list(value: Any) -> Optional[List[Any]]:
|
||||||
"""
|
"""
|
||||||
Accepts:
|
Accepts:
|
||||||
@@ -282,6 +343,10 @@ def load_all() -> None:
|
|||||||
tag_index = None
|
tag_index = None
|
||||||
tag_ids = None
|
tag_ids = None
|
||||||
|
|
||||||
|
# NEW: load tag meta for enrichment
|
||||||
|
logger.info("[Reload] Loading tag meta from tags.ndjson")
|
||||||
|
load_tag_meta_from_tags_ndjson()
|
||||||
|
|
||||||
runtime = _safe_read_json(INDEX_RUNTIME_PATH)
|
runtime = _safe_read_json(INDEX_RUNTIME_PATH)
|
||||||
if isinstance(runtime, dict):
|
if isinstance(runtime, dict):
|
||||||
v = runtime.get("last_rebuild_at")
|
v = runtime.get("last_rebuild_at")
|
||||||
@@ -292,10 +357,11 @@ def load_all() -> None:
|
|||||||
current_index_version = index_version if isinstance(index_version, int) else None
|
current_index_version = index_version if isinstance(index_version, int) else None
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s stamp=%s file=%s)",
|
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s tag_meta=%s stamp=%s file=%s)",
|
||||||
str(current_index_version),
|
str(current_index_version),
|
||||||
str(current_runtime_stamp),
|
str(current_runtime_stamp),
|
||||||
str(loaded_embedding_model_name),
|
str(loaded_embedding_model_name),
|
||||||
|
str(len(tag_meta_map)),
|
||||||
SERVICE_STAMP,
|
SERVICE_STAMP,
|
||||||
str(Path(__file__).resolve()),
|
str(Path(__file__).resolve()),
|
||||||
)
|
)
|
||||||
@@ -390,6 +456,8 @@ def health():
|
|||||||
"tag_meta_len": len(tag_ids) if isinstance(tag_ids, list) else None,
|
"tag_meta_len": len(tag_ids) if isinstance(tag_ids, list) else None,
|
||||||
"chunk_meta_type": type(chunk_ids).__name__ if chunk_ids is not None else None,
|
"chunk_meta_type": type(chunk_ids).__name__ if chunk_ids is not None else None,
|
||||||
"chunk_meta_len": len(chunk_ids) if isinstance(chunk_ids, list) else None,
|
"chunk_meta_len": len(chunk_ids) if isinstance(chunk_ids, list) else None,
|
||||||
|
"tag_meta_map_len": len(tag_meta_map),
|
||||||
|
"tags_ndjson_path": str(TAGS_NDJSON_PATH),
|
||||||
"log_file": str(LOG_FILE),
|
"log_file": str(LOG_FILE),
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -502,7 +570,26 @@ def search_tags(req: SearchRequest):
|
|||||||
continue
|
continue
|
||||||
if idx < 0 or idx >= len(tag_ids):
|
if idx < 0 or idx >= len(tag_ids):
|
||||||
continue
|
continue
|
||||||
results.append({"tag_id": tag_ids[idx], "score": float(score)})
|
|
||||||
|
tag_id = tag_ids[idx]
|
||||||
|
tag_id_key = _as_key(tag_id) or ""
|
||||||
|
|
||||||
|
payload: Dict[str, Any] = {
|
||||||
|
"tag_id": tag_id,
|
||||||
|
"score": float(score),
|
||||||
|
}
|
||||||
|
|
||||||
|
meta = tag_meta_map.get(tag_id_key)
|
||||||
|
if isinstance(meta, dict):
|
||||||
|
label = meta.get("label")
|
||||||
|
ttype = meta.get("tag_type")
|
||||||
|
|
||||||
|
if isinstance(label, str) and label.strip():
|
||||||
|
payload["label"] = label
|
||||||
|
if isinstance(ttype, str) and ttype.strip():
|
||||||
|
payload["tag_type"] = ttype
|
||||||
|
|
||||||
|
results.append(payload)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,10 @@ use Symfony\Component\Uid\Uuid;
|
|||||||
* - TagVectorSearch (Score-Gate + Ambiguity-Check)
|
* - TagVectorSearch (Score-Gate + Ambiguity-Check)
|
||||||
* - DB Query auf document_tag + document (ACTIVE)
|
* - DB Query auf document_tag + document (ACTIVE)
|
||||||
* - Rückgabe als EIN Textblock (string) oder null (Fallback auf normalen Retrieval)
|
* - Rückgabe als EIN Textblock (string) oder null (Fallback auf normalen Retrieval)
|
||||||
|
*
|
||||||
|
* Schritt-3 Änderung:
|
||||||
|
* - Headline ist NICHT mehr hardcoded
|
||||||
|
* - Headline basiert dynamisch auf dem gefundenen Tag
|
||||||
*/
|
*/
|
||||||
final class EntityCatalogService
|
final class EntityCatalogService
|
||||||
{
|
{
|
||||||
@@ -63,6 +67,10 @@ final class EntityCatalogService
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// OPTIONAL: Falls TagVectorSearchClient künftig tag_label zurückliefert,
|
||||||
|
// kann das hier direkt verwendet werden.
|
||||||
|
$tagLabel = isset($best['tag_label']) ? (string)$best['tag_label'] : null;
|
||||||
|
|
||||||
// 3) DB Query: alle ACTIVE Dokumente zu diesem Tag
|
// 3) DB Query: alle ACTIVE Dokumente zu diesem Tag
|
||||||
$rows = $this->connection->fetchAllAssociative(
|
$rows = $this->connection->fetchAllAssociative(
|
||||||
'
|
'
|
||||||
@@ -95,18 +103,24 @@ final class EntityCatalogService
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return $this->buildTextBlock($entityTerm, $titles);
|
return $this->buildTextBlock($tagLabel, $titles);
|
||||||
}
|
}
|
||||||
|
|
||||||
private function buildTextBlock(string $entityTerm, array $titles): string
|
/**
|
||||||
|
* Dynamische Headline:
|
||||||
|
* - Wenn Tag-Label vorhanden → verwenden
|
||||||
|
* - Sonst generischer Fallback
|
||||||
|
*/
|
||||||
|
private function buildTextBlock(?string $tagLabel, array $titles): string
|
||||||
{
|
{
|
||||||
$headline = match ($entityTerm) {
|
$headline = 'Folgende Einträge sind verfügbar:';
|
||||||
'geräte' => 'Folgende Geräte sind verfügbar:',
|
|
||||||
'indikatoren' => 'Folgende Indikatoren sind verfügbar:',
|
if (\is_string($tagLabel) && \trim($tagLabel) !== '') {
|
||||||
'funktionen' => 'Folgende Funktionen sind verfügbar:',
|
$headline = sprintf(
|
||||||
'zubehör' => 'Folgendes Zubehör ist verfügbar:',
|
'Folgende %s sind verfügbar:',
|
||||||
default => 'Folgende Einträge sind verfügbar:',
|
$tagLabel
|
||||||
};
|
);
|
||||||
|
}
|
||||||
|
|
||||||
$lines = [];
|
$lines = [];
|
||||||
foreach ($titles as $title) {
|
foreach ($titles as $title) {
|
||||||
|
|||||||
@@ -46,7 +46,8 @@ final class TagController extends AbstractController
|
|||||||
(string)$request->request->get('label', ''),
|
(string)$request->request->get('label', ''),
|
||||||
$request->request->get('description')
|
$request->request->get('description')
|
||||||
? (string)$request->request->get('description')
|
? (string)$request->request->get('description')
|
||||||
: null
|
: null,
|
||||||
|
(string)$request->request->get('type', 'generic') // NEU
|
||||||
);
|
);
|
||||||
|
|
||||||
$this->addFlash('success', 'Tag wurde erstellt.');
|
$this->addFlash('success', 'Tag wurde erstellt.');
|
||||||
|
|||||||
@@ -24,6 +24,14 @@ class Tag
|
|||||||
#[ORM\Column(type: 'text', nullable: true)]
|
#[ORM\Column(type: 'text', nullable: true)]
|
||||||
private ?string $description = null;
|
private ?string $description = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NEU: Governance-Typ des Tags
|
||||||
|
* - generic
|
||||||
|
* - catalog_entity
|
||||||
|
*/
|
||||||
|
#[ORM\Column(length: 50)]
|
||||||
|
private string $type = 'generic';
|
||||||
|
|
||||||
#[ORM\Column]
|
#[ORM\Column]
|
||||||
private \DateTimeImmutable $createdAt;
|
private \DateTimeImmutable $createdAt;
|
||||||
|
|
||||||
@@ -75,6 +83,18 @@ class Tag
|
|||||||
return $this;
|
return $this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the tag's type value as currently held by this entity.
 */
public function getType(): string
{
    return $this->type;
}
|
||||||
|
|
||||||
|
/**
 * Sets the tag's type.
 *
 * Surrounding whitespace is removed first; a value that is empty after
 * trimming is replaced by 'generic'.
 *
 * @return static the same instance, to allow call chaining
 */
public function setType(string $type): static
{
    $normalized = trim($type);

    if ($normalized === '') {
        $normalized = 'generic';
    }

    $this->type = $normalized;

    return $this;
}
|
||||||
|
|
||||||
public function getCreatedAt(): \DateTimeImmutable
|
public function getCreatedAt(): \DateTimeImmutable
|
||||||
{
|
{
|
||||||
return $this->createdAt;
|
return $this->createdAt;
|
||||||
|
|||||||
@@ -4,27 +4,10 @@ declare(strict_types=1);
|
|||||||
|
|
||||||
namespace App\Intent;
|
namespace App\Intent;
|
||||||
|
|
||||||
/**
|
use App\Tag\TagVectorSearchClient;
|
||||||
* CatalogIntentLite
|
|
||||||
*
|
|
||||||
* Minimal, deterministische Erkennung von Katalog-/Entity-Listenanfragen.
|
|
||||||
*
|
|
||||||
* Ziel:
|
|
||||||
* - "Liste aller Geräte" / "Welche Indikatoren gibt es?" / "Zeig mir alle Funktionen"
|
|
||||||
*
|
|
||||||
* Guardrails:
|
|
||||||
* - Kein Catalog-Mode bei Sales-/Pricing-/Comparison-/ROI-/Implementation-/Objection-Intents.
|
|
||||||
* - Kein Catalog-Mode ohne expliziten Entity-Term.
|
|
||||||
*
|
|
||||||
* WICHTIG:
|
|
||||||
* - Immer mit ORIGINAL-Prompt aufrufen.
|
|
||||||
* - Kein LLM, kein ML.
|
|
||||||
*/
|
|
||||||
final class CatalogIntentLite
|
final class CatalogIntentLite
|
||||||
{
|
{
|
||||||
/**
|
|
||||||
* Listensignale (leichtgewichtig) – IntentLite bleibt weiterhin für "allgemeine" List Detection zuständig.
|
|
||||||
*/
|
|
||||||
private const LIST_SIGNALS = [
|
private const LIST_SIGNALS = [
|
||||||
'liste',
|
'liste',
|
||||||
'auflisten',
|
'auflisten',
|
||||||
@@ -39,100 +22,76 @@ final class CatalogIntentLite
|
|||||||
'alle',
|
'alle',
|
||||||
];
|
];
|
||||||
|
|
||||||
/**
|
private const MIN_SCORE = 0.60;
|
||||||
* Entity-Terms, die wir als Katalogtypen unterstützen.
|
private const AMBIGUITY_DELTA = 0.05;
|
||||||
*
|
|
||||||
* Left side: canonical term (für Tag-Suche)
|
|
||||||
* Right side: Such-Synonyme, die im Prompt vorkommen dürfen.
|
|
||||||
*/
|
|
||||||
private const ENTITY_TERMS = [
|
|
||||||
'geräte' => ['gerät', 'geräte', 'geraet', 'geraete', 'device', 'devices'],
|
|
||||||
'indikatoren' => ['indikator', 'indikatoren', 'indicator', 'indicators'],
|
|
||||||
'funktionen' => ['funktion', 'funktionen', 'feature', 'features', 'funktionalität', 'funktionalitaet'],
|
|
||||||
'zubehör' => ['zubehör', 'zubehoer', 'accessory', 'accessories', 'zubehor'],
|
|
||||||
];
|
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly SalesIntentLite $salesIntentLite,
|
private readonly SalesIntentLite $salesIntentLite,
|
||||||
|
private readonly TagVectorSearchClient $tagVectorClient,
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
/**
|
public function detect(string $prompt): ?string
|
||||||
* @return string|null canonical entity term (z. B. "geräte") oder null wenn kein Catalog-Intent.
|
|
||||||
*/
|
|
||||||
public function detect(string $originalPrompt): ?string
|
|
||||||
{
|
{
|
||||||
$p = $this->normalize($originalPrompt);
|
$normalizedPrompt = mb_strtolower($prompt);
|
||||||
|
|
||||||
// 1) Muss ein Listen-Signal enthalten
|
// 1) Muss Listen-Signal enthalten
|
||||||
if (!$this->containsAny($p, self::LIST_SIGNALS)) {
|
if (!$this->containsAny($normalizedPrompt, self::LIST_SIGNALS)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2) Guardrail: Kein Catalog-Mode bei Sales-Intents
|
// 2) Guardrail: Nur DISCOVERY
|
||||||
$sales = $this->salesIntentLite->detect($originalPrompt);
|
$sales = $this->salesIntentLite->detect($prompt);
|
||||||
$intent = (string)($sales['intent'] ?? SalesIntentLite::DISCOVERY);
|
$intent = (string)($sales['intent'] ?? SalesIntentLite::DISCOVERY);
|
||||||
|
|
||||||
if ($intent !== SalesIntentLite::DISCOVERY) {
|
if ($intent !== SalesIntentLite::DISCOVERY) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3) Expliziten Entity-Term extrahieren (sonst kein Catalog)
|
// 3) Vector-basierte Tag-Suche (Top 3 für Ambiguity-Check)
|
||||||
foreach (self::ENTITY_TERMS as $canonical => $synonyms) {
|
$hits = $this->tagVectorClient->search($prompt, 3);
|
||||||
foreach ($synonyms as $syn) {
|
|
||||||
if ($this->containsWord($p, $syn)) {
|
|
||||||
return $canonical;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
if ($hits === []) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ------------------------------------------------------------
|
$best = $hits[0];
|
||||||
// Helpers
|
$bestScore = (float)($best['score'] ?? 0.0);
|
||||||
// ------------------------------------------------------------
|
|
||||||
|
if ($bestScore < self::MIN_SCORE) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ambiguity-Check
|
||||||
|
if (isset($hits[1])) {
|
||||||
|
$secondScore = (float)($hits[1]['score'] ?? 0.0);
|
||||||
|
if (abs($bestScore - $secondScore) < self::AMBIGUITY_DELTA) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4) Nur catalog_entity zulassen
|
||||||
|
if (($best['tag_type'] ?? null) !== 'catalog_entity') {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 5) Canonical Label zurückgeben
|
||||||
|
$label = (string)($best['label'] ?? '');
|
||||||
|
|
||||||
|
if ($label === '') {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return mb_strtolower($label);
|
||||||
|
}
|
||||||
|
|
||||||
private function containsAny(string $haystack, array $needles): bool
|
private function containsAny(string $haystack, array $needles): bool
|
||||||
{
|
{
|
||||||
foreach ($needles as $needle) {
|
foreach ($needles as $needle) {
|
||||||
if ($needle === '') {
|
if ($needle !== '' && str_contains($haystack, $needle)) {
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (str_contains($haystack, $needle)) {
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private function containsWord(string $haystack, string $word): bool
|
|
||||||
{
|
|
||||||
$word = trim($word);
|
|
||||||
if ($word === '') {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return preg_match('/\b' . preg_quote($word, '/') . '\b/u', $haystack) === 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
private function normalize(string $s): string
|
|
||||||
{
|
|
||||||
$s = mb_strtolower($s);
|
|
||||||
|
|
||||||
// Umlaute absichern (analog IntentLite/SalesIntentLite)
|
|
||||||
$replacements = [
|
|
||||||
'ä' => 'ae',
|
|
||||||
'ö' => 'oe',
|
|
||||||
'ü' => 'ue',
|
|
||||||
'ß' => 'ss',
|
|
||||||
];
|
|
||||||
|
|
||||||
foreach ($replacements as $umlaut => $alt) {
|
|
||||||
if (str_contains($s, $umlaut)) {
|
|
||||||
$s .= ' ' . str_replace($umlaut, $alt, $s);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return $s;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
@@ -4,7 +4,6 @@ declare(strict_types=1);
|
|||||||
|
|
||||||
namespace App\Knowledge\Retrieval;
|
namespace App\Knowledge\Retrieval;
|
||||||
|
|
||||||
use App\Catalog\EntityCatalogService;
|
|
||||||
use App\Entity\ModelGenerationConfig;
|
use App\Entity\ModelGenerationConfig;
|
||||||
use App\Intent\CatalogIntentLite;
|
use App\Intent\CatalogIntentLite;
|
||||||
use App\Intent\IntentLite;
|
use App\Intent\IntentLite;
|
||||||
@@ -13,6 +12,9 @@ use App\Knowledge\QueryCleaner;
|
|||||||
use App\Repository\ModelGenerationConfigRepository;
|
use App\Repository\ModelGenerationConfigRepository;
|
||||||
use App\Tag\TagRoutingService;
|
use App\Tag\TagRoutingService;
|
||||||
use App\Vector\VectorSearchClient;
|
use App\Vector\VectorSearchClient;
|
||||||
|
use App\Catalog\EntityCatalogService;
|
||||||
|
use App\Knowledge\Retrieval\NdjsonChunkLookup;
|
||||||
|
use App\Knowledge\Retrieval\RetrieverInterface;
|
||||||
|
|
||||||
final class NdjsonHybridRetriever implements RetrieverInterface
|
final class NdjsonHybridRetriever implements RetrieverInterface
|
||||||
{
|
{
|
||||||
@@ -307,7 +309,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
float $threshold,
|
float $threshold,
|
||||||
bool $boost = false,
|
bool $boost = false,
|
||||||
bool $captureRaw = false
|
bool $captureRaw = false
|
||||||
): void {
|
): void
|
||||||
|
{
|
||||||
$rank = 0;
|
$rank = 0;
|
||||||
|
|
||||||
foreach ($hits as $hit) {
|
foreach ($hits as $hit) {
|
||||||
|
|||||||
@@ -11,12 +11,12 @@ use App\Service\TagRebuildJobService;
|
|||||||
use App\Tag\TagService;
|
use App\Tag\TagService;
|
||||||
use Doctrine\ORM\EntityManagerInterface;
|
use Doctrine\ORM\EntityManagerInterface;
|
||||||
|
|
||||||
final class TagAdminService
|
final readonly class TagAdminService
|
||||||
{
|
{
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly EntityManagerInterface $em,
|
private EntityManagerInterface $em,
|
||||||
private readonly TagService $tagService,
|
private TagService $tagService,
|
||||||
private readonly TagRebuildJobService $jobs,
|
private TagRebuildJobService $jobs,
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
public function getIndexData(): array
|
public function getIndexData(): array
|
||||||
@@ -31,9 +31,13 @@ final class TagAdminService
|
|||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
public function create(string $slug, string $label, ?string $description): void
|
public function create(
|
||||||
{
|
string $slug,
|
||||||
$this->tagService->create($slug, $label, $description);
|
string $label,
|
||||||
|
?string $description,
|
||||||
|
string $type = 'generic' // NEU
|
||||||
|
): void {
|
||||||
|
$this->tagService->create($slug, $label, $description, $type);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function delete(string $id): void
|
public function delete(string $id): void
|
||||||
|
|||||||
@@ -19,7 +19,12 @@ final readonly class TagNdjsonExporter
|
|||||||
* Export all tags into NDJSON (streaming) with atomic switch (.tmp + rename()).
|
* Export all tags into NDJSON (streaming) with atomic switch (.tmp + rename()).
|
||||||
*
|
*
|
||||||
* Line format:
|
* Line format:
|
||||||
* {"tag_id":"...","text":"label\nslug\noptional description","document_ids":["...","..."]}
|
* {
|
||||||
|
* "tag_id":"...",
|
||||||
|
* "text":"label\nslug\noptional description",
|
||||||
|
* "type":"catalog_entity|generic|...",
|
||||||
|
* "document_ids":["...","..."]
|
||||||
|
* }
|
||||||
*
|
*
|
||||||
* @return array{tags:int, lines:int, bytes:int, path:string}
|
* @return array{tags:int, lines:int, bytes:int, path:string}
|
||||||
*/
|
*/
|
||||||
@@ -37,12 +42,7 @@ final readonly class TagNdjsonExporter
|
|||||||
throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
|
throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ------------------------------------------------------------------
|
// 1) Load all tags
|
||||||
// Fetch tags (small) + join document ids (can be bigger) efficiently.
|
|
||||||
// We avoid repositories and keep it DB-agnostic via DQL/QB.
|
|
||||||
// ------------------------------------------------------------------
|
|
||||||
|
|
||||||
// 1) Load all tags (id, slug, label, description)
|
|
||||||
$tags = $this->em->createQueryBuilder()
|
$tags = $this->em->createQueryBuilder()
|
||||||
->select('t')
|
->select('t')
|
||||||
->from(Tag::class, 't')
|
->from(Tag::class, 't')
|
||||||
@@ -52,8 +52,6 @@ final readonly class TagNdjsonExporter
|
|||||||
|
|
||||||
if (!\is_array($tags) || $tags === []) {
|
if (!\is_array($tags) || $tags === []) {
|
||||||
\fclose($fh);
|
\fclose($fh);
|
||||||
|
|
||||||
// Write empty file atomically
|
|
||||||
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
|
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
|
||||||
|
|
||||||
return [
|
return [
|
||||||
@@ -64,8 +62,7 @@ final readonly class TagNdjsonExporter
|
|||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2) Build tagId => docIds map from document_tag
|
// 2) Build tagId => docIds map
|
||||||
// We query pairs (tag_id, document_id) in one go.
|
|
||||||
$rows = $this->em->createQueryBuilder()
|
$rows = $this->em->createQueryBuilder()
|
||||||
->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
|
->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
|
||||||
->from(DocumentTag::class, 'dt')
|
->from(DocumentTag::class, 'dt')
|
||||||
@@ -82,7 +79,7 @@ final readonly class TagNdjsonExporter
|
|||||||
$tagToDocs[$tagId][] = $docId;
|
$tagToDocs[$tagId][] = $docId;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3) Stream NDJSON lines
|
// 3) Stream NDJSON
|
||||||
$lines = 0;
|
$lines = 0;
|
||||||
|
|
||||||
foreach ($tags as $tag) {
|
foreach ($tags as $tag) {
|
||||||
@@ -93,13 +90,11 @@ final readonly class TagNdjsonExporter
|
|||||||
$tagId = (string) $tag->getId();
|
$tagId = (string) $tag->getId();
|
||||||
$docIds = $tagToDocs[$tagId] ?? [];
|
$docIds = $tagToDocs[$tagId] ?? [];
|
||||||
|
|
||||||
// de-dupe docIds for safety
|
|
||||||
if ($docIds !== []) {
|
if ($docIds !== []) {
|
||||||
$docIds = \array_values(\array_unique($docIds));
|
$docIds = \array_values(\array_unique($docIds));
|
||||||
}
|
}
|
||||||
|
|
||||||
// "text" is the embedding source for tag vectors later:
|
// Embedding source
|
||||||
// Keep it short but semantically useful.
|
|
||||||
$textParts = [
|
$textParts = [
|
||||||
$tag->getLabel(),
|
$tag->getLabel(),
|
||||||
$tag->getSlug(),
|
$tag->getSlug(),
|
||||||
@@ -110,15 +105,23 @@ final readonly class TagNdjsonExporter
|
|||||||
$textParts[] = \trim($desc);
|
$textParts[] = \trim($desc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$type = method_exists($tag, 'getType')
|
||||||
|
? (string) $tag->getType()
|
||||||
|
: 'generic';
|
||||||
|
|
||||||
|
if ($type === '') {
|
||||||
|
$type = 'generic';
|
||||||
|
}
|
||||||
|
|
||||||
$line = [
|
$line = [
|
||||||
'tag_id' => $tagId,
|
'tag_id' => $tagId,
|
||||||
'text' => \implode("\n", $textParts),
|
'text' => \implode("\n", $textParts),
|
||||||
|
'type' => $type, // 🔥 NEW
|
||||||
'document_ids' => $docIds,
|
'document_ids' => $docIds,
|
||||||
];
|
];
|
||||||
|
|
||||||
$json = \json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
|
$json = \json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
|
||||||
if (!\is_string($json)) {
|
if (!\is_string($json)) {
|
||||||
// skip invalid line but keep export running
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -127,7 +130,6 @@ final readonly class TagNdjsonExporter
|
|||||||
}
|
}
|
||||||
|
|
||||||
\fclose($fh);
|
\fclose($fh);
|
||||||
|
|
||||||
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
|
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
|
||||||
|
|
||||||
return [
|
return [
|
||||||
@@ -140,13 +142,11 @@ final readonly class TagNdjsonExporter
|
|||||||
|
|
||||||
private function atomicReplace(string $tmpPath, string $finalPath): void
|
private function atomicReplace(string $tmpPath, string $finalPath): void
|
||||||
{
|
{
|
||||||
// Ensure old file can be replaced on Windows-like FS too (best effort)
|
|
||||||
if (\is_file($finalPath)) {
|
if (\is_file($finalPath)) {
|
||||||
@\chmod($finalPath, 0664);
|
@\chmod($finalPath, 0664);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!@\rename($tmpPath, $finalPath)) {
|
if (!@\rename($tmpPath, $finalPath)) {
|
||||||
// if rename fails, try copy+unlink fallback
|
|
||||||
if (!@\copy($tmpPath, $finalPath)) {
|
if (!@\copy($tmpPath, $finalPath)) {
|
||||||
@\unlink($tmpPath);
|
@\unlink($tmpPath);
|
||||||
throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);
|
throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);
|
||||||
|
|||||||
@@ -21,8 +21,12 @@ final readonly class TagService
|
|||||||
// TAG CREATE
|
// TAG CREATE
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
public function create(string $slug, string $label, ?string $description = null): Tag
|
public function create(
|
||||||
{
|
string $slug,
|
||||||
|
string $label,
|
||||||
|
?string $description = null,
|
||||||
|
string $type = 'generic' // NEU
|
||||||
|
): Tag {
|
||||||
$slug = trim($slug);
|
$slug = trim($slug);
|
||||||
$label = trim($label);
|
$label = trim($label);
|
||||||
|
|
||||||
@@ -35,6 +39,7 @@ final readonly class TagService
|
|||||||
}
|
}
|
||||||
|
|
||||||
$tag = new Tag($slug, $label, $description);
|
$tag = new Tag($slug, $label, $description);
|
||||||
|
$tag->setType($type); // NEU
|
||||||
|
|
||||||
$this->em->persist($tag);
|
$this->em->persist($tag);
|
||||||
$this->em->flush();
|
$this->em->flush();
|
||||||
@@ -71,10 +76,6 @@ final readonly class TagService
|
|||||||
// DOCUMENT TAG SYNC
|
// DOCUMENT TAG SYNC
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
/**
|
|
||||||
* Synchronisiert alle Tags eines Dokuments.
|
|
||||||
* Löst einen Rebuild aus, da document_ids Teil des NDJSON sind.
|
|
||||||
*/
|
|
||||||
public function syncDocumentTags(Document $document, array $newTagIds): void
|
public function syncDocumentTags(Document $document, array $newTagIds): void
|
||||||
{
|
{
|
||||||
$newTagIds = array_unique($newTagIds);
|
$newTagIds = array_unique($newTagIds);
|
||||||
@@ -114,10 +115,6 @@ final readonly class TagService
|
|||||||
// TAG → DOCUMENT SYNC (Bulk Assign)
|
// TAG → DOCUMENT SYNC (Bulk Assign)
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
/**
|
|
||||||
* Synchronisiert alle Dokumente eines Tags.
|
|
||||||
* Löst einen Rebuild aus, da document_ids Teil des NDJSON sind.
|
|
||||||
*/
|
|
||||||
public function syncTagDocuments(Tag $tag, array $newDocumentIds): void
|
public function syncTagDocuments(Tag $tag, array $newDocumentIds): void
|
||||||
{
|
{
|
||||||
$newDocumentIds = array_unique($newDocumentIds);
|
$newDocumentIds = array_unique($newDocumentIds);
|
||||||
|
|||||||
27
src/Tag/TagTypes.php
Normal file
27
src/Tag/TagTypes.php
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Tag;
|
||||||
|
|
||||||
|
/**
 * Central definition of all permitted tag types.
 * Avoids magic strings in the code.
 */
final class TagTypes
{
    public const GENERIC = 'generic';
    public const CATALOG_ENTITY = 'catalog_entity';
    public const SALES_SIGNAL = 'sales_signal';

    /**
     * Constants-only holder; the private constructor prevents instantiation.
     */
    private function __construct()
    {
    }

    /**
     * Returns the selectable types keyed by their display label.
     *
     * @return array<string, string> display label => type value
     */
    public static function choices(): array
    {
        $choices = [];

        $choices['Generic'] = self::GENERIC;
        $choices['Catalog Entity'] = self::CATALOG_ENTITY;
        $choices['Sales Signal'] = self::SALES_SIGNAL;

        return $choices;
    }
}
|
||||||
@@ -11,7 +11,6 @@ final readonly class TagVectorSearchClient
|
|||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* Minimum similarity score required for a tag to be considered.
|
* Minimum similarity score required for a tag to be considered.
|
||||||
* Acts as a confidence gate to avoid noisy routing.
|
|
||||||
*/
|
*/
|
||||||
private const MIN_SCORE = 0.4;
|
private const MIN_SCORE = 0.4;
|
||||||
|
|
||||||
@@ -29,7 +28,22 @@ final readonly class TagVectorSearchClient
|
|||||||
/**
|
/**
|
||||||
* Executes a vector search against the Python tag index.
|
* Executes a vector search against the Python tag index.
|
||||||
*
|
*
|
||||||
* @return array<int, array{tag_id:string, score:float}>
|
* Expected response rows:
|
||||||
|
* [
|
||||||
|
* {
|
||||||
|
* "tag_id": "...",
|
||||||
|
* "score": 0.73,
|
||||||
|
* "label": "Geräte", // optional (new)
|
||||||
|
* "tag_type": "catalog_entity" // optional (new)
|
||||||
|
* }
|
||||||
|
* ]
|
||||||
|
*
|
||||||
|
* @return array<int, array{
|
||||||
|
* tag_id:string,
|
||||||
|
* score:float,
|
||||||
|
* label?:string,
|
||||||
|
* tag_type?:string
|
||||||
|
* }>
|
||||||
*/
|
*/
|
||||||
public function search(string $query, int $limit = 8): array
|
public function search(string $query, int $limit = 8): array
|
||||||
{
|
{
|
||||||
@@ -94,15 +108,26 @@ final readonly class TagVectorSearchClient
|
|||||||
|
|
||||||
$score = (float) $score;
|
$score = (float) $score;
|
||||||
|
|
||||||
// 🔥 Confidence Gate
|
|
||||||
if ($score < self::MIN_SCORE) {
|
if ($score < self::MIN_SCORE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$hits[] = [
|
$hit = [
|
||||||
'tag_id' => $tagId,
|
'tag_id' => $tagId,
|
||||||
'score' => $score,
|
'score' => $score,
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// Optional: label
|
||||||
|
if (isset($row['label']) && is_string($row['label'])) {
|
||||||
|
$hit['label'] = $row['label'];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Optional: tag_type
|
||||||
|
if (isset($row['tag_type']) && is_string($row['tag_type'])) {
|
||||||
|
$hit['tag_type'] = $row['tag_type'];
|
||||||
|
}
|
||||||
|
|
||||||
|
$hits[] = $hit;
|
||||||
}
|
}
|
||||||
|
|
||||||
return $hits;
|
return $hits;
|
||||||
|
|||||||
@@ -157,6 +157,15 @@
|
|||||||
placeholder="Optional"/>
|
placeholder="Optional"/>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div class="mb-3">
|
||||||
|
<label class="form-label">Type</label>
|
||||||
|
<select name="type" class="form-select">
|
||||||
|
<option value="generic">Generic</option>
|
||||||
|
<option value="catalog_entity">Catalog Entity</option>
|
||||||
|
<option value="sales_signal">Sales Signal</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="col-md-2 d-grid align-items-end">
|
<div class="col-md-2 d-grid align-items-end">
|
||||||
<button class="btn btn-sm btn-outline-info">
|
<button class="btn btn-sm btn-outline-info">
|
||||||
Anlegen
|
Anlegen
|
||||||
|
|||||||
Reference in New Issue
Block a user