optimize catalog semantic matches by tags

This commit is contained in:
team2
2026-02-28 16:10:47 +01:00
parent d3294464ea
commit 0d3f6e21d6
13 changed files with 329 additions and 151 deletions

View File

@@ -0,0 +1,32 @@
<?php
declare(strict_types=1);
namespace DoctrineMigrations;
use Doctrine\DBAL\Schema\Schema;
use Doctrine\Migrations\AbstractMigration;
/**
 * Schema migration that introduces a `type` column on the `knowledge_tag` table.
 *
 * The column is a VARCHAR(50), NOT NULL, with the default value 'generic'.
 * Reverting the migration removes the column again.
 */
final class Version20260228000100 extends AbstractMigration
{
/**
 * Short human-readable summary of what this migration does.
 */
public function getDescription(): string
{
return 'Add type column to knowledge_tag table for catalog entity support';
}
/**
 * Applies the migration: adds the `type` column to `knowledge_tag`.
 *
 * The column is declared NOT NULL together with DEFAULT 'generic'.
 * NOTE(review): `type` is used unquoted as a column name — confirm this is
 * accepted by the target database platform.
 */
public function up(Schema $schema): void
{
$this->addSql("
ALTER TABLE knowledge_tag
ADD type VARCHAR(50) NOT NULL DEFAULT 'generic'
");
}
/**
 * Reverts the migration: drops the `type` column from `knowledge_tag`.
 *
 * Any values stored in the column are lost when it is dropped.
 */
public function down(Schema $schema): void
{
$this->addSql("
ALTER TABLE knowledge_tag
DROP type
");
}
}

View File

@@ -42,6 +42,9 @@ INDEX_META_PATH = KNOWLEDGE_DIR / "index_meta.json"
INDEX_RUNTIME_PATH = KNOWLEDGE_DIR / "index_runtime.json"
INDEX_NDJSON_PATH = KNOWLEDGE_DIR / "index.ndjson"
# NEW: Tags NDJSON (exported by PHP) used to enrich /search-tags responses
TAGS_NDJSON_PATH = KNOWLEDGE_DIR / "tags.ndjson"
# ============================================================
# Logging
@@ -111,6 +114,9 @@ chunk_pos_map: Dict[str, int] = {}
tag_index = None
tag_ids: Optional[List[Any]] = None
# NEW: tag_id -> {"label": "...", "tag_type": "..."}
tag_meta_map: Dict[str, Dict[str, str]] = {}
loaded_embedding_model_name: Optional[str] = None
current_index_version: Optional[int] = None
current_runtime_stamp: Optional[str] = None
@@ -210,6 +216,61 @@ def load_chunk_maps_from_ndjson() -> None:
logger.warning("Failed to load chunk maps from ndjson: %s", str(e))
def load_tag_meta_from_tags_ndjson() -> None:
    """
    Load minimal tag metadata from tags.ndjson to enrich /search-tags results.

    Expected line format (from PHP exporter / ingester pipeline):
        {"tag_id":"...","text":"LABEL\\nSLUG\\noptional description","type":"...", ...}

    For every usable line the module-level ``tag_meta_map`` receives
    ``tag_id -> {"label": ..., "tag_type": ...}`` where:
        label    = first line of "text", stripped (fallback: "")
        tag_type = stripped "type" if it is a non-empty string, else "generic"

    Lines are skipped (never fatal) when they are blank, are not valid JSON,
    do not decode to a JSON object, or carry no usable "tag_id".

    The map is built in a local dict and published with a single assignment,
    so readers of ``tag_meta_map`` never observe a half-filled map. If the
    file is missing, or reading it fails, the map is reset to empty.
    """
    global tag_meta_map

    if not TAGS_NDJSON_PATH.exists():
        tag_meta_map = {}
        logger.info("[Reload] tags.ndjson missing -> tag_meta_map empty (%s)", str(TAGS_NDJSON_PATH))
        return

    loaded: Dict[str, Dict[str, str]] = {}

    try:
        with TAGS_NDJSON_PATH.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                try:
                    row = json.loads(line)
                except Exception:
                    continue

                # Valid JSON that is not an object (list, string, number, ...)
                # has no .get(); skip it instead of aborting the whole load.
                if not isinstance(row, dict):
                    continue

                tag_id = _as_key(row.get("tag_id"))
                if not tag_id:
                    continue

                # Prefer the explicit "type" field if present and non-blank.
                ttype = row.get("type")
                if isinstance(ttype, str) and ttype.strip():
                    tag_type = ttype.strip()
                else:
                    tag_type = "generic"

                # The label is the first physical line of "text".
                label = ""
                txt = row.get("text")
                if isinstance(txt, str):
                    text_lines = txt.splitlines()
                    if text_lines:
                        label = text_lines[0].strip()

                loaded[tag_id] = {"label": label, "tag_type": tag_type}
    except Exception as e:
        logger.warning("Failed to load tag meta from tags.ndjson: %s", str(e))
        loaded = {}

    # Publish the finished map in one step.
    tag_meta_map = loaded
def _normalize_meta_list(value: Any) -> Optional[List[Any]]:
"""
Accepts:
@@ -282,6 +343,10 @@ def load_all() -> None:
tag_index = None
tag_ids = None
# NEW: load tag meta for enrichment
logger.info("[Reload] Loading tag meta from tags.ndjson")
load_tag_meta_from_tags_ndjson()
runtime = _safe_read_json(INDEX_RUNTIME_PATH)
if isinstance(runtime, dict):
v = runtime.get("last_rebuild_at")
@@ -292,10 +357,11 @@ def load_all() -> None:
current_index_version = index_version if isinstance(index_version, int) else None
logger.info(
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s stamp=%s file=%s)",
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s tag_meta=%s stamp=%s file=%s)",
str(current_index_version),
str(current_runtime_stamp),
str(loaded_embedding_model_name),
str(len(tag_meta_map)),
SERVICE_STAMP,
str(Path(__file__).resolve()),
)
@@ -390,6 +456,8 @@ def health():
"tag_meta_len": len(tag_ids) if isinstance(tag_ids, list) else None,
"chunk_meta_type": type(chunk_ids).__name__ if chunk_ids is not None else None,
"chunk_meta_len": len(chunk_ids) if isinstance(chunk_ids, list) else None,
"tag_meta_map_len": len(tag_meta_map),
"tags_ndjson_path": str(TAGS_NDJSON_PATH),
"log_file": str(LOG_FILE),
}
@@ -502,7 +570,26 @@ def search_tags(req: SearchRequest):
continue
if idx < 0 or idx >= len(tag_ids):
continue
results.append({"tag_id": tag_ids[idx], "score": float(score)})
tag_id = tag_ids[idx]
tag_id_key = _as_key(tag_id) or ""
payload: Dict[str, Any] = {
"tag_id": tag_id,
"score": float(score),
}
meta = tag_meta_map.get(tag_id_key)
if isinstance(meta, dict):
label = meta.get("label")
ttype = meta.get("tag_type")
if isinstance(label, str) and label.strip():
payload["label"] = label
if isinstance(ttype, str) and ttype.strip():
payload["tag_type"] = ttype
results.append(payload)
return results

View File

@@ -15,6 +15,10 @@ use Symfony\Component\Uid\Uuid;
* - TagVectorSearch (Score-Gate + Ambiguity-Check)
* - DB Query auf document_tag + document (ACTIVE)
* - Rückgabe als EIN Textblock (string) oder null (Fallback auf normalen Retrieval)
*
* Schritt-3 Änderung:
* - Headline ist NICHT mehr hardcoded
* - Headline basiert dynamisch auf dem gefundenen Tag
*/
final class EntityCatalogService
{
@@ -63,6 +67,10 @@ final class EntityCatalogService
return null;
}
// Display label of the best-matching tag, used for the dynamic headline.
// The tag search hit carries it under "label" (that is the key the
// TagVectorSearchClient sets on each hit); "tag_label" is still accepted
// as a fallback so older response shapes keep working.
// NOTE(review): assumes $best is a hit row from TagVectorSearchClient — confirm.
$tagLabel = $best['label'] ?? $best['tag_label'] ?? null;
$tagLabel = $tagLabel !== null ? (string)$tagLabel : null;
// 3) DB Query: alle ACTIVE Dokumente zu diesem Tag
$rows = $this->connection->fetchAllAssociative(
'
@@ -95,18 +103,24 @@ final class EntityCatalogService
return null;
}
return $this->buildTextBlock($entityTerm, $titles);
return $this->buildTextBlock($tagLabel, $titles);
}
private function buildTextBlock(string $entityTerm, array $titles): string
/**
* Dynamische Headline:
* - Wenn Tag-Label vorhanden → verwenden
* - Sonst generischer Fallback
*/
private function buildTextBlock(?string $tagLabel, array $titles): string
{
$headline = match ($entityTerm) {
'geräte' => 'Folgende Geräte sind verfügbar:',
'indikatoren' => 'Folgende Indikatoren sind verfügbar:',
'funktionen' => 'Folgende Funktionen sind verfügbar:',
'zubehör' => 'Folgendes Zubehör ist verfügbar:',
default => 'Folgende Einträge sind verfügbar:',
};
$headline = 'Folgende Einträge sind verfügbar:';
if (\is_string($tagLabel) && \trim($tagLabel) !== '') {
$headline = sprintf(
'Folgende %s sind verfügbar:',
$tagLabel
);
}
$lines = [];
foreach ($titles as $title) {

View File

@@ -46,7 +46,8 @@ final class TagController extends AbstractController
(string)$request->request->get('label', ''),
$request->request->get('description')
? (string)$request->request->get('description')
: null
: null,
(string)$request->request->get('type', 'generic') // NEU
);
$this->addFlash('success', 'Tag wurde erstellt.');

View File

@@ -24,6 +24,14 @@ class Tag
#[ORM\Column(type: 'text', nullable: true)]
private ?string $description = null;
/**
* NEU: Governance-Typ des Tags
* - generic
* - catalog_entity
*/
#[ORM\Column(length: 50)]
private string $type = 'generic';
#[ORM\Column]
private \DateTimeImmutable $createdAt;
@@ -75,6 +83,18 @@ class Tag
return $this;
}
/**
 * Returns the governance type stored on this tag.
 *
 * @return string the current type value; the property is initialised to 'generic'
 */
public function getType(): string
{
return $this->type;
}
/**
 * Sets the governance type of this tag.
 *
 * Surrounding whitespace is removed first; if nothing remains, the type
 * falls back to 'generic'.
 *
 * @param string $type raw type value
 *
 * @return static this instance, to allow call chaining
 */
public function setType(string $type): static
{
    $normalized = trim($type);

    if ($normalized === '') {
        $normalized = 'generic';
    }

    $this->type = $normalized;

    return $this;
}
public function getCreatedAt(): \DateTimeImmutable
{
return $this->createdAt;

View File

@@ -4,27 +4,10 @@ declare(strict_types=1);
namespace App\Intent;
/**
* CatalogIntentLite
*
* Minimal, deterministische Erkennung von Katalog-/Entity-Listenanfragen.
*
* Ziel:
* - "Liste aller Geräte" / "Welche Indikatoren gibt es?" / "Zeig mir alle Funktionen"
*
* Guardrails:
* - Kein Catalog-Mode bei Sales-/Pricing-/Comparison-/ROI-/Implementation-/Objection-Intents.
* - Kein Catalog-Mode ohne expliziten Entity-Term.
*
* WICHTIG:
* - Immer mit ORIGINAL-Prompt aufrufen.
* - Kein LLM, kein ML.
*/
use App\Tag\TagVectorSearchClient;
final class CatalogIntentLite
{
/**
* Listensignale (leichtgewichtig) IntentLite bleibt weiterhin für "allgemeine" List Detection zuständig.
*/
private const LIST_SIGNALS = [
'liste',
'auflisten',
@@ -39,100 +22,76 @@ final class CatalogIntentLite
'alle',
];
/**
* Entity-Terms, die wir als Katalogtypen unterstützen.
*
* Left side: canonical term (für Tag-Suche)
* Right side: Such-Synonyme, die im Prompt vorkommen dürfen.
*/
private const ENTITY_TERMS = [
'geräte' => ['gerät', 'geräte', 'geraet', 'geraete', 'device', 'devices'],
'indikatoren' => ['indikator', 'indikatoren', 'indicator', 'indicators'],
'funktionen' => ['funktion', 'funktionen', 'feature', 'features', 'funktionalität', 'funktionalitaet'],
'zubehör' => ['zubehör', 'zubehoer', 'accessory', 'accessories', 'zubehor'],
];
private const MIN_SCORE = 0.60;
private const AMBIGUITY_DELTA = 0.05;
public function __construct(
private readonly SalesIntentLite $salesIntentLite,
private readonly TagVectorSearchClient $tagVectorClient,
) {}
/**
* @return string|null canonical entity term (z. B. "geräte") oder null wenn kein Catalog-Intent.
*/
public function detect(string $originalPrompt): ?string
public function detect(string $prompt): ?string
{
$p = $this->normalize($originalPrompt);
$normalizedPrompt = mb_strtolower($prompt);
// 1) Muss ein Listen-Signal enthalten
if (!$this->containsAny($p, self::LIST_SIGNALS)) {
// 1) Muss Listen-Signal enthalten
if (!$this->containsAny($normalizedPrompt, self::LIST_SIGNALS)) {
return null;
}
// 2) Guardrail: Kein Catalog-Mode bei Sales-Intents
$sales = $this->salesIntentLite->detect($originalPrompt);
// 2) Guardrail: Nur DISCOVERY
$sales = $this->salesIntentLite->detect($prompt);
$intent = (string)($sales['intent'] ?? SalesIntentLite::DISCOVERY);
if ($intent !== SalesIntentLite::DISCOVERY) {
return null;
}
// 3) Expliziten Entity-Term extrahieren (sonst kein Catalog)
foreach (self::ENTITY_TERMS as $canonical => $synonyms) {
foreach ($synonyms as $syn) {
if ($this->containsWord($p, $syn)) {
return $canonical;
}
}
}
// 3) Vector-basierte Tag-Suche (Top 3 für Ambiguity-Check)
$hits = $this->tagVectorClient->search($prompt, 3);
if ($hits === []) {
return null;
}
// ------------------------------------------------------------
// Helpers
// ------------------------------------------------------------
$best = $hits[0];
$bestScore = (float)($best['score'] ?? 0.0);
if ($bestScore < self::MIN_SCORE) {
return null;
}
// Ambiguity-Check
if (isset($hits[1])) {
$secondScore = (float)($hits[1]['score'] ?? 0.0);
if (abs($bestScore - $secondScore) < self::AMBIGUITY_DELTA) {
return null;
}
}
// 4) Nur catalog_entity zulassen
if (($best['tag_type'] ?? null) !== 'catalog_entity') {
return null;
}
// 5) Canonical Label zurückgeben
$label = (string)($best['label'] ?? '');
if ($label === '') {
return null;
}
return mb_strtolower($label);
}
private function containsAny(string $haystack, array $needles): bool
{
foreach ($needles as $needle) {
if ($needle === '') {
continue;
}
if (str_contains($haystack, $needle)) {
if ($needle !== '' && str_contains($haystack, $needle)) {
return true;
}
}
return false;
}
private function containsWord(string $haystack, string $word): bool
{
$word = trim($word);
if ($word === '') {
return false;
}
return preg_match('/\b' . preg_quote($word, '/') . '\b/u', $haystack) === 1;
}
private function normalize(string $s): string
{
$s = mb_strtolower($s);
// Umlaute absichern (analog IntentLite/SalesIntentLite)
$replacements = [
'ä' => 'ae',
'ö' => 'oe',
'ü' => 'ue',
'ß' => 'ss',
];
foreach ($replacements as $umlaut => $alt) {
if (str_contains($s, $umlaut)) {
$s .= ' ' . str_replace($umlaut, $alt, $s);
break;
}
}
return $s;
}
}

View File

@@ -4,7 +4,6 @@ declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Catalog\EntityCatalogService;
use App\Entity\ModelGenerationConfig;
use App\Intent\CatalogIntentLite;
use App\Intent\IntentLite;
@@ -13,6 +12,9 @@ use App\Knowledge\QueryCleaner;
use App\Repository\ModelGenerationConfigRepository;
use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient;
use App\Catalog\EntityCatalogService;
use App\Knowledge\Retrieval\NdjsonChunkLookup;
use App\Knowledge\Retrieval\RetrieverInterface;
final class NdjsonHybridRetriever implements RetrieverInterface
{
@@ -153,7 +155,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
'rrf_score' => isset($core['rrf_scores'][$chunkId]) ? (float)$core['rrf_scores'][$chunkId] : null,
'threshold' => (float)$core['threshold'],
'intent' => (string)$core['sales_intent'],
'is_list_query'=> (bool)$core['is_list_query'],
'is_list_query' => (bool)$core['is_list_query'],
'text' => $text,
];
}
@@ -307,7 +309,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface
float $threshold,
bool $boost = false,
bool $captureRaw = false
): void {
): void
{
$rank = 0;
foreach ($hits as $hit) {

View File

@@ -11,12 +11,12 @@ use App\Service\TagRebuildJobService;
use App\Tag\TagService;
use Doctrine\ORM\EntityManagerInterface;
final class TagAdminService
final readonly class TagAdminService
{
public function __construct(
private readonly EntityManagerInterface $em,
private readonly TagService $tagService,
private readonly TagRebuildJobService $jobs,
private EntityManagerInterface $em,
private TagService $tagService,
private TagRebuildJobService $jobs,
) {}
public function getIndexData(): array
@@ -31,9 +31,13 @@ final class TagAdminService
];
}
public function create(string $slug, string $label, ?string $description): void
{
$this->tagService->create($slug, $label, $description);
public function create(
string $slug,
string $label,
?string $description,
string $type = 'generic' // NEU
): void {
$this->tagService->create($slug, $label, $description, $type);
}
public function delete(string $id): void

View File

@@ -19,7 +19,12 @@ final readonly class TagNdjsonExporter
* Export all tags into NDJSON (streaming) with atomic switch (.tmp + rename()).
*
* Line format:
* {"tag_id":"...","text":"label\nslug\noptional description","document_ids":["...","..."]}
* {
* "tag_id":"...",
* "text":"label\nslug\noptional description",
* "type":"catalog_entity|generic|...",
* "document_ids":["...","..."]
* }
*
* @return array{tags:int, lines:int, bytes:int, path:string}
*/
@@ -37,12 +42,7 @@ final readonly class TagNdjsonExporter
throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
}
// ------------------------------------------------------------------
// Fetch tags (small) + join document ids (can be bigger) efficiently.
// We avoid repositories and keep it DB-agnostic via DQL/QB.
// ------------------------------------------------------------------
// 1) Load all tags (id, slug, label, description)
// 1) Load all tags
$tags = $this->em->createQueryBuilder()
->select('t')
->from(Tag::class, 't')
@@ -52,8 +52,6 @@ final readonly class TagNdjsonExporter
if (!\is_array($tags) || $tags === []) {
\fclose($fh);
// Write empty file atomically
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
return [
@@ -64,8 +62,7 @@ final readonly class TagNdjsonExporter
];
}
// 2) Build tagId => docIds map from document_tag
// We query pairs (tag_id, document_id) in one go.
// 2) Build tagId => docIds map
$rows = $this->em->createQueryBuilder()
->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
->from(DocumentTag::class, 'dt')
@@ -82,7 +79,7 @@ final readonly class TagNdjsonExporter
$tagToDocs[$tagId][] = $docId;
}
// 3) Stream NDJSON lines
// 3) Stream NDJSON
$lines = 0;
foreach ($tags as $tag) {
@@ -93,13 +90,11 @@ final readonly class TagNdjsonExporter
$tagId = (string) $tag->getId();
$docIds = $tagToDocs[$tagId] ?? [];
// de-dupe docIds for safety
if ($docIds !== []) {
$docIds = \array_values(\array_unique($docIds));
}
// "text" is the embedding source for tag vectors later:
// Keep it short but semantically useful.
// Embedding source
$textParts = [
$tag->getLabel(),
$tag->getSlug(),
@@ -110,15 +105,23 @@ final readonly class TagNdjsonExporter
$textParts[] = \trim($desc);
}
$type = method_exists($tag, 'getType')
? (string) $tag->getType()
: 'generic';
if ($type === '') {
$type = 'generic';
}
$line = [
'tag_id' => $tagId,
'text' => \implode("\n", $textParts),
'type' => $type, // 🔥 NEW
'document_ids' => $docIds,
];
$json = \json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
if (!\is_string($json)) {
// skip invalid line but keep export running
continue;
}
@@ -127,7 +130,6 @@ final readonly class TagNdjsonExporter
}
\fclose($fh);
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
return [
@@ -140,13 +142,11 @@ final readonly class TagNdjsonExporter
private function atomicReplace(string $tmpPath, string $finalPath): void
{
// Ensure old file can be replaced on Windows-like FS too (best effort)
if (\is_file($finalPath)) {
@\chmod($finalPath, 0664);
}
if (!@\rename($tmpPath, $finalPath)) {
// if rename fails, try copy+unlink fallback
if (!@\copy($tmpPath, $finalPath)) {
@\unlink($tmpPath);
throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);

View File

@@ -21,8 +21,12 @@ final readonly class TagService
// TAG CREATE
// =========================================================
public function create(string $slug, string $label, ?string $description = null): Tag
{
public function create(
string $slug,
string $label,
?string $description = null,
string $type = 'generic' // NEU
): Tag {
$slug = trim($slug);
$label = trim($label);
@@ -35,6 +39,7 @@ final readonly class TagService
}
$tag = new Tag($slug, $label, $description);
$tag->setType($type); // NEU
$this->em->persist($tag);
$this->em->flush();
@@ -71,10 +76,6 @@ final readonly class TagService
// DOCUMENT TAG SYNC
// =========================================================
/**
* Synchronisiert alle Tags eines Dokuments.
* Löst einen Rebuild aus, da document_ids Teil des NDJSON sind.
*/
public function syncDocumentTags(Document $document, array $newTagIds): void
{
$newTagIds = array_unique($newTagIds);
@@ -114,10 +115,6 @@ final readonly class TagService
// TAG → DOCUMENT SYNC (Bulk Assign)
// =========================================================
/**
* Synchronisiert alle Dokumente eines Tags.
* Löst einen Rebuild aus, da document_ids Teil des NDJSON sind.
*/
public function syncTagDocuments(Tag $tag, array $newDocumentIds): void
{
$newDocumentIds = array_unique($newDocumentIds);

27
src/Tag/TagTypes.php Normal file
View File

@@ -0,0 +1,27 @@
<?php
declare(strict_types=1);
namespace App\Tag;
/**
 * Central definition of all permitted tag types.
 *
 * Exists so that callers can reference constants instead of scattering
 * magic strings through the code base.
 */
final class TagTypes
{
    public const GENERIC = 'generic';
    public const CATALOG_ENTITY = 'catalog_entity';
    public const SALES_SIGNAL = 'sales_signal';

    /**
     * Pure constants holder: the constructor is private so the class
     * cannot be instantiated.
     */
    private function __construct() {}

    /**
     * Lists every tag type as a "display label => stored value" map.
     *
     * @return array<string, string>
     */
    public static function choices(): array
    {
        $choices = [];
        $choices['Generic'] = self::GENERIC;
        $choices['Catalog Entity'] = self::CATALOG_ENTITY;
        $choices['Sales Signal'] = self::SALES_SIGNAL;

        return $choices;
    }
}

View File

@@ -11,7 +11,6 @@ final readonly class TagVectorSearchClient
{
/**
* Minimum similarity score required for a tag to be considered.
* Acts as a confidence gate to avoid noisy routing.
*/
private const MIN_SCORE = 0.4;
@@ -29,7 +28,22 @@ final readonly class TagVectorSearchClient
/**
* Executes a vector search against the Python tag index.
*
* @return array<int, array{tag_id:string, score:float}>
* Expected response rows:
* [
* {
* "tag_id": "...",
* "score": 0.73,
* "label": "Geräte", // optional (new)
* "tag_type": "catalog_entity" // optional (new)
* }
* ]
*
* @return array<int, array{
* tag_id:string,
* score:float,
* label?:string,
* tag_type?:string
* }>
*/
public function search(string $query, int $limit = 8): array
{
@@ -94,15 +108,26 @@ final readonly class TagVectorSearchClient
$score = (float) $score;
// 🔥 Confidence Gate
if ($score < self::MIN_SCORE) {
continue;
}
$hits[] = [
$hit = [
'tag_id' => $tagId,
'score' => $score,
];
// Optional: label
if (isset($row['label']) && is_string($row['label'])) {
$hit['label'] = $row['label'];
}
// Optional: tag_type
if (isset($row['tag_type']) && is_string($row['tag_type'])) {
$hit['tag_type'] = $row['tag_type'];
}
$hits[] = $hit;
}
return $hits;

View File

@@ -157,6 +157,15 @@
placeholder="Optional"/>
</div>
<div class="mb-3">
<label class="form-label">Type</label>
<select name="type" class="form-select">
<option value="generic">Generic</option>
<option value="catalog_entity">Catalog Entity</option>
<option value="sales_signal">Sales Signal</option>
</select>
</div>
<div class="col-md-2 d-grid align-items-end">
<button class="btn btn-sm btn-outline-info">
Anlegen