optimize catalog semantic matches by tags
This commit is contained in:
32
migrations/Version20260228000100.php
Normal file
32
migrations/Version20260228000100.php
Normal file
@@ -0,0 +1,32 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace DoctrineMigrations;
|
||||
|
||||
use Doctrine\DBAL\Schema\Schema;
|
||||
use Doctrine\Migrations\AbstractMigration;
|
||||
|
||||
/**
 * Schema migration that adds the `type` column to the `knowledge_tag` table.
 *
 * up()   adds `type` as VARCHAR(50) NOT NULL with the default 'generic'.
 * down() drops the column again.
 */
final class Version20260228000100 extends AbstractMigration
{
    public function getDescription(): string
    {
        return 'Add type column to knowledge_tag table for catalog entity support';
    }

    /**
     * Adds the `type` column. The column is NOT NULL, so the DEFAULT clause
     * is what supplies a value ('generic') for rows that already exist.
     */
    public function up(Schema $schema): void
    {
        $this->addSql(
            "ALTER TABLE knowledge_tag ADD type VARCHAR(50) NOT NULL DEFAULT 'generic'"
        );
    }

    /**
     * Reverts up() by removing the `type` column (stored type values are lost).
     */
    public function down(Schema $schema): void
    {
        $this->addSql('ALTER TABLE knowledge_tag DROP type');
    }
}
|
||||
@@ -42,6 +42,9 @@ INDEX_META_PATH = KNOWLEDGE_DIR / "index_meta.json"
|
||||
INDEX_RUNTIME_PATH = KNOWLEDGE_DIR / "index_runtime.json"
|
||||
INDEX_NDJSON_PATH = KNOWLEDGE_DIR / "index.ndjson"
|
||||
|
||||
# NEW: Tags NDJSON (exported by PHP) used to enrich /search-tags responses
|
||||
TAGS_NDJSON_PATH = KNOWLEDGE_DIR / "tags.ndjson"
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Logging
|
||||
@@ -111,6 +114,9 @@ chunk_pos_map: Dict[str, int] = {}
|
||||
tag_index = None
|
||||
tag_ids: Optional[List[Any]] = None
|
||||
|
||||
# NEW: tag_id -> {"label": "...", "tag_type": "..."}
|
||||
tag_meta_map: Dict[str, Dict[str, str]] = {}
|
||||
|
||||
loaded_embedding_model_name: Optional[str] = None
|
||||
current_index_version: Optional[int] = None
|
||||
current_runtime_stamp: Optional[str] = None
|
||||
@@ -210,6 +216,61 @@ def load_chunk_maps_from_ndjson() -> None:
|
||||
logger.warning("Failed to load chunk maps from ndjson: %s", str(e))
|
||||
|
||||
|
||||
def load_tag_meta_from_tags_ndjson() -> None:
    """
    Rebuild the global ``tag_meta_map`` from tags.ndjson.

    The map is used to enrich /search-tags results with a label and a type.

    Expected line format (from PHP exporter / ingester pipeline):
      {"tag_id":"...","text":"LABEL\\nSLUG\\noptional description","type":"...", ...}

    For every usable line we store ``tag_id -> {"label": ..., "tag_type": ...}``:
      label    = first line of "text", stripped (fallback: "")
      tag_type = "type" if it is a non-blank string, else "generic"

    Lines that are blank, not valid JSON, not a JSON object, or that carry no
    usable tag_id are skipped individually. If the file is missing, or reading
    it fails as a whole, the map ends up empty.
    """
    global tag_meta_map

    if not TAGS_NDJSON_PATH.exists():
        logger.info("[Reload] tags.ndjson missing -> tag_meta_map empty (%s)", str(TAGS_NDJSON_PATH))
        tag_meta_map = {}
        return

    # Build into a local dict and publish once at the end, so readers never
    # observe a half-filled map while the file is being parsed.
    loaded: Dict[str, Dict[str, str]] = {}

    try:
        with TAGS_NDJSON_PATH.open("r", encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue

                try:
                    row = json.loads(line)
                except Exception:
                    # Best effort: one malformed line must not abort the load.
                    continue

                # Valid JSON that is not an object (e.g. `[]`, `1`, `"x"`) has no
                # .get(); without this guard the AttributeError would reach the
                # outer handler and wipe every tag that was already loaded.
                if not isinstance(row, dict):
                    continue

                tag_id = _as_key(row.get("tag_id"))
                if not tag_id:
                    continue

                ttype = row.get("type")
                if isinstance(ttype, str) and ttype.strip():
                    tag_type = ttype.strip()
                else:
                    tag_type = "generic"

                label = ""
                txt = row.get("text")
                if isinstance(txt, str) and txt.strip():
                    text_lines = txt.splitlines()
                    label = text_lines[0].strip() if text_lines else ""

                loaded[tag_id] = {"label": label, "tag_type": tag_type}

    except Exception as e:
        logger.warning("Failed to load tag meta from tags.ndjson: %s", str(e))
        loaded = {}

    tag_meta_map = loaded
|
||||
|
||||
|
||||
def _normalize_meta_list(value: Any) -> Optional[List[Any]]:
|
||||
"""
|
||||
Accepts:
|
||||
@@ -282,6 +343,10 @@ def load_all() -> None:
|
||||
tag_index = None
|
||||
tag_ids = None
|
||||
|
||||
# NEW: load tag meta for enrichment
|
||||
logger.info("[Reload] Loading tag meta from tags.ndjson")
|
||||
load_tag_meta_from_tags_ndjson()
|
||||
|
||||
runtime = _safe_read_json(INDEX_RUNTIME_PATH)
|
||||
if isinstance(runtime, dict):
|
||||
v = runtime.get("last_rebuild_at")
|
||||
@@ -292,10 +357,11 @@ def load_all() -> None:
|
||||
current_index_version = index_version if isinstance(index_version, int) else None
|
||||
|
||||
logger.info(
|
||||
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s stamp=%s file=%s)",
|
||||
"[Reload] Completed (index_version=%s runtime=%s embedding_model=%s tag_meta=%s stamp=%s file=%s)",
|
||||
str(current_index_version),
|
||||
str(current_runtime_stamp),
|
||||
str(loaded_embedding_model_name),
|
||||
str(len(tag_meta_map)),
|
||||
SERVICE_STAMP,
|
||||
str(Path(__file__).resolve()),
|
||||
)
|
||||
@@ -390,6 +456,8 @@ def health():
|
||||
"tag_meta_len": len(tag_ids) if isinstance(tag_ids, list) else None,
|
||||
"chunk_meta_type": type(chunk_ids).__name__ if chunk_ids is not None else None,
|
||||
"chunk_meta_len": len(chunk_ids) if isinstance(chunk_ids, list) else None,
|
||||
"tag_meta_map_len": len(tag_meta_map),
|
||||
"tags_ndjson_path": str(TAGS_NDJSON_PATH),
|
||||
"log_file": str(LOG_FILE),
|
||||
}
|
||||
|
||||
@@ -502,7 +570,26 @@ def search_tags(req: SearchRequest):
|
||||
continue
|
||||
if idx < 0 or idx >= len(tag_ids):
|
||||
continue
|
||||
results.append({"tag_id": tag_ids[idx], "score": float(score)})
|
||||
|
||||
tag_id = tag_ids[idx]
|
||||
tag_id_key = _as_key(tag_id) or ""
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"tag_id": tag_id,
|
||||
"score": float(score),
|
||||
}
|
||||
|
||||
meta = tag_meta_map.get(tag_id_key)
|
||||
if isinstance(meta, dict):
|
||||
label = meta.get("label")
|
||||
ttype = meta.get("tag_type")
|
||||
|
||||
if isinstance(label, str) and label.strip():
|
||||
payload["label"] = label
|
||||
if isinstance(ttype, str) and ttype.strip():
|
||||
payload["tag_type"] = ttype
|
||||
|
||||
results.append(payload)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@@ -15,6 +15,10 @@ use Symfony\Component\Uid\Uuid;
|
||||
* - TagVectorSearch (Score-Gate + Ambiguity-Check)
|
||||
* - DB Query auf document_tag + document (ACTIVE)
|
||||
* - Rückgabe als EIN Textblock (string) oder null (Fallback auf normalen Retrieval)
|
||||
*
|
||||
* Schritt-3 Änderung:
|
||||
* - Headline ist NICHT mehr hardcoded
|
||||
* - Headline basiert dynamisch auf dem gefundenen Tag
|
||||
*/
|
||||
final class EntityCatalogService
|
||||
{
|
||||
@@ -63,6 +67,10 @@ final class EntityCatalogService
|
||||
return null;
|
||||
}
|
||||
|
||||
// OPTIONAL: Falls TagVectorSearchClient künftig tag_label zurückliefert,
|
||||
// kann das hier direkt verwendet werden.
|
||||
$tagLabel = isset($best['tag_label']) ? (string)$best['tag_label'] : null;
|
||||
|
||||
// 3) DB Query: alle ACTIVE Dokumente zu diesem Tag
|
||||
$rows = $this->connection->fetchAllAssociative(
|
||||
'
|
||||
@@ -95,18 +103,24 @@ final class EntityCatalogService
|
||||
return null;
|
||||
}
|
||||
|
||||
return $this->buildTextBlock($entityTerm, $titles);
|
||||
return $this->buildTextBlock($tagLabel, $titles);
|
||||
}
|
||||
|
||||
private function buildTextBlock(string $entityTerm, array $titles): string
|
||||
/**
|
||||
* Dynamische Headline:
|
||||
* - Wenn Tag-Label vorhanden → verwenden
|
||||
* - Sonst generischer Fallback
|
||||
*/
|
||||
private function buildTextBlock(?string $tagLabel, array $titles): string
|
||||
{
|
||||
$headline = match ($entityTerm) {
|
||||
'geräte' => 'Folgende Geräte sind verfügbar:',
|
||||
'indikatoren' => 'Folgende Indikatoren sind verfügbar:',
|
||||
'funktionen' => 'Folgende Funktionen sind verfügbar:',
|
||||
'zubehör' => 'Folgendes Zubehör ist verfügbar:',
|
||||
default => 'Folgende Einträge sind verfügbar:',
|
||||
};
|
||||
$headline = 'Folgende Einträge sind verfügbar:';
|
||||
|
||||
if (\is_string($tagLabel) && \trim($tagLabel) !== '') {
|
||||
$headline = sprintf(
|
||||
'Folgende %s sind verfügbar:',
|
||||
$tagLabel
|
||||
);
|
||||
}
|
||||
|
||||
$lines = [];
|
||||
foreach ($titles as $title) {
|
||||
|
||||
@@ -46,7 +46,8 @@ final class TagController extends AbstractController
|
||||
(string)$request->request->get('label', ''),
|
||||
$request->request->get('description')
|
||||
? (string)$request->request->get('description')
|
||||
: null
|
||||
: null,
|
||||
(string)$request->request->get('type', 'generic') // NEU
|
||||
);
|
||||
|
||||
$this->addFlash('success', 'Tag wurde erstellt.');
|
||||
|
||||
@@ -24,6 +24,14 @@ class Tag
|
||||
#[ORM\Column(type: 'text', nullable: true)]
|
||||
private ?string $description = null;
|
||||
|
||||
/**
|
||||
* NEU: Governance-Typ des Tags
|
||||
* - generic
|
||||
* - catalog_entity
|
||||
*/
|
||||
#[ORM\Column(length: 50)]
|
||||
private string $type = 'generic';
|
||||
|
||||
#[ORM\Column]
|
||||
private \DateTimeImmutable $createdAt;
|
||||
|
||||
@@ -75,6 +83,18 @@ class Tag
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
 * Returns the type string currently stored on this tag.
 */
public function getType(): string
{
    return $this->type;
}
|
||||
|
||||
/**
 * Sets the tag type.
 *
 * The value is trimmed first; if nothing remains, 'generic' is stored instead.
 *
 * @return static this instance, for method chaining
 */
public function setType(string $type): static
{
    $normalized = trim($type);

    if ($normalized === '') {
        $normalized = 'generic';
    }

    $this->type = $normalized;

    return $this;
}
|
||||
|
||||
public function getCreatedAt(): \DateTimeImmutable
|
||||
{
|
||||
return $this->createdAt;
|
||||
|
||||
@@ -4,27 +4,10 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Intent;
|
||||
|
||||
/**
|
||||
* CatalogIntentLite
|
||||
*
|
||||
* Minimal, deterministische Erkennung von Katalog-/Entity-Listenanfragen.
|
||||
*
|
||||
* Ziel:
|
||||
* - "Liste aller Geräte" / "Welche Indikatoren gibt es?" / "Zeig mir alle Funktionen"
|
||||
*
|
||||
* Guardrails:
|
||||
* - Kein Catalog-Mode bei Sales-/Pricing-/Comparison-/ROI-/Implementation-/Objection-Intents.
|
||||
* - Kein Catalog-Mode ohne expliziten Entity-Term.
|
||||
*
|
||||
* WICHTIG:
|
||||
* - Immer mit ORIGINAL-Prompt aufrufen.
|
||||
* - Kein LLM, kein ML.
|
||||
*/
|
||||
use App\Tag\TagVectorSearchClient;
|
||||
|
||||
final class CatalogIntentLite
|
||||
{
|
||||
/**
|
||||
* Listensignale (leichtgewichtig) – IntentLite bleibt weiterhin für "allgemeine" List Detection zuständig.
|
||||
*/
|
||||
private const LIST_SIGNALS = [
|
||||
'liste',
|
||||
'auflisten',
|
||||
@@ -39,100 +22,76 @@ final class CatalogIntentLite
|
||||
'alle',
|
||||
];
|
||||
|
||||
/**
|
||||
* Entity-Terms, die wir als Katalogtypen unterstützen.
|
||||
*
|
||||
* Left side: canonical term (für Tag-Suche)
|
||||
* Right side: Such-Synonyme, die im Prompt vorkommen dürfen.
|
||||
*/
|
||||
private const ENTITY_TERMS = [
|
||||
'geräte' => ['gerät', 'geräte', 'geraet', 'geraete', 'device', 'devices'],
|
||||
'indikatoren' => ['indikator', 'indikatoren', 'indicator', 'indicators'],
|
||||
'funktionen' => ['funktion', 'funktionen', 'feature', 'features', 'funktionalität', 'funktionalitaet'],
|
||||
'zubehör' => ['zubehör', 'zubehoer', 'accessory', 'accessories', 'zubehor'],
|
||||
];
|
||||
private const MIN_SCORE = 0.60;
|
||||
private const AMBIGUITY_DELTA = 0.05;
|
||||
|
||||
public function __construct(
|
||||
private readonly SalesIntentLite $salesIntentLite,
|
||||
private readonly TagVectorSearchClient $tagVectorClient,
|
||||
) {}
|
||||
|
||||
/**
|
||||
* @return string|null canonical entity term (z. B. "geräte") oder null wenn kein Catalog-Intent.
|
||||
*/
|
||||
public function detect(string $originalPrompt): ?string
|
||||
public function detect(string $prompt): ?string
|
||||
{
|
||||
$p = $this->normalize($originalPrompt);
|
||||
$normalizedPrompt = mb_strtolower($prompt);
|
||||
|
||||
// 1) Muss ein Listen-Signal enthalten
|
||||
if (!$this->containsAny($p, self::LIST_SIGNALS)) {
|
||||
// 1) Muss Listen-Signal enthalten
|
||||
if (!$this->containsAny($normalizedPrompt, self::LIST_SIGNALS)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// 2) Guardrail: Kein Catalog-Mode bei Sales-Intents
|
||||
$sales = $this->salesIntentLite->detect($originalPrompt);
|
||||
// 2) Guardrail: Nur DISCOVERY
|
||||
$sales = $this->salesIntentLite->detect($prompt);
|
||||
$intent = (string)($sales['intent'] ?? SalesIntentLite::DISCOVERY);
|
||||
|
||||
if ($intent !== SalesIntentLite::DISCOVERY) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// 3) Expliziten Entity-Term extrahieren (sonst kein Catalog)
|
||||
foreach (self::ENTITY_TERMS as $canonical => $synonyms) {
|
||||
foreach ($synonyms as $syn) {
|
||||
if ($this->containsWord($p, $syn)) {
|
||||
return $canonical;
|
||||
}
|
||||
}
|
||||
}
|
||||
// 3) Vector-basierte Tag-Suche (Top 3 für Ambiguity-Check)
|
||||
$hits = $this->tagVectorClient->search($prompt, 3);
|
||||
|
||||
if ($hits === []) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// Helpers
|
||||
// ------------------------------------------------------------
|
||||
$best = $hits[0];
|
||||
$bestScore = (float)($best['score'] ?? 0.0);
|
||||
|
||||
if ($bestScore < self::MIN_SCORE) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Ambiguity-Check
|
||||
if (isset($hits[1])) {
|
||||
$secondScore = (float)($hits[1]['score'] ?? 0.0);
|
||||
if (abs($bestScore - $secondScore) < self::AMBIGUITY_DELTA) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// 4) Nur catalog_entity zulassen
|
||||
if (($best['tag_type'] ?? null) !== 'catalog_entity') {
|
||||
return null;
|
||||
}
|
||||
|
||||
// 5) Canonical Label zurückgeben
|
||||
$label = (string)($best['label'] ?? '');
|
||||
|
||||
if ($label === '') {
|
||||
return null;
|
||||
}
|
||||
|
||||
return mb_strtolower($label);
|
||||
}
|
||||
|
||||
private function containsAny(string $haystack, array $needles): bool
|
||||
{
|
||||
foreach ($needles as $needle) {
|
||||
if ($needle === '') {
|
||||
continue;
|
||||
}
|
||||
if (str_contains($haystack, $needle)) {
|
||||
if ($needle !== '' && str_contains($haystack, $needle)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private function containsWord(string $haystack, string $word): bool
|
||||
{
|
||||
$word = trim($word);
|
||||
if ($word === '') {
|
||||
return false;
|
||||
}
|
||||
return preg_match('/\b' . preg_quote($word, '/') . '\b/u', $haystack) === 1;
|
||||
}
|
||||
|
||||
private function normalize(string $s): string
|
||||
{
|
||||
$s = mb_strtolower($s);
|
||||
|
||||
// Umlaute absichern (analog IntentLite/SalesIntentLite)
|
||||
$replacements = [
|
||||
'ä' => 'ae',
|
||||
'ö' => 'oe',
|
||||
'ü' => 'ue',
|
||||
'ß' => 'ss',
|
||||
];
|
||||
|
||||
foreach ($replacements as $umlaut => $alt) {
|
||||
if (str_contains($s, $umlaut)) {
|
||||
$s .= ' ' . str_replace($umlaut, $alt, $s);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return $s;
|
||||
}
|
||||
}
|
||||
@@ -4,7 +4,6 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Catalog\EntityCatalogService;
|
||||
use App\Entity\ModelGenerationConfig;
|
||||
use App\Intent\CatalogIntentLite;
|
||||
use App\Intent\IntentLite;
|
||||
@@ -13,6 +12,9 @@ use App\Knowledge\QueryCleaner;
|
||||
use App\Repository\ModelGenerationConfigRepository;
|
||||
use App\Tag\TagRoutingService;
|
||||
use App\Vector\VectorSearchClient;
|
||||
use App\Catalog\EntityCatalogService;
|
||||
use App\Knowledge\Retrieval\NdjsonChunkLookup;
|
||||
use App\Knowledge\Retrieval\RetrieverInterface;
|
||||
|
||||
final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
@@ -307,7 +309,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
float $threshold,
|
||||
bool $boost = false,
|
||||
bool $captureRaw = false
|
||||
): void {
|
||||
): void
|
||||
{
|
||||
$rank = 0;
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
|
||||
@@ -11,12 +11,12 @@ use App\Service\TagRebuildJobService;
|
||||
use App\Tag\TagService;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
|
||||
final class TagAdminService
|
||||
final readonly class TagAdminService
|
||||
{
|
||||
public function __construct(
|
||||
private readonly EntityManagerInterface $em,
|
||||
private readonly TagService $tagService,
|
||||
private readonly TagRebuildJobService $jobs,
|
||||
private EntityManagerInterface $em,
|
||||
private TagService $tagService,
|
||||
private TagRebuildJobService $jobs,
|
||||
) {}
|
||||
|
||||
public function getIndexData(): array
|
||||
@@ -31,9 +31,13 @@ final class TagAdminService
|
||||
];
|
||||
}
|
||||
|
||||
public function create(string $slug, string $label, ?string $description): void
|
||||
{
|
||||
$this->tagService->create($slug, $label, $description);
|
||||
public function create(
|
||||
string $slug,
|
||||
string $label,
|
||||
?string $description,
|
||||
string $type = 'generic' // NEU
|
||||
): void {
|
||||
$this->tagService->create($slug, $label, $description, $type);
|
||||
}
|
||||
|
||||
public function delete(string $id): void
|
||||
|
||||
@@ -19,7 +19,12 @@ final readonly class TagNdjsonExporter
|
||||
* Export all tags into NDJSON (streaming) with atomic switch (.tmp + rename()).
|
||||
*
|
||||
* Line format:
|
||||
* {"tag_id":"...","text":"label\nslug\noptional description","document_ids":["...","..."]}
|
||||
* {
|
||||
* "tag_id":"...",
|
||||
* "text":"label\nslug\noptional description",
|
||||
* "type":"catalog_entity|generic|...",
|
||||
* "document_ids":["...","..."]
|
||||
* }
|
||||
*
|
||||
* @return array{tags:int, lines:int, bytes:int, path:string}
|
||||
*/
|
||||
@@ -37,12 +42,7 @@ final readonly class TagNdjsonExporter
|
||||
throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Fetch tags (small) + join document ids (can be bigger) efficiently.
|
||||
// We avoid repositories and keep it DB-agnostic via DQL/QB.
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
// 1) Load all tags (id, slug, label, description)
|
||||
// 1) Load all tags
|
||||
$tags = $this->em->createQueryBuilder()
|
||||
->select('t')
|
||||
->from(Tag::class, 't')
|
||||
@@ -52,8 +52,6 @@ final readonly class TagNdjsonExporter
|
||||
|
||||
if (!\is_array($tags) || $tags === []) {
|
||||
\fclose($fh);
|
||||
|
||||
// Write empty file atomically
|
||||
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
|
||||
|
||||
return [
|
||||
@@ -64,8 +62,7 @@ final readonly class TagNdjsonExporter
|
||||
];
|
||||
}
|
||||
|
||||
// 2) Build tagId => docIds map from document_tag
|
||||
// We query pairs (tag_id, document_id) in one go.
|
||||
// 2) Build tagId => docIds map
|
||||
$rows = $this->em->createQueryBuilder()
|
||||
->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
|
||||
->from(DocumentTag::class, 'dt')
|
||||
@@ -82,7 +79,7 @@ final readonly class TagNdjsonExporter
|
||||
$tagToDocs[$tagId][] = $docId;
|
||||
}
|
||||
|
||||
// 3) Stream NDJSON lines
|
||||
// 3) Stream NDJSON
|
||||
$lines = 0;
|
||||
|
||||
foreach ($tags as $tag) {
|
||||
@@ -93,13 +90,11 @@ final readonly class TagNdjsonExporter
|
||||
$tagId = (string) $tag->getId();
|
||||
$docIds = $tagToDocs[$tagId] ?? [];
|
||||
|
||||
// de-dupe docIds for safety
|
||||
if ($docIds !== []) {
|
||||
$docIds = \array_values(\array_unique($docIds));
|
||||
}
|
||||
|
||||
// "text" is the embedding source for tag vectors later:
|
||||
// Keep it short but semantically useful.
|
||||
// Embedding source
|
||||
$textParts = [
|
||||
$tag->getLabel(),
|
||||
$tag->getSlug(),
|
||||
@@ -110,15 +105,23 @@ final readonly class TagNdjsonExporter
|
||||
$textParts[] = \trim($desc);
|
||||
}
|
||||
|
||||
$type = method_exists($tag, 'getType')
|
||||
? (string) $tag->getType()
|
||||
: 'generic';
|
||||
|
||||
if ($type === '') {
|
||||
$type = 'generic';
|
||||
}
|
||||
|
||||
$line = [
|
||||
'tag_id' => $tagId,
|
||||
'text' => \implode("\n", $textParts),
|
||||
'type' => $type, // 🔥 NEW
|
||||
'document_ids' => $docIds,
|
||||
];
|
||||
|
||||
$json = \json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
|
||||
if (!\is_string($json)) {
|
||||
// skip invalid line but keep export running
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -127,7 +130,6 @@ final readonly class TagNdjsonExporter
|
||||
}
|
||||
|
||||
\fclose($fh);
|
||||
|
||||
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
|
||||
|
||||
return [
|
||||
@@ -140,13 +142,11 @@ final readonly class TagNdjsonExporter
|
||||
|
||||
private function atomicReplace(string $tmpPath, string $finalPath): void
|
||||
{
|
||||
// Ensure old file can be replaced on Windows-like FS too (best effort)
|
||||
if (\is_file($finalPath)) {
|
||||
@\chmod($finalPath, 0664);
|
||||
}
|
||||
|
||||
if (!@\rename($tmpPath, $finalPath)) {
|
||||
// if rename fails, try copy+unlink fallback
|
||||
if (!@\copy($tmpPath, $finalPath)) {
|
||||
@\unlink($tmpPath);
|
||||
throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);
|
||||
|
||||
@@ -21,8 +21,12 @@ final readonly class TagService
|
||||
// TAG CREATE
|
||||
// =========================================================
|
||||
|
||||
public function create(string $slug, string $label, ?string $description = null): Tag
|
||||
{
|
||||
public function create(
|
||||
string $slug,
|
||||
string $label,
|
||||
?string $description = null,
|
||||
string $type = 'generic' // NEU
|
||||
): Tag {
|
||||
$slug = trim($slug);
|
||||
$label = trim($label);
|
||||
|
||||
@@ -35,6 +39,7 @@ final readonly class TagService
|
||||
}
|
||||
|
||||
$tag = new Tag($slug, $label, $description);
|
||||
$tag->setType($type); // NEU
|
||||
|
||||
$this->em->persist($tag);
|
||||
$this->em->flush();
|
||||
@@ -71,10 +76,6 @@ final readonly class TagService
|
||||
// DOCUMENT TAG SYNC
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* Synchronisiert alle Tags eines Dokuments.
|
||||
* Löst einen Rebuild aus, da document_ids Teil des NDJSON sind.
|
||||
*/
|
||||
public function syncDocumentTags(Document $document, array $newTagIds): void
|
||||
{
|
||||
$newTagIds = array_unique($newTagIds);
|
||||
@@ -114,10 +115,6 @@ final readonly class TagService
|
||||
// TAG → DOCUMENT SYNC (Bulk Assign)
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* Synchronisiert alle Dokumente eines Tags.
|
||||
* Löst einen Rebuild aus, da document_ids Teil des NDJSON sind.
|
||||
*/
|
||||
public function syncTagDocuments(Tag $tag, array $newDocumentIds): void
|
||||
{
|
||||
$newDocumentIds = array_unique($newDocumentIds);
|
||||
|
||||
27
src/Tag/TagTypes.php
Normal file
27
src/Tag/TagTypes.php
Normal file
@@ -0,0 +1,27 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Tag;
|
||||
|
||||
/**
 * Central definition of all permitted tag types.
 *
 * Keeps tag-type magic strings out of the rest of the code base.
 * The class is a pure constant holder and cannot be instantiated.
 */
final class TagTypes
{
    public const GENERIC = 'generic';
    public const CATALOG_ENTITY = 'catalog_entity';
    public const SALES_SIGNAL = 'sales_signal';

    /**
     * Display label => stored value, in declaration order.
     *
     * @return array<string, string>
     */
    public static function choices(): array
    {
        return [
            'Generic' => self::GENERIC,
            'Catalog Entity' => self::CATALOG_ENTITY,
            'Sales Signal' => self::SALES_SIGNAL,
        ];
    }

    /**
     * All permitted stored values, without their display labels.
     *
     * @return list<string>
     */
    public static function values(): array
    {
        return array_values(self::choices());
    }

    /**
     * Tells whether $type is exactly one of the permitted stored values.
     * The comparison is strict and case-sensitive; display labels do not count.
     */
    public static function isValid(string $type): bool
    {
        return in_array($type, self::values(), true);
    }

    private function __construct() {}
}
|
||||
@@ -11,7 +11,6 @@ final readonly class TagVectorSearchClient
|
||||
{
|
||||
/**
|
||||
* Minimum similarity score required for a tag to be considered.
|
||||
* Acts as a confidence gate to avoid noisy routing.
|
||||
*/
|
||||
private const MIN_SCORE = 0.4;
|
||||
|
||||
@@ -29,7 +28,22 @@ final readonly class TagVectorSearchClient
|
||||
/**
|
||||
* Executes a vector search against the Python tag index.
|
||||
*
|
||||
* @return array<int, array{tag_id:string, score:float}>
|
||||
* Expected response rows:
|
||||
* [
|
||||
* {
|
||||
* "tag_id": "...",
|
||||
* "score": 0.73,
|
||||
* "label": "Geräte", // optional (new)
|
||||
* "tag_type": "catalog_entity" // optional (new)
|
||||
* }
|
||||
* ]
|
||||
*
|
||||
* @return array<int, array{
|
||||
* tag_id:string,
|
||||
* score:float,
|
||||
* label?:string,
|
||||
* tag_type?:string
|
||||
* }>
|
||||
*/
|
||||
public function search(string $query, int $limit = 8): array
|
||||
{
|
||||
@@ -94,15 +108,26 @@ final readonly class TagVectorSearchClient
|
||||
|
||||
$score = (float) $score;
|
||||
|
||||
// 🔥 Confidence Gate
|
||||
if ($score < self::MIN_SCORE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$hits[] = [
|
||||
$hit = [
|
||||
'tag_id' => $tagId,
|
||||
'score' => $score,
|
||||
];
|
||||
|
||||
// Optional: label
|
||||
if (isset($row['label']) && is_string($row['label'])) {
|
||||
$hit['label'] = $row['label'];
|
||||
}
|
||||
|
||||
// Optional: tag_type
|
||||
if (isset($row['tag_type']) && is_string($row['tag_type'])) {
|
||||
$hit['tag_type'] = $row['tag_type'];
|
||||
}
|
||||
|
||||
$hits[] = $hit;
|
||||
}
|
||||
|
||||
return $hits;
|
||||
|
||||
@@ -157,6 +157,15 @@
|
||||
placeholder="Optional"/>
|
||||
</div>
|
||||
|
||||
<div class="mb-3">
|
||||
<label class="form-label">Type</label>
|
||||
<select name="type" class="form-select">
|
||||
<option value="generic">Generic</option>
|
||||
<option value="catalog_entity">Catalog Entity</option>
|
||||
<option value="sales_signal">Sales Signal</option>
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<div class="col-md-2 d-grid align-items-end">
|
||||
<button class="btn btn-sm btn-outline-info">
|
||||
Anlegen
|
||||
|
||||
Reference in New Issue
Block a user