optimize catalog semantic matches by tags

This commit is contained in:
team2
2026-02-28 16:10:47 +01:00
parent d3294464ea
commit 0d3f6e21d6
13 changed files with 329 additions and 151 deletions

View File

@@ -19,7 +19,12 @@ final readonly class TagNdjsonExporter
* Export all tags into NDJSON (streaming) with atomic switch (.tmp + rename()).
*
* Line format:
* {"tag_id":"...","text":"label\nslug\noptional description","document_ids":["...","..."]}
* {
* "tag_id":"...",
* "text":"label\nslug\noptional description",
* "type":"catalog_entity|generic|...",
* "document_ids":["...","..."]
* }
*
* @return array{tags:int, lines:int, bytes:int, path:string}
*/
@@ -37,12 +42,7 @@ final readonly class TagNdjsonExporter
throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
}
// ------------------------------------------------------------------
// Fetch tags (small) + join document ids (can be bigger) efficiently.
// We avoid repositories and keep it DB-agnostic via DQL/QB.
// ------------------------------------------------------------------
// 1) Load all tags (id, slug, label, description)
// 1) Load all tags
$tags = $this->em->createQueryBuilder()
->select('t')
->from(Tag::class, 't')
@@ -52,8 +52,6 @@ final readonly class TagNdjsonExporter
if (!\is_array($tags) || $tags === []) {
\fclose($fh);
// Write empty file atomically
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
return [
@@ -64,8 +62,7 @@ final readonly class TagNdjsonExporter
];
}
// 2) Build tagId => docIds map from document_tag
// We query pairs (tag_id, document_id) in one go.
// 2) Build tagId => docIds map
$rows = $this->em->createQueryBuilder()
->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
->from(DocumentTag::class, 'dt')
@@ -82,7 +79,7 @@ final readonly class TagNdjsonExporter
$tagToDocs[$tagId][] = $docId;
}
// 3) Stream NDJSON lines
// 3) Stream NDJSON
$lines = 0;
foreach ($tags as $tag) {
@@ -93,13 +90,11 @@ final readonly class TagNdjsonExporter
$tagId = (string) $tag->getId();
$docIds = $tagToDocs[$tagId] ?? [];
// de-dupe docIds for safety
if ($docIds !== []) {
$docIds = \array_values(\array_unique($docIds));
}
// "text" is the embedding source for tag vectors later:
// Keep it short but semantically useful.
// Embedding source
$textParts = [
$tag->getLabel(),
$tag->getSlug(),
@@ -110,15 +105,23 @@ final readonly class TagNdjsonExporter
$textParts[] = \trim($desc);
}
$type = method_exists($tag, 'getType')
? (string) $tag->getType()
: 'generic';
if ($type === '') {
$type = 'generic';
}
$line = [
'tag_id' => $tagId,
'text' => \implode("\n", $textParts),
'type' => $type, // 🔥 NEW
'document_ids' => $docIds,
];
$json = \json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
if (!\is_string($json)) {
// skip invalid line but keep export running
continue;
}
@@ -127,7 +130,6 @@ final readonly class TagNdjsonExporter
}
\fclose($fh);
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
return [
@@ -140,13 +142,11 @@ final readonly class TagNdjsonExporter
private function atomicReplace(string $tmpPath, string $finalPath): void
{
// Ensure old file can be replaced on Windows-like FS too (best effort)
if (\is_file($finalPath)) {
@\chmod($finalPath, 0664);
}
if (!@\rename($tmpPath, $finalPath)) {
// if rename fails, try copy+unlink fallback
if (!@\copy($tmpPath, $finalPath)) {
@\unlink($tmpPath);
throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);

View File

@@ -21,8 +21,12 @@ final readonly class TagService
// TAG CREATE
// =========================================================
public function create(string $slug, string $label, ?string $description = null): Tag
{
public function create(
string $slug,
string $label,
?string $description = null,
string $type = 'generic' // NEU
): Tag {
$slug = trim($slug);
$label = trim($label);
@@ -35,6 +39,7 @@ final readonly class TagService
}
$tag = new Tag($slug, $label, $description);
$tag->setType($type); // NEU
$this->em->persist($tag);
$this->em->flush();
@@ -71,10 +76,6 @@ final readonly class TagService
// DOCUMENT TAG SYNC
// =========================================================
/**
* Synchronisiert alle Tags eines Dokuments.
* Löst einen Rebuild aus, da document_ids Teil des NDJSON sind.
*/
public function syncDocumentTags(Document $document, array $newTagIds): void
{
$newTagIds = array_unique($newTagIds);
@@ -114,10 +115,6 @@ final readonly class TagService
// TAG → DOCUMENT SYNC (Bulk Assign)
// =========================================================
/**
* Synchronisiert alle Dokumente eines Tags.
* Löst einen Rebuild aus, da document_ids Teil des NDJSON sind.
*/
public function syncTagDocuments(Tag $tag, array $newDocumentIds): void
{
$newDocumentIds = array_unique($newDocumentIds);

27
src/Tag/TagTypes.php Normal file
View File

@@ -0,0 +1,27 @@
<?php
declare(strict_types=1);
namespace App\Tag;
/**
 * Central definition of every allowed tag type.
 *
 * Referencing these constants keeps magic strings out of the code.
 * The class only holds constants and a static lookup; its constructor is
 * private, so it cannot be instantiated from outside.
 */
final class TagTypes
{
    public const GENERIC = 'generic';

    public const CATALOG_ENTITY = 'catalog_entity';

    public const SALES_SIGNAL = 'sales_signal';

    /**
     * Not instantiable: this class is a pure constant holder.
     */
    private function __construct()
    {
    }

    /**
     * Returns the tag types keyed by a human-readable label.
     *
     * @return array<string, string> label => type value, in declaration order
     */
    public static function choices(): array
    {
        $byLabel = [];

        $byLabel['Generic'] = self::GENERIC;
        $byLabel['Catalog Entity'] = self::CATALOG_ENTITY;
        $byLabel['Sales Signal'] = self::SALES_SIGNAL;

        return $byLabel;
    }
}

View File

@@ -11,7 +11,6 @@ final readonly class TagVectorSearchClient
{
/**
* Minimum similarity score required for a tag to be considered.
* Acts as a confidence gate to avoid noisy routing.
*/
private const MIN_SCORE = 0.4;
@@ -29,7 +28,22 @@ final readonly class TagVectorSearchClient
/**
* Executes a vector search against the Python tag index.
*
* @return array<int, array{tag_id:string, score:float}>
* Expected response rows:
* [
* {
* "tag_id": "...",
* "score": 0.73,
* "label": "Geräte", // optional (new)
* "tag_type": "catalog_entity" // optional (new)
* }
* ]
*
* @return array<int, array{
* tag_id:string,
* score:float,
* label?:string,
* tag_type?:string
* }>
*/
public function search(string $query, int $limit = 8): array
{
@@ -94,15 +108,26 @@ final readonly class TagVectorSearchClient
$score = (float) $score;
// 🔥 Confidence Gate
if ($score < self::MIN_SCORE) {
continue;
}
$hits[] = [
$hit = [
'tag_id' => $tagId,
'score' => $score,
];
// Optional: label
if (isset($row['label']) && is_string($row['label'])) {
$hit['label'] = $row['label'];
}
// Optional: tag_type
if (isset($row['tag_type']) && is_string($row['tag_type'])) {
$hit['tag_type'] = $row['tag_type'];
}
$hits[] = $hit;
}
return $hits;