optimize catalog semantic match by tags
This commit is contained in:
@@ -19,7 +19,12 @@ final readonly class TagNdjsonExporter
|
||||
* Export all tags into NDJSON (streaming) with atomic switch (.tmp + rename()).
|
||||
*
|
||||
* Line format:
|
||||
* {"tag_id":"...","text":"label\nslug\noptional description","document_ids":["...","..."]}
|
||||
* {
|
||||
* "tag_id":"...",
|
||||
* "text":"label\nslug\noptional description",
|
||||
* "type":"catalog_entity|generic|...",
|
||||
* "document_ids":["...","..."]
|
||||
* }
|
||||
*
|
||||
* @return array{tags:int, lines:int, bytes:int, path:string}
|
||||
*/
|
||||
@@ -37,12 +42,7 @@ final readonly class TagNdjsonExporter
|
||||
throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Fetch tags (small) + join document ids (can be bigger) efficiently.
|
||||
// We avoid repositories and keep it DB-agnostic via DQL/QB.
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
// 1) Load all tags (id, slug, label, description)
|
||||
// 1) Load all tags
|
||||
$tags = $this->em->createQueryBuilder()
|
||||
->select('t')
|
||||
->from(Tag::class, 't')
|
||||
@@ -52,8 +52,6 @@ final readonly class TagNdjsonExporter
|
||||
|
||||
if (!\is_array($tags) || $tags === []) {
|
||||
\fclose($fh);
|
||||
|
||||
// Write empty file atomically
|
||||
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
|
||||
|
||||
return [
|
||||
@@ -64,8 +62,7 @@ final readonly class TagNdjsonExporter
|
||||
];
|
||||
}
|
||||
|
||||
// 2) Build tagId => docIds map from document_tag
|
||||
// We query pairs (tag_id, document_id) in one go.
|
||||
// 2) Build tagId => docIds map
|
||||
$rows = $this->em->createQueryBuilder()
|
||||
->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
|
||||
->from(DocumentTag::class, 'dt')
|
||||
@@ -82,7 +79,7 @@ final readonly class TagNdjsonExporter
|
||||
$tagToDocs[$tagId][] = $docId;
|
||||
}
|
||||
|
||||
// 3) Stream NDJSON lines
|
||||
// 3) Stream NDJSON
|
||||
$lines = 0;
|
||||
|
||||
foreach ($tags as $tag) {
|
||||
@@ -93,13 +90,11 @@ final readonly class TagNdjsonExporter
|
||||
$tagId = (string) $tag->getId();
|
||||
$docIds = $tagToDocs[$tagId] ?? [];
|
||||
|
||||
// de-dupe docIds for safety
|
||||
if ($docIds !== []) {
|
||||
$docIds = \array_values(\array_unique($docIds));
|
||||
}
|
||||
|
||||
// "text" is the embedding source for tag vectors later:
|
||||
// Keep it short but semantically useful.
|
||||
// Embedding source
|
||||
$textParts = [
|
||||
$tag->getLabel(),
|
||||
$tag->getSlug(),
|
||||
@@ -110,15 +105,23 @@ final readonly class TagNdjsonExporter
|
||||
$textParts[] = \trim($desc);
|
||||
}
|
||||
|
||||
$type = method_exists($tag, 'getType')
|
||||
? (string) $tag->getType()
|
||||
: 'generic';
|
||||
|
||||
if ($type === '') {
|
||||
$type = 'generic';
|
||||
}
|
||||
|
||||
$line = [
|
||||
'tag_id' => $tagId,
|
||||
'text' => \implode("\n", $textParts),
|
||||
'type' => $type, // 🔥 NEW
|
||||
'document_ids' => $docIds,
|
||||
];
|
||||
|
||||
$json = \json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
|
||||
if (!\is_string($json)) {
|
||||
// skip invalid line but keep export running
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -127,7 +130,6 @@ final readonly class TagNdjsonExporter
|
||||
}
|
||||
|
||||
\fclose($fh);
|
||||
|
||||
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
|
||||
|
||||
return [
|
||||
@@ -140,13 +142,11 @@ final readonly class TagNdjsonExporter
|
||||
|
||||
private function atomicReplace(string $tmpPath, string $finalPath): void
|
||||
{
|
||||
// Ensure old file can be replaced on Windows-like FS too (best effort)
|
||||
if (\is_file($finalPath)) {
|
||||
@\chmod($finalPath, 0664);
|
||||
}
|
||||
|
||||
if (!@\rename($tmpPath, $finalPath)) {
|
||||
// if rename fails, try copy+unlink fallback
|
||||
if (!@\copy($tmpPath, $finalPath)) {
|
||||
@\unlink($tmpPath);
|
||||
throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);
|
||||
|
||||
@@ -21,8 +21,12 @@ final readonly class TagService
|
||||
// TAG CREATE
|
||||
// =========================================================
|
||||
|
||||
public function create(string $slug, string $label, ?string $description = null): Tag
|
||||
{
|
||||
public function create(
|
||||
string $slug,
|
||||
string $label,
|
||||
?string $description = null,
|
||||
string $type = 'generic' // NEU
|
||||
): Tag {
|
||||
$slug = trim($slug);
|
||||
$label = trim($label);
|
||||
|
||||
@@ -35,6 +39,7 @@ final readonly class TagService
|
||||
}
|
||||
|
||||
$tag = new Tag($slug, $label, $description);
|
||||
$tag->setType($type); // NEU
|
||||
|
||||
$this->em->persist($tag);
|
||||
$this->em->flush();
|
||||
@@ -71,10 +76,6 @@ final readonly class TagService
|
||||
// DOCUMENT TAG SYNC
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* Synchronisiert alle Tags eines Dokuments.
|
||||
* Löst einen Rebuild aus, da document_ids Teil des NDJSON sind.
|
||||
*/
|
||||
public function syncDocumentTags(Document $document, array $newTagIds): void
|
||||
{
|
||||
$newTagIds = array_unique($newTagIds);
|
||||
@@ -114,10 +115,6 @@ final readonly class TagService
|
||||
// TAG → DOCUMENT SYNC (Bulk Assign)
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* Synchronisiert alle Dokumente eines Tags.
|
||||
* Löst einen Rebuild aus, da document_ids Teil des NDJSON sind.
|
||||
*/
|
||||
public function syncTagDocuments(Tag $tag, array $newDocumentIds): void
|
||||
{
|
||||
$newDocumentIds = array_unique($newDocumentIds);
|
||||
|
||||
27
src/Tag/TagTypes.php
Normal file
27
src/Tag/TagTypes.php
Normal file
@@ -0,0 +1,27 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Tag;
|
||||
|
||||
/**
|
||||
* Central definition of all permitted tag types.
|
||||
* Prevents magic strings in the code.
|
||||
*/
|
||||
final class TagTypes
|
||||
{
|
||||
public const GENERIC = 'generic';
|
||||
public const CATALOG_ENTITY = 'catalog_entity';
|
||||
public const SALES_SIGNAL = 'sales_signal';
|
||||
|
||||
// Returns a map of human-readable label => tag type constant value.
public static function choices(): array
|
||||
{
|
||||
return [
|
||||
'Generic' => self::GENERIC,
|
||||
'Catalog Entity' => self::CATALOG_ENTITY,
|
||||
'Sales Signal' => self::SALES_SIGNAL,
|
||||
];
|
||||
}
|
||||
|
||||
// Private constructor: the class cannot be instantiated from outside.
private function __construct() {}
|
||||
}
|
||||
@@ -11,7 +11,6 @@ final readonly class TagVectorSearchClient
|
||||
{
|
||||
/**
|
||||
* Minimum similarity score required for a tag to be considered.
|
||||
* Acts as a confidence gate to avoid noisy routing.
|
||||
*/
|
||||
private const MIN_SCORE = 0.4;
|
||||
|
||||
@@ -29,7 +28,22 @@ final readonly class TagVectorSearchClient
|
||||
/**
|
||||
* Executes a vector search against the Python tag index.
|
||||
*
|
||||
* @return array<int, array{tag_id:string, score:float}>
|
||||
* Expected response rows:
|
||||
* [
|
||||
* {
|
||||
* "tag_id": "...",
|
||||
* "score": 0.73,
|
||||
* "label": "Geräte", // optional (new)
|
||||
* "tag_type": "catalog_entity" // optional (new)
|
||||
* }
|
||||
* ]
|
||||
*
|
||||
* @return array<int, array{
|
||||
* tag_id:string,
|
||||
* score:float,
|
||||
* label?:string,
|
||||
* tag_type?:string
|
||||
* }>
|
||||
*/
|
||||
public function search(string $query, int $limit = 8): array
|
||||
{
|
||||
@@ -94,15 +108,26 @@ final readonly class TagVectorSearchClient
|
||||
|
||||
$score = (float) $score;
|
||||
|
||||
// 🔥 Confidence Gate
|
||||
if ($score < self::MIN_SCORE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$hits[] = [
|
||||
$hit = [
|
||||
'tag_id' => $tagId,
|
||||
'score' => $score,
|
||||
];
|
||||
|
||||
// Optional: label
|
||||
if (isset($row['label']) && is_string($row['label'])) {
|
||||
$hit['label'] = $row['label'];
|
||||
}
|
||||
|
||||
// Optional: tag_type
|
||||
if (isset($row['tag_type']) && is_string($row['tag_type'])) {
|
||||
$hit['tag_type'] = $row['tag_type'];
|
||||
}
|
||||
|
||||
$hits[] = $hit;
|
||||
}
|
||||
|
||||
return $hits;
|
||||
|
||||
Reference in New Issue
Block a user