first commit
This commit is contained in:
@@ -4,6 +4,7 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Tag;
|
||||
|
||||
use App\Entity\Document;
|
||||
use App\Entity\DocumentTag;
|
||||
use App\Entity\Tag;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
@@ -12,148 +13,199 @@ final readonly class TagNdjsonExporter
|
||||
{
|
||||
public function __construct(
|
||||
private EntityManagerInterface $em,
|
||||
private string $tagsNdjsonPath,
|
||||
) {}
|
||||
private string $tagsNdjsonPath,
|
||||
) {
|
||||
}
|
||||
|
||||
/**
 * Export all relevant tags into NDJSON (streaming) with atomic switch (.tmp + rename()).
 *
 * Line format:
 * {
 *   "tag_id":"...",
 *   "text":"label\nslug\noptional description",
 *   "type":"catalog_entity|generic|sales_signal",
 *   "document_ids":["...","..."]
 * }
 *
 * Only ACTIVE document assignments are exported. Tags without active document
 * assignments are intentionally skipped so they do not influence retrieval.
 *
 * The temporary file is removed again whenever the export fails, and the
 * final file is only replaced after every line has been written.
 *
 * @return array{tags:int, lines:int, bytes:int, path:string}
 *
 * @throws \RuntimeException when the temporary file cannot be opened or written,
 *                           or when the final replace fails
 */
public function export(): array
{
    $this->ensureTargetDirectoryExists();

    $tmpPath = $this->tagsNdjsonPath . '.tmp';
    $this->cleanupTemporaryFile($tmpPath);

    $fh = @fopen($tmpPath, 'wb');

    if ($fh === false) {
        throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
    }

    $lines = 0;

    try {
        /** @var list<Tag> $tags */
        $tags = $this->em->createQueryBuilder()
            ->select('t')
            ->from(Tag::class, 't')
            ->orderBy('t.type', 'ASC')
            ->addOrderBy('t.label', 'ASC')
            ->getQuery()
            ->getResult();

        // The document map is only needed when there is at least one tag to write.
        $tagToActiveDocs = $tags === [] ? [] : $this->buildActiveDocumentMap();

        foreach ($tags as $tag) {
            $tagId = $tag->getId()->toRfc4122();
            $docIds = $tagToActiveDocs[$tagId] ?? [];

            if ($docIds === []) {
                continue;
            }

            $line = [
                'tag_id' => $tagId,
                'text' => $this->buildEmbeddingText($tag),
                'type' => TagTypes::normalize($tag->getType()),
                'document_ids' => $docIds,
            ];

            $json = json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);

            // Best effort: a single tag that cannot be encoded must not abort the export.
            if (!is_string($json)) {
                continue;
            }

            // A failed write would leave a truncated file that must never replace the live one.
            if (fwrite($fh, $json . "\n") === false) {
                throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
            }

            $lines++;
        }

        fclose($fh);
        $this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
    } catch (\Throwable $e) {
        // The handle may already be closed when atomicReplace() failed; closing it a
        // second time would raise a TypeError and hide the original exception.
        if (is_resource($fh)) {
            fclose($fh);
        }

        $this->cleanupTemporaryFile($tmpPath);

        throw $e;
    }

    return [
        'tags' => count($tags),
        'lines' => $lines,
        'bytes' => (int) @filesize($this->tagsNdjsonPath),
        'path' => $this->tagsNdjsonPath,
    ];
}
|
||||
|
||||
/**
 * Builds a map from tag id to the ids of the ACTIVE documents assigned to it.
 *
 * Both ids are rendered in RFC 4122 form. Each document list is free of
 * duplicates and sorted by document id.
 *
 * @return array<string, list<string>>
 */
private function buildActiveDocumentMap(): array
{
    $query = $this->em->createQueryBuilder()
        ->select('dt')
        ->addSelect('t', 'd')
        ->from(DocumentTag::class, 'dt')
        ->innerJoin('dt.tag', 't')
        ->innerJoin('dt.document', 'd')
        ->where('d.status = :status')
        ->setParameter('status', Document::STATUS_ACTIVE)
        ->getQuery();

    /** @var list<DocumentTag> $links */
    $links = $query->getResult();

    // Keyed by document id so repeated assignments collapse automatically.
    $docSetByTag = [];

    foreach ($links as $link) {
        $tagKey = $link->getTag()->getId()->toRfc4122();
        $docKey = $link->getDocument()->getId()->toRfc4122();

        $docSetByTag[$tagKey][$docKey] = $docKey;
    }

    return array_map(
        static function (array $docSet): array {
            ksort($docSet);

            return array_values($docSet);
        },
        $docSetByTag
    );
}
|
||||
|
||||
return [
|
||||
'tags' => \count($tags),
|
||||
'lines' => $lines,
|
||||
'bytes' => (int) @\filesize($this->tagsNdjsonPath),
|
||||
'path' => $this->tagsNdjsonPath,
|
||||
/**
 * Builds the text that is embedded for a tag.
 *
 * The result consists of the trimmed label, the trimmed slug and, when present,
 * the description with all whitespace runs collapsed to single spaces. Empty
 * and repeated parts are dropped; the remaining parts are joined by newlines.
 */
private function buildEmbeddingText(Tag $tag): string
{
    $candidates = [
        trim($tag->getLabel()),
        trim($tag->getSlug()),
    ];

    $rawDescription = trim((string) $tag->getDescription());

    if ($rawDescription !== '') {
        // Fall back to the untouched description if the regex engine reports an error.
        $candidates[] = preg_replace('/\s+/u', ' ', $rawDescription) ?? $rawDescription;
    }

    $segments = [];

    foreach ($candidates as $candidate) {
        if ($candidate === '' || in_array($candidate, $segments, true)) {
            continue;
        }

        $segments[] = $candidate;
    }

    return implode("\n", $segments);
}
|
||||
|
||||
/**
 * Makes sure the directory that will hold the NDJSON file exists.
 *
 * @throws \RuntimeException when the directory is missing and cannot be created
 */
private function ensureTargetDirectoryExists(): void
{
    $targetDir = dirname($this->tagsNdjsonPath);

    // The trailing is_dir() covers the case where mkdir() fails although the
    // directory exists by the time it is checked again.
    $available = is_dir($targetDir)
        || @mkdir($targetDir, 0775, true)
        || is_dir($targetDir);

    if (!$available) {
        throw new \RuntimeException('Cannot create tags NDJSON directory: ' . $targetDir);
    }
}
|
||||
|
||||
/**
 * Deletes the given temporary file if it exists; a failing delete is ignored.
 */
private function cleanupTemporaryFile(string $tmpPath): void
{
    if (!is_file($tmpPath)) {
        return;
    }

    @unlink($tmpPath);
}
|
||||
|
||||
/**
 * Moves the temporary file over the final file.
 *
 * rename() is tried first. If it fails, the content is copied instead and the
 * temporary file is deleted. Permissions of the final file are set to 0664 on
 * a best-effort basis before and after the switch.
 *
 * @throws \RuntimeException when neither rename() nor copy() succeeds
 */
private function atomicReplace(string $tmpPath, string $finalPath): void
{
    if (is_file($finalPath)) {
        @chmod($finalPath, 0664);
    }

    $renamed = @rename($tmpPath, $finalPath);

    if (!$renamed) {
        $copied = @copy($tmpPath, $finalPath);

        // The temporary file is removed whether or not the copy worked.
        @unlink($tmpPath);

        if (!$copied) {
            throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);
        }
    }

    @chmod($finalPath, 0664);
}
|
||||
}
|
||||
@@ -4,6 +4,7 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Tag;
|
||||
|
||||
use App\Entity\Document;
|
||||
use Doctrine\DBAL\ArrayParameterType;
|
||||
use Doctrine\DBAL\Exception;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
@@ -11,91 +12,239 @@ use Symfony\Component\Uid\Uuid;
|
||||
|
||||
final class TagRoutingService
|
||||
{
|
||||
/**
|
||||
* Number of raw tag hits requested from the vector service.
|
||||
*/
|
||||
private const DEFAULT_TOPK = 8;
|
||||
private const MIN_BEST_SCORE = 0.25;
|
||||
private const MAX_CANDIDATE_DOCS = 200;
|
||||
|
||||
/**
|
||||
* Hard minimum confidence required to activate tag-based document routing.
|
||||
*
|
||||
* This intentionally aligns with the tag vector client gate to avoid
|
||||
* misleading secondary thresholds in this class.
|
||||
*/
|
||||
private const MIN_BEST_SCORE = 0.72;
|
||||
|
||||
/**
|
||||
* Only keep tag hits that stay reasonably close to the best hit.
|
||||
* This reduces semantic spillover into weakly related document spaces.
|
||||
*/
|
||||
private const MAX_SCORE_DROP_FROM_BEST = 0.08;
|
||||
|
||||
/**
|
||||
* Maximum number of tag hits that may influence routing.
|
||||
*/
|
||||
private const MAX_ROUTING_TAGS = 5;
|
||||
|
||||
/**
|
||||
* Maximum number of candidate documents passed into scoped chunk search.
|
||||
*/
|
||||
private const MAX_CANDIDATE_DOCS = 80;
|
||||
|
||||
/**
|
||||
* Small bonus for documents matched by multiple routed tags.
|
||||
*/
|
||||
private const MULTI_TAG_BONUS_PER_EXTRA_TAG = 0.05;
|
||||
private const MAX_MULTI_TAG_BONUS = 0.15;
|
||||
|
||||
public function __construct(
|
||||
private readonly TagVectorSearchClient $tagSearch,
|
||||
private readonly EntityManagerInterface $em,
|
||||
) {}
|
||||
) {
|
||||
}
|
||||
|
||||
/**
 * Returns ordered active document ids for tag-scoped retrieval.
 *
 * The method intentionally returns only document ids so the current
 * retriever pipeline can stay unchanged.
 *
 * Each document is scored with the sum of (hit score * type weight) over the
 * routed tags assigned to it, plus a capped bonus when more than one routed
 * tag matches. Tag ids are converted to their canonical RFC 4122 form before
 * they are used as lookup keys, so they always match the ids rebuilt from the
 * database rows.
 *
 * @return list<string>|null document ids ordered by descending score (at most
 *                           MAX_CANDIDATE_DOCS), or null when tag routing
 *                           yields nothing usable
 * @throws Exception
 */
public function route(string $query): ?array
{
    $query = trim($query);

    if ($query === '') {
        return null;
    }

    $hits = $this->filterRoutingHits(
        $this->tagSearch->search($query, self::DEFAULT_TOPK)
    );

    if ($hits === []) {
        return null;
    }

    $tagBinaryIds = [];
    $tagMetaById = [];

    foreach ($hits as $hit) {
        $tagId = (string) ($hit['tag_id'] ?? '');

        if ($tagId === '') {
            continue;
        }

        try {
            $tagUuid = Uuid::fromString($tagId);
        } catch (\Throwable) {
            // Hits carrying an id that is not a valid UUID cannot be looked up.
            continue;
        }

        $tagBinaryIds[] = $tagUuid->toBinary();

        // Key by the canonical string: the rows below are mapped back through
        // Uuid::fromBinary(), which always yields that form.
        $tagMetaById[$tagUuid->toRfc4122()] = [
            'score' => (float) $hit['score'],
            'weight' => $this->resolveTypeWeight((string) $hit['tag_type']),
        ];
    }

    if ($tagBinaryIds === []) {
        return null;
    }

    // Direct DBAL query so the binary(16) ids can be bound as a binary list.
    $rows = $this->em->getConnection()->executeQuery(
        'SELECT dt.document_id, dt.tag_id
         FROM document_tag dt
         INNER JOIN document d ON d.id = dt.document_id
         WHERE dt.tag_id IN (:tagIds)
           AND d.status = :status',
        [
            'tagIds' => $tagBinaryIds,
            'status' => Document::STATUS_ACTIVE,
        ],
        [
            'tagIds' => ArrayParameterType::BINARY,
        ]
    )->fetchAllAssociative();

    if ($rows === []) {
        return null;
    }

    $documentScores = [];
    $documentMatchedTags = [];

    foreach ($rows as $row) {
        if (!isset($row['document_id'], $row['tag_id'])) {
            continue;
        }

        try {
            $documentId = Uuid::fromBinary($row['document_id'])->toRfc4122();
            $tagId = Uuid::fromBinary($row['tag_id'])->toRfc4122();
        } catch (\Throwable) {
            continue;
        }

        if (!isset($tagMetaById[$tagId])) {
            continue;
        }

        $documentScores[$documentId] = ($documentScores[$documentId] ?? 0.0)
            + ($tagMetaById[$tagId]['score'] * $tagMetaById[$tagId]['weight']);

        $documentMatchedTags[$documentId][$tagId] = true;
    }

    if ($documentScores === []) {
        return null;
    }

    foreach (array_keys($documentScores) as $documentId) {
        $matchedTagCount = count($documentMatchedTags[$documentId] ?? []);

        if ($matchedTagCount > 1) {
            $documentScores[$documentId] += min(
                self::MAX_MULTI_TAG_BONUS,
                ($matchedTagCount - 1) * self::MULTI_TAG_BONUS_PER_EXTRA_TAG
            );
        }
    }

    arsort($documentScores, SORT_NUMERIC);

    return array_slice(
        array_keys($documentScores),
        0,
        self::MAX_CANDIDATE_DOCS
    );
}
|
||||
|
||||
/**
 * Reduces raw tag hits to the ones that may influence routing.
 *
 * The first hit is treated as the best one. Nothing is returned when its
 * score is below MIN_BEST_SCORE. Otherwise a hit is kept when it has a tag id,
 * scores at least max(MIN_BEST_SCORE, best - MAX_SCORE_DROP_FROM_BEST) and is
 * not a sales signal. At most MAX_ROUTING_TAGS hits are returned, in input order.
 *
 * @param array<int, array{
 *     tag_id:string,
 *     score:float,
 *     label?:string,
 *     tag_type?:string
 * }> $hits
 *
 * @return list<array{
 *     tag_id:string,
 *     score:float,
 *     tag_type:string
 * }>
 */
private function filterRoutingHits(array $hits): array
{
    if ($hits === []) {
        return [];
    }

    $topScore = (float) ($hits[0]['score'] ?? 0.0);

    if ($topScore < self::MIN_BEST_SCORE) {
        return [];
    }

    $scoreFloor = max(
        self::MIN_BEST_SCORE,
        $topScore - self::MAX_SCORE_DROP_FROM_BEST
    );

    $accepted = [];

    foreach ($hits as $candidate) {
        if (count($accepted) >= self::MAX_ROUTING_TAGS) {
            break;
        }

        $candidateId = (string) ($candidate['tag_id'] ?? '');
        $candidateScore = (float) ($candidate['score'] ?? 0.0);

        if ($candidateId === '' || $candidateScore < $scoreFloor) {
            continue;
        }

        $candidateType = TagTypes::normalize(
            (string) ($candidate['tag_type'] ?? TagTypes::GENERIC)
        );

        // Sales signals may still be useful elsewhere, but they should not
        // expand the document scope for semantic retrieval.
        if ($candidateType === TagTypes::SALES_SIGNAL) {
            continue;
        }

        $accepted[] = [
            'tag_id' => $candidateId,
            'score' => $candidateScore,
            'tag_type' => $candidateType,
        ];
    }

    return $accepted;
}
|
||||
|
||||
/**
 * Returns the score multiplier for a tag type.
 *
 * Catalog entities weigh 1.20, generic tags 1.00 and sales signals 0.00.
 * Any other normalized type weighs 1.00.
 */
private function resolveTypeWeight(string $tagType): float
{
    $weights = [
        TagTypes::CATALOG_ENTITY => 1.20,
        TagTypes::GENERIC => 1.00,
        TagTypes::SALES_SIGNAL => 0.00,
    ];

    return $weights[TagTypes::normalize($tagType)] ?? 1.00;
}
|
||||
}
|
||||
@@ -4,42 +4,45 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Tag;
|
||||
|
||||
use App\Entity\Tag;
|
||||
use App\Entity\Document;
|
||||
use App\Entity\DocumentTag;
|
||||
use App\Entity\Tag;
|
||||
use App\Service\TagRebuildJobService;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
use InvalidArgumentException;
|
||||
use RuntimeException;
|
||||
|
||||
final readonly class TagService
|
||||
{
|
||||
public function __construct(
|
||||
private EntityManagerInterface $em,
|
||||
private TagRebuildJobService $jobs,
|
||||
) {}
|
||||
|
||||
// =========================================================
|
||||
// TAG CREATE
|
||||
// =========================================================
|
||||
private TagRebuildJobService $jobs,
|
||||
) {
|
||||
}
|
||||
|
||||
public function create(
|
||||
string $slug,
|
||||
string $label,
|
||||
?string $description = null,
|
||||
string $type = 'generic' // NEU
|
||||
string $type = TagTypes::GENERIC,
|
||||
): Tag {
|
||||
$slug = trim($slug);
|
||||
$normalizedSlug = $this->normalizeSlug($slug);
|
||||
$label = trim($label);
|
||||
|
||||
if ($label === '' || $slug === '') {
|
||||
throw new \InvalidArgumentException('Label und Slug sind Pflichtfelder.');
|
||||
if ($normalizedSlug === '' || $label === '') {
|
||||
throw new InvalidArgumentException('Tag label and slug are required.');
|
||||
}
|
||||
|
||||
if ($this->slugExists($slug)) {
|
||||
throw new \RuntimeException('Slug existiert bereits.');
|
||||
if ($this->slugExists($normalizedSlug)) {
|
||||
throw new RuntimeException(sprintf('Tag slug "%s" already exists.', $normalizedSlug));
|
||||
}
|
||||
|
||||
$tag = new Tag($slug, $label, $description);
|
||||
$tag->setType($type); // NEU
|
||||
$tag = new Tag(
|
||||
$normalizedSlug,
|
||||
$label,
|
||||
$description,
|
||||
TagTypes::normalize($type)
|
||||
);
|
||||
|
||||
$this->em->persist($tag);
|
||||
$this->em->flush();
|
||||
@@ -49,18 +52,9 @@ final readonly class TagService
|
||||
return $tag;
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// TAG DELETE
|
||||
// =========================================================
|
||||
|
||||
/**
 * Looks up the tag with the given id and deletes it.
 *
 * Lookup failures surface as the exceptions thrown by findTagById().
 */
public function deleteById(string $tagId): void
{
    $this->delete($this->findTagById($tagId));
}
|
||||
|
||||
@@ -72,87 +66,103 @@ final readonly class TagService
|
||||
$this->triggerRebuildIfIdle();
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// DOCUMENT TAG SYNC
|
||||
// =========================================================
|
||||
|
||||
/**
 * Makes the tags assigned to a document match the given list of tag ids.
 *
 * Missing assignments are created for every id that resolves to a Tag, and
 * assignments whose tag is no longer listed are removed. When the comparison
 * yields any id to add or remove, the changes are flushed and a rebuild is
 * triggered if no job is active.
 *
 * @param array<mixed> $newTagIds desired tag ids; blanks and duplicates are ignored
 */
public function syncDocumentTags(Document $document, array $newTagIds): void
{
    $wantedIds = $this->normalizeIdList($newTagIds);

    /** @var list<DocumentTag> $links */
    $links = $this->em
        ->getRepository(DocumentTag::class)
        ->findBy(['document' => $document]);

    $existingIds = [];

    foreach ($links as $link) {
        $existingIds[] = (string) $link->getTag()->getId();
    }

    $additions = array_values(array_diff($wantedIds, $existingIds));
    $removals = array_values(array_diff($existingIds, $wantedIds));

    foreach ($additions as $addedId) {
        $candidate = $this->em->getRepository(Tag::class)->find($addedId);

        // Ids that do not resolve to a tag are skipped silently.
        if (!$candidate instanceof Tag) {
            continue;
        }

        $this->em->persist(new DocumentTag($document, $candidate));
    }

    foreach ($links as $link) {
        $linkedTagId = (string) $link->getTag()->getId();

        if (!in_array($linkedTagId, $removals, true)) {
            continue;
        }

        $this->em->remove($link);
    }

    if ($additions === [] && $removals === []) {
        return;
    }

    $this->em->flush();
    $this->triggerRebuildIfIdle();
}
|
||||
|
||||
// =========================================================
|
||||
// TAG → DOCUMENT SYNC (Bulk Assign)
|
||||
// =========================================================
|
||||
|
||||
/**
 * Makes the documents assigned to a tag match the given list of document ids.
 *
 * Missing assignments are created only for ids that resolve to a Document
 * whose status is ACTIVE. Assignments whose document is no longer listed are
 * removed. When the comparison yields any id to add or remove, the changes are
 * flushed and a rebuild is triggered if no job is active.
 *
 * @param array<mixed> $newDocumentIds desired document ids; blanks and duplicates are ignored
 */
public function syncTagDocuments(Tag $tag, array $newDocumentIds): void
{
    $wantedIds = $this->normalizeIdList($newDocumentIds);

    /** @var list<DocumentTag> $links */
    $links = $this->em
        ->getRepository(DocumentTag::class)
        ->findBy(['tag' => $tag]);

    $existingIds = [];

    foreach ($links as $link) {
        $existingIds[] = (string) $link->getDocument()->getId();
    }

    $additions = array_values(array_diff($wantedIds, $existingIds));
    $removals = array_values(array_diff($existingIds, $wantedIds));

    foreach ($additions as $addedId) {
        $candidate = $this->em->getRepository(Document::class)->find($addedId);

        // Unknown ids and documents that are not active never get a new assignment.
        if (!$candidate instanceof Document) {
            continue;
        }

        if ($candidate->getStatus() !== Document::STATUS_ACTIVE) {
            continue;
        }

        $this->em->persist(new DocumentTag($candidate, $tag));
    }

    foreach ($links as $link) {
        $linkedDocumentId = (string) $link->getDocument()->getId();

        if (!in_array($linkedDocumentId, $removals, true)) {
            continue;
        }

        $this->em->remove($link);
    }

    if ($additions === [] && $removals === []) {
        return;
    }

    $this->em->flush();
    $this->triggerRebuildIfIdle();
}
|
||||
|
||||
// =========================================================
|
||||
// INTERNAL HELPERS
|
||||
// =========================================================
|
||||
/**
 * Loads a tag by its id.
 *
 * @throws InvalidArgumentException when the id is empty after trimming
 * @throws RuntimeException         when no tag exists for the id
 */
private function findTagById(string $tagId): Tag
{
    $lookupId = trim($tagId);

    if ($lookupId === '') {
        throw new InvalidArgumentException('Tag id must not be empty.');
    }

    $found = $this->em->getRepository(Tag::class)->find($lookupId);

    return $found instanceof Tag
        ? $found
        : throw new RuntimeException('Tag not found.');
}
|
||||
|
||||
private function slugExists(string $slug): bool
|
||||
{
|
||||
@@ -165,6 +175,36 @@ final readonly class TagService
|
||||
->getSingleScalarResult() > 0;
|
||||
}
|
||||
|
||||
/**
 * Turns arbitrary input into a clean list of id strings.
 *
 * Scalars and stringable objects are cast to string and trimmed. Everything
 * else (null, arrays, objects without a string representation) is ignored
 * instead of being cast, because such a cast would either invent the id
 * "Array" or raise an error. Empty strings and duplicates are dropped; the
 * order of first occurrence is kept.
 *
 * @param array<mixed> $ids
 * @return list<string>
 */
private function normalizeIdList(array $ids): array
{
    $normalized = [];

    foreach ($ids as $id) {
        if (!is_scalar($id) && !$id instanceof \Stringable) {
            continue;
        }

        $id = trim((string) $id);

        if ($id === '') {
            continue;
        }

        $normalized[] = $id;
    }

    return array_values(array_unique($normalized));
}
|
||||
|
||||
/**
 * Brings a slug into its canonical form.
 *
 * The input is trimmed and lower-cased, whitespace runs become a single
 * hyphen, hyphen runs are collapsed and leading/trailing hyphens are removed.
 */
private function normalizeSlug(string $slug): string
{
    $lowered = mb_strtolower(trim($slug));

    // Each step keeps its input when the regex engine reports an error.
    $hyphenated = preg_replace('/\s+/u', '-', $lowered) ?? $lowered;
    $collapsed = preg_replace('/-+/u', '-', $hyphenated) ?? $hyphenated;

    return trim($collapsed, '-');
}
|
||||
|
||||
private function triggerRebuildIfIdle(): void
|
||||
{
|
||||
if (!$this->jobs->hasActiveJob()) {
|
||||
|
||||
@@ -5,8 +5,10 @@ declare(strict_types=1);
|
||||
namespace App\Tag;
|
||||
|
||||
/**
|
||||
* Zentrale Definition aller erlaubten Tag-Typen.
|
||||
* Verhindert Magic Strings im Code.
|
||||
* Central definition of all supported tag types.
|
||||
*
|
||||
* This class is intentionally tiny and dependency-free because it is the
|
||||
* foundation for entity validation, admin forms, routing, and catalog logic.
|
||||
*/
|
||||
final class TagTypes
|
||||
{
|
||||
@@ -14,6 +16,25 @@ final class TagTypes
|
||||
public const CATALOG_ENTITY = 'catalog_entity';
|
||||
public const SALES_SIGNAL = 'sales_signal';
|
||||
|
||||
/**
 * Lists every supported tag type value: generic, catalog entity and sales signal.
 *
 * @return list<string>
 */
public static function all(): array
{
    return [self::GENERIC, self::CATALOG_ENTITY, self::SALES_SIGNAL];
}
|
||||
|
||||
/**
|
||||
* Returns UI choices for forms and admin screens.
|
||||
*
|
||||
* @return array<string, string>
|
||||
*/
|
||||
public static function choices(): array
|
||||
{
|
||||
return [
|
||||
@@ -23,5 +44,53 @@ final class TagTypes
|
||||
];
|
||||
}
|
||||
|
||||
private function __construct() {}
|
||||
/**
 * Returns true if the given value is an allowed tag type.
 *
 * The value is trimmed and lower-cased before the comparison, so casing and
 * surrounding whitespace do not matter. It is deliberately NOT passed through
 * normalize(): that method replaces unknown input with a fallback type and
 * would therefore make every string look valid.
 */
public static function isValid(?string $type): bool
{
    if ($type === null) {
        return false;
    }

    return in_array(mb_strtolower(trim($type)), self::all(), true);
}
|
||||
|
||||
/**
 * Maps external input onto a canonical tag type.
 *
 * The input is trimmed and lower-cased. A known type is returned as is. Empty
 * or unknown input yields the (equally trimmed and lower-cased) default, or
 * GENERIC when the default itself is not a known type.
 */
public static function normalize(?string $type, string $default = self::GENERIC): string
{
    $candidate = mb_strtolower(trim((string) $type));

    if ($candidate !== '' && in_array($candidate, self::all(), true)) {
        return $candidate;
    }

    $fallback = mb_strtolower(trim($default));

    if (self::isKnownDefault($fallback)) {
        return $fallback;
    }

    return self::GENERIC;
}
|
||||
|
||||
/**
 * Returns the label under which the normalized type appears in choices(),
 * or 'Generic' when choices() has no entry for it.
 */
public static function labelFor(string $type): string
{
    $labelsByType = array_flip(self::choices());
    $canonicalType = self::normalize($type);

    return $labelsByType[$canonicalType] ?? 'Generic';
}
|
||||
|
||||
/**
 * Tells whether the given value is exactly one of the values returned by all().
 */
private static function isKnownDefault(string $type): bool
{
    return array_search($type, self::all(), true) !== false;
}
|
||||
|
||||
private function __construct()
|
||||
{
|
||||
}
|
||||
}
|
||||
@@ -9,18 +9,81 @@ use Psr\Log\LoggerInterface;
|
||||
|
||||
final readonly class TagVectorIndexBuilder
|
||||
{
|
||||
private const GRACEFUL_TERMINATION_SECONDS = 2;
|
||||
|
||||
public function __construct(
|
||||
private string $pythonBin,
|
||||
private string $scriptPath,
|
||||
private string $tagsNdjsonPath,
|
||||
private string $vectorTagsIndexPath,
|
||||
private string $embeddingModel,
|
||||
private int $timeoutSeconds,
|
||||
private LoggerInterface $agentLogger,
|
||||
private IndexMetaManager $metaManager, // ✅ NEU
|
||||
) {}
|
||||
private string $pythonBin,
|
||||
private string $scriptPath,
|
||||
private string $tagsNdjsonPath,
|
||||
private string $vectorTagsIndexPath,
|
||||
private string $embeddingModel,
|
||||
private int $timeoutSeconds,
|
||||
private LoggerInterface $agentLogger,
|
||||
private IndexMetaManager $metaManager,
|
||||
) {
|
||||
}
|
||||
|
||||
/**
 * Rebuilds the tag vector index from the exported tags NDJSON file.
 *
 * The ingest command writes into temporary artifacts next to the final index.
 * They replace the final index and its meta file only after the command exits
 * with 0 and both artifacts pass the usability check. When the NDJSON file
 * holds no embeddable tag, the existing index and meta file are removed
 * instead. Temporary artifacts are cleaned up on any failure.
 *
 * @throws \RuntimeException when the ingest command fails or leaves incomplete artifacts
 */
public function build(): void
{
    $this->assertPreconditions();

    $finalIndex = $this->vectorTagsIndexPath;
    $finalMeta = $finalIndex . '.meta.json';
    $tmpIndex = $finalIndex . '.tmp';
    $tmpMeta = $tmpIndex . '.meta.json';

    $this->ensureTargetDirectoryExists($finalIndex);
    $this->cleanupTemporaryArtifacts($tmpIndex, $tmpMeta);

    if (!$this->hasEmbeddableTags()) {
        $this->agentLogger->info('[tags] no embeddable tags found, removing stale tag index artifacts.');

        foreach ([$finalIndex, $finalMeta] as $staleArtifact) {
            $this->removeFileIfExists($staleArtifact);
        }

        $this->commitRuntime(false);

        return;
    }

    $cmd = $this->buildCommand($tmpIndex);

    $buildContext = [
        'cmd' => $cmd,
        'timeout' => $this->timeoutSeconds,
        'embedding_model' => $this->embeddingModel,
    ];
    $this->agentLogger->info('[tags] build tag vector index', $buildContext);

    try {
        $result = $this->runCommand($cmd);
        $exitCode = $result['exit'];

        if ($exitCode !== 0) {
            $this->agentLogger->error('[tags] tag vector ingest failed', [
                'exit' => $exitCode,
                'stdout' => $result['stdout'],
                'stderr' => $result['stderr'],
            ]);

            throw new \RuntimeException('Tag vector ingest failed (exit=' . $exitCode . ')');
        }

        $artifactsComplete = $this->isUsableArtifact($tmpIndex)
            && $this->isUsableArtifact($tmpMeta);

        if (!$artifactsComplete) {
            throw new \RuntimeException('Tag vector ingest produced incomplete artifacts.');
        }

        // Index first, meta second, runtime commit last.
        $this->atomicReplace($tmpIndex, $finalIndex);
        $this->atomicReplace($tmpMeta, $finalMeta);
        $this->commitRuntime(true);

        $this->agentLogger->info('[tags] tag vector index build completed + runtime committed', [
            'index' => $finalIndex,
            'meta' => $finalMeta,
        ]);
    } catch (\Throwable $failure) {
        $this->cleanupTemporaryArtifacts($tmpIndex, $tmpMeta);

        throw $failure;
    }
}
|
||||
|
||||
private function assertPreconditions(): void
|
||||
{
|
||||
if (!is_file($this->tagsNdjsonPath)) {
|
||||
throw new \RuntimeException('tags.ndjson missing: ' . $this->tagsNdjsonPath);
|
||||
@@ -30,65 +93,178 @@ final readonly class TagVectorIndexBuilder
|
||||
throw new \RuntimeException('Tag ingest script missing: ' . $this->scriptPath);
|
||||
}
|
||||
|
||||
$tmpIndex = $this->vectorTagsIndexPath . '.tmp';
|
||||
$tmpMeta = $tmpIndex . '.meta.json';
|
||||
|
||||
$finalIndex = $this->vectorTagsIndexPath;
|
||||
$finalMeta = $finalIndex . '.meta.json';
|
||||
|
||||
$dir = \dirname($finalIndex);
|
||||
if (!\is_dir($dir)) {
|
||||
@\mkdir($dir, 0775, true);
|
||||
if (trim($this->pythonBin) === '') {
|
||||
throw new \RuntimeException('Python binary must not be empty.');
|
||||
}
|
||||
|
||||
@\unlink($tmpIndex);
|
||||
@\unlink($tmpMeta);
|
||||
if ($this->timeoutSeconds < 1) {
|
||||
throw new \RuntimeException('Tag vector timeout must be >= 1 second.');
|
||||
}
|
||||
}
|
||||
|
||||
$cmd = sprintf(
|
||||
'%s %s %s %s %s 2>&1',
|
||||
private function buildCommand(string $tmpIndex): string
|
||||
{
|
||||
return sprintf(
|
||||
'%s %s %s %s 2>&1',
|
||||
escapeshellarg($this->pythonBin),
|
||||
escapeshellarg($this->scriptPath),
|
||||
escapeshellarg($this->tagsNdjsonPath),
|
||||
escapeshellarg($tmpIndex),
|
||||
escapeshellarg($this->embeddingModel),
|
||||
);
|
||||
}
|
||||
|
||||
$this->agentLogger->info('[tags] build tag vector index', [
|
||||
'cmd' => $cmd,
|
||||
'timeout' => $this->timeoutSeconds,
|
||||
]);
|
||||
/**
 * Makes sure the directory that will hold the given index file exists.
 *
 * @throws \RuntimeException when the directory is missing and cannot be created
 */
private function ensureTargetDirectoryExists(string $finalIndexPath): void
{
    $indexDir = dirname($finalIndexPath);

    // The trailing is_dir() covers the case where mkdir() fails although the
    // directory exists by the time it is checked again.
    $available = is_dir($indexDir)
        || @mkdir($indexDir, 0775, true)
        || is_dir($indexDir);

    if (!$available) {
        throw new \RuntimeException('Unable to create tag vector directory: ' . $indexDir);
    }
}
|
||||
|
||||
// ✅ ENTERPRISE COMMIT MARKER
|
||||
/**
 * Tells whether the tags NDJSON file holds at least one record worth embedding.
 *
 * A record qualifies when it decodes to a JSON array/object and both its
 * "tag_id" and "text" values are non-empty after trimming. Blank lines and
 * lines that do not decode to an array are ignored. Scanning stops at the
 * first qualifying record.
 *
 * @throws \RuntimeException when the NDJSON file cannot be opened for reading
 */
private function hasEmbeddableTags(): bool
{
    $stream = @fopen($this->tagsNdjsonPath, 'rb');

    if ($stream === false) {
        throw new \RuntimeException('Unable to read tags NDJSON: ' . $this->tagsNdjsonPath);
    }

    $found = false;

    try {
        for ($raw = fgets($stream); $raw !== false; $raw = fgets($stream)) {
            $record = trim($raw);

            if ($record === '') {
                continue;
            }

            $payload = json_decode($record, true);

            if (!is_array($payload)) {
                continue;
            }

            // Both fields are normalised before the check, mirroring the exporter's line format.
            $identifier = trim((string) ($payload['tag_id'] ?? ''));
            $content = trim((string) ($payload['text'] ?? ''));

            if ($identifier === '' || $content === '') {
                continue;
            }

            $found = true;

            break;
        }
    } finally {
        // Always release the handle, whether a match was found or not.
        fclose($stream);
    }

    return $found;
}
|
||||
|
||||
/**
 * Runs the given shell command, collecting stdout/stderr and enforcing the configured timeout.
 *
 * The child's stdin is closed immediately and both output pipes are read in
 * non-blocking mode while the process is polled every 100 ms. When the timeout
 * is exceeded the process is asked to terminate, given a grace period, and
 * then sent signal 9 if it is still running.
 *
 * The exit code is captured from the status poll that first observes the
 * process as stopped. Before PHP 8.3 that poll reaps the child, after which
 * proc_close() can only report -1, so relying on proc_close() alone would turn
 * a successful run into an apparent failure.
 *
 * @param string $cmd Fully escaped command line to execute.
 *
 * @return array{exit:int, stdout:string, stderr:string}
 *
 * @throws \RuntimeException when the process cannot be started or exceeds the timeout
 */
private function runCommand(string $cmd): array
{
    $descriptorSpec = [
        0 => ['pipe', 'r'],
        1 => ['pipe', 'w'],
        2 => ['pipe', 'w'],
    ];

    $process = @proc_open($cmd, $descriptorSpec, $pipes);

    if (!is_resource($process)) {
        throw new \RuntimeException('Could not start tag vector ingest process.');
    }

    // The child gets no input; non-blocking reads keep the poll loop responsive.
    fclose($pipes[0]);
    stream_set_blocking($pipes[1], false);
    stream_set_blocking($pipes[2], false);

    $stdout = '';
    $stderr = '';
    $startedAt = microtime(true);
    $timedOut = false;
    $observedExitCode = null;

    try {
        while (true) {
            $stdout .= stream_get_contents($pipes[1]) ?: '';
            $stderr .= stream_get_contents($pipes[2]) ?: '';

            $status = proc_get_status($process);

            if (!is_array($status) || ($status['running'] ?? false) !== true) {
                // Remember the real exit code now: this call may be the only one that ever sees it.
                $reported = is_array($status) ? ($status['exitcode'] ?? null) : null;

                if (is_int($reported) && $reported >= 0) {
                    $observedExitCode = $reported;
                }

                break;
            }

            if ((microtime(true) - $startedAt) > $this->timeoutSeconds) {
                $timedOut = true;
                proc_terminate($process);
                usleep(self::GRACEFUL_TERMINATION_SECONDS * 1000000);

                $status = proc_get_status($process);
                if (is_array($status) && ($status['running'] ?? false) === true) {
                    // Graceful termination was ignored; force the process down.
                    proc_terminate($process, 9);
                }

                break;
            }

            usleep(100000);
        }

        // Drain whatever was written between the last read and process exit.
        $stdout .= stream_get_contents($pipes[1]) ?: '';
        $stderr .= stream_get_contents($pipes[2]) ?: '';
    } finally {
        fclose($pipes[1]);
        fclose($pipes[2]);
    }

    $exitCode = proc_close($process);

    if ($timedOut) {
        $this->agentLogger->error('[tags] tag vector ingest timed out', [
            'timeout' => $this->timeoutSeconds,
            'stdout' => $stdout,
            'stderr' => $stderr,
        ]);

        throw new \RuntimeException('Tag vector ingest timed out after ' . $this->timeoutSeconds . ' seconds.');
    }

    // proc_close() yields -1 once the child was already reaped by proc_get_status().
    if ($exitCode === -1 && $observedExitCode !== null) {
        $exitCode = $observedExitCode;
    }

    return [
        'exit' => is_int($exitCode) ? $exitCode : 1,
        'stdout' => trim($stdout),
        'stderr' => trim($stderr),
    ];
}
|
||||
|
||||
/**
 * Checks that the path points at a regular file with a size greater than zero.
 */
private function isUsableArtifact(string $path): bool
{
    if (!is_file($path)) {
        return false;
    }

    return filesize($path) > 0;
}
|
||||
|
||||
/**
 * Removes every given temporary artifact path that currently exists as a file.
 */
private function cleanupTemporaryArtifacts(string ...$paths): void
{
    array_walk(
        $paths,
        function (string $candidate): void {
            $this->removeFileIfExists($candidate);
        }
    );
}
|
||||
|
||||
/**
 * Deletes the path when it is a regular file; anything else is left untouched.
 *
 * Deletion is best-effort: a failing unlink is silenced.
 */
private function removeFileIfExists(string $path): void
{
    if (!is_file($path)) {
        return;
    }

    @unlink($path);
}
|
||||
|
||||
private function commitRuntime(bool $indexPresent): void
|
||||
{
|
||||
$this->metaManager->touchRuntime([
|
||||
'last_tags_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
|
||||
]);
|
||||
|
||||
$this->agentLogger->info('[tags] tag vector index build completed + runtime committed', [
|
||||
'index' => $finalIndex,
|
||||
'meta' => $finalMeta,
|
||||
'tags_index_present' => $indexPresent,
|
||||
]);
|
||||
}
|
||||
|
||||
@@ -99,6 +275,7 @@ final readonly class TagVectorIndexBuilder
|
||||
@unlink($tmp);
|
||||
throw new \RuntimeException('Atomic replace failed for: ' . $final);
|
||||
}
|
||||
|
||||
@unlink($tmp);
|
||||
}
|
||||
|
||||
|
||||
@@ -6,63 +6,210 @@ namespace App\Tag;
|
||||
|
||||
final readonly class TagVectorIndexHealthService
|
||||
{
|
||||
private const STATUS_OK = 'OK';
|
||||
private const STATUS_OK_EMPTY = 'OK_EMPTY';
|
||||
private const STATUS_INCONSISTENT_STALE_VECTOR = 'INCONSISTENT_STALE_VECTOR';
|
||||
private const STATUS_INCONSISTENT_MISSING_VECTOR = 'INCONSISTENT_MISSING_VECTOR';
|
||||
private const STATUS_INCONSISTENT_COUNT_MISMATCH = 'INCONSISTENT_COUNT_MISMATCH';
|
||||
private const STATUS_INCONSISTENT_INVALID_META = 'INCONSISTENT_INVALID_META';
|
||||
private const STATUS_UNKNOWN = 'UNKNOWN';
|
||||
|
||||
public function __construct(
|
||||
private string $tagsNdjsonPath,
|
||||
private string $vectorTagsIndexPath,
|
||||
private string $vectorTagsMetaPath
|
||||
) {}
|
||||
private string $vectorTagsMetaPath,
|
||||
) {
|
||||
}
|
||||
|
||||
public function check(): array
|
||||
{
|
||||
$ndjsonExists = is_file($this->tagsNdjsonPath);
|
||||
$vectorExists = is_file($this->vectorTagsIndexPath);
|
||||
$metaExists = is_file($this->vectorTagsMetaPath);
|
||||
$metaExists = is_file($this->vectorTagsMetaPath);
|
||||
|
||||
$ndjsonTagCount = 0;
|
||||
$ndjsonStats = $this->readNdjsonStats();
|
||||
$metaStats = $this->readMetaStats();
|
||||
|
||||
if ($ndjsonExists) {
|
||||
$h = @fopen($this->tagsNdjsonPath, 'r');
|
||||
if ($h !== false) {
|
||||
while (($line = fgets($h)) !== false) {
|
||||
$line = trim($line);
|
||||
if ($line === '') continue;
|
||||
|
||||
$data = json_decode($line, true);
|
||||
if (is_array($data) && !empty($data['tag_id']) && !empty($data['text'])) {
|
||||
$ndjsonTagCount++;
|
||||
}
|
||||
}
|
||||
fclose($h);
|
||||
}
|
||||
}
|
||||
|
||||
$vectorTagCount = 0;
|
||||
if ($metaExists) {
|
||||
$meta = json_decode((string) file_get_contents($this->vectorTagsMetaPath), true);
|
||||
if (is_array($meta)) {
|
||||
$vectorTagCount = count($meta);
|
||||
}
|
||||
}
|
||||
|
||||
$status = $this->determineStatus($ndjsonTagCount, $vectorExists, $metaExists, $vectorTagCount);
|
||||
$status = $this->determineStatus(
|
||||
$ndjsonStats['exported_tag_count'],
|
||||
$vectorExists,
|
||||
$metaExists,
|
||||
$metaStats['vector_tag_count'],
|
||||
$metaStats['meta_valid']
|
||||
);
|
||||
|
||||
return [
|
||||
'tags_ndjson_exists' => $ndjsonExists,
|
||||
'tags_ndjson_count' => $ndjsonTagCount,
|
||||
'vector_exists' => $vectorExists,
|
||||
'meta_exists' => $metaExists,
|
||||
'vector_tag_count' => $vectorTagCount,
|
||||
'status' => $status,
|
||||
'tags_ndjson_count' => $ndjsonStats['exported_tag_count'],
|
||||
'vector_exists' => $vectorExists,
|
||||
'meta_exists' => $metaExists,
|
||||
'vector_tag_count' => $metaStats['vector_tag_count'],
|
||||
'status' => $status,
|
||||
|
||||
// Extra diagnostics for admin/CLI.
|
||||
'tags_ndjson_lines_total' => $ndjsonStats['lines_total'],
|
||||
'tags_ndjson_invalid_lines' => $ndjsonStats['invalid_lines'],
|
||||
'tags_ndjson_empty_lines' => $ndjsonStats['empty_lines'],
|
||||
'tags_with_active_document_ids' => $ndjsonStats['tags_with_document_ids'],
|
||||
'meta_valid' => $metaStats['meta_valid'],
|
||||
'paths' => [
|
||||
'tags_ndjson' => $this->tagsNdjsonPath,
|
||||
'vector_index' => $this->vectorTagsIndexPath,
|
||||
'vector_meta' => $this->vectorTagsMetaPath,
|
||||
],
|
||||
];
|
||||
}
|
||||
|
||||
private function determineStatus(int $ndjsonTagCount, bool $vectorExists, bool $metaExists, int $vectorTagCount): string
|
||||
/**
|
||||
* @return array{
|
||||
* lines_total:int,
|
||||
* empty_lines:int,
|
||||
* invalid_lines:int,
|
||||
* exported_tag_count:int,
|
||||
* tags_with_document_ids:int
|
||||
* }
|
||||
*/
|
||||
private function readNdjsonStats(): array
|
||||
{
|
||||
if ($ndjsonTagCount === 0 && !$vectorExists && !$metaExists) return 'OK_EMPTY';
|
||||
if ($ndjsonTagCount > 0 && $vectorExists && $metaExists && $vectorTagCount === $ndjsonTagCount) return 'OK';
|
||||
if ($ndjsonTagCount === 0 && ($vectorExists || $metaExists)) return 'INCONSISTENT_STALE_VECTOR';
|
||||
if ($ndjsonTagCount > 0 && (!$vectorExists || !$metaExists)) return 'INCONSISTENT_MISSING_VECTOR';
|
||||
if ($ndjsonTagCount !== $vectorTagCount) return 'INCONSISTENT_COUNT_MISMATCH';
|
||||
return 'UNKNOWN';
|
||||
$stats = [
|
||||
'lines_total' => 0,
|
||||
'empty_lines' => 0,
|
||||
'invalid_lines' => 0,
|
||||
'exported_tag_count' => 0,
|
||||
'tags_with_document_ids' => 0,
|
||||
];
|
||||
|
||||
if (!is_file($this->tagsNdjsonPath)) {
|
||||
return $stats;
|
||||
}
|
||||
|
||||
$handle = @fopen($this->tagsNdjsonPath, 'rb');
|
||||
|
||||
if ($handle === false) {
|
||||
return $stats;
|
||||
}
|
||||
|
||||
try {
|
||||
while (($line = fgets($handle)) !== false) {
|
||||
$stats['lines_total']++;
|
||||
$line = trim($line);
|
||||
|
||||
if ($line === '') {
|
||||
$stats['empty_lines']++;
|
||||
continue;
|
||||
}
|
||||
|
||||
$data = json_decode($line, true);
|
||||
|
||||
if (!is_array($data)) {
|
||||
$stats['invalid_lines']++;
|
||||
continue;
|
||||
}
|
||||
|
||||
$tagId = trim((string) ($data['tag_id'] ?? ''));
|
||||
$text = trim((string) ($data['text'] ?? ''));
|
||||
$documentIds = $data['document_ids'] ?? null;
|
||||
$hasDocumentIds = is_array($documentIds) && $documentIds !== [];
|
||||
|
||||
if ($tagId === '' || $text === '') {
|
||||
$stats['invalid_lines']++;
|
||||
continue;
|
||||
}
|
||||
|
||||
$stats['exported_tag_count']++;
|
||||
|
||||
if ($hasDocumentIds) {
|
||||
$stats['tags_with_document_ids']++;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
fclose($handle);
|
||||
}
|
||||
|
||||
return $stats;
|
||||
}
|
||||
|
||||
/**
 * Reads the vector meta file and derives how many tags the vector index holds.
 *
 * The payload counts as valid when it decodes to either
 *  - a JSON list (one entry per tag), or
 *  - a JSON object whose keys are all non-negative integer indexes.
 *
 * json_decode() in associative mode turns canonical decimal keys such as "1"
 * into PHP integers, so index keys must be accepted as ints as well as
 * digit-only strings (e.g. "007", which stays a string).
 *
 * A missing, unreadable, blank or otherwise shaped file yields a count of 0
 * with meta_valid = false.
 *
 * @return array{vector_tag_count:int, meta_valid:bool}
 */
private function readMetaStats(): array
{
    $invalid = [
        'vector_tag_count' => 0,
        'meta_valid' => false,
    ];

    if (!is_file($this->vectorTagsMetaPath)) {
        return $invalid;
    }

    $raw = file_get_contents($this->vectorTagsMetaPath);

    if (!is_string($raw) || trim($raw) === '') {
        return $invalid;
    }

    $decoded = json_decode($raw, true);

    if (!is_array($decoded)) {
        return $invalid;
    }

    if (!array_is_list($decoded)) {
        // Not a plain list: only accept it when every key is an index.
        foreach (array_keys($decoded) as $key) {
            $isIndexKey = is_int($key) ? $key >= 0 : ctype_digit($key);

            if (!$isIndexKey) {
                return $invalid;
            }
        }
    }

    return [
        'vector_tag_count' => count($decoded),
        'meta_valid' => true,
    ];
}
|
||||
|
||||
/**
 * Maps the observed NDJSON / vector / meta facts onto a health status.
 *
 * Rules are evaluated top to bottom and the first matching rule wins:
 *  1. nothing exported and no artifacts          -> OK_EMPTY
 *  2. nothing exported but artifacts remain      -> INCONSISTENT_STALE_VECTOR
 *  3. tags exported but an artifact is missing   -> INCONSISTENT_MISSING_VECTOR
 *  4. meta file present but not valid            -> INCONSISTENT_INVALID_META
 *  5. tags exported and counts agree             -> OK
 *  6. counts disagree                            -> INCONSISTENT_COUNT_MISMATCH
 *  7. anything else                              -> UNKNOWN
 */
private function determineStatus(
    int $ndjsonTagCount,
    bool $vectorExists,
    bool $metaExists,
    int $vectorTagCount,
    bool $metaValid
): string {
    $artifactsComplete = $vectorExists && $metaExists;
    $anyArtifact = $vectorExists || $metaExists;

    return match (true) {
        $ndjsonTagCount === 0 && !$anyArtifact => self::STATUS_OK_EMPTY,
        // Reaching this arm with a zero count means at least one artifact exists.
        $ndjsonTagCount === 0 => self::STATUS_INCONSISTENT_STALE_VECTOR,
        $ndjsonTagCount > 0 && !$artifactsComplete => self::STATUS_INCONSISTENT_MISSING_VECTOR,
        $metaExists && !$metaValid => self::STATUS_INCONSISTENT_INVALID_META,
        // With a positive count, the earlier arms already guarantee both artifacts and a valid meta.
        $ndjsonTagCount > 0 && $vectorTagCount === $ndjsonTagCount => self::STATUS_OK,
        $ndjsonTagCount !== $vectorTagCount => self::STATUS_INCONSISTENT_COUNT_MISMATCH,
        default => self::STATUS_UNKNOWN,
    };
}
|
||||
}
|
||||
@@ -12,18 +12,29 @@ final readonly class TagVectorSearchClient
|
||||
/**
|
||||
* Minimum similarity score required for a tag to be considered.
|
||||
*/
|
||||
private const MIN_SCORE = 0.72;
|
||||
public const MIN_SCORE = 0.72;
|
||||
|
||||
/**
|
||||
* Default result size when callers do not specify a limit.
|
||||
*/
|
||||
private const DEFAULT_LIMIT = 8;
|
||||
|
||||
/**
|
||||
* Hard limit to prevent excessive requests.
|
||||
*/
|
||||
private const MAX_LIMIT = 50;
|
||||
|
||||
/**
|
||||
* HTTP timeout for the Python vector service.
|
||||
*/
|
||||
private const TIMEOUT_SECONDS = 10;
|
||||
|
||||
public function __construct(
|
||||
private HttpClientInterface $http,
|
||||
private string $serviceUrl,
|
||||
private LoggerInterface $agentLogger,
|
||||
) {}
|
||||
private string $serviceUrl,
|
||||
private LoggerInterface $agentLogger,
|
||||
) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes a vector search against the Python tag index.
|
||||
@@ -33,43 +44,51 @@ final readonly class TagVectorSearchClient
|
||||
* {
|
||||
* "tag_id": "...",
|
||||
* "score": 0.73,
|
||||
* "label": "Geräte", // optional (new)
|
||||
* "tag_type": "catalog_entity" // optional (new)
|
||||
* "label": "Geräte",
|
||||
* "tag_type": "catalog_entity"
|
||||
* }
|
||||
* ]
|
||||
*
|
||||
* @return array<int, array{
|
||||
* @return list<array{
|
||||
* tag_id:string,
|
||||
* score:float,
|
||||
* label?:string,
|
||||
* tag_type?:string
|
||||
* label:string,
|
||||
* tag_type:string
|
||||
* }>
|
||||
*/
|
||||
public function search(string $query, int $limit = 8): array
|
||||
public function search(string $query, int $limit = self::DEFAULT_LIMIT): array
|
||||
{
|
||||
$query = trim($query);
|
||||
|
||||
if ($query === '') {
|
||||
return [];
|
||||
}
|
||||
|
||||
$limit = max(1, min($limit, self::MAX_LIMIT));
|
||||
$serviceUrl = rtrim(trim($this->serviceUrl), '/');
|
||||
|
||||
if ($serviceUrl === '') {
|
||||
$this->agentLogger->warning('Tag vector service URL is empty.');
|
||||
|
||||
return [];
|
||||
}
|
||||
|
||||
try {
|
||||
$response = $this->http->request(
|
||||
'POST',
|
||||
rtrim($this->serviceUrl, '/') . '/search-tags',
|
||||
$serviceUrl . '/search-tags',
|
||||
[
|
||||
'json' => [
|
||||
'query' => $query,
|
||||
'limit' => $limit,
|
||||
],
|
||||
'timeout' => 10,
|
||||
'timeout' => self::TIMEOUT_SECONDS,
|
||||
]
|
||||
);
|
||||
|
||||
if ($response->getStatusCode() !== 200) {
|
||||
$this->agentLogger->warning(
|
||||
'Tag vector service returned non-200',
|
||||
'Tag vector service returned non-200.',
|
||||
['status' => $response->getStatusCode()]
|
||||
);
|
||||
|
||||
@@ -77,10 +96,9 @@ final readonly class TagVectorSearchClient
|
||||
}
|
||||
|
||||
$data = $response->toArray(false);
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
$this->agentLogger->warning(
|
||||
'Tag vector service unreachable',
|
||||
'Tag vector service unreachable.',
|
||||
['error' => $e->getMessage()]
|
||||
);
|
||||
|
||||
@@ -88,18 +106,33 @@ final readonly class TagVectorSearchClient
|
||||
}
|
||||
|
||||
if (!is_array($data)) {
|
||||
$this->agentLogger->warning('Tag vector service returned invalid payload');
|
||||
$this->agentLogger->warning('Tag vector service returned invalid payload.');
|
||||
|
||||
return [];
|
||||
}
|
||||
|
||||
$hits = [];
|
||||
return $this->normalizeHits($data, $limit);
|
||||
}
|
||||
|
||||
foreach ($data as $row) {
|
||||
/**
|
||||
* @param array<mixed> $rows
|
||||
* @return list<array{
|
||||
* tag_id:string,
|
||||
* score:float,
|
||||
* label:string,
|
||||
* tag_type:string
|
||||
* }>
|
||||
*/
|
||||
private function normalizeHits(array $rows, int $limit): array
|
||||
{
|
||||
$hitsByTagId = [];
|
||||
|
||||
foreach ($rows as $row) {
|
||||
if (!is_array($row)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$tagId = (string)($row['tag_id'] ?? '');
|
||||
$tagId = trim((string) ($row['tag_id'] ?? ''));
|
||||
$score = $row['score'] ?? null;
|
||||
|
||||
if ($tagId === '' || !is_numeric($score)) {
|
||||
@@ -112,24 +145,45 @@ final readonly class TagVectorSearchClient
|
||||
continue;
|
||||
}
|
||||
|
||||
$hit = [
|
||||
$normalizedHit = [
|
||||
'tag_id' => $tagId,
|
||||
'score' => $score,
|
||||
'score' => $score,
|
||||
'label' => trim((string) ($row['label'] ?? '')),
|
||||
'tag_type' => TagTypes::normalize((string) ($row['tag_type'] ?? TagTypes::GENERIC)),
|
||||
];
|
||||
|
||||
// Optional: label
|
||||
if (isset($row['label']) && is_string($row['label'])) {
|
||||
$hit['label'] = $row['label'];
|
||||
}
|
||||
$existingHit = $hitsByTagId[$tagId] ?? null;
|
||||
|
||||
// Optional: tag_type
|
||||
if (isset($row['tag_type']) && is_string($row['tag_type'])) {
|
||||
$hit['tag_type'] = $row['tag_type'];
|
||||
if ($existingHit === null || $normalizedHit['score'] > $existingHit['score']) {
|
||||
$hitsByTagId[$tagId] = $normalizedHit;
|
||||
}
|
||||
|
||||
$hits[] = $hit;
|
||||
}
|
||||
|
||||
return $hits;
|
||||
if ($hitsByTagId === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
$hits = array_values($hitsByTagId);
|
||||
|
||||
usort(
|
||||
$hits,
|
||||
static function (array $left, array $right): int {
|
||||
$scoreComparison = $right['score'] <=> $left['score'];
|
||||
|
||||
if ($scoreComparison !== 0) {
|
||||
return $scoreComparison;
|
||||
}
|
||||
|
||||
$typeComparison = strcmp($left['tag_type'], $right['tag_type']);
|
||||
|
||||
if ($typeComparison !== 0) {
|
||||
return $typeComparison;
|
||||
}
|
||||
|
||||
return strcmp($left['tag_id'], $right['tag_id']);
|
||||
}
|
||||
);
|
||||
|
||||
return array_slice($hits, 0, $limit);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user