first commit

This commit is contained in:
team 1
2026-04-20 16:36:28 +02:00
parent a0ec07a99c
commit 2587ac8b4b
41 changed files with 5126 additions and 2280 deletions

View File

@@ -4,6 +4,7 @@ declare(strict_types=1);
namespace App\Tag;
use App\Entity\Document;
use App\Entity\DocumentTag;
use App\Entity\Tag;
use Doctrine\ORM\EntityManagerInterface;
@@ -12,148 +13,199 @@ final readonly class TagNdjsonExporter
{
public function __construct(
private EntityManagerInterface $em,
private string $tagsNdjsonPath,
) {}
private string $tagsNdjsonPath,
) {
}
/**
* Export all tags into NDJSON (streaming) with atomic switch (.tmp + rename()).
* Export all relevant tags into NDJSON (streaming) with atomic switch (.tmp + rename()).
*
* Line format:
* {
* "tag_id":"...",
* "text":"label\nslug\noptional description",
* "type":"catalog_entity|generic|...",
* "type":"catalog_entity|generic|sales_signal",
* "document_ids":["...","..."]
* }
*
* Only ACTIVE document assignments are exported. Tags without active document
* assignments are intentionally skipped so they do not influence retrieval.
*
* @return array{tags:int, lines:int, bytes:int, path:string}
*/
public function export(): array
{
$dir = \dirname($this->tagsNdjsonPath);
if (!\is_dir($dir)) {
@\mkdir($dir, 0775, true);
}
$this->ensureTargetDirectoryExists();
$tmpPath = $this->tagsNdjsonPath . '.tmp';
$this->cleanupTemporaryFile($tmpPath);
$fh = @\fopen($tmpPath, 'wb');
if (!$fh) {
$fh = @fopen($tmpPath, 'wb');
if ($fh === false) {
throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
}
// 1) Load all tags
$tags = $this->em->createQueryBuilder()
->select('t')
->from(Tag::class, 't')
->orderBy('t.label', 'ASC')
->getQuery()
->getResult();
try {
/** @var list<Tag> $tags */
$tags = $this->em->createQueryBuilder()
->select('t')
->from(Tag::class, 't')
->orderBy('t.type', 'ASC')
->addOrderBy('t.label', 'ASC')
->getQuery()
->getResult();
if (!\is_array($tags) || $tags === []) {
\fclose($fh);
if ($tags === []) {
fclose($fh);
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
return [
'tags' => 0,
'lines' => 0,
'bytes' => (int) @filesize($this->tagsNdjsonPath),
'path' => $this->tagsNdjsonPath,
];
}
$tagToActiveDocs = $this->buildActiveDocumentMap();
$lines = 0;
foreach ($tags as $tag) {
$tagId = $tag->getId()->toRfc4122();
$docIds = $tagToActiveDocs[$tagId] ?? [];
if ($docIds === []) {
continue;
}
$line = [
'tag_id' => $tagId,
'text' => $this->buildEmbeddingText($tag),
'type' => TagTypes::normalize($tag->getType()),
'document_ids' => $docIds,
];
$json = json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
if (!is_string($json)) {
continue;
}
fwrite($fh, $json . "\n");
$lines++;
}
fclose($fh);
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
return [
'tags' => 0,
'lines' => 0,
'bytes' => (int) @\filesize($this->tagsNdjsonPath),
'path' => $this->tagsNdjsonPath,
'tags' => count($tags),
'lines' => $lines,
'bytes' => (int) @filesize($this->tagsNdjsonPath),
'path' => $this->tagsNdjsonPath,
];
}
} catch (\Throwable $e) {
fclose($fh);
$this->cleanupTemporaryFile($tmpPath);
// 2) Build tagId => docIds map
$rows = $this->em->createQueryBuilder()
->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
throw $e;
}
}
/**
* @return array<string, list<string>>
*/
private function buildActiveDocumentMap(): array
{
/** @var list<DocumentTag> $relations */
$relations = $this->em->createQueryBuilder()
->select('dt')
->addSelect('t', 'd')
->from(DocumentTag::class, 'dt')
->innerJoin('dt.tag', 't')
->innerJoin('dt.document', 'd')
->where('d.status = :status')
->setParameter('status', Document::STATUS_ACTIVE)
->getQuery()
->getArrayResult();
->getResult();
$tagToDocs = [];
foreach ($rows as $r) {
$tagId = (string) ($r['tagId'] ?? '');
$docId = (string) ($r['docId'] ?? '');
if ($tagId === '' || $docId === '') {
continue;
}
$tagToDocs[$tagId][] = $docId;
foreach ($relations as $relation) {
$tag = $relation->getTag();
$document = $relation->getDocument();
$tagId = $tag->getId()->toRfc4122();
$docId = $document->getId()->toRfc4122();
$tagToDocs[$tagId][$docId] = $docId;
}
// 3) Stream NDJSON
$lines = 0;
foreach ($tags as $tag) {
if (!$tag instanceof Tag) {
continue;
}
$tagId = (string) $tag->getId();
$docIds = $tagToDocs[$tagId] ?? [];
if ($docIds !== []) {
$docIds = \array_values(\array_unique($docIds));
}
// Embedding source
$textParts = [
$tag->getLabel(),
$tag->getSlug(),
];
$desc = $tag->getDescription();
if (\is_string($desc) && \trim($desc) !== '') {
$textParts[] = \trim($desc);
}
$type = method_exists($tag, 'getType')
? (string) $tag->getType()
: 'generic';
if ($type === '') {
$type = 'generic';
}
$line = [
'tag_id' => $tagId,
'text' => \implode("\n", $textParts),
'type' => $type, // 🔥 NEW
'document_ids' => $docIds,
];
$json = \json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
if (!\is_string($json)) {
continue;
}
\fwrite($fh, $json . "\n");
$lines++;
foreach ($tagToDocs as $tagId => $docIds) {
ksort($docIds);
$tagToDocs[$tagId] = array_values($docIds);
}
\fclose($fh);
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
return $tagToDocs;
}
return [
'tags' => \count($tags),
'lines' => $lines,
'bytes' => (int) @\filesize($this->tagsNdjsonPath),
'path' => $this->tagsNdjsonPath,
/**
 * Builds the text that gets embedded for a tag.
 *
 * The result contains the trimmed label, the trimmed slug and, when present,
 * the description with every whitespace run collapsed to a single space.
 * Empty and duplicate segments are dropped; segments are joined with "\n".
 */
private function buildEmbeddingText(Tag $tag): string
{
    $segments = [
        trim($tag->getLabel()),
        trim($tag->getSlug()),
    ];

    $rawDescription = trim((string) $tag->getDescription());
    if ($rawDescription !== '') {
        // preg_replace() yields null on a PCRE failure; keep the raw text then.
        $collapsed = preg_replace('/\s+/u', ' ', $rawDescription);
        $segments[] = $collapsed ?? $rawDescription;
    }

    $lines = [];
    foreach (array_unique($segments) as $segment) {
        if ($segment !== '') {
            $lines[] = $segment;
        }
    }

    return implode("\n", $lines);
}
/**
 * Makes sure the parent directory of the NDJSON target file exists.
 *
 * @throws \RuntimeException when the directory is missing and cannot be created
 */
private function ensureTargetDirectoryExists(): void
{
    $targetDir = dirname($this->tagsNdjsonPath);

    // A failed mkdir() is still acceptable if the directory exists afterwards.
    $available = is_dir($targetDir)
        || @mkdir($targetDir, 0775, true)
        || is_dir($targetDir);

    if (!$available) {
        throw new \RuntimeException('Cannot create tags NDJSON directory: ' . $targetDir);
    }
}
/**
 * Deletes the temporary export file when it exists; unlink errors are suppressed.
 */
private function cleanupTemporaryFile(string $tmpPath): void
{
    if (!is_file($tmpPath)) {
        return;
    }

    @unlink($tmpPath);
}
private function atomicReplace(string $tmpPath, string $finalPath): void
{
if (\is_file($finalPath)) {
@\chmod($finalPath, 0664);
if (is_file($finalPath)) {
@chmod($finalPath, 0664);
}
if (!@\rename($tmpPath, $finalPath)) {
if (!@\copy($tmpPath, $finalPath)) {
@\unlink($tmpPath);
if (!@rename($tmpPath, $finalPath)) {
if (!@copy($tmpPath, $finalPath)) {
@unlink($tmpPath);
throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);
}
@\unlink($tmpPath);
@unlink($tmpPath);
}
@\chmod($finalPath, 0664);
@chmod($finalPath, 0664);
}
}

View File

@@ -4,6 +4,7 @@ declare(strict_types=1);
namespace App\Tag;
use App\Entity\Document;
use Doctrine\DBAL\ArrayParameterType;
use Doctrine\DBAL\Exception;
use Doctrine\ORM\EntityManagerInterface;
@@ -11,91 +12,239 @@ use Symfony\Component\Uid\Uuid;
final class TagRoutingService
{
/**
* Number of raw tag hits requested from the vector service.
*/
private const DEFAULT_TOPK = 8;
private const MIN_BEST_SCORE = 0.25;
private const MAX_CANDIDATE_DOCS = 200;
/**
* Hard minimum confidence required to activate tag-based document routing.
*
* This intentionally aligns with the tag vector client gate to avoid
* misleading secondary thresholds in this class.
*/
private const MIN_BEST_SCORE = 0.72;
/**
* Only keep tag hits that stay reasonably close to the best hit.
* This reduces semantic spillover into weakly related document spaces.
*/
private const MAX_SCORE_DROP_FROM_BEST = 0.08;
/**
* Maximum number of tag hits that may influence routing.
*/
private const MAX_ROUTING_TAGS = 5;
/**
* Maximum number of candidate documents passed into scoped chunk search.
*/
private const MAX_CANDIDATE_DOCS = 80;
/**
* Small bonus for documents matched by multiple routed tags.
*/
private const MULTI_TAG_BONUS_PER_EXTRA_TAG = 0.05;
private const MAX_MULTI_TAG_BONUS = 0.15;
public function __construct(
private readonly TagVectorSearchClient $tagSearch,
private readonly EntityManagerInterface $em,
) {}
) {
}
/**
* @return string[]|null
* Returns ordered active document ids for tag-scoped retrieval.
*
* The method intentionally returns only document ids so the current
* retriever pipeline can stay unchanged.
*
* @return list<string>|null
* @throws Exception
*/
public function route(string $query): ?array
{
$query = trim($query);
if ($query === '') {
return null;
}
$hits = $this->tagSearch->search($query, self::DEFAULT_TOPK);
$hits = $this->filterRoutingHits(
$this->tagSearch->search($query, self::DEFAULT_TOPK)
);
if (!is_array($hits) || $hits === []) {
if ($hits === []) {
return null;
}
$bestScore = (float)($hits[0]['score'] ?? 0.0);
if ($bestScore < self::MIN_BEST_SCORE) {
return null;
}
// Convert tag UUID strings to binary(16)
$tagBinaryIds = [];
$tagMetaById = [];
foreach ($hits as $hit) {
$id = (string)($hit['tag_id'] ?? '');
if ($id === '') {
$tagId = (string) ($hit['tag_id'] ?? '');
if ($tagId === '') {
continue;
}
try {
$tagBinaryIds[] = Uuid::fromString($id)->toBinary();
$tagBinaryIds[] = Uuid::fromString($tagId)->toBinary();
} catch (\Throwable) {
continue;
}
$tagMetaById[$tagId] = [
'score' => (float) $hit['score'],
'weight' => $this->resolveTypeWeight((string) $hit['tag_type']),
];
}
if ($tagBinaryIds === []) {
return null;
}
// Direct DBAL query (binary-safe)
$conn = $this->em->getConnection();
$rows = $conn->executeQuery(
'SELECT document_id
FROM document_tag
WHERE tag_id IN (:tagIds)',
['tagIds' => $tagBinaryIds],
['tagIds' => ArrayParameterType::BINARY]
$rows = $this->em->getConnection()->executeQuery(
'SELECT dt.document_id, dt.tag_id
FROM document_tag dt
INNER JOIN document d ON d.id = dt.document_id
WHERE dt.tag_id IN (:tagIds)
AND d.status = :status',
[
'tagIds' => $tagBinaryIds,
'status' => Document::STATUS_ACTIVE,
],
[
'tagIds' => ArrayParameterType::BINARY,
]
)->fetchAllAssociative();
if ($rows === []) {
return null;
}
$docIds = [];
$documentScores = [];
$documentMatchedTags = [];
foreach ($rows as $row) {
if (!isset($row['document_id'])) {
if (!isset($row['document_id'], $row['tag_id'])) {
continue;
}
try {
$uuid = Uuid::fromBinary($row['document_id']);
$docIds[(string)$uuid] = true;
$documentId = (string) Uuid::fromBinary($row['document_id']);
$tagId = (string) Uuid::fromBinary($row['tag_id']);
} catch (\Throwable) {
continue;
}
if (count($docIds) >= self::MAX_CANDIDATE_DOCS) {
if (!isset($tagMetaById[$tagId])) {
continue;
}
$documentScores[$documentId] = ($documentScores[$documentId] ?? 0.0)
+ ($tagMetaById[$tagId]['score'] * $tagMetaById[$tagId]['weight']);
$documentMatchedTags[$documentId][$tagId] = true;
}
if ($documentScores === []) {
return null;
}
foreach ($documentScores as $documentId => $score) {
$matchedTagCount = isset($documentMatchedTags[$documentId])
? count($documentMatchedTags[$documentId])
: 0;
if ($matchedTagCount > 1) {
$documentScores[$documentId] += min(
self::MAX_MULTI_TAG_BONUS,
($matchedTagCount - 1) * self::MULTI_TAG_BONUS_PER_EXTRA_TAG
);
}
}
arsort($documentScores, SORT_NUMERIC);
return array_slice(
array_keys($documentScores),
0,
self::MAX_CANDIDATE_DOCS
);
}
/**
* @param array<int, array{
* tag_id:string,
* score:float,
* label?:string,
* tag_type?:string
* }> $hits
*
* @return list<array{
* tag_id:string,
* score:float,
* tag_type:string
* }>
*/
private function filterRoutingHits(array $hits): array
{
if ($hits === []) {
return [];
}
$bestScore = (float) ($hits[0]['score'] ?? 0.0);
if ($bestScore < self::MIN_BEST_SCORE) {
return [];
}
$minimumAcceptedScore = max(
self::MIN_BEST_SCORE,
$bestScore - self::MAX_SCORE_DROP_FROM_BEST
);
$filtered = [];
foreach ($hits as $hit) {
$tagId = (string) ($hit['tag_id'] ?? '');
$score = (float) ($hit['score'] ?? 0.0);
$tagType = TagTypes::normalize(
(string) ($hit['tag_type'] ?? TagTypes::GENERIC)
);
if ($tagId === '' || $score < $minimumAcceptedScore) {
continue;
}
// Sales signals may still be useful elsewhere, but they should not
// expand the document scope for semantic retrieval.
if ($tagType === TagTypes::SALES_SIGNAL) {
continue;
}
$filtered[] = [
'tag_id' => $tagId,
'score' => $score,
'tag_type' => $tagType,
];
if (count($filtered) >= self::MAX_ROUTING_TAGS) {
break;
}
}
return array_keys($docIds);
return $filtered;
}
/**
 * Returns the weight factor for a tag type after normalizing it.
 *
 * Catalog entities weigh 1.20, sales signals 0.00, everything else 1.00.
 */
private function resolveTypeWeight(string $tagType): float
{
    $canonicalType = TagTypes::normalize($tagType);

    if ($canonicalType === TagTypes::CATALOG_ENTITY) {
        return 1.20;
    }

    if ($canonicalType === TagTypes::GENERIC) {
        return 1.00;
    }

    if ($canonicalType === TagTypes::SALES_SIGNAL) {
        return 0.00;
    }

    // Any other value is treated like a generic tag.
    return 1.00;
}
}

View File

@@ -4,42 +4,45 @@ declare(strict_types=1);
namespace App\Tag;
use App\Entity\Tag;
use App\Entity\Document;
use App\Entity\DocumentTag;
use App\Entity\Tag;
use App\Service\TagRebuildJobService;
use Doctrine\ORM\EntityManagerInterface;
use InvalidArgumentException;
use RuntimeException;
final readonly class TagService
{
public function __construct(
private EntityManagerInterface $em,
private TagRebuildJobService $jobs,
) {}
// =========================================================
// TAG CREATE
// =========================================================
private TagRebuildJobService $jobs,
) {
}
public function create(
string $slug,
string $label,
?string $description = null,
string $type = 'generic' // NEU
string $type = TagTypes::GENERIC,
): Tag {
$slug = trim($slug);
$normalizedSlug = $this->normalizeSlug($slug);
$label = trim($label);
if ($label === '' || $slug === '') {
throw new \InvalidArgumentException('Label und Slug sind Pflichtfelder.');
if ($normalizedSlug === '' || $label === '') {
throw new InvalidArgumentException('Tag label and slug are required.');
}
if ($this->slugExists($slug)) {
throw new \RuntimeException('Slug existiert bereits.');
if ($this->slugExists($normalizedSlug)) {
throw new RuntimeException(sprintf('Tag slug "%s" already exists.', $normalizedSlug));
}
$tag = new Tag($slug, $label, $description);
$tag->setType($type); // NEU
$tag = new Tag(
$normalizedSlug,
$label,
$description,
TagTypes::normalize($type)
);
$this->em->persist($tag);
$this->em->flush();
@@ -49,18 +52,9 @@ final readonly class TagService
return $tag;
}
// =========================================================
// TAG DELETE
// =========================================================
public function deleteById(string $tagId): void
{
$tag = $this->em->getRepository(Tag::class)->find($tagId);
if (!$tag instanceof Tag) {
throw new \RuntimeException('Tag nicht gefunden.');
}
$tag = $this->findTagById($tagId);
$this->delete($tag);
}
@@ -72,87 +66,103 @@ final readonly class TagService
$this->triggerRebuildIfIdle();
}
// =========================================================
// DOCUMENT TAG SYNC
// =========================================================
public function syncDocumentTags(Document $document, array $newTagIds): void
{
$newTagIds = array_unique($newTagIds);
$normalizedTagIds = $this->normalizeIdList($newTagIds);
/** @var list<DocumentTag> $currentRelations */
$currentRelations = $this->em
->getRepository(DocumentTag::class)
->findBy(['document' => $document]);
$currentTagIds = array_map(
fn(DocumentTag $dt) => (string) $dt->getTag()->getId(),
static fn (DocumentTag $relation): string => (string) $relation->getTag()->getId(),
$currentRelations
);
$toAdd = array_diff($newTagIds, $currentTagIds);
$toRemove = array_diff($currentTagIds, $newTagIds);
$toAdd = array_values(array_diff($normalizedTagIds, $currentTagIds));
$toRemove = array_values(array_diff($currentTagIds, $normalizedTagIds));
foreach ($toAdd as $tagId) {
$tag = $this->em->getRepository(Tag::class)->find($tagId);
if ($tag instanceof Tag) {
$this->em->persist(new DocumentTag($document, $tag));
}
}
foreach ($currentRelations as $relation) {
if (in_array((string) $relation->getTag()->getId(), $toRemove, true)) {
$relationTagId = (string) $relation->getTag()->getId();
if (in_array($relationTagId, $toRemove, true)) {
$this->em->remove($relation);
}
}
if ($toAdd || $toRemove) {
if ($toAdd !== [] || $toRemove !== []) {
$this->em->flush();
$this->triggerRebuildIfIdle();
}
}
// =========================================================
// TAG → DOCUMENT SYNC (Bulk Assign)
// =========================================================
public function syncTagDocuments(Tag $tag, array $newDocumentIds): void
{
$newDocumentIds = array_unique($newDocumentIds);
$normalizedDocumentIds = $this->normalizeIdList($newDocumentIds);
/** @var list<DocumentTag> $currentRelations */
$currentRelations = $this->em
->getRepository(DocumentTag::class)
->findBy(['tag' => $tag]);
$currentDocumentIds = array_map(
fn(DocumentTag $dt) => (string) $dt->getDocument()->getId(),
static fn (DocumentTag $relation): string => (string) $relation->getDocument()->getId(),
$currentRelations
);
$toAdd = array_diff($newDocumentIds, $currentDocumentIds);
$toRemove = array_diff($currentDocumentIds, $newDocumentIds);
$toAdd = array_values(array_diff($normalizedDocumentIds, $currentDocumentIds));
$toRemove = array_values(array_diff($currentDocumentIds, $normalizedDocumentIds));
foreach ($toAdd as $documentId) {
$document = $this->em->getRepository(Document::class)->find($documentId);
if ($document instanceof Document) {
if (
$document instanceof Document
&& $document->getStatus() === Document::STATUS_ACTIVE
) {
$this->em->persist(new DocumentTag($document, $tag));
}
}
foreach ($currentRelations as $relation) {
if (in_array((string) $relation->getDocument()->getId(), $toRemove, true)) {
$relationDocumentId = (string) $relation->getDocument()->getId();
if (in_array($relationDocumentId, $toRemove, true)) {
$this->em->remove($relation);
}
}
if ($toAdd || $toRemove) {
if ($toAdd !== [] || $toRemove !== []) {
$this->em->flush();
$this->triggerRebuildIfIdle();
}
}
// =========================================================
// INTERNAL HELPERS
// =========================================================
/**
 * Loads a tag by its id.
 *
 * @throws InvalidArgumentException when the id is blank after trimming
 * @throws RuntimeException when the repository returns no Tag for the id
 */
private function findTagById(string $tagId): Tag
{
    $normalizedId = trim($tagId);
    if ($normalizedId === '') {
        throw new InvalidArgumentException('Tag id must not be empty.');
    }

    $candidate = $this->em->getRepository(Tag::class)->find($normalizedId);

    return $candidate instanceof Tag
        ? $candidate
        : throw new RuntimeException('Tag not found.');
}
private function slugExists(string $slug): bool
{
@@ -165,6 +175,36 @@ final readonly class TagService
->getSingleScalarResult() > 0;
}
/**
 * Normalizes a raw id list into unique, trimmed, non-empty strings.
 *
 * Entries that cannot be represented as a string (arrays, objects without
 * __toString(), resources) are skipped. Casting them would either produce a
 * bogus id such as "Array" together with a warning, or abort with an Error.
 * Null and blank entries are dropped as well.
 *
 * @param array<mixed> $ids
 * @return list<string>
 */
private function normalizeIdList(array $ids): array
{
    $normalized = [];

    foreach ($ids as $id) {
        // Only scalars and Stringable objects have a meaningful string form.
        if (!is_scalar($id) && !$id instanceof \Stringable) {
            continue;
        }

        $id = trim((string) $id);
        if ($id === '') {
            continue;
        }

        $normalized[] = $id;
    }

    return array_values(array_unique($normalized));
}
/**
 * Canonicalizes a slug: trims and lowercases it, replaces whitespace runs
 * with a hyphen, collapses repeated hyphens, and strips hyphens at both ends.
 */
private function normalizeSlug(string $slug): string
{
    $candidate = mb_strtolower(trim($slug));

    // Applied in order; a PCRE failure (null) leaves the previous value in place.
    foreach (['/\s+/u', '/-+/u'] as $pattern) {
        $candidate = preg_replace($pattern, '-', $candidate) ?? $candidate;
    }

    return trim($candidate, '-');
}
private function triggerRebuildIfIdle(): void
{
if (!$this->jobs->hasActiveJob()) {

View File

@@ -5,8 +5,10 @@ declare(strict_types=1);
namespace App\Tag;
/**
* Zentrale Definition aller erlaubten Tag-Typen.
* Verhindert Magic Strings im Code.
* Central definition of all supported tag types.
*
* This class is intentionally tiny and dependency-free because it is the
* foundation for entity validation, admin forms, routing, and catalog logic.
*/
final class TagTypes
{
@@ -14,6 +16,25 @@ final class TagTypes
public const CATALOG_ENTITY = 'catalog_entity';
public const SALES_SIGNAL = 'sales_signal';
/**
 * Lists every supported tag type value.
 *
 * @return list<string>
 */
public static function all(): array
{
    return [self::GENERIC, self::CATALOG_ENTITY, self::SALES_SIGNAL];
}
/**
* Returns UI choices for forms and admin screens.
*
* @return array<string, string>
*/
public static function choices(): array
{
return [
@@ -23,5 +44,53 @@ final class TagTypes
];
}
private function __construct() {}
/**
 * Returns true if the given value is an allowed tag type.
 *
 * Matching ignores surrounding whitespace and letter case. The value is
 * compared against the canonical list directly: normalize() replaces empty
 * or unknown input with a known fallback type, so validating its result
 * would report every non-null string as valid.
 */
public static function isValid(?string $type): bool
{
    if ($type === null) {
        return false;
    }

    return in_array(mb_strtolower(trim($type)), self::all(), true);
}
/**
 * Normalizes external input into a canonical internal value.
 *
 * The input is trimmed and lowercased. When it is empty or not a known type,
 * the (equally trimmed and lowercased) default is returned if it is a known
 * type, otherwise GENERIC.
 */
public static function normalize(?string $type, string $default = self::GENERIC): string
{
    $candidate = mb_strtolower(trim((string) $type));

    if ($candidate !== '' && in_array($candidate, self::all(), true)) {
        return $candidate;
    }

    $fallback = mb_strtolower(trim($default));

    return self::isKnownDefault($fallback) ? $fallback : self::GENERIC;
}
/**
 * Returns a human-readable label for a tag type.
 *
 * The type is normalized first and then looked up in the flipped choices()
 * map; 'Generic' is returned when no entry matches.
 */
public static function labelFor(string $type): string
{
    $labelsByType = array_flip(self::choices());
    $canonicalType = self::normalize($type);

    return $labelsByType[$canonicalType] ?? 'Generic';
}
/**
 * Tells whether the given value is one of the values returned by all().
 */
private static function isKnownDefault(string $type): bool
{
    $supportedTypes = self::all();

    return in_array($type, $supportedTypes, true);
}
// Private and empty: this final class cannot be instantiated from outside.
private function __construct()
{
}
}

View File

@@ -9,18 +9,81 @@ use Psr\Log\LoggerInterface;
final readonly class TagVectorIndexBuilder
{
private const GRACEFUL_TERMINATION_SECONDS = 2;
public function __construct(
private string $pythonBin,
private string $scriptPath,
private string $tagsNdjsonPath,
private string $vectorTagsIndexPath,
private string $embeddingModel,
private int $timeoutSeconds,
private LoggerInterface $agentLogger,
private IndexMetaManager $metaManager, // ✅ NEU
) {}
private string $pythonBin,
private string $scriptPath,
private string $tagsNdjsonPath,
private string $vectorTagsIndexPath,
private string $embeddingModel,
private int $timeoutSeconds,
private LoggerInterface $agentLogger,
private IndexMetaManager $metaManager,
) {
}
/**
 * Rebuilds the tag vector index from the exported tags NDJSON file.
 *
 * The ingest command writes to a temporary index path. The temporary index
 * and its ".meta.json" companion are moved over the final artifacts only
 * after the command exited with 0 and both files are non-empty. When the
 * NDJSON contains no embeddable tags, the existing final artifacts are
 * removed instead and no command is run.
 *
 * @throws \RuntimeException when the ingest command fails or leaves
 *                           incomplete artifacts; helper failures propagate
 */
public function build(): void
{
$this->assertPreconditions();
$tmpIndex = $this->vectorTagsIndexPath . '.tmp';
$tmpMeta = $tmpIndex . '.meta.json';
$finalIndex = $this->vectorTagsIndexPath;
$finalMeta = $finalIndex . '.meta.json';
$this->ensureTargetDirectoryExists($finalIndex);
// Start from a clean slate so leftovers of an earlier run are never promoted.
$this->cleanupTemporaryArtifacts($tmpIndex, $tmpMeta);
if (!$this->hasEmbeddableTags()) {
// Nothing to embed: drop the final artifacts and record the rebuild without an index.
$this->agentLogger->info('[tags] no embeddable tags found, removing stale tag index artifacts.');
$this->removeFileIfExists($finalIndex);
$this->removeFileIfExists($finalMeta);
$this->commitRuntime(false);
return;
}
$cmd = $this->buildCommand($tmpIndex);
$this->agentLogger->info('[tags] build tag vector index', [
'cmd' => $cmd,
'timeout' => $this->timeoutSeconds,
'embedding_model' => $this->embeddingModel,
]);
try {
$result = $this->runCommand($cmd);
if ($result['exit'] !== 0) {
$this->agentLogger->error('[tags] tag vector ingest failed', [
'exit' => $result['exit'],
'stdout' => $result['stdout'],
'stderr' => $result['stderr'],
]);
throw new \RuntimeException('Tag vector ingest failed (exit=' . $result['exit'] . ')');
}
// A zero exit code is not enough: both temporary files must exist and be non-empty.
// NOTE(review): presumably the ingest script writes the meta file next to the index - confirm.
if (!$this->isUsableArtifact($tmpIndex) || !$this->isUsableArtifact($tmpMeta)) {
throw new \RuntimeException('Tag vector ingest produced incomplete artifacts.');
}
// The index is replaced first, then its meta file; the runtime marker is written last.
$this->atomicReplace($tmpIndex, $finalIndex);
$this->atomicReplace($tmpMeta, $finalMeta);
$this->commitRuntime(true);
$this->agentLogger->info('[tags] tag vector index build completed + runtime committed', [
'index' => $finalIndex,
'meta' => $finalMeta,
]);
} catch (\Throwable $e) {
// Remove whatever temporary files are left, then let the original error propagate.
$this->cleanupTemporaryArtifacts($tmpIndex, $tmpMeta);
throw $e;
}
}
private function assertPreconditions(): void
{
if (!is_file($this->tagsNdjsonPath)) {
throw new \RuntimeException('tags.ndjson missing: ' . $this->tagsNdjsonPath);
@@ -30,65 +93,178 @@ final readonly class TagVectorIndexBuilder
throw new \RuntimeException('Tag ingest script missing: ' . $this->scriptPath);
}
$tmpIndex = $this->vectorTagsIndexPath . '.tmp';
$tmpMeta = $tmpIndex . '.meta.json';
$finalIndex = $this->vectorTagsIndexPath;
$finalMeta = $finalIndex . '.meta.json';
$dir = \dirname($finalIndex);
if (!\is_dir($dir)) {
@\mkdir($dir, 0775, true);
if (trim($this->pythonBin) === '') {
throw new \RuntimeException('Python binary must not be empty.');
}
@\unlink($tmpIndex);
@\unlink($tmpMeta);
if ($this->timeoutSeconds < 1) {
throw new \RuntimeException('Tag vector timeout must be >= 1 second.');
}
}
$cmd = sprintf(
'%s %s %s %s %s 2>&1',
private function buildCommand(string $tmpIndex): string
{
return sprintf(
'%s %s %s %s 2>&1',
escapeshellarg($this->pythonBin),
escapeshellarg($this->scriptPath),
escapeshellarg($this->tagsNdjsonPath),
escapeshellarg($tmpIndex),
escapeshellarg($this->embeddingModel),
);
}
$this->agentLogger->info('[tags] build tag vector index', [
'cmd' => $cmd,
'timeout' => $this->timeoutSeconds,
]);
private function ensureTargetDirectoryExists(string $finalIndexPath): void
{
$dir = dirname($finalIndexPath);
$out = [];
$exit = 0;
exec($cmd, $out, $exit);
if ($exit !== 0) {
$this->agentLogger->error('[tags] tag vector ingest failed', [
'exit' => $exit,
'out' => $out,
]);
throw new \RuntimeException('Tag vector ingest failed (exit=' . $exit . ')');
}
if (!is_file($tmpIndex) || !is_file($tmpMeta)) {
@\unlink($tmpIndex);
@\unlink($tmpMeta);
$this->agentLogger->warning('[tags] no tag index produced (maybe 0 tags).');
if (is_dir($dir)) {
return;
}
$this->atomicReplace($tmpIndex, $finalIndex);
$this->atomicReplace($tmpMeta, $finalMeta);
if (!@mkdir($dir, 0775, true) && !is_dir($dir)) {
throw new \RuntimeException('Unable to create tag vector directory: ' . $dir);
}
}
// ✅ ENTERPRISE COMMIT MARKER
/**
 * Reports whether the tags NDJSON file contains at least one record with a
 * non-blank "tag_id" and a non-blank "text".
 *
 * Blank lines and lines that do not decode to an array are ignored. Reading
 * stops at the first matching record.
 *
 * @throws \RuntimeException when the NDJSON file cannot be opened
 */
private function hasEmbeddableTags(): bool
{
    $stream = @fopen($this->tagsNdjsonPath, 'rb');
    if ($stream === false) {
        throw new \RuntimeException('Unable to read tags NDJSON: ' . $this->tagsNdjsonPath);
    }

    try {
        while (($rawLine = fgets($stream)) !== false) {
            $payload = trim($rawLine);
            $record = $payload === '' ? null : json_decode($payload, true);
            if (!is_array($record)) {
                continue;
            }

            $hasTagId = trim((string) ($record['tag_id'] ?? '')) !== '';
            $hasText = trim((string) ($record['text'] ?? '')) !== '';
            if ($hasTagId && $hasText) {
                return true;
            }
        }

        return false;
    } finally {
        fclose($stream);
    }
}
/**
 * Runs the ingest command, collecting stdout and stderr through non-blocking
 * pipes and enforcing the configured timeout.
 *
 * The exit code is taken from the proc_get_status() call that first sees the
 * process as stopped. Before PHP 8.3 only that call reports the real exit
 * code; a later proc_close() returns -1, which would make every successful
 * run look like a failure. proc_close() is still used as the fallback when
 * no valid code was observed (for example after a timeout).
 *
 * On timeout the process is asked to terminate, given a grace period, and
 * killed with signal 9 if it is still running.
 *
 * @return array{exit:int, stdout:string, stderr:string}
 * @throws \RuntimeException when the process cannot be started or times out
 */
private function runCommand(string $cmd): array
{
    $descriptorSpec = [
        0 => ['pipe', 'r'],
        1 => ['pipe', 'w'],
        2 => ['pipe', 'w'],
    ];

    $process = @proc_open($cmd, $descriptorSpec, $pipes);
    if (!is_resource($process)) {
        throw new \RuntimeException('Could not start tag vector ingest process.');
    }

    // The child gets no input; reads must not block so the timeout can be checked.
    fclose($pipes[0]);
    stream_set_blocking($pipes[1], false);
    stream_set_blocking($pipes[2], false);

    $stdout = '';
    $stderr = '';
    $startedAt = microtime(true);
    $timedOut = false;
    $observedExitCode = null;

    try {
        while (true) {
            $stdout .= stream_get_contents($pipes[1]) ?: '';
            $stderr .= stream_get_contents($pipes[2]) ?: '';

            $status = proc_get_status($process);
            if (!is_array($status) || ($status['running'] ?? false) !== true) {
                // Capture the exit code now: older PHP versions discard it after this call.
                $exitFromStatus = is_array($status) ? ($status['exitcode'] ?? -1) : -1;
                if (is_int($exitFromStatus) && $exitFromStatus >= 0) {
                    $observedExitCode = $exitFromStatus;
                }

                break;
            }

            if ((microtime(true) - $startedAt) > $this->timeoutSeconds) {
                $timedOut = true;
                proc_terminate($process);
                usleep(self::GRACEFUL_TERMINATION_SECONDS * 1000000);

                $status = proc_get_status($process);
                if (is_array($status) && ($status['running'] ?? false) === true) {
                    proc_terminate($process, 9);
                }

                break;
            }

            usleep(100000);
        }

        // Drain whatever was written between the last read and process exit.
        $stdout .= stream_get_contents($pipes[1]) ?: '';
        $stderr .= stream_get_contents($pipes[2]) ?: '';
    } finally {
        fclose($pipes[1]);
        fclose($pipes[2]);
    }

    $closeCode = proc_close($process);
    $exitCode = $observedExitCode ?? $closeCode;

    if ($timedOut) {
        $this->agentLogger->error('[tags] tag vector ingest timed out', [
            'timeout' => $this->timeoutSeconds,
            'stdout' => $stdout,
            'stderr' => $stderr,
        ]);

        throw new \RuntimeException('Tag vector ingest timed out after ' . $this->timeoutSeconds . ' seconds.');
    }

    return [
        'exit' => is_int($exitCode) ? $exitCode : 1,
        'stdout' => trim($stdout),
        'stderr' => trim($stderr),
    ];
}
/**
 * Returns true when the path is a regular file with a size greater than zero.
 */
private function isUsableArtifact(string $path): bool
{
    if (!is_file($path)) {
        return false;
    }

    return filesize($path) > 0;
}
/**
 * Removes each of the given files if it exists, in the order given.
 */
private function cleanupTemporaryArtifacts(string ...$paths): void
{
    array_walk(
        $paths,
        function (string $artifactPath): void {
            $this->removeFileIfExists($artifactPath);
        }
    );
}
/**
 * Deletes the file at the given path when it exists; unlink errors are suppressed.
 */
private function removeFileIfExists(string $path): void
{
    if (!is_file($path)) {
        return;
    }

    @unlink($path);
}
private function commitRuntime(bool $indexPresent): void
{
$this->metaManager->touchRuntime([
'last_tags_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
]);
$this->agentLogger->info('[tags] tag vector index build completed + runtime committed', [
'index' => $finalIndex,
'meta' => $finalMeta,
'tags_index_present' => $indexPresent,
]);
}
@@ -99,6 +275,7 @@ final readonly class TagVectorIndexBuilder
@unlink($tmp);
throw new \RuntimeException('Atomic replace failed for: ' . $final);
}
@unlink($tmp);
}

View File

@@ -6,63 +6,210 @@ namespace App\Tag;
final readonly class TagVectorIndexHealthService
{
private const STATUS_OK = 'OK';
private const STATUS_OK_EMPTY = 'OK_EMPTY';
private const STATUS_INCONSISTENT_STALE_VECTOR = 'INCONSISTENT_STALE_VECTOR';
private const STATUS_INCONSISTENT_MISSING_VECTOR = 'INCONSISTENT_MISSING_VECTOR';
private const STATUS_INCONSISTENT_COUNT_MISMATCH = 'INCONSISTENT_COUNT_MISMATCH';
private const STATUS_INCONSISTENT_INVALID_META = 'INCONSISTENT_INVALID_META';
private const STATUS_UNKNOWN = 'UNKNOWN';
public function __construct(
private string $tagsNdjsonPath,
private string $vectorTagsIndexPath,
private string $vectorTagsMetaPath
) {}
private string $vectorTagsMetaPath,
) {
}
public function check(): array
{
$ndjsonExists = is_file($this->tagsNdjsonPath);
$vectorExists = is_file($this->vectorTagsIndexPath);
$metaExists = is_file($this->vectorTagsMetaPath);
$metaExists = is_file($this->vectorTagsMetaPath);
$ndjsonTagCount = 0;
$ndjsonStats = $this->readNdjsonStats();
$metaStats = $this->readMetaStats();
if ($ndjsonExists) {
$h = @fopen($this->tagsNdjsonPath, 'r');
if ($h !== false) {
while (($line = fgets($h)) !== false) {
$line = trim($line);
if ($line === '') continue;
$data = json_decode($line, true);
if (is_array($data) && !empty($data['tag_id']) && !empty($data['text'])) {
$ndjsonTagCount++;
}
}
fclose($h);
}
}
$vectorTagCount = 0;
if ($metaExists) {
$meta = json_decode((string) file_get_contents($this->vectorTagsMetaPath), true);
if (is_array($meta)) {
$vectorTagCount = count($meta);
}
}
$status = $this->determineStatus($ndjsonTagCount, $vectorExists, $metaExists, $vectorTagCount);
$status = $this->determineStatus(
$ndjsonStats['exported_tag_count'],
$vectorExists,
$metaExists,
$metaStats['vector_tag_count'],
$metaStats['meta_valid']
);
return [
'tags_ndjson_exists' => $ndjsonExists,
'tags_ndjson_count' => $ndjsonTagCount,
'vector_exists' => $vectorExists,
'meta_exists' => $metaExists,
'vector_tag_count' => $vectorTagCount,
'status' => $status,
'tags_ndjson_count' => $ndjsonStats['exported_tag_count'],
'vector_exists' => $vectorExists,
'meta_exists' => $metaExists,
'vector_tag_count' => $metaStats['vector_tag_count'],
'status' => $status,
// Extra diagnostics for admin/CLI.
'tags_ndjson_lines_total' => $ndjsonStats['lines_total'],
'tags_ndjson_invalid_lines' => $ndjsonStats['invalid_lines'],
'tags_ndjson_empty_lines' => $ndjsonStats['empty_lines'],
'tags_with_active_document_ids' => $ndjsonStats['tags_with_document_ids'],
'meta_valid' => $metaStats['meta_valid'],
'paths' => [
'tags_ndjson' => $this->tagsNdjsonPath,
'vector_index' => $this->vectorTagsIndexPath,
'vector_meta' => $this->vectorTagsMetaPath,
],
];
}
/**
 * Scans the tags NDJSON file line by line and collects diagnostic counters.
 *
 * Every physical line increments "lines_total". Whitespace-only lines are
 * counted as "empty_lines". A line is counted as "invalid_lines" when it
 * does not decode to a JSON array/object, or when its "tag_id" or "text"
 * is missing, non-scalar, or empty after trimming. All remaining lines
 * count as "exported_tag_count"; those that additionally carry a non-empty
 * "document_ids" array also count as "tags_with_document_ids".
 *
 * When the file does not exist or cannot be opened, all counters stay at 0.
 *
 * @return array{
 *     lines_total:int,
 *     empty_lines:int,
 *     invalid_lines:int,
 *     exported_tag_count:int,
 *     tags_with_document_ids:int
 * }
 */
private function readNdjsonStats(): array
{
    $stats = [
        'lines_total' => 0,
        'empty_lines' => 0,
        'invalid_lines' => 0,
        'exported_tag_count' => 0,
        'tags_with_document_ids' => 0,
    ];

    if (!is_file($this->tagsNdjsonPath)) {
        return $stats;
    }

    // Best effort: an unreadable file is reported as "no stats" rather than an error.
    $handle = @fopen($this->tagsNdjsonPath, 'rb');
    if ($handle === false) {
        return $stats;
    }

    try {
        while (($line = fgets($handle)) !== false) {
            $stats['lines_total']++;

            $line = trim($line);
            if ($line === '') {
                $stats['empty_lines']++;
                continue;
            }

            $data = json_decode($line, true);
            if (!is_array($data)) {
                $stats['invalid_lines']++;
                continue;
            }

            // Only scalars are cast to string. Casting an array would yield the
            // literal "Array" (plus a warning) and wrongly count as a valid value.
            $rawTagId = $data['tag_id'] ?? null;
            $rawText = $data['text'] ?? null;
            $tagId = is_scalar($rawTagId) ? trim((string) $rawTagId) : '';
            $text = is_scalar($rawText) ? trim((string) $rawText) : '';

            if ($tagId === '' || $text === '') {
                $stats['invalid_lines']++;
                continue;
            }

            $stats['exported_tag_count']++;

            $documentIds = $data['document_ids'] ?? null;
            if (is_array($documentIds) && $documentIds !== []) {
                $stats['tags_with_document_ids']++;
            }
        }
    } finally {
        // Always release the handle, even if decoding throws unexpectedly.
        fclose($handle);
    }

    return $stats;
}
/**
 * Reads the vector meta JSON file and derives the number of indexed tags.
 *
 * The meta file is considered valid when it decodes to either
 *  - a JSON list (count = number of entries), or
 *  - a JSON object whose keys are all non-negative integer indexes
 *    (count = number of keys).
 *
 * A missing, unreadable, empty, or differently shaped file yields
 * a count of 0 with "meta_valid" set to false.
 *
 * @return array{vector_tag_count:int, meta_valid:bool}
 */
private function readMetaStats(): array
{
    $invalid = [
        'vector_tag_count' => 0,
        'meta_valid' => false,
    ];

    if (!is_file($this->vectorTagsMetaPath)) {
        return $invalid;
    }

    $raw = file_get_contents($this->vectorTagsMetaPath);
    if (!is_string($raw) || trim($raw) === '') {
        return $invalid;
    }

    $decoded = json_decode($raw, true);
    if (!is_array($decoded)) {
        return $invalid;
    }

    if (!array_is_list($decoded)) {
        // PHP casts canonical decimal string keys ("1", "42") to int when the
        // array is built, so index-keyed JSON objects arrive here with int keys.
        // Keys that stay strings (e.g. "007") are accepted only when all digits.
        foreach (array_keys($decoded) as $key) {
            $isIndexKey = is_int($key) ? $key >= 0 : ctype_digit($key);
            if (!$isIndexKey) {
                return $invalid;
            }
        }
    }

    return [
        'vector_tag_count' => count($decoded),
        'meta_valid' => true,
    ];
}
/**
 * Maps the observed export/index state onto one of the STATUS_* constants.
 *
 * Checks run in priority order: empty export, missing index artifacts,
 * invalid meta, count mismatch, and finally the healthy state.
 *
 * @param int  $ndjsonTagCount Number of tags counted in the NDJSON export.
 * @param bool $vectorExists   Whether the vector index artifact is present.
 * @param bool $metaExists     Whether the vector meta artifact is present.
 * @param int  $vectorTagCount Number of tags counted in the vector meta.
 * @param bool $metaValid      Whether the vector meta was recognised as valid.
 */
private function determineStatus(
    int $ndjsonTagCount,
    bool $vectorExists,
    bool $metaExists,
    int $vectorTagCount,
    bool $metaValid
): string {
    $anyArtifact = $vectorExists || $metaExists;
    $allArtifacts = $vectorExists && $metaExists;

    // Nothing exported: either everything is consistently empty or the index is stale.
    if ($ndjsonTagCount === 0) {
        return $anyArtifact
            ? self::STATUS_INCONSISTENT_STALE_VECTOR
            : self::STATUS_OK_EMPTY;
    }

    if ($ndjsonTagCount > 0 && !$allArtifacts) {
        return self::STATUS_INCONSISTENT_MISSING_VECTOR;
    }

    if ($metaExists && !$metaValid) {
        return self::STATUS_INCONSISTENT_INVALID_META;
    }

    if ($ndjsonTagCount !== $vectorTagCount) {
        return self::STATUS_INCONSISTENT_COUNT_MISMATCH;
    }

    // Counts are equal from here on; only a fully present, valid index is healthy.
    $healthy = $ndjsonTagCount > 0 && $allArtifacts && $metaValid;

    return $healthy ? self::STATUS_OK : self::STATUS_UNKNOWN;
}
}

View File

@@ -12,18 +12,29 @@ final readonly class TagVectorSearchClient
/**
* Minimum similarity score required for a tag to be considered.
*/
private const MIN_SCORE = 0.72;
public const MIN_SCORE = 0.72;
/**
* Default result size when callers do not specify a limit.
*/
private const DEFAULT_LIMIT = 8;
/**
* Hard limit to prevent excessive requests.
*/
private const MAX_LIMIT = 50;
/**
* HTTP timeout for the Python vector service.
*/
private const TIMEOUT_SECONDS = 10;
public function __construct(
private HttpClientInterface $http,
private string $serviceUrl,
private LoggerInterface $agentLogger,
) {}
private string $serviceUrl,
private LoggerInterface $agentLogger,
) {
}
/**
* Executes a vector search against the Python tag index.
@@ -33,43 +44,51 @@ final readonly class TagVectorSearchClient
* {
* "tag_id": "...",
* "score": 0.73,
* "label": "Geräte", // optional (new)
* "tag_type": "catalog_entity" // optional (new)
* "label": "Geräte",
* "tag_type": "catalog_entity"
* }
* ]
*
* @return array<int, array{
* @return list<array{
* tag_id:string,
* score:float,
* label?:string,
* tag_type?:string
* label:string,
* tag_type:string
* }>
*/
public function search(string $query, int $limit = 8): array
public function search(string $query, int $limit = self::DEFAULT_LIMIT): array
{
$query = trim($query);
if ($query === '') {
return [];
}
$limit = max(1, min($limit, self::MAX_LIMIT));
$serviceUrl = rtrim(trim($this->serviceUrl), '/');
if ($serviceUrl === '') {
$this->agentLogger->warning('Tag vector service URL is empty.');
return [];
}
try {
$response = $this->http->request(
'POST',
rtrim($this->serviceUrl, '/') . '/search-tags',
$serviceUrl . '/search-tags',
[
'json' => [
'query' => $query,
'limit' => $limit,
],
'timeout' => 10,
'timeout' => self::TIMEOUT_SECONDS,
]
);
if ($response->getStatusCode() !== 200) {
$this->agentLogger->warning(
'Tag vector service returned non-200',
'Tag vector service returned non-200.',
['status' => $response->getStatusCode()]
);
@@ -77,10 +96,9 @@ final readonly class TagVectorSearchClient
}
$data = $response->toArray(false);
} catch (\Throwable $e) {
$this->agentLogger->warning(
'Tag vector service unreachable',
'Tag vector service unreachable.',
['error' => $e->getMessage()]
);
@@ -88,18 +106,33 @@ final readonly class TagVectorSearchClient
}
if (!is_array($data)) {
$this->agentLogger->warning('Tag vector service returned invalid payload');
$this->agentLogger->warning('Tag vector service returned invalid payload.');
return [];
}
$hits = [];
return $this->normalizeHits($data, $limit);
}
foreach ($data as $row) {
/**
* @param array<mixed> $rows
* @return list<array{
* tag_id:string,
* score:float,
* label:string,
* tag_type:string
* }>
*/
private function normalizeHits(array $rows, int $limit): array
{
$hitsByTagId = [];
foreach ($rows as $row) {
if (!is_array($row)) {
continue;
}
$tagId = (string)($row['tag_id'] ?? '');
$tagId = trim((string) ($row['tag_id'] ?? ''));
$score = $row['score'] ?? null;
if ($tagId === '' || !is_numeric($score)) {
@@ -112,24 +145,45 @@ final readonly class TagVectorSearchClient
continue;
}
$hit = [
$normalizedHit = [
'tag_id' => $tagId,
'score' => $score,
'score' => $score,
'label' => trim((string) ($row['label'] ?? '')),
'tag_type' => TagTypes::normalize((string) ($row['tag_type'] ?? TagTypes::GENERIC)),
];
// Optional: label
if (isset($row['label']) && is_string($row['label'])) {
$hit['label'] = $row['label'];
}
$existingHit = $hitsByTagId[$tagId] ?? null;
// Optional: tag_type
if (isset($row['tag_type']) && is_string($row['tag_type'])) {
$hit['tag_type'] = $row['tag_type'];
if ($existingHit === null || $normalizedHit['score'] > $existingHit['score']) {
$hitsByTagId[$tagId] = $normalizedHit;
}
$hits[] = $hit;
}
return $hits;
if ($hitsByTagId === []) {
return [];
}
$hits = array_values($hitsByTagId);
usort(
$hits,
static function (array $left, array $right): int {
$scoreComparison = $right['score'] <=> $left['score'];
if ($scoreComparison !== 0) {
return $scoreComparison;
}
$typeComparison = strcmp($left['tag_type'], $right['tag_type']);
if ($typeComparison !== 0) {
return $typeComparison;
}
return strcmp($left['tag_id'], $right['tag_id']);
}
);
return array_slice($hits, 0, $limit);
}
}