add tagging

This commit is contained in:
team 1
2026-02-21 16:23:34 +01:00
parent 5a3852db12
commit cf5b473034
23 changed files with 1984 additions and 85 deletions

View File

@@ -0,0 +1,159 @@
<?php
declare(strict_types=1);
namespace App\Tag;
use App\Entity\DocumentTag;
use App\Entity\Tag;
use Doctrine\ORM\EntityManagerInterface;
/**
 * Writes all tags, together with the ids of the documents they are attached to,
 * into an NDJSON file (one JSON object per line).
 */
final class TagNdjsonExporter
{
    public function __construct(
        private EntityManagerInterface $em,
        private string $tagsNdjsonPath,
    ) {}

    /**
     * Export all tags into NDJSON (streaming) and switch the result into place
     * via a "<path>.tmp" file followed by rename().
     *
     * Line format:
     * {"tag_id":"...","text":"label\nslug\noptional description","document_ids":["...","..."]}
     *
     * When there are no tags, an empty file is written and switched into place.
     * If anything fails before the switch, the temporary file is removed and the
     * previously exported file is left untouched.
     *
     * @return array{tags:int, lines:int, bytes:int, path:string}
     *         tags  = number of rows returned by the tag query,
     *         lines = number of NDJSON lines actually written,
     *         bytes = number of bytes written to the export file.
     *
     * @throws \RuntimeException when the target directory or file cannot be written
     */
    public function export(): array
    {
        $dir = \dirname($this->tagsNdjsonPath);
        // The trailing is_dir() covers a concurrent process creating the directory first.
        if (!\is_dir($dir) && !@\mkdir($dir, 0775, true) && !\is_dir($dir)) {
            throw new \RuntimeException('Cannot create directory for tags NDJSON: ' . $dir);
        }

        $tmpPath = $this->tagsNdjsonPath . '.tmp';
        $fh = @\fopen($tmpPath, 'wb');
        if ($fh === false) {
            throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
        }

        $tagCount = 0;
        $lines = 0;
        $bytes = 0;
        $completed = false;

        try {
            // 1) Load all tags (expected to be a small set), ordered by label.
            $tags = $this->em->createQueryBuilder()
                ->select('t')
                ->from(Tag::class, 't')
                ->orderBy('t.label', 'ASC')
                ->getQuery()
                ->getResult();

            if (\is_array($tags) && $tags !== []) {
                $tagCount = \count($tags);

                // 2) Build the tagId => docIds map in a single query.
                $tagToDocs = $this->loadDocumentIdsByTag();

                // 3) Stream one NDJSON line per tag.
                foreach ($tags as $tag) {
                    if (!$tag instanceof Tag) {
                        continue;
                    }

                    $json = $this->encodeTag($tag, $tagToDocs);
                    if ($json === null) {
                        // Skip a line that cannot be encoded, but keep the export running.
                        continue;
                    }

                    $bytes += $this->writeLine($fh, $json . "\n", $tmpPath);
                    $lines++;
                }
            }

            $completed = true;
        } finally {
            // Always release the handle; never leave a half-written tmp file behind.
            \fclose($fh);
            if (!$completed) {
                @\unlink($tmpPath);
            }
        }

        $this->atomicReplace($tmpPath, $this->tagsNdjsonPath);

        return [
            'tags' => $tagCount,
            'lines' => $lines,
            'bytes' => $bytes,
            'path' => $this->tagsNdjsonPath,
        ];
    }

    /**
     * Load all (tag id, document id) pairs and group the document ids by tag id.
     *
     * NOTE(review): the ids come from DQL IDENTITY() and are cast to string as-is.
     * If ids are stored in a binary form, these keys may not match
     * (string) $tag->getId() used by the caller — confirm against the entity mapping.
     *
     * @return array<string, list<string>>
     */
    private function loadDocumentIdsByTag(): array
    {
        $rows = $this->em->createQueryBuilder()
            ->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
            ->from(DocumentTag::class, 'dt')
            ->getQuery()
            ->getArrayResult();

        $tagToDocs = [];
        foreach ($rows as $row) {
            $tagId = (string) ($row['tagId'] ?? '');
            $docId = (string) ($row['docId'] ?? '');
            if ($tagId === '' || $docId === '') {
                continue;
            }
            $tagToDocs[$tagId][] = $docId;
        }

        return $tagToDocs;
    }

    /**
     * Build the JSON line for one tag, or null when json_encode() fails.
     *
     * @param array<string, list<string>> $tagToDocs
     */
    private function encodeTag(Tag $tag, array $tagToDocs): ?string
    {
        $tagId = (string) $tag->getId();

        // De-duplicate document ids for safety. array_unique() is used rather than
        // array keys so that numeric-looking ids stay strings in the JSON output.
        $docIds = $tagToDocs[$tagId] ?? [];
        if ($docIds !== []) {
            $docIds = \array_values(\array_unique($docIds));
        }

        // "text" is the embedding source for tag vectors later:
        // keep it short but semantically useful.
        $textParts = [
            $tag->getLabel(),
            $tag->getSlug(),
        ];
        $desc = $tag->getDescription();
        if (\is_string($desc) && \trim($desc) !== '') {
            $textParts[] = \trim($desc);
        }

        $json = \json_encode(
            [
                'tag_id' => $tagId,
                'text' => \implode("\n", $textParts),
                'document_ids' => $docIds,
            ],
            JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES,
        );

        return \is_string($json) ? $json : null;
    }

    /**
     * Write one payload completely and return the number of bytes written.
     *
     * @param resource $fh
     *
     * @throws \RuntimeException on a failed or short write
     */
    private function writeLine($fh, string $payload, string $path): int
    {
        $written = \fwrite($fh, $payload);
        if ($written === false || $written !== \strlen($payload)) {
            throw new \RuntimeException('Cannot write tags NDJSON: ' . $path);
        }

        return $written;
    }

    /**
     * Move the temporary file over the final path.
     *
     * rename() is tried first; if it fails, a copy + unlink fallback is used,
     * which is best effort and not atomic.
     *
     * @throws \RuntimeException when neither rename nor copy succeeds
     */
    private function atomicReplace(string $tmpPath, string $finalPath): void
    {
        // Ensure the old file can be replaced on Windows-like FS too (best effort).
        if (\is_file($finalPath)) {
            @\chmod($finalPath, 0664);
        }

        if (!@\rename($tmpPath, $finalPath)) {
            // If rename fails, try the copy + unlink fallback.
            if (!@\copy($tmpPath, $finalPath)) {
                @\unlink($tmpPath);
                throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);
            }
            @\unlink($tmpPath);
        }

        @\chmod($finalPath, 0664);
    }
}

View File

@@ -0,0 +1,98 @@
<?php
declare(strict_types=1);
namespace App\Tag;
use Doctrine\DBAL\ArrayParameterType;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Component\Uid\Uuid;
/**
 * Maps a free-text query to a set of candidate document ids by first finding
 * the closest tags and then collecting the documents attached to those tags.
 */
final class TagRoutingService
{
    /** Number of tag hits requested from the tag vector search. */
    private const DEFAULT_TOPK = 8;

    /** Routing is skipped when the best tag hit scores below this value. */
    private const MIN_BEST_SCORE = 0.10;

    /** Upper bound on the number of distinct document ids returned. */
    private const MAX_CANDIDATE_DOCS = 200;

    public function __construct(
        private readonly TagVectorSearchClient $tagSearch,
        private readonly EntityManagerInterface $em,
    ) {}

    /**
     * Resolve candidate document ids for a query.
     *
     * Returns null whenever no usable candidate set exists: empty query, no tag
     * hits, best score below MIN_BEST_SCORE, no valid tag ids, or no document
     * that could be resolved. A non-null result is therefore never empty.
     *
     * @return string[]|null distinct document UUID strings (at most MAX_CANDIDATE_DOCS), or null
     */
    public function route(string $query): ?array
    {
        $query = trim($query);
        if ($query === '') {
            return null;
        }

        $hits = $this->tagSearch->search($query, self::DEFAULT_TOPK);
        if ($hits === []) {
            return null;
        }

        // Use the maximum score over all hits rather than trusting that the
        // search client returns them sorted best-first.
        $bestScore = null;
        foreach ($hits as $hit) {
            $score = (float) ($hit['score'] ?? 0.0);
            if ($bestScore === null || $score > $bestScore) {
                $bestScore = $score;
            }
        }
        if ($bestScore === null || $bestScore < self::MIN_BEST_SCORE) {
            return null;
        }

        // Convert tag UUID strings to binary(16); ids that do not parse are skipped.
        $tagBinaryIds = [];
        foreach ($hits as $hit) {
            $id = (string) ($hit['tag_id'] ?? '');
            if ($id === '') {
                continue;
            }
            try {
                $tagBinaryIds[] = Uuid::fromString($id)->toBinary();
            } catch (\Throwable) {
                continue;
            }
        }
        if ($tagBinaryIds === []) {
            return null;
        }

        // Direct DBAL query (binary-safe).
        $result = $this->em->getConnection()->executeQuery(
            'SELECT document_id FROM document_tag WHERE tag_id IN (:tagIds)',
            ['tagIds' => $tagBinaryIds],
            ['tagIds' => ArrayParameterType::BINARY],
        );

        // Rows are streamed so that the cap below also bounds how many rows
        // get turned into PHP arrays.
        $docIds = [];
        try {
            foreach ($result->iterateAssociative() as $row) {
                if (!isset($row['document_id'])) {
                    continue;
                }
                try {
                    $uuid = Uuid::fromBinary($row['document_id']);
                } catch (\Throwable) {
                    // Value could not be read as a UUID: skip the row.
                    continue;
                }

                // Keyed by UUID string so duplicates collapse.
                $docIds[(string) $uuid] = true;
                if (count($docIds) >= self::MAX_CANDIDATE_DOCS) {
                    break;
                }
            }
        } finally {
            // The loop may stop early, so release the result explicitly.
            $result->free();
        }

        // An empty list is reported as null, like every other "no candidates" path.
        return $docIds === [] ? null : array_keys($docIds);
    }
}

View File

@@ -0,0 +1,107 @@
<?php
declare(strict_types=1);
namespace App\Tag;
use Psr\Log\LoggerInterface;
/**
 * Runs the external Python ingest script that turns tags.ndjson into a tag
 * vector index, then switches the freshly built files into place.
 */
final class TagVectorIndexBuilder
{
    /** Pause between two polls of the running ingest process, in microseconds. */
    private const POLL_INTERVAL_US = 100_000;

    public function __construct(
        private readonly string $pythonBin,
        private readonly string $scriptPath,
        private readonly string $tagsNdjsonPath,
        private readonly string $vectorTagsIndexPath,
        private readonly string $embeddingModel,
        private readonly int $timeoutSeconds,
        private readonly LoggerInterface $agentLogger,
    ) {}

    /**
     * Build the tag vector index into "<index>.tmp" (+ ".meta.json") and move
     * both files over the final paths on success.
     *
     * The run is aborted after $timeoutSeconds; a value <= 0 disables the limit.
     * If the script exits with 0 but produced no index/meta file, this is logged
     * as a warning and the existing final files are left as they are.
     *
     * @throws \RuntimeException when inputs are missing, the process cannot be
     *                           started, it fails or times out, or the final
     *                           files cannot be replaced
     */
    public function build(): void
    {
        if (!is_file($this->tagsNdjsonPath)) {
            throw new \RuntimeException('tags.ndjson missing: ' . $this->tagsNdjsonPath);
        }
        if (!is_file($this->scriptPath)) {
            throw new \RuntimeException('Tag ingest script missing: ' . $this->scriptPath);
        }

        $tmpIndex = $this->vectorTagsIndexPath . '.tmp';
        $tmpMeta = $tmpIndex . '.meta.json';
        $finalIndex = $this->vectorTagsIndexPath;
        $finalMeta = $finalIndex . '.meta.json';

        // Ensure the output dir exists; the trailing is_dir() covers a concurrent mkdir.
        $dir = \dirname($finalIndex);
        if (!\is_dir($dir) && !@\mkdir($dir, 0775, true) && !\is_dir($dir)) {
            throw new \RuntimeException('Cannot create tag index directory: ' . $dir);
        }

        // Clean tmp leftovers from an earlier run.
        $this->removeFiles($tmpIndex, $tmpMeta);

        // Positional args:
        // python vector_ingest_tags.py <tags.ndjson> <out.tmp> <model>
        $command = [
            $this->pythonBin,
            $this->scriptPath,
            $this->tagsNdjsonPath,
            $tmpIndex,
            $this->embeddingModel,
        ];

        $this->agentLogger->info('[tags] build tag vector index', [
            // Shell-quoted rendering for the log only; the process is started without a shell.
            'cmd' => \implode(' ', \array_map(\escapeshellarg(...), $command)),
            'timeout' => $this->timeoutSeconds,
        ]);

        $run = $this->runIngest($command);

        if ($run['timedOut']) {
            $this->removeFiles($tmpIndex, $tmpMeta);
            $this->agentLogger->error('[tags] tag vector ingest timed out', [
                'timeout' => $this->timeoutSeconds,
                'out' => $run['out'],
            ]);
            throw new \RuntimeException(
                'Tag vector ingest timed out after ' . $this->timeoutSeconds . ' seconds'
            );
        }

        if ($run['exit'] !== 0) {
            $this->removeFiles($tmpIndex, $tmpMeta);
            $this->agentLogger->error('[tags] tag vector ingest failed', [
                'exit' => $run['exit'],
                'out' => $run['out'],
            ]);
            throw new \RuntimeException('Tag vector ingest failed (exit=' . $run['exit'] . ')');
        }

        // If no tags -> python may remove outputs and exit 0.
        if (!is_file($tmpIndex) || !is_file($tmpMeta)) {
            // Treat as "no index" rather than a hard error.
            $this->removeFiles($tmpIndex, $tmpMeta);
            $this->agentLogger->warning('[tags] no tag index produced (maybe 0 tags).');
            return;
        }

        // Switch into place. Each file is replaced on its own, so the index/meta
        // pair is not swapped as a single atomic unit.
        $this->atomicReplace($tmpIndex, $finalIndex);
        $this->atomicReplace($tmpMeta, $finalMeta);

        $this->agentLogger->info('[tags] tag vector index build completed', [
            'index' => $finalIndex,
            'meta' => $finalMeta,
        ]);
    }

    /**
     * Run the ingest command without a shell, capturing stdout and stderr
     * together, and kill it once the configured timeout has elapsed.
     *
     * NOTE(review): the timeout relies on non-blocking reads from the process
     * pipe; on platforms where pipes cannot be made non-blocking the read may
     * block until the process ends — confirm if such a platform is targeted.
     *
     * @param list<string> $command program followed by its arguments
     *
     * @return array{exit:int, out:list<string>, timedOut:bool}
     *
     * @throws \RuntimeException when the process cannot be started
     */
    private function runIngest(array $command): array
    {
        $pipes = [];
        $process = \proc_open(
            $command,
            [
                1 => ['pipe', 'w'],
                2 => ['redirect', 1], // merge stderr into stdout, as "2>&1" did
            ],
            $pipes,
        );
        if (!\is_resource($process)) {
            throw new \RuntimeException('Cannot start tag vector ingest: ' . $this->pythonBin);
        }

        $stdout = $pipes[1];
        \stream_set_blocking($stdout, false);

        $deadline = $this->timeoutSeconds > 0
            ? \microtime(true) + $this->timeoutSeconds
            : null;

        $buffer = '';
        $exit = -1;
        $timedOut = false;

        try {
            while (true) {
                // Drain whatever is available so the child never blocks on a full pipe.
                $chunk = \stream_get_contents($stdout);
                if (\is_string($chunk)) {
                    $buffer .= $chunk;
                }

                $status = \proc_get_status($process);
                if (!$status['running']) {
                    // Take the exit code from this first "not running" status.
                    $exit = (int) $status['exitcode'];
                    $rest = \stream_get_contents($stdout);
                    if (\is_string($rest)) {
                        $buffer .= $rest;
                    }
                    break;
                }

                if ($deadline !== null && \microtime(true) >= $deadline) {
                    \proc_terminate($process, 9); // 9 = SIGKILL
                    $timedOut = true;
                    break;
                }

                \usleep(self::POLL_INTERVAL_US);
            }
        } finally {
            \fclose($stdout);
            \proc_close($process);
        }

        $buffer = \rtrim(\str_replace("\r\n", "\n", $buffer));

        return [
            'exit' => $exit,
            'out' => $buffer === '' ? [] : \explode("\n", $buffer),
            'timedOut' => $timedOut,
        ];
    }

    /** Remove the given files if present; missing files are ignored. */
    private function removeFiles(string ...$paths): void
    {
        foreach ($paths as $path) {
            if (\is_file($path)) {
                @\unlink($path);
            }
        }
    }

    /**
     * Move $tmp over $final: rename() first, then a non-atomic copy + unlink fallback.
     *
     * @throws \RuntimeException when neither rename nor copy succeeds
     */
    private function atomicReplace(string $tmp, string $final): void
    {
        if (!@rename($tmp, $final)) {
            if (!@copy($tmp, $final)) {
                @unlink($tmp);
                throw new \RuntimeException('Atomic replace failed for: ' . $final);
            }
            @unlink($tmp);
        }

        @chmod($final, 0664);
    }
}

View File

@@ -0,0 +1,88 @@
<?php
declare(strict_types=1);
namespace App\Tag;
use Psr\Log\LoggerInterface;
/**
 * Thin client around the external Python tag vector search script.
 */
final readonly class TagVectorSearchClient
{
    /** Maximum number of trailing stderr bytes included in a failure log entry. */
    private const STDERR_LOG_BYTES = 4000;

    public function __construct(
        private string $pythonBin,
        private string $scriptPath,
        private string $vectorTagsIndexPath,
        private string $vectorTagsMetaPath,
        private string $embeddingModel,
        private LoggerInterface $agentLogger,
    ) {}

    /**
     * Search the tag vector index for tags matching the query.
     *
     * Returns an empty list when the script or the index/meta files are missing,
     * when the process fails, or when its output is not a JSON array of rows.
     * Rows without a non-empty "tag_id" or a numeric "score" are dropped.
     * Hits are returned in the order the script emitted them.
     *
     * @param string $query free-text query, passed to the script as one argument
     * @param int    $limit requested number of hits, clamped to the range 1..50
     *
     * @return array<int, array{tag_id:string, score:float}>
     */
    public function search(string $query, int $limit = 8): array
    {
        if (!is_file($this->scriptPath)) {
            $this->agentLogger->warning('Tag vector search script missing: ' . $this->scriptPath);
            return [];
        }
        if (!is_file($this->vectorTagsIndexPath) || !is_file($this->vectorTagsMetaPath)) {
            // No tag index available yet => no routing.
            return [];
        }

        $limit = max(1, min($limit, 50));

        // Positional args, aligned with existing VectorSearchClient approach:
        // python vector_search_tags.py <query> <limit> <index> <meta> <model>
        $stdout = $this->runSearch([
            $this->pythonBin,
            $this->scriptPath,
            $query,
            (string) $limit,
            $this->vectorTagsIndexPath,
            $this->vectorTagsMetaPath,
            $this->embeddingModel,
        ]);
        if ($stdout === null || trim($stdout) === '') {
            return [];
        }

        try {
            $data = json_decode($stdout, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            $this->agentLogger->warning('Tag vector search returned invalid JSON', [
                'error' => $e->getMessage(),
            ]);
            return [];
        }
        if (!is_array($data)) {
            return [];
        }

        $hits = [];
        foreach ($data as $row) {
            if (!is_array($row)) {
                continue;
            }

            $tagId = (string) ($row['tag_id'] ?? '');
            $score = $row['score'] ?? null;
            if ($tagId === '' || !is_numeric($score)) {
                continue;
            }

            $hits[] = [
                'tag_id' => $tagId,
                'score' => (float) $score,
            ];
        }

        return $hits;
    }

    /**
     * Run the search script and return its stdout, or null on failure.
     *
     * The command is started from an argument array, so no shell is involved
     * and the query reaches the script as given. stderr is captured separately
     * from stdout so that diagnostics printed by the script cannot corrupt the
     * JSON payload; it is only used for logging when the process fails.
     *
     * @param list<string> $command program followed by its arguments
     */
    private function runSearch(array $command): ?string
    {
        // stderr goes to a temp file rather than a second pipe, so reading
        // stdout to the end cannot deadlock on a full stderr pipe.
        $stderrFile = \tmpfile();
        $nullDevice = \PHP_OS_FAMILY === 'Windows' ? 'NUL' : '/dev/null';

        try {
            $pipes = [];
            $process = \proc_open(
                $command,
                [
                    1 => ['pipe', 'w'],
                    2 => $stderrFile !== false ? $stderrFile : ['file', $nullDevice, 'w'],
                ],
                $pipes,
            );
            if (!\is_resource($process)) {
                $this->agentLogger->warning('Tag vector search could not be started: ' . $this->pythonBin);
                return null;
            }

            $stdout = \stream_get_contents($pipes[1]);
            \fclose($pipes[1]);
            $exitCode = \proc_close($process);

            if ($exitCode !== 0) {
                $stderr = '';
                if ($stderrFile !== false) {
                    \rewind($stderrFile);
                    $stderr = (string) \stream_get_contents($stderrFile);
                }
                $this->agentLogger->warning('Tag vector search failed', [
                    'exit' => $exitCode,
                    // Only the tail of stderr is logged.
                    'stderr' => \substr(\trim($stderr), -self::STDERR_LOG_BYTES),
                ]);
                return null;
            }

            return $stdout === false ? null : $stdout;
        } finally {
            if ($stderrFile !== false) {
                \fclose($stderrFile);
            }
        }
    }
}