add tagging
This commit is contained in:
159
src/Tag/TagNdjsonExporter.php
Normal file
159
src/Tag/TagNdjsonExporter.php
Normal file
@@ -0,0 +1,159 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Tag;
|
||||
|
||||
use App\Entity\DocumentTag;
|
||||
use App\Entity\Tag;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
|
||||
final class TagNdjsonExporter
{
    public function __construct(
        private EntityManagerInterface $em,
        private string $tagsNdjsonPath,
    ) {}

    /**
     * Export all tags into an NDJSON file (one JSON object per line).
     *
     * The data is written to "<target>.tmp" first and then moved over the
     * target path, so readers never observe a half-written export.
     *
     * Line format:
     * {"tag_id":"...","text":"label\nslug\noptional description","document_ids":["...","..."]}
     *
     * The database is queried before the temp file is opened, and the write
     * phase is guarded: on any failure the file handle is closed and the temp
     * file is removed, leaving the previous export untouched.
     *
     * @return array{tags:int, lines:int, bytes:int, path:string}
     *               "tags" is the number of loaded tags, "lines" the number of
     *               lines actually written (tags that cannot be JSON-encoded
     *               are skipped, so "lines" may be lower than "tags").
     *
     * @throws \RuntimeException when the target directory or temp file cannot
     *                           be created, a write fails, or the final
     *                           replace fails
     */
    public function export(): array
    {
        $tags = $this->loadTags();
        // No tags -> nothing to join; an empty file is still written below.
        $tagToDocs = $tags === [] ? [] : $this->loadDocumentIdsByTag();

        $dir = \dirname($this->tagsNdjsonPath);
        // The trailing is_dir() covers a concurrent process creating the directory.
        if (!\is_dir($dir) && !@\mkdir($dir, 0775, true) && !\is_dir($dir)) {
            throw new \RuntimeException('Cannot create directory for tags NDJSON: ' . $dir);
        }

        $tmpPath = $this->tagsNdjsonPath . '.tmp';

        $fh = @\fopen($tmpPath, 'wb');
        if ($fh === false) {
            throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
        }

        $lines = 0;
        $written = false;
        $closed = false;

        try {
            foreach ($tags as $tag) {
                if (!$tag instanceof Tag) {
                    continue;
                }

                $json = $this->encodeLine($tag, $tagToDocs[(string) $tag->getId()] ?? []);
                if ($json === null) {
                    // skip invalid line but keep export running
                    continue;
                }

                $payload = $json . "\n";
                // A failed or short write would otherwise produce a truncated
                // export that still replaces the previous good file.
                if (\fwrite($fh, $payload) !== \strlen($payload)) {
                    throw new \RuntimeException('Failed writing tags NDJSON: ' . $tmpPath);
                }

                $lines++;
            }

            $written = true;
        } finally {
            $closed = \fclose($fh);
            if (!$written || !$closed) {
                @\unlink($tmpPath);
            }
        }

        if (!$closed) {
            throw new \RuntimeException('Failed closing tags NDJSON: ' . $tmpPath);
        }

        $this->atomicReplace($tmpPath, $this->tagsNdjsonPath);

        // Make sure the reported size belongs to the file that was just moved
        // into place and not to a cached stat entry for the same path.
        \clearstatcache(true, $this->tagsNdjsonPath);

        return [
            'tags' => \count($tags),
            'lines' => $lines,
            'bytes' => (int) @\filesize($this->tagsNdjsonPath),
            'path' => $this->tagsNdjsonPath,
        ];
    }

    /**
     * Load all Tag entities ordered by label.
     *
     * @return array<int, mixed> query result; callers still filter for Tag instances
     */
    private function loadTags(): array
    {
        $tags = $this->em->createQueryBuilder()
            ->select('t')
            ->from(Tag::class, 't')
            ->orderBy('t.label', 'ASC')
            ->getQuery()
            ->getResult();

        return \is_array($tags) ? $tags : [];
    }

    /**
     * Build a "tag id => list of document ids" map from DocumentTag rows.
     *
     * Rows with an empty tag or document identifier are ignored.
     *
     * @return array<array-key, list<string>>
     */
    private function loadDocumentIdsByTag(): array
    {
        $rows = $this->em->createQueryBuilder()
            ->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
            ->from(DocumentTag::class, 'dt')
            ->getQuery()
            ->getArrayResult();

        $tagToDocs = [];
        foreach ($rows as $row) {
            $tagId = (string) ($row['tagId'] ?? '');
            $docId = (string) ($row['docId'] ?? '');
            if ($tagId === '' || $docId === '') {
                continue;
            }
            $tagToDocs[$tagId][] = $docId;
        }

        return $tagToDocs;
    }

    /**
     * Encode one tag as a single NDJSON line (without the trailing newline).
     *
     * "text" joins label, slug and, when non-blank, the trimmed description
     * with newlines; per the format above it is the embedding source for the
     * tag vector.
     *
     * @param list<string> $docIds document ids linked to the tag (may contain duplicates)
     *
     * @return string|null the JSON string, or null when json_encode() fails
     */
    private function encodeLine(Tag $tag, array $docIds): ?string
    {
        // de-dupe docIds for safety
        if ($docIds !== []) {
            $docIds = \array_values(\array_unique($docIds));
        }

        $textParts = [
            $tag->getLabel(),
            $tag->getSlug(),
        ];

        $desc = $tag->getDescription();
        if (\is_string($desc) && \trim($desc) !== '') {
            $textParts[] = \trim($desc);
        }

        $json = \json_encode(
            [
                'tag_id' => (string) $tag->getId(),
                'text' => \implode("\n", $textParts),
                'document_ids' => $docIds,
            ],
            JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES,
        );

        return \is_string($json) ? $json : null;
    }

    /**
     * Move the temp file over the final path.
     *
     * rename() is tried first; if it fails, a copy + unlink fallback is used
     * (best effort, not atomic). The final file is chmod'ed to 0664.
     *
     * @throws \RuntimeException when neither rename nor copy succeeds
     */
    private function atomicReplace(string $tmpPath, string $finalPath): void
    {
        // Ensure old file can be replaced on Windows-like FS too (best effort)
        if (\is_file($finalPath)) {
            @\chmod($finalPath, 0664);
        }

        if (!@\rename($tmpPath, $finalPath)) {
            // if rename fails, try copy+unlink fallback
            if (!@\copy($tmpPath, $finalPath)) {
                @\unlink($tmpPath);
                throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);
            }
            @\unlink($tmpPath);
        }

        @\chmod($finalPath, 0664);
    }
}
|
||||
98
src/Tag/TagRoutingService.php
Normal file
98
src/Tag/TagRoutingService.php
Normal file
@@ -0,0 +1,98 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Tag;
|
||||
|
||||
use Doctrine\DBAL\ArrayParameterType;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
final class TagRoutingService
{
    /** Number of tag hits requested from the vector search. */
    private const DEFAULT_TOPK = 8;

    /** Routing is skipped when the best tag hit scores below this value. */
    private const MIN_BEST_SCORE = 0.10;

    /** Upper bound on the number of distinct document ids returned. */
    private const MAX_CANDIDATE_DOCS = 200;

    public function __construct(
        private readonly TagVectorSearchClient $tagSearch,
        private readonly EntityManagerInterface $em,
    ) {}

    /**
     * Map a free-text query to candidate document ids via matching tags.
     *
     * The query is sent to the tag vector search; the returned tag ids are
     * looked up in the document_tag table and the linked document ids are
     * collected (de-duplicated, at most MAX_CANDIDATE_DOCS).
     *
     * @return string[]|null list of document UUID strings, or null when no
     *                       routing is possible (blank query, no or too weak
     *                       tag hits, no valid tag ids, no linked documents)
     */
    public function route(string $query): ?array
    {
        $query = trim($query);
        if ($query === '') {
            return null;
        }

        $hits = $this->tagSearch->search($query, self::DEFAULT_TOPK);
        if (!is_array($hits) || $hits === []) {
            return null;
        }

        // Use the highest score rather than trusting $hits[0] to be the best
        // hit: the ordering of search() results is not part of its contract.
        $bestScore = max(array_map(
            static fn (mixed $hit): float => is_array($hit) ? (float) ($hit['score'] ?? 0.0) : 0.0,
            $hits,
        ));
        if ($bestScore < self::MIN_BEST_SCORE) {
            return null;
        }

        // Convert tag UUID strings to binary(16)
        $tagBinaryIds = [];
        foreach ($hits as $hit) {
            $id = (string) ($hit['tag_id'] ?? '');
            if ($id === '') {
                continue;
            }

            try {
                $tagBinaryIds[] = Uuid::fromString($id)->toBinary();
            } catch (\Throwable) {
                // Not a valid UUID: ignore this hit.
                continue;
            }
        }

        if ($tagBinaryIds === []) {
            return null;
        }

        // Direct DBAL query (binary-safe)
        $result = $this->em->getConnection()->executeQuery(
            'SELECT document_id
             FROM document_tag
             WHERE tag_id IN (:tagIds)',
            ['tagIds' => $tagBinaryIds],
            ['tagIds' => ArrayParameterType::BINARY],
        );

        $docIds = [];

        try {
            // Iterate row by row so we can stop as soon as the cap is reached
            // instead of materialising every matching row first.
            foreach ($result->iterateAssociative() as $row) {
                if (!isset($row['document_id'])) {
                    continue;
                }

                try {
                    $uuid = Uuid::fromBinary($row['document_id']);
                    // Keyed by UUID string to de-duplicate documents that
                    // carry more than one of the matched tags.
                    $docIds[(string) $uuid] = true;
                } catch (\Throwable) {
                    // Value is not a decodable binary UUID: skip the row.
                    continue;
                }

                if (count($docIds) >= self::MAX_CANDIDATE_DOCS) {
                    break;
                }
            }
        } finally {
            // Release the statement; rows may be left unread after the early break.
            $result->free();
        }

        // An empty list is reported as "no routing", like every other
        // no-result case in this method.
        if ($docIds === []) {
            return null;
        }

        return array_keys($docIds);
    }
}
|
||||
107
src/Tag/TagVectorIndexBuilder.php
Normal file
107
src/Tag/TagVectorIndexBuilder.php
Normal file
@@ -0,0 +1,107 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Tag;
|
||||
|
||||
use Psr\Log\LoggerInterface;
|
||||
|
||||
final class TagVectorIndexBuilder
{
    public function __construct(
        private readonly string $pythonBin,
        private readonly string $scriptPath,
        private readonly string $tagsNdjsonPath,
        private readonly string $vectorTagsIndexPath,
        private readonly string $embeddingModel,
        private readonly int $timeoutSeconds,
        private readonly LoggerInterface $agentLogger,
    ) {}

    /**
     * Run the Python ingest script and switch the tag vector index into place.
     *
     * The script writes to "<index>.tmp" (and "<index>.tmp.meta.json"); only
     * when it exits with code 0 and both files exist are they moved over the
     * final index / meta paths. When the script succeeds without producing
     * both files, a warning is logged and the existing index is left as is.
     *
     * The run is limited to $timeoutSeconds (values <= 0 disable the limit);
     * on expiry the process is killed and the build fails.
     *
     * @throws \RuntimeException when an input file is missing, the output
     *                           directory cannot be created, the process
     *                           cannot be started, times out or exits with a
     *                           non-zero code, or the final replace fails
     */
    public function build(): void
    {
        if (!is_file($this->tagsNdjsonPath)) {
            throw new \RuntimeException('tags.ndjson missing: ' . $this->tagsNdjsonPath);
        }

        if (!is_file($this->scriptPath)) {
            throw new \RuntimeException('Tag ingest script missing: ' . $this->scriptPath);
        }

        $tmpIndex = $this->vectorTagsIndexPath . '.tmp';
        $tmpMeta = $tmpIndex . '.meta.json';

        $finalIndex = $this->vectorTagsIndexPath;
        $finalMeta = $finalIndex . '.meta.json';

        // Ensure output dir exists; the trailing is_dir() covers a concurrent creator.
        $dir = \dirname($finalIndex);
        if (!\is_dir($dir) && !@\mkdir($dir, 0775, true) && !\is_dir($dir)) {
            throw new \RuntimeException('Cannot create tag index directory: ' . $dir);
        }

        // Clean tmp leftovers
        @\unlink($tmpIndex);
        @\unlink($tmpMeta);

        // Positional args:
        // python vector_ingest_tags.py <tags.ndjson> <out.tmp> <model>
        $argv = [
            $this->pythonBin,
            $this->scriptPath,
            $this->tagsNdjsonPath,
            $tmpIndex,
            $this->embeddingModel,
        ];

        $this->agentLogger->info('[tags] build tag vector index', [
            // Shell-quoted rendering for the log only; the process itself is
            // started from the argument list without a shell.
            'cmd' => implode(' ', array_map(static fn (string $arg): string => escapeshellarg($arg), $argv)),
            'timeout' => $this->timeoutSeconds,
        ]);

        $run = $this->runProcess($argv);

        if ($run['timedOut']) {
            @\unlink($tmpIndex);
            @\unlink($tmpMeta);
            $this->agentLogger->error('[tags] tag vector ingest timed out', [
                'timeout' => $this->timeoutSeconds,
                'out' => $run['out'],
            ]);
            throw new \RuntimeException('Tag vector ingest timed out after ' . $this->timeoutSeconds . 's');
        }

        if ($run['exit'] !== 0) {
            // Do not leave partial outputs of a failed run behind.
            @\unlink($tmpIndex);
            @\unlink($tmpMeta);
            $this->agentLogger->error('[tags] tag vector ingest failed', [
                'exit' => $run['exit'],
                'out' => $run['out'],
            ]);
            throw new \RuntimeException('Tag vector ingest failed (exit=' . $run['exit'] . ')');
        }

        // If no tags -> python may remove outputs and exit 0
        if (!is_file($tmpIndex) || !is_file($tmpMeta)) {
            // treat as "no index" rather than hard error
            @\unlink($tmpIndex);
            @\unlink($tmpMeta);
            $this->agentLogger->warning('[tags] no tag index produced (maybe 0 tags).');
            return;
        }

        // Switch both files into place (index first, then meta).
        $this->atomicReplace($tmpIndex, $finalIndex);
        $this->atomicReplace($tmpMeta, $finalMeta);

        $this->agentLogger->info('[tags] tag vector index build completed', [
            'index' => $finalIndex,
            'meta' => $finalMeta,
        ]);
    }

    /**
     * Run a command and wait for it, enforcing $timeoutSeconds.
     *
     * stdout and stderr are both written to a temporary file rather than a
     * pipe, so the child can never block on a full pipe while this method is
     * polling its status.
     *
     * @param list<string> $argv program followed by its arguments
     *
     * @return array{exit:int, out:list<string>, timedOut:bool} "exit" is -1
     *         when the process was killed after the timeout
     *
     * @throws \RuntimeException when the log file or the process cannot be created
     */
    private function runProcess(array $argv): array
    {
        $logPath = \tempnam(\sys_get_temp_dir(), 'tagidx_');
        if ($logPath === false) {
            throw new \RuntimeException('Cannot create temporary log file for tag vector ingest');
        }

        $exit = -1;
        $timedOut = false;
        $out = [];

        try {
            $process = @\proc_open(
                $argv,
                [
                    1 => ['file', $logPath, 'w'],
                    2 => ['redirect', 1],
                ],
                $pipes,
            );
            if (!\is_resource($process)) {
                throw new \RuntimeException('Cannot start tag vector ingest: ' . $this->pythonBin);
            }

            $deadline = $this->timeoutSeconds > 0
                ? \microtime(true) + $this->timeoutSeconds
                : null;

            while (true) {
                $status = \proc_get_status($process);
                if (!$status['running']) {
                    // Read the exit code from the first status that reports
                    // the process as finished; later calls may return -1.
                    $exit = (int) $status['exitcode'];
                    break;
                }

                if ($deadline !== null && \microtime(true) >= $deadline) {
                    $timedOut = true;
                    // 9 = SIGKILL; the signal argument only matters on POSIX systems.
                    \proc_terminate($process, 9);
                    break;
                }

                \usleep(100_000);
            }

            // Waits for the (possibly just killed) process and releases it.
            \proc_close($process);

            $lines = @\file($logPath, \FILE_IGNORE_NEW_LINES);
            $out = \is_array($lines) ? $lines : [];
        } finally {
            @\unlink($logPath);
        }

        return ['exit' => $exit, 'out' => $out, 'timedOut' => $timedOut];
    }

    /**
     * Move $tmp over $final.
     *
     * rename() is tried first; if it fails, a copy + unlink fallback is used
     * (best effort, not atomic). The final file is chmod'ed to 0664.
     *
     * @throws \RuntimeException when neither rename nor copy succeeds
     */
    private function atomicReplace(string $tmp, string $final): void
    {
        if (!@rename($tmp, $final)) {
            if (!@copy($tmp, $final)) {
                @unlink($tmp);
                throw new \RuntimeException('Atomic replace failed for: ' . $final);
            }
            @unlink($tmp);
        }

        @chmod($final, 0664);
    }
}
|
||||
88
src/Tag/TagVectorSearchClient.php
Normal file
88
src/Tag/TagVectorSearchClient.php
Normal file
@@ -0,0 +1,88 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Tag;
|
||||
|
||||
use Psr\Log\LoggerInterface;
|
||||
|
||||
final readonly class TagVectorSearchClient
{
    /** Maximum number of stderr bytes attached to a log record. */
    private const STDERR_LOG_BYTES = 2000;

    public function __construct(
        private string $pythonBin,
        private string $scriptPath,
        private string $vectorTagsIndexPath,
        private string $vectorTagsMetaPath,
        private string $embeddingModel,
        private LoggerInterface $agentLogger,
    ) {}

    /**
     * Query the tag vector index through the Python search script.
     *
     * The script's standard output is parsed as a JSON array of rows; rows
     * without a non-empty "tag_id" or a numeric "score" are dropped. Standard
     * error is captured separately, so diagnostics printed by the script do
     * not invalidate the JSON payload.
     *
     * Failures never throw: a missing script or index, an unstartable or
     * failing process and unparsable output all yield an empty list (process
     * and parse failures are logged as warnings).
     *
     * @param string $query free-text query passed to the script as-is
     * @param int    $limit requested number of hits, clamped to 1..50
     *
     * @return array<int, array{tag_id:string, score:float}>
     */
    public function search(string $query, int $limit = 8): array
    {
        if (!is_file($this->scriptPath)) {
            $this->agentLogger->warning('Tag vector search script missing: ' . $this->scriptPath);
            return [];
        }

        if (!is_file($this->vectorTagsIndexPath) || !is_file($this->vectorTagsMetaPath)) {
            // no tag index available yet => no routing
            return [];
        }

        // A NUL byte cannot be passed as a process argument and would raise a
        // ValueError; treat such a query as "no hits".
        if (str_contains($query, "\0")) {
            return [];
        }

        $limit = max(1, min($limit, 50));

        // Positional args, aligned with existing VectorSearchClient approach:
        // python vector_search_tags.py <query> <limit> <index> <meta> <model>
        $run = $this->runScript([
            $this->pythonBin,
            $this->scriptPath,
            $query,
            (string) $limit,
            $this->vectorTagsIndexPath,
            $this->vectorTagsMetaPath,
            $this->embeddingModel,
        ]);

        if ($run === null) {
            $this->agentLogger->warning('Tag vector search could not be started: ' . $this->pythonBin);
            return [];
        }

        if ($run['exit'] !== 0 || trim($run['stdout']) === '') {
            $this->agentLogger->warning('Tag vector search failed', [
                'exit' => $run['exit'],
                'stderr' => substr($run['stderr'], 0, self::STDERR_LOG_BYTES),
            ]);
            return [];
        }

        try {
            $data = json_decode($run['stdout'], true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            $this->agentLogger->warning('Tag vector search returned invalid JSON', [
                'error' => $e->getMessage(),
                'stderr' => substr($run['stderr'], 0, self::STDERR_LOG_BYTES),
            ]);
            return [];
        }

        if (!is_array($data)) {
            return [];
        }

        $hits = [];
        foreach ($data as $row) {
            if (!is_array($row)) {
                continue;
            }

            $tagId = (string) ($row['tag_id'] ?? '');
            $score = $row['score'] ?? null;

            if ($tagId === '' || !is_numeric($score)) {
                continue;
            }

            $hits[] = [
                'tag_id' => $tagId,
                'score' => (float) $score,
            ];
        }

        return $hits;
    }

    /**
     * Run the search script and collect its exit code and output streams.
     *
     * The command is started from an argument list (no shell). stdout is read
     * through a pipe; stderr goes to a temporary file so that reading stdout
     * to the end cannot stall on an unread stderr pipe.
     *
     * @param list<string> $argv program followed by its arguments
     *
     * @return array{exit:int, stdout:string, stderr:string}|null null when the
     *         temporary file or the process could not be created
     */
    private function runScript(array $argv): ?array
    {
        $errPath = tempnam(sys_get_temp_dir(), 'tagsearch_');
        if ($errPath === false) {
            return null;
        }

        try {
            $process = @proc_open(
                $argv,
                [
                    1 => ['pipe', 'w'],
                    2 => ['file', $errPath, 'w'],
                ],
                $pipes,
            );
            if (!is_resource($process)) {
                return null;
            }

            $stdout = stream_get_contents($pipes[1]);
            fclose($pipes[1]);

            // proc_close() waits for the process and returns its exit code.
            $exit = proc_close($process);
            $stderr = @file_get_contents($errPath);

            return [
                'exit' => $exit,
                'stdout' => is_string($stdout) ? $stdout : '',
                'stderr' => is_string($stderr) ? $stderr : '',
            ];
        } finally {
            @unlink($errPath);
        }
    }
}
|
||||
Reference in New Issue
Block a user