add tagging

This commit is contained in:
team 1
2026-02-21 16:23:34 +01:00
parent 5a3852db12
commit cf5b473034
23 changed files with 1984 additions and 85 deletions

View File

@@ -6,14 +6,12 @@ parameters:
# ------------------------------------------------------------ # ------------------------------------------------------------
# Root # Root
# ------------------------------------------------------------ # ------------------------------------------------------------
mto.root: '%kernel.project_dir%' mto.root: '%kernel.project_dir%'
mto.kernel.dir: '%mto.root%' mto.kernel.dir: '%mto.root%'
# ------------------------------------------------------------ # ------------------------------------------------------------
# Knowledge Root (ZENTRAL) # Knowledge Root (ZENTRAL)
# ------------------------------------------------------------ # ------------------------------------------------------------
mto.knowledge.root: '%mto.root%/var/knowledge' mto.knowledge.root: '%mto.root%/var/knowledge'
mto.knowledge.ndjson: '%mto.knowledge.root%/index.ndjson' mto.knowledge.ndjson: '%mto.knowledge.root%/index.ndjson'
@@ -23,13 +21,28 @@ parameters:
mto.runtime.meta: '%mto.knowledge.root%/index_runtime.json' mto.runtime.meta: '%mto.knowledge.root%/index_runtime.json'
mto.knowledge.upload: '%mto.knowledge.root%/uploads' mto.knowledge.upload: '%mto.knowledge.root%/uploads'
# ------------------------------------------------------------
# Tags (Document Routing)
# ------------------------------------------------------------
mto.knowledge.tags_ndjson: '%mto.knowledge.root%/tags.ndjson'
# Tag vector index outputs
mto.knowledge.vector_tags_index: '%mto.knowledge.root%/vector_tags.index'
mto.knowledge.vector_tags_index_meta: '%mto.knowledge.root%/vector_tags.index.meta.json'
# Tag vector scripts (in src/Vector)
mto.vector.ingest_tags_script: '%mto.root%/src/Vector/vector_ingest_tags.py'
mto.vector.search_tags_script: '%mto.root%/src/Vector/vector_search_tags.py'
# Lock for tag rebuild jobs
mto.tags.rebuild_lock: '%mto.knowledge.root%/locks/tag_rebuild.lock'
# Backward compatibility alias # Backward compatibility alias
mto.vector.data.upload.path: '%mto.knowledge.upload%' mto.vector.data.upload.path: '%mto.knowledge.upload%'
# ------------------------------------------------------------ # ------------------------------------------------------------
# Index Configuration (Fallback Guardrails) # Index Configuration (Fallback Guardrails)
# ------------------------------------------------------------ # ------------------------------------------------------------
mto.index.chunk_size: 800 mto.index.chunk_size: 800
mto.index.chunk_overlap: 100 mto.index.chunk_overlap: 100
mto.index.embedding_model: 'all-MiniLM-L6-v2' mto.index.embedding_model: 'all-MiniLM-L6-v2'
@@ -39,7 +52,6 @@ parameters:
# ------------------------------------------------------------ # ------------------------------------------------------------
# Python / Vector Runtime # Python / Vector Runtime
# ------------------------------------------------------------ # ------------------------------------------------------------
mto.vector.python_bin: '/var/www/html/.venv/bin/python3' mto.vector.python_bin: '/var/www/html/.venv/bin/python3'
mto.vector.ingest_script: '%mto.root%/src/Vector/vector_ingest.py' mto.vector.ingest_script: '%mto.root%/src/Vector/vector_ingest.py'
mto.vector.search_script: '%mto.root%/src/Vector/vector_search.py' mto.vector.search_script: '%mto.root%/src/Vector/vector_search.py'
@@ -158,6 +170,52 @@ services:
$timeoutSeconds: '%mto.vector.timeout%' $timeoutSeconds: '%mto.vector.timeout%'
$configurationProvider: '@App\Index\IndexConfigurationProvider' $configurationProvider: '@App\Index\IndexConfigurationProvider'
# ------------------------------------------------------------
# Tags Export (Document Routing)
# ------------------------------------------------------------
App\Tag\TagNdjsonExporter:
arguments:
$tagsNdjsonPath: '%mto.knowledge.tags_ndjson%'
# ------------------------------------------------------------
# Tags Vector (Builder + Search) - the fix is wired here
# ------------------------------------------------------------
App\Tag\TagVectorIndexBuilder:
arguments:
$pythonBin: '%mto.vector.python_bin%'
$scriptPath: '%mto.vector.ingest_tags_script%'
$tagsNdjsonPath: '%mto.knowledge.tags_ndjson%'
$vectorTagsIndexPath: '%mto.knowledge.vector_tags_index%'
$embeddingModel: '%mto.index.embedding_model%'
$timeoutSeconds: '%mto.vector.timeout%'
$agentLogger: '@monolog.logger.agent'
App\Tag\TagVectorSearchClient:
arguments:
$pythonBin: '%mto.vector.python_bin%'
$scriptPath: '%mto.vector.search_tags_script%'
$vectorTagsIndexPath: '%mto.knowledge.vector_tags_index%'
$vectorTagsMetaPath: '%mto.knowledge.vector_tags_index_meta%'
$embeddingModel: '%mto.index.embedding_model%'
$agentLogger: '@monolog.logger.agent'
App\Tag\TagRoutingService: ~
# ------------------------------------------------------------
# Tag Rebuild Jobs (8A)
# ------------------------------------------------------------
App\Service\TagRebuildJobService:
arguments:
$projectDir: '%mto.root%'
$agentLogger: '@monolog.logger.agent'
App\Command\TagRebuildRunJobCommand:
arguments:
$lockFilePath: '%mto.tags.rebuild_lock%'
# ------------------------------------------------------------ # ------------------------------------------------------------
# Admin Utilities # Admin Utilities
# ------------------------------------------------------------ # ------------------------------------------------------------

View File

@@ -0,0 +1,81 @@
<?php
declare(strict_types=1);
namespace DoctrineMigrations;
use Doctrine\DBAL\Schema\Schema;
use Doctrine\Migrations\AbstractMigration;
/**
 * Doctrine migration that creates the `tag` table and the
 * `document_version_tag` join table that links tags to document versions.
 *
 * NOTE(review): the SQL uses UUID and TIMESTAMP(0) WITHOUT TIME ZONE column
 * types, so it presumably targets PostgreSQL - confirm against the
 * configured database platform.
 * NOTE(review): the entity mappings added alongside this migration use the
 * table names `knowledge_tag` and `document_tag`; verify that this migration
 * matches the schema those entities expect.
 */
final class Version20260221000100 extends AbstractMigration
{
/**
 * Returns the human-readable summary of this migration.
 */
public function getDescription(): string
{
return 'Adds tagging system for DocumentVersion (Tag + document_version_tag)';
}
/**
 * Creates the tagging tables together with their indexes and foreign keys.
 *
 * @param Schema $schema Not used here; every statement is queued as raw SQL through addSql().
 */
public function up(Schema $schema): void
{
// --------------------------------------------------
// TAG TABLE
// --------------------------------------------------
$this->addSql("
CREATE TABLE tag (
id UUID NOT NULL,
name VARCHAR(120) NOT NULL,
slug VARCHAR(120) NOT NULL,
created_at TIMESTAMP(0) WITHOUT TIME ZONE NOT NULL,
PRIMARY KEY(id)
)
");
// A slug may belong to at most one tag.
$this->addSql("
CREATE UNIQUE INDEX uniq_tag_slug ON tag (slug)
");
// --------------------------------------------------
// DOCUMENT_VERSION_TAG (ManyToMany)
// --------------------------------------------------
// The composite primary key allows each (version, tag) pair only once.
$this->addSql("
CREATE TABLE document_version_tag (
document_version_id UUID NOT NULL,
tag_id UUID NOT NULL,
PRIMARY KEY(document_version_id, tag_id)
)
");
// Separate single-column indexes for each side of the relation.
$this->addSql("
CREATE INDEX idx_dv_tag_version
ON document_version_tag (document_version_id)
");
$this->addSql("
CREATE INDEX idx_dv_tag_tag
ON document_version_tag (tag_id)
");
// Join rows are deleted automatically when the referenced document version is deleted.
$this->addSql("
ALTER TABLE document_version_tag
ADD CONSTRAINT fk_dv_tag_version
FOREIGN KEY (document_version_id)
REFERENCES document_version (id)
ON DELETE CASCADE
");
// Join rows are deleted automatically when the referenced tag is deleted.
$this->addSql("
ALTER TABLE document_version_tag
ADD CONSTRAINT fk_dv_tag_tag
FOREIGN KEY (tag_id)
REFERENCES tag (id)
ON DELETE CASCADE
");
}
/**
 * Reverts up() by dropping both tables.
 *
 * @param Schema $schema Not used here; every statement is queued as raw SQL through addSql().
 */
public function down(Schema $schema): void
{
// The join table goes first because it holds the foreign key to `tag`.
$this->addSql("DROP TABLE document_version_tag");
$this->addSql("DROP TABLE tag");
}
}

View File

@@ -0,0 +1,103 @@
<?php
declare(strict_types=1);
namespace App\Command;
use App\Entity\TagRebuildJob;
use App\Tag\TagNdjsonExporter;
use App\Tag\TagVectorIndexBuilder;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
#[AsCommand(
    name: 'mto:agent:tags:job:run',
    description: 'Run a single tag rebuild job (export tags.ndjson + build vector_tags.index) with lock'
)]
/**
 * Console command that executes one persisted TagRebuildJob.
 *
 * The run is guarded by a non-blocking exclusive file lock so that only one
 * rebuild executes at a time; a second invocation fails fast and records the
 * failure on its own job.
 */
final class TagRebuildRunJobCommand extends Command
{
    /**
     * @param EntityManagerInterface $em           Used to load the job and persist its status changes.
     * @param TagNdjsonExporter      $exporter     Writes the tags NDJSON export.
     * @param TagVectorIndexBuilder  $builder      Builds the tag vector index.
     * @param string                 $lockFilePath Path of the file used as the global rebuild lock.
     */
    public function __construct(
        private readonly EntityManagerInterface $em,
        private readonly TagNdjsonExporter $exporter,
        private readonly TagVectorIndexBuilder $builder,
        private readonly string $lockFilePath,
    ) {
        parent::__construct();
    }

    /**
     * Declares the single required argument: the id of the job to run.
     */
    protected function configure(): void
    {
        $this->addArgument('jobId', InputArgument::REQUIRED, 'TagRebuildJob UUID');
    }

    /**
     * Loads the job, takes the global lock, runs export + index build and
     * stores the outcome on the job.
     *
     * @return int Command::SUCCESS when the rebuild completed, Command::FAILURE otherwise.
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $jobId = (string) $input->getArgument('jobId');

        /** @var TagRebuildJob|null $job */
        $job = $this->em->getRepository(TagRebuildJob::class)->find($jobId);
        if (!$job instanceof TagRebuildJob) {
            $output->writeln('<error>Job not found.</error>');

            return Command::FAILURE;
        }

        // ---------------------------------------------------------
        // Global lock to avoid parallel rebuilds
        // ---------------------------------------------------------
        $lockDir = \dirname($this->lockFilePath);
        if (!\is_dir($lockDir)) {
            // Best effort: a failure here surfaces through the fopen() check below.
            @\mkdir($lockDir, 0775, true);
        }

        // The warning is silenced on purpose; the false return value is handled explicitly.
        $fh = @\fopen($this->lockFilePath, 'c+');
        if ($fh === false) {
            $job->markFailed('Cannot open lock file: ' . $this->lockFilePath);
            $this->em->flush();
            $output->writeln('<error>Cannot open lock file.</error>');

            return Command::FAILURE;
        }

        $locked = false;

        try {
            // If another rebuild runs, we fail fast (simple & safe).
            $locked = \flock($fh, LOCK_EX | LOCK_NB);
            if (!$locked) {
                $job->markFailed('Another tag rebuild is currently running (lock busy).');
                $this->em->flush();
                $output->writeln('<error>Lock busy. Another rebuild is running.</error>');

                return Command::FAILURE;
            }

            return $this->runLocked($job, $output);
        } finally {
            // Runs on every exit path, including an exception thrown by flush(),
            // so the lock and the file handle are always released.
            if ($locked) {
                \flock($fh, LOCK_UN);
            }
            \fclose($fh);
        }
    }

    /**
     * Performs the rebuild while the caller holds the lock and records the result on the job.
     */
    private function runLocked(TagRebuildJob $job, OutputInterface $output): int
    {
        $job->markRunning();
        $this->em->flush();

        try {
            $export = $this->exporter->export();
            $this->builder->build();
            $job->markCompleted();
            $this->em->flush();
            $output->writeln('<info>OK</info>');
            $output->writeln('tags.ndjson: ' . $export['path']);
        } catch (\Throwable $e) {
            // Any failure is stored on the job so the outcome can be read from it later.
            $job->markFailed($e->getMessage());
            $this->em->flush();
            $output->writeln('<error>FAILED: ' . $e->getMessage() . '</error>');

            return Command::FAILURE;
        }

        return Command::SUCCESS;
    }
}

View File

@@ -0,0 +1,42 @@
<?php
declare(strict_types=1);
namespace App\Command;
use App\Tag\TagNdjsonExporter;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
#[AsCommand(
    name: 'mto:agent:tags:export',
    description: 'Export tags to NDJSON for tag vector routing'
)]
/**
 * Console command that runs the tag NDJSON export and prints its summary.
 */
final class TagsExportCommand extends Command
{
    /**
     * @param TagNdjsonExporter $exporter Service that performs the export; read-only
     *                                    like the injected services of the sibling tag commands.
     */
    public function __construct(
        private readonly TagNdjsonExporter $exporter,
    ) {
        parent::__construct();
    }

    /**
     * Runs the export and prints path, tag count, line count and byte count.
     *
     * @return int Command::SUCCESS on success, Command::FAILURE when the exporter throws.
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        try {
            $result = $this->exporter->export();
        } catch (\Throwable $e) {
            // Report the failure on the console and signal it through the exit code.
            $output->writeln('<error>ERROR: ' . $e->getMessage() . '</error>');

            return Command::FAILURE;
        }

        $output->writeln('<info>Tags NDJSON exported</info>');
        $output->writeln('Path: ' . $result['path']);
        $output->writeln('Tags: ' . $result['tags']);
        $output->writeln('Lines: ' . $result['lines']);
        $output->writeln('Bytes: ' . $result['bytes']);

        return Command::SUCCESS;
    }
}

View File

@@ -0,0 +1,46 @@
<?php
declare(strict_types=1);
namespace App\Command;
use App\Tag\TagNdjsonExporter;
use App\Tag\TagVectorIndexBuilder;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
#[AsCommand(
    name: 'mto:agent:tags:rebuild',
    description: 'Export tags.ndjson and rebuild tag vector index (vector_tags.index)'
)]
/**
 * Console command that runs the tag export followed by the tag vector index build.
 */
final class TagsRebuildCommand extends Command
{
    public function __construct(
        private readonly TagNdjsonExporter $exporter,
        private readonly TagVectorIndexBuilder $builder,
    ) {
        parent::__construct();
    }

    /**
     * Step 1 exports the tags, step 2 builds the index; any exception from
     * either step is printed and turned into a failure exit code.
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        try {
            $this->printExportSummary($this->exporter->export(), $output);

            $this->builder->build();
            $output->writeln('<info>2/2 Built vector_tags.index</info>');

            return Command::SUCCESS;
        } catch (\Throwable $failure) {
            $output->writeln('<error>ERROR: ' . $failure->getMessage() . '</error>');

            return Command::FAILURE;
        }
    }

    /**
     * Prints the step-1 banner and the figures returned by the exporter.
     *
     * @param array $summary Export result; the keys path, tags, lines and bytes are read.
     */
    private function printExportSummary(array $summary, OutputInterface $output): void
    {
        $output->writeln('<info>1/2 Exported tags.ndjson</info>');
        $output->writeln('Path: ' . $summary['path']);
        $output->writeln('Tags: ' . $summary['tags']);
        $output->writeln('Lines: ' . $summary['lines']);
        $output->writeln('Bytes: ' . $summary['bytes']);
    }
}

View File

@@ -0,0 +1,93 @@
<?php
declare(strict_types=1);
namespace App\Controller\Admin;
use App\Entity\Document;
use App\Entity\Tag;
use App\Service\TagRebuildJobService;
use Doctrine\DBAL\Types\Types;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\HttpFoundation\RedirectResponse;
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\Routing\Attribute\Route;
use Symfony\Component\Uid\Uuid;
#[Route('/admin/documents')]
/**
 * Admin screens for viewing and changing the tags assigned to a document.
 */
final class DocumentTagController extends AbstractController
{
    /**
     * Renders the tag assignment form for one document.
     *
     * The template receives the document, all tags ordered by label, and a
     * lookup map (tag id string => true) of the tags currently assigned.
     */
    #[Route('/{id}/tags', name: 'admin_document_tags_edit', methods: ['GET'])]
    public function edit(string $id, EntityManagerInterface $em): Response
    {
        $document = $em->getRepository(Document::class)->find($id);
        if (!$document instanceof Document) {
            throw $this->createNotFoundException('Document not found');
        }

        $allTags = $em->createQueryBuilder()
            ->select('t')
            ->from(Tag::class, 't')
            ->orderBy('t.label', 'ASC')
            ->getQuery()
            ->getResult();

        $assigned = [];
        foreach ($document->getTags() as $tag) {
            $assigned[(string) $tag->getId()] = true;
        }

        return $this->render('admin/document_tags/edit.html.twig', [
            'document' => $document,
            'allTags' => $allTags,
            'assigned' => $assigned,
        ]);
    }

    /**
     * Replaces the document's tag set with the submitted `tag_ids` and queues
     * a tag index rebuild.
     *
     * Submitted values that are not parseable UUIDs are ignored.
     *
     * NOTE(review): unlike the create/delete actions of TagController, this
     * action performs no CSRF token check - confirm whether the form should
     * submit one.
     */
    #[Route('/{id}/tags/save', name: 'admin_document_tags_save', methods: ['POST'])]
    public function save(
        string $id,
        Request $request,
        EntityManagerInterface $em,
        TagRebuildJobService $jobs
    ): RedirectResponse {
        $document = $em->getRepository(Document::class)->find($id);
        if (!$document instanceof Document) {
            return $this->redirectToRoute('admin_documents');
        }

        // Canonical RFC 4122 string => Uuid. Keying by the canonical form
        // removes duplicates and gives an exact O(1) membership test, instead
        // of a loose object comparison that depends on the Uuid subclass.
        $selected = [];
        foreach ($request->request->all('tag_ids') as $value) {
            if (!\is_string($value)) {
                continue;
            }

            try {
                $uuid = Uuid::fromString($value);
            } catch (\InvalidArgumentException) {
                continue;
            }

            $selected[$uuid->toRfc4122()] = $uuid;
        }

        // Remove assignments that were not submitted.
        foreach ($document->getTags() as $tag) {
            if (!isset($selected[$tag->getId()->toRfc4122()])) {
                $document->removeTag($tag);
            }
        }

        // Add submitted tags that exist and are not assigned yet.
        foreach ($selected as $uuid) {
            $tag = $em->find(Tag::class, $uuid);
            if ($tag instanceof Tag && !$document->hasTag($tag)) {
                $document->addTag($tag);
            }
        }

        $em->flush();
        $jobs->enqueueAndStartAsync();

        return $this->redirectToRoute('admin_document_tags_edit', ['id' => $id]);
    }
}

View File

@@ -0,0 +1,101 @@
<?php
declare(strict_types=1);
namespace App\Controller\Admin;
use App\Entity\Tag;
use App\Service\TagRebuildJobService;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\HttpFoundation\RedirectResponse;
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\Routing\Attribute\Route;
#[Route('/admin/tags')]
/**
 * Admin screens for listing, creating and deleting tags.
 */
final class TagController extends AbstractController
{
    /**
     * Lists all tags ordered by label.
     */
    #[Route('', name: 'admin_tags_index', methods: ['GET'])]
    public function index(EntityManagerInterface $em): Response
    {
        $listing = $em->createQueryBuilder()
            ->select('t')
            ->from(Tag::class, 't')
            ->orderBy('t.label', 'ASC')
            ->getQuery();

        return $this->render('admin/tag/index.html.twig', [
            'tags' => $listing->getResult(),
        ]);
    }

    /**
     * Creates a tag from the posted label, slug and optional description,
     * then queues a tag index rebuild. Every outcome redirects to the index
     * with a flash message.
     */
    #[Route('/create', name: 'admin_tags_create', methods: ['POST'])]
    public function create(Request $request, EntityManagerInterface $em, TagRebuildJobService $jobs): RedirectResponse
    {
        if (!$this->csrfTokenMatches($request, 'admin_tag_create')) {
            return $this->flashAndReturnToIndex('danger', 'Ungültiges CSRF Token.');
        }

        $form = $request->request;
        $label = trim((string) $form->get('label', ''));
        $slug = trim((string) $form->get('slug', ''));
        $desc = trim((string) $form->get('description', ''));

        if ('' === $label || '' === $slug) {
            return $this->flashAndReturnToIndex('danger', 'Label und Slug sind Pflichtfelder.');
        }

        $sameSlugCount = (int) $em->createQueryBuilder()
            ->select('COUNT(t.id)')
            ->from(Tag::class, 't')
            ->where('t.slug = :slug')
            ->setParameter('slug', $slug)
            ->getQuery()
            ->getSingleScalarResult();

        if ($sameSlugCount > 0) {
            return $this->flashAndReturnToIndex('danger', 'Slug existiert bereits.');
        }

        // An empty description is stored as null.
        $em->persist(new Tag($slug, $label, '' === $desc ? null : $desc));
        $em->flush();

        // enqueue async rebuild
        $jobs->enqueueAndStartAsync();

        return $this->flashAndReturnToIndex('success', 'Tag wurde erstellt. Rebuild läuft im Hintergrund.');
    }

    /**
     * Deletes the tag with the given id and queues a tag index rebuild.
     * The CSRF token id is specific to the tag being deleted.
     */
    #[Route('/{id}/delete', name: 'admin_tags_delete', methods: ['POST'])]
    public function delete(string $id, Request $request, EntityManagerInterface $em, TagRebuildJobService $jobs): RedirectResponse
    {
        if (!$this->csrfTokenMatches($request, 'admin_tag_delete_' . $id)) {
            return $this->flashAndReturnToIndex('danger', 'Ungültiges CSRF Token.');
        }

        $tag = $em->getRepository(Tag::class)->find($id);
        if (!$tag instanceof Tag) {
            return $this->flashAndReturnToIndex('danger', 'Tag nicht gefunden.');
        }

        $em->remove($tag);
        $em->flush();

        // enqueue async rebuild
        $jobs->enqueueAndStartAsync();

        return $this->flashAndReturnToIndex('success', 'Tag wurde gelöscht. Rebuild läuft im Hintergrund.');
    }

    /**
     * Checks the posted `_token` field against the given CSRF token id.
     */
    private function csrfTokenMatches(Request $request, string $tokenId): bool
    {
        $submitted = (string) $request->request->get('_token', '');

        return $this->isCsrfTokenValid($tokenId, $submitted);
    }

    /**
     * Adds a flash message and redirects to the tag index.
     */
    private function flashAndReturnToIndex(string $type, string $message): RedirectResponse
    {
        $this->addFlash($type, $message);

        return $this->redirectToRoute('admin_tags_index');
    }
}

View File

@@ -2,6 +2,7 @@
namespace App\Entity; namespace App\Entity;
use DateTimeImmutable;
use Doctrine\ORM\Mapping as ORM; use Doctrine\ORM\Mapping as ORM;
use Symfony\Component\Uid\Uuid; use Symfony\Component\Uid\Uuid;
use Doctrine\Common\Collections\ArrayCollection; use Doctrine\Common\Collections\ArrayCollection;
@@ -14,7 +15,7 @@ class Document
public const STATUS_ARCHIVED = 'ARCHIVED'; public const STATUS_ARCHIVED = 'ARCHIVED';
#[ORM\Id] #[ORM\Id]
#[ORM\Column(type: 'uuid', unique: true)] #[ORM\Column(type: 'uuid')]
private Uuid $id; private Uuid $id;
#[ORM\Column(length: 255)] #[ORM\Column(length: 255)]
@@ -28,35 +29,49 @@ class Document
private User $createdBy; private User $createdBy;
#[ORM\Column] #[ORM\Column]
private \DateTimeImmutable $createdAt; private DateTimeImmutable $createdAt;
// 🔥 REMOVE ergänzt
#[ORM\OneToMany( #[ORM\OneToMany(
mappedBy: 'document',
targetEntity: DocumentVersion::class, targetEntity: DocumentVersion::class,
cascade: ['persist', 'remove'], mappedBy: 'document',
cascade: ['persist'],
orphanRemoval: true orphanRemoval: true
)] )]
private Collection $versions; private Collection $versions;
// 🔥 onDelete ergänzt
#[ORM\ManyToOne] #[ORM\ManyToOne]
#[ORM\JoinColumn(nullable: true, onDelete: 'SET NULL')]
private ?DocumentVersion $currentVersion = null; private ?DocumentVersion $currentVersion = null;
// ---------------------------------------------------------
// Tags via Join-Entity (DocumentTag)
// ---------------------------------------------------------
#[ORM\OneToMany(
targetEntity: DocumentTag::class,
mappedBy: 'document',
cascade: ['persist', 'remove'],
orphanRemoval: true
)]
private Collection $documentTags;
public function __construct() public function __construct()
{ {
$this->id = Uuid::v4(); $this->id = Uuid::v4();
$this->createdAt = new \DateTimeImmutable(); $this->createdAt = new DateTimeImmutable();
$this->versions = new ArrayCollection(); $this->versions = new ArrayCollection();
$this->documentTags = new ArrayCollection();
} }
// ---------------------------------------------------------
// Basic Getters
// ---------------------------------------------------------
public function getId(): Uuid public function getId(): Uuid
{ {
return $this->id; return $this->id;
} }
public function getCreatedAt(): \DateTimeImmutable public function getCreatedAt(): DateTimeImmutable
{ {
return $this->createdAt; return $this->createdAt;
} }
@@ -87,12 +102,14 @@ class Document
return $this->createdBy; return $this->createdBy;
} }
public function setCreatedBy(User $user): static public function setCreatedBy(User $createdBy): void
{ {
$this->createdBy = $user; $this->createdBy = $createdBy;
return $this;
} }
/**
* @return Collection<int, DocumentVersion>
*/
public function getVersions(): Collection public function getVersions(): Collection
{ {
return $this->versions; return $this->versions;
@@ -106,13 +123,67 @@ class Document
} }
} }
public function getCurrentVersion(): ?DocumentVersion
{
return $this->currentVersion;
}
public function setCurrentVersion(?DocumentVersion $version): void public function setCurrentVersion(?DocumentVersion $version): void
{ {
$this->currentVersion = $version; $this->currentVersion = $version;
} }
public function getCurrentVersion(): ?DocumentVersion // ---------------------------------------------------------
// Tag API (Join-Entity basiert)
// ---------------------------------------------------------
/**
* @return Collection<int, DocumentTag>
*/
public function getDocumentTags(): Collection
{ {
return $this->currentVersion; return $this->documentTags;
}
/**
 * Convenience accessor: unwraps the join entities and returns the Tag objects directly.
 *
 * @return Tag[]
 */
public function getTags(): array
{
    $links = $this->documentTags->toArray();

    return array_map(
        static function (DocumentTag $link) {
            return $link->getTag();
        },
        $links
    );
}
public function hasTag(Tag $tag): bool
{
    // Tags are matched by id, not by object identity.
    foreach ($this->documentTags as $link) {
        $sameTag = $link->getTag()->getId()->equals($tag->getId());
        if (!$sameTag) {
            continue;
        }

        return true;
    }

    return false;
}
public function addTag(Tag $tag): void
{
    // Adding a tag that is already assigned is a no-op.
    if (!$this->hasTag($tag)) {
        $link = new DocumentTag($this, $tag);
        $this->documentTags->add($link);
    }
}
public function removeTag(Tag $tag): void
{
foreach ($this->documentTags as $dt) {
if ($dt->getTag()->getId()->equals($tag->getId())) {
$this->documentTags->removeElement($dt);
return;
}
}
} }
} }

View File

@@ -0,0 +1,38 @@
<?php
declare(strict_types=1);
namespace App\Entity;
use Doctrine\ORM\Mapping as ORM;
/**
 * Join entity connecting one Document with one Tag.
 *
 * Both associations are part of the identifier, so the pair (document, tag)
 * forms the composite key of a row in `document_tag`.
 */
#[ORM\Entity]
#[ORM\Table(name: 'document_tag')]
class DocumentTag
{
    /** Owning document; the join column is declared with ON DELETE CASCADE. */
    #[ORM\Id]
    #[ORM\ManyToOne(targetEntity: Document::class, inversedBy: 'documentTags')]
    #[ORM\JoinColumn(name: 'document_id', referencedColumnName: 'id', nullable: false, onDelete: 'CASCADE')]
    private Document $document;

    /** Assigned tag; the join column is declared with ON DELETE CASCADE. */
    #[ORM\Id]
    #[ORM\ManyToOne(targetEntity: Tag::class)]
    #[ORM\JoinColumn(name: 'tag_id', referencedColumnName: 'id', nullable: false, onDelete: 'CASCADE')]
    private Tag $tag;

    public function __construct(Document $document, Tag $tag)
    {
        [$this->document, $this->tag] = [$document, $tag];
    }

    public function getTag(): Tag
    {
        return $this->tag;
    }

    public function getDocument(): Document
    {
        return $this->document;
    }
}

82
src/Entity/Tag.php Normal file
View File

@@ -0,0 +1,82 @@
<?php
namespace App\Entity;
use Doctrine\ORM\Mapping as ORM;
use Symfony\Component\Uid\Uuid;
/**
 * A tag stored in `knowledge_tag`, identified by a UUID and a unique slug.
 */
#[ORM\Entity]
#[ORM\Table(name: 'knowledge_tag')]
#[ORM\Index(name: 'idx_knowledge_tag_slug', columns: ['slug'])]
#[ORM\Index(name: 'idx_knowledge_tag_label', columns: ['label'])]
class Tag
{
    #[ORM\Id]
    #[ORM\Column(type: 'uuid', unique: true)]
    private Uuid $id;

    /** Unique identifier string of the tag, at most 120 characters. */
    #[ORM\Column(length: 120, unique: true)]
    private string $slug;

    /** Display name, at most 180 characters. */
    #[ORM\Column(length: 180)]
    private string $label;

    /** Optional free-text description. */
    #[ORM\Column(type: 'text', nullable: true)]
    private ?string $description = null;

    #[ORM\Column]
    private \DateTimeImmutable $createdAt;

    /**
     * Creates a tag with a fresh v4 UUID and the current time as creation timestamp.
     */
    public function __construct(string $slug, string $label, ?string $description = null)
    {
        $this->slug = $slug;
        $this->label = $label;
        $this->description = $description;

        $this->id = Uuid::v4();
        $this->createdAt = new \DateTimeImmutable();
    }

    // ----- read access ---------------------------------------

    public function getId(): Uuid
    {
        return $this->id;
    }

    public function getSlug(): string
    {
        return $this->slug;
    }

    public function getLabel(): string
    {
        return $this->label;
    }

    public function getDescription(): ?string
    {
        return $this->description;
    }

    public function getCreatedAt(): \DateTimeImmutable
    {
        return $this->createdAt;
    }

    // ----- fluent setters (each returns $this) ---------------

    public function setSlug(string $slug): static
    {
        $this->slug = $slug;

        return $this;
    }

    public function setLabel(string $label): static
    {
        $this->label = $label;

        return $this;
    }

    public function setDescription(?string $description): static
    {
        $this->description = $description;

        return $this;
    }
}

View File

@@ -0,0 +1,96 @@
<?php
declare(strict_types=1);
namespace App\Entity;
use Doctrine\ORM\Mapping as ORM;
use Symfony\Component\Uid\Uuid;
/**
 * Persisted record of one tag rebuild run and its lifecycle
 * (QUEUED, RUNNING, then COMPLETED or FAILED).
 */
#[ORM\Entity]
#[ORM\Table(name: 'tag_rebuild_job')]
#[ORM\Index(columns: ['status'], name: 'idx_tag_rebuild_job_status')]
#[ORM\Index(columns: ['created_at'], name: 'idx_tag_rebuild_job_created_at')]
class TagRebuildJob
{
    public const STATUS_QUEUED = 'QUEUED';
    public const STATUS_RUNNING = 'RUNNING';
    public const STATUS_COMPLETED = 'COMPLETED';
    public const STATUS_FAILED = 'FAILED';

    #[ORM\Id]
    #[ORM\Column(type: 'uuid', unique: true)]
    private Uuid $id;

    /** One of the STATUS_* constants; a new job starts as QUEUED. */
    #[ORM\Column(length: 16)]
    private string $status = self::STATUS_QUEUED;

    #[ORM\Column(type: 'datetime_immutable')]
    private \DateTimeImmutable $createdAt;

    /** Set by markRunning(). */
    #[ORM\Column(type: 'datetime_immutable', nullable: true)]
    private ?\DateTimeImmutable $startedAt = null;

    /** Set by markCompleted() and markFailed(). */
    #[ORM\Column(type: 'datetime_immutable', nullable: true)]
    private ?\DateTimeImmutable $finishedAt = null;

    /** Failure message recorded by markFailed(); cleared by markRunning(). */
    #[ORM\Column(type: 'text', nullable: true)]
    private ?string $errorMessage = null;

    /**
     * Creates a queued job with a fresh v4 UUID and the current time as creation timestamp.
     */
    public function __construct()
    {
        // The status needs no assignment here: the property default is already QUEUED.
        $this->id = Uuid::v4();
        $this->createdAt = new \DateTimeImmutable();
    }

    // ----- state transitions ---------------------------------

    /**
     * Switches to RUNNING, stamps the start time and clears any earlier error message.
     */
    public function markRunning(): void
    {
        $this->errorMessage = null;
        $this->startedAt = new \DateTimeImmutable();
        $this->status = self::STATUS_RUNNING;
    }

    /**
     * Switches to COMPLETED and stamps the finish time.
     */
    public function markCompleted(): void
    {
        $this->finish(self::STATUS_COMPLETED);
    }

    /**
     * Switches to FAILED, stamps the finish time and stores the message.
     */
    public function markFailed(string $message): void
    {
        $this->finish(self::STATUS_FAILED);
        $this->errorMessage = $message;
    }

    /**
     * Shared tail of the two terminal transitions.
     */
    private function finish(string $terminalStatus): void
    {
        $this->status = $terminalStatus;
        $this->finishedAt = new \DateTimeImmutable();
    }

    // ----- read access ---------------------------------------

    public function getId(): Uuid
    {
        return $this->id;
    }

    public function getStatus(): string
    {
        return $this->status;
    }

    public function getCreatedAt(): \DateTimeImmutable
    {
        return $this->createdAt;
    }

    public function getStartedAt(): ?\DateTimeImmutable
    {
        return $this->startedAt;
    }

    public function getFinishedAt(): ?\DateTimeImmutable
    {
        return $this->finishedAt;
    }

    public function getErrorMessage(): ?string
    {
        return $this->errorMessage;
    }
}

View File

@@ -4,33 +4,75 @@ declare(strict_types=1);
namespace App\Knowledge\Retrieval; namespace App\Knowledge\Retrieval;
use App\Knowledge\QueryCleaner; use App\Knowledge\ChunkManager;
use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient; use App\Vector\VectorSearchClient;
final class NdjsonHybridRetriever implements RetrieverInterface final class NdjsonHybridRetriever implements RetrieverInterface
{ {
private const VECTOR_SCORE_THRESHOLD = 0.25; private const VECTOR_SCORE_THRESHOLD = 0.65;
/**
* Wenn Tag-Routing aktiv ist, erhöhen wir TopK,
* weil wir danach per document_id filtern.
*/
private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10;
/**
* Keyword-Scan: Mindest-Trefferanzahl an Terms, damit ein Chunk als Kandidat gilt.
*/
private const KEYWORD_MIN_HITS = 1;
public function __construct( public function __construct(
private readonly ChunkManager $chunkManager,
private readonly NdjsonChunkLookup $lookup, private readonly NdjsonChunkLookup $lookup,
private readonly VectorSearchClient $vectorClient, private readonly VectorSearchClient $vectorClient,
private readonly QueryCleaner $queryCleaner, private readonly TagRoutingService $tagRouting,
private readonly int $maxChunks = 25, private readonly int $maxChunks = 3,
private readonly int $vectorTopK = 10, private readonly int $vectorTopK = 5,
) ) {}
{
}
public function retrieve(string $prompt, int $limit = null): array public function retrieve(string $prompt, int $limit = null): array
{ {
$limit = $this->maxChunks; $limit ??= $this->maxChunks;
$keywordChunks = [];
$query = $this->queryCleaner->clean($prompt);
// Vector / enrichment // ---------------------------------------------------------
$hits = $this->vectorClient->search($query, $this->vectorTopK); // 0) Tag-Routing FIRST (soft gate)
// ---------------------------------------------------------
$candidateDocIds = $this->tagRouting->route($prompt);
$candidateSet = null;
if (is_array($candidateDocIds) && $candidateDocIds !== []) {
$candidateSet = array_fill_keys($candidateDocIds, true);
}
// ---------------------------------------------------------
// 1) Keyword first (simple streaming scan)
// ---------------------------------------------------------
$terms = $this->extractTerms($prompt);
$keywordChunks = $this->keywordSearchStreaming($terms, $limit, $candidateSet);
if (\count($keywordChunks) >= $limit) {
return array_slice($keywordChunks, 0, $limit);
}
// ---------------------------------------------------------
// 2) Vector fallback / enrichment
// - If routed: increase TopK, then filter by document_id
// - Soft fallback: if filtering yields nothing -> global vector once
// ---------------------------------------------------------
$topK = $this->vectorTopK;
if ($candidateSet !== null) {
$topK = max($this->vectorTopK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $this->vectorTopK);
$topK = min($topK, 200); // guardrail
}
$hits = $this->vectorClient->search($prompt, $topK);
if ($hits === []) { if ($hits === []) {
return $this->diversifyByDevice($keywordChunks, $limit, 1); return $keywordChunks;
} }
$chunkIds = []; $chunkIds = [];
@@ -45,73 +87,78 @@ final class NdjsonHybridRetriever implements RetrieverInterface
} }
if ($chunkIds === []) { if ($chunkIds === []) {
return $this->diversifyByDevice($keywordChunks, $limit, 1); return $keywordChunks;
} }
$rows = $this->lookup->findByChunkIds($chunkIds); $rows = $this->lookup->findByChunkIds($chunkIds);
// routed filtering by document_id
$finalChunkIds = $chunkIds;
if ($candidateSet !== null) {
$filtered = [];
foreach ($chunkIds as $id) { foreach ($chunkIds as $id) {
$row = $rows[$id] ?? null;
if (!is_array($row)) {
continue;
}
$docId = $row['document_id'] ?? null;
if (!is_string($docId) || !isset($candidateSet[$docId])) {
continue;
}
$filtered[] = $id;
}
// Soft fallback: if routing filtered everything away, retry global vector once
if ($filtered === []) {
$hits2 = $this->vectorClient->search($prompt, $this->vectorTopK);
if ($hits2 === []) {
return $keywordChunks;
}
$chunkIds2 = [];
foreach ($hits2 as $hit) {
if (!isset($hit['chunk_id'], $hit['score'])) {
continue;
}
if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
continue;
}
$chunkIds2[] = (string)$hit['chunk_id'];
}
if ($chunkIds2 === []) {
return $keywordChunks;
}
$rows = $this->lookup->findByChunkIds($chunkIds2);
$finalChunkIds = $chunkIds2;
} else {
$finalChunkIds = $filtered;
}
}
foreach ($finalChunkIds as $id) {
if (!isset($rows[$id]['text']) || !is_string($rows[$id]['text'])) { if (!isset($rows[$id]['text']) || !is_string($rows[$id]['text'])) {
continue; continue;
} }
$keywordChunks[] = trim($rows[$id]['text']); $keywordChunks[] = trim($rows[$id]['text']);
} }
// dedupe // ---------------------------------------------------------
// 3) dedupe + limit
// ---------------------------------------------------------
$seen = []; $seen = [];
$deduped = []; $out = [];
foreach ($keywordChunks as $chunk) { foreach ($keywordChunks as $chunk) {
$key = mb_strtolower(preg_replace('/\s+/u', ' ', $chunk)); $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
if (isset($seen[$key])) { if (isset($seen[$key])) {
continue; continue;
} }
$seen[$key] = true; $seen[$key] = true;
$deduped[] = $chunk;
}
// diversify
return $this->diversifyByDevice($deduped, $limit, 1);
}
private function extractTerms(string $text): array
{
$text = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));
return array_values(array_filter(
explode(' ', $text),
static fn(string $w) => mb_strlen($w) > 2
));
}
private function extractDevice(string $chunk): string
{
$firstLine = explode("\n", $chunk, 2)[0] ?? '';
return trim($firstLine);
}
private function diversifyByDevice(array $chunks, int $limit, int $maxPerDevice = 1): array
{
$seenDevices = [];
$out = [];
foreach ($chunks as $chunk) {
$device = $this->extractDevice($chunk);
if ($device === '') {
continue;
}
if (!isset($seenDevices[$device])) {
$seenDevices[$device] = 0;
}
if ($seenDevices[$device] >= $maxPerDevice) {
continue;
}
$out[] = $chunk; $out[] = $chunk;
$seenDevices[$device]++;
if (\count($out) >= $limit) { if (\count($out) >= $limit) {
break; break;
@@ -120,4 +167,116 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return $out; return $out;
} }
/**
 * Streaming keyword search over the rows yielded by the chunk manager.
 *
 * Deliberately minimal:
 * - a chunk's score is the number of search terms found in its text
 *   (case-insensitive substring match),
 * - when $candidateSet is given (tag routing), only rows whose
 *   `document_id` is a key of that set are scored,
 * - rows scoring below self::KEYWORD_MIN_HITS are ignored,
 * - equal scores are ordered shorter chunk first.
 *
 * @param string[]                $terms        search terms
 * @param int                     $limit        maximum number of chunk texts to return
 * @param array<string,true>|null $candidateSet allowed document ids as keys, or null for no restriction
 * @return string[] trimmed chunk texts, best match first
 */
private function keywordSearchStreaming(array $terms, int $limit, ?array $candidateSet): array
{
    // Nothing to search for, or nothing may be returned: skip the scan.
    // (A negative limit would otherwise reach array_slice() as a negative length.)
    if ($terms === [] || $limit <= 0) {
        return [];
    }

    $maxScore = \count($terms);

    // Running top list; each item = ['score' => int, 'text' => string].
    $top = [];

    foreach ($this->chunkManager->streamAll() as $row) {
        $text = $row['text'] ?? null;
        if (!is_string($text) || $text === '') {
            continue;
        }

        if ($candidateSet !== null) {
            $docId = $row['document_id'] ?? null;
            if (!is_string($docId) || !isset($candidateSet[$docId])) {
                continue;
            }
        }

        $haystack = mb_strtolower($text);
        $score = 0;
        foreach ($terms as $t) {
            if ($t === '') {
                continue;
            }
            if (mb_stripos($haystack, $t) !== false) {
                $score++;
            }
        }

        if ($score < self::KEYWORD_MIN_HITS) {
            continue;
        }

        $top[] = [
            'score' => $score,
            'text' => trim($text),
        ];

        // Keep only the best $limit items. The list never holds more than
        // $limit + 1 entries, so re-sorting on every hit stays cheap.
        usort($top, static function (array $a, array $b): int {
            // higher score first
            $cmp = ($b['score'] <=> $a['score']);
            if ($cmp !== 0) {
                return $cmp;
            }

            // shorter chunk first (often more precise)
            return (mb_strlen($a['text']) <=> mb_strlen($b['text']));
        });

        if (\count($top) > $limit) {
            $top = array_slice($top, 0, $limit);
        }

        // Early exit only once EVERY retained item is a perfect match. The
        // list is sorted best-first, so it is enough to look at the last
        // (worst) entry. Checking only the best entry would stop the scan
        // while weaker entries could still be displaced by later rows.
        if (\count($top) === $limit && ($top[$limit - 1]['score'] ?? 0) >= $maxScore) {
            break;
        }
    }

    $out = [];
    foreach ($top as $item) {
        $out[] = (string)$item['text'];
    }

    return $out;
}
/**
 * Minimal term extraction (stable behaviour, little magic).
 *
 * Removes every character that is not a Unicode letter, digit or whitespace,
 * lowercases the rest, splits it on runs of whitespace, drops terms of two
 * characters or fewer and removes duplicates while keeping first-seen order.
 *
 * @return string[]
 */
private function extractTerms(string $text): array
{
    $normalized = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));

    // The cleanup above keeps newlines and tabs, so splitting on a single
    // space would glue "foo\nbar" into one term. Split on any whitespace run.
    $words = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        return [];
    }

    // unique, order preserved; $out is filled separately because numeric
    // terms used as array keys would come back as integers.
    $seen = [];
    $out = [];
    foreach ($words as $w) {
        if (mb_strlen($w) <= 2 || isset($seen[$w])) {
            continue;
        }
        $seen[$w] = true;
        $out[] = $w;
    }

    return $out;
}
} }

View File

@@ -0,0 +1,51 @@
<?php
declare(strict_types=1);
namespace App\Service;
use App\Entity\TagRebuildJob;
use Doctrine\ORM\EntityManagerInterface;
use Psr\Log\LoggerInterface;
/**
 * Creates TagRebuildJob records and starts their processing in a detached
 * console process.
 */
final class TagRebuildJobService
{
    public function __construct(
        private readonly EntityManagerInterface $em,
        private readonly LoggerInterface $agentLogger,
        private readonly string $projectDir,
    ) {}

    /**
     * Persist and flush a new job, then launch the console command for it in
     * the background.
     *
     * @return TagRebuildJob the persisted job
     */
    public function enqueueAndStartAsync(): TagRebuildJob
    {
        $job = new TagRebuildJob();
        $this->em->persist($job);
        $this->em->flush();

        $this->startAsync($job);

        return $job;
    }

    /**
     * Run `bin/console mto:agent:tags:job:run <job id>` detached from this
     * process: output goes to /dev/null and the trailing `&` backgrounds it.
     *
     * Starting the process stays best effort (no exception is thrown), but a
     * failed launch is written to the log instead of being dropped.
     */
    private function startAsync(TagRebuildJob $job): void
    {
        $jobId = (string)$job->getId();
        $php = PHP_BINARY; // safest in runtime
        $console = rtrim($this->projectDir, '/') . '/bin/console';

        $cmd = sprintf(
            '%s %s %s %s > /dev/null 2>&1 &',
            escapeshellarg($php),
            escapeshellarg($console),
            'mto:agent:tags:job:run',
            escapeshellarg($jobId)
        );

        $this->agentLogger->info('[tags] enqueue job async', [
            'job' => $jobId,
            'cmd' => $cmd,
        ]);

        // exec() returns false when the command could not be run at all.
        if (@exec($cmd) === false) {
            $this->agentLogger->error('[tags] failed to start job process', [
                'job' => $jobId,
                'cmd' => $cmd,
            ]);
        }
    }
}

View File

@@ -0,0 +1,159 @@
<?php
declare(strict_types=1);
namespace App\Tag;
use App\Entity\DocumentTag;
use App\Entity\Tag;
use Doctrine\ORM\EntityManagerInterface;
/**
 * Exports all Tag entities, together with the ids of the documents linked to
 * them through DocumentTag, into an NDJSON file.
 *
 * The data is written to "<path>.tmp" first and moved over the final path
 * only after it has been written completely.
 */
final class TagNdjsonExporter
{
    public function __construct(
        private EntityManagerInterface $em,
        private string $tagsNdjsonPath,
    ) {}

    /**
     * Export all tags into NDJSON with a switch via .tmp + rename().
     *
     * Line format:
     * {"tag_id":"...","text":"label\nslug\noptional description","document_ids":["...","..."]}
     *
     * With no tags in the database an empty file is published.
     *
     * @return array{tags:int, lines:int, bytes:int, path:string}
     *
     * @throws \RuntimeException when the temp file cannot be opened or written,
     *                           or the final file cannot be replaced
     */
    public function export(): array
    {
        $dir = \dirname($this->tagsNdjsonPath);
        if (!\is_dir($dir)) {
            // Best effort: a failure here surfaces as the fopen() error below.
            @\mkdir($dir, 0775, true);
        }

        $tmpPath = $this->tagsNdjsonPath . '.tmp';
        $fh = @\fopen($tmpPath, 'wb');
        if (!$fh) {
            throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
        }

        $tagCount = 0;
        $lines = 0;

        try {
            // 1) Load all tags (id, slug, label, description)
            $tags = $this->em->createQueryBuilder()
                ->select('t')
                ->from(Tag::class, 't')
                ->orderBy('t.label', 'ASC')
                ->getQuery()
                ->getResult();

            if (\is_array($tags) && $tags !== []) {
                $tagCount = \count($tags);

                // 2) Build tagId => docIds map from document_tag
                $tagToDocs = $this->loadTagToDocumentIds();

                // 3) Write one NDJSON line per tag
                foreach ($tags as $tag) {
                    if (!$tag instanceof Tag) {
                        continue;
                    }

                    $json = $this->encodeLine($tag, $tagToDocs[(string) $tag->getId()] ?? []);
                    if ($json === null) {
                        // skip invalid line but keep export running
                        continue;
                    }

                    // A failed write must abort the export; otherwise a
                    // truncated file would replace the previous good one.
                    if (\fwrite($fh, $json . "\n") === false) {
                        throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
                    }
                    $lines++;
                }
            }
        } catch (\Throwable $e) {
            // Do not leak the handle or leave a half-written temp file behind.
            @\fclose($fh);
            @\unlink($tmpPath);

            throw $e;
        }

        \fclose($fh);
        $this->atomicReplace($tmpPath, $this->tagsNdjsonPath);

        // Make sure the reported size is not a cached stat of the old file.
        \clearstatcache(true, $this->tagsNdjsonPath);

        return [
            'tags' => $tagCount,
            'lines' => $lines,
            'bytes' => (int) @\filesize($this->tagsNdjsonPath),
            'path' => $this->tagsNdjsonPath,
        ];
    }

    /**
     * Query all (tag id, document id) pairs from DocumentTag in one go and
     * group the document ids by tag id.
     *
     * @return array<array-key, list<string>>
     */
    private function loadTagToDocumentIds(): array
    {
        $rows = $this->em->createQueryBuilder()
            ->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
            ->from(DocumentTag::class, 'dt')
            ->getQuery()
            ->getArrayResult();

        $tagToDocs = [];
        foreach ($rows as $r) {
            $tagId = (string) ($r['tagId'] ?? '');
            $docId = (string) ($r['docId'] ?? '');
            if ($tagId === '' || $docId === '') {
                continue;
            }
            $tagToDocs[$tagId][] = $docId;
        }

        return $tagToDocs;
    }

    /**
     * Build the JSON for one tag.
     *
     * "text" is the embedding source for tag vectors later: label, slug and,
     * when non-blank, the trimmed description, joined by newlines.
     *
     * @param string[] $docIds document ids linked to the tag (may contain duplicates)
     * @return string|null the encoded line, or null when json_encode() fails
     */
    private function encodeLine(Tag $tag, array $docIds): ?string
    {
        // de-dupe docIds for safety
        if ($docIds !== []) {
            $docIds = \array_values(\array_unique($docIds));
        }

        $textParts = [
            $tag->getLabel(),
            $tag->getSlug(),
        ];

        $desc = $tag->getDescription();
        if (\is_string($desc) && \trim($desc) !== '') {
            $textParts[] = \trim($desc);
        }

        $json = \json_encode([
            'tag_id' => (string) $tag->getId(),
            'text' => \implode("\n", $textParts),
            'document_ids' => $docIds,
        ], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);

        return \is_string($json) ? $json : null;
    }

    /**
     * Move $tmpPath over $finalPath.
     *
     * rename() is tried first; if it fails the file is copied instead and the
     * temp file removed (that fallback is not atomic). Permission changes are
     * best effort.
     *
     * @throws \RuntimeException when neither rename nor copy succeeds
     */
    private function atomicReplace(string $tmpPath, string $finalPath): void
    {
        // Ensure old file can be replaced on Windows-like FS too (best effort)
        if (\is_file($finalPath)) {
            @\chmod($finalPath, 0664);
        }

        if (!@\rename($tmpPath, $finalPath)) {
            // if rename fails, try copy+unlink fallback
            if (!@\copy($tmpPath, $finalPath)) {
                @\unlink($tmpPath);
                throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);
            }
            @\unlink($tmpPath);
        }

        @\chmod($finalPath, 0664);
    }
}

View File

@@ -0,0 +1,98 @@
<?php
declare(strict_types=1);
namespace App\Tag;
use Doctrine\DBAL\ArrayParameterType;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Component\Uid\Uuid;
/**
 * Maps a free-text query to candidate document ids via tag vector search.
 *
 * The query is matched against the tag index; the documents attached to the
 * matching tags (table `document_tag`) become the candidate set.
 */
final class TagRoutingService
{
    /** Number of tag hits requested from the tag vector search. */
    private const DEFAULT_TOPK = 8;

    /** Routing is skipped when the first hit scores below this value. */
    private const MIN_BEST_SCORE = 0.10;

    /** Upper bound on the number of document ids returned. */
    private const MAX_CANDIDATE_DOCS = 200;

    public function __construct(
        private readonly TagVectorSearchClient $tagSearch,
        private readonly EntityManagerInterface $em,
    ) {}

    /**
     * Resolve candidate document ids for a query.
     *
     * @return string[]|null distinct document UUID strings (at most
     *                       MAX_CANDIDATE_DOCS), or null when no routing
     *                       decision can be made: blank query, no or too weak
     *                       tag hits, no usable tag ids, or no usable
     *                       document ids
     */
    public function route(string $query): ?array
    {
        $query = trim($query);
        if ($query === '') {
            return null;
        }

        $hits = $this->tagSearch->search($query, self::DEFAULT_TOPK);
        if (!is_array($hits) || $hits === []) {
            return null;
        }

        // The first hit is taken as the best one.
        $bestScore = (float)($hits[0]['score'] ?? 0.0);
        if ($bestScore < self::MIN_BEST_SCORE) {
            return null;
        }

        // Convert tag UUID strings to binary(16); unparsable ids are skipped.
        $tagBinaryIds = [];
        foreach ($hits as $hit) {
            $id = (string)($hit['tag_id'] ?? '');
            if ($id === '') {
                continue;
            }

            try {
                $tagBinaryIds[] = Uuid::fromString($id)->toBinary();
            } catch (\Throwable) {
                continue;
            }
        }

        if ($tagBinaryIds === []) {
            return null;
        }

        // Direct DBAL query (binary-safe)
        $conn = $this->em->getConnection();
        $rows = $conn->executeQuery(
            'SELECT document_id
             FROM document_tag
             WHERE tag_id IN (:tagIds)',
            ['tagIds' => $tagBinaryIds],
            ['tagIds' => ArrayParameterType::BINARY]
        )->fetchAllAssociative();

        if ($rows === []) {
            return null;
        }

        // Keyed by UUID string so each document appears once.
        $docIds = [];
        foreach ($rows as $row) {
            if (!isset($row['document_id'])) {
                continue;
            }

            try {
                $uuid = Uuid::fromBinary($row['document_id']);
                $docIds[(string)$uuid] = true;
            } catch (\Throwable) {
                continue;
            }

            if (count($docIds) >= self::MAX_CANDIDATE_DOCS) {
                break;
            }
        }

        // Rows existed but none produced a usable id: report "no routing"
        // like the other dead ends above, rather than an empty candidate
        // list that would restrict a search to nothing.
        if ($docIds === []) {
            return null;
        }

        return array_keys($docIds);
    }
}

View File

@@ -0,0 +1,107 @@
<?php
declare(strict_types=1);
namespace App\Tag;
use Psr\Log\LoggerInterface;
/**
 * Rebuilds the tag vector index by running the external ingest script and
 * then moving its output files over the live index files.
 */
final class TagVectorIndexBuilder
{
    public function __construct(
        private readonly string $pythonBin,
        private readonly string $scriptPath,
        private readonly string $tagsNdjsonPath,
        private readonly string $vectorTagsIndexPath,
        private readonly string $embeddingModel,
        private readonly int $timeoutSeconds,
        private readonly LoggerInterface $agentLogger,
    ) {}

    /**
     * Run the ingest script against tags.ndjson and publish its output.
     *
     * The script is told to write "<index>.tmp"; that file and
     * "<index>.tmp.meta.json" are moved to their final names only after the
     * script exited with code 0 and both files exist. If either is absent
     * after a successful run, a warning is logged and nothing is published.
     *
     * Note: $timeoutSeconds is only written to the log here; the exec() call
     * itself is not time-limited.
     *
     * @throws \RuntimeException when an input file is missing, the script
     *                           exits non-zero, or a file cannot be moved
     */
    public function build(): void
    {
        $requiredFiles = [
            'tags.ndjson missing: ' => $this->tagsNdjsonPath,
            'Tag ingest script missing: ' => $this->scriptPath,
        ];
        foreach ($requiredFiles as $message => $path) {
            if (!is_file($path)) {
                throw new \RuntimeException($message . $path);
            }
        }

        $liveIndex = $this->vectorTagsIndexPath;
        $liveMeta = $liveIndex . '.meta.json';
        $stagedIndex = $liveIndex . '.tmp';
        $stagedMeta = $stagedIndex . '.meta.json';

        // Make sure the directory the index lives in exists.
        $outputDir = \dirname($liveIndex);
        if (!\is_dir($outputDir)) {
            @\mkdir($outputDir, 0775, true);
        }

        // Remove staged files left over from an earlier run.
        @\unlink($stagedIndex);
        @\unlink($stagedMeta);

        // Positional args:
        // python vector_ingest_tags.py <tags.ndjson> <out.tmp> <model>
        $shellArgs = array_map(escapeshellarg(...), [
            $this->pythonBin,
            $this->scriptPath,
            $this->tagsNdjsonPath,
            $stagedIndex,
            $this->embeddingModel,
        ]);
        $cmd = implode(' ', $shellArgs) . ' 2>&1';

        $this->agentLogger->info('[tags] build tag vector index', [
            'cmd' => $cmd,
            'timeout' => $this->timeoutSeconds,
        ]);

        $outputLines = [];
        $exitCode = 0;
        exec($cmd, $outputLines, $exitCode);

        if ($exitCode !== 0) {
            $this->agentLogger->error('[tags] tag vector ingest failed', [
                'exit' => $exitCode,
                'out' => $outputLines,
            ]);

            throw new \RuntimeException('Tag vector ingest failed (exit=' . $exitCode . ')');
        }

        // With no tags the script may remove its outputs and still exit 0;
        // that is treated as "no index" rather than as a hard error.
        $bothStaged = is_file($stagedIndex) && is_file($stagedMeta);
        if (!$bothStaged) {
            @\unlink($stagedIndex);
            @\unlink($stagedMeta);
            $this->agentLogger->warning('[tags] no tag index produced (maybe 0 tags).');

            return;
        }

        // Publish: index first, then its meta file.
        $this->atomicReplace($stagedIndex, $liveIndex);
        $this->atomicReplace($stagedMeta, $liveMeta);

        $this->agentLogger->info('[tags] tag vector index build completed', [
            'index' => $liveIndex,
            'meta' => $liveMeta,
        ]);
    }

    /**
     * Move $tmp over $final: rename() first, copy + unlink as fallback (the
     * fallback is not atomic). The final chmod is best effort.
     *
     * @throws \RuntimeException when neither rename nor copy succeeds
     */
    private function atomicReplace(string $tmp, string $final): void
    {
        $renamed = @rename($tmp, $final);

        if (!$renamed) {
            $copied = @copy($tmp, $final);
            // The temp file is removed whether or not the copy worked.
            @unlink($tmp);

            if (!$copied) {
                throw new \RuntimeException('Atomic replace failed for: ' . $final);
            }
        }

        @chmod($final, 0664);
    }
}

View File

@@ -0,0 +1,88 @@
<?php
declare(strict_types=1);
namespace App\Tag;
use Psr\Log\LoggerInterface;
/**
 * Runs the Python tag vector search script and converts its JSON output into
 * a list of tag hits.
 */
final readonly class TagVectorSearchClient
{
    public function __construct(
        private string $pythonBin,
        private string $scriptPath,
        private string $vectorTagsIndexPath,
        private string $vectorTagsMetaPath,
        private string $embeddingModel,
        private LoggerInterface $agentLogger,
    ) {}

    /**
     * Search the tag index for tags matching $query.
     *
     * Returns an empty list when the script, index or meta file is missing,
     * when the script fails, or when its output cannot be decoded.
     *
     * @param string $query free-text query passed to the script
     * @param int    $limit requested number of hits, clamped to 1..50
     * @return array<int, array{tag_id:string, score:float}>
     */
    public function search(string $query, int $limit = 8): array
    {
        if (!is_file($this->scriptPath)) {
            $this->agentLogger->warning('Tag vector search script missing: ' . $this->scriptPath);
            return [];
        }

        if (!is_file($this->vectorTagsIndexPath) || !is_file($this->vectorTagsMetaPath)) {
            // no tag index available yet => no routing
            return [];
        }

        $limit = max(1, min($limit, 50));

        // Positional args, aligned with existing VectorSearchClient approach:
        // python vector_search_tags.py <query> <limit> <index> <meta> <model>
        $cmd = sprintf(
            '%s %s %s %d %s %s %s 2>&1',
            escapeshellarg($this->pythonBin),
            escapeshellarg($this->scriptPath),
            escapeshellarg($query),
            $limit,
            escapeshellarg($this->vectorTagsIndexPath),
            escapeshellarg($this->vectorTagsMetaPath),
            escapeshellarg($this->embeddingModel),
        );

        // exec() appends to an existing array, so start from a clean one.
        $out = [];
        $exitCode = 0;
        exec($cmd, $out, $exitCode);

        if ($exitCode !== 0) {
            $this->agentLogger->warning('Tag vector search failed', [
                'exit' => $exitCode,
                'out' => $out,
            ]);
            return [];
        }

        if ($out === []) {
            return [];
        }

        $data = $this->decodeOutput($out);
        if ($data === null) {
            $this->agentLogger->warning('Tag vector search returned undecodable output', [
                'out' => $out,
            ]);
            return [];
        }

        $hits = [];
        foreach ($data as $row) {
            if (!is_array($row)) {
                continue;
            }

            $tagId = (string)($row['tag_id'] ?? '');
            $score = $row['score'] ?? null;

            if ($tagId === '' || !is_numeric($score)) {
                continue;
            }

            $hits[] = [
                'tag_id' => $tagId,
                'score' => (float)$score,
            ];
        }

        return $hits;
    }

    /**
     * Decode the script output into an array.
     *
     * The command merges stderr into stdout, so warning lines may surround
     * the JSON. The whole output is tried first; if that is not a JSON
     * array, the lines are tried one by one from the end, because the script
     * prints its JSON result as a single line.
     *
     * @param string[] $lines output lines as collected by exec()
     * @return array<mixed>|null decoded array, or null when nothing decodes
     */
    private function decodeOutput(array $lines): ?array
    {
        $candidates = [implode("\n", $lines)];
        for ($i = \count($lines) - 1; $i >= 0; $i--) {
            $line = trim((string)$lines[$i]);
            if ($line !== '') {
                $candidates[] = $line;
            }
        }

        foreach ($candidates as $candidate) {
            try {
                $decoded = json_decode($candidate, true, 512, JSON_THROW_ON_ERROR);
            } catch (\JsonException) {
                continue;
            }

            if (is_array($decoded)) {
                return $decoded;
            }
        }

        return null;
    }
}

View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python3
# Build a FAISS inner-product index over the tag texts of an NDJSON file.
#
# Outputs:
#   <out_index_path>            the FAISS index
#   <out_index_path>.meta.json  JSON list of tag ids, in index row order
#
# Exit codes: 0 success (also when there are no tags), 2 bad usage,
# 10 faiss missing, 11 sentence-transformers missing, 20 input file missing.

import sys
import json
from pathlib import Path

# ---------------------------------------------------------
# Positional args (aligned with PHP builder exec call)
# ---------------------------------------------------------
# 1 tags.ndjson
# 2 out_index_path (can be .tmp)
# 3 model
# Example:
# python vector_ingest_tags.py /var/knowledge/tags.ndjson /var/knowledge/vector_tags.index.tmp all-MiniLM-L6-v2
# ---------------------------------------------------------
if len(sys.argv) < 4:
    print("ERROR: usage: vector_ingest_tags.py <tags.ndjson> <out.index> <model>", file=sys.stderr)
    sys.exit(2)

tags_path = Path(sys.argv[1]).resolve()
out_path = Path(sys.argv[2]).resolve()
model_name = sys.argv[3]
meta_path = Path(str(out_path) + ".meta.json")  # vector_tags.index(.tmp).meta.json

# ---------------------------------------------------------
# Dependency checks
# ---------------------------------------------------------
try:
    import faiss
except Exception:
    print("ERROR: Python module 'faiss' not found.", file=sys.stderr)
    sys.exit(10)

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    print("ERROR: Python module 'sentence-transformers' not found.", file=sys.stderr)
    sys.exit(11)

import numpy as np

# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
if not tags_path.is_file():
    print(f"ERROR: tags.ndjson not found at {tags_path}", file=sys.stderr)
    sys.exit(20)

# Ensure output directory exists
out_path.parent.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------
# Streaming read NDJSON
# ---------------------------------------------------------
texts = []
ids = []
with open(tags_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            entry = json.loads(line)
        except Exception:
            continue
        # A line can be valid JSON without being an object (e.g. a list or a
        # number); such lines have no .get() and are skipped like broken ones.
        if not isinstance(entry, dict):
            continue
        text = entry.get("text")
        tag_id = entry.get("tag_id")
        if not text or not tag_id:
            continue
        text = str(text)
        if len(text) > 4000:
            text = text[:4000]
        texts.append(text)
        ids.append(str(tag_id))

# If empty: remove outputs (tmp) and exit success
if not texts:
    if out_path.exists():
        out_path.unlink()
    if meta_path.exists():
        meta_path.unlink()
    sys.exit(0)

# ---------------------------------------------------------
# Load model (only now, so an empty tag file never pays for it)
# ---------------------------------------------------------
model = SentenceTransformer(model_name)

# ---------------------------------------------------------
# Build embeddings
# ---------------------------------------------------------
embeddings = model.encode(
    texts,
    normalize_embeddings=True,
    show_progress_bar=False,
    batch_size=64
)
embeddings = np.array(embeddings).astype("float32")
dim = embeddings.shape[1]

# ---------------------------------------------------------
# Build FAISS index
# ---------------------------------------------------------
index = faiss.IndexFlatIP(dim)
index.add(embeddings)
faiss.write_index(index, str(out_path))

# ---------------------------------------------------------
# Write ID mapping meta (position i = tag id of index row i)
# ---------------------------------------------------------
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(ids, f)

sys.exit(0)

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
# Search the tag vector index for the tags closest to a query.
#
# Prints a JSON list of {"tag_id": str, "score": float} on stdout. For every
# "nothing to return" situation handled below (bad usage, missing modules,
# missing or unusable files, non-positive limit) it prints "[]" and exits 0.

import sys
import json
import math
from pathlib import Path

# ---------------------------------------------------------
# Positional args (aligned with PHP client exec call)
# ---------------------------------------------------------
# 1 query
# 2 limit
# 3 index_path
# 4 meta_path
# 5 model
#
# Example:
# python vector_search_tags.py "foo" 8 /path/vector_tags.index /path/vector_tags.index.meta.json all-MiniLM-L6-v2
# ---------------------------------------------------------
if len(sys.argv) < 6:
    print("[]")
    sys.exit(0)

query = sys.argv[1]
try:
    limit = int(sys.argv[2])
except Exception:
    limit = 5

index_path = Path(sys.argv[3]).resolve()
meta_path = Path(sys.argv[4]).resolve()
model_name = sys.argv[5]

# ---------------------------------------------------------
# Dependency checks
# ---------------------------------------------------------
try:
    import faiss
except Exception:
    # keep stdout clean for caller
    print("[]")
    sys.exit(0)

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    print("[]")
    sys.exit(0)

# ---------------------------------------------------------
# Cheap checks first: limit, files, meta content
# ---------------------------------------------------------
if limit <= 0:
    print("[]")
    sys.exit(0)

if not index_path.is_file() or not meta_path.is_file():
    # No tag index available => no routing
    print("[]")
    sys.exit(0)

try:
    with open(meta_path, "r", encoding="utf-8") as f:
        ids = json.load(f)
except Exception:
    print("[]")
    sys.exit(0)

if not isinstance(ids, list) or len(ids) == 0:
    print("[]")
    sys.exit(0)

# ---------------------------------------------------------
# Load index + model (after the cheap checks, so an unusable meta file
# never costs a model load)
# ---------------------------------------------------------
index = faiss.read_index(str(index_path))
model = SentenceTransformer(model_name)

# Never ask for more neighbours than the index holds.
k = min(limit, int(index.ntotal))
if k <= 0:
    print("[]")
    sys.exit(0)

# ---------------------------------------------------------
# Embed & search
# ---------------------------------------------------------
qvec = model.encode([query], normalize_embeddings=True)
scores, idxs = index.search(qvec, k)

out = []
for score, idx in zip(scores[0], idxs[0]):
    pos = int(idx)
    # Skip "no result" slots (negative index) and rows the meta file does not cover.
    if pos < 0 or pos >= len(ids):
        continue
    value = float(score)
    # json.dumps would write NaN/Infinity, which is not valid JSON and would
    # make the caller discard the whole result.
    if not math.isfinite(value):
        continue
    out.append({
        "tag_id": str(ids[pos]),
        "score": value,
    })

print(json.dumps(out))
sys.exit(0)

View File

@@ -67,6 +67,14 @@
Dokumente Dokumente
</a> </a>
{# ------------------------- #}
{# Tags (Document Routing) #}
{# ------------------------- #}
<a class="nav-link text-light {% if route starts with 'admin_tags' %}active fw-bold{% endif %}"
href="{{ path('admin_tags_index') }}">
Tags
</a>
<a class="nav-link text-light {% if route starts with 'admin_job' %}active fw-bold{% endif %}" <a class="nav-link text-light {% if route starts with 'admin_job' %}active fw-bold{% endif %}"
href="{{ path('admin_jobs') }}"> href="{{ path('admin_jobs') }}">
Ingest Jobs Ingest Jobs

View File

@@ -119,6 +119,11 @@
{# Aktionen #} {# Aktionen #}
<td class="text-end"> <td class="text-end">
<a class="btn btn-sm btn-outline-info me-2"
href="{{ path('admin_document_tags_edit', {id: document.id}) }}">
Tags
</a>
<a class="btn btn-sm btn-outline-light me-2" <a class="btn btn-sm btn-outline-light me-2"
href="{{ path('admin_document_show', {id: document.id}) }}"> href="{{ path('admin_document_show', {id: document.id}) }}">
Details Details

View File

@@ -0,0 +1,99 @@
{% extends 'admin/base.html.twig' %}
{#
  Admin page for editing the tags of one document.

  Variables read by this template:
  - document: uses .id, .title and .tags (iterated; each item exposes .label)
  - allTags:  iterated; each item exposes .id and .label

  The form posts tag_ids[] to route 'admin_document_tags_save' with a CSRF
  token whose id is 'admin_document_tags_save_' followed by the document id.
#}
{% block title %}Tags {{ document.title }}{% endblock %}
{% block body %}
<div class="d-flex justify-content-between align-items-center mb-4">
<h1 class="h3 mb-0">
Tags für Dokument
<span class="text-info">{{ document.title }}</span>
</h1>
<a href="{{ path('admin_documents') }}"
class="btn btn-sm btn-outline-light">
Zurück
</a>
</div>
{# ============================================= #}
{# Tags already assigned #}
{# ============================================= #}
<div class="card bg-black border-secondary mb-4">
<div class="card-body">
<h5 class="text-info mb-3">Zugewiesene Tags</h5>
{% if document.tags is empty %}
<div class="alert alert-secondary mb-0">
Dieses Dokument hat noch keine Tags.
</div>
{% else %}
<div class="d-flex flex-wrap gap-2">
{% for tag in document.tags %}
<span class="badge bg-info text-dark px-3 py-2">
{{ tag.label }}
</span>
{% endfor %}
</div>
{% endif %}
</div>
</div>
{# ============================================= #}
{# Tag assignment form #}
{# ============================================= #}
<div class="card bg-black border-secondary">
<div class="card-body">
<h5 class="text-info mb-3">Tags bearbeiten</h5>
<form method="post"
action="{{ path('admin_document_tags_save', {id: document.id}) }}">
<input type="hidden"
name="_token"
value="{{ csrf_token('admin_document_tags_save_' ~ document.id) }}">
<div class="row">
{% for tag in allTags %}
<div class="col-md-4 mb-2">
<div class="form-check">
<input
class="form-check-input"
type="checkbox"
name="tag_ids[]"
value="{{ tag.id }}"
id="tag_{{ tag.id }}"
{% if tag in document.tags %}checked{% endif %}
>
<label class="form-check-label"
for="tag_{{ tag.id }}">
{{ tag.label }}
</label>
</div>
</div>
{% endfor %}
</div>
<hr class="border-secondary">
<button type="submit"
class="btn btn-sm btn-outline-info">
Speichern
</button>
</form>
</div>
</div>
{% endblock %}

View File

@@ -0,0 +1,85 @@
{% extends 'admin/base.html.twig' %}
{#
  Admin page listing all tags, with a form to create a tag and a delete
  button per tag.

  Variables read by this template:
  - tags: iterated; each item exposes .id, .label, .slug and .description
  - flash messages of types 'success' and 'danger'

  Forms:
  - create: posts label, slug, description to route 'admin_tags_create'
            (CSRF token id 'admin_tag_create')
  - delete: posts to route 'admin_tags_delete' for the tag id
            (CSRF token id 'admin_tag_delete_' followed by the tag id)
#}
{% block title %}Tags{% endblock %}
{% block body %}
<div class="d-flex justify-content-between align-items-center mb-4">
<h1 class="h3">Tags</h1>
</div>
{% for message in app.flashes('success') %}
<div class="alert alert-success">{{ message }}</div>
{% endfor %}
{% for message in app.flashes('danger') %}
<div class="alert alert-danger">{{ message }}</div>
{% endfor %}
<div class="card bg-black border-secondary mb-4 text-light">
<div class="card-body">
<h5 class="text-info mb-3">Neuen Tag erstellen</h5>
<form method="post" action="{{ path('admin_tags_create') }}" class="row g-2">
<input type="hidden" name="_token" value="{{ csrf_token('admin_tag_create') }}"/>
<div class="col-md-3">
<input class="form-control form-control-sm" name="label" placeholder="Label (z.B. Testomat 808)" required />
</div>
<div class="col-md-3">
<input class="form-control form-control-sm" name="slug" placeholder="Slug (z.B. testomat-808)" required />
</div>
<div class="col-md-4">
<input class="form-control form-control-sm" name="description" placeholder="Beschreibung (optional)" />
</div>
<div class="col-md-2 d-grid">
<button class="btn btn-sm btn-outline-info" type="submit">Anlegen</button>
</div>
<div class="col-12">
<small class="text-light">
Hinweis: Nach Änderungen an Tags/Zuweisungen bitte <code>bin/console mto:agent:tags:rebuild</code> ausführen.
</small>
</div>
</form>
</div>
</div>
<div class="card bg-black border-secondary text-light">
<div class="card-body p-0">
<table class="table table-dark table-striped table-hover mb-0 align-middle">
<thead class="table-secondary text-dark">
<tr>
<th>Label</th>
<th>Slug</th>
<th>Beschreibung</th>
<th class="text-end">Aktion</th>
</tr>
</thead>
<tbody>
{% for tag in tags %}
<tr>
<td>{{ tag.label }}</td>
<td><code>{{ tag.slug }}</code></td>
<td class="text-muted">{{ tag.description ?: '' }}</td>
<td class="text-end">
<form method="post" action="{{ path('admin_tags_delete', {id: tag.id}) }}" style="display:inline">
<input type="hidden" name="_token" value="{{ csrf_token('admin_tag_delete_' ~ tag.id) }}"/>
<button class="btn btn-sm btn-outline-danger" type="submit"
onclick="return confirm('Tag wirklich löschen? (Zuweisungen werden mit gelöscht)')">
Löschen
</button>
</form>
</td>
</tr>
{% else %}
<tr><td colspan="4" class="text-light p-3">Noch keine Tags vorhanden.</td></tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
{% endblock %}