diff --git a/config/services.yaml b/config/services.yaml
index 74ffb5d..55428b5 100644
--- a/config/services.yaml
+++ b/config/services.yaml
@@ -6,14 +6,12 @@ parameters:
# ------------------------------------------------------------
# Root
# ------------------------------------------------------------
-
mto.root: '%kernel.project_dir%'
mto.kernel.dir: '%mto.root%'
# ------------------------------------------------------------
# Knowledge Root (ZENTRAL)
# ------------------------------------------------------------
-
mto.knowledge.root: '%mto.root%/var/knowledge'
mto.knowledge.ndjson: '%mto.knowledge.root%/index.ndjson'
@@ -23,13 +21,28 @@ parameters:
mto.runtime.meta: '%mto.knowledge.root%/index_runtime.json'
mto.knowledge.upload: '%mto.knowledge.root%/uploads'
+ # ------------------------------------------------------------
+ # Tags (Document Routing)
+ # ------------------------------------------------------------
+ mto.knowledge.tags_ndjson: '%mto.knowledge.root%/tags.ndjson'
+
+ # Tag vector index outputs
+ mto.knowledge.vector_tags_index: '%mto.knowledge.root%/vector_tags.index'
+ mto.knowledge.vector_tags_index_meta: '%mto.knowledge.root%/vector_tags.index.meta.json'
+
+ # Tag vector scripts (in src/Vector)
+ mto.vector.ingest_tags_script: '%mto.root%/src/Vector/vector_ingest_tags.py'
+ mto.vector.search_tags_script: '%mto.root%/src/Vector/vector_search_tags.py'
+
+ # Lock for tag rebuild jobs
+ mto.tags.rebuild_lock: '%mto.knowledge.root%/locks/tag_rebuild.lock'
+
# Backward compatibility alias
mto.vector.data.upload.path: '%mto.knowledge.upload%'
# ------------------------------------------------------------
# Index Configuration (Fallback Guardrails)
# ------------------------------------------------------------
-
mto.index.chunk_size: 800
mto.index.chunk_overlap: 100
mto.index.embedding_model: 'all-MiniLM-L6-v2'
@@ -39,7 +52,6 @@ parameters:
# ------------------------------------------------------------
# Python / Vector Runtime
# ------------------------------------------------------------
-
mto.vector.python_bin: '/var/www/html/.venv/bin/python3'
mto.vector.ingest_script: '%mto.root%/src/Vector/vector_ingest.py'
mto.vector.search_script: '%mto.root%/src/Vector/vector_search.py'
@@ -158,6 +170,52 @@ services:
$timeoutSeconds: '%mto.vector.timeout%'
$configurationProvider: '@App\Index\IndexConfigurationProvider'
+ # ------------------------------------------------------------
+ # Tags Export (Document Routing)
+ # ------------------------------------------------------------
+
+ App\Tag\TagNdjsonExporter:
+ arguments:
+ $tagsNdjsonPath: '%mto.knowledge.tags_ndjson%'
+
+ # ------------------------------------------------------------
+ # Tags Vector (Builder + Search) ✅ the fix is here
+ # ------------------------------------------------------------
+
+ App\Tag\TagVectorIndexBuilder:
+ arguments:
+ $pythonBin: '%mto.vector.python_bin%'
+ $scriptPath: '%mto.vector.ingest_tags_script%'
+ $tagsNdjsonPath: '%mto.knowledge.tags_ndjson%'
+ $vectorTagsIndexPath: '%mto.knowledge.vector_tags_index%'
+ $embeddingModel: '%mto.index.embedding_model%'
+ $timeoutSeconds: '%mto.vector.timeout%'
+ $agentLogger: '@monolog.logger.agent'
+
+ App\Tag\TagVectorSearchClient:
+ arguments:
+ $pythonBin: '%mto.vector.python_bin%'
+ $scriptPath: '%mto.vector.search_tags_script%'
+ $vectorTagsIndexPath: '%mto.knowledge.vector_tags_index%'
+ $vectorTagsMetaPath: '%mto.knowledge.vector_tags_index_meta%'
+ $embeddingModel: '%mto.index.embedding_model%'
+ $agentLogger: '@monolog.logger.agent'
+
+ App\Tag\TagRoutingService: ~
+
+ # ------------------------------------------------------------
+ # Tag Rebuild Jobs (8A)
+ # ------------------------------------------------------------
+
+ App\Service\TagRebuildJobService:
+ arguments:
+ $projectDir: '%mto.root%'
+ $agentLogger: '@monolog.logger.agent'
+
+ App\Command\TagRebuildRunJobCommand:
+ arguments:
+ $lockFilePath: '%mto.tags.rebuild_lock%'
+
# ------------------------------------------------------------
# Admin Utilities
# ------------------------------------------------------------
diff --git a/migrations/Version20260221000100.php b/migrations/Version20260221000100.php
new file mode 100644
index 0000000..8cdff04
--- /dev/null
+++ b/migrations/Version20260221000100.php
@@ -0,0 +1,81 @@
+addSql("
+ CREATE TABLE tag (
+ id UUID NOT NULL,
+ name VARCHAR(120) NOT NULL,
+ slug VARCHAR(120) NOT NULL,
+ created_at TIMESTAMP(0) WITHOUT TIME ZONE NOT NULL,
+ PRIMARY KEY(id)
+ )
+ ");
+
+ $this->addSql("
+ CREATE UNIQUE INDEX uniq_tag_slug ON tag (slug)
+ ");
+
+ // --------------------------------------------------
+ // DOCUMENT_VERSION_TAG (ManyToMany)
+ // --------------------------------------------------
+
+ $this->addSql("
+ CREATE TABLE document_version_tag (
+ document_version_id UUID NOT NULL,
+ tag_id UUID NOT NULL,
+ PRIMARY KEY(document_version_id, tag_id)
+ )
+ ");
+
+ $this->addSql("
+ CREATE INDEX idx_dv_tag_version
+ ON document_version_tag (document_version_id)
+ ");
+
+ $this->addSql("
+ CREATE INDEX idx_dv_tag_tag
+ ON document_version_tag (tag_id)
+ ");
+
+ $this->addSql("
+ ALTER TABLE document_version_tag
+ ADD CONSTRAINT fk_dv_tag_version
+ FOREIGN KEY (document_version_id)
+ REFERENCES document_version (id)
+ ON DELETE CASCADE
+ ");
+
+ $this->addSql("
+ ALTER TABLE document_version_tag
+ ADD CONSTRAINT fk_dv_tag_tag
+ FOREIGN KEY (tag_id)
+ REFERENCES tag (id)
+ ON DELETE CASCADE
+ ");
+ }
+
+ public function down(Schema $schema): void
+ {
+ $this->addSql("DROP TABLE document_version_tag");
+ $this->addSql("DROP TABLE tag");
+ }
+}
\ No newline at end of file
diff --git a/src/Command/TagRebuildRunJobCommand.php b/src/Command/TagRebuildRunJobCommand.php
new file mode 100644
index 0000000..f32d3c6
--- /dev/null
+++ b/src/Command/TagRebuildRunJobCommand.php
@@ -0,0 +1,103 @@
+addArgument('jobId', InputArgument::REQUIRED, 'TagRebuildJob UUID');
+ }
+
+    /**
+     * Executes one tag rebuild job: exports tags.ndjson, then rebuilds the tag vector index.
+     *
+     * Only one rebuild may run at a time. This is enforced with a non-blocking
+     * exclusive flock() on the configured lock file; when the lock is busy the
+     * job is marked as failed right away instead of waiting.
+     *
+     * @return int Command::SUCCESS after export and build completed, Command::FAILURE otherwise
+     */
+    protected function execute(InputInterface $input, OutputInterface $output): int
+    {
+        $jobId = (string) $input->getArgument('jobId');
+
+        // A malformed id can never match a job; skip the lookup so it is reported
+        // as "not found" instead of risking a conversion error inside the repository.
+        $job = \Symfony\Component\Uid\Uuid::isValid($jobId)
+            ? $this->em->getRepository(TagRebuildJob::class)->find($jobId)
+            : null;
+
+        if (!$job instanceof TagRebuildJob) {
+            $output->writeln('Job not found.');
+
+            return Command::FAILURE;
+        }
+
+        // ---------------------------------------------------------
+        // Global lock to avoid parallel rebuilds
+        // ---------------------------------------------------------
+        $lockDir = \dirname($this->lockFilePath);
+        if (!\is_dir($lockDir)) {
+            // Best effort: a failure here surfaces through fopen() below.
+            @\mkdir($lockDir, 0775, true);
+        }
+
+        $fh = @\fopen($this->lockFilePath, 'c+');
+        if ($fh === false) {
+            $job->markFailed('Cannot open lock file: ' . $this->lockFilePath);
+            $this->em->flush();
+            $output->writeln('Cannot open lock file.');
+
+            return Command::FAILURE;
+        }
+
+        try {
+            // If another rebuild runs, we fail fast (simple & safe).
+            if (!\flock($fh, LOCK_EX | LOCK_NB)) {
+                $job->markFailed('Another tag rebuild is currently running (lock busy).');
+                $this->em->flush();
+                $output->writeln('Lock busy. Another rebuild is running.');
+
+                return Command::FAILURE;
+            }
+
+            $job->markRunning();
+            $this->em->flush();
+
+            try {
+                $export = $this->exporter->export();
+                $this->builder->build();
+
+                $job->markCompleted();
+                $this->em->flush();
+
+                $output->writeln('OK');
+                $output->writeln('tags.ndjson: ' . $export['path']);
+
+                return Command::SUCCESS;
+            } catch (\Throwable $e) {
+                $job->markFailed($e->getMessage());
+                $this->em->flush();
+
+                $output->writeln('FAILED: ' . $e->getMessage());
+
+                return Command::FAILURE;
+            }
+        } finally {
+            // Single release point: runs on every exit path, including the case
+            // where persisting the job state itself throws.
+            @\flock($fh, LOCK_UN);
+            @\fclose($fh);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Command/TagsExportCommand.php b/src/Command/TagsExportCommand.php
new file mode 100644
index 0000000..5d3bd27
--- /dev/null
+++ b/src/Command/TagsExportCommand.php
@@ -0,0 +1,42 @@
+exporter->export();
+ } catch (\Throwable $e) {
+ $output->writeln('ERROR: ' . $e->getMessage() . '');
+ return Command::FAILURE;
+ }
+
+ $output->writeln('Tags NDJSON exported');
+ $output->writeln('Path: ' . $result['path']);
+ $output->writeln('Tags: ' . $result['tags']);
+ $output->writeln('Lines: ' . $result['lines']);
+ $output->writeln('Bytes: ' . $result['bytes']);
+
+ return Command::SUCCESS;
+ }
+}
\ No newline at end of file
diff --git a/src/Command/TagsRebuildCommand.php b/src/Command/TagsRebuildCommand.php
new file mode 100644
index 0000000..beeb5a0
--- /dev/null
+++ b/src/Command/TagsRebuildCommand.php
@@ -0,0 +1,46 @@
+exporter->export();
+ $output->writeln('1/2 Exported tags.ndjson');
+ $output->writeln('Path: ' . $export['path']);
+ $output->writeln('Tags: ' . $export['tags']);
+ $output->writeln('Lines: ' . $export['lines']);
+ $output->writeln('Bytes: ' . $export['bytes']);
+
+ $this->builder->build();
+ $output->writeln('2/2 Built vector_tags.index');
+ } catch (\Throwable $e) {
+ $output->writeln('ERROR: ' . $e->getMessage() . '');
+ return Command::FAILURE;
+ }
+
+ return Command::SUCCESS;
+ }
+}
\ No newline at end of file
diff --git a/src/Controller/Admin/DocumentTagController.php b/src/Controller/Admin/DocumentTagController.php
new file mode 100644
index 0000000..52930f8
--- /dev/null
+++ b/src/Controller/Admin/DocumentTagController.php
@@ -0,0 +1,93 @@
+getRepository(Document::class)->find($id);
+ if (!$document instanceof Document) {
+ throw $this->createNotFoundException('Document not found');
+ }
+
+ $allTags = $em->createQueryBuilder()
+ ->select('t')
+ ->from(Tag::class, 't')
+ ->orderBy('t.label', 'ASC')
+ ->getQuery()
+ ->getResult();
+
+ $assigned = [];
+ foreach ($document->getTags() as $tag) {
+ $assigned[(string)$tag->getId()] = true;
+ }
+
+ return $this->render('admin/document_tags/edit.html.twig', [
+ 'document' => $document,
+ 'allTags' => $allTags,
+ 'assigned' => $assigned,
+ ]);
+ }
+
+    /**
+     * Replaces the tag assignment of a document with the submitted "tag_ids" list,
+     * persists the change and enqueues an asynchronous tag rebuild.
+     *
+     * Values in "tag_ids" that are not parsable UUIDs are ignored. Unknown tag ids
+     * are ignored as well. Redirects to the document list when the document does
+     * not exist, otherwise back to the tag edit page.
+     */
+    #[Route('/{id}/tags/save', name: 'admin_document_tags_save', methods: ['POST'])]
+    public function save(
+        string $id,
+        Request $request,
+        EntityManagerInterface $em,
+        TagRebuildJobService $jobs
+    ): RedirectResponse {
+        $document = $em->getRepository(Document::class)->find($id);
+        if (!$document instanceof Document) {
+            return $this->redirectToRoute('admin_documents');
+        }
+
+        // NOTE(review): TagController validates a CSRF token on its POST actions,
+        // this action does not. Confirm whether the form sends one and enforce it.
+
+        // Selected tags keyed by canonical UUID string. Keying by string makes the
+        // membership test below strict and independent of the concrete Uuid class.
+        $selected = [];
+        foreach ($request->request->all('tag_ids') as $value) {
+            if (!\is_string($value)) {
+                continue;
+            }
+
+            try {
+                $uuid = \Symfony\Component\Uid\Uuid::fromString($value);
+            } catch (\Throwable) {
+                // Deliberate best effort: unparsable ids are simply skipped.
+                continue;
+            }
+
+            $selected[$uuid->toRfc4122()] = $uuid;
+        }
+
+        // Remove tags that are no longer selected.
+        // getTags() returns a plain array, so removing while iterating is safe.
+        foreach ($document->getTags() as $tag) {
+            if (!isset($selected[$tag->getId()->toRfc4122()])) {
+                $document->removeTag($tag);
+            }
+        }
+
+        // Add newly selected tags.
+        foreach ($selected as $uuid) {
+            $tag = $em->find(\App\Entity\Tag::class, $uuid);
+            if ($tag instanceof \App\Entity\Tag && !$document->hasTag($tag)) {
+                $document->addTag($tag);
+            }
+        }
+
+        $em->flush();
+        $jobs->enqueueAndStartAsync();
+
+        return $this->redirectToRoute('admin_document_tags_edit', ['id' => $id]);
+    }
+}
\ No newline at end of file
diff --git a/src/Controller/Admin/TagController.php b/src/Controller/Admin/TagController.php
new file mode 100644
index 0000000..42c8ecb
--- /dev/null
+++ b/src/Controller/Admin/TagController.php
@@ -0,0 +1,101 @@
+createQueryBuilder()
+ ->select('t')
+ ->from(Tag::class, 't')
+ ->orderBy('t.label', 'ASC')
+ ->getQuery()
+ ->getResult();
+
+ return $this->render('admin/tag/index.html.twig', [
+ 'tags' => $tags,
+ ]);
+ }
+
+    /**
+     * Creates a tag from the posted "label", "slug" and "description" fields.
+     *
+     * Rejects the request with a flash message when the CSRF token is invalid,
+     * when label or slug is empty, or when the slug is already taken. On success
+     * the tag is persisted and an asynchronous rebuild is enqueued. Every path
+     * redirects back to the tag list.
+     */
+    #[Route('/create', name: 'admin_tags_create', methods: ['POST'])]
+    public function create(Request $request, EntityManagerInterface $em, TagRebuildJobService $jobs): RedirectResponse
+    {
+        $form = $request->request;
+
+        // Shared rejection path: danger flash + back to the list.
+        $reject = function (string $message): RedirectResponse {
+            $this->addFlash('danger', $message);
+
+            return $this->redirectToRoute('admin_tags_index');
+        };
+
+        if (!$this->isCsrfTokenValid('admin_tag_create', (string) $form->get('_token', ''))) {
+            return $reject('Ungültiges CSRF Token.');
+        }
+
+        [$label, $slug, $description] = array_map(
+            static fn (string $field): string => trim((string) $form->get($field, '')),
+            ['label', 'slug', 'description']
+        );
+
+        if ($label === '' || $slug === '') {
+            return $reject('Label und Slug sind Pflichtfelder.');
+        }
+
+        $slugQuery = $em->createQueryBuilder()
+            ->select('COUNT(t.id)')
+            ->from(Tag::class, 't')
+            ->where('t.slug = :slug')
+            ->setParameter('slug', $slug)
+            ->getQuery();
+
+        if ((int) $slugQuery->getSingleScalarResult() > 0) {
+            return $reject('Slug existiert bereits.');
+        }
+
+        $em->persist(new Tag($slug, $label, $description === '' ? null : $description));
+        $em->flush();
+
+        // Kick off the background rebuild only after the tag is stored.
+        $jobs->enqueueAndStartAsync();
+
+        $this->addFlash('success', 'Tag wurde erstellt. Rebuild läuft im Hintergrund.');
+
+        return $this->redirectToRoute('admin_tags_index');
+    }
+
+    /**
+     * Deletes the tag with the given id and enqueues an asynchronous rebuild.
+     *
+     * The CSRF token is checked first (token id "admin_tag_delete_" + id), then the
+     * tag is looked up. Either failure produces a danger flash. Every path
+     * redirects back to the tag list.
+     */
+    #[Route('/{id}/delete', name: 'admin_tags_delete', methods: ['POST'])]
+    public function delete(string $id, Request $request, EntityManagerInterface $em, TagRebuildJobService $jobs): RedirectResponse
+    {
+        $tag = null;
+        $error = null;
+
+        $submittedToken = (string) $request->request->get('_token', '');
+        if (!$this->isCsrfTokenValid('admin_tag_delete_' . $id, $submittedToken)) {
+            $error = 'Ungültiges CSRF Token.';
+        } else {
+            $tag = $em->getRepository(Tag::class)->find($id);
+            if (!$tag instanceof Tag) {
+                $error = 'Tag nicht gefunden.';
+            }
+        }
+
+        if ($error === null) {
+            $em->remove($tag);
+            $em->flush();
+
+            // Kick off the background rebuild only after the tag is gone.
+            $jobs->enqueueAndStartAsync();
+
+            $this->addFlash('success', 'Tag wurde gelöscht. Rebuild läuft im Hintergrund.');
+        } else {
+            $this->addFlash('danger', $error);
+        }
+
+        return $this->redirectToRoute('admin_tags_index');
+    }
+}
\ No newline at end of file
diff --git a/src/Entity/Document.php b/src/Entity/Document.php
index f49730e..e7d932d 100644
--- a/src/Entity/Document.php
+++ b/src/Entity/Document.php
@@ -2,6 +2,7 @@
namespace App\Entity;
+use DateTimeImmutable;
use Doctrine\ORM\Mapping as ORM;
use Symfony\Component\Uid\Uuid;
use Doctrine\Common\Collections\ArrayCollection;
@@ -14,7 +15,7 @@ class Document
public const STATUS_ARCHIVED = 'ARCHIVED';
#[ORM\Id]
- #[ORM\Column(type: 'uuid', unique: true)]
+ #[ORM\Column(type: 'uuid')]
private Uuid $id;
#[ORM\Column(length: 255)]
@@ -28,35 +29,49 @@ class Document
private User $createdBy;
#[ORM\Column]
- private \DateTimeImmutable $createdAt;
+ private DateTimeImmutable $createdAt;
- // 🔥 REMOVE ergänzt
#[ORM\OneToMany(
- mappedBy: 'document',
targetEntity: DocumentVersion::class,
- cascade: ['persist', 'remove'],
+ mappedBy: 'document',
+ cascade: ['persist'],
orphanRemoval: true
)]
private Collection $versions;
- // 🔥 onDelete ergänzt
#[ORM\ManyToOne]
- #[ORM\JoinColumn(nullable: true, onDelete: 'SET NULL')]
private ?DocumentVersion $currentVersion = null;
+ // ---------------------------------------------------------
+ // Tags via Join-Entity (DocumentTag)
+ // ---------------------------------------------------------
+
+ #[ORM\OneToMany(
+ targetEntity: DocumentTag::class,
+ mappedBy: 'document',
+ cascade: ['persist', 'remove'],
+ orphanRemoval: true
+ )]
+ private Collection $documentTags;
+
public function __construct()
{
$this->id = Uuid::v4();
- $this->createdAt = new \DateTimeImmutable();
+ $this->createdAt = new DateTimeImmutable();
$this->versions = new ArrayCollection();
+ $this->documentTags = new ArrayCollection();
}
+ // ---------------------------------------------------------
+ // Basic Getters
+ // ---------------------------------------------------------
+
public function getId(): Uuid
{
return $this->id;
}
- public function getCreatedAt(): \DateTimeImmutable
+ public function getCreatedAt(): DateTimeImmutable
{
return $this->createdAt;
}
@@ -87,12 +102,14 @@ class Document
return $this->createdBy;
}
- public function setCreatedBy(User $user): static
+ public function setCreatedBy(User $createdBy): void
{
- $this->createdBy = $user;
- return $this;
+ $this->createdBy = $createdBy;
}
+ /**
+ * @return Collection
+ */
public function getVersions(): Collection
{
return $this->versions;
@@ -106,13 +123,67 @@ class Document
}
}
+ public function getCurrentVersion(): ?DocumentVersion
+ {
+ return $this->currentVersion;
+ }
+
public function setCurrentVersion(?DocumentVersion $version): void
{
$this->currentVersion = $version;
}
- public function getCurrentVersion(): ?DocumentVersion
+ // ---------------------------------------------------------
+ // Tag API (Join-Entity basiert)
+ // ---------------------------------------------------------
+
+ /**
+ * @return Collection
+ */
+ public function getDocumentTags(): Collection
{
- return $this->currentVersion;
+ return $this->documentTags;
}
-}
+
+    /**
+     * Convenience accessor: unwraps the DocumentTag join entities into their Tag objects.
+     *
+     * @return Tag[]
+     */
+    public function getTags(): array
+    {
+        $tags = [];
+
+        // Keys of the underlying collection are carried over unchanged.
+        foreach ($this->documentTags->toArray() as $key => $documentTag) {
+            $tags[$key] = (static fn (DocumentTag $dt) => $dt->getTag())($documentTag);
+        }
+
+        return $tags;
+    }
+
+    /**
+     * Tells whether the given tag is already assigned, comparing tags by id.
+     */
+    public function hasTag(Tag $tag): bool
+    {
+        return $this->documentTags->exists(
+            static fn ($key, $documentTag): bool => $documentTag->getTag()->getId()->equals($tag->getId())
+        );
+    }
+
+    /**
+     * Assigns the tag by adding a new DocumentTag join entity; a no-op when it is already assigned.
+     */
+    public function addTag(Tag $tag): void
+    {
+        if (!$this->hasTag($tag)) {
+            $this->documentTags->add(new DocumentTag($this, $tag));
+        }
+    }
+
+    /**
+     * Removes the first join entity whose tag id equals the given tag's id; a no-op when none matches.
+     */
+    public function removeTag(Tag $tag): void
+    {
+        foreach ($this->documentTags as $documentTag) {
+            if (!$documentTag->getTag()->getId()->equals($tag->getId())) {
+                continue;
+            }
+
+            $this->documentTags->removeElement($documentTag);
+            break;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Entity/DocumentTag.php b/src/Entity/DocumentTag.php
new file mode 100644
index 0000000..169b69f
--- /dev/null
+++ b/src/Entity/DocumentTag.php
@@ -0,0 +1,38 @@
+document = $document;
+ $this->tag = $tag;
+ }
+
+ public function getDocument(): Document
+ {
+ return $this->document;
+ }
+
+ public function getTag(): Tag
+ {
+ return $this->tag;
+ }
+}
\ No newline at end of file
diff --git a/src/Entity/Tag.php b/src/Entity/Tag.php
new file mode 100644
index 0000000..aeb94b0
--- /dev/null
+++ b/src/Entity/Tag.php
@@ -0,0 +1,82 @@
+id = Uuid::v4();
+ $this->createdAt = new \DateTimeImmutable();
+
+ $this->slug = $slug;
+ $this->label = $label;
+ $this->description = $description;
+ }
+
+ public function getId(): Uuid
+ {
+ return $this->id;
+ }
+
+ public function getSlug(): string
+ {
+ return $this->slug;
+ }
+
+ public function setSlug(string $slug): static
+ {
+ $this->slug = $slug;
+ return $this;
+ }
+
+ public function getLabel(): string
+ {
+ return $this->label;
+ }
+
+ public function setLabel(string $label): static
+ {
+ $this->label = $label;
+ return $this;
+ }
+
+ public function getDescription(): ?string
+ {
+ return $this->description;
+ }
+
+ public function setDescription(?string $description): static
+ {
+ $this->description = $description;
+ return $this;
+ }
+
+ public function getCreatedAt(): \DateTimeImmutable
+ {
+ return $this->createdAt;
+ }
+}
\ No newline at end of file
diff --git a/src/Entity/TagRebuildJob.php b/src/Entity/TagRebuildJob.php
new file mode 100644
index 0000000..0cadabf
--- /dev/null
+++ b/src/Entity/TagRebuildJob.php
@@ -0,0 +1,96 @@
+id = Uuid::v4();
+ $this->createdAt = new \DateTimeImmutable();
+ $this->status = self::STATUS_QUEUED;
+ }
+
+ public function getId(): Uuid
+ {
+ return $this->id;
+ }
+
+ public function getStatus(): string
+ {
+ return $this->status;
+ }
+
+ public function markRunning(): void
+ {
+ $this->status = self::STATUS_RUNNING;
+ $this->startedAt = new \DateTimeImmutable();
+ $this->errorMessage = null;
+ }
+
+ public function markCompleted(): void
+ {
+ $this->status = self::STATUS_COMPLETED;
+ $this->finishedAt = new \DateTimeImmutable();
+ }
+
+ public function markFailed(string $message): void
+ {
+ $this->status = self::STATUS_FAILED;
+ $this->finishedAt = new \DateTimeImmutable();
+ $this->errorMessage = $message;
+ }
+
+ public function getCreatedAt(): \DateTimeImmutable
+ {
+ return $this->createdAt;
+ }
+
+ public function getStartedAt(): ?\DateTimeImmutable
+ {
+ return $this->startedAt;
+ }
+
+ public function getFinishedAt(): ?\DateTimeImmutable
+ {
+ return $this->finishedAt;
+ }
+
+ public function getErrorMessage(): ?string
+ {
+ return $this->errorMessage;
+ }
+}
\ No newline at end of file
diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
index 409f419..1e62a2a 100644
--- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
+++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
@@ -4,33 +4,75 @@ declare(strict_types=1);
namespace App\Knowledge\Retrieval;
-use App\Knowledge\QueryCleaner;
+use App\Knowledge\ChunkManager;
+use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient;
final class NdjsonHybridRetriever implements RetrieverInterface
{
- private const VECTOR_SCORE_THRESHOLD = 0.25;
+ private const VECTOR_SCORE_THRESHOLD = 0.65;
+
+ /**
+ * When tag routing is active we raise TopK,
+ * because the hits are filtered by document_id afterwards.
+ */
+ private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 10;
+
+ /**
+ * Keyword scan: minimum number of matching terms for a chunk to count as a candidate.
+ */
+ private const KEYWORD_MIN_HITS = 1;
public function __construct(
+ private readonly ChunkManager $chunkManager,
private readonly NdjsonChunkLookup $lookup,
private readonly VectorSearchClient $vectorClient,
- private readonly QueryCleaner $queryCleaner,
- private readonly int $maxChunks = 25,
- private readonly int $vectorTopK = 10,
- )
- {
- }
+ private readonly TagRoutingService $tagRouting,
+ private readonly int $maxChunks = 3,
+ private readonly int $vectorTopK = 5,
+ ) {}
public function retrieve(string $prompt, int $limit = null): array
{
- $limit = $this->maxChunks;
- $keywordChunks = [];
- $query = $this->queryCleaner->clean($prompt);
+ $limit ??= $this->maxChunks;
- // Vector / enrichment
- $hits = $this->vectorClient->search($query, $this->vectorTopK);
+ // ---------------------------------------------------------
+ // 0) Tag-Routing FIRST (soft gate)
+ // ---------------------------------------------------------
+ $candidateDocIds = $this->tagRouting->route($prompt);
+
+ $candidateSet = null;
+
+ if (is_array($candidateDocIds) && $candidateDocIds !== []) {
+ $candidateSet = array_fill_keys($candidateDocIds, true);
+ }
+
+ // ---------------------------------------------------------
+ // 1) Keyword first (simple streaming scan)
+ // ---------------------------------------------------------
+ $terms = $this->extractTerms($prompt);
+
+ $keywordChunks = $this->keywordSearchStreaming($terms, $limit, $candidateSet);
+
+ if (\count($keywordChunks) >= $limit) {
+ return array_slice($keywordChunks, 0, $limit);
+ }
+
+ // ---------------------------------------------------------
+ // 2) Vector fallback / enrichment
+ // - If routed: increase TopK, then filter by document_id
+ // - Soft fallback: if filtering yields nothing -> global vector once
+ // ---------------------------------------------------------
+ $topK = $this->vectorTopK;
+
+ if ($candidateSet !== null) {
+ $topK = max($this->vectorTopK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $this->vectorTopK);
+ $topK = min($topK, 200); // guardrail
+ }
+
+ $hits = $this->vectorClient->search($prompt, $topK);
if ($hits === []) {
- return $this->diversifyByDevice($keywordChunks, $limit, 1);
+ return $keywordChunks;
}
$chunkIds = [];
@@ -45,73 +87,78 @@ final class NdjsonHybridRetriever implements RetrieverInterface
}
if ($chunkIds === []) {
- return $this->diversifyByDevice($keywordChunks, $limit, 1);
+ return $keywordChunks;
}
$rows = $this->lookup->findByChunkIds($chunkIds);
- foreach ($chunkIds as $id) {
+ // routed filtering by document_id
+ $finalChunkIds = $chunkIds;
+
+ if ($candidateSet !== null) {
+ $filtered = [];
+
+ foreach ($chunkIds as $id) {
+ $row = $rows[$id] ?? null;
+ if (!is_array($row)) {
+ continue;
+ }
+ $docId = $row['document_id'] ?? null;
+ if (!is_string($docId) || !isset($candidateSet[$docId])) {
+ continue;
+ }
+ $filtered[] = $id;
+ }
+
+ // Soft fallback: if routing filtered everything away, retry global vector once
+ if ($filtered === []) {
+ $hits2 = $this->vectorClient->search($prompt, $this->vectorTopK);
+ if ($hits2 === []) {
+ return $keywordChunks;
+ }
+
+ $chunkIds2 = [];
+ foreach ($hits2 as $hit) {
+ if (!isset($hit['chunk_id'], $hit['score'])) {
+ continue;
+ }
+ if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
+ continue;
+ }
+ $chunkIds2[] = (string)$hit['chunk_id'];
+ }
+
+ if ($chunkIds2 === []) {
+ return $keywordChunks;
+ }
+
+ $rows = $this->lookup->findByChunkIds($chunkIds2);
+ $finalChunkIds = $chunkIds2;
+ } else {
+ $finalChunkIds = $filtered;
+ }
+ }
+
+ foreach ($finalChunkIds as $id) {
if (!isset($rows[$id]['text']) || !is_string($rows[$id]['text'])) {
continue;
}
$keywordChunks[] = trim($rows[$id]['text']);
}
- // dedupe
+ // ---------------------------------------------------------
+ // 3) dedupe + limit
+ // ---------------------------------------------------------
$seen = [];
- $deduped = [];
+ $out = [];
foreach ($keywordChunks as $chunk) {
- $key = mb_strtolower(preg_replace('/\s+/u', ' ', $chunk));
+ $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
if (isset($seen[$key])) {
continue;
}
$seen[$key] = true;
- $deduped[] = $chunk;
- }
-
- // diversify
- return $this->diversifyByDevice($deduped, $limit, 1);
- }
-
- private function extractTerms(string $text): array
- {
- $text = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));
-
- return array_values(array_filter(
- explode(' ', $text),
- static fn(string $w) => mb_strlen($w) > 2
- ));
- }
-
- private function extractDevice(string $chunk): string
- {
- $firstLine = explode("\n", $chunk, 2)[0] ?? '';
- return trim($firstLine);
- }
-
- private function diversifyByDevice(array $chunks, int $limit, int $maxPerDevice = 1): array
- {
- $seenDevices = [];
- $out = [];
-
- foreach ($chunks as $chunk) {
- $device = $this->extractDevice($chunk);
-
- if ($device === '') {
- continue;
- }
-
- if (!isset($seenDevices[$device])) {
- $seenDevices[$device] = 0;
- }
-
- if ($seenDevices[$device] >= $maxPerDevice) {
- continue;
- }
-
$out[] = $chunk;
- $seenDevices[$device]++;
if (\count($out) >= $limit) {
break;
@@ -120,4 +167,116 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return $out;
}
-}
+
+    /**
+     * Streaming keyword search over the rows yielded by ChunkManager::streamAll().
+     *
+     * Each chunk is scored by the number of terms it contains (case-insensitive
+     * substring match). Only the best $limit chunks are kept, ordered by score
+     * (descending), then by text length (ascending, shorter chunks first).
+     * When $candidateSet is given, rows whose document_id is not a key of the
+     * set are skipped before scoring.
+     *
+     * @param string[] $terms search terms; empty strings are ignored
+     * @param int $limit maximum number of chunks to return; values <= 0 yield []
+     * @param array<string, true>|null $candidateSet allowed document ids as keys, or null for no restriction
+     * @return string[] trimmed chunk texts, best match first
+     */
+    private function keywordSearchStreaming(array $terms, int $limit, ?array $candidateSet): array
+    {
+        // Drop empty terms once, so they neither cost work per row nor inflate
+        // the "perfect score" used for the early exit below.
+        $terms = array_values(array_filter($terms, static fn (string $t): bool => $t !== ''));
+
+        if ($terms === [] || $limit <= 0) {
+            return [];
+        }
+
+        $maxScore = \count($terms);
+
+        // Ranking: higher score first, then shorter chunk first (often more precise).
+        $byRank = static function (array $a, array $b): int {
+            return ($b['score'] <=> $a['score'])
+                ?: (mb_strlen($a['text']) <=> mb_strlen($b['text']));
+        };
+
+        // top list: each item = ['score' => int, 'text' => string]
+        $top = [];
+
+        foreach ($this->chunkManager->streamAll() as $row) {
+            $text = $row['text'] ?? null;
+            if (!is_string($text) || $text === '') {
+                continue;
+            }
+
+            if ($candidateSet !== null) {
+                $docId = $row['document_id'] ?? null;
+                if (!is_string($docId) || !isset($candidateSet[$docId])) {
+                    continue;
+                }
+            }
+
+            $haystack = mb_strtolower($text);
+
+            $score = 0;
+            foreach ($terms as $t) {
+                if (mb_stripos($haystack, $t) !== false) {
+                    $score++;
+                }
+            }
+
+            if ($score < self::KEYWORD_MIN_HITS) {
+                continue;
+            }
+
+            $top[] = [
+                'score' => $score,
+                'text' => trim($text),
+            ];
+
+            // keep only best N (simple sort, N is tiny)
+            usort($top, $byRank);
+
+            if (\count($top) > $limit) {
+                $top = array_slice($top, 0, $limit);
+            }
+
+            // Early exit only when the list is full AND even its weakest entry is
+            // a perfect match. Checking just the best entry would stop the scan
+            // while lower-scored chunks still occupy slots.
+            if (\count($top) === $limit && $top[$limit - 1]['score'] >= $maxScore) {
+                break;
+            }
+        }
+
+        return array_column($top, 'text');
+    }
+
+    /**
+     * Minimal term extraction (stable behaviour, little magic).
+     *
+     * Removes every character that is not a letter, digit or whitespace,
+     * lower-cases the rest and returns the unique words longer than two
+     * characters, in order of first appearance.
+     *
+     * @return string[]
+     */
+    private function extractTerms(string $text): array
+    {
+        $normalized = mb_strtolower((string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));
+
+        // Split on any whitespace run. The filter above keeps tabs and newlines,
+        // so splitting on a single space would glue words across line breaks.
+        $words = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY);
+        if ($words === false) {
+            return [];
+        }
+
+        $terms = array_filter($words, static fn (string $w): bool => mb_strlen($w) > 2);
+
+        // array_unique keeps the first occurrence, so the original order is preserved.
+        return array_values(array_unique($terms));
+    }
+}
\ No newline at end of file
diff --git a/src/Service/TagRebuildJobService.php b/src/Service/TagRebuildJobService.php
new file mode 100644
index 0000000..342dc9a
--- /dev/null
+++ b/src/Service/TagRebuildJobService.php
@@ -0,0 +1,51 @@
+em->persist($job);
+ $this->em->flush();
+
+ $this->startAsync($job);
+
+ return $job;
+ }
+
+    /**
+     * Launches the job runner console command for the given job as a background
+     * shell process (output discarded) and logs the command line that was used.
+     */
+    private function startAsync(TagRebuildJob $job): void
+    {
+        $jobId = (string) $job->getId();
+
+        // Every dynamic part is shell-escaped; the command name and the
+        // redirection suffix are fixed literals.
+        $commandLine = implode(' ', [
+            escapeshellarg(PHP_BINARY), // running interpreter
+            escapeshellarg(rtrim($this->projectDir, '/') . '/bin/console'),
+            'mto:agent:tags:job:run',
+            escapeshellarg($jobId),
+            '> /dev/null 2>&1 &',
+        ]);
+
+        $this->agentLogger->info('[tags] enqueue job async', [
+            'job' => $jobId,
+            'cmd' => $commandLine,
+        ]);
+
+        @exec($commandLine);
+    }
+}
\ No newline at end of file
diff --git a/src/Tag/TagNdjsonExporter.php b/src/Tag/TagNdjsonExporter.php
new file mode 100644
index 0000000..eeb5c93
--- /dev/null
+++ b/src/Tag/TagNdjsonExporter.php
@@ -0,0 +1,159 @@
+tagsNdjsonPath);
+ if (!\is_dir($dir)) {
+ @\mkdir($dir, 0775, true);
+ }
+
+ $tmpPath = $this->tagsNdjsonPath . '.tmp';
+
+ $fh = @\fopen($tmpPath, 'wb');
+ if (!$fh) {
+ throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
+ }
+
+ // ------------------------------------------------------------------
+ // Fetch tags (small) + join document ids (can be bigger) efficiently.
+ // We avoid repositories and keep it DB-agnostic via DQL/QB.
+ // ------------------------------------------------------------------
+
+ // 1) Load all tags (id, slug, label, description)
+ $tags = $this->em->createQueryBuilder()
+ ->select('t')
+ ->from(Tag::class, 't')
+ ->orderBy('t.label', 'ASC')
+ ->getQuery()
+ ->getResult();
+
+ if (!\is_array($tags) || $tags === []) {
+ \fclose($fh);
+
+ // Write empty file atomically
+ $this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
+
+ return [
+ 'tags' => 0,
+ 'lines' => 0,
+ 'bytes' => (int) @\filesize($this->tagsNdjsonPath),
+ 'path' => $this->tagsNdjsonPath,
+ ];
+ }
+
+ // 2) Build tagId => docIds map from document_tag
+ // We query pairs (tag_id, document_id) in one go.
+ $rows = $this->em->createQueryBuilder()
+ ->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
+ ->from(DocumentTag::class, 'dt')
+ ->getQuery()
+ ->getArrayResult();
+
+ $tagToDocs = [];
+ foreach ($rows as $r) {
+ $tagId = (string) ($r['tagId'] ?? '');
+ $docId = (string) ($r['docId'] ?? '');
+ if ($tagId === '' || $docId === '') {
+ continue;
+ }
+ $tagToDocs[$tagId][] = $docId;
+ }
+
+ // 3) Stream NDJSON lines
+ $lines = 0;
+
+ foreach ($tags as $tag) {
+ if (!$tag instanceof Tag) {
+ continue;
+ }
+
+ $tagId = (string) $tag->getId();
+ $docIds = $tagToDocs[$tagId] ?? [];
+
+ // de-dupe docIds for safety
+ if ($docIds !== []) {
+ $docIds = \array_values(\array_unique($docIds));
+ }
+
+ // "text" is the embedding source for tag vectors later:
+ // Keep it short but semantically useful.
+ $textParts = [
+ $tag->getLabel(),
+ $tag->getSlug(),
+ ];
+
+ $desc = $tag->getDescription();
+ if (\is_string($desc) && \trim($desc) !== '') {
+ $textParts[] = \trim($desc);
+ }
+
+ $line = [
+ 'tag_id' => $tagId,
+ 'text' => \implode("\n", $textParts),
+ 'document_ids' => $docIds,
+ ];
+
+ $json = \json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
+ if (!\is_string($json)) {
+ // skip invalid line but keep export running
+ continue;
+ }
+
+ \fwrite($fh, $json . "\n");
+ $lines++;
+ }
+
+ \fclose($fh);
+
+ $this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
+
+ return [
+ 'tags' => \count($tags),
+ 'lines' => $lines,
+ 'bytes' => (int) @\filesize($this->tagsNdjsonPath),
+ 'path' => $this->tagsNdjsonPath,
+ ];
+ }
+
+    /**
+     * Moves the temporary file over the final path.
+     *
+     * Tries rename() first and falls back to copy() when that fails. The
+     * temporary file is removed in both fallback outcomes. File mode 0664 is
+     * applied to the final path before (if it exists) and after the replace,
+     * both best effort.
+     *
+     * @throws \RuntimeException when neither rename nor copy succeeds
+     */
+    private function atomicReplace(string $tmpPath, string $finalPath): void
+    {
+        // Ensure old file can be replaced on Windows-like FS too (best effort)
+        if (\is_file($finalPath)) {
+            @\chmod($finalPath, 0664);
+        }
+
+        $renamed = @\rename($tmpPath, $finalPath);
+
+        if (!$renamed) {
+            // Fallback: copy the content, then drop the temp file either way.
+            $copied = @\copy($tmpPath, $finalPath);
+            @\unlink($tmpPath);
+
+            if (!$copied) {
+                throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);
+            }
+        }
+
+        @\chmod($finalPath, 0664);
+    }
+}
\ No newline at end of file
diff --git a/src/Tag/TagRoutingService.php b/src/Tag/TagRoutingService.php
new file mode 100644
index 0000000..a15372c
--- /dev/null
+++ b/src/Tag/TagRoutingService.php
@@ -0,0 +1,98 @@
+tagSearch->search($query, self::DEFAULT_TOPK);
+ if (!is_array($hits) || $hits === []) {
+ return null;
+ }
+
+ $bestScore = (float)($hits[0]['score'] ?? 0.0);
+ if ($bestScore < self::MIN_BEST_SCORE) {
+ return null;
+ }
+
+ // Convert tag UUID strings to binary(16)
+ $tagBinaryIds = [];
+
+ foreach ($hits as $hit) {
+ $id = (string)($hit['tag_id'] ?? '');
+ if ($id === '') {
+ continue;
+ }
+
+ try {
+ $tagBinaryIds[] = Uuid::fromString($id)->toBinary();
+ } catch (\Throwable) {
+ continue;
+ }
+ }
+
+ if ($tagBinaryIds === []) {
+ return null;
+ }
+
+ // Direct DBAL query (binary-safe)
+ $conn = $this->em->getConnection();
+
+ $rows = $conn->executeQuery(
+ 'SELECT document_id
+ FROM document_tag
+ WHERE tag_id IN (:tagIds)',
+ ['tagIds' => $tagBinaryIds],
+ ['tagIds' => ArrayParameterType::BINARY]
+ )->fetchAllAssociative();
+
+ if ($rows === []) {
+ return null;
+ }
+
+ $docIds = [];
+
+ foreach ($rows as $row) {
+ if (!isset($row['document_id'])) {
+ continue;
+ }
+
+ try {
+ $uuid = Uuid::fromBinary($row['document_id']);
+ $docIds[(string)$uuid] = true;
+ } catch (\Throwable) {
+ continue;
+ }
+
+ if (count($docIds) >= self::MAX_CANDIDATE_DOCS) {
+ break;
+ }
+ }
+
+ return array_keys($docIds);
+ }
+}
\ No newline at end of file
diff --git a/src/Tag/TagVectorIndexBuilder.php b/src/Tag/TagVectorIndexBuilder.php
new file mode 100644
index 0000000..03f794f
--- /dev/null
+++ b/src/Tag/TagVectorIndexBuilder.php
@@ -0,0 +1,107 @@
+tagsNdjsonPath)) {
+ throw new \RuntimeException('tags.ndjson missing: ' . $this->tagsNdjsonPath);
+ }
+
+ if (!is_file($this->scriptPath)) {
+ throw new \RuntimeException('Tag ingest script missing: ' . $this->scriptPath);
+ }
+
+ $tmpIndex = $this->vectorTagsIndexPath . '.tmp';
+ $tmpMeta = $tmpIndex . '.meta.json';
+
+ $finalIndex = $this->vectorTagsIndexPath;
+ $finalMeta = $finalIndex . '.meta.json';
+
+ // Ensure output dir exists
+ $dir = \dirname($finalIndex);
+ if (!\is_dir($dir)) {
+ @\mkdir($dir, 0775, true);
+ }
+
+ // Clean tmp leftovers
+ @\unlink($tmpIndex);
+ @\unlink($tmpMeta);
+
+ // Positional args:
+ // python vector_ingest_tags.py
+ $cmd = sprintf(
+ '%s %s %s %s %s 2>&1',
+ escapeshellarg($this->pythonBin),
+ escapeshellarg($this->scriptPath),
+ escapeshellarg($this->tagsNdjsonPath),
+ escapeshellarg($tmpIndex),
+ escapeshellarg($this->embeddingModel),
+ );
+
+ $this->agentLogger->info('[tags] build tag vector index', [
+ 'cmd' => $cmd,
+ 'timeout' => $this->timeoutSeconds,
+ ]);
+
+ $out = [];
+ $exit = 0;
+
+ exec($cmd, $out, $exit);
+
+ if ($exit !== 0) {
+ $this->agentLogger->error('[tags] tag vector ingest failed', [
+ 'exit' => $exit,
+ 'out' => $out,
+ ]);
+ throw new \RuntimeException('Tag vector ingest failed (exit=' . $exit . ')');
+ }
+
+ // If no tags -> python may remove outputs and exit 0
+ if (!is_file($tmpIndex) || !is_file($tmpMeta)) {
+ // treat as "no index" rather than hard error
+ @\unlink($tmpIndex);
+ @\unlink($tmpMeta);
+ $this->agentLogger->warning('[tags] no tag index produced (maybe 0 tags).');
+ return;
+ }
+
+ // Atomic switch
+ $this->atomicReplace($tmpIndex, $finalIndex);
+ $this->atomicReplace($tmpMeta, $finalMeta);
+
+ $this->agentLogger->info('[tags] tag vector index build completed', [
+ 'index' => $finalIndex,
+ 'meta' => $finalMeta,
+ ]);
+ }
+
+ /**
+  * Puts the temporary file in place of the final one.
+  *
+  * Uses rename() when possible and falls back to copy + unlink otherwise.
+  * The resulting file is chmod'ed to 0664 on a best-effort basis.
+  *
+  * @throws \RuntimeException when neither rename nor copy succeeds
+  */
+ private function atomicReplace(string $tmp, string $final): void
+ {
+     if (@rename($tmp, $final)) {
+         @chmod($final, 0664);
+
+         return;
+     }
+
+     // rename() failed: copy instead; the temp file is removed either way.
+     $copied = @copy($tmp, $final);
+     @unlink($tmp);
+
+     if (!$copied) {
+         throw new \RuntimeException('Atomic replace failed for: ' . $final);
+     }
+
+     @chmod($final, 0664);
+ }
+}
\ No newline at end of file
diff --git a/src/Tag/TagVectorSearchClient.php b/src/Tag/TagVectorSearchClient.php
new file mode 100644
index 0000000..a3b7bfa
--- /dev/null
+++ b/src/Tag/TagVectorSearchClient.php
@@ -0,0 +1,88 @@
+
+ /**
+  * Runs the tag vector search script for a query and returns the scored tag hits.
+  *
+  * An empty list means "no routing". It is returned when the query is blank,
+  * the script or the tag index files are missing, the script exits non-zero,
+  * or no JSON array can be parsed from its output.
+  *
+  * @param string $query free-text query, passed to the script as an escaped shell argument
+  * @param int    $limit maximum number of hits to request; clamped to 1..50
+  *
+  * @return list<array{tag_id: string, score: float}> hits in the order the script emitted them
+  */
+ public function search(string $query, int $limit = 8): array
+ {
+     if (trim($query) === '') {
+         // Nothing to embed; do not spawn Python for a blank query.
+         return [];
+     }
+
+     if (!is_file($this->scriptPath)) {
+         $this->agentLogger->warning('Tag vector search script missing: ' . $this->scriptPath);
+         return [];
+     }
+
+     if (!is_file($this->vectorTagsIndexPath) || !is_file($this->vectorTagsMetaPath)) {
+         // no tag index available yet => no routing
+         return [];
+     }
+
+     $limit = max(1, min($limit, 50));
+
+     // Positional args, aligned with existing VectorSearchClient approach:
+     // python vector_search_tags.py QUERY LIMIT INDEX_PATH META_PATH MODEL
+     $cmd = sprintf(
+         '%s %s %s %d %s %s %s 2>&1',
+         escapeshellarg($this->pythonBin),
+         escapeshellarg($this->scriptPath),
+         escapeshellarg($query),
+         $limit,
+         escapeshellarg($this->vectorTagsIndexPath),
+         escapeshellarg($this->vectorTagsMetaPath),
+         escapeshellarg($this->embeddingModel),
+     );
+
+     // exec() appends to an existing array, so start from a known-empty state.
+     $out = [];
+     $exitCode = 0;
+
+     exec($cmd, $out, $exitCode);
+
+     if ($exitCode !== 0 || $out === []) {
+         $this->agentLogger->warning('Tag vector search failed (exit=' . $exitCode . ')');
+         return [];
+     }
+
+     // stderr is merged into stdout (2>&1), so library warnings may precede the
+     // JSON payload. Try the complete output first, then the last non-empty line.
+     $candidates = [implode("\n", $out)];
+     for ($i = count($out) - 1; $i >= 0; $i--) {
+         $lastLine = trim($out[$i]);
+         if ($lastLine !== '') {
+             $candidates[] = $lastLine;
+             break;
+         }
+     }
+
+     $data = null;
+     foreach ($candidates as $candidate) {
+         try {
+             $decoded = json_decode($candidate, true, 512, JSON_THROW_ON_ERROR);
+         } catch (\JsonException) {
+             continue;
+         }
+
+         if (is_array($decoded)) {
+             $data = $decoded;
+             break;
+         }
+     }
+
+     if ($data === null) {
+         $this->agentLogger->warning('Tag vector search returned unparsable output');
+         return [];
+     }
+
+     $hits = [];
+     foreach ($data as $row) {
+         if (!is_array($row)) {
+             continue;
+         }
+         $tagId = (string)($row['tag_id'] ?? '');
+         $score = $row['score'] ?? null;
+
+         // Drop rows without a usable id or numeric score instead of failing the whole search.
+         if ($tagId === '' || !is_numeric($score)) {
+             continue;
+         }
+
+         $hits[] = [
+             'tag_id' => $tagId,
+             'score' => (float)$score,
+         ];
+     }
+
+     return $hits;
+ }
+}
\ No newline at end of file
diff --git a/src/Vector/vector_ingest_tags.py b/src/Vector/vector_ingest_tags.py
new file mode 100644
index 0000000..41a4f4c
--- /dev/null
+++ b/src/Vector/vector_ingest_tags.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+from pathlib import Path
+
+# ---------------------------------------------------------
+# Positional args (aligned with PHP builder exec call)
+# ---------------------------------------------------------
+# 1 tags.ndjson
+# 2 out_index_path (can be .tmp)
+# 3 model
+# Example:
+# python vector_ingest_tags.py /var/knowledge/tags.ndjson /var/knowledge/vector_tags.index.tmp all-MiniLM-L6-v2
+# ---------------------------------------------------------
+
+if len(sys.argv) < 4:
+ print("ERROR: usage: vector_ingest_tags.py ", file=sys.stderr)
+ sys.exit(2)
+
+tags_path = Path(sys.argv[1]).resolve()
+out_path = Path(sys.argv[2]).resolve()
+model_name = sys.argv[3]
+
+meta_path = Path(str(out_path) + ".meta.json") # vector_tags.index(.tmp).meta.json
+
+# ---------------------------------------------------------
+# Dependency checks
+# ---------------------------------------------------------
+try:
+ import faiss
+except Exception:
+ print("ERROR: Python module 'faiss' not found.", file=sys.stderr)
+ sys.exit(10)
+
+try:
+ from sentence_transformers import SentenceTransformer
+except Exception:
+ print("ERROR: Python module 'sentence-transformers' not found.", file=sys.stderr)
+ sys.exit(11)
+
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+
+# ---------------------------------------------------------
+# File checks
+# ---------------------------------------------------------
+if not tags_path.is_file():
+ print(f"ERROR: tags.ndjson not found at {tags_path}", file=sys.stderr)
+ sys.exit(20)
+
+# Ensure output directory exists
+out_path.parent.mkdir(parents=True, exist_ok=True)
+
+# ---------------------------------------------------------
+# Load model
+# ---------------------------------------------------------
+model = SentenceTransformer(model_name)
+
+# ---------------------------------------------------------
+# Streaming read NDJSON
+# ---------------------------------------------------------
+texts = []
+ids = []
+
+with open(tags_path, "r", encoding="utf-8") as f:
+ for line in f:
+ line = line.strip()
+ if not line:
+ continue
+
+ try:
+ entry = json.loads(line)
+ except Exception:
+ continue
+
+ text = entry.get("text")
+ tag_id = entry.get("tag_id")
+
+ if not text or not tag_id:
+ continue
+
+ text = str(text)
+ if len(text) > 4000:
+ text = text[:4000]
+
+ texts.append(text)
+ ids.append(str(tag_id))
+
+# If empty: remove outputs (tmp) and exit success
+if not texts:
+ if out_path.exists():
+ out_path.unlink()
+ if meta_path.exists():
+ meta_path.unlink()
+ sys.exit(0)
+
+# ---------------------------------------------------------
+# Build embeddings
+# ---------------------------------------------------------
+embeddings = model.encode(
+ texts,
+ normalize_embeddings=True,
+ show_progress_bar=False,
+ batch_size=64
+)
+
+embeddings = np.array(embeddings).astype("float32")
+dim = embeddings.shape[1]
+
+# ---------------------------------------------------------
+# Build FAISS index
+# ---------------------------------------------------------
+index = faiss.IndexFlatIP(dim)
+index.add(embeddings)
+
+faiss.write_index(index, str(out_path))
+
+# ---------------------------------------------------------
+# Write ID mapping meta
+# ---------------------------------------------------------
+with open(meta_path, "w", encoding="utf-8") as f:
+ json.dump(ids, f)
+
+sys.exit(0)
\ No newline at end of file
diff --git a/src/Vector/vector_search_tags.py b/src/Vector/vector_search_tags.py
new file mode 100644
index 0000000..e9e41ea
--- /dev/null
+++ b/src/Vector/vector_search_tags.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+from pathlib import Path
+
+# ---------------------------------------------------------
+# Positional args (aligned with PHP client exec call)
+# ---------------------------------------------------------
+# 1 query
+# 2 limit
+# 3 index_path
+# 4 meta_path
+# 5 model
+#
+# Example:
+# python vector_search_tags.py "foo" 8 /path/vector_tags.index /path/vector_tags.index.meta.json all-MiniLM-L6-v2
+# ---------------------------------------------------------
+
+if len(sys.argv) < 6:
+ print("[]")
+ sys.exit(0)
+
+query = sys.argv[1]
+
+try:
+ limit = int(sys.argv[2])
+except Exception:
+ limit = 5
+
+index_path = Path(sys.argv[3]).resolve()
+meta_path = Path(sys.argv[4]).resolve()
+model_name = sys.argv[5]
+
+# ---------------------------------------------------------
+# Dependency checks
+# ---------------------------------------------------------
+try:
+ import faiss
+except Exception:
+ # keep stdout clean for caller
+ print("[]")
+ sys.exit(0)
+
+try:
+ from sentence_transformers import SentenceTransformer
+except Exception:
+ print("[]")
+ sys.exit(0)
+
+from sentence_transformers import SentenceTransformer
+
+# ---------------------------------------------------------
+# File checks
+# ---------------------------------------------------------
+if limit <= 0:
+ print("[]")
+ sys.exit(0)
+
+if not index_path.is_file() or not meta_path.is_file():
+ # No tag index available => no routing
+ print("[]")
+ sys.exit(0)
+
+# ---------------------------------------------------------
+# Load model
+# ---------------------------------------------------------
+model = SentenceTransformer(model_name)
+
+# ---------------------------------------------------------
+# Load index + meta
+# ---------------------------------------------------------
+index = faiss.read_index(str(index_path))
+
+try:
+ with open(meta_path, "r", encoding="utf-8") as f:
+ ids = json.load(f)
+except Exception:
+ print("[]")
+ sys.exit(0)
+
+if not isinstance(ids, list) or len(ids) == 0:
+ print("[]")
+ sys.exit(0)
+
+# ---------------------------------------------------------
+# Embed & search
+# ---------------------------------------------------------
+qvec = model.encode([query], normalize_embeddings=True)
+
+scores, idxs = index.search(qvec, limit)
+
+out = []
+for score, idx in zip(scores[0], idxs[0]):
+ if idx is None or idx < 0 or idx >= len(ids):
+ continue
+ out.append({
+ "tag_id": str(ids[idx]),
+ "score": float(score),
+ })
+
+print(json.dumps(out))
+sys.exit(0)
\ No newline at end of file
diff --git a/templates/admin/base.html.twig b/templates/admin/base.html.twig
index f32b07a..072d9f5 100644
--- a/templates/admin/base.html.twig
+++ b/templates/admin/base.html.twig
@@ -67,6 +67,14 @@
Dokumente
+ {# ------------------------- #}
+ {# Tags (Document Routing) #}
+ {# ------------------------- #}
+
+ Tags
+
+
Ingest Jobs
@@ -119,4 +127,4 @@