first commit

This commit is contained in:
team 1
2026-04-20 16:36:28 +02:00
parent a0ec07a99c
commit 2587ac8b4b
41 changed files with 5126 additions and 2280 deletions

View File

@@ -4,77 +4,84 @@ declare(strict_types=1);
namespace App\Catalog;
use App\Config\CatalogIntentConfig;
use App\Entity\Document;
use App\Tag\TagTypes;
use App\Tag\TagVectorSearchClient;
use Doctrine\DBAL\Connection;
use Symfony\Component\Uid\Uuid;
/**
* EntityCatalogService
* Builds deterministic catalog lists from a validated catalog entity term.
*
* Deterministische Katalog-Listen auf Basis eines Entity-Terms:
* - TagVectorSearch (Score-Gate + Ambiguity-Check)
* - DB Query auf document_tag + document (ACTIVE)
* - Rückgabe als EIN Textblock (string) oder null (Fallback auf normalen Retrieval)
*
* Schritt-3 Änderung:
* - Headline ist NICHT mehr hardcoded
* - Headline basiert dynamisch auf dem gefundenen Tag
* This service is intentionally conservative:
* - only strong catalog_entity matches may open the catalog path
* - ambiguous matches fall back to normal retrieval
* - only ACTIVE documents are listed
*/
final class EntityCatalogService
{
private const MIN_SCORE = 0.55;
private const AMBIGUITY_DELTA = 0.05;
private const SEARCH_LIMIT = 3;
public function __construct(
private readonly TagVectorSearchClient $tagVectorClient,
private readonly Connection $connection,
) {}
private readonly Connection $connection,
) {
}
/**
* @return string|null Textblock oder null (wenn kein sicherer Catalog möglich ist)
* Returns a catalog text block or null when no safe catalog path exists.
*/
public function listByTerm(string $entityTerm): ?string
{
$entityTerm = trim($entityTerm);
if ($entityTerm === '') {
return null;
}
// 1) Tag-Vektorsuche (Top 3 für Ambiguity-Prüfung)
$hits = $this->tagVectorClient->search($entityTerm, 3);
$hits = $this->tagVectorClient->search($entityTerm, self::SEARCH_LIMIT);
if ($hits === []) {
return null;
}
$best = $hits[0];
$bestScore = (float) ($best['score'] ?? 0.0);
$bestScore = isset($best['score']) ? (float)$best['score'] : 0.0;
if ($bestScore < self::MIN_SCORE) {
if ($bestScore < CatalogIntentConfig::MIN_SCORE) {
return null;
}
if (($best['tag_type'] ?? null) !== TagTypes::CATALOG_ENTITY) {
return null;
}
// 2) Ambiguity: wenn Top2 zu nah ist → konservativ abbrechen
if (isset($hits[1])) {
$secondScore = isset($hits[1]['score']) ? (float)$hits[1]['score'] : 0.0;
if (abs($bestScore - $secondScore) < self::AMBIGUITY_DELTA) {
$secondScore = (float) ($hits[1]['score'] ?? 0.0);
if (abs($bestScore - $secondScore) < CatalogIntentConfig::AMBIGUITY_DELTA) {
return null;
}
}
$tagHex = (string)($best['tag_id'] ?? '');
if ($tagHex === '') {
$tagId = trim((string) ($best['tag_id'] ?? ''));
if ($tagId === '') {
return null;
}
// OPTIONAL: Falls TagVectorSearchClient künftig tag_label zurückliefert,
// kann das hier direkt verwendet werden.
$tagLabel = isset($best['tag_label']) ? (string)$best['tag_label'] : null;
try {
$tagBinaryId = Uuid::fromString($tagId)->toBinary();
} catch (\Throwable) {
return null;
}
$tagLabel = trim((string) ($best['label'] ?? ''));
// 3) DB Query: alle ACTIVE Dokumente zu diesem Tag
$rows = $this->connection->fetchAllAssociative(
'
SELECT d.title
SELECT DISTINCT d.title
FROM document d
INNER JOIN document_tag dt ON dt.document_id = d.id
WHERE dt.tag_id = :tagId
@@ -82,8 +89,8 @@ final class EntityCatalogService
ORDER BY d.title ASC
',
[
'tagId' => Uuid::fromString($tagHex)->toBinary(),
'status' => 'ACTIVE',
'tagId' => $tagBinaryId,
'status' => Document::STATUS_ACTIVE,
]
);
@@ -92,37 +99,42 @@ final class EntityCatalogService
}
$titles = [];
foreach ($rows as $row) {
$t = trim((string)($row['title'] ?? ''));
if ($t !== '') {
$titles[] = $t;
$title = trim((string) ($row['title'] ?? ''));
if ($title === '') {
continue;
}
$titles[$title] = $title;
}
if ($titles === []) {
return null;
}
return $this->buildTextBlock($tagLabel, $titles);
return $this->buildTextBlock(
$tagLabel !== '' ? $tagLabel : null,
array_values($titles)
);
}
/**
* Dynamische Headline:
* - Wenn Tag-Label vorhanden → verwenden
* - Sonst generischer Fallback
* Builds a stable human-readable list block for the prompt.
*
* @param list<string> $titles
*/
private function buildTextBlock(?string $tagLabel, array $titles): string
{
$headline = 'Folgende Einträge sind verfügbar:';
if (\is_string($tagLabel) && \trim($tagLabel) !== '') {
$headline = sprintf(
'Folgende %s sind verfügbar:',
$tagLabel
);
if ($tagLabel !== null && trim($tagLabel) !== '') {
$headline = sprintf('Folgende %s sind verfügbar:', trim($tagLabel));
}
$lines = [];
foreach ($titles as $title) {
$lines[] = '- ' . $title;
}

View File

@@ -1,6 +1,5 @@
<?php
declare(strict_types=1);
namespace App\Command;
@@ -28,16 +27,15 @@ use Symfony\Component\Process\Process;
final class SystemRebuildCommand extends Command
{
public function __construct(
private readonly IngestJobService $jobService,
private readonly IngestOrchestrator $orchestrator,
private readonly TagNdjsonExporter $tagExporter,
private readonly TagVectorIndexBuilder $tagIndexBuilder,
private readonly IndexMetaManager $metaManager,
private readonly VectorIndexHealthService $health,
private readonly IngestJobService $jobService,
private readonly IngestOrchestrator $orchestrator,
private readonly TagNdjsonExporter $tagExporter,
private readonly TagVectorIndexBuilder $tagIndexBuilder,
private readonly IndexMetaManager $metaManager,
private readonly VectorIndexHealthService $health,
private readonly TagVectorIndexHealthService $tagHealth,
private readonly string $projectDir,
)
{
private readonly string $projectDir,
) {
parent::__construct();
}
@@ -58,16 +56,37 @@ final class SystemRebuildCommand extends Command
if (!$input->getOption('hard')) {
$io->error('Safety switch missing: you must pass --hard to run this command.');
$io->writeln('Example: bin/console mto:agent:system:rebuild --hard');
return Command::FAILURE;
}
$dryRun = (bool)$input->getOption('dry-run');
$dryRun = (bool) $input->getOption('dry-run');
$io->title('mto:agent:system:rebuild --hard');
// ---------------------------------------------------------
// 1) GLOBAL REINDEX (chunks rewrite + vector rebuild)
// ---------------------------------------------------------
if (!$this->runGlobalReindex($io, $dryRun)) {
return Command::FAILURE;
}
if (!$this->runTagRebuild($io, $input, $dryRun)) {
return Command::FAILURE;
}
if (!$this->runVectorServiceReload($io, $input, $dryRun)) {
return Command::FAILURE;
}
if (!$this->runHealthChecks($io, $input)) {
return Command::FAILURE;
}
$io->success('System rebuild finished.');
return Command::SUCCESS;
}
private function runGlobalReindex(SymfonyStyle $io, bool $dryRun): bool
{
$io->section('1/4 Global reindex (chunks + vector index)');
$job = $this->jobService->startJob(
@@ -82,141 +101,181 @@ final class SystemRebuildCommand extends Command
try {
$this->orchestrator->runExistingJob($job, $dryRun);
$io->success('Global reindex completed.');
return true;
} catch (\Throwable $e) {
$io->error('Global reindex failed: ' . $e->getMessage());
return Command::FAILURE;
return false;
}
}
// ---------------------------------------------------------
// 2) TAG REBUILD (tags.ndjson + vector_tags.index)
// ---------------------------------------------------------
if (!$input->getOption('no-tags')) {
$io->section('2/4 Tag rebuild (tags.ndjson + vector_tags.index)');
if ($dryRun) {
$io->note('dry-run enabled: tag rebuild skipped (would export + build tag index).');
} else {
try {
$export = $this->tagExporter->export();
$io->writeln('<info>Exported tags.ndjson</info>');
$io->writeln('Path: ' . $export['path']);
$io->writeln('Tags: ' . $export['tags']);
$io->writeln('Lines: ' . $export['lines']);
$io->writeln('Bytes: ' . $export['bytes']);
$this->tagIndexBuilder->build();
$io->writeln('<info>Built vector_tags.index</info>');
$this->metaManager->touchRuntime([
'last_tags_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
]);
$io->success('Tag rebuild completed.');
} catch (\Throwable $e) {
$io->error('Tag rebuild failed: ' . $e->getMessage());
return Command::FAILURE;
}
}
} else {
private function runTagRebuild(SymfonyStyle $io, InputInterface $input, bool $dryRun): bool
{
if ((bool) $input->getOption('no-tags')) {
$io->section('2/4 Tag rebuild');
$io->note('Skipped due to --no-tags.');
return true;
}
// ---------------------------------------------------------
// 3) VECTOR SERVICE (install deps + start + reload)
// ---------------------------------------------------------
if (!$input->getOption('no-reload')) {
$io->section('3/4 Vector service reload (uvicorn)');
$io->section('2/4 Tag rebuild (tags.ndjson + vector_tags.index)');
if ($dryRun) {
$io->note('dry-run enabled: service reload skipped.');
} else {
$cmd = [
'.venv/bin/python',
'python/vector/vector_control.py',
'--install',
'--start',
'--reload',
'--port', '8090',
'--host', '0.0.0.0'
];
if ($dryRun) {
$io->note('dry-run enabled: tag rebuild skipped (would export + build tag index).');
$process = new Process($cmd, $this->projectDir);
$process->setTimeout(600);
$process->run();
return true;
}
$out = trim($process->getOutput());
$err = trim($process->getErrorOutput());
try {
$export = $this->tagExporter->export();
if ($out !== '') {
$io->writeln($out);
}
if ($err !== '') {
$io->writeln('<comment>' . $err . '</comment>');
}
$io->writeln('<info>Exported tags.ndjson</info>');
$io->writeln('Path: ' . (string) $export['path']);
$io->writeln('Tags: ' . (string) $export['tags']);
$io->writeln('Lines: ' . (string) $export['lines']);
$io->writeln('Bytes: ' . (string) $export['bytes']);
if (!$process->isSuccessful()) {
$io->error('Vector service reload failed (non-zero exit code).');
return Command::FAILURE;
}
$this->tagIndexBuilder->build();
$io->success('Vector service reloaded.');
}
} else {
$io->success('Tag rebuild completed.');
return true;
} catch (\Throwable $e) {
$io->error('Tag rebuild failed: ' . $e->getMessage());
return false;
}
}
private function runVectorServiceReload(SymfonyStyle $io, InputInterface $input, bool $dryRun): bool
{
if ((bool) $input->getOption('no-reload')) {
$io->section('3/4 Vector service reload');
$io->note('Skipped due to --no-reload.');
return true;
}
// ---------------------------------------------------------
// 4) HEALTH CHECK (NDJSON vs vector meta)
// ---------------------------------------------------------
if (!$input->getOption('no-health')) {
$io->section('4/4 Health check');
$io->section('3/4 Vector service reload (uvicorn)');
try {
$report = $this->health->check();
} catch (\Throwable $e) {
$io->error('Health check failed: ' . $e->getMessage());
return Command::FAILURE;
}
if ($dryRun) {
$io->note('dry-run enabled: service reload skipped.');
try {
$reportTag = $this->tagHealth->check();
} catch (\Throwable $e) {
$io->error('Tag health check failed: ' . $e->getMessage());
return Command::FAILURE;
}
return true;
}
$io->definitionList(
['ndjson_exists' => $report['ndjson_exists'] ? 'yes' : 'no'],
['ndjson_chunk_count' => (string)$report['ndjson_chunk_count']],
['vector_exists' => $report['vector_exists'] ? 'yes' : 'no'],
['meta_exists' => $report['meta_exists'] ? 'yes' : 'no'],
['vector_chunk_count' => (string)$report['vector_chunk_count']],
['status' => (string)$report['status']],
);
$cmd = [
'.venv/bin/python',
'python/vector/vector_control.py',
'--install',
'--start',
'--reload',
'--port', '8090',
'--host', '0.0.0.0',
];
$io->definitionList(
['tags_ndjson_exists' => $reportTag['tags_ndjson_exists'] ? 'yes' : 'no'],
['tags_ndjson_count' => (string)$reportTag['tags_ndjson_count']],
['tag_vector_exists' => $reportTag['vector_exists'] ? 'yes' : 'no'],
['tag_meta_exists' => $reportTag['meta_exists'] ? 'yes' : 'no'],
['vector_tag_count' => (string)$reportTag['vector_tag_count']],
['status' => (string)$reportTag['status']],
);
$process = new Process($cmd, $this->projectDir);
$process->setTimeout(600);
$process->run();
if (!in_array($report['status'], ['OK', 'OK_EMPTY'], true)) {
$io->error('Health check not OK: ' . $report['status']);
return Command::FAILURE;
}
$stdout = trim($process->getOutput());
$stderr = trim($process->getErrorOutput());
$io->success('Health check OK.');
} else {
if ($stdout !== '') {
$io->writeln($stdout);
}
if ($stderr !== '') {
$io->writeln('<comment>' . $stderr . '</comment>');
}
if (!$process->isSuccessful()) {
$io->error('Vector service reload failed (non-zero exit code).');
return false;
}
$io->success('Vector service reloaded.');
return true;
}
private function runHealthChecks(SymfonyStyle $io, InputInterface $input): bool
{
if ((bool) $input->getOption('no-health')) {
$io->section('4/4 Health check');
$io->note('Skipped due to --no-health.');
return true;
}
$io->success('System rebuild finished.');
return Command::SUCCESS;
$io->section('4/4 Health check');
try {
$chunkReport = $this->health->check();
} catch (\Throwable $e) {
$io->error('Health check failed: ' . $e->getMessage());
return false;
}
try {
$tagReport = $this->tagHealth->check();
} catch (\Throwable $e) {
$io->error('Tag health check failed: ' . $e->getMessage());
return false;
}
$this->renderChunkHealth($io, $chunkReport);
$this->renderTagHealth($io, $tagReport);
if (!$this->isHealthOk((string) ($chunkReport['status'] ?? 'UNKNOWN'))) {
$io->error('Chunk health check not OK: ' . (string) ($chunkReport['status'] ?? 'UNKNOWN'));
return false;
}
if (!$this->isHealthOk((string) ($tagReport['status'] ?? 'UNKNOWN'))) {
$io->error('Tag health check not OK: ' . (string) ($tagReport['status'] ?? 'UNKNOWN'));
return false;
}
$io->success('Health check OK.');
return true;
}
/**
 * Prints the chunk index health report as a definition list.
 *
 * Every field is read defensively: boolean-like fields render as "yes"/"no",
 * counters fall back to 0 and a missing status is shown as "UNKNOWN".
 *
 * @param array<string, mixed> $report
 */
private function renderChunkHealth(SymfonyStyle $io, array $report): void
{
    // Small formatters keep the row list below readable.
    $yesNo = static fn (string $key): string => empty($report[$key]) ? 'no' : 'yes';
    $counter = static fn (string $key): string => (string) ($report[$key] ?? 0);

    $rows = [
        ['ndjson_exists' => $yesNo('ndjson_exists')],
        ['ndjson_chunk_count' => $counter('ndjson_chunk_count')],
        ['vector_exists' => $yesNo('vector_exists')],
        ['meta_exists' => $yesNo('meta_exists')],
        ['vector_chunk_count' => $counter('vector_chunk_count')],
        ['status' => (string) ($report['status'] ?? 'UNKNOWN')],
    ];

    $io->definitionList(...$rows);
}
/**
 * Prints the tag index health report as a definition list.
 *
 * The displayed labels "tag_vector_exists" and "tag_meta_exists" are fed from
 * the report keys "vector_exists" and "meta_exists"; all other labels match
 * their report key. Missing values render as "no", 0 or "UNKNOWN".
 *
 * @param array<string, mixed> $report
 */
private function renderTagHealth(SymfonyStyle $io, array $report): void
{
    $yesNo = static fn (string $key): string => empty($report[$key]) ? 'no' : 'yes';
    $counter = static fn (string $key): string => (string) ($report[$key] ?? 0);

    $rows = [
        ['tags_ndjson_exists' => $yesNo('tags_ndjson_exists')],
        ['tags_ndjson_count' => $counter('tags_ndjson_count')],
        ['tag_vector_exists' => $yesNo('vector_exists')],
        ['tag_meta_exists' => $yesNo('meta_exists')],
        ['vector_tag_count' => $counter('vector_tag_count')],
        ['tags_with_active_document_ids' => $counter('tags_with_active_document_ids')],
        ['meta_valid' => $yesNo('meta_valid')],
        ['status' => (string) ($report['status'] ?? 'UNKNOWN')],
    ];

    $io->definitionList(...$rows);
}
/**
 * Tells whether a health status string counts as healthy.
 *
 * Only the exact values "OK" and "OK_EMPTY" are accepted; the comparison is
 * strict and case-sensitive.
 */
private function isHealthOk(string $status): bool
{
    return $status === 'OK' || $status === 'OK_EMPTY';
}
}

View File

@@ -8,11 +8,13 @@ use App\Tag\TagVectorIndexHealthService;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
#[AsCommand(
name: 'mto:agent:tag:health',
description: 'Health-Check für TAG/FAISS Konsistenz'
description: 'Health-Check für Tag-/FAISS-Konsistenz'
)]
final class TagHealthCheckCommand extends Command
{
@@ -22,14 +24,87 @@ final class TagHealthCheckCommand extends Command
parent::__construct();
}
/**
 * Registers the value-less "--summary" option (no shortcut).
 */
protected function configure(): void
{
    $description = 'Gibt eine lesbare Zusammenfassung statt JSON aus.';

    $this->addOption('summary', null, InputOption::VALUE_NONE, $description);
}
protected function execute(InputInterface $input, OutputInterface $output): int
{
$result = $this->health->check();
$status = trim((string) ($result['status'] ?? ''));
$output->writeln(json_encode($result, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES));
if ($status === '') {
$status = 'UNKNOWN';
$result['status'] = $status;
$result['error'] = 'Health service returned no status.';
}
return str_starts_with($result['status'], 'OK')
if ((bool) $input->getOption('summary')) {
$this->renderSummary(new SymfonyStyle($input, $output), $result);
} else {
$this->renderJson($output, $result);
}
return $this->isHealthy($status)
? Command::SUCCESS
: Command::FAILURE;
}
}
/**
 * Renders the health result as a titled definition list, followed by a
 * warning block when the result carries a non-empty "error" entry.
 *
 * Missing fields render as "no", 0 or "UNKNOWN".
 *
 * @param array<string, mixed> $result
 */
private function renderSummary(SymfonyStyle $io, array $result): void
{
    $yesNo = static fn (string $key): string => empty($result[$key]) ? 'no' : 'yes';
    $counter = static fn (string $key): string => (string) ($result[$key] ?? 0);

    $io->title('Tag Vector Health');

    $rows = [
        ['status' => (string) ($result['status'] ?? 'UNKNOWN')],
        ['tags_ndjson_exists' => $yesNo('tags_ndjson_exists')],
        ['tags_ndjson_count' => $counter('tags_ndjson_count')],
        ['vector_exists' => $yesNo('vector_exists')],
        ['meta_exists' => $yesNo('meta_exists')],
        ['vector_tag_count' => $counter('vector_tag_count')],
        ['meta_valid' => $yesNo('meta_valid')],
        ['tags_with_active_document_ids' => $counter('tags_with_active_document_ids')],
    ];
    $io->definitionList(...$rows);

    $error = $result['error'] ?? null;
    if (empty($error)) {
        return;
    }

    $io->warning((string) $error);
}
/**
 * Writes the health result as pretty-printed JSON.
 *
 * Encoding is attempted in three steps so that something is always printed:
 * the real result, then a small "json_encode_failed" payload, and finally a
 * hard-coded JSON string if even that cannot be encoded.
 *
 * @param array<string, mixed> $result
 */
private function renderJson(OutputInterface $output, array $result): void
{
    $flags = JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE;

    $encoded = json_encode($result, $flags);

    if ($encoded === false) {
        $encoded = json_encode(
            [
                'status' => 'UNKNOWN',
                'error' => 'json_encode_failed',
            ],
            $flags
        );
    }

    if ($encoded === false) {
        $encoded = '{"status":"UNKNOWN","error":"json_encode_failed"}';
    }

    $output->writeln($encoded);
}
/**
 * Tells whether the reported status counts as healthy.
 *
 * Only the exact strings "OK" and "OK_EMPTY" qualify.
 */
private function isHealthy(string $status): bool
{
    return $status === 'OK' || $status === 'OK_EMPTY';
}
}

View File

@@ -14,6 +14,7 @@ use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
#[AsCommand(
name: 'mto:agent:tags:job:run',
@@ -39,112 +40,152 @@ final class TagRebuildRunJobCommand extends Command
protected function execute(InputInterface $input, OutputInterface $output): int
{
$jobId = $input->getArgument('jobId');
$io = new SymfonyStyle($input, $output);
$jobId = trim((string) $input->getArgument('jobId'));
$create = (bool) $input->getOption('create');
if (!$create && !$jobId) {
$output->writeln('<error>You must provide either a jobId or use --create.</error>');
if (!$create && $jobId === '') {
$io->error('You must provide either a jobId or use --create.');
return Command::FAILURE;
}
if ($create && $jobId) {
$output->writeln('<error>Use either jobId OR --create, not both.</error>');
if ($create && $jobId !== '') {
$io->error('Use either jobId OR --create, not both.');
return Command::FAILURE;
}
if ($create) {
$job = new TagRebuildJob();
$this->em->persist($job);
$this->em->flush();
$jobId = $job->getId();
$output->writeln('<info>Created new TagRebuildJob: ' . $jobId . '</info>');
} else {
/** @var TagRebuildJob|null $job */
$job = $this->em->getRepository(TagRebuildJob::class)->find($jobId);
if (!$job instanceof TagRebuildJob) {
$output->writeln('<error>Job not found.</error>');
return Command::FAILURE;
}
}
$fh = null;
$job = null;
$lockHandle = null;
try {
// ---------------------------------------------------------
// LOCK INITIALIZATION
// ---------------------------------------------------------
$lockDir = \dirname($this->lockFilePath);
$job = $create ? $this->createJob($io) : $this->findJob($jobId);
$lockHandle = $this->acquireLock();
if (!\is_dir($lockDir) && !@\mkdir($lockDir, 0775, true) && !\is_dir($lockDir)) {
throw new \RuntimeException('Cannot create lock directory.');
}
$fh = @\fopen($this->lockFilePath, 'c+');
if (!$fh) {
throw new \RuntimeException('Cannot open lock file: ' . $this->lockFilePath);
}
if (!@\flock($fh, LOCK_EX | LOCK_NB)) {
throw new \RuntimeException('Another tag rebuild is currently running (lock busy).');
}
// ---------------------------------------------------------
// MARK RUNNING
// ---------------------------------------------------------
$job->markRunning();
$this->em->flush();
// ---------------------------------------------------------
// EXPORT TAGS (NDJSON)
// ---------------------------------------------------------
$export = $this->exporter->export();
$this->assertValidExport($export);
if (
!isset($export['path']) ||
!\is_string($export['path']) ||
!\file_exists($export['path'])
) {
throw new \RuntimeException('Export failed: NDJSON file missing.');
}
$io->writeln('<info>tags.ndjson exported</info>');
$io->writeln('Path: ' . (string) $export['path']);
$io->writeln('Tags: ' . (string) ($export['tags'] ?? 0));
$io->writeln('Lines: ' . (string) ($export['lines'] ?? 0));
$io->writeln('Bytes: ' . (string) ($export['bytes'] ?? 0));
if (isset($export['count']) && (int) $export['count'] === 0) {
throw new \RuntimeException('Export produced zero tags.');
}
// ---------------------------------------------------------
// BUILD VECTOR INDEX
// ---------------------------------------------------------
$this->builder->build();
// ---------------------------------------------------------
// MARK COMPLETED
// ---------------------------------------------------------
$job->markCompleted();
$this->em->flush();
$output->writeln('<info>Tag rebuild successful.</info>');
$output->writeln('NDJSON: ' . $export['path']);
$io->success('Tag rebuild successful.');
return Command::SUCCESS;
}
catch (\Throwable $e) {
if (isset($job)) {
$job->markFailed($e->getMessage());
} catch (\Throwable $e) {
if ($job instanceof TagRebuildJob) {
$job->markFailed($this->buildSafeErrorMessage($e));
$this->em->flush();
}
$output->writeln('<error>FAILED: ' . $e->getMessage() . '</error>');
$io->error('FAILED: ' . $e->getMessage());
return Command::FAILURE;
}
finally {
if ($fh) {
@\flock($fh, LOCK_UN);
@\fclose($fh);
}
} finally {
$this->releaseLock($lockHandle);
}
}
/**
 * Persists a fresh TagRebuildJob, flushes it, and announces its id on the console.
 */
private function createJob(SymfonyStyle $io): TagRebuildJob
{
    $newJob = new TagRebuildJob();

    $this->em->persist($newJob);
    $this->em->flush();

    $io->writeln(sprintf('<info>Created new TagRebuildJob: %s</info>', (string) $newJob->getId()));

    return $newJob;
}
/**
 * Looks up an existing TagRebuildJob by id.
 *
 * @throws \RuntimeException when the repository does not return a TagRebuildJob
 */
private function findJob(string $jobId): TagRebuildJob
{
    $candidate = $this->em->getRepository(TagRebuildJob::class)->find($jobId);

    if ($candidate instanceof TagRebuildJob) {
        return $candidate;
    }

    throw new \RuntimeException('Job not found.');
}
/**
 * Opens the lock file and takes an exclusive, non-blocking lock on it.
 *
 * The lock directory is created when it does not exist yet. The returned
 * handle must later be passed to releaseLock().
 *
 * @return resource
 *
 * @throws \RuntimeException when the directory cannot be created, the file
 *                           cannot be opened, or the lock is already held
 */
private function acquireLock()
{
    $directory = \dirname($this->lockFilePath);

    if (!\is_dir($directory)) {
        $created = @\mkdir($directory, 0775, true);

        // Re-check after a failed mkdir: presumably this tolerates the
        // directory having been created by someone else in the meantime.
        if (!$created && !\is_dir($directory)) {
            throw new \RuntimeException('Cannot create lock directory.');
        }
    }

    $stream = @\fopen($this->lockFilePath, 'c+');
    if ($stream === false) {
        throw new \RuntimeException('Cannot open lock file: ' . $this->lockFilePath);
    }

    $locked = @\flock($stream, LOCK_EX | LOCK_NB);
    if (!$locked) {
        // Do not leak the handle when the lock is busy.
        @\fclose($stream);

        throw new \RuntimeException('Another tag rebuild is currently running (lock busy).');
    }

    return $stream;
}
/**
 * Unlocks and closes a lock handle; anything that is not an open resource
 * (for example null) is ignored.
 *
 * @param resource|null $handle
 */
private function releaseLock($handle): void
{
    if (is_resource($handle)) {
        @\flock($handle, LOCK_UN);
        @\fclose($handle);
    }
}
/**
 * Validates the exporter result before the vector index is built.
 *
 * The export must point to an existing regular file, and none of the
 * reported counters ("tags", "lines", "bytes") may be negative. Missing
 * counters are treated as 0.
 *
 * @param array<string, mixed> $export
 *
 * @throws \RuntimeException when the NDJSON file is missing or a counter is negative
 */
private function assertValidExport(array $export): void
{
    $path = trim((string) ($export['path'] ?? ''));

    if ($path === '' || !\is_file($path)) {
        throw new \RuntimeException('Export failed: NDJSON file missing.');
    }

    $tags = (int) ($export['tags'] ?? 0);
    $lines = (int) ($export['lines'] ?? 0);
    // "bytes" is validated as well, matching the other export validators in this change set.
    $bytes = (int) ($export['bytes'] ?? 0);

    if ($tags < 0 || $lines < 0 || $bytes < 0) {
        throw new \RuntimeException('Export returned invalid statistics.');
    }
}
/**
 * Turns an exception into the message stored on a failed job.
 *
 * The message is trimmed and cut to at most 4000 characters; an empty
 * message is replaced by a generic fallback text.
 */
private function buildSafeErrorMessage(\Throwable $e): string
{
    $text = trim($e->getMessage());

    return $text !== ''
        ? mb_substr($text, 0, 4000)
        : 'Unknown tag rebuild failure.';
}
}

View File

@@ -9,6 +9,7 @@ use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
#[AsCommand(
name: 'mto:agent:tags:export',
@@ -17,26 +18,51 @@ use Symfony\Component\Console\Output\OutputInterface;
final class TagsExportCommand extends Command
{
public function __construct(
private TagNdjsonExporter $exporter,
private readonly TagNdjsonExporter $exporter,
) {
parent::__construct();
}
protected function execute(InputInterface $input, OutputInterface $output): int
{
$io = new SymfonyStyle($input, $output);
try {
$result = $this->exporter->export();
$this->assertValidExport($result);
$io->writeln('<info>Tags NDJSON exported</info>');
$io->writeln('Path: ' . (string) ($result['path'] ?? ''));
$io->writeln('Tags: ' . (string) ($result['tags'] ?? 0));
$io->writeln('Lines: ' . (string) ($result['lines'] ?? 0));
$io->writeln('Bytes: ' . (string) ($result['bytes'] ?? 0));
$io->success('Tag export completed.');
return Command::SUCCESS;
} catch (\Throwable $e) {
$output->writeln('<error>ERROR: ' . $e->getMessage() . '</error>');
$io->error($e->getMessage());
return Command::FAILURE;
}
}
$output->writeln('<info>Tags NDJSON exported</info>');
$output->writeln('Path: ' . $result['path']);
$output->writeln('Tags: ' . $result['tags']);
$output->writeln('Lines: ' . $result['lines']);
$output->writeln('Bytes: ' . $result['bytes']);
/**
* @param array<string, mixed> $result
*/
private function assertValidExport(array $result): void
{
$path = trim((string) ($result['path'] ?? ''));
return Command::SUCCESS;
if ($path === '' || !is_file($path)) {
throw new \RuntimeException('Tag export failed: tags.ndjson is missing.');
}
$tags = (int) ($result['tags'] ?? 0);
$lines = (int) ($result['lines'] ?? 0);
$bytes = (int) ($result['bytes'] ?? 0);
if ($tags < 0 || $lines < 0 || $bytes < 0) {
throw new \RuntimeException('Tag export returned invalid statistics.');
}
}
}

View File

@@ -4,13 +4,13 @@ declare(strict_types=1);
namespace App\Command;
use App\Index\IndexMetaManager;
use App\Tag\TagNdjsonExporter;
use App\Tag\TagVectorIndexBuilder;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
#[AsCommand(
name: 'mto:agent:tags:rebuild',
@@ -21,45 +21,54 @@ final class TagsRebuildCommand extends Command
public function __construct(
private readonly TagNdjsonExporter $exporter,
private readonly TagVectorIndexBuilder $builder,
private readonly IndexMetaManager $metaManager,
) {
parent::__construct();
}
protected function execute(InputInterface $input, OutputInterface $output): int
{
$io = new SymfonyStyle($input, $output);
try {
// -----------------------------------------
// 1) Export tags.ndjson
// -----------------------------------------
$export = $this->exporter->export();
$this->assertValidExport($export);
$output->writeln('<info>1/3 Exported tags.ndjson</info>');
$output->writeln('Path: ' . $export['path']);
$output->writeln('Tags: ' . $export['tags']);
$output->writeln('Lines: ' . $export['lines']);
$output->writeln('Bytes: ' . $export['bytes']);
$io->writeln('<info>1/2 Exported tags.ndjson</info>');
$io->writeln('Path: ' . (string) ($export['path'] ?? ''));
$io->writeln('Tags: ' . (string) ($export['tags'] ?? 0));
$io->writeln('Lines: ' . (string) ($export['lines'] ?? 0));
$io->writeln('Bytes: ' . (string) ($export['bytes'] ?? 0));
// -----------------------------------------
// 2) Build FAISS tag index
// -----------------------------------------
$this->builder->build();
$output->writeln('<info>2/3 Built vector_tags.index</info>');
$io->writeln('<info>2/2 Built vector_tags.index</info>');
$io->success('Tag rebuild completed.');
// -----------------------------------------
// 3) Enterprise Commit Marker
// -----------------------------------------
$this->metaManager->touchRuntime([
'last_tags_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
]);
$output->writeln('<info>3/3 Runtime commit marker updated</info>');
return Command::SUCCESS;
} catch (\Throwable $e) {
$output->writeln('<error>ERROR: ' . $e->getMessage() . '</error>');
$io->error($e->getMessage());
return Command::FAILURE;
}
}
return Command::SUCCESS;
/**
 * Checks that the exporter result is usable: the reported path must be an
 * existing regular file and the "tags", "lines" and "bytes" counters must
 * not be negative (missing counters count as 0).
 *
 * @param array<string, mixed> $export
 *
 * @throws \RuntimeException when the file is missing or a counter is negative
 */
private function assertValidExport(array $export): void
{
    $file = trim((string) ($export['path'] ?? ''));

    $fileIsPresent = $file !== '' && is_file($file);
    if (!$fileIsPresent) {
        throw new \RuntimeException('Tag export failed: tags.ndjson is missing.');
    }

    $counters = [
        (int) ($export['tags'] ?? 0),
        (int) ($export['lines'] ?? 0),
        (int) ($export['bytes'] ?? 0),
    ];

    // The smallest counter being negative means at least one counter is negative.
    if (min($counters) < 0) {
        throw new \RuntimeException('Tag export returned invalid statistics.');
    }
}
}

View File

@@ -1,12 +1,62 @@
<?php
declare(strict_types=1);
namespace App\Config;
class CatalogIntentConfig
/**
* Central thresholds for deterministic catalog-entity detection.
*
* The values in this class intentionally define a conservative gate:
* - only strong semantic tag hits may open the catalog path
* - small score gaps between the best and second-best hit are treated as ambiguous
*/
final class CatalogIntentConfig
{
// Minimum similarity score. Prevents noise.
/**
* Minimum semantic similarity required before a catalog entity is accepted.
*/
public const MIN_SCORE = 0.72;
// Difference between Top 1 and Top 2, so that no uncertain match is accepted.
/**
* Required distance between the best and second-best catalog entity hit.
*/
public const AMBIGUITY_DELTA = 0.02;
/**
* Number of candidate tag hits to inspect during catalog intent detection.
*
* This is intentionally wider than the final accepted set so that strong
* catalog_entity tags are not hidden behind generic tags in the raw result.
*/
public const SEARCH_LIMIT = 6;
/**
* Conservative lower boundary for score normalization helpers.
*/
public const MIN_ALLOWED_SCORE = 0.0;
/**
* Conservative upper boundary for score normalization helpers.
*/
public const MAX_ALLOWED_SCORE = 1.0;
/**
 * Returns true when the score reaches MIN_SCORE (the boundary value itself is accepted).
 */
public static function isScoreAccepted(float $score): bool
{
    $threshold = self::MIN_SCORE;

    return $score >= $threshold;
}
/**
 * Returns true when the absolute gap between the two scores is strictly
 * smaller than AMBIGUITY_DELTA.
 */
public static function isAmbiguous(float $bestScore, float $secondScore): bool
{
    $gap = abs($bestScore - $secondScore);

    return $gap < self::AMBIGUITY_DELTA;
}
/**
 * Limits a score to the range [MIN_ALLOWED_SCORE, MAX_ALLOWED_SCORE].
 */
public static function clampScore(float $score): float
{
    // Cap from above first, then lift to the lower bound.
    $capped = min(self::MAX_ALLOWED_SCORE, $score);

    return max(self::MIN_ALLOWED_SCORE, $capped);
}
/**
 * Private and empty: code outside this class cannot create an instance with `new`.
 * The members visible here are constants and static helpers only.
 */
private function __construct()
{
}
}

View File

@@ -1,5 +1,6 @@
<?php
declare(strict_types=1);
namespace App\Controller\Admin;
@@ -17,25 +18,22 @@ final class DashboardController extends AbstractController
#[Route('', name: 'admin_dashboard_null')]
#[Route('/', name: 'admin_dashboard_trail')]
#[Route('/admin', name: 'admin_dashboard_alias')]
public function trailNull(IndexMetaManager $metaManager,VectorIndexHealthService $health): RedirectResponse
public function redirectToDashboard(): RedirectResponse
{
return $this->redirectToRoute('admin_dashboard');
}
#[Route('/admin/dashboard', name: 'admin_dashboard')]
public function dashboard(IndexMetaManager $metaManager,VectorIndexHealthService $health,TagVectorIndexHealthService $tagHealth): Response
{
$chunkCount = $metaManager->getRuntimeChunkCount();
$limit = IngestFlow::CHUNK_LIMIT_HARD;
#[Route('/admin/dashboard', name: 'admin_dashboard', methods: ['GET'])]
public function dashboard(
IndexMetaManager $metaManager,
VectorIndexHealthService $health,
TagVectorIndexHealthService $tagHealth
): Response {
return $this->render('admin/dashboard/index.html.twig', [
'chunkCount' => $chunkCount,
'chunkLimit' => $limit,
'chunkCount' => $metaManager->getRuntimeChunkCount(),
'chunkLimit' => IngestFlow::CHUNK_LIMIT_HARD,
'vectorHealth' => $health->check(),
'tagVectorHealth' => $tagHealth->check(),
]);
}
}
}

View File

@@ -1,10 +1,13 @@
<?php
declare(strict_types=1);
namespace App\Controller\Admin;
use App\Entity\Document;
use App\Entity\DocumentVersion;
use App\Entity\IngestJob;
use App\Entity\User;
use App\Service\DocumentService;
use App\Service\FormatText;
use App\Service\IngestJobService;
@@ -23,9 +26,11 @@ use Symfony\Component\Routing\Attribute\Route;
use Symfony\Component\Uid\Uuid;
#[Route('/admin/documents')]
class DocumentController extends AbstractController
final class DocumentController extends AbstractController
{
#[Route('', name: 'admin_documents')]
private const INGEST_DUPLICATE_WINDOW_SECONDS = 3;
#[Route('', name: 'admin_documents', methods: ['GET'])]
public function index(EntityManagerInterface $em): Response
{
$documents = $em->getRepository(Document::class)
@@ -46,115 +51,106 @@ class DocumentController extends AbstractController
#[Route(
'/{id}',
name: 'admin_document_show',
requirements: ['id' => '[0-9a-fA-F\-]{36}']
requirements: ['id' => '[0-9a-fA-F\-]{36}'],
methods: ['GET']
)]
public function show(string $id, EntityManagerInterface $em): Response
{
try {
$uuid = Uuid::fromString($id);
} catch (\Exception) {
throw new NotFoundHttpException();
}
$document = $em->getRepository(Document::class)->find($uuid);
if (!$document) {
$this->addFlash('danger', 'Das Dokument existiert nicht mehr.');
}
return $this->render('admin/document/show.html.twig', [
'document' => $document,
'document' => $this->findDocument($id, $em),
]);
}
#[Route('/new', name: 'admin_document_new')]
#[Route('/new', name: 'admin_document_new', methods: ['GET', 'POST'])]
public function new(
Request $request,
DocumentService $documentService,
FormatText $formatText,
IngestJobService $jobService,
ParameterBagInterface $params
Request $request,
DocumentService $documentService,
FormatText $formatText,
IngestJobService $jobService,
ParameterBagInterface $params,
EntityManagerInterface $em,
): Response {
if (!$request->isMethod('POST')) {
return $this->render('admin/document/new.html.twig');
}
/** @var UploadedFile|null $file */
$file = $request->files->get('file');
if (!$file instanceof UploadedFile) {
throw new \InvalidArgumentException('No valid file uploaded.');
}
if (!$this->isCsrfTokenValid('create_document', (string) $request->request->get('_token'))) {
$this->addFlash('danger', 'Ungültiges CSRF-Token.');
$rawTitle = $request->request->get('title');
$title = is_string($rawTitle) && $rawTitle !== ''
? $rawTitle
: $formatText->slugify($file->getClientOriginalName());
if (!$title) {
$this->addFlash('error', 'Titel ist erforderlich.');
return $this->redirectToRoute('admin_document_new');
}
$uploadDir = (string)$params->get('mto.vector.data.upload.path');
$this->ensureDir($uploadDir);
/** @var UploadedFile|null $file */
$file = $request->files->get('file');
if (!$file instanceof UploadedFile) {
$this->addFlash('danger', 'Keine gültige Datei hochgeladen.');
$newFilename = uniqid('', true) . '_' . $file->getClientOriginalName();
return $this->redirectToRoute('admin_document_new');
}
$title = $this->resolveDocumentTitle($request, $file, $formatText);
if ($title === '') {
$this->addFlash('danger', 'Titel ist erforderlich.');
return $this->redirectToRoute('admin_document_new');
}
$user = $this->requireUser();
$uploadDir = trim((string) $params->get('mto.vector.data.upload.path'));
try {
$file->move($uploadDir, $newFilename);
} catch (FileException) {
throw new \RuntimeException('File upload failed.');
$this->ensureDir($uploadDir);
$filePath = $this->moveUploadedFile($file, $uploadDir, $formatText);
$document = $documentService->createDocument($title, $filePath, $user);
$version = $document->getCurrentVersion();
if (!$version instanceof DocumentVersion) {
throw new \RuntimeException('Dokument erstellt, aber keine aktuelle Version vorhanden.');
}
$job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE,
$user,
$version->getDocument()->getId(),
$version->getId(),
null,
IngestJob::STATUS_QUEUED
);
$logFile = $this->prepareJobLogFile((string) $job->getId());
$job->setLogPath($logFile);
$em->flush();
if (!$this->canExec()) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('danger', 'Dokument erstellt, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_documents');
}
$this->startIngestJob((string) $job->getId(), $logFile);
return $this->redirectToRoute('admin_job_show', [
'id' => (string) $job->getId(),
]);
} catch (\Throwable $e) {
$this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Dokument konnte nicht erstellt werden.'));
return $this->redirectToRoute('admin_document_new');
}
$filePath = $uploadDir . '/' . $newFilename;
$document = $documentService->createDocument(
$title,
$filePath,
$this->getUser()
);
$version = $document->getCurrentVersion();
if (!$version instanceof DocumentVersion) {
$this->addFlash('danger', 'Dokument erstellt, aber es wurde keine aktuelle Version erzeugt.');
return $this->redirectToRoute('admin_documents');
}
$job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE,
$this->getUser(),
$version->getDocument()->getId(),
$version->getId(),
null,
IngestJob::STATUS_QUEUED
);
if (!$this->canExec()) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('danger', 'Dokument erstellt, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_documents');
}
$this->startIngestJob((string)$job->getId());
return $this->redirectToRoute('admin_job_show', [
'id' => (string)$job->getId(),
]);
}
#[Route('/{id}/version/new', name: 'admin_document_version_new', requirements: ['id' => '[0-9a-fA-F\-]{36}'])]
#[Route('/{id}/version/new', name: 'admin_document_version_new', requirements: ['id' => '[0-9a-fA-F\-]{36}'], methods: ['GET', 'POST'])]
public function newVersion(
string $id,
Request $request,
string $id,
Request $request,
EntityManagerInterface $em,
DocumentService $documentService,
ParameterBagInterface $params
DocumentService $documentService,
ParameterBagInterface $params,
FormatText $formatText,
): Response {
$document = $em->getRepository(Document::class)->find($id);
if (!$document) {
throw $this->createNotFoundException();
}
$document = $this->findDocument($id, $em);
if (!$request->isMethod('POST')) {
return $this->render('admin/document/new_version.html.twig', [
@@ -162,31 +158,33 @@ class DocumentController extends AbstractController
]);
}
/** @var UploadedFile|null $file */
$file = $request->files->get('file');
if (!$file instanceof UploadedFile) {
$this->addFlash('error', 'Datei ist erforderlich.');
if (!$this->isCsrfTokenValid('create_document_version_' . $id, (string) $request->request->get('_token'))) {
$this->addFlash('danger', 'Ungültiges CSRF-Token.');
return $this->redirectToRoute('admin_document_version_new', ['id' => $id]);
}
$uploadDir = (string)$params->get('mto.vector.data.upload.path');
$this->ensureDir($uploadDir);
/** @var UploadedFile|null $file */
$file = $request->files->get('file');
if (!$file instanceof UploadedFile) {
$this->addFlash('danger', 'Datei ist erforderlich.');
$newFilename = uniqid('', true) . '_' . $file->getClientOriginalName();
try {
$file->move($uploadDir, $newFilename);
} catch (FileException) {
throw new \RuntimeException('File upload failed.');
return $this->redirectToRoute('admin_document_version_new', ['id' => $id]);
}
$filePath = $uploadDir . '/' . $newFilename;
try {
$user = $this->requireUser();
$uploadDir = trim((string) $params->get('mto.vector.data.upload.path'));
$this->ensureDir($uploadDir);
$filePath = $this->moveUploadedFile($file, $uploadDir, $formatText);
$documentService->addVersion(
$document,
$filePath,
$this->getUser()
);
$documentService->addVersion($document, $filePath, $user);
$this->addFlash('success', 'Neue Dokumentversion wurde hochgeladen.');
} catch (\Throwable $e) {
$this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Neue Dokumentversion konnte nicht erstellt werden.'));
return $this->redirectToRoute('admin_document_version_new', ['id' => $id]);
}
return $this->redirectToRoute('admin_document_show', ['id' => $id]);
}
@@ -198,54 +196,55 @@ class DocumentController extends AbstractController
methods: ['POST']
)]
public function activateVersion(
string $versionId,
Request $request,
string $versionId,
Request $request,
EntityManagerInterface $em,
DocumentService $documentService,
IngestJobService $jobService,
DocumentService $documentService,
IngestJobService $jobService,
): RedirectResponse {
if (!$this->isCsrfTokenValid('activate_version_' . $versionId, (string)$request->request->get('_token'))) {
if (!$this->isCsrfTokenValid('activate_version_' . $versionId, (string) $request->request->get('_token'))) {
throw $this->createAccessDeniedException();
}
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
if (!$version) {
throw $this->createNotFoundException();
}
$version = $this->findDocumentVersion($versionId, $em);
try {
$documentService->activateVersion($version);
$job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE,
$this->getUser(),
$this->requireUser(),
$version->getDocument()->getId(),
$version->getId(),
null,
IngestJob::STATUS_QUEUED
);
$logFile = $this->prepareJobLogFile((string) $job->getId());
$job->setLogPath($logFile);
$em->flush();
if (!$this->canExec()) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('danger', 'Aktivierung ok, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_document_show', [
'id' => $version->getDocument()->getId(),
'id' => (string) $version->getDocument()->getId(),
]);
}
$this->startIngestJob((string)$job->getId());
$this->startIngestJob((string) $job->getId(), $logFile);
$this->addFlash('success', 'Version aktiviert. Ingest-Job wurde erstellt und gestartet.');
return $this->redirectToRoute('admin_job_show', [
'id' => (string)$job->getId(),
'id' => (string) $job->getId(),
]);
} catch (\Throwable $e) {
$this->addFlash('danger', 'Aktivierung/Re-Ingest fehlgeschlagen: ' . $e->getMessage());
$this->addFlash('danger', 'Aktivierung/Re-Ingest fehlgeschlagen: ' . $this->buildSafeErrorMessage($e, 'Unbekannter Fehler.'));
}
return $this->redirectToRoute('admin_document_show', [
'id' => $version->getDocument()->getId(),
'id' => (string) $version->getDocument()->getId(),
]);
}
@@ -256,115 +255,135 @@ class DocumentController extends AbstractController
methods: ['POST']
)]
public function ingestVersion(
string $versionId,
Request $request,
string $versionId,
Request $request,
EntityManagerInterface $em,
IngestJobService $jobService,
): ?RedirectResponse {
if (!$this->isCsrfTokenValid('ingest_version_' . $versionId, (string)$request->request->get('_token'))) {
IngestJobService $jobService,
): RedirectResponse {
if (!$this->isCsrfTokenValid('ingest_version_' . $versionId, (string) $request->request->get('_token'))) {
throw $this->createAccessDeniedException();
}
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
if (!$version) {
throw $this->createNotFoundException();
}
$version = $this->findDocumentVersion($versionId, $em);
/** @var IngestJob|null $existing */
$existing = $em->getRepository(IngestJob::class)
->findOneBy(
['documentVersionId' => $version->getId()],
['startedAt' => 'DESC']
['startedAt' => 'DESC', 'id' => 'DESC']
);
if ($existing && $existing->getStartedAt() > new \DateTimeImmutable('-3 seconds')) {
return null;
if (
$existing instanceof IngestJob
&& $existing->getStartedAt() > new \DateTimeImmutable('-' . self::INGEST_DUPLICATE_WINDOW_SECONDS . ' seconds')
&& in_array($existing->getStatus(), [IngestJob::STATUS_QUEUED, IngestJob::STATUS_RUNNING], true)
) {
$this->addFlash('info', 'Für diese Version läuft bereits ein aktueller Ingest-Job.');
return $this->redirectToRoute('admin_job_show', [
'id' => (string) $existing->getId(),
]);
}
$job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT,
$this->getUser(),
$this->requireUser(),
$version->getDocument()->getId(),
$version->getId(),
null,
IngestJob::STATUS_QUEUED
);
$logFile = $this->prepareJobLogFile((string) $job->getId());
$job->setLogPath($logFile);
$em->flush();
if (!$this->canExec()) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('error', 'Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
$this->addFlash('danger', 'Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_document_show', [
'id' => $version->getDocument()->getId(),
'id' => (string) $version->getDocument()->getId(),
]);
}
$this->startIngestJob((string)$job->getId());
try {
$this->startIngestJob((string) $job->getId(), $logFile);
} catch (\Throwable $e) {
$jobService->markFailed($job, 'Ingest async start failed: ' . $e->getMessage());
$this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Ingest konnte nicht gestartet werden.'));
return $this->redirectToRoute('admin_document_show', [
'id' => (string) $version->getDocument()->getId(),
]);
}
return $this->redirectToRoute('admin_job_show', [
'id' => (string)$job->getId(),
'id' => (string) $job->getId(),
]);
}
#[Route(
'/reset',
name: 'admin_document_reset',
methods: ['POST']
)]
public function resetCompleteSystem(ParameterBagInterface $params, Connection $connection): ?RedirectResponse
{
if (!$this->canExec()) {
$this->addFlash('danger', 'Der Reset konnte nicht gestartet werden (exec deaktiviert).');
#[Route('/reset', name: 'admin_document_reset', methods: ['POST'])]
public function resetCompleteSystem(
Request $request,
ParameterBagInterface $params,
Connection $connection,
): RedirectResponse {
$this->denyAccessUnlessGranted('ROLE_SUPER_ADMIN');
if (!$this->isCsrfTokenValid('system_reset', (string) $request->request->get('_token'))) {
$this->addFlash('danger', 'Ungültiges CSRF-Token.');
return $this->redirectToRoute('admin_dashboard');
}
@unlink((string)$params->get('mto.knowledge.ndjson'));
@unlink((string)$params->get('mto.knowledge.vector_index'));
@unlink((string)$params->get('mto.knowledge.vector_index_meta'));
@unlink((string)$params->get('mto.knowledge.index_meta'));
@unlink((string)$params->get('mto.runtime.meta'));
if (!$this->canExec()) {
$this->addFlash('danger', 'Der Reset konnte nicht gestartet werden (exec deaktiviert).');
@unlink((string)$params->get('mto.knowledge.tags_ndjson'));
@unlink((string)$params->get('mto.knowledge.vector_tags_index'));
@unlink((string)$params->get('mto.knowledge.vector_tags_index_meta'));
return $this->redirectToRoute('admin_dashboard');
}
$uploadDir = (string)$params->get('mto.knowledge.upload');
foreach ([
'mto.knowledge.ndjson',
'mto.knowledge.vector_index',
'mto.knowledge.vector_index_meta',
'mto.knowledge.index_meta',
'mto.runtime.meta',
'mto.knowledge.tags_ndjson',
'mto.knowledge.vector_tags_index',
'mto.knowledge.vector_tags_index_meta',
] as $parameterName) {
$path = trim((string) $params->get($parameterName));
if ($path !== '' && is_file($path)) {
@unlink($path);
}
}
$uploadDir = trim((string) $params->get('mto.knowledge.upload'));
if ($uploadDir !== '' && is_dir($uploadDir)) {
exec('rm -rf ' . escapeshellarg($uploadDir));
}
$lockDir = (string)$params->get('mto.locks.dir');
$lockDir = trim((string) $params->get('mto.locks.dir'));
if ($lockDir !== '' && is_dir($lockDir)) {
exec('rm -rf ' . escapeshellarg($lockDir));
}
$sql = '
SET FOREIGN_KEY_CHECKS = 0;
TRUNCATE TABLE db.document;
SET FOREIGN_KEY_CHECKS = 1;
SET FOREIGN_KEY_CHECKS = 0;
TRUNCATE TABLE db.document_version;
SET FOREIGN_KEY_CHECKS = 1;
SET FOREIGN_KEY_CHECKS = 0;
TRUNCATE TABLE db.ingest_job;
SET FOREIGN_KEY_CHECKS = 1;
SET FOREIGN_KEY_CHECKS = 0;
TRUNCATE TABLE db.knowledge_tag;
SET FOREIGN_KEY_CHECKS = 1;
SET FOREIGN_KEY_CHECKS = 0;
TRUNCATE TABLE db.tag_rebuild_job;
SET FOREIGN_KEY_CHECKS = 1;
SET FOREIGN_KEY_CHECKS = 0;
TRUNCATE TABLE db.document_tag;
SET FOREIGN_KEY_CHECKS = 1;
';
$connection->executeQuery($sql);
$sql = <<<'SQL'
SET FOREIGN_KEY_CHECKS = 0;
TRUNCATE TABLE db.document_tag;
TRUNCATE TABLE db.tag_rebuild_job;
TRUNCATE TABLE db.knowledge_tag;
TRUNCATE TABLE db.ingest_job;
TRUNCATE TABLE db.document_version;
TRUNCATE TABLE db.document;
SET FOREIGN_KEY_CHECKS = 1;
SQL;
$connection->executeStatement($sql);
$this->addFlash('success', 'Das System wurde erfolgreich zurückgesetzt.');
return $this->redirectToRoute('admin_dashboard');
}
@@ -375,62 +394,63 @@ class DocumentController extends AbstractController
methods: ['POST']
)]
public function deleteDocument(
string $id,
Request $request,
string $id,
Request $request,
EntityManagerInterface $em,
IngestJobService $jobService,
LockService $lockService,
IngestJobService $jobService,
LockService $lockService,
): RedirectResponse {
if (!$this->isCsrfTokenValid('delete_document_' . $id, (string)$request->request->get('_token'))) {
$this->denyAccessUnlessGranted('ROLE_SUPER_ADMIN');
if (!$this->isCsrfTokenValid('delete_document_' . $id, (string) $request->request->get('_token'))) {
throw $this->createAccessDeniedException();
}
try {
$uuid = Uuid::fromString($id);
} catch (\Exception) {
throw $this->createNotFoundException();
}
/** @var Document|null $document */
$document = $em->getRepository(Document::class)->find($uuid);
if (!$document) {
throw $this->createNotFoundException();
}
$document = $this->findDocument($id, $em);
if (!$lockService->acquire()) {
$this->addFlash('danger', 'Ein Ingest-Job läuft bereits. Löschen derzeit nicht möglich.');
return $this->redirectToRoute('admin_documents');
}
$lockService->release();
$job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT_DELETE,
$this->getUser(),
$this->requireUser(),
$document->getId(),
null,
null,
IngestJob::STATUS_QUEUED
);
$logFile = $this->prepareJobLogFile((string) $job->getId());
$job->setLogPath($logFile);
$em->flush();
if (!$this->canExec()) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('danger', 'Löschen konnte nicht gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_documents');
}
$this->startIngestJob((string)$job->getId());
try {
$this->startIngestJob((string) $job->getId(), $logFile);
} catch (\Throwable $e) {
$jobService->markFailed($job, 'Delete async start failed: ' . $e->getMessage());
$this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Löschvorgang konnte nicht gestartet werden.'));
return $this->redirectToRoute('admin_documents');
}
$this->addFlash('success', 'Löschvorgang gestartet. Dokument wird nach Index-Rebuild entfernt.');
return $this->redirectToRoute('admin_job_show', [
'id' => (string)$job->getId(),
'id' => (string) $job->getId(),
]);
}
// =========================================================
// Helpers
// =========================================================
private function canExec(): bool
{
if (!function_exists('exec')) {
@@ -443,6 +463,7 @@ class DocumentController extends AbstractController
}
$list = array_map('trim', explode(',', $disabled));
return !in_array('exec', $list, true);
}
@@ -452,34 +473,209 @@ class DocumentController extends AbstractController
throw new \RuntimeException('Upload directory not configured.');
}
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
throw new \RuntimeException('Unable to create upload directory.');
}
}
private function startIngestJob(string $jobId): void
private function moveUploadedFile(UploadedFile $file, string $uploadDir, FormatText $formatText): string
{
$projectDir = (string)$this->getParameter('kernel.project_dir');
$originalName = trim((string) $file->getClientOriginalName());
$baseName = pathinfo($originalName !== '' ? $originalName : 'document', PATHINFO_FILENAME);
$extension = strtolower((string) $file->getClientOriginalExtension());
$safeBaseName = $formatText->slugify($baseName !== '' ? $baseName : 'document');
if ($safeBaseName === '') {
$safeBaseName = 'document';
}
$newFilename = uniqid('', true) . '_' . $safeBaseName;
if ($extension !== '') {
$newFilename .= '.' . $extension;
}
try {
$file->move($uploadDir, $newFilename);
} catch (FileException) {
throw new \RuntimeException('File upload failed.');
}
return rtrim($uploadDir, '/') . '/' . $newFilename;
}
/**
 * Determines the title for a newly uploaded document.
 *
 * A non-empty (trimmed) "title" request field wins. Otherwise the title is
 * the slugified file name without extension, or the slugified full client
 * file name when that stem is empty.
 *
 * @return string The resolved title; may be empty when nothing usable was provided.
 */
private function resolveDocumentTitle(Request $request, UploadedFile $file, FormatText $formatText): string
{
    $submittedTitle = trim((string) $request->request->get('title', ''));

    if ($submittedTitle === '') {
        $clientName = trim((string) $file->getClientOriginalName());
        $stem = pathinfo($clientName, PATHINFO_FILENAME);
        $slugSource = $stem === '' ? $clientName : $stem;

        return trim((string) $formatText->slugify($slugSource));
    }

    return $submittedTitle;
}
private function startIngestJob(string $jobId, string $logFile): void
{
$projectDir = $this->resolveProjectDir();
$console = $projectDir . '/bin/console';
$logDir = $projectDir . '/var/log/ingest';
if (!is_dir($logDir)) {
@mkdir($logDir, 0777, true);
if (!is_file($console)) {
throw new \RuntimeException('bin/console not found: ' . $console);
}
$logFile = $logDir . '/job_' . $jobId . '.log';
// Wichtig: CLI-PHP verwenden, nicht PHP_BINARY aus FPM
$php = 'php';
$php = $this->resolvePhpBinary();
$cmd = sprintf(
'%s %s --no-interaction %s %s >> %s 2>&1 &',
escapeshellcmd($php),
'cd %s && nohup %s %s %s %s --no-interaction >> %s 2>&1 & echo $!',
escapeshellarg($projectDir),
escapeshellarg($php),
escapeshellarg($console),
escapeshellarg('mto:agent:ingest:run'),
escapeshellarg($jobId),
escapeshellarg($logFile),
);
exec($cmd);
$output = [];
$exitCode = 0;
@exec($cmd, $output, $exitCode);
if ($exitCode !== 0) {
throw new \RuntimeException('Background ingest bootstrap failed with exit code ' . $exitCode . '.');
}
}
/**
 * Ensures the ingest log directory exists and returns the log path for a job.
 *
 * @param string $jobId Job identifier used in the file name.
 *
 * @return string Path of the form <project>/var/log/ingest/job_<jobId>.log.
 */
private function prepareJobLogFile(string $jobId): string
{
    $logDir = $this->resolveProjectDir() . '/var/log/ingest';
    $this->ensureDir($logDir);

    return sprintf('%s/job_%s.log', $logDir, $jobId);
}
/**
 * Reads the kernel.project_dir parameter and validates it.
 *
 * @return string The project directory without trailing slashes.
 *
 * @throws \RuntimeException When the parameter is empty or not an existing directory.
 */
private function resolveProjectDir(): string
{
    $configured = trim((string) $this->getParameter('kernel.project_dir'));
    $usable = $configured !== '' && is_dir($configured);

    if (!$usable) {
        throw new \RuntimeException('Project directory is invalid.');
    }

    return rtrim($configured, '/');
}
/**
 * Locates a PHP CLI executable for launching background console commands.
 *
 * Candidates are tried in this order; the first one accepted by
 * isValidCliPhpBinary() is returned:
 *  1. PHP_CLI_BINARY from $_SERVER, then $_ENV, then getenv()
 *  2. the PHP_BINARY constant of the running process
 *  3. a fixed list of common install locations
 *  4. the output of `command -v php`, when shell_exec() is available
 *
 * @return string Path of the accepted binary.
 *
 * @throws \RuntimeException When no candidate is accepted.
 */
private function resolvePhpBinary(): string
{
    $candidates = [
        trim((string) ($_SERVER['PHP_CLI_BINARY'] ?? '')),
        trim((string) ($_ENV['PHP_CLI_BINARY'] ?? '')),
        trim((string) getenv('PHP_CLI_BINARY')),
        defined('PHP_BINARY') ? trim((string) PHP_BINARY) : '',
        '/usr/bin/php',
        '/usr/local/bin/php',
        '/bin/php',
        '/opt/homebrew/bin/php',
    ];

    // array_unique() keeps the first occurrence, so the priority order is
    // preserved while identical paths are only probed once.
    foreach (array_unique($candidates) as $candidate) {
        if ($this->isValidCliPhpBinary($candidate)) {
            return $candidate;
        }
    }

    // shell_exec() may be listed in disable_functions. On PHP 8 such a
    // function is undefined and calling it throws an Error that "@" cannot
    // suppress, which would hide the descriptive exception below.
    if (function_exists('shell_exec')) {
        $whichPhp = trim((string) @shell_exec('command -v php 2>/dev/null'));
        if ($this->isValidCliPhpBinary($whichPhp)) {
            return $whichPhp;
        }
    }

    throw new \RuntimeException(
        'Could not resolve a CLI PHP binary. Set PHP_CLI_BINARY explicitly, e.g. /usr/bin/php.'
    );
}
/**
 * Checks whether a path looks like a usable PHP CLI executable.
 *
 * The path must be non-empty after trimming, point to an existing regular
 * file and be executable. Files whose lower-cased base name contains "fpm"
 * or "cgi" are rejected.
 *
 * @param string $path Candidate path.
 *
 * @return bool True when the candidate passes all checks.
 */
private function isValidCliPhpBinary(string $path): bool
{
    $path = trim($path);

    if ($path === '') {
        return false;
    }

    if (!is_file($path)) {
        return false;
    }

    if (!is_executable($path)) {
        return false;
    }

    $name = strtolower(basename($path));
    foreach (['fpm', 'cgi'] as $marker) {
        if (str_contains($name, $marker)) {
            return false;
        }
    }

    return true;
}
/**
 * Loads a Document by its UUID string or aborts with a 404.
 *
 * @param string $id UUID in string form; surrounding whitespace is ignored.
 *
 * @return Document The matching document.
 *
 * @throws NotFoundHttpException When the id cannot be parsed as a UUID or no document is found.
 */
private function findDocument(string $id, EntityManagerInterface $em): Document
{
    try {
        $identifier = Uuid::fromString(trim($id));
    } catch (\Throwable) {
        throw new NotFoundHttpException();
    }

    /** @var Document|null $match */
    $match = $em->getRepository(Document::class)->find($identifier);

    if ($match instanceof Document) {
        return $match;
    }

    throw new NotFoundHttpException();
}
/**
 * Loads a DocumentVersion by its UUID string or aborts with a 404.
 *
 * @param string $versionId UUID in string form; surrounding whitespace is ignored.
 *
 * @return DocumentVersion The matching version.
 *
 * @throws NotFoundHttpException When the id cannot be parsed as a UUID or no version is found.
 */
private function findDocumentVersion(string $versionId, EntityManagerInterface $em): DocumentVersion
{
    try {
        $identifier = Uuid::fromString(trim($versionId));
    } catch (\Throwable) {
        throw new NotFoundHttpException();
    }

    /** @var DocumentVersion|null $match */
    $match = $em->getRepository(DocumentVersion::class)->find($identifier);

    if ($match instanceof DocumentVersion) {
        return $match;
    }

    throw new NotFoundHttpException();
}
/**
 * Returns the current user as an application User instance.
 *
 * @return User The current user.
 *
 * @throws \RuntimeException When getUser() does not return a User instance.
 */
private function requireUser(): User
{
    $current = $this->getUser();

    if ($current instanceof User) {
        return $current;
    }

    throw new \RuntimeException('No authenticated user available.');
}
/**
 * Picks the text to show for a failed operation.
 *
 * Returns the trimmed exception message, or the fallback when that message
 * is empty.
 *
 * NOTE(review): apart from trimming, the exception message is passed through
 * unchanged; no redaction happens here despite the method name.
 *
 * @param \Throwable $e        The caught error.
 * @param string     $fallback Text used when the exception carries no message.
 *
 * @return string Message to display.
 */
private function buildSafeErrorMessage(\Throwable $e, string $fallback): string
{
    $text = trim($e->getMessage());

    if ($text === '') {
        return $fallback;
    }

    return $text;
}
}

View File

@@ -19,44 +19,97 @@ final class DocumentTagController extends AbstractController
#[Route('/{id}/tags', name: 'admin_document_tags_edit', methods: ['GET'])]
public function edit(string $id, DocumentTagAdminService $svc): Response
{
$data = $svc->getEditData($id);
$id = trim($id);
try {
$data = $svc->getEditData($id);
} catch (\Throwable $e) {
$this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Dokument-Tags konnten nicht geladen werden.'));
return $this->redirectToRoute('admin_documents');
}
return $this->render('admin/document_tags/edit.html.twig', [
'document' => $data['document'],
'allTags' => $data['allTags'],
'latestJob' => $data['latestJob'],
'statusRunning' => TagRebuildJob::STATUS_RUNNING,
'statusQueued' => TagRebuildJob::STATUS_QUEUED,
'statusCompleted' => TagRebuildJob::STATUS_COMPLETED,
'statusFailed' => TagRebuildJob::STATUS_FAILED,
...$data,
...$this->buildJobStatusViewData(),
]);
}
#[Route('/{id}/tags/save', name: 'admin_document_tags_save', methods: ['POST'])]
public function save(string $id, Request $request, DocumentTagAdminService $svc): RedirectResponse
{
$selected = $request->request->all('tag_ids') ?? [];
$id = trim($id);
if (!$this->isCsrfTokenValid('admin_document_tags_save_' . $id, (string) $request->request->get('_token'))) {
$this->addFlash('danger', 'Ungültiges CSRF-Token.');
return $this->redirectToRoute('admin_document_tags_edit', ['id' => $id]);
}
try {
$svc->saveTags($id, $selected);
$svc->saveTags($id, $this->normalizeStringList($request->request->all('tag_ids')));
$this->addFlash('success', 'Tags wurden aktualisiert. Rebuild läuft im Hintergrund.');
} catch (\Throwable $e) {
$this->addFlash('danger', $e->getMessage());
$this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Tags konnten nicht aktualisiert werden.'));
}
return $this->redirectToRoute('admin_document_tags_edit', ['id' => $id]);
}
/**
* Wichtig: Ohne extra "admin/" im Pfad, weil Prefix schon /admin/documents ist.
* Ergebnis: /admin/documents/tags/status
*/
#[Route('/tags/status', name: 'admin_tags_status', methods: ['GET'])]
public function status(DocumentTagAdminService $svc): JsonResponse
{
$status = $svc->getLatestRebuildStatus();
return $this->json([
'status' => $svc->getLatestRebuildStatus(),
'status' => $status,
'hasActiveJob' => $status === TagRebuildJob::STATUS_RUNNING
|| $status === TagRebuildJob::STATUS_QUEUED,
]);
}
/**
 * Converts raw request input into a list of unique, non-empty strings.
 *
 * Anything that is not an array yields an empty list. Each entry is cast to
 * string and trimmed; empty results and duplicates are dropped, keeping the
 * order of first occurrence. Entries with no string form (nested arrays,
 * null, non-Stringable objects) are skipped, so a crafted request such as
 * tag_ids[0][]=x cannot trigger an "Array to string conversion" or inject
 * the literal "Array" as an id.
 *
 * @param mixed $values
 * @return list<string>
 */
private function normalizeStringList(mixed $values): array
{
    if (!is_array($values)) {
        return [];
    }

    $normalized = [];

    foreach ($values as $value) {
        // Only scalars and Stringable objects can be cast to string safely.
        if (!is_scalar($value) && !$value instanceof \Stringable) {
            continue;
        }

        $value = trim((string) $value);

        if ($value === '') {
            continue;
        }

        $normalized[] = $value;
    }

    return array_values(array_unique($normalized));
}
/**
 * Exposes the TagRebuildJob status constants under the names used as template variables.
 *
 * @return array<string, string>
 */
private function buildJobStatusViewData(): array
{
    $viewData = [];

    $viewData['statusRunning'] = TagRebuildJob::STATUS_RUNNING;
    $viewData['statusQueued'] = TagRebuildJob::STATUS_QUEUED;
    $viewData['statusCompleted'] = TagRebuildJob::STATUS_COMPLETED;
    $viewData['statusFailed'] = TagRebuildJob::STATUS_FAILED;

    return $viewData;
}
/**
 * Picks the text to show for a failed operation.
 *
 * Returns the trimmed exception message, or the fallback when that message
 * is empty.
 *
 * NOTE(review): apart from trimming, the exception message is passed through
 * unchanged; no redaction happens here despite the method name.
 *
 * @param \Throwable $e        The caught error.
 * @param string     $fallback Text used when the exception carries no message.
 *
 * @return string Message to display.
 */
private function buildSafeErrorMessage(\Throwable $e, string $fallback): string
{
    $detail = trim($e->getMessage());
    $hasDetail = $detail !== '';

    if ($hasDetail) {
        return $detail;
    }

    return $fallback;
}
}

View File

@@ -1,46 +1,44 @@
<?php
declare(strict_types=1);
namespace App\Controller\Admin;
use App\Entity\IngestJob;
use App\Service\IngestJobService;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\HttpFoundation\JsonResponse;
use Symfony\Component\HttpFoundation\RedirectResponse;
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
use Symfony\Component\Routing\Attribute\Route;
use Symfony\Component\HttpFoundation\RedirectResponse;
use Symfony\Component\HttpFoundation\JsonResponse;
#[Route('/admin/jobs')]
class IngestJobController extends AbstractController
final class IngestJobController extends AbstractController
{
#[Route('', name: 'admin_jobs')]
#[Route('', name: 'admin_jobs', methods: ['GET'])]
public function index(EntityManagerInterface $em): Response
{
$jobs = $em->getRepository(IngestJob::class)
->findBy([], ['startedAt' => 'DESC']);
->findBy([], ['startedAt' => 'DESC', 'id' => 'DESC']);
return $this->render('admin/job/index.html.twig', [
'jobs' => $jobs
'jobs' => $jobs,
]);
}
#[Route(
'/{id}',
name: 'admin_job_show',
requirements: ['id' => '[0-9a-fA-F\-]{36}']
requirements: ['id' => '[0-9a-fA-F\-]{36}'],
methods: ['GET']
)]
public function show(string $id, EntityManagerInterface $em): Response
{
$job = $em->getRepository(IngestJob::class)->find($id);
if (!$job) {
throw new NotFoundHttpException();
}
return $this->render('admin/job/show.html.twig', [
'job' => $job
'job' => $this->findJob($id, $em),
]);
}
@@ -54,12 +52,7 @@ class IngestJobController extends AbstractController
{
$this->denyAccessUnlessGranted('ROLE_USER');
/** @var IngestJob|null $job */
$job = $em->getRepository(IngestJob::class)->find($id);
if (!$job) {
throw new NotFoundHttpException();
}
$job = $this->findJob($id, $em);
return $this->json([
'id' => (string) $job->getId(),
@@ -68,58 +61,185 @@ class IngestJobController extends AbstractController
'startedAt' => $job->getStartedAt()->format(DATE_ATOM),
'finishedAt' => $job->getFinishedAt()?->format(DATE_ATOM),
'errorMessage' => $job->getErrorMessage(),
'logPath' => $job->getLogPath(),
]);
}
#[Route('/global-reindex', name: 'admin_global_reindex', methods: ['POST'])]
public function globalReindex(
Request $request,
IngestJobService $jobService,
EntityManagerInterface $em,
): RedirectResponse {
$this->denyAccessUnlessGranted('ROLE_SUPER_ADMIN');
// ---------------------------------------------------------
// 1) Job anlegen (QUEUED)
// ---------------------------------------------------------
$job = $jobService->startJob(
IngestJob::TYPE_GLOBAL_REINDEX,
$this->getUser(),
null,
null,
null,
IngestJob::STATUS_QUEUED
);
if (!$this->isCsrfTokenValid('global_reindex', (string) $request->request->get('_token'))) {
$this->addFlash('danger', 'Ungültiges CSRF-Token.');
// ---------------------------------------------------------
// 2) CLI im Hintergrund starten
// ---------------------------------------------------------
$projectDir = (string)$this->getParameter('kernel.project_dir');
$console = $projectDir . '/bin/console';
$logDir = $projectDir . '/var/log/ingest';
if (!is_dir($logDir)) {
@mkdir($logDir, 0777, true);
return $this->redirectToRoute('admin_jobs');
}
$logFile = $logDir . '/job_' . (string)$job->getId() . '.log';
$php = 'php';
try {
$projectDir = $this->resolveProjectDir();
$console = $projectDir . '/bin/console';
$cmd = sprintf(
'%s %s --no-interaction %s %s >> %s 2>&1 &',
escapeshellcmd($php),
escapeshellarg($console),
escapeshellarg('mto:agent:ingest:run'),
escapeshellarg((string)$job->getId()),
escapeshellarg($logFile),
);
if (!is_file($console)) {
throw new \RuntimeException('bin/console not found: ' . $console);
}
exec($cmd);
$logDir = $projectDir . '/var/log/ingest';
$this->ensureDirectoryExists($logDir);
// ---------------------------------------------------------
// 3) Redirect auf Job-Detailseite (Loader)
// ---------------------------------------------------------
return $this->redirectToRoute('admin_job_show', [
'id' => (string)$job->getId(),
]);
$job = $jobService->startJob(
IngestJob::TYPE_GLOBAL_REINDEX,
$this->getUser(),
null,
null,
null,
IngestJob::STATUS_QUEUED
);
$logFile = $logDir . '/job_' . (string) $job->getId() . '.log';
$job->setLogPath($logFile);
$em->flush();
$phpBinary = $this->resolvePhpBinary();
$cmd = sprintf(
'cd %s && nohup %s %s %s %s --no-interaction >> %s 2>&1 & echo $!',
escapeshellarg($projectDir),
escapeshellarg($phpBinary),
escapeshellarg($console),
escapeshellarg('mto:agent:ingest:run'),
escapeshellarg((string) $job->getId()),
escapeshellarg($logFile),
);
$output = [];
$exitCode = 0;
@exec($cmd, $output, $exitCode);
if ($exitCode !== 0) {
$job->markFailed('Global reindex async bootstrap failed with exit code ' . $exitCode . '.');
$em->flush();
$this->addFlash('danger', 'Global Reindex konnte nicht im Hintergrund gestartet werden.');
return $this->redirectToRoute('admin_job_show', [
'id' => (string) $job->getId(),
]);
}
$this->addFlash('success', 'Global Reindex wurde gestartet.');
return $this->redirectToRoute('admin_job_show', [
'id' => (string) $job->getId(),
]);
} catch (\Throwable $e) {
$this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Global Reindex konnte nicht gestartet werden.'));
return $this->redirectToRoute('admin_jobs');
}
}
}
/**
 * Loads an ingest job by its identifier.
 *
 * The identifier is trimmed before the repository lookup. Anything that does
 * not resolve to an IngestJob instance is reported as a 404.
 *
 * @throws NotFoundHttpException when the repository does not return an IngestJob
 */
private function findJob(string $id, EntityManagerInterface $em): IngestJob
{
    $candidate = $em->getRepository(IngestJob::class)->find(trim($id));

    if ($candidate instanceof IngestJob) {
        return $candidate;
    }

    throw new NotFoundHttpException();
}
/**
 * Reads the `kernel.project_dir` parameter and returns it without trailing slashes.
 *
 * @throws \RuntimeException when the parameter is empty or is not an existing directory
 */
private function resolveProjectDir(): string
{
    $configuredDir = trim((string) $this->getParameter('kernel.project_dir'));

    $isUsable = $configuredDir !== '' && is_dir($configuredDir);
    if (!$isUsable) {
        throw new \RuntimeException('Project directory is invalid.');
    }

    return rtrim($configuredDir, '/');
}
/**
 * Finds a CLI PHP executable to run console commands with.
 *
 * Candidates are probed in this order and the first one accepted by
 * isValidCliPhpBinary() wins:
 *   1. PHP_CLI_BINARY from $_SERVER, $_ENV and getenv()
 *   2. the PHP_BINARY constant of the current process
 *   3. a fixed list of common install locations
 *   4. whatever `command -v php` prints (only consulted when 1-3 all fail)
 *
 * @throws \RuntimeException when no candidate is accepted
 */
private function resolvePhpBinary(): string
{
    $orderedCandidates = [
        // Explicit configuration first.
        trim((string) ($_SERVER['PHP_CLI_BINARY'] ?? '')),
        trim((string) ($_ENV['PHP_CLI_BINARY'] ?? '')),
        trim((string) getenv('PHP_CLI_BINARY')),
        // Then the binary running this very process.
        defined('PHP_BINARY') ? trim((string) PHP_BINARY) : '',
        // Then well-known locations.
        '/usr/bin/php',
        '/usr/local/bin/php',
        '/bin/php',
        '/opt/homebrew/bin/php',
    ];

    foreach ($orderedCandidates as $path) {
        if ($this->isValidCliPhpBinary($path)) {
            return $path;
        }
    }

    // Last resort: ask the shell. Kept out of the list above so the
    // subprocess is only spawned when every cheaper probe has failed.
    $fromShell = trim((string) @shell_exec('command -v php 2>/dev/null'));
    if ($this->isValidCliPhpBinary($fromShell)) {
        return $fromShell;
    }

    throw new \RuntimeException(
        'Could not resolve a CLI PHP binary. Set PHP_CLI_BINARY explicitly, e.g. /usr/bin/php.'
    );
}
/**
 * Decides whether a path may be used as the CLI PHP executable.
 *
 * A path is accepted when, after trimming, it is non-empty, points to a
 * regular file that is executable, and its lower-cased basename contains
 * neither "fpm" nor "cgi".
 */
private function isValidCliPhpBinary(string $path): bool
{
    $candidate = trim($path);

    if ($candidate === '') {
        return false;
    }

    if (!is_file($candidate)) {
        return false;
    }

    if (!is_executable($candidate)) {
        return false;
    }

    $name = strtolower(basename($candidate));

    return !str_contains($name, 'fpm') && !str_contains($name, 'cgi');
}
/**
 * Makes sure the given directory exists, creating it recursively (mode 0775) if needed.
 *
 * The directory is re-checked after a failed mkdir() so that a directory
 * which appeared in the meantime still counts as success.
 *
 * @throws \RuntimeException when the directory neither exists nor could be created
 */
private function ensureDirectoryExists(string $dir): void
{
    $available = is_dir($dir)
        || @mkdir($dir, 0775, true)
        || is_dir($dir);

    if (!$available) {
        throw new \RuntimeException('Could not create ingest log directory.');
    }
}
/**
 * Produces the text shown to the user for a caught throwable.
 *
 * - Exceptions: the trimmed exception message, or $fallback when it is empty.
 * - PHP engine errors (\Error, e.g. TypeError): always $fallback. Their
 *   messages describe code internals (class names, argument positions,
 *   file paths) and must not end up in a flash message.
 *
 * @param \Throwable $e        the throwable caught by the controller action
 * @param string     $fallback generic user-facing message
 */
private function buildSafeErrorMessage(\Throwable $e, string $fallback): string
{
    if ($e instanceof \Error) {
        return $fallback;
    }

    $message = trim($e->getMessage());

    return $message !== '' ? $message : $fallback;
}
}

View File

@@ -6,6 +6,7 @@ namespace App\Controller\Admin;
use App\Entity\TagRebuildJob;
use App\Service\Admin\TagAdminService;
use App\Tag\TagTypes;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\HttpFoundation\RedirectResponse;
use Symfony\Component\HttpFoundation\Request;
@@ -18,41 +19,32 @@ final class TagController extends AbstractController
#[Route('', name: 'admin_tags_index', methods: ['GET'])]
public function index(TagAdminService $svc): Response
{
$data = $svc->getIndexData();
return $this->render('admin/tag/index.html.twig', [
...$data,
'statusRunning' => TagRebuildJob::STATUS_RUNNING,
'statusQueued' => TagRebuildJob::STATUS_QUEUED,
'statusCompleted' => TagRebuildJob::STATUS_COMPLETED,
'statusFailed' => TagRebuildJob::STATUS_FAILED,
...$svc->getIndexData(),
...$this->buildJobStatusViewData(),
]);
}
#[Route('/create', name: 'admin_tags_create', methods: ['POST'])]
public function create(Request $request, TagAdminService $svc): RedirectResponse
{
if (!$this->isCsrfTokenValid(
'admin_tag_create',
$request->request->get('_token')
)) {
$this->addFlash('danger', 'Ungültiges CSRF Token.');
if (!$this->isCsrfTokenValid('admin_tag_create', (string) $request->request->get('_token'))) {
$this->addFlash('danger', 'Ungültiges CSRF-Token.');
return $this->redirectToRoute('admin_tags_index');
}
try {
$svc->create(
(string)$request->request->get('slug', ''),
(string)$request->request->get('label', ''),
$request->request->get('description')
? (string)$request->request->get('description')
: null,
(string)$request->request->get('type', 'generic') // NEU
(string) $request->request->get('slug', ''),
(string) $request->request->get('label', ''),
$this->normalizeNullableString($request->request->get('description')),
TagTypes::normalize((string) $request->request->get('type', TagTypes::GENERIC))
);
$this->addFlash('success', 'Tag wurde erstellt.');
} catch (\Throwable $e) {
$this->addFlash('danger', $e->getMessage());
$this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Tag konnte nicht erstellt werden.'));
}
return $this->redirectToRoute('admin_tags_index');
@@ -61,58 +53,110 @@ final class TagController extends AbstractController
#[Route('/{id}/delete', name: 'admin_tags_delete', methods: ['POST'])]
public function delete(string $id, Request $request, TagAdminService $svc): RedirectResponse
{
if (!$this->isCsrfTokenValid(
'admin_tag_delete_' . $id,
$request->request->get('_token')
)) {
$this->addFlash('danger', 'Ungültiges CSRF Token.');
if (!$this->isCsrfTokenValid('admin_tag_delete_' . $id, (string) $request->request->get('_token'))) {
$this->addFlash('danger', 'Ungültiges CSRF-Token.');
return $this->redirectToRoute('admin_tags_index');
}
try {
$svc->delete($id);
$svc->delete(trim($id));
$this->addFlash('success', 'Tag wurde gelöscht.');
} catch (\Throwable $e) {
$this->addFlash('danger', $e->getMessage());
$this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Tag konnte nicht gelöscht werden.'));
}
return $this->redirectToRoute('admin_tags_index');
}
#[Route('/{id}/assign', name: 'admin_tags_assign', methods: ['GET', 'POST'])]
public function assign(
string $id,
Request $request,
TagAdminService $svc
): Response {
public function assign(string $id, Request $request, TagAdminService $svc): Response
{
$id = trim($id);
if ($request->isMethod('POST')) {
if (!$this->isCsrfTokenValid('assign_tag_' . $id, (string) $request->request->get('_token'))) {
$this->addFlash('danger', 'Ungültiges CSRF-Token.');
if (!$this->isCsrfTokenValid(
'assign_tag_' . $id,
$request->request->get('_token')
)) {
throw $this->createAccessDeniedException();
return $this->redirectToRoute('admin_tags_assign', ['id' => $id]);
}
$svc->syncAssignments(
$id,
$request->request->all('documents') ?? []
);
$this->addFlash('success', 'Zuweisungen aktualisiert.');
try {
$svc->syncAssignments($id, $this->normalizeStringList($request->request->all('documents')));
$this->addFlash('success', 'Zuweisungen aktualisiert.');
} catch (\Throwable $e) {
$this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Zuweisungen konnten nicht aktualisiert werden.'));
}
return $this->redirectToRoute('admin_tags_assign', ['id' => $id]);
}
$data = $svc->getAssignData($id);
try {
$data = $svc->getAssignData($id);
} catch (\Throwable $e) {
$this->addFlash('danger', $this->buildSafeErrorMessage($e, 'Tag konnte nicht geladen werden.'));
return $this->redirectToRoute('admin_tags_index');
}
return $this->render('admin/tag/assign.html.twig', [
...$data,
...$this->buildJobStatusViewData(),
]);
}
/**
 * Turns a raw request value into a trimmed string or null.
 *
 * Returns null when the value is null, is not representable as a string
 * (arrays, objects without __toString), or is empty after trimming.
 * Non-scalar input is rejected up front because casting an array to string
 * yields the literal "Array" plus a warning, and casting a plain object throws.
 *
 * @param mixed $value raw value taken from the request
 */
private function normalizeNullableString(mixed $value): ?string
{
    if (!is_scalar($value) && !$value instanceof \Stringable) {
        return null;
    }

    $normalized = trim((string) $value);

    return $normalized !== '' ? $normalized : null;
}
/**
 * Normalizes a submitted list into unique, trimmed, non-empty strings.
 *
 * Anything that is not an array yields an empty list. Entries that cannot be
 * represented as a string (nested arrays, plain objects) are skipped instead
 * of being cast, because such a cast would inject the literal "Array" (with a
 * warning) or throw. First occurrences win; the result is re-indexed from 0.
 *
 * @param mixed $values raw value taken from the request
 * @return list<string>
 */
private function normalizeStringList(mixed $values): array
{
    if (!is_array($values)) {
        return [];
    }

    $normalized = [];

    foreach ($values as $value) {
        if (!is_scalar($value) && !$value instanceof \Stringable) {
            continue;
        }

        $value = trim((string) $value);

        if ($value === '') {
            continue;
        }

        $normalized[] = $value;
    }

    return array_values(array_unique($normalized));
}
/**
* @return array<string, string>
*/
private function buildJobStatusViewData(): array
{
return [
'statusRunning' => TagRebuildJob::STATUS_RUNNING,
'statusQueued' => TagRebuildJob::STATUS_QUEUED,
'statusCompleted' => TagRebuildJob::STATUS_COMPLETED,
'statusFailed' => TagRebuildJob::STATUS_FAILED,
]);
];
}
/**
 * Produces the text shown to the user for a caught throwable.
 *
 * - Exceptions: the trimmed exception message, or $fallback when it is empty.
 * - PHP engine errors (\Error, e.g. TypeError): always $fallback. Their
 *   messages describe code internals (class names, argument positions,
 *   file paths) and must not end up in a flash message.
 *
 * @param \Throwable $e        the throwable caught by the controller action
 * @param string     $fallback generic user-facing message
 */
private function buildSafeErrorMessage(\Throwable $e, string $fallback): string
{
    if ($e instanceof \Error) {
        return $fallback;
    }

    $message = trim($e->getMessage());

    return $message !== '' ? $message : $fallback;
}
}

View File

@@ -10,38 +10,79 @@ use Symfony\Component\Routing\Attribute\Route;
final class TagRebuildStreamController
{
#[Route('/admin/tags/rebuild/stream', name: 'admin_tags_rebuild_stream')]
private const POLL_INTERVAL_SECONDS = 2;
private const KEEPALIVE_INTERVAL_SECONDS = 10;
#[Route('/admin/tags/rebuild/stream', name: 'admin_tags_rebuild_stream', methods: ['GET'])]
public function stream(TagRebuildStatusProvider $provider): StreamedResponse
{
$response = new StreamedResponse(function () use ($provider) {
$response = new StreamedResponse(function () use ($provider): void {
self::disableOutputBuffering();
echo "event: ping\n";
echo "data: " . json_encode(['init' => true]) . "\n\n";
echo "retry: 3000\n";
self::sendEvent('ping', ['init' => true]);
@ob_flush();
@flush();
$lastPayloadHash = null;
$lastKeepaliveAt = time();
while (!connection_aborted()) {
$data = $provider->getLatestStatus();
if ($data !== null) {
echo "event: message\n";
echo "data: " . json_encode($data) . "\n\n";
$payloadHash = md5(
json_encode($data, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES) ?: 'null'
);
@ob_flush();
@flush();
if ($payloadHash !== $lastPayloadHash) {
self::sendEvent('message', $data);
$lastPayloadHash = $payloadHash;
$lastKeepaliveAt = time();
}
}
sleep(2);
if ((time() - $lastKeepaliveAt) >= self::KEEPALIVE_INTERVAL_SECONDS) {
self::sendEvent('ping', [
'ts' => (new \DateTimeImmutable())->format(DATE_ATOM),
]);
$lastKeepaliveAt = time();
}
sleep(self::POLL_INTERVAL_SECONDS);
}
});
$response->headers->set('Content-Type', 'text/event-stream');
$response->headers->set('Cache-Control', 'no-cache');
$response->headers->set('Cache-Control', 'no-cache, no-store, must-revalidate');
$response->headers->set('Pragma', 'no-cache');
$response->headers->set('Expires', '0');
$response->headers->set('Connection', 'keep-alive');
$response->headers->set('X-Accel-Buffering', 'no');
return $response;
}
/**
 * Flushes and removes every active output buffer so events reach the client immediately.
 *
 * ob_end_flush() returns false when the top-most buffer cannot be removed
 * (for example a buffer started without the "removable" flag). In that case
 * the nesting level never drops, so the loop stops instead of spinning forever.
 */
private static function disableOutputBuffering(): void
{
    while (ob_get_level() > 0) {
        if (!@ob_end_flush()) {
            break;
        }
    }
}
/**
 * Writes one server-sent event to the output and flushes it.
 *
 * The payload is JSON-encoded with unescaped unicode and slashes. When
 * encoding fails, a fixed `{"error":"json_encode_failed"}` payload is sent
 * instead so the stream keeps its framing.
 *
 * @param array<string, mixed> $data
 */
private static function sendEvent(string $event, array $data): void
{
    $encoded = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
    $payload = is_string($encoded) ? $encoded : '{"error":"json_encode_failed"}';

    // One frame: event line, data line, blank line as terminator.
    echo 'event: ' . $event . "\n" . 'data: ' . $payload . "\n\n";

    @ob_flush();
    @flush();
}
}

View File

@@ -8,6 +8,7 @@ use Doctrine\ORM\Mapping as ORM;
#[ORM\Entity]
#[ORM\Table(name: 'document_tag')]
#[ORM\Index(name: 'idx_document_tag_tag_id', columns: ['tag_id'])]
class DocumentTag
{
#[ORM\Id]
@@ -22,8 +23,8 @@ class DocumentTag
public function __construct(Document $document, Tag $tag)
{
$this->document = $document;
$this->tag = $tag;
$this->setDocument($document);
$this->setTag($tag);
}
public function getDocument(): Document
@@ -35,4 +36,20 @@ class DocumentTag
{
return $this->tag;
}
/**
 * Tells whether this relation links exactly the given document and tag.
 *
 * Both sides are compared by identifier via equals(); the tag is only
 * compared when the document already matches.
 */
public function isSameRelation(Document $document, Tag $tag): bool
{
    if (!$this->document->getId()->equals($document->getId())) {
        return false;
    }

    return $this->tag->getId()->equals($tag->getId());
}
/**
 * Stores the document side of the relation.
 *
 * Private: there is no public way to re-point the relation to another document.
 */
private function setDocument(Document $document): void
{
$this->document = $document;
}
/**
 * Stores the tag side of the relation.
 *
 * Private: there is no public way to re-point the relation to another tag.
 */
private function setTag(Tag $tag): void
{
$this->tag = $tag;
}
}

View File

@@ -1,8 +1,12 @@
<?php
declare(strict_types=1);
namespace App\Entity;
use App\Tag\TagTypes;
use Doctrine\ORM\Mapping as ORM;
use InvalidArgumentException;
use Symfony\Component\Uid\Uuid;
#[ORM\Entity]
@@ -24,25 +28,25 @@ class Tag
#[ORM\Column(type: 'text', nullable: true)]
private ?string $description = null;
/**
* NEU: Governance-Typ des Tags
* - generic
* - catalog_entity
*/
#[ORM\Column(length: 50)]
private string $type = 'generic';
private string $type = TagTypes::GENERIC;
#[ORM\Column]
private \DateTimeImmutable $createdAt;
public function __construct(string $slug, string $label, ?string $description = null)
{
public function __construct(
string $slug,
string $label,
?string $description = null,
string $type = TagTypes::GENERIC,
) {
$this->id = Uuid::v4();
$this->createdAt = new \DateTimeImmutable();
$this->slug = $slug;
$this->label = $label;
$this->description = $description;
$this->setSlug($slug);
$this->setLabel($label);
$this->setDescription($description);
$this->setType($type);
}
public function getId(): Uuid
@@ -57,7 +61,14 @@ class Tag
/**
 * Sets the slug after running it through normalizeSlug().
 *
 * @throws InvalidArgumentException when nothing is left after normalization
 */
public function setSlug(string $slug): static
{
    $normalized = $this->normalizeSlug($slug);

    if ('' === $normalized) {
        throw new InvalidArgumentException('Tag slug must not be empty.');
    }

    $this->slug = $normalized;

    return $this;
}
@@ -68,7 +79,14 @@ class Tag
/**
 * Sets the label with surrounding whitespace removed.
 *
 * @throws InvalidArgumentException when the label is empty after trimming
 */
public function setLabel(string $label): static
{
    $trimmed = trim($label);

    if ('' === $trimmed) {
        throw new InvalidArgumentException('Tag label must not be empty.');
    }

    $this->label = $trimmed;

    return $this;
}
@@ -79,7 +97,9 @@ class Tag
public function setDescription(?string $description): static
{
$this->description = $description;
$description = trim((string) $description);
$this->description = $description !== '' ? $description : null;
return $this;
}
@@ -90,13 +110,43 @@ class Tag
public function setType(string $type): static
{
$type = trim($type);
$this->type = $type !== '' ? $type : 'generic';
$normalizedType = TagTypes::normalize($type);
if (!TagTypes::isValid($normalizedType)) {
throw new InvalidArgumentException(sprintf('Unsupported tag type "%s".', $type));
}
$this->type = $normalizedType;
return $this;
}
/**
 * Tells whether the stored type equals TagTypes::GENERIC.
 */
public function isGeneric(): bool
{
return $this->type === TagTypes::GENERIC;
}
/**
 * Tells whether the stored type equals TagTypes::CATALOG_ENTITY.
 */
public function isCatalogEntity(): bool
{
return $this->type === TagTypes::CATALOG_ENTITY;
}
/**
 * Tells whether the stored type equals TagTypes::SALES_SIGNAL.
 */
public function isSalesSignal(): bool
{
return $this->type === TagTypes::SALES_SIGNAL;
}
/**
 * Returns the stored creation timestamp.
 */
public function getCreatedAt(): \DateTimeImmutable
{
return $this->createdAt;
}
/**
 * Brings a raw slug into its stored form.
 *
 * Steps: trim, lower-case (multibyte aware), replace every whitespace run
 * with a single "-", collapse repeated "-", strip leading/trailing "-".
 * If a regex step fails, its input is carried forward unchanged.
 */
private function normalizeSlug(string $slug): string
{
    $lowered = mb_strtolower(trim($slug));

    $dashed = preg_replace('/\s+/u', '-', $lowered) ?? $lowered;
    $collapsed = preg_replace('/-+/u', '-', $dashed) ?? $dashed;

    return trim($collapsed, '-');
}
}

View File

@@ -9,14 +9,16 @@ use Symfony\Component\Uid\Uuid;
#[ORM\Entity]
#[ORM\Table(name: 'tag_rebuild_job')]
#[ORM\Index(columns: ['status'], name: 'idx_tag_rebuild_job_status')]
#[ORM\Index(columns: ['created_at'], name: 'idx_tag_rebuild_job_created_at')]
#[ORM\Index(name: 'idx_tag_rebuild_job_status', columns: ['status'])]
#[ORM\Index(name: 'idx_tag_rebuild_job_created_at', columns: ['created_at'])]
class TagRebuildJob
{
public const STATUS_QUEUED = 'QUEUED';
public const STATUS_RUNNING = 'RUNNING';
public const STATUS_QUEUED = 'QUEUED';
public const STATUS_RUNNING = 'RUNNING';
public const STATUS_COMPLETED = 'COMPLETED';
public const STATUS_FAILED = 'FAILED';
public const STATUS_FAILED = 'FAILED';
private const ERROR_MESSAGE_MAX_LENGTH = 4000;
#[ORM\Id]
#[ORM\Column(type: 'uuid', unique: true)]
@@ -44,6 +46,19 @@ class TagRebuildJob
$this->status = self::STATUS_QUEUED;
}
/**
 * Lists the four status constants declared on this class, in the order
 * QUEUED, RUNNING, COMPLETED, FAILED.
 *
 * @return list<string>
 */
public static function statuses(): array
{
return [
self::STATUS_QUEUED,
self::STATUS_RUNNING,
self::STATUS_COMPLETED,
self::STATUS_FAILED,
];
}
public function getId(): Uuid
{
return $this->id;
@@ -54,24 +69,59 @@ class TagRebuildJob
return $this->status;
}
/**
 * Tells whether the current status is STATUS_QUEUED.
 */
public function isQueued(): bool
{
return $this->status === self::STATUS_QUEUED;
}
/**
 * Tells whether the current status is STATUS_RUNNING.
 */
public function isRunning(): bool
{
return $this->status === self::STATUS_RUNNING;
}
/**
 * Tells whether the current status is STATUS_COMPLETED.
 */
public function isCompleted(): bool
{
return $this->status === self::STATUS_COMPLETED;
}
/**
 * Tells whether the current status is STATUS_FAILED.
 */
public function isFailed(): bool
{
return $this->status === self::STATUS_FAILED;
}
/**
 * Tells whether the job is still pending work, i.e. either queued or running.
 */
public function isActive(): bool
{
return $this->isQueued() || $this->isRunning();
}
/**
 * Moves the job to RUNNING: stamps a fresh start time and clears the
 * finish time and error message.
 */
public function markRunning(): void
{
    // Drop leftovers first, then stamp the new run.
    $this->errorMessage = null;
    $this->finishedAt = null;

    $this->startedAt = new \DateTimeImmutable();
    $this->status = self::STATUS_RUNNING;
}
/**
 * Moves the job to COMPLETED, stamps the finish time and clears the error message.
 *
 * A job that has no start time yet gets one assigned here, so a completed
 * job always carries both timestamps.
 */
public function markCompleted(): void
{
    $neverStarted = $this->startedAt === null;
    if ($neverStarted) {
        $this->startedAt = new \DateTimeImmutable();
    }

    $this->errorMessage = null;
    $this->finishedAt = new \DateTimeImmutable();
    $this->status = self::STATUS_COMPLETED;
}
public function markFailed(string $message): void
{
if ($this->startedAt === null) {
$this->startedAt = new \DateTimeImmutable();
}
$this->status = self::STATUS_FAILED;
$this->finishedAt = new \DateTimeImmutable();
$this->errorMessage = $message;
$this->errorMessage = $this->normalizeErrorMessage($message);
}
public function getCreatedAt(): \DateTimeImmutable
@@ -93,4 +143,19 @@ class TagRebuildJob
{
return $this->errorMessage;
}
/**
 * Prepares a failure message for storage.
 *
 * The message is trimmed; an empty result is replaced by a fixed placeholder,
 * anything longer than ERROR_MESSAGE_MAX_LENGTH characters is cut to that
 * length (multibyte aware).
 */
private function normalizeErrorMessage(string $message): ?string
{
    $trimmed = trim($message);

    if ($trimmed === '') {
        return 'Unknown tag rebuild failure.';
    }

    // mb_substr() returns the string unchanged when it is already short enough.
    return mb_substr($trimmed, 0, self::ERROR_MESSAGE_MAX_LENGTH);
}
}

View File

@@ -6,82 +6,132 @@ namespace App\Intent;
use App\Config\CatalogIntentConfig;
use App\Knowledge\Retrieval\QueryCleaner;
use App\Tag\TagVectorSearchClient;
use App\Tag\TagTypes;
use App\Tag\TagVectorSearchClient;
/**
* CatalogIntentLite
* Lightweight catalog entity detector.
*
* Reiner Entity-Detector.
*
* Verantwortlich nur für:
* - Vector-Tag-Erkennung
* - Score-Gate
* - Ambiguity-Check
* - Sicherstellen, dass TagType = catalog_entity
*
* KEIN:
* - Listen-Signal
* - SalesIntent
* - Routing
* Responsibilities:
* - clean the user query for tag lookup
* - query the tag vector index
* - keep only catalog_entity hits
* - apply confidence and ambiguity gates
* - return one canonical entity label or null
*/
final readonly class CatalogIntentLite
{
/**
* Slightly wider than the old top-3 search so generic tags do not crowd out
* relevant catalog_entity hits too easily.
*/
private const SEARCH_LIMIT = 6;
public function __construct(
private TagVectorSearchClient $tagVectorClient,
private QueryCleaner $queryCleaner
) {}
private QueryCleaner $queryCleaner,
) {
}
/**
* Gibt das canonical Label der erkannten catalog_entity zurück
* oder null, wenn kein sauberer Treffer.
* Returns the canonical normalized label of the detected catalog entity,
* or null when no safe entity match exists.
*/
public function detect(string $prompt): ?string
{
$prompt = trim($prompt);
if ($prompt === '') {
return null;
}
$promptTag = $this->queryCleaner->clean($prompt);
$cleanQuery = trim($this->queryCleaner->clean($prompt));
// 1) Tag-Vector-Suche
$hits = $this->tagVectorClient->search($promptTag, 3);
if ($hits === []) {
if ($cleanQuery === '') {
return null;
}
$best = $hits[0];
$bestScore = (float)($best['score'] ?? 0.0);
$catalogHits = $this->filterCatalogEntityHits(
$this->tagVectorClient->search($cleanQuery, self::SEARCH_LIMIT)
);
if ($catalogHits === []) {
return null;
}
$best = $catalogHits[0];
$bestScore = (float) ($best['score'] ?? 0.0);
// 2) Score-Tags
if ($bestScore < CatalogIntentConfig::MIN_SCORE) {
return null;
}
// 3) Ambiguity-Check
if (isset($hits[1])) {
$secondScore = (float)($hits[1]['score'] ?? 0.0);
if (isset($catalogHits[1])) {
$secondScore = (float) ($catalogHits[1]['score'] ?? 0.0);
if (abs($bestScore - $secondScore) < CatalogIntentConfig::AMBIGUITY_DELTA) {
return null;
}
}
// 4) Nur catalog_entity zulassen
if (($best['tag_type'] ?? null) !== TagTypes::CATALOG_ENTITY) {
return null;
$label = $this->normalizeLabel((string) ($best['label'] ?? ''));
return $label !== '' ? $label : null;
}
/**
* @param array<int, array{
* tag_id:string,
* score:float,
* label?:string,
* tag_type?:string
* }> $hits
*
* @return list<array{
* tag_id:string,
* score:float,
* label?:string,
* tag_type:string
* }>
*/
private function filterCatalogEntityHits(array $hits): array
{
$filtered = [];
foreach ($hits as $hit) {
$tagId = trim((string) ($hit['tag_id'] ?? ''));
$score = (float) ($hit['score'] ?? 0.0);
$tagType = TagTypes::normalize((string) ($hit['tag_type'] ?? TagTypes::GENERIC));
if ($tagId === '') {
continue;
}
if ($tagType !== TagTypes::CATALOG_ENTITY) {
continue;
}
$filtered[] = [
'tag_id' => $tagId,
'score' => $score,
'label' => isset($hit['label']) ? (string) $hit['label'] : null,
'tag_type' => $tagType,
];
}
// 5) Canonical Label
$label = trim((string)($best['label'] ?? ''));
usort(
$filtered,
static fn (array $left, array $right): int => ($right['score'] <=> $left['score'])
);
if ($label === '') {
return null;
}
return $filtered;
}
return mb_strtolower($label);
/**
 * Brings a tag label into canonical form: trimmed, lower-cased (multibyte
 * aware) and with every whitespace run collapsed to a single space.
 * If the regex step fails, the lower-cased label is used as is.
 */
private function normalizeLabel(string $label): string
{
    $lowered = mb_strtolower(trim($label));
    $collapsed = preg_replace('/\s+/u', ' ', $lowered);

    return trim($collapsed ?? $lowered);
}
}

View File

@@ -8,65 +8,99 @@ use App\Entity\Document;
use App\Entity\Tag;
use App\Service\TagRebuildJobService;
use App\Tag\TagService;
use App\Tag\TagTypes;
use Doctrine\ORM\EntityManagerInterface;
use RuntimeException;
final class DocumentTagAdminService
final readonly class DocumentTagAdminService
{
public function __construct(
private readonly EntityManagerInterface $em,
private readonly TagService $tagService,
private readonly TagRebuildJobService $jobs,
) {}
private EntityManagerInterface $em,
private TagService $tagService,
private TagRebuildJobService $jobs,
) {
}
/**
* @return array{
* document: Document,
* allTags: list<Tag>,
* latestJob: mixed
* latestJob: mixed,
* hasActiveJob: bool
* }
*/
public function getEditData(string $documentId): array
{
$document = $this->em->getRepository(Document::class)->find($documentId);
if (!$document instanceof Document) {
throw new \RuntimeException('Document not found');
}
$document = $this->findDocumentById($documentId);
/** @var list<Tag> $allTags */
$allTags = $this->em->createQueryBuilder()
->select('t')
->from(Tag::class, 't')
->orderBy('t.label', 'ASC')
->getQuery()
->getResult();
$allTags = $this->em->getRepository(Tag::class)->findAll();
$latestJob = $this->jobs->getLatestJob();
usort(
$allTags,
static function (Tag $left, Tag $right): int {
$typeOrder = [
TagTypes::CATALOG_ENTITY => 10,
TagTypes::GENERIC => 20,
TagTypes::SALES_SIGNAL => 30,
];
$leftTypeRank = $typeOrder[$left->getType()] ?? 999;
$rightTypeRank = $typeOrder[$right->getType()] ?? 999;
if ($leftTypeRank !== $rightTypeRank) {
return $leftTypeRank <=> $rightTypeRank;
}
$labelComparison = strcasecmp($left->getLabel(), $right->getLabel());
if ($labelComparison !== 0) {
return $labelComparison;
}
return strcmp($left->getSlug(), $right->getSlug());
}
);
return [
'document' => $document,
'allTags' => $allTags,
'latestJob' => $latestJob,
'latestJob' => $this->jobs->getLatestJob(),
'hasActiveJob' => $this->jobs->hasActiveJob(),
];
}
/**
* Speichert die Tag-Auswahl für ein Dokument (inkl. Sync-Logik).
* Persists the selected tag set for a document via the central domain service.
*
* @param array<mixed> $selectedTagIds
*/
public function saveTags(string $documentId, array $selectedTagIds): void
{
$document = $this->em->getRepository(Document::class)->find($documentId);
if (!$document instanceof Document) {
throw new \RuntimeException('Document not found');
}
$document = $this->findDocumentById($documentId);
// Delegation an deine Domain-Logik (bleibt dort, wo sie hingehört)
$this->tagService->syncDocumentTags($document, $selectedTagIds);
}
public function getLatestRebuildStatus(): ?string
{
$job = $this->jobs->getLatestJob();
return $this->jobs->getLatestJob()?->getStatus();
}
return $job?->getStatus();
/**
 * Loads a document by its identifier.
 *
 * The identifier is trimmed first; an empty identifier is rejected without
 * touching the repository.
 *
 * @throws RuntimeException when the identifier is empty or no Document is found
 */
private function findDocumentById(string $documentId): Document
{
    $id = trim($documentId);

    $document = $id !== ''
        ? $this->em->getRepository(Document::class)->find($id)
        : null;

    if ($document instanceof Document) {
        return $document;
    }

    throw new RuntimeException('Document not found.');
}
}

View File

@@ -9,23 +9,29 @@ use App\Entity\DocumentTag;
use App\Entity\Tag;
use App\Service\TagRebuildJobService;
use App\Tag\TagService;
use App\Tag\TagTypes;
use Doctrine\ORM\EntityManagerInterface;
use RuntimeException;
final readonly class TagAdminService
{
public function __construct(
private EntityManagerInterface $em,
private TagService $tagService,
private TagRebuildJobService $jobs,
) {}
private TagService $tagService,
private TagRebuildJobService $jobs,
) {
}
public function getIndexData(): array
{
/** @var list<Tag> $tags */
$tags = $this->em->getRepository(Tag::class)
->findBy([], ['label' => 'ASC']);
->findBy([], ['type' => 'ASC', 'label' => 'ASC']);
return [
'tags' => $tags,
'tagTypeChoices' => TagTypes::choices(),
'documentCountByTagId' => $this->buildDocumentCountByTagId(),
'latestJob' => $this->jobs->getLatestJob(),
'hasActiveJob' => $this->jobs->hasActiveJob(),
];
@@ -35,7 +41,7 @@ final readonly class TagAdminService
string $slug,
string $label,
?string $description,
string $type = 'generic' // NEU
string $type = TagTypes::GENERIC,
): void {
$this->tagService->create($slug, $label, $description, $type);
}
@@ -47,35 +53,47 @@ final readonly class TagAdminService
public function getAssignData(string $tagId): array
{
$tag = $this->em->getRepository(Tag::class)->find($tagId);
$tag = $this->findTagById($tagId);
if (!$tag instanceof Tag) {
throw new \RuntimeException('Tag nicht gefunden.');
}
$documents = $this->em->getRepository(Document::class)->findAll();
/** @var list<Document> $documents */
$documents = $this->em->getRepository(Document::class)->findBy(
['status' => Document::STATUS_ACTIVE],
['title' => 'ASC']
);
$documentsData = array_map(
fn(Document $d) => [
'id' => (string)$d->getId(),
'title' => $d->getTitle(),
static fn (Document $document): array => [
'id' => (string) $document->getId(),
'title' => $document->getTitle(),
],
$documents
);
/** @var list<DocumentTag> $existingRelations */
$existingRelations = $this->em
->getRepository(DocumentTag::class)
->findBy(['tag' => $tag]);
$assignedDocIds = array_map(
fn(DocumentTag $dt) => (string)$dt->getDocument()->getId(),
$existingRelations
$activeDocumentIds = array_map(
static fn (Document $document): string => (string) $document->getId(),
$documents
);
$assignedDocIds = [];
foreach ($existingRelations as $relation) {
$documentId = (string) $relation->getDocument()->getId();
if (in_array($documentId, $activeDocumentIds, true)) {
$assignedDocIds[] = $documentId;
}
}
return [
'tag' => $tag,
'documents' => $documentsData,
'assignedDocIds' => $assignedDocIds,
'assignedDocIds' => array_values(array_unique($assignedDocIds)),
'tagTypeChoices' => TagTypes::choices(),
'latestJob' => $this->jobs->getLatestJob(),
'hasActiveJob' => $this->jobs->hasActiveJob(),
];
@@ -83,12 +101,55 @@ final readonly class TagAdminService
public function syncAssignments(string $tagId, array $selectedDocIds): void
{
$tag = $this->findTagById($tagId);
$this->tagService->syncTagDocuments($tag, $selectedDocIds);
}
private function findTagById(string $tagId): Tag
{
$tagId = trim($tagId);
if ($tagId === '') {
throw new RuntimeException('Tag nicht gefunden.');
}
$tag = $this->em->getRepository(Tag::class)->find($tagId);
if (!$tag instanceof Tag) {
throw new \RuntimeException('Tag nicht gefunden.');
throw new RuntimeException('Tag nicht gefunden.');
}
$this->tagService->syncTagDocuments($tag, $selectedDocIds);
return $tag;
}
/**
 * Counts, per tag, the assigned documents whose status is Document::STATUS_ACTIVE.
 *
 * Both joins are LEFT joins, so tags without any matching document still
 * produce a row (with a count of 0). Keys are the tag ids in RFC 4122 form.
 *
 * @return array<string, int>
 */
private function buildDocumentCountByTagId(): array
{
    $query = $this->em->createQueryBuilder()
        ->select('t AS tag', 'COUNT(d.id) AS documentCount')
        ->from(Tag::class, 't')
        ->leftJoin(DocumentTag::class, 'dt', 'WITH', 'dt.tag = t')
        ->leftJoin('dt.document', 'd', 'WITH', 'd.status = :status')
        ->groupBy('t.id')
        ->setParameter('status', Document::STATUS_ACTIVE)
        ->getQuery();

    $countsByTagId = [];

    foreach ($query->getResult() as $resultRow) {
        // The entity may be exposed under index 0 or under its alias.
        $tagEntity = $resultRow[0] ?? $resultRow['tag'] ?? null;

        if ($tagEntity instanceof Tag) {
            $countsByTagId[$tagEntity->getId()->toRfc4122()] = (int) ($resultRow['documentCount'] ?? 0);
        }
    }

    return $countsByTagId;
}
}

View File

@@ -1,29 +1,33 @@
<?php
declare(strict_types=1);
namespace App\Service;
use App\Entity\Document;
use App\Entity\DocumentVersion;
use App\Entity\User;
use Doctrine\ORM\EntityManagerInterface;
use RuntimeException;
class DocumentService
final readonly class DocumentService
{
public function __construct(
private EntityManagerInterface $em,
) {}
private TagRebuildJobService $tagRebuildJobService,
) {
}
/**
* Erstellt ein neues Dokument inkl. Version 1
* Creates a new document including version 1.
*/
public function createDocument(
string $title,
string $filePath,
User $user
): Document {
$document = new Document();
$document->setTitle($title);
$document->setTitle(trim($title));
$document->setCreatedBy($user);
$version = new DocumentVersion();
@@ -44,14 +48,13 @@ class DocumentService
}
/**
* Fügt neue Version hinzu (immutable)
* Adds a new immutable version to an existing document.
*/
public function addVersion(
Document $document,
string $filePath,
User $user
): DocumentVersion {
$nextVersionNumber = $this->getNextVersionNumber($document);
$version = new DocumentVersion();
@@ -70,7 +73,7 @@ class DocumentService
}
/**
* Aktiviert eine Version
* Activates a document version and marks it for re-ingest.
*/
public function activateVersion(DocumentVersion $version): void
{
@@ -82,41 +85,77 @@ class DocumentService
$version->setActive(true);
$document->setCurrentVersion($version);
$version->setIngestStatus(DocumentVersion::INGEST_PENDING);
$this->em->flush();
}
/**
* Archiviert Dokument
* Archives a document.
*
* If the document had tag assignments, the tag index is rebuilt so the
* routing layer no longer works with an outdated active document set.
*/
public function archive(Document $document): void
{
if ($document->getStatus() === Document::STATUS_ARCHIVED) {
return;
}
$shouldRebuildTags = $this->hasTagAssignments($document);
$document->archive();
$this->em->flush();
}
public function delete(Document $document): void
{
$this->em->remove($document);
$this->em->flush();
if ($shouldRebuildTags) {
$this->triggerTagRebuildIfIdle();
}
}
/**
* Berechnet SHA256 Checksum
* Deletes a document.
*
* If the document had tag assignments, the tag index is rebuilt after the
* removal so stale document references disappear from tag-based routing.
*/
public function delete(Document $document): void
{
    // Must be read before the removal: afterwards the assignments are gone.
    $hadTagAssignments = $this->hasTagAssignments($document);

    $this->em->remove($document);
    $this->em->flush();

    if (!$hadTagAssignments) {
        return;
    }

    // The removal is already flushed at this point; the rebuild only refreshes the tag index.
    $this->triggerTagRebuildIfIdle();
}
/**
* Calculates the SHA256 checksum for a file path.
*/
private function calculateChecksum(string $filePath): string
{
if (!file_exists($filePath)) {
throw new \RuntimeException('File not found for checksum.');
$filePath = trim($filePath);
if ($filePath === '') {
throw new RuntimeException('File path must not be empty.');
}
return hash_file('sha256', $filePath);
if (!is_file($filePath)) {
throw new RuntimeException('File not found for checksum.');
}
$checksum = hash_file('sha256', $filePath);
if ($checksum === false) {
throw new RuntimeException('Could not calculate file checksum.');
}
return $checksum;
}
/**
* Ermittelt nächste Versionsnummer
* Determines the next version number for a document.
*/
private function getNextVersionNumber(Document $document): int
{
@@ -128,4 +167,16 @@ class DocumentService
return $max + 1;
}
}
/**
 * Reports whether the document's getDocumentTags() collection holds at
 * least one entry.
 */
private function hasTagAssignments(Document $document): bool
{
    $assignmentCount = $document->getDocumentTags()->count();

    return $assignmentCount > 0;
}
/**
 * Enqueues and starts a tag rebuild job unless the job service already
 * reports an active one.
 */
private function triggerTagRebuildIfIdle(): void
{
    if ($this->tagRebuildJobService->hasActiveJob()) {
        return;
    }

    $this->tagRebuildJobService->enqueueAndStartAsync();
}
}

View File

@@ -11,16 +11,24 @@ use Psr\Log\LoggerInterface;
final readonly class TagRebuildJobService
{
/**
* Wenn ein QUEUED-Job länger nicht startet, gilt er als "stale" und wird auf FAILED gesetzt,
* damit das System nicht dauerhaft blockiert.
* If a QUEUED job does not transition into RUNNING in time,
* it is treated as stale so the system does not stay blocked forever.
*/
private const STALE_QUEUED_AFTER_SECONDS = 300; // 5 Minuten
private const STALE_QUEUED_AFTER_SECONDS = 300;
/**
* The background runner should switch the job from QUEUED to RUNNING almost
* immediately because markRunning() happens at the top of the command.
*/
private const ASYNC_START_TIMEOUT_SECONDS = 3;
private const ASYNC_START_POLL_INTERVAL_MICROSECONDS = 250000;
public function __construct(
private EntityManagerInterface $em,
private LoggerInterface $agentLogger,
private string $projectDir,
) {}
private LoggerInterface $agentLogger,
private string $projectDir,
) {
}
public function enqueueAndStartAsync(): TagRebuildJob
{
@@ -29,14 +37,25 @@ final readonly class TagRebuildJobService
$this->em->persist($job);
$this->em->flush();
$this->startAsync($job);
try {
$this->startAsync($job);
} catch (\Throwable $e) {
$job->markFailed('Async tag rebuild start failed: ' . $e->getMessage());
$this->em->flush();
$this->agentLogger->error('[tags] async job start failed', [
'job' => (string) $job->getId(),
'error' => $e->getMessage(),
]);
throw $e;
}
return $job;
}
public function enqueueIfIdle(): ?TagRebuildJob
{
// Coalescing: Wenn ein Job läuft oder queued ist -> nichts tun
if ($this->hasActiveJob()) {
return null;
}
@@ -44,23 +63,18 @@ final readonly class TagRebuildJobService
return $this->enqueueAndStartAsync();
}
/**
 * Returns the most recently created job, whatever its status, or null when
 * no job exists. Ties on createdAt are broken by id (descending).
 */
public function getLatestJob(): ?TagRebuildJob
{
    $latestJobQuery = $this->em->createQueryBuilder()
        ->select('j')
        ->from(TagRebuildJob::class, 'j')
        ->orderBy('j.createdAt', 'DESC')
        ->addOrderBy('j.id', 'DESC')
        ->setMaxResults(1)
        ->getQuery();

    return $latestJobQuery->getOneOrNullResult();
}
/**
* Letzter Job mit Status COMPLETED.
*/
public function getLatestCompletedJob(): ?TagRebuildJob
{
return $this->em->createQueryBuilder()
@@ -69,18 +83,12 @@ final readonly class TagRebuildJobService
->where('j.status = :status')
->setParameter('status', TagRebuildJob::STATUS_COMPLETED)
->orderBy('j.createdAt', 'DESC')
->addOrderBy('j.id', 'DESC')
->setMaxResults(1)
->getQuery()
->getOneOrNullResult();
}
/**
* Ob gerade ein Job aktiv ist:
* - RUNNING ist immer aktiv
* - QUEUED ist nur aktiv, wenn er nicht stale ist
*
* Zusätzlich: stale QUEUED Jobs werden auf FAILED gesetzt (Recovery).
*/
public function hasActiveJob(): bool
{
$this->markStaleQueuedJobsFailed();
@@ -106,31 +114,33 @@ final readonly class TagRebuildJobService
return (int) $qb->getQuery()->getSingleScalarResult() > 0;
}
/**
* Startet den Job async über bin/console.
* Wichtige Fixes:
* - php explizit verwenden
* - --no-interaction
* - Logfile statt /dev/null
*/
private function startAsync(TagRebuildJob $job): void
{
$projectDir = rtrim($this->projectDir, '/');
$console = $projectDir . '/bin/console';
$projectDir = rtrim(trim($this->projectDir), '/');
$console = $projectDir . '/bin/console';
if ($projectDir === '' || !is_dir($projectDir)) {
throw new \RuntimeException('Project directory is invalid.');
}
if (!is_file($console)) {
throw new \RuntimeException('bin/console not found: ' . $console);
}
$phpBinary = $this->resolvePhpBinary();
$jobId = (string) $job->getId();
$logDir = $projectDir . '/var/log/tags';
if (!is_dir($logDir)) {
@mkdir($logDir, 0777, true);
if (!is_dir($logDir) && !@mkdir($logDir, 0775, true) && !is_dir($logDir)) {
throw new \RuntimeException('Could not create tag job log directory.');
}
$logFile = $logDir . '/job_' . $jobId . '.log';
// Robust: cd ins Projekt, dann nohup php bin/console ...
$cmd = sprintf(
'cd %s && nohup %s %s %s %s --no-interaction >> %s 2>&1 &',
'cd %s && nohup %s %s %s %s --no-interaction >> %s 2>&1 & echo $!',
escapeshellarg($projectDir),
escapeshellcmd('php'),
escapeshellarg($phpBinary),
escapeshellarg($console),
escapeshellarg('mto:agent:tags:job:run'),
escapeshellarg($jobId),
@@ -141,15 +151,92 @@ final readonly class TagRebuildJobService
'job' => $jobId,
'cmd' => $cmd,
'log' => $logFile,
'php_binary' => $phpBinary,
]);
@exec($cmd);
$output = [];
$exitCode = 0;
@exec($cmd, $output, $exitCode);
$pid = isset($output[0]) ? trim((string) $output[0]) : '';
if ($exitCode !== 0) {
throw new \RuntimeException('Async process bootstrap failed with exit code ' . $exitCode . '.');
}
if ($pid === '' || !ctype_digit($pid)) {
throw new \RuntimeException('Async process bootstrap did not return a valid PID.');
}
$this->agentLogger->info('[tags] async job process started', [
'job' => $jobId,
'pid' => $pid,
'log' => $logFile,
'php_binary' => $phpBinary,
]);
$this->waitForAsyncJobTransition($job, $logFile);
}
/**
 * Picks the PHP executable used to launch the background job runner.
 *
 * Candidates are tried in this order and the first one accepted by
 * isValidCliPhpBinary() wins:
 *   1. PHP_CLI_BINARY from $_SERVER, $_ENV and getenv()
 *   2. the PHP_BINARY constant
 *   3. a short list of fixed installation paths
 *   4. whatever `command -v php` prints
 *
 * @throws \RuntimeException when no candidate is accepted
 */
private function resolvePhpBinary(): string
{
    $orderedCandidates = [
        trim((string) ($_SERVER['PHP_CLI_BINARY'] ?? '')),
        trim((string) ($_ENV['PHP_CLI_BINARY'] ?? '')),
        trim((string) getenv('PHP_CLI_BINARY')),
        defined('PHP_BINARY') ? trim((string) PHP_BINARY) : '',
        '/usr/bin/php',
        '/usr/local/bin/php',
        '/bin/php',
        '/opt/homebrew/bin/php',
    ];

    foreach ($orderedCandidates as $binary) {
        if ($this->isValidCliPhpBinary($binary)) {
            return $binary;
        }
    }

    // Shell out only as a last resort, after every cheaper candidate failed.
    $fromShellLookup = trim((string) @shell_exec('command -v php 2>/dev/null'));
    if (!$this->isValidCliPhpBinary($fromShellLookup)) {
        throw new \RuntimeException(
            'Could not resolve a CLI PHP binary. Set PHP_CLI_BINARY explicitly, e.g. /usr/bin/php.'
        );
    }

    return $fromShellLookup;
}
/**
 * Accepts a path only if it is an existing, executable regular file whose
 * basename contains neither "fpm" nor "cgi" (case-insensitive).
 */
private function isValidCliPhpBinary(string $path): bool
{
    $candidate = trim($path);

    if ($candidate === '') {
        return false;
    }

    if (!is_file($candidate) || !is_executable($candidate)) {
        return false;
    }

    $fileName = strtolower(basename($candidate));

    return !str_contains($fileName, 'fpm') && !str_contains($fileName, 'cgi');
}
/**
* Recovery gegen "ewig QUEUED":
* Setzt alte QUEUED Jobs auf FAILED, damit enqueueIfIdle() nicht dauerhaft blockiert.
*/
private function markStaleQueuedJobsFailed(): void
{
$cutoff = new \DateTimeImmutable('-' . self::STALE_QUEUED_AFTER_SECONDS . ' seconds');
@@ -161,12 +248,13 @@ final readonly class TagRebuildJobService
->andWhere('j.createdAt < :cutoff')
->setParameter('queued', TagRebuildJob::STATUS_QUEUED)
->setParameter('cutoff', $cutoff)
->orderBy('j.createdAt', 'ASC')
->setMaxResults(25);
/** @var TagRebuildJob[] $stale */
/** @var list<TagRebuildJob> $stale */
$stale = $qb->getQuery()->getResult();
if (!$stale) {
if ($stale === []) {
return;
}
@@ -183,4 +271,46 @@ final readonly class TagRebuildJobService
$this->em->flush();
}
/**
 * Polls the job until it is no longer reported as queued, or gives up
 * after ASYNC_START_TIMEOUT_SECONDS.
 *
 * Each round sleeps for the poll interval, refreshes the entity through
 * the entity manager, and returns as soon as isQueued() is false. Only the
 * "not queued" condition is checked; the status the job moved to is not
 * inspected here.
 *
 * @throws \RuntimeException when the job is still queued at the deadline;
 *                           the message carries the log tail if one is readable
 */
private function waitForAsyncJobTransition(TagRebuildJob $job, string $logFile): void
{
    $giveUpAt = microtime(true) + self::ASYNC_START_TIMEOUT_SECONDS;

    while (microtime(true) < $giveUpAt) {
        usleep(self::ASYNC_START_POLL_INTERVAL_MICROSECONDS);
        $this->em->refresh($job);

        if ($job->isQueued()) {
            continue;
        }

        return;
    }

    $message = 'Async tag rebuild runner did not transition from QUEUED to RUNNING within '
        . self::ASYNC_START_TIMEOUT_SECONDS
        . ' seconds.';

    $logTail = $this->readLogTail($logFile);
    if ($logTail !== null) {
        $message .= ' Log tail: ' . $logTail;
    }

    throw new \RuntimeException($message);
}
/**
 * Returns the last 800 characters of the log file as a single line, with
 * every whitespace run collapsed to one space.
 *
 * @return string|null null when the file is missing, unreadable, or
 *                     contains only whitespace
 */
private function readLogTail(string $logFile): ?string
{
    $isReadableFile = is_file($logFile) && is_readable($logFile);
    if (!$isReadableFile) {
        return null;
    }

    $raw = @file_get_contents($logFile);
    if (!is_string($raw)) {
        return null;
    }

    $trimmed = trim($raw);
    if ($trimmed === '') {
        return null;
    }

    $lastChars = mb_substr($trimmed, -800);
    // preg_replace() yields null on failure; keep the raw tail in that case.
    $singleLine = trim(preg_replace('/\s+/u', ' ', $lastChars) ?? $lastChars);

    return $singleLine === '' ? null : $singleLine;
}
}

View File

@@ -11,29 +11,76 @@ final readonly class TagRebuildStatusProvider
{
public function __construct(
private EntityManagerInterface $em
) {}
) {
}
public function getLatestStatus(): ?array
{
$this->em->clear();
$job = $this->em->createQueryBuilder()
->select('j')
$row = $this->em->createQueryBuilder()
->select(
'j.status AS status',
'j.createdAt AS createdAt',
'j.startedAt AS startedAt',
'j.finishedAt AS finishedAt',
'j.errorMessage AS errorMessage'
)
->from(TagRebuildJob::class, 'j')
->orderBy('j.createdAt', 'DESC')
->addOrderBy('j.id', 'DESC')
->setMaxResults(1)
->getQuery()
->getOneOrNullResult();
->getOneOrNullResult(\Doctrine\ORM\Query::HYDRATE_ARRAY);
if (!$job instanceof TagRebuildJob) {
if (!is_array($row)) {
return null;
}
$status = trim((string) ($row['status'] ?? ''));
if ($status === '') {
return null;
}
return [
'status' => $job->getStatus(),
'startedAt' => $job->getStartedAt()?->format(DATE_ATOM),
'finishedAt' => $job->getFinishedAt()?->format(DATE_ATOM),
'error' => $job->getErrorMessage(),
'status' => $status,
'createdAt' => $this->formatDateValue($row['createdAt'] ?? null),
'startedAt' => $this->formatDateValue($row['startedAt'] ?? null),
'finishedAt' => $this->formatDateValue($row['finishedAt'] ?? null),
'error' => $this->normalizeNullableString($row['errorMessage'] ?? null),
'hasActiveJob' => in_array($status, [
TagRebuildJob::STATUS_QUEUED,
TagRebuildJob::STATUS_RUNNING,
], true),
];
}
/**
 * Renders a date-like value in DATE_ATOM format.
 *
 * DateTimeInterface instances are formatted directly. Non-empty strings
 * are parsed with DateTimeImmutable. Everything else, including strings
 * that fail to parse, yields null.
 */
private function formatDateValue(mixed $value): ?string
{
    if ($value instanceof \DateTimeInterface) {
        return $value->format(DATE_ATOM);
    }

    if (!is_string($value)) {
        return null;
    }

    $text = trim($value);
    if ($text === '') {
        return null;
    }

    try {
        $parsed = new \DateTimeImmutable($text);

        return $parsed->format(DATE_ATOM);
    } catch (\Throwable) {
        // Unparseable input is treated the same as a missing date.
        return null;
    }
}
/**
 * Converts a raw value into a trimmed string, or null when nothing usable
 * remains.
 *
 * Scalars and Stringable objects are cast to string. Anything else (null,
 * arrays, plain objects, resources) yields null. A blind string cast on
 * those would produce the literal "Array" with a warning, or throw an
 * Error for objects without __toString().
 */
private function normalizeNullableString(mixed $value): ?string
{
    if (!is_scalar($value) && !$value instanceof \Stringable) {
        return null;
    }

    $text = trim((string) $value);

    return $text !== '' ? $text : null;
}
}

View File

@@ -4,6 +4,7 @@ declare(strict_types=1);
namespace App\Tag;
use App\Entity\Document;
use App\Entity\DocumentTag;
use App\Entity\Tag;
use Doctrine\ORM\EntityManagerInterface;
@@ -12,148 +13,199 @@ final readonly class TagNdjsonExporter
{
public function __construct(
private EntityManagerInterface $em,
private string $tagsNdjsonPath,
) {}
private string $tagsNdjsonPath,
) {
}
/**
* Export all tags into NDJSON (streaming) with atomic switch (.tmp + rename()).
* Export all relevant tags into NDJSON (streaming) with atomic switch (.tmp + rename()).
*
* Line format:
* {
* "tag_id":"...",
* "text":"label\nslug\noptional description",
* "type":"catalog_entity|generic|...",
* "type":"catalog_entity|generic|sales_signal",
* "document_ids":["...","..."]
* }
*
* Only ACTIVE document assignments are exported. Tags without active document
* assignments are intentionally skipped so they do not influence retrieval.
*
* @return array{tags:int, lines:int, bytes:int, path:string}
*/
public function export(): array
{
$dir = \dirname($this->tagsNdjsonPath);
if (!\is_dir($dir)) {
@\mkdir($dir, 0775, true);
}
$this->ensureTargetDirectoryExists();
$tmpPath = $this->tagsNdjsonPath . '.tmp';
$this->cleanupTemporaryFile($tmpPath);
$fh = @\fopen($tmpPath, 'wb');
if (!$fh) {
$fh = @fopen($tmpPath, 'wb');
if ($fh === false) {
throw new \RuntimeException('Cannot write tags NDJSON: ' . $tmpPath);
}
// 1) Load all tags
$tags = $this->em->createQueryBuilder()
->select('t')
->from(Tag::class, 't')
->orderBy('t.label', 'ASC')
->getQuery()
->getResult();
try {
/** @var list<Tag> $tags */
$tags = $this->em->createQueryBuilder()
->select('t')
->from(Tag::class, 't')
->orderBy('t.type', 'ASC')
->addOrderBy('t.label', 'ASC')
->getQuery()
->getResult();
if (!\is_array($tags) || $tags === []) {
\fclose($fh);
if ($tags === []) {
fclose($fh);
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
return [
'tags' => 0,
'lines' => 0,
'bytes' => (int) @filesize($this->tagsNdjsonPath),
'path' => $this->tagsNdjsonPath,
];
}
$tagToActiveDocs = $this->buildActiveDocumentMap();
$lines = 0;
foreach ($tags as $tag) {
$tagId = $tag->getId()->toRfc4122();
$docIds = $tagToActiveDocs[$tagId] ?? [];
if ($docIds === []) {
continue;
}
$line = [
'tag_id' => $tagId,
'text' => $this->buildEmbeddingText($tag),
'type' => TagTypes::normalize($tag->getType()),
'document_ids' => $docIds,
];
$json = json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
if (!is_string($json)) {
continue;
}
fwrite($fh, $json . "\n");
$lines++;
}
fclose($fh);
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
return [
'tags' => 0,
'lines' => 0,
'bytes' => (int) @\filesize($this->tagsNdjsonPath),
'path' => $this->tagsNdjsonPath,
'tags' => count($tags),
'lines' => $lines,
'bytes' => (int) @filesize($this->tagsNdjsonPath),
'path' => $this->tagsNdjsonPath,
];
}
} catch (\Throwable $e) {
fclose($fh);
$this->cleanupTemporaryFile($tmpPath);
// 2) Build tagId => docIds map
$rows = $this->em->createQueryBuilder()
->select('IDENTITY(dt.tag) AS tagId', 'IDENTITY(dt.document) AS docId')
throw $e;
}
}
/**
* @return array<string, list<string>>
*/
private function buildActiveDocumentMap(): array
{
/** @var list<DocumentTag> $relations */
$relations = $this->em->createQueryBuilder()
->select('dt')
->addSelect('t', 'd')
->from(DocumentTag::class, 'dt')
->innerJoin('dt.tag', 't')
->innerJoin('dt.document', 'd')
->where('d.status = :status')
->setParameter('status', Document::STATUS_ACTIVE)
->getQuery()
->getArrayResult();
->getResult();
$tagToDocs = [];
foreach ($rows as $r) {
$tagId = (string) ($r['tagId'] ?? '');
$docId = (string) ($r['docId'] ?? '');
if ($tagId === '' || $docId === '') {
continue;
}
$tagToDocs[$tagId][] = $docId;
foreach ($relations as $relation) {
$tag = $relation->getTag();
$document = $relation->getDocument();
$tagId = $tag->getId()->toRfc4122();
$docId = $document->getId()->toRfc4122();
$tagToDocs[$tagId][$docId] = $docId;
}
// 3) Stream NDJSON
$lines = 0;
foreach ($tags as $tag) {
if (!$tag instanceof Tag) {
continue;
}
$tagId = (string) $tag->getId();
$docIds = $tagToDocs[$tagId] ?? [];
if ($docIds !== []) {
$docIds = \array_values(\array_unique($docIds));
}
// Embedding source
$textParts = [
$tag->getLabel(),
$tag->getSlug(),
];
$desc = $tag->getDescription();
if (\is_string($desc) && \trim($desc) !== '') {
$textParts[] = \trim($desc);
}
$type = method_exists($tag, 'getType')
? (string) $tag->getType()
: 'generic';
if ($type === '') {
$type = 'generic';
}
$line = [
'tag_id' => $tagId,
'text' => \implode("\n", $textParts),
'type' => $type, // 🔥 NEW
'document_ids' => $docIds,
];
$json = \json_encode($line, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
if (!\is_string($json)) {
continue;
}
\fwrite($fh, $json . "\n");
$lines++;
foreach ($tagToDocs as $tagId => $docIds) {
ksort($docIds);
$tagToDocs[$tagId] = array_values($docIds);
}
\fclose($fh);
$this->atomicReplace($tmpPath, $this->tagsNdjsonPath);
return $tagToDocs;
}
return [
'tags' => \count($tags),
'lines' => $lines,
'bytes' => (int) @\filesize($this->tagsNdjsonPath),
'path' => $this->tagsNdjsonPath,
/**
 * Builds the newline-separated text for a tag's "text" field.
 *
 * The parts are the trimmed label, the trimmed slug, and the description
 * with whitespace runs collapsed to single spaces. Empty parts and exact
 * duplicates are dropped; the first occurrence keeps its position.
 */
private function buildEmbeddingText(Tag $tag): string
{
    $candidates = [
        trim($tag->getLabel()),
        trim($tag->getSlug()),
    ];

    $rawDescription = trim((string) $tag->getDescription());
    if ($rawDescription !== '') {
        // Fall back to the untouched description if the regex engine fails.
        $candidates[] = preg_replace('/\s+/u', ' ', $rawDescription) ?? $rawDescription;
    }

    $lines = [];
    foreach ($candidates as $candidate) {
        if ($candidate === '' || in_array($candidate, $lines, true)) {
            continue;
        }

        $lines[] = $candidate;
    }

    return implode("\n", $lines);
}
/**
 * Makes sure the directory that will hold the NDJSON file exists, creating
 * it recursively (mode 0775) when needed.
 *
 * @throws \RuntimeException when the directory is absent and cannot be created
 */
private function ensureTargetDirectoryExists(): void
{
    $targetDir = dirname($this->tagsNdjsonPath);

    // The trailing is_dir() re-check covers the case where the directory
    // exists even though mkdir() reported failure.
    $directoryReady = is_dir($targetDir)
        || @mkdir($targetDir, 0775, true)
        || is_dir($targetDir);

    if (!$directoryReady) {
        throw new \RuntimeException('Cannot create tags NDJSON directory: ' . $targetDir);
    }
}
/**
 * Removes the given temporary file when it exists as a regular file.
 * Failures of unlink() are suppressed and not reported.
 */
private function cleanupTemporaryFile(string $tmpPath): void
{
    if (!is_file($tmpPath)) {
        return;
    }

    @unlink($tmpPath);
}
private function atomicReplace(string $tmpPath, string $finalPath): void
{
if (\is_file($finalPath)) {
@\chmod($finalPath, 0664);
if (is_file($finalPath)) {
@chmod($finalPath, 0664);
}
if (!@\rename($tmpPath, $finalPath)) {
if (!@\copy($tmpPath, $finalPath)) {
@\unlink($tmpPath);
if (!@rename($tmpPath, $finalPath)) {
if (!@copy($tmpPath, $finalPath)) {
@unlink($tmpPath);
throw new \RuntimeException('Atomic replace failed for: ' . $finalPath);
}
@\unlink($tmpPath);
@unlink($tmpPath);
}
@\chmod($finalPath, 0664);
@chmod($finalPath, 0664);
}
}

View File

@@ -4,6 +4,7 @@ declare(strict_types=1);
namespace App\Tag;
use App\Entity\Document;
use Doctrine\DBAL\ArrayParameterType;
use Doctrine\DBAL\Exception;
use Doctrine\ORM\EntityManagerInterface;
@@ -11,91 +12,239 @@ use Symfony\Component\Uid\Uuid;
final class TagRoutingService
{
/**
* Number of raw tag hits requested from the vector service.
*/
private const DEFAULT_TOPK = 8;
private const MIN_BEST_SCORE = 0.25;
private const MAX_CANDIDATE_DOCS = 200;
/**
* Hard minimum confidence required to activate tag-based document routing.
*
* This intentionally aligns with the tag vector client gate to avoid
* misleading secondary thresholds in this class.
*/
private const MIN_BEST_SCORE = 0.72;
/**
* Only keep tag hits that stay reasonably close to the best hit.
* This reduces semantic spillover into weakly related document spaces.
*/
private const MAX_SCORE_DROP_FROM_BEST = 0.08;
/**
* Maximum number of tag hits that may influence routing.
*/
private const MAX_ROUTING_TAGS = 5;
/**
* Maximum number of candidate documents passed into scoped chunk search.
*/
private const MAX_CANDIDATE_DOCS = 80;
/**
* Small bonus for documents matched by multiple routed tags.
*/
private const MULTI_TAG_BONUS_PER_EXTRA_TAG = 0.05;
private const MAX_MULTI_TAG_BONUS = 0.15;
public function __construct(
private readonly TagVectorSearchClient $tagSearch,
private readonly EntityManagerInterface $em,
) {}
) {
}
/**
* @return string[]|null
* Returns ordered active document ids for tag-scoped retrieval.
*
* The method intentionally returns only document ids so the current
* retriever pipeline can stay unchanged.
*
* @return list<string>|null
* @throws Exception
*/
public function route(string $query): ?array
{
$query = trim($query);
if ($query === '') {
return null;
}
$hits = $this->tagSearch->search($query, self::DEFAULT_TOPK);
$hits = $this->filterRoutingHits(
$this->tagSearch->search($query, self::DEFAULT_TOPK)
);
if (!is_array($hits) || $hits === []) {
if ($hits === []) {
return null;
}
$bestScore = (float)($hits[0]['score'] ?? 0.0);
if ($bestScore < self::MIN_BEST_SCORE) {
return null;
}
// Convert tag UUID strings to binary(16)
$tagBinaryIds = [];
$tagMetaById = [];
foreach ($hits as $hit) {
$id = (string)($hit['tag_id'] ?? '');
if ($id === '') {
$tagId = (string) ($hit['tag_id'] ?? '');
if ($tagId === '') {
continue;
}
try {
$tagBinaryIds[] = Uuid::fromString($id)->toBinary();
$tagBinaryIds[] = Uuid::fromString($tagId)->toBinary();
} catch (\Throwable) {
continue;
}
$tagMetaById[$tagId] = [
'score' => (float) $hit['score'],
'weight' => $this->resolveTypeWeight((string) $hit['tag_type']),
];
}
if ($tagBinaryIds === []) {
return null;
}
// Direct DBAL query (binary-safe)
$conn = $this->em->getConnection();
$rows = $conn->executeQuery(
'SELECT document_id
FROM document_tag
WHERE tag_id IN (:tagIds)',
['tagIds' => $tagBinaryIds],
['tagIds' => ArrayParameterType::BINARY]
$rows = $this->em->getConnection()->executeQuery(
'SELECT dt.document_id, dt.tag_id
FROM document_tag dt
INNER JOIN document d ON d.id = dt.document_id
WHERE dt.tag_id IN (:tagIds)
AND d.status = :status',
[
'tagIds' => $tagBinaryIds,
'status' => Document::STATUS_ACTIVE,
],
[
'tagIds' => ArrayParameterType::BINARY,
]
)->fetchAllAssociative();
if ($rows === []) {
return null;
}
$docIds = [];
$documentScores = [];
$documentMatchedTags = [];
foreach ($rows as $row) {
if (!isset($row['document_id'])) {
if (!isset($row['document_id'], $row['tag_id'])) {
continue;
}
try {
$uuid = Uuid::fromBinary($row['document_id']);
$docIds[(string)$uuid] = true;
$documentId = (string) Uuid::fromBinary($row['document_id']);
$tagId = (string) Uuid::fromBinary($row['tag_id']);
} catch (\Throwable) {
continue;
}
if (count($docIds) >= self::MAX_CANDIDATE_DOCS) {
if (!isset($tagMetaById[$tagId])) {
continue;
}
$documentScores[$documentId] = ($documentScores[$documentId] ?? 0.0)
+ ($tagMetaById[$tagId]['score'] * $tagMetaById[$tagId]['weight']);
$documentMatchedTags[$documentId][$tagId] = true;
}
if ($documentScores === []) {
return null;
}
foreach ($documentScores as $documentId => $score) {
$matchedTagCount = isset($documentMatchedTags[$documentId])
? count($documentMatchedTags[$documentId])
: 0;
if ($matchedTagCount > 1) {
$documentScores[$documentId] += min(
self::MAX_MULTI_TAG_BONUS,
($matchedTagCount - 1) * self::MULTI_TAG_BONUS_PER_EXTRA_TAG
);
}
}
arsort($documentScores, SORT_NUMERIC);
return array_slice(
array_keys($documentScores),
0,
self::MAX_CANDIDATE_DOCS
);
}
/**
 * Reduces raw tag hits to the ones allowed to influence document routing.
 *
 * The first hit is treated as the best one. If its score is below
 * MIN_BEST_SCORE, nothing is routed. Otherwise a hit is kept when it
 *   - has a non-empty tag id,
 *   - scores at least max(MIN_BEST_SCORE, best - MAX_SCORE_DROP_FROM_BEST),
 *   - is not of type SALES_SIGNAL.
 * At most MAX_ROUTING_TAGS hits are returned, in input order.
 *
 * @param array<int, array{
 *     tag_id:string,
 *     score:float,
 *     label?:string,
 *     tag_type?:string
 * }> $hits
 *
 * @return list<array{
 *     tag_id:string,
 *     score:float,
 *     tag_type:string
 * }>
 */
private function filterRoutingHits(array $hits): array
{
    if ($hits === []) {
        return [];
    }

    $bestScore = (float) ($hits[0]['score'] ?? 0.0);
    if ($bestScore < self::MIN_BEST_SCORE) {
        return [];
    }

    $minimumAcceptedScore = max(
        self::MIN_BEST_SCORE,
        $bestScore - self::MAX_SCORE_DROP_FROM_BEST
    );

    $filtered = [];

    foreach ($hits as $hit) {
        $tagId = (string) ($hit['tag_id'] ?? '');
        $score = (float) ($hit['score'] ?? 0.0);
        $tagType = TagTypes::normalize(
            (string) ($hit['tag_type'] ?? TagTypes::GENERIC)
        );

        if ($tagId === '' || $score < $minimumAcceptedScore) {
            continue;
        }

        // Sales signals may still be useful elsewhere, but they should not
        // expand the document scope for semantic retrieval.
        if ($tagType === TagTypes::SALES_SIGNAL) {
            continue;
        }

        $filtered[] = [
            'tag_id' => $tagId,
            'score' => $score,
            'tag_type' => $tagType,
        ];

        if (count($filtered) >= self::MAX_ROUTING_TAGS) {
            break;
        }
    }

    return $filtered;
}
/**
 * Maps a tag type to its routing weight: 1.20 for catalog entities, 1.00
 * for generic tags, 0.00 for sales signals, and 1.00 for anything else.
 */
private function resolveTypeWeight(string $tagType): float
{
    $weightsByType = [
        TagTypes::CATALOG_ENTITY => 1.20,
        TagTypes::GENERIC => 1.00,
        TagTypes::SALES_SIGNAL => 0.00,
    ];

    $canonicalType = TagTypes::normalize($tagType);

    return $weightsByType[$canonicalType] ?? 1.00;
}
}

View File

@@ -4,42 +4,45 @@ declare(strict_types=1);
namespace App\Tag;
use App\Entity\Tag;
use App\Entity\Document;
use App\Entity\DocumentTag;
use App\Entity\Tag;
use App\Service\TagRebuildJobService;
use Doctrine\ORM\EntityManagerInterface;
use InvalidArgumentException;
use RuntimeException;
final readonly class TagService
{
public function __construct(
private EntityManagerInterface $em,
private TagRebuildJobService $jobs,
) {}
// =========================================================
// TAG CREATE
// =========================================================
private TagRebuildJobService $jobs,
) {
}
public function create(
string $slug,
string $label,
?string $description = null,
string $type = 'generic' // NEU
string $type = TagTypes::GENERIC,
): Tag {
$slug = trim($slug);
$normalizedSlug = $this->normalizeSlug($slug);
$label = trim($label);
if ($label === '' || $slug === '') {
throw new \InvalidArgumentException('Label und Slug sind Pflichtfelder.');
if ($normalizedSlug === '' || $label === '') {
throw new InvalidArgumentException('Tag label and slug are required.');
}
if ($this->slugExists($slug)) {
throw new \RuntimeException('Slug existiert bereits.');
if ($this->slugExists($normalizedSlug)) {
throw new RuntimeException(sprintf('Tag slug "%s" already exists.', $normalizedSlug));
}
$tag = new Tag($slug, $label, $description);
$tag->setType($type); // NEU
$tag = new Tag(
$normalizedSlug,
$label,
$description,
TagTypes::normalize($type)
);
$this->em->persist($tag);
$this->em->flush();
@@ -49,18 +52,9 @@ final readonly class TagService
return $tag;
}
// =========================================================
// TAG DELETE
// =========================================================
public function deleteById(string $tagId): void
{
$tag = $this->em->getRepository(Tag::class)->find($tagId);
if (!$tag instanceof Tag) {
throw new \RuntimeException('Tag nicht gefunden.');
}
$tag = $this->findTagById($tagId);
$this->delete($tag);
}
@@ -72,87 +66,103 @@ final readonly class TagService
$this->triggerRebuildIfIdle();
}
// =========================================================
// DOCUMENT TAG SYNC
// =========================================================
public function syncDocumentTags(Document $document, array $newTagIds): void
{
$newTagIds = array_unique($newTagIds);
$normalizedTagIds = $this->normalizeIdList($newTagIds);
/** @var list<DocumentTag> $currentRelations */
$currentRelations = $this->em
->getRepository(DocumentTag::class)
->findBy(['document' => $document]);
$currentTagIds = array_map(
fn(DocumentTag $dt) => (string) $dt->getTag()->getId(),
static fn (DocumentTag $relation): string => (string) $relation->getTag()->getId(),
$currentRelations
);
$toAdd = array_diff($newTagIds, $currentTagIds);
$toRemove = array_diff($currentTagIds, $newTagIds);
$toAdd = array_values(array_diff($normalizedTagIds, $currentTagIds));
$toRemove = array_values(array_diff($currentTagIds, $normalizedTagIds));
foreach ($toAdd as $tagId) {
$tag = $this->em->getRepository(Tag::class)->find($tagId);
if ($tag instanceof Tag) {
$this->em->persist(new DocumentTag($document, $tag));
}
}
foreach ($currentRelations as $relation) {
if (in_array((string) $relation->getTag()->getId(), $toRemove, true)) {
$relationTagId = (string) $relation->getTag()->getId();
if (in_array($relationTagId, $toRemove, true)) {
$this->em->remove($relation);
}
}
if ($toAdd || $toRemove) {
if ($toAdd !== [] || $toRemove !== []) {
$this->em->flush();
$this->triggerRebuildIfIdle();
}
}
// =========================================================
// TAG → DOCUMENT SYNC (Bulk Assign)
// =========================================================
public function syncTagDocuments(Tag $tag, array $newDocumentIds): void
{
$newDocumentIds = array_unique($newDocumentIds);
$normalizedDocumentIds = $this->normalizeIdList($newDocumentIds);
/** @var list<DocumentTag> $currentRelations */
$currentRelations = $this->em
->getRepository(DocumentTag::class)
->findBy(['tag' => $tag]);
$currentDocumentIds = array_map(
fn(DocumentTag $dt) => (string) $dt->getDocument()->getId(),
static fn (DocumentTag $relation): string => (string) $relation->getDocument()->getId(),
$currentRelations
);
$toAdd = array_diff($newDocumentIds, $currentDocumentIds);
$toRemove = array_diff($currentDocumentIds, $newDocumentIds);
$toAdd = array_values(array_diff($normalizedDocumentIds, $currentDocumentIds));
$toRemove = array_values(array_diff($currentDocumentIds, $normalizedDocumentIds));
foreach ($toAdd as $documentId) {
$document = $this->em->getRepository(Document::class)->find($documentId);
if ($document instanceof Document) {
if (
$document instanceof Document
&& $document->getStatus() === Document::STATUS_ACTIVE
) {
$this->em->persist(new DocumentTag($document, $tag));
}
}
foreach ($currentRelations as $relation) {
if (in_array((string) $relation->getDocument()->getId(), $toRemove, true)) {
$relationDocumentId = (string) $relation->getDocument()->getId();
if (in_array($relationDocumentId, $toRemove, true)) {
$this->em->remove($relation);
}
}
if ($toAdd || $toRemove) {
if ($toAdd !== [] || $toRemove !== []) {
$this->em->flush();
$this->triggerRebuildIfIdle();
}
}
// =========================================================
// INTERNAL HELPERS
// =========================================================
/**
 * Loads a tag by id through the Tag repository.
 *
 * @throws InvalidArgumentException when the id is empty after trimming
 * @throws RuntimeException         when the repository returns no Tag
 */
private function findTagById(string $tagId): Tag
{
    $lookupId = trim($tagId);
    if ($lookupId === '') {
        throw new InvalidArgumentException('Tag id must not be empty.');
    }

    $found = $this->em->getRepository(Tag::class)->find($lookupId);
    if ($found instanceof Tag) {
        return $found;
    }

    throw new RuntimeException('Tag not found.');
}
private function slugExists(string $slug): bool
{
@@ -165,6 +175,36 @@ final readonly class TagService
->getSingleScalarResult() > 0;
}
/**
 * Turns a raw list of ids into a de-duplicated list of non-empty, trimmed
 * strings, keeping first-seen order.
 *
 * Only scalars and Stringable objects are considered. Entries that cannot
 * be represented as a string (null, nested arrays, plain objects) are
 * skipped. A blind string cast would turn a nested array into the literal
 * id "Array" with a warning, or throw an Error for plain objects.
 *
 * @param array<mixed> $ids
 * @return list<string>
 */
private function normalizeIdList(array $ids): array
{
    $normalized = [];

    foreach ($ids as $id) {
        if (!is_scalar($id) && !$id instanceof \Stringable) {
            continue;
        }

        $id = trim((string) $id);
        if ($id === '') {
            continue;
        }

        $normalized[] = $id;
    }

    return array_values(array_unique($normalized));
}
/**
 * Canonicalises a slug: trimmed, lower-cased, every run of whitespace
 * and/or dashes collapsed into a single dash, and no leading or trailing
 * dash.
 */
private function normalizeSlug(string $slug): string
{
    $lowered = mb_strtolower(trim($slug));

    // One pass covers both "whitespace -> dash" and "collapse repeated dashes".
    $dashed = preg_replace('/[\s-]+/u', '-', $lowered) ?? $lowered;

    return trim($dashed, '-');
}
private function triggerRebuildIfIdle(): void
{
if (!$this->jobs->hasActiveJob()) {

View File

@@ -5,8 +5,10 @@ declare(strict_types=1);
namespace App\Tag;
/**
* Zentrale Definition aller erlaubten Tag-Typen.
* Verhindert Magic Strings im Code.
* Central definition of all supported tag types.
*
* This class is intentionally tiny and dependency-free because it is the
* foundation for entity validation, admin forms, routing, and catalog logic.
*/
final class TagTypes
{
@@ -14,6 +16,25 @@ final class TagTypes
public const CATALOG_ENTITY = 'catalog_entity';
public const SALES_SIGNAL = 'sales_signal';
/**
 * Lists every allowed tag type value: generic, catalog entity, sales signal.
 *
 * @return list<string>
 */
public static function all(): array
{
    $allowedTypes = [self::GENERIC, self::CATALOG_ENTITY, self::SALES_SIGNAL];

    return $allowedTypes;
}
/**
* Returns UI choices for forms and admin screens.
*
* @return array<string, string>
*/
public static function choices(): array
{
return [
@@ -23,5 +44,53 @@ final class TagTypes
];
}
private function __construct() {}
/**
 * Returns true if the given value is an allowed tag type.
 *
 * The value is trimmed and lower-cased before the lookup, so " Generic "
 * is accepted. The check deliberately does not go through normalize():
 * normalize() falls back to a known type for unknown input, which would
 * make every non-null string look valid.
 */
public static function isValid(?string $type): bool
{
    if ($type === null) {
        return false;
    }

    $candidate = mb_strtolower(trim($type));

    return in_array($candidate, self::all(), true);
}
/**
 * Maps external input onto a canonical tag type.
 *
 * Input is trimmed and lower-cased. A known type is returned as is.
 * Anything else, including null and the empty string, resolves to
 * $default when that is itself a known type, and to GENERIC otherwise.
 */
public static function normalize(?string $type, string $default = self::GENERIC): string
{
    $candidate = mb_strtolower(trim((string) $type));

    if ($candidate !== '' && in_array($candidate, self::all(), true)) {
        return $candidate;
    }

    $fallback = mb_strtolower(trim($default));

    return self::isKnownDefault($fallback) ? $fallback : self::GENERIC;
}
/**
 * Looks up the display label for a tag type by inverting choices(), after
 * normalizing the type. Falls back to 'Generic' when choices() has no
 * entry for it.
 */
public static function labelFor(string $type): string
{
    $labelsByType = array_flip(self::choices());
    $canonicalType = self::normalize($type);

    return $labelsByType[$canonicalType] ?? 'Generic';
}
/**
 * Strict membership test against all(). The value is compared as given,
 * without trimming or case folding.
 */
private static function isKnownDefault(string $type): bool
{
    $knownTypes = self::all();

    return in_array($type, $knownTypes, true);
}
/**
 * Private so the class cannot be instantiated from outside.
 */
private function __construct()
{
}
}

View File

@@ -9,18 +9,81 @@ use Psr\Log\LoggerInterface;
final readonly class TagVectorIndexBuilder
{
private const GRACEFUL_TERMINATION_SECONDS = 2;
public function __construct(
private string $pythonBin,
private string $scriptPath,
private string $tagsNdjsonPath,
private string $vectorTagsIndexPath,
private string $embeddingModel,
private int $timeoutSeconds,
private LoggerInterface $agentLogger,
private IndexMetaManager $metaManager, // ✅ NEU
) {}
private string $pythonBin,
private string $scriptPath,
private string $tagsNdjsonPath,
private string $vectorTagsIndexPath,
private string $embeddingModel,
private int $timeoutSeconds,
private LoggerInterface $agentLogger,
private IndexMetaManager $metaManager,
) {
}
/**
 * Rebuilds the tag vector index.
 *
 * Flow:
 * - validate preconditions, make sure the target directory exists and
 *   remove leftover temporary artifacts
 * - when the NDJSON export holds no embeddable tag, delete the final index
 *   and meta files, commit the runtime marker with "no index" and stop
 * - otherwise run the ingest command against the temporary index path,
 *   require exit code 0 and non-empty temporary index + meta files, hand
 *   both to atomicReplace() and commit the runtime marker
 *
 * Any failure after the command was built removes the temporary artifacts
 * and rethrows the original exception.
 *
 * @throws \RuntimeException when the ingest exits non-zero or leaves incomplete artifacts
 */
public function build(): void
{
    $this->assertPreconditions();

    $finalIndex = $this->vectorTagsIndexPath;
    $finalMeta = $finalIndex . '.meta.json';
    $stagedIndex = $this->vectorTagsIndexPath . '.tmp';
    $stagedMeta = $stagedIndex . '.meta.json';

    $this->ensureTargetDirectoryExists($finalIndex);
    $this->cleanupTemporaryArtifacts($stagedIndex, $stagedMeta);

    if (!$this->hasEmbeddableTags()) {
        // Nothing to embed: a previously built index would be stale, so drop it.
        $this->agentLogger->info('[tags] no embeddable tags found, removing stale tag index artifacts.');
        $this->removeFileIfExists($finalIndex);
        $this->removeFileIfExists($finalMeta);
        $this->commitRuntime(false);

        return;
    }

    $command = $this->buildCommand($stagedIndex);

    $this->agentLogger->info('[tags] build tag vector index', [
        'cmd' => $command,
        'timeout' => $this->timeoutSeconds,
        'embedding_model' => $this->embeddingModel,
    ]);

    try {
        $outcome = $this->runCommand($command);

        if ($outcome['exit'] !== 0) {
            $this->agentLogger->error('[tags] tag vector ingest failed', [
                'exit' => $outcome['exit'],
                'stdout' => $outcome['stdout'],
                'stderr' => $outcome['stderr'],
            ]);

            throw new \RuntimeException('Tag vector ingest failed (exit=' . $outcome['exit'] . ')');
        }

        $artifactsComplete = $this->isUsableArtifact($stagedIndex) && $this->isUsableArtifact($stagedMeta);
        if (!$artifactsComplete) {
            throw new \RuntimeException('Tag vector ingest produced incomplete artifacts.');
        }

        // Index first, then its meta file; the runtime marker is written last.
        $this->atomicReplace($stagedIndex, $finalIndex);
        $this->atomicReplace($stagedMeta, $finalMeta);
        $this->commitRuntime(true);

        $this->agentLogger->info('[tags] tag vector index build completed + runtime committed', [
            'index' => $finalIndex,
            'meta' => $finalMeta,
        ]);
    } catch (\Throwable $failure) {
        $this->cleanupTemporaryArtifacts($stagedIndex, $stagedMeta);

        throw $failure;
    }
}
private function assertPreconditions(): void
{
if (!is_file($this->tagsNdjsonPath)) {
throw new \RuntimeException('tags.ndjson missing: ' . $this->tagsNdjsonPath);
@@ -30,65 +93,178 @@ final readonly class TagVectorIndexBuilder
throw new \RuntimeException('Tag ingest script missing: ' . $this->scriptPath);
}
$tmpIndex = $this->vectorTagsIndexPath . '.tmp';
$tmpMeta = $tmpIndex . '.meta.json';
$finalIndex = $this->vectorTagsIndexPath;
$finalMeta = $finalIndex . '.meta.json';
$dir = \dirname($finalIndex);
if (!\is_dir($dir)) {
@\mkdir($dir, 0775, true);
if (trim($this->pythonBin) === '') {
throw new \RuntimeException('Python binary must not be empty.');
}
@\unlink($tmpIndex);
@\unlink($tmpMeta);
if ($this->timeoutSeconds < 1) {
throw new \RuntimeException('Tag vector timeout must be >= 1 second.');
}
}
$cmd = sprintf(
'%s %s %s %s %s 2>&1',
private function buildCommand(string $tmpIndex): string
{
return sprintf(
'%s %s %s %s 2>&1',
escapeshellarg($this->pythonBin),
escapeshellarg($this->scriptPath),
escapeshellarg($this->tagsNdjsonPath),
escapeshellarg($tmpIndex),
escapeshellarg($this->embeddingModel),
);
}
$this->agentLogger->info('[tags] build tag vector index', [
'cmd' => $cmd,
'timeout' => $this->timeoutSeconds,
]);
private function ensureTargetDirectoryExists(string $finalIndexPath): void
{
$dir = dirname($finalIndexPath);
$out = [];
$exit = 0;
exec($cmd, $out, $exit);
if ($exit !== 0) {
$this->agentLogger->error('[tags] tag vector ingest failed', [
'exit' => $exit,
'out' => $out,
]);
throw new \RuntimeException('Tag vector ingest failed (exit=' . $exit . ')');
}
if (!is_file($tmpIndex) || !is_file($tmpMeta)) {
@\unlink($tmpIndex);
@\unlink($tmpMeta);
$this->agentLogger->warning('[tags] no tag index produced (maybe 0 tags).');
if (is_dir($dir)) {
return;
}
$this->atomicReplace($tmpIndex, $finalIndex);
$this->atomicReplace($tmpMeta, $finalMeta);
if (!@mkdir($dir, 0775, true) && !is_dir($dir)) {
throw new \RuntimeException('Unable to create tag vector directory: ' . $dir);
}
}
// ✅ ENTERPRISE COMMIT MARKER
/**
 * Scans the tags NDJSON file for at least one row that carries both a
 * non-empty tag_id and a non-empty text (after trimming).
 *
 * Blank lines and lines that do not decode to a JSON array are ignored.
 * Scanning stops at the first matching row.
 *
 * @throws \RuntimeException when the file cannot be opened for reading
 */
private function hasEmbeddableTags(): bool
{
    $stream = @fopen($this->tagsNdjsonPath, 'rb');
    if ($stream === false) {
        throw new \RuntimeException('Unable to read tags NDJSON: ' . $this->tagsNdjsonPath);
    }

    try {
        while (($rawLine = fgets($stream)) !== false) {
            $payload = trim($rawLine);
            $row = $payload === '' ? null : json_decode($payload, true);

            if (!is_array($row)) {
                continue;
            }

            $hasTagId = trim((string) ($row['tag_id'] ?? '')) !== '';
            $hasText = trim((string) ($row['text'] ?? '')) !== '';

            if ($hasTagId && $hasText) {
                return true;
            }
        }

        return false;
    } finally {
        // Runs on every exit path, including the early return above.
        fclose($stream);
    }
}
/**
 * Runs the ingest command as a child process and collects its output.
 *
 * stdout and stderr are read through non-blocking pipes while the process
 * is polled every 100 ms. When the configured timeout is exceeded the
 * process is terminated, given GRACEFUL_TERMINATION_SECONDS to stop and
 * then killed with signal 9 if it is still running.
 *
 * Exit code handling: before PHP 8.3 the proc_get_status() call that first
 * observes the finished process is the only place where the real exit code
 * is available; a later proc_close() returns -1. The code therefore keeps
 * the exit code reported by proc_get_status() and falls back to it when
 * proc_close() does not deliver a usable value.
 *
 * @return array{exit:int, stdout:string, stderr:string} trimmed output and exit code
 *
 * @throws \RuntimeException when the process cannot be started or times out
 */
private function runCommand(string $cmd): array
{
    $descriptorSpec = [
        0 => ['pipe', 'r'],
        1 => ['pipe', 'w'],
        2 => ['pipe', 'w'],
    ];

    $process = @proc_open($cmd, $descriptorSpec, $pipes);
    if (!is_resource($process)) {
        throw new \RuntimeException('Could not start tag vector ingest process.');
    }

    // The child gets no input; close stdin right away.
    fclose($pipes[0]);
    stream_set_blocking($pipes[1], false);
    stream_set_blocking($pipes[2], false);

    $stdout = '';
    $stderr = '';
    $startedAt = microtime(true);
    $timedOut = false;
    $reportedExitCode = null;

    try {
        while (true) {
            $stdout .= stream_get_contents($pipes[1]) ?: '';
            $stderr .= stream_get_contents($pipes[2]) ?: '';

            $status = proc_get_status($process);
            if (!is_array($status) || ($status['running'] ?? false) !== true) {
                // Keep the exit code now: proc_close() may no longer know it.
                $statusExitCode = is_array($status) ? ($status['exitcode'] ?? null) : null;
                if (is_int($statusExitCode) && $statusExitCode >= 0) {
                    $reportedExitCode = $statusExitCode;
                }

                break;
            }

            if ((microtime(true) - $startedAt) > $this->timeoutSeconds) {
                $timedOut = true;
                proc_terminate($process);
                usleep(self::GRACEFUL_TERMINATION_SECONDS * 1000000);

                $status = proc_get_status($process);
                if (is_array($status) && ($status['running'] ?? false) === true) {
                    proc_terminate($process, 9);
                }

                break;
            }

            usleep(100000);
        }

        // Pick up whatever was written between the last read and process exit.
        $stdout .= stream_get_contents($pipes[1]) ?: '';
        $stderr .= stream_get_contents($pipes[2]) ?: '';
    } finally {
        fclose($pipes[1]);
        fclose($pipes[2]);
    }

    $exitCode = proc_close($process);

    if ($timedOut) {
        $this->agentLogger->error('[tags] tag vector ingest timed out', [
            'timeout' => $this->timeoutSeconds,
            'stdout' => $stdout,
            'stderr' => $stderr,
        ]);

        throw new \RuntimeException('Tag vector ingest timed out after ' . $this->timeoutSeconds . ' seconds.');
    }

    if ((!is_int($exitCode) || $exitCode < 0) && $reportedExitCode !== null) {
        $exitCode = $reportedExitCode;
    }

    return [
        'exit' => is_int($exitCode) ? $exitCode : 1,
        'stdout' => trim($stdout),
        'stderr' => trim($stderr),
    ];
}
/**
 * An artifact counts as usable when the path is a regular file with a size
 * greater than zero bytes.
 */
private function isUsableArtifact(string $path): bool
{
    if (!is_file($path)) {
        return false;
    }

    return filesize($path) > 0;
}
/**
 * Deletes every given path that currently is a regular file.
 *
 * Deletion is best effort: unlink() failures are suppressed.
 */
private function cleanupTemporaryArtifacts(string ...$paths): void
{
    foreach ($paths as $artifactPath) {
        if (!is_file($artifactPath)) {
            continue;
        }

        @unlink($artifactPath);
    }
}
/**
 * Deletes the path when it is a regular file; does nothing otherwise.
 *
 * Deletion is best effort: an unlink() failure is suppressed.
 */
private function removeFileIfExists(string $path): void
{
    if (!is_file($path)) {
        return;
    }

    @unlink($path);
}
private function commitRuntime(bool $indexPresent): void
{
$this->metaManager->touchRuntime([
'last_tags_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
]);
$this->agentLogger->info('[tags] tag vector index build completed + runtime committed', [
'index' => $finalIndex,
'meta' => $finalMeta,
'tags_index_present' => $indexPresent,
]);
}
@@ -99,6 +275,7 @@ final readonly class TagVectorIndexBuilder
@unlink($tmp);
throw new \RuntimeException('Atomic replace failed for: ' . $final);
}
@unlink($tmp);
}

View File

@@ -6,63 +6,210 @@ namespace App\Tag;
final readonly class TagVectorIndexHealthService
{
private const STATUS_OK = 'OK';
private const STATUS_OK_EMPTY = 'OK_EMPTY';
private const STATUS_INCONSISTENT_STALE_VECTOR = 'INCONSISTENT_STALE_VECTOR';
private const STATUS_INCONSISTENT_MISSING_VECTOR = 'INCONSISTENT_MISSING_VECTOR';
private const STATUS_INCONSISTENT_COUNT_MISMATCH = 'INCONSISTENT_COUNT_MISMATCH';
private const STATUS_INCONSISTENT_INVALID_META = 'INCONSISTENT_INVALID_META';
private const STATUS_UNKNOWN = 'UNKNOWN';
public function __construct(
private string $tagsNdjsonPath,
private string $vectorTagsIndexPath,
private string $vectorTagsMetaPath
) {}
private string $vectorTagsMetaPath,
) {
}
public function check(): array
{
$ndjsonExists = is_file($this->tagsNdjsonPath);
$vectorExists = is_file($this->vectorTagsIndexPath);
$metaExists = is_file($this->vectorTagsMetaPath);
$metaExists = is_file($this->vectorTagsMetaPath);
$ndjsonTagCount = 0;
$ndjsonStats = $this->readNdjsonStats();
$metaStats = $this->readMetaStats();
if ($ndjsonExists) {
$h = @fopen($this->tagsNdjsonPath, 'r');
if ($h !== false) {
while (($line = fgets($h)) !== false) {
$line = trim($line);
if ($line === '') continue;
$data = json_decode($line, true);
if (is_array($data) && !empty($data['tag_id']) && !empty($data['text'])) {
$ndjsonTagCount++;
}
}
fclose($h);
}
}
$vectorTagCount = 0;
if ($metaExists) {
$meta = json_decode((string) file_get_contents($this->vectorTagsMetaPath), true);
if (is_array($meta)) {
$vectorTagCount = count($meta);
}
}
$status = $this->determineStatus($ndjsonTagCount, $vectorExists, $metaExists, $vectorTagCount);
$status = $this->determineStatus(
$ndjsonStats['exported_tag_count'],
$vectorExists,
$metaExists,
$metaStats['vector_tag_count'],
$metaStats['meta_valid']
);
return [
'tags_ndjson_exists' => $ndjsonExists,
'tags_ndjson_count' => $ndjsonTagCount,
'vector_exists' => $vectorExists,
'meta_exists' => $metaExists,
'vector_tag_count' => $vectorTagCount,
'status' => $status,
'tags_ndjson_count' => $ndjsonStats['exported_tag_count'],
'vector_exists' => $vectorExists,
'meta_exists' => $metaExists,
'vector_tag_count' => $metaStats['vector_tag_count'],
'status' => $status,
// Extra diagnostics for admin/CLI.
'tags_ndjson_lines_total' => $ndjsonStats['lines_total'],
'tags_ndjson_invalid_lines' => $ndjsonStats['invalid_lines'],
'tags_ndjson_empty_lines' => $ndjsonStats['empty_lines'],
'tags_with_active_document_ids' => $ndjsonStats['tags_with_document_ids'],
'meta_valid' => $metaStats['meta_valid'],
'paths' => [
'tags_ndjson' => $this->tagsNdjsonPath,
'vector_index' => $this->vectorTagsIndexPath,
'vector_meta' => $this->vectorTagsMetaPath,
],
];
}
private function determineStatus(int $ndjsonTagCount, bool $vectorExists, bool $metaExists, int $vectorTagCount): string
/**
* @return array{
* lines_total:int,
* empty_lines:int,
* invalid_lines:int,
* exported_tag_count:int,
* tags_with_document_ids:int
* }
*/
private function readNdjsonStats(): array
{
if ($ndjsonTagCount === 0 && !$vectorExists && !$metaExists) return 'OK_EMPTY';
if ($ndjsonTagCount > 0 && $vectorExists && $metaExists && $vectorTagCount === $ndjsonTagCount) return 'OK';
if ($ndjsonTagCount === 0 && ($vectorExists || $metaExists)) return 'INCONSISTENT_STALE_VECTOR';
if ($ndjsonTagCount > 0 && (!$vectorExists || !$metaExists)) return 'INCONSISTENT_MISSING_VECTOR';
if ($ndjsonTagCount !== $vectorTagCount) return 'INCONSISTENT_COUNT_MISMATCH';
return 'UNKNOWN';
$stats = [
'lines_total' => 0,
'empty_lines' => 0,
'invalid_lines' => 0,
'exported_tag_count' => 0,
'tags_with_document_ids' => 0,
];
if (!is_file($this->tagsNdjsonPath)) {
return $stats;
}
$handle = @fopen($this->tagsNdjsonPath, 'rb');
if ($handle === false) {
return $stats;
}
try {
while (($line = fgets($handle)) !== false) {
$stats['lines_total']++;
$line = trim($line);
if ($line === '') {
$stats['empty_lines']++;
continue;
}
$data = json_decode($line, true);
if (!is_array($data)) {
$stats['invalid_lines']++;
continue;
}
$tagId = trim((string) ($data['tag_id'] ?? ''));
$text = trim((string) ($data['text'] ?? ''));
$documentIds = $data['document_ids'] ?? null;
$hasDocumentIds = is_array($documentIds) && $documentIds !== [];
if ($tagId === '' || $text === '') {
$stats['invalid_lines']++;
continue;
}
$stats['exported_tag_count']++;
if ($hasDocumentIds) {
$stats['tags_with_document_ids']++;
}
}
} finally {
fclose($handle);
}
return $stats;
}
/**
 * Reads the vector meta file and derives the number of indexed tags.
 *
 * The meta is considered valid when it decodes to
 * - a JSON list (entry count = number of elements), or
 * - a JSON object whose keys are all non-negative integers / digit-only
 *   strings (entry count = number of keys).
 *
 * PHP converts canonical decimal string keys such as "1" into int keys when
 * decoding JSON into an array, so int keys must be accepted here; digit-only
 * strings only remain strings in non-canonical forms such as "007".
 *
 * A missing, unreadable, empty or differently shaped file yields
 * vector_tag_count = 0 and meta_valid = false.
 *
 * @return array{vector_tag_count:int, meta_valid:bool}
 */
private function readMetaStats(): array
{
    $invalid = [
        'vector_tag_count' => 0,
        'meta_valid' => false,
    ];

    if (!is_file($this->vectorTagsMetaPath)) {
        return $invalid;
    }

    $raw = file_get_contents($this->vectorTagsMetaPath);
    if (!is_string($raw) || trim($raw) === '') {
        return $invalid;
    }

    $decoded = json_decode($raw, true);
    if (!is_array($decoded)) {
        return $invalid;
    }

    if (!array_is_list($decoded)) {
        // A non-list array is never empty, so at least one key is checked.
        foreach (array_keys($decoded) as $key) {
            $isIndexKey = is_int($key) ? $key >= 0 : ctype_digit($key);
            if (!$isIndexKey) {
                return $invalid;
            }
        }
    }

    return [
        'vector_tag_count' => count($decoded),
        'meta_valid' => true,
    ];
}
/**
 * Derives the overall health status from the collected facts.
 *
 * The arms are evaluated top to bottom and the first match wins:
 * 1. no tags and no artifacts            -> OK_EMPTY
 * 2. no tags but index or meta present   -> INCONSISTENT_STALE_VECTOR
 * 3. tags but index or meta missing      -> INCONSISTENT_MISSING_VECTOR
 * 4. meta present but not valid          -> INCONSISTENT_INVALID_META
 * 5. everything present, counts equal    -> OK
 * 6. counts differ                       -> INCONSISTENT_COUNT_MISMATCH
 * 7. anything else                       -> UNKNOWN
 */
private function determineStatus(
    int $ndjsonTagCount,
    bool $vectorExists,
    bool $metaExists,
    int $vectorTagCount,
    bool $metaValid
): string {
    $noTags = $ndjsonTagCount === 0;
    $hasTags = $ndjsonTagCount > 0;
    $countsMatch = $vectorTagCount === $ndjsonTagCount;

    return match (true) {
        $noTags && !$vectorExists && !$metaExists => self::STATUS_OK_EMPTY,
        $noTags && ($vectorExists || $metaExists) => self::STATUS_INCONSISTENT_STALE_VECTOR,
        $hasTags && (!$vectorExists || !$metaExists) => self::STATUS_INCONSISTENT_MISSING_VECTOR,
        $metaExists && !$metaValid => self::STATUS_INCONSISTENT_INVALID_META,
        $hasTags && $vectorExists && $metaExists && $metaValid && $countsMatch => self::STATUS_OK,
        !$countsMatch => self::STATUS_INCONSISTENT_COUNT_MISMATCH,
        default => self::STATUS_UNKNOWN,
    };
}
}

View File

@@ -12,18 +12,29 @@ final readonly class TagVectorSearchClient
/**
* Minimum similarity score required for a tag to be considered.
*/
private const MIN_SCORE = 0.72;
public const MIN_SCORE = 0.72;
/**
* Default result size when callers do not specify a limit.
*/
private const DEFAULT_LIMIT = 8;
/**
* Hard limit to prevent excessive requests.
*/
private const MAX_LIMIT = 50;
/**
* HTTP timeout for the Python vector service.
*/
private const TIMEOUT_SECONDS = 10;
public function __construct(
private HttpClientInterface $http,
private string $serviceUrl,
private LoggerInterface $agentLogger,
) {}
private string $serviceUrl,
private LoggerInterface $agentLogger,
) {
}
/**
* Executes a vector search against the Python tag index.
@@ -33,43 +44,51 @@ final readonly class TagVectorSearchClient
* {
* "tag_id": "...",
* "score": 0.73,
* "label": "Geräte", // optional (new)
* "tag_type": "catalog_entity" // optional (new)
* "label": "Geräte",
* "tag_type": "catalog_entity"
* }
* ]
*
* @return array<int, array{
* @return list<array{
* tag_id:string,
* score:float,
* label?:string,
* tag_type?:string
* label:string,
* tag_type:string
* }>
*/
public function search(string $query, int $limit = 8): array
public function search(string $query, int $limit = self::DEFAULT_LIMIT): array
{
$query = trim($query);
if ($query === '') {
return [];
}
$limit = max(1, min($limit, self::MAX_LIMIT));
$serviceUrl = rtrim(trim($this->serviceUrl), '/');
if ($serviceUrl === '') {
$this->agentLogger->warning('Tag vector service URL is empty.');
return [];
}
try {
$response = $this->http->request(
'POST',
rtrim($this->serviceUrl, '/') . '/search-tags',
$serviceUrl . '/search-tags',
[
'json' => [
'query' => $query,
'limit' => $limit,
],
'timeout' => 10,
'timeout' => self::TIMEOUT_SECONDS,
]
);
if ($response->getStatusCode() !== 200) {
$this->agentLogger->warning(
'Tag vector service returned non-200',
'Tag vector service returned non-200.',
['status' => $response->getStatusCode()]
);
@@ -77,10 +96,9 @@ final readonly class TagVectorSearchClient
}
$data = $response->toArray(false);
} catch (\Throwable $e) {
$this->agentLogger->warning(
'Tag vector service unreachable',
'Tag vector service unreachable.',
['error' => $e->getMessage()]
);
@@ -88,18 +106,33 @@ final readonly class TagVectorSearchClient
}
if (!is_array($data)) {
$this->agentLogger->warning('Tag vector service returned invalid payload');
$this->agentLogger->warning('Tag vector service returned invalid payload.');
return [];
}
$hits = [];
return $this->normalizeHits($data, $limit);
}
foreach ($data as $row) {
/**
* @param array<mixed> $rows
* @return list<array{
* tag_id:string,
* score:float,
* label:string,
* tag_type:string
* }>
*/
private function normalizeHits(array $rows, int $limit): array
{
$hitsByTagId = [];
foreach ($rows as $row) {
if (!is_array($row)) {
continue;
}
$tagId = (string)($row['tag_id'] ?? '');
$tagId = trim((string) ($row['tag_id'] ?? ''));
$score = $row['score'] ?? null;
if ($tagId === '' || !is_numeric($score)) {
@@ -112,24 +145,45 @@ final readonly class TagVectorSearchClient
continue;
}
$hit = [
$normalizedHit = [
'tag_id' => $tagId,
'score' => $score,
'score' => $score,
'label' => trim((string) ($row['label'] ?? '')),
'tag_type' => TagTypes::normalize((string) ($row['tag_type'] ?? TagTypes::GENERIC)),
];
// Optional: label
if (isset($row['label']) && is_string($row['label'])) {
$hit['label'] = $row['label'];
}
$existingHit = $hitsByTagId[$tagId] ?? null;
// Optional: tag_type
if (isset($row['tag_type']) && is_string($row['tag_type'])) {
$hit['tag_type'] = $row['tag_type'];
if ($existingHit === null || $normalizedHit['score'] > $existingHit['score']) {
$hitsByTagId[$tagId] = $normalizedHit;
}
$hits[] = $hit;
}
return $hits;
if ($hitsByTagId === []) {
return [];
}
$hits = array_values($hitsByTagId);
usort(
$hits,
static function (array $left, array $right): int {
$scoreComparison = $right['score'] <=> $left['score'];
if ($scoreComparison !== 0) {
return $scoreComparison;
}
$typeComparison = strcmp($left['tag_type'], $right['tag_type']);
if ($typeComparison !== 0) {
return $typeComparison;
}
return strcmp($left['tag_id'], $right['tag_id']);
}
);
return array_slice($hits, 0, $limit);
}
}