diff --git a/src/Command/SystemRebuildCommand.php b/src/Command/SystemRebuildCommand.php
new file mode 100644
index 0000000..2804f3a
--- /dev/null
+++ b/src/Command/SystemRebuildCommand.php
@@ -0,0 +1,218 @@
+addOption('hard', null, InputOption::VALUE_NONE, 'Required safety switch. Without --hard, the command aborts.')
+            ->addOption('no-tags', null, InputOption::VALUE_NONE, 'Skip tag rebuild')
+            ->addOption('no-reload', null, InputOption::VALUE_NONE, 'Skip vector service reload/start')
+            ->addOption('no-health', null, InputOption::VALUE_NONE, 'Skip health check (not recommended)')
+            ->addOption('dry-run', null, InputOption::VALUE_NONE, 'Simulate ingest steps (no writes)');
+    }
+
+    /**
+     * Runs the rebuild pipeline: global reindex, tag rebuild, vector service
+     * reload and health check. Aborts unless --hard is passed; a failing step
+     * prints an error and ends the run immediately.
+     *
+     * @return int Command::SUCCESS when every executed step succeeded, Command::FAILURE otherwise
+     */
+    protected function execute(InputInterface $input, OutputInterface $output): int
+    {
+        $io = new SymfonyStyle($input, $output);
+
+        if (!$input->getOption('hard')) {
+            $io->error('Safety switch missing: you must pass --hard to run this command.');
+            $io->writeln('Example: bin/console mto:agent:system:rebuild --hard');
+            return Command::FAILURE;
+        }
+
+        $dryRun = (bool)$input->getOption('dry-run');
+
+        $io->title('mto:agent:system:rebuild --hard');
+
+        // ---------------------------------------------------------
+        // 1) GLOBAL REINDEX (chunks rewrite + vector rebuild)
+        // ---------------------------------------------------------
+        $io->section('1/4 Global reindex (chunks + vector index)');
+
+        $job = $this->jobService->startJob(
+            IngestJob::TYPE_GLOBAL_REINDEX,
+            null,
+            null,
+            null,
+            null,
+            IngestJob::STATUS_QUEUED
+        );
+
+        try {
+            $this->orchestrator->runExistingJob($job, $dryRun);
+            $io->success('Global reindex completed.');
+        } catch (\Throwable $e) {
+            $io->error('Global reindex failed: ' . $e->getMessage());
+            return Command::FAILURE;
+        }
+
+        // ---------------------------------------------------------
+        // 2) TAG REBUILD (tags.ndjson + vector_tags.index)
+        // ---------------------------------------------------------
+        if (!$input->getOption('no-tags')) {
+            $io->section('2/4 Tag rebuild (tags.ndjson + vector_tags.index)');
+
+            if ($dryRun) {
+                $io->note('dry-run enabled: tag rebuild skipped (would export + build tag index).');
+            } else {
+                try {
+                    $export = $this->tagExporter->export();
+
+                    $io->writeln('Exported tags.ndjson');
+                    $io->writeln('Path: ' . $export['path']);
+                    $io->writeln('Tags: ' . $export['tags']);
+                    $io->writeln('Lines: ' . $export['lines']);
+                    $io->writeln('Bytes: ' . $export['bytes']);
+
+                    $this->tagIndexBuilder->build();
+                    $io->writeln('Built vector_tags.index');
+
+                    $this->metaManager->touchRuntime([
+                        'last_tags_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
+                    ]);
+                    $io->success('Tag rebuild completed.');
+                } catch (\Throwable $e) {
+                    $io->error('Tag rebuild failed: ' . $e->getMessage());
+                    return Command::FAILURE;
+                }
+            }
+        } else {
+            $io->section('2/4 Tag rebuild');
+            $io->note('Skipped due to --no-tags.');
+        }
+
+        // ---------------------------------------------------------
+        // 3) VECTOR SERVICE (install deps + start + reload)
+        // ---------------------------------------------------------
+        if (!$input->getOption('no-reload')) {
+            $io->section('3/4 Vector service reload (uvicorn)');
+
+            if ($dryRun) {
+                $io->note('dry-run enabled: service reload skipped.');
+            } else {
+                $cmd = [
+                    '.venv/bin/python',
+                    'python/vector/vector_control.py',
+                    '--install',
+                    '--start',
+                    '--reload',
+                    '--port', '8090',
+                    '--host', '0.0.0.0'
+                ];
+
+                $process = new Process($cmd, $this->projectDir);
+                $process->setTimeout(600);
+                // run() can throw (e.g. on timeout); report it like the other steps instead of letting it escape.
+                try {
+                    $process->run();
+                } catch (\Throwable $e) {
+                    $io->error('Vector service reload failed: ' . $e->getMessage());
+                    return Command::FAILURE;
+                }
+
+                $out = trim($process->getOutput());
+                $err = trim($process->getErrorOutput());
+
+                if ($out !== '') {
+                    $io->writeln($out);
+                }
+                if ($err !== '') {
+                    // NOTE(review): the empty-string wrappers look like stripped console style tags - confirm the intended markup.
+                    $io->writeln('' . $err . '');
+                }
+
+                if (!$process->isSuccessful()) {
+                    $io->error('Vector service reload failed (non-zero exit code).');
+                    return Command::FAILURE;
+                }
+
+                $io->success('Vector service reloaded.');
+            }
+        } else {
+            $io->section('3/4 Vector service reload');
+            $io->note('Skipped due to --no-reload.');
+        }
+
+        // ---------------------------------------------------------
+        // 4) HEALTH CHECK (NDJSON vs vector meta)
+        // ---------------------------------------------------------
+        if (!$input->getOption('no-health')) {
+            $io->section('4/4 Health check');
+
+            try {
+                $report = $this->health->check();
+            } catch (\Throwable $e) {
+                $io->error('Health check failed: ' . $e->getMessage());
+                return Command::FAILURE;
+            }
+
+            $io->definitionList(
+                ['ndjson_exists' => $report['ndjson_exists'] ? 'yes' : 'no'],
+                ['ndjson_chunk_count' => (string)$report['ndjson_chunk_count']],
+                ['vector_exists' => $report['vector_exists'] ? 'yes' : 'no'],
+                ['meta_exists' => $report['meta_exists'] ? 'yes' : 'no'],
+                ['vector_chunk_count' => (string)$report['vector_chunk_count']],
+                ['status' => (string)$report['status']],
+            );
+
+            if (!in_array($report['status'], ['OK', 'OK_EMPTY'], true)) {
+                $io->error('Health check not OK: ' . $report['status']);
+                return Command::FAILURE;
+            }
+
+            $io->success('Health check OK.');
+        } else {
+            $io->section('4/4 Health check');
+            $io->note('Skipped due to --no-health.');
+        }
+
+        $io->success('System rebuild finished.');
+        return Command::SUCCESS;
+    }
+}
diff --git a/src/Knowledge/Ingest/KnowledgeIngestService.php b/src/Knowledge/Ingest/KnowledgeIngestService.php
index 78a93b3..adf299f 100644
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -6,7 +6,6 @@ namespace App\Knowledge\Ingest;
 
 use App\Entity\DocumentVersion;
 use App\Repository\DocumentVersionRepository;
-use Symfony\Component\Uid\Uuid;
 
 final class KnowledgeIngestService
 {
@@ -18,7 +17,7 @@ final class KnowledgeIngestService
     }
 
     /**
-     * Lokaler Ingest: erzeugt NDJSON-Records für genau diese Version.
+     * Local ingest: produces deterministic NDJSON records.
      *
      * @return iterable<array<string, mixed>>
      */
@@ -34,25 +33,34 @@ final class KnowledgeIngestService
         $documentId = $doc->getId()->toRfc4122();
         $versionId = $version->getId()->toRfc4122();
 
-        // ✅ Regel: Wenn title gefüllt ist, kommt er in jeden Chunk
         $title = trim((string) $doc->getTitle());
 
         $index = 0;
 
         foreach ($chunks as $chunkText) {
-            // ✅ Prefix nur wenn title vorhanden; keine Flags, keine Meta-Schalter
+            // Prefix the title when it is set and the chunk does not already start with it.
             if ($title !== '' && !str_starts_with($chunkText, $title)) {
                 $chunkText = $title . "\n\n" . $chunkText;
             }
 
+            $chunkText = trim($chunkText);
+
+            // Deterministic chunk ID; the index keeps repeated chunk texts within one version distinct.
+            $normalizedForId = $this->normalizeForId($chunkText);
+
+            $chunkId = sha1(
+                $documentId . '|' .
+                $versionId . '|' . $index . '|' .
+                $normalizedForId
+            );
+
             yield [
-                'chunk_id' => Uuid::v4()->toRfc4122(),
+                'chunk_id' => $chunkId,
                 'document_id' => $documentId,
                 'version_id' => $versionId,
                 'chunk_index' => $index++,
                 'text' => $chunkText,
-                // ✅ checksum muss den finalen Text abbilden (inkl. Titel)
                 'checksum' => sha1($chunkText),
                 'metadata' => $this->buildMetadata($version),
             ];
@@ -60,10 +68,7 @@ final class KnowledgeIngestService
     }
 
     /**
-     * Global Reindex: iteriert streamingfähig über alle aktiven Versionen.
-     * Keine RAM-Explosion, da alles generatorbasiert bleibt.
-     *
-     * @return iterable<array<string, mixed>>
+     * Global reindex over all active versions.
      */
     public function buildAllActiveChunkRecords(): iterable
     {
@@ -76,8 +81,18 @@ final class KnowledgeIngestService
     {
         $text = preg_replace("/\n{3,}/", "\n\n", $text);
         $text = preg_replace("/[ \t]+$/m", "", $text);
+        return trim($text);
+    }
 
-        return $text;
+    /**
+     * Normalizes text for stable ID computation: lower-cases it and collapses whitespace runs.
+     * Important: the ID must not vary with minor whitespace differences.
+     */
+    private function normalizeForId(string $text): string
+    {
+        $text = mb_strtolower($text);
+        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;
+        return trim($text);
     }
 
     /**
@@ -87,7 +102,6 @@ final class KnowledgeIngestService
     {
         $doc = $version->getDocument();
 
-        // Optional: Titel/Name, falls vorhanden
         $title = null;
         if (method_exists($doc, 'getTitle')) {
             $title = $doc->getTitle();
@@ -97,8 +111,10 @@ final class KnowledgeIngestService
 
         return array_filter([
             'document_title' => $title,
-            'version_number' => method_exists($version, 'getVersionNumber') ? $version->getVersionNumber() : null,
+            'version_number' => method_exists($version, 'getVersionNumber')
+                ? $version->getVersionNumber()
+                : null,
             'file_path' => $version->getFilePath(),
         ], static fn($v) => $v !== null && $v !== '');
     }
 }