phase a audit

This commit is contained in:
team2
2026-02-22 18:04:53 +01:00
parent b3e9110dd1
commit 3b2e1bc772
10 changed files with 608 additions and 516 deletions

View File

@@ -3,15 +3,9 @@
# ------------------------------------------------------------
parameters:
# ------------------------------------------------------------
# Root
# ------------------------------------------------------------
mto.root: '%kernel.project_dir%'
mto.kernel.dir: '%mto.root%'
# ------------------------------------------------------------
# Knowledge Root (ZENTRAL)
# ------------------------------------------------------------
mto.knowledge.root: '%mto.root%/var/knowledge'
mto.knowledge.ndjson: '%mto.knowledge.root%/index.ndjson'
@@ -21,42 +15,25 @@ parameters:
mto.runtime.meta: '%mto.knowledge.root%/index_runtime.json'
mto.knowledge.upload: '%mto.knowledge.root%/uploads'
# ------------------------------------------------------------
# Tags (Document Routing)
# ------------------------------------------------------------
mto.knowledge.tags_ndjson: '%mto.knowledge.root%/tags.ndjson'
# Tag vector index outputs
mto.knowledge.vector_tags_index: '%mto.knowledge.root%/vector_tags.index'
mto.knowledge.vector_tags_index_meta: '%mto.knowledge.root%/vector_tags.index.meta.json'
# ------------------------------------------------------------
# Vector Script Directory (A2)
# ------------------------------------------------------------
mto.vector.script_dir: '%mto.root%/python/vector'
# Tag vector scripts
mto.vector.ingest_tags_script: '%mto.vector.script_dir%/vector_ingest_tags.py'
mto.vector.search_tags_script: '%mto.vector.script_dir%/vector_search_tags.py'
# Lock for tag rebuild jobs
mto.tags.rebuild_lock: '%mto.knowledge.root%/locks/tag_rebuild.lock'
# Backward compatibility alias
mto.vector.data.upload.path: '%mto.knowledge.upload%'
# ------------------------------------------------------------
# Index Configuration (Fallback Guardrails)
# ------------------------------------------------------------
mto.index.chunk_size: 800
mto.index.chunk_overlap: 100
mto.index.embedding_model: 'all-MiniLM-L6-v2'
# Must match the output size of embedding_model: all-MiniLM-L6-v2 produces 384-dimensional vectors.
# NOTE(review): if an existing index meta recorded the previous value (768), the guardrail check may demand a global reindex - confirm.
mto.index.embedding_dimension: 384
mto.index.scoring_version: 1
# ------------------------------------------------------------
# Python / Vector Runtime
# ------------------------------------------------------------
mto.vector.python_bin: '/var/www/html/.venv/bin/python3'
mto.vector.ingest_script: '%mto.vector.script_dir%/vector_ingest.py'
mto.vector.search_script: '%mto.vector.script_dir%/vector_search.py'
@@ -131,7 +108,7 @@ services:
alias: App\Knowledge\Retrieval\CachedRetriever
# ------------------------------------------------------------
# Index Configuration Provider (DB + Fallback)
# Index Configuration Provider
# ------------------------------------------------------------
App\Index\IndexConfigurationProvider:
@@ -144,7 +121,7 @@ services:
$fallbackScoringVersion: '%mto.index.scoring_version%'
# ------------------------------------------------------------
# Index Meta Manager (uses Provider)
# Index Meta Manager
# ------------------------------------------------------------
App\Index\IndexMetaManager:
@@ -167,13 +144,24 @@ services:
$pythonBin: '%mto.vector.python_bin%'
$scriptPath: '%mto.vector.ingest_script%'
$indexNdjsonPath: '%mto.knowledge.ndjson%'
$indexMetaPath: '%mto.knowledge.index_meta%'
$vectorIndexPath: '%mto.knowledge.vector_index%'
$timeoutSeconds: '%mto.vector.timeout%'
$configurationProvider: '@App\Index\IndexConfigurationProvider'
# ------------------------------------------------------------
# Tags Export (Document Routing)
# Ingest Layer (Phase B Refactor)
# ------------------------------------------------------------
App\Ingest\GuardrailValidator: ~
App\Ingest\ChunkWriteService: ~
App\Ingest\VectorRebuildService: ~
App\Ingest\IngestFlow: ~
# ------------------------------------------------------------
# Tags Export
# ------------------------------------------------------------
App\Tag\TagNdjsonExporter:
@@ -202,7 +190,7 @@ services:
App\Tag\TagRoutingService: ~
# ------------------------------------------------------------
# Tag Rebuild Jobs (8A)
# Tag Rebuild Jobs
# ------------------------------------------------------------
App\Service\TagRebuildJobService:

View File

@@ -11,7 +11,8 @@ from pathlib import Path
parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
parser.add_argument("--index", required=True, help="Path to index.ndjson")
parser.add_argument("--out", required=True, help="Path to output vector.index")
parser.add_argument("--out", required=True, help="Path to output vector.index (tmp)")
parser.add_argument("--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model")
args = parser.parse_args()
@@ -82,13 +83,7 @@ with open(index_path, "r", encoding="utf-8") as f:
if not texts:
print("No chunks found. Removing vector index.")
if out_path.exists():
out_path.unlink()
meta_path = out_path.with_suffix(".meta.json")
if meta_path.exists():
meta_path.unlink()
# Entferne final erst später in PHP atomar
sys.exit(0)
print(f"Loaded {len(texts)} chunks.")
@@ -119,15 +114,18 @@ index.add(embeddings)
# Ensure output directory exists
out_path.parent.mkdir(parents=True, exist_ok=True)
# ---------------------------------------------------------
# Write FAISS index (tmp)
# ---------------------------------------------------------
print(f"Writing FAISS index to {out_path}")
faiss.write_index(index, str(out_path))
# ---------------------------------------------------------
# Write ID mapping meta
# Write ID mapping meta (tmp)
# ---------------------------------------------------------
meta_path = out_path.with_suffix(".meta.json")
meta_tmp_path = Path(str(out_path) + ".meta.json")
with open(meta_path, "w", encoding="utf-8") as f:
with open(meta_tmp_path, "w", encoding="utf-8") as f:
json.dump(ids, f)
print(f"Indexed {len(ids)} chunks successfully.")

View File

@@ -21,12 +21,10 @@ use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
use Symfony\Component\Routing\Attribute\Route;
use Symfony\Component\Uid\Uuid;
use function function_exists;
#[Route('/admin/documents')]
class DocumentController extends AbstractController
{
#[Route('', name: 'admin_documents')]
public function index(EntityManagerInterface $em): Response
{
@@ -41,7 +39,7 @@ class DocumentController extends AbstractController
->getResult();
return $this->render('admin/document/index.html.twig', [
'documents' => $documents
'documents' => $documents,
]);
}
@@ -54,7 +52,7 @@ class DocumentController extends AbstractController
{
try {
$uuid = Uuid::fromString($id);
} catch (\Exception $e) {
} catch (\Exception) {
throw new NotFoundHttpException();
}
@@ -65,7 +63,7 @@ class DocumentController extends AbstractController
}
return $this->render('admin/document/show.html.twig', [
'document' => $document
'document' => $document,
]);
}
@@ -76,57 +74,51 @@ class DocumentController extends AbstractController
FormatText $formatText,
IngestJobService $jobService,
ParameterBagInterface $params
): Response
{
if ($request->isMethod('POST')) {
): Response {
if (!$request->isMethod('POST')) {
return $this->render('admin/document/new.html.twig');
}
/** @var UploadedFile|null $file */
$file = $request->files->get('file');
if (!$file instanceof UploadedFile) {
throw new \InvalidArgumentException('No valid file uploaded.');
}
$rawTitle = $request->request->get('title');
$title = is_string($rawTitle) && $rawTitle !== ''
? $rawTitle
: $formatText->slugify($file->getClientOriginalName());
if (!$title) {
$this->addFlash('error', 'Titel ist erforderlich.');
return $this->redirectToRoute('admin_document_new');
}
$uploadDir = $params->get('mto.vector.data.upload.path');
$uploadDir = (string)$params->get('mto.vector.data.upload.path');
$this->ensureDir($uploadDir);
if (!is_dir($uploadDir)) {
mkdir($uploadDir, 0777, true);
}
$newFilename = uniqid() . '_' . $file->getClientOriginalName();
$newFilename = uniqid('', true) . '_' . $file->getClientOriginalName();
try {
$file->move($uploadDir, $newFilename);
} catch (FileException $e) {
} catch (FileException) {
throw new \RuntimeException('File upload failed.');
}
$filePath = $uploadDir . '/' . $newFilename;
// Dokument erstellen
$document = $documentService->createDocument(
$title,
$filePath,
$this->getUser()
);
// ---------------------------------------------------------
// AUTO-INTEGRATION: gleicher Flow wie "Version aktivieren"
// ---------------------------------------------------------
$version = $document->getCurrentVersion();
if (!$version instanceof DocumentVersion) {
$this->addFlash('danger', 'Dokument erstellt, aber es wurde keine aktuelle Version erzeugt.');
return $this->redirectToRoute('admin_documents');
}
$job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE,
@@ -137,33 +129,19 @@ class DocumentController extends AbstractController
IngestJob::STATUS_QUEUED
);
$projectDir = (string)$this->getParameter('kernel.project_dir');
$console = $projectDir . '/bin/console';
$cmd = sprintf(
'%s %s %s %s > /dev/null 2>&1 &',
escapeshellarg($console),
escapeshellarg('mto:agent:ingest:run'),
escapeshellarg((string)$job->getId()),
escapeshellarg('--no-interaction'),
);
if (!function_exists('exec')) {
if (!$this->canExec()) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('danger', 'Dokument erstellt, aber Ingest konnte nicht asynchron gestartet werden.');
$this->addFlash('danger', 'Dokument erstellt, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_documents');
}
exec($cmd);
$this->startIngestJob((string)$job->getId());
return $this->redirectToRoute('admin_job_show', [
'id' => (string)$job->getId(),
]);
}
return $this->render('admin/document/new.html.twig');
}
#[Route('/{id}/version/new', name: 'admin_document_version_new', requirements: ['id' => '[0-9a-fA-F\-]{36}'])]
public function newVersion(
string $id,
@@ -171,35 +149,34 @@ class DocumentController extends AbstractController
EntityManagerInterface $em,
DocumentService $documentService,
ParameterBagInterface $params
): Response
{
): Response {
$document = $em->getRepository(Document::class)->find($id);
if (!$document) {
throw $this->createNotFoundException();
}
if ($request->isMethod('POST')) {
if (!$request->isMethod('POST')) {
return $this->render('admin/document/new_version.html.twig', [
'document' => $document,
]);
}
/** @var UploadedFile|null $file */
$file = $request->files->get('file');
if (!$file) {
if (!$file instanceof UploadedFile) {
$this->addFlash('error', 'Datei ist erforderlich.');
return $this->redirectToRoute('admin_document_version_new', ['id' => $id]);
}
$uploadDir = $params->get('mto.vector.data.upload.path');
$uploadDir = (string)$params->get('mto.vector.data.upload.path');
$this->ensureDir($uploadDir);
if (!is_dir($uploadDir)) {
mkdir($uploadDir, 0777, true);
}
$newFilename = uniqid() . '_' . $file->getClientOriginalName();
$newFilename = uniqid('', true) . '_' . $file->getClientOriginalName();
try {
$file->move($uploadDir, $newFilename);
} catch (FileException $e) {
} catch (FileException) {
throw new \RuntimeException('File upload failed.');
}
@@ -214,11 +191,6 @@ class DocumentController extends AbstractController
return $this->redirectToRoute('admin_document_show', ['id' => $id]);
}
return $this->render('admin/document/new_version.html.twig', [
'document' => $document
]);
}
#[Route(
'/version/{versionId}/activate',
name: 'admin_document_version_activate',
@@ -231,27 +203,18 @@ class DocumentController extends AbstractController
EntityManagerInterface $em,
DocumentService $documentService,
IngestJobService $jobService,
): RedirectResponse
{
if (!$this->isCsrfTokenValid('activate_version_' . $versionId, $request->request->get('_token'))) {
): RedirectResponse {
if (!$this->isCsrfTokenValid('activate_version_' . $versionId, (string)$request->request->get('_token'))) {
throw $this->createAccessDeniedException();
}
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
if (!$version) {
throw $this->createNotFoundException();
}
try {
$documentService->activateVersion($version);
// ---------------------------------------------------------
// Saubere IngestJob-Integration:
// 1) Job als QUEUED anlegen (spezieller Typ für Aktivierung)
// 2) Symfony-Command im Hintergrund starten
// 3) Direkt auf Job-Detailseite redirecten (Loader + Polling)
// ---------------------------------------------------------
$job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE,
@@ -262,28 +225,15 @@ class DocumentController extends AbstractController
IngestJob::STATUS_QUEUED
);
// Hintergrundprozess starten (Provider-kompatibel, kein Worker/Daemon)
$projectDir = (string)$this->getParameter('kernel.project_dir');
$console = $projectDir . '/bin/console';
$cmd = sprintf(
'%s %s %s %s > /dev/null 2>&1 &',
escapeshellarg($console),
escapeshellarg('mto:agent:ingest:run'),
escapeshellarg((string)$job->getId()),
escapeshellarg('--no-interaction'),
);
// Best effort: wenn exec deaktiviert ist, sauber abbrechen.
if (!function_exists('exec')) {
if (!$this->canExec()) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('danger', 'Aktivierung ok, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_document_show', [
'id' => $version->getDocument()->getId()
'id' => $version->getDocument()->getId(),
]);
}
exec($cmd);
$this->startIngestJob((string)$job->getId());
$this->addFlash('success', 'Version aktiviert. Ingest-Job wurde erstellt und gestartet.');
@@ -295,7 +245,7 @@ class DocumentController extends AbstractController
}
return $this->redirectToRoute('admin_document_show', [
'id' => $version->getDocument()->getId()
'id' => $version->getDocument()->getId(),
]);
}
@@ -310,19 +260,17 @@ class DocumentController extends AbstractController
Request $request,
EntityManagerInterface $em,
IngestJobService $jobService,
): ?RedirectResponse
{
$dryRun = false;
if (!$this->isCsrfTokenValid('ingest_version_' . $versionId, $request->request->get('_token'))) {
): ?RedirectResponse {
if (!$this->isCsrfTokenValid('ingest_version_' . $versionId, (string)$request->request->get('_token'))) {
throw $this->createAccessDeniedException();
}
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
if (!$version) {
throw $this->createNotFoundException();
}
/** @var IngestJob|null $existing */
$existing = $em->getRepository(IngestJob::class)
->findOneBy(
['documentVersionId' => $version->getId()],
@@ -333,13 +281,6 @@ class DocumentController extends AbstractController
return null;
}
// ---------------------------------------------------------
// Asynchroner Ingest (ohne Messenger):
// 1) Job als QUEUED anlegen
// 2) Symfony-Command im Hintergrund starten
// 3) Direkt auf Job-Detailseite redirecten (Loader + Polling)
// ---------------------------------------------------------
$job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT,
$this->getUser(),
@@ -349,28 +290,15 @@ class DocumentController extends AbstractController
IngestJob::STATUS_QUEUED
);
// Hintergrundprozess starten (Provider-kompatibel, kein Worker/Daemon)
$projectDir = (string)$this->getParameter('kernel.project_dir');
$console = $projectDir . '/bin/console';
$cmd = sprintf(
'%s %s %s %s > /dev/null 2>&1 &',
escapeshellarg($console),
escapeshellarg('mto:agent:ingest:run'),
escapeshellarg((string)$job->getId()),
escapeshellarg('--no-interaction'),
);
// Best effort: wenn exec deaktiviert ist, sauber abbrechen.
if (!function_exists('exec')) {
if (!$this->canExec()) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('error', 'Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_document_show', [
'id' => $version->getDocument()->getId()
'id' => $version->getDocument()->getId(),
]);
}
exec($cmd);
$this->startIngestJob((string)$job->getId());
return $this->redirectToRoute('admin_job_show', [
'id' => (string)$job->getId(),
@@ -384,17 +312,21 @@ class DocumentController extends AbstractController
)]
public function resetCompleteSystem(ParameterBagInterface $params, Connection $connection): ?RedirectResponse
{
if (!function_exists('exec')) {
if (!$this->canExec()) {
$this->addFlash('danger', 'Der Reset konnte nicht gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_dashboard');
}
@unlink($params->get('mto.knowledge.ndjson'));
@unlink($params->get('mto.knowledge.vector_index'));
@unlink($params->get('mto.knowledge.vector_index_meta'));
@unlink($params->get('mto.knowledge.index_meta'));
@unlink($params->get('mto.runtime.meta'));
exec('rm -rf ' . $params->get('mto.knowledge.upload'));
@unlink((string)$params->get('mto.knowledge.ndjson'));
@unlink((string)$params->get('mto.knowledge.vector_index'));
@unlink((string)$params->get('mto.knowledge.vector_index_meta'));
@unlink((string)$params->get('mto.knowledge.index_meta'));
@unlink((string)$params->get('mto.runtime.meta'));
$uploadDir = (string)$params->get('mto.knowledge.upload');
if ($uploadDir !== '' && is_dir($uploadDir)) {
exec('rm -rf ' . escapeshellarg($uploadDir));
}
$sql = '
SET FOREIGN_KEY_CHECKS = 0;
@@ -425,39 +357,29 @@ class DocumentController extends AbstractController
EntityManagerInterface $em,
IngestJobService $jobService,
LockService $lockService,
DocumentService $documentService
): RedirectResponse
{
if (!$this->isCsrfTokenValid('delete_document_' . $id, $request->request->get('_token'))) {
): RedirectResponse {
if (!$this->isCsrfTokenValid('delete_document_' . $id, (string)$request->request->get('_token'))) {
throw $this->createAccessDeniedException();
}
try {
$uuid = Uuid::fromString($id);
} catch (\Exception $e) {
} catch (\Exception) {
throw $this->createNotFoundException();
}
/** @var Document|null $document */
$document = $em->getRepository(Document::class)->find($uuid);
if (!$document) {
throw $this->createNotFoundException();
}
// ---------------------------------------------------------
// 🔒 Delete nur erlauben wenn kein anderer Job läuft
// ---------------------------------------------------------
if (!$lockService->acquire()) {
$this->addFlash('danger', 'Ein Ingest-Job läuft bereits. Löschen derzeit nicht möglich.');
return $this->redirectToRoute('admin_documents');
}
// Nur Test-Lock echter Lock im Orchestrator
$lockService->release();
// ---------------------------------------------------------
// 1) Delete-Job anlegen (QUEUED)
// ---------------------------------------------------------
$job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT_DELETE,
$this->getUser(),
@@ -467,27 +389,13 @@ class DocumentController extends AbstractController
IngestJob::STATUS_QUEUED
);
// ---------------------------------------------------------
// 2) Hintergrundprozess starten
// ---------------------------------------------------------
$projectDir = (string)$this->getParameter('kernel.project_dir');
$console = $projectDir . '/bin/console';
$cmd = sprintf(
'%s %s %s %s > /dev/null 2>&1 &',
escapeshellarg($console),
escapeshellarg('mto:agent:ingest:run'),
escapeshellarg((string)$job->getId()),
escapeshellarg('--no-interaction'),
);
if (!function_exists('exec')) {
if (!$this->canExec()) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('danger', 'Löschen konnte nicht gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_documents');
}
exec($cmd);
$this->startIngestJob((string)$job->getId());
$this->addFlash('success', 'Löschvorgang gestartet. Dokument wird nach Index-Rebuild entfernt.');
@@ -495,4 +403,42 @@ class DocumentController extends AbstractController
'id' => (string)$job->getId(),
]);
}
// =========================================================
// Helpers
// =========================================================
/**
 * Reports whether background processes can be spawned from this request.
 *
 * @return bool true when the exec() function is available in this PHP runtime
 */
private function canExec(): bool
{
    $execAvailable = function_exists('exec');

    return $execAvailable;
}
/**
 * Makes sure the given directory exists, creating it (recursively) when it is missing.
 *
 * @param string $dir Directory path; must not be empty
 *
 * @throws \RuntimeException when the path is empty or the directory cannot be created
 */
private function ensureDir(string $dir): void
{
    if ($dir === '') {
        throw new \RuntimeException('Upload directory not configured.');
    }

    if (is_dir($dir)) {
        return;
    }

    // A failed mkdir() is only an error if the directory still does not exist afterwards
    // (it may have appeared in the meantime).
    $created = mkdir($dir, 0777, true);
    if (!$created && !is_dir($dir)) {
        throw new \RuntimeException('Unable to create upload directory.');
    }
}
/**
 * Launches the ingest console command for the given job as a detached background process.
 *
 * The command's output is discarded and this call does not wait for the job to finish.
 *
 * @param string $jobId Identifier of the IngestJob handed to mto:agent:ingest:run
 */
private function startIngestJob(string $jobId): void
{
    $projectDir = (string)$this->getParameter('kernel.project_dir');
    $console = $projectDir . '/bin/console';

    // PHP_BINARY names the executable of the *current* SAPI. Inside a web request that is
    // php-fpm / php-cgi (or an empty string), none of which can run bin/console, so the
    // constant is only trusted for CLI-type SAPIs.
    $phpBinary = PHP_BINARY;
    if ($phpBinary === '' || !in_array(PHP_SAPI, ['cli', 'cli-server', 'phpdbg'], true)) {
        $bundledCli = PHP_BINDIR . '/php';
        // Fall back to a PATH lookup of "php" when no CLI binary sits in PHP's bin directory.
        $phpBinary = is_executable($bundledCli) ? $bundledCli : 'php';
    }

    // Keep --no-interaction ahead of the command name (original author's note: it is a global flag).
    $cmd = sprintf(
        '%s %s %s %s %s > /dev/null 2>&1 &',
        escapeshellarg($phpBinary),
        escapeshellarg($console),
        '--no-interaction',
        escapeshellarg('mto:agent:ingest:run'),
        escapeshellarg($jobId),
    );

    exec($cmd);
}
}

View File

@@ -16,14 +16,12 @@ final class IndexMetaManager
IndexConfigurationProvider $provider
) {
$this->metaPath = $metaPath;
$this->provider = $provider;
// runtime liegt im selben Verzeichnis
$this->runtimePath = $runTimePath;
$this->provider = $provider;
}
// =====================================================
// META (Governance unverändert lassen!)
// META (Governance unverändert inhaltlich)
// =====================================================
public function ensureExists(): void
@@ -39,10 +37,12 @@ final class IndexMetaManager
return null;
}
return json_decode(
$data = json_decode(
(string) file_get_contents($this->metaPath),
true
);
return is_array($data) ? $data : null;
}
public function validateAgainstCurrent(): void
@@ -85,18 +85,7 @@ final class IndexMetaManager
$config->toStructureArray()
);
$dir = dirname($this->metaPath);
if (!is_dir($dir)) {
mkdir($dir, 0777, true);
}
file_put_contents(
$this->metaPath,
json_encode(
$payload,
JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES
)
);
$this->atomicWriteJson($this->metaPath, $payload);
}
// =====================================================
@@ -109,20 +98,12 @@ final class IndexMetaManager
return;
}
$dir = dirname($this->runtimePath);
if (!is_dir($dir)) {
mkdir($dir, 0777, true);
}
$payload = [
'chunk_count' => 0,
'last_rebuild_at' => null,
];
file_put_contents(
$this->runtimePath,
json_encode($payload, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)
);
$this->atomicWriteJson($this->runtimePath, $payload);
}
public function updateRuntimeStats(int $chunkCount): void
@@ -134,10 +115,7 @@ final class IndexMetaManager
'last_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
];
file_put_contents(
$this->runtimePath,
json_encode($payload, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)
);
$this->atomicWriteJson($this->runtimePath, $payload);
}
public function getRuntimeChunkCount(): int
@@ -151,5 +129,37 @@ final class IndexMetaManager
return (int)($data['chunk_count'] ?? 0);
}
}
// =====================================================
// INTERNAL ATOMIC JSON WRITE
// =====================================================
/**
 * Serialises $payload as pretty-printed JSON and writes it to $path via a temporary file.
 *
 * The content is first written to "<path>.tmp" and then moved over the target with rename().
 *
 * @param string $path    Destination file
 * @param array  $payload Data to encode
 *
 * @throws \RuntimeException when the directory cannot be created, encoding fails,
 *                           the temporary file cannot be written, or the rename fails
 */
private function atomicWriteJson(string $path, array $payload): void
{
    $targetDir = dirname($path);
    if (!is_dir($targetDir)) {
        $created = mkdir($targetDir, 0777, true);
        // A failed mkdir() is tolerated if the directory exists afterwards.
        if (!$created && !is_dir($targetDir)) {
            throw new \RuntimeException('Unable to create directory for meta/runtime');
        }
    }

    $encoded = json_encode($payload, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);
    if ($encoded === false) {
        throw new \RuntimeException('Unable to encode JSON payload');
    }

    $stagingFile = $path . '.tmp';
    $written = file_put_contents($stagingFile, $encoded);
    if ($written === false) {
        throw new \RuntimeException('Unable to write temporary JSON file');
    }

    if (rename($stagingFile, $path)) {
        return;
    }

    // Do not leave the staging file behind when the switch failed.
    @unlink($stagingFile);
    throw new \RuntimeException('Atomic switch failed for JSON file');
}
}

View File

@@ -0,0 +1,72 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
use App\Entity\DocumentVersion;
use App\Knowledge\ChunkManager;
use Symfony\Component\Uid\Uuid;
/**
 * Write-side facade of the ingest layer: every operation is forwarded to ChunkManager.
 */
final readonly class ChunkWriteService
{
    public function __construct(
        private ChunkManager $chunkManager,
    ) {}

    /**
     * Returns the index path reported by the chunk manager.
     */
    public function getIndexPath(): string
    {
        $indexPath = $this->chunkManager->getIndexPath();

        return $indexPath;
    }

    /**
     * Returns the total chunk count reported by the chunk manager.
     */
    public function countAllChunks(): int
    {
        $chunkTotal = $this->chunkManager->countAllChunks();

        return $chunkTotal;
    }

    /**
     * Asks the chunk manager to compact the index for the given document.
     */
    public function compactByDocumentId(Uuid $documentId): void
    {
        $this->chunkManager->compactByDocument($documentId);
    }

    /**
     * Appends the given chunk records through the chunk manager.
     *
     * @param iterable<array<string,mixed>> $chunks
     */
    public function appendChunks(iterable $chunks): void
    {
        $this->chunkManager->appendChunks($chunks);
    }

    /**
     * Local ingest for a single DocumentVersion.
     *
     * Steps:
     *  1. compact the index for the version's document (drops its existing chunks, per the original note)
     *  2. append the new chunks
     *
     * @param iterable<array<string,mixed>> $chunks
     *
     * @throws \RuntimeException when the document ID is not a Uuid
     */
    public function writeForDocumentVersion(
        DocumentVersion $version,
        iterable $chunks
    ): void {
        $docId = $version->getDocument()->getId();

        if ($docId instanceof Uuid) {
            // Same two manager calls as the public single-step methods, in the same order.
            $this->compactByDocumentId($docId);
            $this->appendChunks($chunks);

            return;
        }

        throw new \RuntimeException('Document ID must be a Uuid instance');
    }

    /**
     * Full rewrite of the NDJSON index (global reindex), delegated to the chunk manager.
     *
     * @param iterable<array<string,mixed>> $allChunks
     */
    public function rewriteAll(iterable $allChunks): void
    {
        $this->chunkManager->rewriteAll($allChunks);
    }
}

View File

@@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
use App\Index\IndexMetaManager;
/**
 * Guardrail check for the ingest layer; the actual validation lives in IndexMetaManager.
 */
final readonly class GuardrailValidator
{
    public function __construct(private IndexMetaManager $metaManager) {}

    /**
     * Runs IndexMetaManager::validateAgainstCurrent() and lets any exception it raises propagate.
     *
     * Per the original author's note, a failure means a local ingest is not compatible
     * and a global reindex is required.
     */
    public function validateOrThrow(): void
    {
        $manager = $this->metaManager;
        $manager->validateAgainstCurrent();
    }
}

View File

@@ -7,9 +7,7 @@ namespace App\Ingest;
use App\Entity\Document;
use App\Entity\DocumentVersion;
use App\Index\IndexMetaManager;
use App\Knowledge\ChunkManager;
use App\Knowledge\Ingest\KnowledgeIngestService;
use App\Vector\VectorIndexBuilder;
use Doctrine\ORM\EntityManagerInterface;
use Psr\Log\LoggerInterface;
use Symfony\Component\Uid\Uuid;
@@ -21,45 +19,52 @@ final readonly class IngestFlow
public function __construct(
private KnowledgeIngestService $knowledgeIngestService,
private ChunkManager $chunkManager,
private VectorIndexBuilder $vectorBuilder,
private GuardrailValidator $guardrailValidator,
private ChunkWriteService $chunkWriteService,
private VectorRebuildService $vectorRebuildService,
private IndexMetaManager $metaManager,
private IngestLockService $lockService,
private LoggerInterface $logger,
private EntityManagerInterface $em,
) {}
// =========================================================
// DOCUMENT INGEST (STREAMING SAFE)
// DOCUMENT INGEST
// =========================================================
public function ingestDocumentVersion(DocumentVersion $version): void
{
$this->metaManager->validateAgainstCurrent();
$this->withLock(function () use ($version): void {
$this->guardrailValidator->validateOrThrow();
$version->setIngestStatus(DocumentVersion::INGEST_RUNNING);
$this->em->flush();
try {
$documentId = $version->getDocument()->getId();
if (!$documentId instanceof Uuid) {
throw new \RuntimeException('Document ID must be a Uuid instance');
}
$this->chunkManager->compactByDocument($version->getDocument()->getId());
// 1) Alte Chunks entfernen
$this->chunkWriteService->compactByDocumentId($documentId);
$existing = $this->chunkManager->countAllChunks();
// 2) Existing Chunks nach Compaction zählen
$existing = $this->chunkWriteService->countAllChunks();
$incoming = 0;
$warned = false;
$generator = $this->knowledgeIngestService->buildChunkRecords($version);
$wrappedGenerator = (function () use ($generator, $existing, &$incoming) {
$wrappedGenerator = (function () use ($generator, $existing, &$incoming, &$warned) {
foreach ($generator as $record) {
$incoming++;
$total = $existing + $incoming;
if ($total >= self::CHUNK_LIMIT_WARN) {
// Nur einmal warnen
if ($incoming === 1 || $total === self::CHUNK_LIMIT_WARN) {
// Logging erfolgt außerhalb des Streams final
}
if (!$warned && $total >= self::CHUNK_LIMIT_WARN) {
$warned = true;
}
if ($total > self::CHUNK_LIMIT_HARD) {
@@ -68,40 +73,47 @@ final readonly class IngestFlow
yield $record;
}
})();
$this->chunkManager->appendChunks($wrappedGenerator);
// 3) Streaming Append
$this->chunkWriteService->appendChunks($wrappedGenerator);
$total = $existing + $incoming;
if ($total >= self::CHUNK_LIMIT_WARN) {
if ($warned) {
$this->logger->warning('Chunk count approaching limit.', [
'existing' => $existing,
'incoming' => $incoming,
'total' => $total,
'document' => (string)$documentId,
'version' => (string)$version->getId(),
]);
}
$this->rebuildIndex(false);
// 4) Vector Rebuild + Runtime Update
$this->vectorRebuildService->rebuild();
$version->setIngestStatus(DocumentVersion::INGEST_INDEXED);
$this->em->flush();
} catch (\Throwable $e) {
$version->setIngestStatus(DocumentVersion::INGEST_FAILED);
$this->em->flush();
throw $e;
}
});
}
// =========================================================
// GLOBAL REINDEX (STREAMING SAFE)
// GLOBAL REINDEX
// =========================================================
public function globalReindex(): void
{
$this->withLock(function (): void {
// Global Reindex ist der Drift-Fix → keine Guardrail-Blockade hier.
$activeDocuments = $this->em
->getRepository(Document::class)
->createQueryBuilder('d')
@@ -111,46 +123,70 @@ final readonly class IngestFlow
->getResult();
if (empty($activeDocuments)) {
throw new \RuntimeException(
'Global Reindex abgebrochen: Es sind keine aktiven Dokumente vorhanden.'
);
throw new \RuntimeException('Global Reindex aborted: no active documents found.');
}
$existing = 0; // rewriteAll ersetzt alles
$incoming = 0;
$warned = false;
$generator = $this->knowledgeIngestService->buildAllActiveChunkRecords();
$wrappedGenerator = (function () use ($generator, &$incoming) {
// 1) "Peek" ohne RAM: erstes Element ziehen
$first = null;
foreach ($generator as $record) {
$first = $record;
$incoming++;
break;
}
if ($first === null) {
throw new \RuntimeException('Global Reindex aborted: no chunks generated.');
}
// 2) Stream bauen, Limits prüfen
$stream = (function () use ($first, $generator, $existing, &$incoming, &$warned) {
// first
$total = $existing + $incoming;
if (!$warned && $total >= self::CHUNK_LIMIT_WARN) {
$warned = true;
}
if ($total > self::CHUNK_LIMIT_HARD) {
throw new \RuntimeException('Chunk limit exceeded.');
}
yield $first;
foreach ($generator as $record) {
$incoming++;
$total = $existing + $incoming;
if (!$warned && $total >= self::CHUNK_LIMIT_WARN) {
$warned = true;
}
if ($total > self::CHUNK_LIMIT_HARD) {
throw new \RuntimeException('Chunk limit exceeded.');
}
yield $record;
}
})();
// Prüfen ob überhaupt etwas kommt (ohne alles in RAM zu ziehen)
$peekIterator = $wrappedGenerator instanceof \Iterator
? $wrappedGenerator
: (function () use ($wrappedGenerator) {
foreach ($wrappedGenerator as $item) {
yield $item;
}
})();
// 3) Rewrite + Rebuild
$this->chunkWriteService->rewriteAll($stream);
if (!$peekIterator->valid()) {
$peekIterator->rewind();
if ($warned) {
$this->logger->warning('Chunk count approaching limit after global reindex.', [
'incoming' => $incoming,
'total' => $incoming,
]);
}
if (!$peekIterator->valid()) {
throw new \RuntimeException(
'Global Reindex abgebrochen: Es wurden keine Chunks erzeugt.'
);
}
$this->vectorRebuildService->rebuild();
$this->chunkManager->rewriteAll($peekIterator);
$this->rebuildIndex(true);
// Governance: Version erhöhen
$this->metaManager->writeMetaForGlobalReindex();
});
}
// =========================================================
@@ -159,8 +195,11 @@ final readonly class IngestFlow
public function deleteDocument(Uuid $documentId): void
{
$this->metaManager->validateAgainstCurrent();
$this->withLock(function () use ($documentId): void {
$this->guardrailValidator->validateOrThrow();
/** @var Document|null $document */
$document = $this->em
->getRepository(Document::class)
->find($documentId);
@@ -169,32 +208,41 @@ final readonly class IngestFlow
throw new \RuntimeException('Document not found.');
}
$this->chunkManager->compactByDocument($documentId);
// 1) Chunks entfernen
$this->chunkWriteService->compactByDocumentId($documentId);
// 2) FK-sicher löschen: currentVersion lösen (verhindert „Version zeigt noch auf DocumentVersion“)
if (method_exists($document, 'getCurrentVersion') && method_exists($document, 'setCurrentVersion')) {
if ($document->getCurrentVersion() !== null) {
$document->setCurrentVersion(null);
$this->em->flush();
}
}
// 3) Dokument entfernen
$this->em->remove($document);
$this->em->flush();
$this->rebuildIndex(false);
// 4) Vector rebuild + runtime update
$this->vectorRebuildService->rebuild();
});
}
// =========================================================
// CENTRAL REBUILD
// INTERNALS
// =========================================================
private function rebuildIndex(bool $isGlobal): void
/**
* @param callable():void $fn
*/
private function withLock(callable $fn): void
{
$this->vectorBuilder->rebuildFromNdjson();
$this->lockService->acquire();
if ($isGlobal) {
$this->metaManager->writeMetaForGlobalReindex();
try {
$fn();
} finally {
$this->lockService->release();
}
$this->updateChunkCount();
}
private function updateChunkCount(): void
{
$chunkCount = $this->chunkManager->countAllChunks();
$this->metaManager->updateRuntimeStats($chunkCount);
}
}

View File

@@ -0,0 +1,51 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
/**
 * Process-level mutex for ingest operations, backed by an advisory file lock.
 *
 * The lock file lives at <projectDir>/var/knowledge/locks/ingest.lock. The lock
 * is taken non-blocking: if it is already held, acquire() fails immediately
 * instead of waiting.
 */
final class IngestLockService
{
    private string $lockFilePath;

    /** @var resource|null Open handle that currently holds the lock; null when no lock is held. */
    private $handle = null;

    /**
     * @param string $projectDir Project root; a trailing slash is tolerated.
     */
    public function __construct(string $projectDir)
    {
        $this->lockFilePath = rtrim($projectDir, '/') . '/var/knowledge/locks/ingest.lock';
    }

    /**
     * Takes the exclusive ingest lock without blocking.
     *
     * @throws \RuntimeException If the lock directory cannot be created, the lock
     *                           file cannot be opened, or the lock is already held.
     */
    public function acquire(): void
    {
        $dir = dirname($this->lockFilePath);

        // The trailing is_dir() re-check tolerates a concurrent mkdir by another process.
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            throw new \RuntimeException('Unable to create lock directory.');
        }

        // Drop any handle left over from an earlier acquire() on this instance so
        // it is closed explicitly rather than overwritten.
        $this->release();

        // Work on a local variable: the property must only ever contain a valid,
        // locked resource (or null), otherwise release()/__destruct() would pass
        // `false` to flock() and raise a TypeError.
        $handle = fopen($this->lockFilePath, 'c');

        if ($handle === false) {
            throw new \RuntimeException('Unable to open ingest lock file.');
        }

        if (!flock($handle, LOCK_EX | LOCK_NB)) {
            // Do not leak the descriptor when somebody else holds the lock.
            fclose($handle);

            throw new \RuntimeException('Another ingest process is already running.');
        }

        $this->handle = $handle;
    }

    /**
     * Releases the lock if this instance holds it. Safe to call repeatedly.
     */
    public function release(): void
    {
        if (!is_resource($this->handle)) {
            $this->handle = null;

            return;
        }

        flock($this->handle, LOCK_UN);
        fclose($this->handle);
        $this->handle = null;
    }

    /**
     * Releases the lock when the service object is destroyed.
     */
    public function __destruct()
    {
        $this->release();
    }
}

View File

@@ -0,0 +1,38 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
use App\Index\IndexMetaManager;
use App\Knowledge\ChunkManager;
use App\Vector\VectorIndexBuilder;
/**
 * Bundles the steps that follow any change to the chunk store: rebuild the
 * vector index, then refresh the runtime statistics with the current chunk count.
 */
final readonly class VectorRebuildService
{
    public function __construct(
        private VectorIndexBuilder $vectorBuilder,
        private IndexMetaManager $metaManager,
        private ChunkManager $chunkManager,
    ) {}

    /**
     * Runs a full vector index rebuild and updates the runtime stats afterwards.
     *
     * Steps, in order:
     *  1. Ask the vector builder to rebuild the index from the NDJSON file.
     *  2. Ask the chunk manager for the total number of chunks.
     *  3. Hand that number to the meta manager as the new runtime stats.
     *
     * @param string|null $logPath Optional log file path, forwarded unchanged to the vector builder.
     */
    public function rebuild(?string $logPath = null): void
    {
        // The index must be rebuilt before counting, so the stats are written
        // only after a successful rebuild.
        $this->vectorBuilder->rebuildFromNdjson($logPath);

        $this->metaManager->updateRuntimeStats(
            $this->chunkManager->countAllChunks()
        );
    }
}

View File

@@ -13,7 +13,6 @@ final class VectorIndexBuilder
private string $pythonBin;
private string $scriptPath;
private string $indexNdjsonPath;
private string $indexMetaPath;
private string $vectorIndexPath;
private string $vectorMetaPath;
private int $timeoutSeconds;
@@ -24,7 +23,6 @@ final class VectorIndexBuilder
string $pythonBin,
string $scriptPath,
string $indexNdjsonPath,
string $indexMetaPath,
string $vectorIndexPath,
int $timeoutSeconds,
IndexConfigurationProvider $configurationProvider
@@ -32,54 +30,30 @@ final class VectorIndexBuilder
$this->pythonBin = $pythonBin;
$this->scriptPath = $scriptPath;
$this->indexNdjsonPath = $indexNdjsonPath;
$this->indexMetaPath = $indexMetaPath;
$this->vectorIndexPath = $vectorIndexPath;
$this->vectorMetaPath = $vectorIndexPath . '.meta.json';
$this->timeoutSeconds = $timeoutSeconds;
$this->configurationProvider = $configurationProvider;
}
/**
* Rebuild FAISS Index deterministisch aus index.ndjson.
*/
public function rebuildFromNdjson(?string $logPath = null): void
{
$this->assertPreconditions();
// --------------------------------------------
// 🔵 FALL: NDJSON ist leer → kein Vector Index
// --------------------------------------------
if (!is_file($this->indexNdjsonPath) || filesize($this->indexNdjsonPath) === 0) {
@unlink($this->vectorIndexPath);
@unlink($this->vectorMetaPath);
if ($logPath !== null) {
@file_put_contents(
$logPath,
"NDJSON empty → Vector index removed\n",
FILE_APPEND
);
}
return;
}
// --------------------------------------------
// 🟢 FALL: NDJSON enthält Chunks
// --------------------------------------------
if (!is_file($this->indexMetaPath)) {
$this->initializeIndexMeta();
}
$indexMeta = $this->readIndexMeta();
$embeddingModel = $indexMeta['embedding_model'];
$config = $this->configurationProvider->getConfiguration();
$embeddingModel = $config->getEmbeddingModel();
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
$tmpVectorMetaPath = $tmpVectorIndexPath . '.meta.json';
@unlink($tmpVectorIndexPath);
@unlink($this->vectorMetaPath);
@unlink($tmpVectorMetaPath);
$cmd = [
$this->pythonBin,
@@ -94,107 +68,51 @@ final class VectorIndexBuilder
$this->runProcess($process, $logPath);
$this->validatePythonOutputs($tmpVectorIndexPath);
$this->validateOutputs($tmpVectorIndexPath, $tmpVectorMetaPath);
$this->atomicSwitch($tmpVectorIndexPath);
$this->atomicSwitchPair(
$tmpVectorIndexPath,
$tmpVectorMetaPath
);
}
// -----------------------------------------------------
// Internals
// -----------------------------------------------------
private function assertPreconditions(): void
{
if (!is_file($this->scriptPath)) {
throw new \RuntimeException(
'Vector build script not found at: ' . $this->scriptPath
);
throw new \RuntimeException('Vector build script not found.');
}
if (!is_file($this->indexNdjsonPath)) {
throw new \RuntimeException(
'index.ndjson not found at: ' . $this->indexNdjsonPath
);
throw new \RuntimeException('index.ndjson not found.');
}
}
private function readIndexMeta(): array
private function validateOutputs(string $tmpIndex, string $tmpMeta): void
{
$meta = json_decode(
(string) file_get_contents($this->indexMetaPath),
true
);
if (!is_array($meta) || empty($meta['embedding_model'])) {
throw new \RuntimeException('Invalid index_meta.json');
}
return $meta;
}
private function initializeIndexMeta(): void
{
$dir = dirname($this->indexMetaPath);
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
throw new \RuntimeException('Cannot create knowledge directory');
}
$config = $this->configurationProvider->getConfiguration();
$data = [
'index_version' => 1,
'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
'embedding_model' => $config->getEmbeddingModel(),
'embedding_dimension' => $config->getEmbeddingDimension(),
'chunk_size' => $config->getChunkSize(),
'chunk_overlap' => $config->getChunkOverlap(),
'scoring_version' => $config->getScoringVersion(),
'index_format' => 'ndjson',
'vector_backend' => 'faiss',
];
file_put_contents(
$this->indexMetaPath,
json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)
);
}
private function validatePythonOutputs(string $tmpVectorIndexPath): void
{
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
if (!is_file($tmpIndex) || filesize($tmpIndex) === 0) {
throw new \RuntimeException('Vector index tmp missing or empty');
}
if (!is_file($this->vectorMetaPath) || filesize($this->vectorMetaPath) === 0) {
throw new \RuntimeException('Vector meta missing or empty');
if (!is_file($tmpMeta) || filesize($tmpMeta) === 0) {
throw new \RuntimeException('Vector meta tmp missing or empty');
}
}
private function atomicSwitch(string $tmpVectorIndexPath): void
private function atomicSwitchPair(string $tmpIndex, string $tmpMeta): void
{
if (!rename($tmpVectorIndexPath, $this->vectorIndexPath)) {
if (!rename($tmpIndex, $this->vectorIndexPath)) {
throw new \RuntimeException('Atomic switch failed for vector index');
}
if (!rename($tmpMeta, $this->vectorMetaPath)) {
throw new \RuntimeException('Atomic switch failed for vector meta');
}
}
private function runProcess(Process $process, ?string $logPath): void
{
if ($logPath !== null) {
@file_put_contents($logPath, "=== VectorIndexBuilder START ===\n", FILE_APPEND);
}
$process->run();
if (!$process->isSuccessful()) {
if ($logPath !== null) {
@file_put_contents($logPath, $process->getErrorOutput(), FILE_APPEND);
}
throw new ProcessFailedException($process);
}
if ($logPath !== null) {
@file_put_contents($logPath, "=== VectorIndexBuilder OK ===\n", FILE_APPEND);
}
}
}