phase a audit

This commit is contained in:
team2
2026-02-22 18:04:53 +01:00
parent b3e9110dd1
commit 3b2e1bc772
10 changed files with 608 additions and 516 deletions

View File

@@ -3,15 +3,9 @@
# ------------------------------------------------------------ # ------------------------------------------------------------
parameters: parameters:
# ------------------------------------------------------------
# Root
# ------------------------------------------------------------
mto.root: '%kernel.project_dir%' mto.root: '%kernel.project_dir%'
mto.kernel.dir: '%mto.root%' mto.kernel.dir: '%mto.root%'
# ------------------------------------------------------------
# Knowledge Root (ZENTRAL)
# ------------------------------------------------------------
mto.knowledge.root: '%mto.root%/var/knowledge' mto.knowledge.root: '%mto.root%/var/knowledge'
mto.knowledge.ndjson: '%mto.knowledge.root%/index.ndjson' mto.knowledge.ndjson: '%mto.knowledge.root%/index.ndjson'
@@ -21,42 +15,25 @@ parameters:
mto.runtime.meta: '%mto.knowledge.root%/index_runtime.json' mto.runtime.meta: '%mto.knowledge.root%/index_runtime.json'
mto.knowledge.upload: '%mto.knowledge.root%/uploads' mto.knowledge.upload: '%mto.knowledge.root%/uploads'
# ------------------------------------------------------------
# Tags (Document Routing)
# ------------------------------------------------------------
mto.knowledge.tags_ndjson: '%mto.knowledge.root%/tags.ndjson' mto.knowledge.tags_ndjson: '%mto.knowledge.root%/tags.ndjson'
# Tag vector index outputs
mto.knowledge.vector_tags_index: '%mto.knowledge.root%/vector_tags.index' mto.knowledge.vector_tags_index: '%mto.knowledge.root%/vector_tags.index'
mto.knowledge.vector_tags_index_meta: '%mto.knowledge.root%/vector_tags.index.meta.json' mto.knowledge.vector_tags_index_meta: '%mto.knowledge.root%/vector_tags.index.meta.json'
# ------------------------------------------------------------
# Vector Script Directory (A2)
# ------------------------------------------------------------
mto.vector.script_dir: '%mto.root%/python/vector' mto.vector.script_dir: '%mto.root%/python/vector'
# Tag vector scripts
mto.vector.ingest_tags_script: '%mto.vector.script_dir%/vector_ingest_tags.py' mto.vector.ingest_tags_script: '%mto.vector.script_dir%/vector_ingest_tags.py'
mto.vector.search_tags_script: '%mto.vector.script_dir%/vector_search_tags.py' mto.vector.search_tags_script: '%mto.vector.script_dir%/vector_search_tags.py'
# Lock for tag rebuild jobs
mto.tags.rebuild_lock: '%mto.knowledge.root%/locks/tag_rebuild.lock' mto.tags.rebuild_lock: '%mto.knowledge.root%/locks/tag_rebuild.lock'
# Backward compatibility alias
mto.vector.data.upload.path: '%mto.knowledge.upload%' mto.vector.data.upload.path: '%mto.knowledge.upload%'
# ------------------------------------------------------------
# Index Configuration (Fallback Guardrails)
# ------------------------------------------------------------
mto.index.chunk_size: 800 mto.index.chunk_size: 800
mto.index.chunk_overlap: 100 mto.index.chunk_overlap: 100
mto.index.embedding_model: 'all-MiniLM-L6-v2' mto.index.embedding_model: 'all-MiniLM-L6-v2'
mto.index.embedding_dimension: 768 mto.index.embedding_dimension: 768
mto.index.scoring_version: 1 mto.index.scoring_version: 1
# ------------------------------------------------------------
# Python / Vector Runtime
# ------------------------------------------------------------
mto.vector.python_bin: '/var/www/html/.venv/bin/python3' mto.vector.python_bin: '/var/www/html/.venv/bin/python3'
mto.vector.ingest_script: '%mto.vector.script_dir%/vector_ingest.py' mto.vector.ingest_script: '%mto.vector.script_dir%/vector_ingest.py'
mto.vector.search_script: '%mto.vector.script_dir%/vector_search.py' mto.vector.search_script: '%mto.vector.script_dir%/vector_search.py'
@@ -131,7 +108,7 @@ services:
alias: App\Knowledge\Retrieval\CachedRetriever alias: App\Knowledge\Retrieval\CachedRetriever
# ------------------------------------------------------------ # ------------------------------------------------------------
# Index Configuration Provider (DB + Fallback) # Index Configuration Provider
# ------------------------------------------------------------ # ------------------------------------------------------------
App\Index\IndexConfigurationProvider: App\Index\IndexConfigurationProvider:
@@ -144,7 +121,7 @@ services:
$fallbackScoringVersion: '%mto.index.scoring_version%' $fallbackScoringVersion: '%mto.index.scoring_version%'
# ------------------------------------------------------------ # ------------------------------------------------------------
# Index Meta Manager (uses Provider) # Index Meta Manager
# ------------------------------------------------------------ # ------------------------------------------------------------
App\Index\IndexMetaManager: App\Index\IndexMetaManager:
@@ -167,13 +144,24 @@ services:
$pythonBin: '%mto.vector.python_bin%' $pythonBin: '%mto.vector.python_bin%'
$scriptPath: '%mto.vector.ingest_script%' $scriptPath: '%mto.vector.ingest_script%'
$indexNdjsonPath: '%mto.knowledge.ndjson%' $indexNdjsonPath: '%mto.knowledge.ndjson%'
$indexMetaPath: '%mto.knowledge.index_meta%'
$vectorIndexPath: '%mto.knowledge.vector_index%' $vectorIndexPath: '%mto.knowledge.vector_index%'
$timeoutSeconds: '%mto.vector.timeout%' $timeoutSeconds: '%mto.vector.timeout%'
$configurationProvider: '@App\Index\IndexConfigurationProvider' $configurationProvider: '@App\Index\IndexConfigurationProvider'
# ------------------------------------------------------------ # ------------------------------------------------------------
# Tags Export (Document Routing) # Ingest Layer (Phase B Refactor)
# ------------------------------------------------------------
App\Ingest\GuardrailValidator: ~
App\Ingest\ChunkWriteService: ~
App\Ingest\VectorRebuildService: ~
App\Ingest\IngestFlow: ~
# ------------------------------------------------------------
# Tags Export
# ------------------------------------------------------------ # ------------------------------------------------------------
App\Tag\TagNdjsonExporter: App\Tag\TagNdjsonExporter:
@@ -202,7 +190,7 @@ services:
App\Tag\TagRoutingService: ~ App\Tag\TagRoutingService: ~
# ------------------------------------------------------------ # ------------------------------------------------------------
# Tag Rebuild Jobs (8A) # Tag Rebuild Jobs
# ------------------------------------------------------------ # ------------------------------------------------------------
App\Service\TagRebuildJobService: App\Service\TagRebuildJobService:

View File

@@ -11,7 +11,8 @@ from pathlib import Path
parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON") parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
parser.add_argument("--index", required=True, help="Path to index.ndjson") parser.add_argument("--index", required=True, help="Path to index.ndjson")
parser.add_argument("--out", required=True, help="Path to output vector.index") parser.add_argument("--out", required=True, help="Path to output vector.index (tmp)")
parser.add_argument("--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model") parser.add_argument("--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model")
args = parser.parse_args() args = parser.parse_args()
@@ -82,13 +83,7 @@ with open(index_path, "r", encoding="utf-8") as f:
if not texts: if not texts:
print("No chunks found. Removing vector index.") print("No chunks found. Removing vector index.")
if out_path.exists(): # Entferne final erst später in PHP atomar
out_path.unlink()
meta_path = out_path.with_suffix(".meta.json")
if meta_path.exists():
meta_path.unlink()
sys.exit(0) sys.exit(0)
print(f"Loaded {len(texts)} chunks.") print(f"Loaded {len(texts)} chunks.")
@@ -119,15 +114,18 @@ index.add(embeddings)
# Ensure output directory exists # Ensure output directory exists
out_path.parent.mkdir(parents=True, exist_ok=True) out_path.parent.mkdir(parents=True, exist_ok=True)
# ---------------------------------------------------------
# Write FAISS index (tmp)
# ---------------------------------------------------------
print(f"Writing FAISS index to {out_path}") print(f"Writing FAISS index to {out_path}")
faiss.write_index(index, str(out_path)) faiss.write_index(index, str(out_path))
# --------------------------------------------------------- # ---------------------------------------------------------
# Write ID mapping meta # Write ID mapping meta (tmp)
# --------------------------------------------------------- # ---------------------------------------------------------
meta_path = out_path.with_suffix(".meta.json") meta_tmp_path = Path(str(out_path) + ".meta.json")
with open(meta_path, "w", encoding="utf-8") as f: with open(meta_tmp_path, "w", encoding="utf-8") as f:
json.dump(ids, f) json.dump(ids, f)
print(f"Indexed {len(ids)} chunks successfully.") print(f"Indexed {len(ids)} chunks successfully.")

View File

@@ -21,12 +21,10 @@ use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException; use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
use Symfony\Component\Routing\Attribute\Route; use Symfony\Component\Routing\Attribute\Route;
use Symfony\Component\Uid\Uuid; use Symfony\Component\Uid\Uuid;
use function function_exists;
#[Route('/admin/documents')] #[Route('/admin/documents')]
class DocumentController extends AbstractController class DocumentController extends AbstractController
{ {
#[Route('', name: 'admin_documents')] #[Route('', name: 'admin_documents')]
public function index(EntityManagerInterface $em): Response public function index(EntityManagerInterface $em): Response
{ {
@@ -41,7 +39,7 @@ class DocumentController extends AbstractController
->getResult(); ->getResult();
return $this->render('admin/document/index.html.twig', [ return $this->render('admin/document/index.html.twig', [
'documents' => $documents 'documents' => $documents,
]); ]);
} }
@@ -54,7 +52,7 @@ class DocumentController extends AbstractController
{ {
try { try {
$uuid = Uuid::fromString($id); $uuid = Uuid::fromString($id);
} catch (\Exception $e) { } catch (\Exception) {
throw new NotFoundHttpException(); throw new NotFoundHttpException();
} }
@@ -65,7 +63,7 @@ class DocumentController extends AbstractController
} }
return $this->render('admin/document/show.html.twig', [ return $this->render('admin/document/show.html.twig', [
'document' => $document 'document' => $document,
]); ]);
} }
@@ -76,57 +74,51 @@ class DocumentController extends AbstractController
FormatText $formatText, FormatText $formatText,
IngestJobService $jobService, IngestJobService $jobService,
ParameterBagInterface $params ParameterBagInterface $params
): Response ): Response {
{ if (!$request->isMethod('POST')) {
if ($request->isMethod('POST')) { return $this->render('admin/document/new.html.twig');
}
/** @var UploadedFile|null $file */ /** @var UploadedFile|null $file */
$file = $request->files->get('file'); $file = $request->files->get('file');
if (!$file instanceof UploadedFile) { if (!$file instanceof UploadedFile) {
throw new \InvalidArgumentException('No valid file uploaded.'); throw new \InvalidArgumentException('No valid file uploaded.');
} }
$rawTitle = $request->request->get('title'); $rawTitle = $request->request->get('title');
$title = is_string($rawTitle) && $rawTitle !== '' $title = is_string($rawTitle) && $rawTitle !== ''
? $rawTitle ? $rawTitle
: $formatText->slugify($file->getClientOriginalName()); : $formatText->slugify($file->getClientOriginalName());
if (!$title) { if (!$title) {
$this->addFlash('error', 'Titel ist erforderlich.'); $this->addFlash('error', 'Titel ist erforderlich.');
return $this->redirectToRoute('admin_document_new'); return $this->redirectToRoute('admin_document_new');
} }
$uploadDir = $params->get('mto.vector.data.upload.path'); $uploadDir = (string)$params->get('mto.vector.data.upload.path');
$this->ensureDir($uploadDir);
if (!is_dir($uploadDir)) { $newFilename = uniqid('', true) . '_' . $file->getClientOriginalName();
mkdir($uploadDir, 0777, true);
}
$newFilename = uniqid() . '_' . $file->getClientOriginalName();
try { try {
$file->move($uploadDir, $newFilename); $file->move($uploadDir, $newFilename);
} catch (FileException $e) { } catch (FileException) {
throw new \RuntimeException('File upload failed.'); throw new \RuntimeException('File upload failed.');
} }
$filePath = $uploadDir . '/' . $newFilename; $filePath = $uploadDir . '/' . $newFilename;
// Dokument erstellen
$document = $documentService->createDocument( $document = $documentService->createDocument(
$title, $title,
$filePath, $filePath,
$this->getUser() $this->getUser()
); );
// ---------------------------------------------------------
// AUTO-INTEGRATION: gleicher Flow wie "Version aktivieren"
// ---------------------------------------------------------
$version = $document->getCurrentVersion(); $version = $document->getCurrentVersion();
if (!$version instanceof DocumentVersion) {
$this->addFlash('danger', 'Dokument erstellt, aber es wurde keine aktuelle Version erzeugt.');
return $this->redirectToRoute('admin_documents');
}
$job = $jobService->startJob( $job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE, IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE,
@@ -137,33 +129,19 @@ class DocumentController extends AbstractController
IngestJob::STATUS_QUEUED IngestJob::STATUS_QUEUED
); );
$projectDir = (string)$this->getParameter('kernel.project_dir'); if (!$this->canExec()) {
$console = $projectDir . '/bin/console';
$cmd = sprintf(
'%s %s %s %s > /dev/null 2>&1 &',
escapeshellarg($console),
escapeshellarg('mto:agent:ingest:run'),
escapeshellarg((string)$job->getId()),
escapeshellarg('--no-interaction'),
);
if (!function_exists('exec')) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).'); $jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('danger', 'Dokument erstellt, aber Ingest konnte nicht asynchron gestartet werden.'); $this->addFlash('danger', 'Dokument erstellt, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_documents'); return $this->redirectToRoute('admin_documents');
} }
exec($cmd); $this->startIngestJob((string)$job->getId());
return $this->redirectToRoute('admin_job_show', [ return $this->redirectToRoute('admin_job_show', [
'id' => (string)$job->getId(), 'id' => (string)$job->getId(),
]); ]);
} }
return $this->render('admin/document/new.html.twig');
}
#[Route('/{id}/version/new', name: 'admin_document_version_new', requirements: ['id' => '[0-9a-fA-F\-]{36}'])] #[Route('/{id}/version/new', name: 'admin_document_version_new', requirements: ['id' => '[0-9a-fA-F\-]{36}'])]
public function newVersion( public function newVersion(
string $id, string $id,
@@ -171,35 +149,34 @@ class DocumentController extends AbstractController
EntityManagerInterface $em, EntityManagerInterface $em,
DocumentService $documentService, DocumentService $documentService,
ParameterBagInterface $params ParameterBagInterface $params
): Response ): Response {
{
$document = $em->getRepository(Document::class)->find($id); $document = $em->getRepository(Document::class)->find($id);
if (!$document) { if (!$document) {
throw $this->createNotFoundException(); throw $this->createNotFoundException();
} }
if ($request->isMethod('POST')) { if (!$request->isMethod('POST')) {
return $this->render('admin/document/new_version.html.twig', [
'document' => $document,
]);
}
/** @var UploadedFile|null $file */
$file = $request->files->get('file'); $file = $request->files->get('file');
if (!$file instanceof UploadedFile) {
if (!$file) {
$this->addFlash('error', 'Datei ist erforderlich.'); $this->addFlash('error', 'Datei ist erforderlich.');
return $this->redirectToRoute('admin_document_version_new', ['id' => $id]); return $this->redirectToRoute('admin_document_version_new', ['id' => $id]);
} }
$uploadDir = $params->get('mto.vector.data.upload.path'); $uploadDir = (string)$params->get('mto.vector.data.upload.path');
$this->ensureDir($uploadDir);
if (!is_dir($uploadDir)) { $newFilename = uniqid('', true) . '_' . $file->getClientOriginalName();
mkdir($uploadDir, 0777, true);
}
$newFilename = uniqid() . '_' . $file->getClientOriginalName();
try { try {
$file->move($uploadDir, $newFilename); $file->move($uploadDir, $newFilename);
} catch (FileException $e) { } catch (FileException) {
throw new \RuntimeException('File upload failed.'); throw new \RuntimeException('File upload failed.');
} }
@@ -214,11 +191,6 @@ class DocumentController extends AbstractController
return $this->redirectToRoute('admin_document_show', ['id' => $id]); return $this->redirectToRoute('admin_document_show', ['id' => $id]);
} }
return $this->render('admin/document/new_version.html.twig', [
'document' => $document
]);
}
#[Route( #[Route(
'/version/{versionId}/activate', '/version/{versionId}/activate',
name: 'admin_document_version_activate', name: 'admin_document_version_activate',
@@ -231,27 +203,18 @@ class DocumentController extends AbstractController
EntityManagerInterface $em, EntityManagerInterface $em,
DocumentService $documentService, DocumentService $documentService,
IngestJobService $jobService, IngestJobService $jobService,
): RedirectResponse ): RedirectResponse {
{ if (!$this->isCsrfTokenValid('activate_version_' . $versionId, (string)$request->request->get('_token'))) {
if (!$this->isCsrfTokenValid('activate_version_' . $versionId, $request->request->get('_token'))) {
throw $this->createAccessDeniedException(); throw $this->createAccessDeniedException();
} }
$version = $em->getRepository(DocumentVersion::class)->find($versionId); $version = $em->getRepository(DocumentVersion::class)->find($versionId);
if (!$version) { if (!$version) {
throw $this->createNotFoundException(); throw $this->createNotFoundException();
} }
try { try {
$documentService->activateVersion($version); $documentService->activateVersion($version);
// ---------------------------------------------------------
// Saubere IngestJob-Integration:
// 1) Job als QUEUED anlegen (spezieller Typ für Aktivierung)
// 2) Symfony-Command im Hintergrund starten
// 3) Direkt auf Job-Detailseite redirecten (Loader + Polling)
// ---------------------------------------------------------
$job = $jobService->startJob( $job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE, IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE,
@@ -262,28 +225,15 @@ class DocumentController extends AbstractController
IngestJob::STATUS_QUEUED IngestJob::STATUS_QUEUED
); );
// Hintergrundprozess starten (Provider-kompatibel, kein Worker/Daemon) if (!$this->canExec()) {
$projectDir = (string)$this->getParameter('kernel.project_dir');
$console = $projectDir . '/bin/console';
$cmd = sprintf(
'%s %s %s %s > /dev/null 2>&1 &',
escapeshellarg($console),
escapeshellarg('mto:agent:ingest:run'),
escapeshellarg((string)$job->getId()),
escapeshellarg('--no-interaction'),
);
// Best effort: wenn exec deaktiviert ist, sauber abbrechen.
if (!function_exists('exec')) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).'); $jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('danger', 'Aktivierung ok, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).'); $this->addFlash('danger', 'Aktivierung ok, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_document_show', [ return $this->redirectToRoute('admin_document_show', [
'id' => $version->getDocument()->getId() 'id' => $version->getDocument()->getId(),
]); ]);
} }
exec($cmd); $this->startIngestJob((string)$job->getId());
$this->addFlash('success', 'Version aktiviert. Ingest-Job wurde erstellt und gestartet.'); $this->addFlash('success', 'Version aktiviert. Ingest-Job wurde erstellt und gestartet.');
@@ -295,7 +245,7 @@ class DocumentController extends AbstractController
} }
return $this->redirectToRoute('admin_document_show', [ return $this->redirectToRoute('admin_document_show', [
'id' => $version->getDocument()->getId() 'id' => $version->getDocument()->getId(),
]); ]);
} }
@@ -310,19 +260,17 @@ class DocumentController extends AbstractController
Request $request, Request $request,
EntityManagerInterface $em, EntityManagerInterface $em,
IngestJobService $jobService, IngestJobService $jobService,
): ?RedirectResponse ): ?RedirectResponse {
{ if (!$this->isCsrfTokenValid('ingest_version_' . $versionId, (string)$request->request->get('_token'))) {
$dryRun = false;
if (!$this->isCsrfTokenValid('ingest_version_' . $versionId, $request->request->get('_token'))) {
throw $this->createAccessDeniedException(); throw $this->createAccessDeniedException();
} }
$version = $em->getRepository(DocumentVersion::class)->find($versionId); $version = $em->getRepository(DocumentVersion::class)->find($versionId);
if (!$version) { if (!$version) {
throw $this->createNotFoundException(); throw $this->createNotFoundException();
} }
/** @var IngestJob|null $existing */
$existing = $em->getRepository(IngestJob::class) $existing = $em->getRepository(IngestJob::class)
->findOneBy( ->findOneBy(
['documentVersionId' => $version->getId()], ['documentVersionId' => $version->getId()],
@@ -333,13 +281,6 @@ class DocumentController extends AbstractController
return null; return null;
} }
// ---------------------------------------------------------
// Asynchroner Ingest (ohne Messenger):
// 1) Job als QUEUED anlegen
// 2) Symfony-Command im Hintergrund starten
// 3) Direkt auf Job-Detailseite redirecten (Loader + Polling)
// ---------------------------------------------------------
$job = $jobService->startJob( $job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT, IngestJob::TYPE_DOCUMENT,
$this->getUser(), $this->getUser(),
@@ -349,28 +290,15 @@ class DocumentController extends AbstractController
IngestJob::STATUS_QUEUED IngestJob::STATUS_QUEUED
); );
// Hintergrundprozess starten (Provider-kompatibel, kein Worker/Daemon) if (!$this->canExec()) {
$projectDir = (string)$this->getParameter('kernel.project_dir');
$console = $projectDir . '/bin/console';
$cmd = sprintf(
'%s %s %s %s > /dev/null 2>&1 &',
escapeshellarg($console),
escapeshellarg('mto:agent:ingest:run'),
escapeshellarg((string)$job->getId()),
escapeshellarg('--no-interaction'),
);
// Best effort: wenn exec deaktiviert ist, sauber abbrechen.
if (!function_exists('exec')) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).'); $jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('error', 'Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).'); $this->addFlash('error', 'Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_document_show', [ return $this->redirectToRoute('admin_document_show', [
'id' => $version->getDocument()->getId() 'id' => $version->getDocument()->getId(),
]); ]);
} }
exec($cmd); $this->startIngestJob((string)$job->getId());
return $this->redirectToRoute('admin_job_show', [ return $this->redirectToRoute('admin_job_show', [
'id' => (string)$job->getId(), 'id' => (string)$job->getId(),
@@ -384,17 +312,21 @@ class DocumentController extends AbstractController
)] )]
public function resetCompleteSystem(ParameterBagInterface $params, Connection $connection): ?RedirectResponse public function resetCompleteSystem(ParameterBagInterface $params, Connection $connection): ?RedirectResponse
{ {
if (!function_exists('exec')) { if (!$this->canExec()) {
$this->addFlash('danger', 'Der Reset konnte nicht gestartet werden (exec deaktiviert).'); $this->addFlash('danger', 'Der Reset konnte nicht gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_dashboard'); return $this->redirectToRoute('admin_dashboard');
} }
@unlink($params->get('mto.knowledge.ndjson')); @unlink((string)$params->get('mto.knowledge.ndjson'));
@unlink($params->get('mto.knowledge.vector_index')); @unlink((string)$params->get('mto.knowledge.vector_index'));
@unlink($params->get('mto.knowledge.vector_index_meta')); @unlink((string)$params->get('mto.knowledge.vector_index_meta'));
@unlink($params->get('mto.knowledge.index_meta')); @unlink((string)$params->get('mto.knowledge.index_meta'));
@unlink($params->get('mto.runtime.meta')); @unlink((string)$params->get('mto.runtime.meta'));
exec('rm -rf ' . $params->get('mto.knowledge.upload'));
$uploadDir = (string)$params->get('mto.knowledge.upload');
if ($uploadDir !== '' && is_dir($uploadDir)) {
exec('rm -rf ' . escapeshellarg($uploadDir));
}
$sql = ' $sql = '
SET FOREIGN_KEY_CHECKS = 0; SET FOREIGN_KEY_CHECKS = 0;
@@ -425,39 +357,29 @@ class DocumentController extends AbstractController
EntityManagerInterface $em, EntityManagerInterface $em,
IngestJobService $jobService, IngestJobService $jobService,
LockService $lockService, LockService $lockService,
DocumentService $documentService ): RedirectResponse {
): RedirectResponse if (!$this->isCsrfTokenValid('delete_document_' . $id, (string)$request->request->get('_token'))) {
{
if (!$this->isCsrfTokenValid('delete_document_' . $id, $request->request->get('_token'))) {
throw $this->createAccessDeniedException(); throw $this->createAccessDeniedException();
} }
try { try {
$uuid = Uuid::fromString($id); $uuid = Uuid::fromString($id);
} catch (\Exception $e) { } catch (\Exception) {
throw $this->createNotFoundException(); throw $this->createNotFoundException();
} }
/** @var Document|null $document */
$document = $em->getRepository(Document::class)->find($uuid); $document = $em->getRepository(Document::class)->find($uuid);
if (!$document) { if (!$document) {
throw $this->createNotFoundException(); throw $this->createNotFoundException();
} }
// ---------------------------------------------------------
// 🔒 Delete nur erlauben wenn kein anderer Job läuft
// ---------------------------------------------------------
if (!$lockService->acquire()) { if (!$lockService->acquire()) {
$this->addFlash('danger', 'Ein Ingest-Job läuft bereits. Löschen derzeit nicht möglich.'); $this->addFlash('danger', 'Ein Ingest-Job läuft bereits. Löschen derzeit nicht möglich.');
return $this->redirectToRoute('admin_documents'); return $this->redirectToRoute('admin_documents');
} }
// Nur Test-Lock echter Lock im Orchestrator
$lockService->release(); $lockService->release();
// ---------------------------------------------------------
// 1) Delete-Job anlegen (QUEUED)
// ---------------------------------------------------------
$job = $jobService->startJob( $job = $jobService->startJob(
IngestJob::TYPE_DOCUMENT_DELETE, IngestJob::TYPE_DOCUMENT_DELETE,
$this->getUser(), $this->getUser(),
@@ -467,27 +389,13 @@ class DocumentController extends AbstractController
IngestJob::STATUS_QUEUED IngestJob::STATUS_QUEUED
); );
// --------------------------------------------------------- if (!$this->canExec()) {
// 2) Hintergrundprozess starten
// ---------------------------------------------------------
$projectDir = (string)$this->getParameter('kernel.project_dir');
$console = $projectDir . '/bin/console';
$cmd = sprintf(
'%s %s %s %s > /dev/null 2>&1 &',
escapeshellarg($console),
escapeshellarg('mto:agent:ingest:run'),
escapeshellarg((string)$job->getId()),
escapeshellarg('--no-interaction'),
);
if (!function_exists('exec')) {
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).'); $jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
$this->addFlash('danger', 'Löschen konnte nicht gestartet werden (exec deaktiviert).'); $this->addFlash('danger', 'Löschen konnte nicht gestartet werden (exec deaktiviert).');
return $this->redirectToRoute('admin_documents'); return $this->redirectToRoute('admin_documents');
} }
exec($cmd); $this->startIngestJob((string)$job->getId());
$this->addFlash('success', 'Löschvorgang gestartet. Dokument wird nach Index-Rebuild entfernt.'); $this->addFlash('success', 'Löschvorgang gestartet. Dokument wird nach Index-Rebuild entfernt.');
@@ -495,4 +403,42 @@ class DocumentController extends AbstractController
'id' => (string)$job->getId(), 'id' => (string)$job->getId(),
]); ]);
} }
// =========================================================
// Helpers
// =========================================================
/**
 * Reports whether PHP's exec() function is available in this runtime.
 *
 * @return bool True when a function named "exec" exists, false otherwise.
 */
private function canExec(): bool
{
    $execAvailable = function_exists('exec');

    return $execAvailable;
}
/**
 * Makes sure the given directory exists, creating it (recursively) when needed.
 *
 * @param string $dir Directory path; must not be empty.
 *
 * @throws \RuntimeException When the path is empty or the directory cannot be created.
 */
private function ensureDir(string $dir): void
{
    if ('' === $dir) {
        throw new \RuntimeException('Upload directory not configured.');
    }

    if (is_dir($dir)) {
        return;
    }

    // Re-check is_dir() after a failed mkdir(): another process may have created
    // the directory in the meantime, which is not an error.
    $created = mkdir($dir, 0777, true);
    if (!$created && !is_dir($dir)) {
        throw new \RuntimeException('Unable to create upload directory.');
    }
}
/**
 * Launches the ingest console command for the given job as a detached background process.
 *
 * The command line has the form
 * `<php> <project>/bin/console --no-interaction mto:agent:ingest:run <jobId>`,
 * with all output discarded and a trailing `&` so this call does not wait for the job.
 *
 * @param string $jobId Identifier of the ingest job, passed as the command argument.
 */
private function startIngestJob(string $jobId): void
{
    $projectDir = (string)$this->getParameter('kernel.project_dir');
    $console = $projectDir . '/bin/console';

    // PHP_BINARY is only a usable CLI interpreter when this code itself runs under a
    // CLI-like SAPI. In a web request it is the php-fpm binary (FPM) or an empty
    // string (mod_php), and the background command would fail silently because its
    // output is discarded. Fall back to the CLI binary of this installation, then PATH.
    $phpBinary = PHP_BINARY;
    if ($phpBinary === '' || !in_array(PHP_SAPI, ['cli', 'cli-server', 'phpdbg'], true)) {
        $bundledCli = PHP_BINDIR . '/php';
        // Suppress the open_basedir warning; a false result simply selects the PATH lookup.
        $phpBinary = @is_executable($bundledCli) ? $bundledCli : 'php';
    }

    // --no-interaction is passed as a global console option ahead of the command name.
    $cmd = sprintf(
        '%s %s %s %s %s > /dev/null 2>&1 &',
        escapeshellarg($phpBinary),
        escapeshellarg($console),
        '--no-interaction',
        escapeshellarg('mto:agent:ingest:run'),
        escapeshellarg($jobId),
    );

    exec($cmd);
}
} }

View File

@@ -16,14 +16,12 @@ final class IndexMetaManager
IndexConfigurationProvider $provider IndexConfigurationProvider $provider
) { ) {
$this->metaPath = $metaPath; $this->metaPath = $metaPath;
$this->provider = $provider;
// runtime liegt im selben Verzeichnis
$this->runtimePath = $runTimePath; $this->runtimePath = $runTimePath;
$this->provider = $provider;
} }
// ===================================================== // =====================================================
// META (Governance unverändert lassen!) // META (Governance unverändert inhaltlich)
// ===================================================== // =====================================================
public function ensureExists(): void public function ensureExists(): void
@@ -39,10 +37,12 @@ final class IndexMetaManager
return null; return null;
} }
return json_decode( $data = json_decode(
(string) file_get_contents($this->metaPath), (string) file_get_contents($this->metaPath),
true true
); );
return is_array($data) ? $data : null;
} }
public function validateAgainstCurrent(): void public function validateAgainstCurrent(): void
@@ -85,18 +85,7 @@ final class IndexMetaManager
$config->toStructureArray() $config->toStructureArray()
); );
$dir = dirname($this->metaPath); $this->atomicWriteJson($this->metaPath, $payload);
if (!is_dir($dir)) {
mkdir($dir, 0777, true);
}
file_put_contents(
$this->metaPath,
json_encode(
$payload,
JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES
)
);
} }
// ===================================================== // =====================================================
@@ -109,20 +98,12 @@ final class IndexMetaManager
return; return;
} }
$dir = dirname($this->runtimePath);
if (!is_dir($dir)) {
mkdir($dir, 0777, true);
}
$payload = [ $payload = [
'chunk_count' => 0, 'chunk_count' => 0,
'last_rebuild_at' => null, 'last_rebuild_at' => null,
]; ];
file_put_contents( $this->atomicWriteJson($this->runtimePath, $payload);
$this->runtimePath,
json_encode($payload, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)
);
} }
public function updateRuntimeStats(int $chunkCount): void public function updateRuntimeStats(int $chunkCount): void
@@ -134,10 +115,7 @@ final class IndexMetaManager
'last_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM), 'last_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
]; ];
file_put_contents( $this->atomicWriteJson($this->runtimePath, $payload);
$this->runtimePath,
json_encode($payload, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)
);
} }
public function getRuntimeChunkCount(): int public function getRuntimeChunkCount(): int
@@ -151,5 +129,37 @@ final class IndexMetaManager
return (int)($data['chunk_count'] ?? 0); return (int)($data['chunk_count'] ?? 0);
} }
// =====================================================
// INTERNAL ATOMIC JSON WRITE
// =====================================================
/**
 * Encodes $payload as pretty-printed JSON and publishes it at $path by
 * writing "<path>.tmp" first and then renaming it over the target.
 *
 * The temporary file is removed on every failure path after it may have
 * been created, so a failed write does not leave a stale partial file.
 *
 * @param string       $path    Destination file; its directory is created if missing.
 * @param array<mixed> $payload Data to encode.
 *
 * @throws \RuntimeException If the directory cannot be created, the payload
 *                           cannot be encoded, the temporary file cannot be
 *                           written, or the rename fails.
 */
private function atomicWriteJson(string $path, array $payload): void
{
    $dir = dirname($path);
    // The trailing is_dir() re-check tolerates another process creating the
    // directory between our first check and mkdir().
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
        throw new \RuntimeException('Unable to create directory for meta/runtime');
    }

    $json = json_encode(
        $payload,
        JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES
    );
    if ($json === false) {
        throw new \RuntimeException('Unable to encode JSON payload');
    }

    $tmpPath = $path . '.tmp';

    if (file_put_contents($tmpPath, $json) === false) {
        // A failed write may still have created (and partially filled) the
        // temp file; remove it, mirroring the rename-failure path below.
        @unlink($tmpPath);
        throw new \RuntimeException('Unable to write temporary JSON file');
    }

    if (!rename($tmpPath, $path)) {
        @unlink($tmpPath);
        throw new \RuntimeException('Atomic switch failed for JSON file');
    }
}
}

View File

@@ -0,0 +1,72 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
use App\Entity\DocumentVersion;
use App\Knowledge\ChunkManager;
use Symfony\Component\Uid\Uuid;
/**
 * Write-side facade over ChunkManager for the ingest flow.
 *
 * Every method forwards to the injected ChunkManager; the only logic added
 * here is the Uuid type guard in writeForDocumentVersion().
 */
final readonly class ChunkWriteService
{
    public function __construct(
        private ChunkManager $chunkManager,
    ) {}

    /**
     * Returns the index path as reported by the ChunkManager.
     */
    public function getIndexPath(): string
    {
        return $this->chunkManager->getIndexPath();
    }

    /**
     * Returns the chunk count as reported by the ChunkManager.
     */
    public function countAllChunks(): int
    {
        return $this->chunkManager->countAllChunks();
    }

    /**
     * Forwards to ChunkManager::compactByDocument() for the given document.
     */
    public function compactByDocumentId(Uuid $documentId): void
    {
        $this->chunkManager->compactByDocument($documentId);
    }

    /**
     * Hands the chunk records to ChunkManager::appendChunks() unchanged.
     *
     * @param iterable<array<string,mixed>> $chunks
     */
    public function appendChunks(iterable $chunks): void
    {
        $this->chunkManager->appendChunks($chunks);
    }

    /**
     * Local ingest for a single DocumentVersion.
     *
     * Steps:
     *  1. Remove the existing chunks of the version's document.
     *  2. Append the new chunks.
     *
     * @param iterable<array<string,mixed>> $chunks
     *
     * @throws \RuntimeException If the document's ID is not a Uuid.
     */
    public function writeForDocumentVersion(
        DocumentVersion $version,
        iterable $chunks
    ): void {
        $document = $version->getDocument();
        $documentId = $document->getId();

        if ($documentId instanceof Uuid) {
            $this->chunkManager->compactByDocument($documentId);
            $this->chunkManager->appendChunks($chunks);

            return;
        }

        throw new \RuntimeException('Document ID must be a Uuid instance');
    }

    /**
     * Full rewrite of the NDJSON index (global reindex).
     *
     * @param iterable<array<string,mixed>> $allChunks
     */
    public function rewriteAll(iterable $allChunks): void
    {
        $this->chunkManager->rewriteAll($allChunks);
    }
}

View File

@@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
use App\Index\IndexMetaManager;
/**
 * Guardrail check run before a local (non-global) ingest operation.
 */
final readonly class GuardrailValidator
{
    public function __construct(
        private IndexMetaManager $metaManager,
    ) {}

    /**
     * Delegates to IndexMetaManager::validateAgainstCurrent().
     *
     * Per the original author's note, this throws when a local ingest is
     * not compatible and a global reindex is required; the exception itself
     * originates in the meta manager.
     */
    public function validateOrThrow(): void
    {
        $this->metaManager->validateAgainstCurrent();
    }
}

View File

@@ -7,9 +7,7 @@ namespace App\Ingest;
use App\Entity\Document; use App\Entity\Document;
use App\Entity\DocumentVersion; use App\Entity\DocumentVersion;
use App\Index\IndexMetaManager; use App\Index\IndexMetaManager;
use App\Knowledge\ChunkManager;
use App\Knowledge\Ingest\KnowledgeIngestService; use App\Knowledge\Ingest\KnowledgeIngestService;
use App\Vector\VectorIndexBuilder;
use Doctrine\ORM\EntityManagerInterface; use Doctrine\ORM\EntityManagerInterface;
use Psr\Log\LoggerInterface; use Psr\Log\LoggerInterface;
use Symfony\Component\Uid\Uuid; use Symfony\Component\Uid\Uuid;
@@ -21,45 +19,52 @@ final readonly class IngestFlow
public function __construct( public function __construct(
private KnowledgeIngestService $knowledgeIngestService, private KnowledgeIngestService $knowledgeIngestService,
private ChunkManager $chunkManager, private GuardrailValidator $guardrailValidator,
private VectorIndexBuilder $vectorBuilder, private ChunkWriteService $chunkWriteService,
private VectorRebuildService $vectorRebuildService,
private IndexMetaManager $metaManager, private IndexMetaManager $metaManager,
private IngestLockService $lockService,
private LoggerInterface $logger, private LoggerInterface $logger,
private EntityManagerInterface $em, private EntityManagerInterface $em,
) {} ) {}
// ========================================================= // =========================================================
// DOCUMENT INGEST (STREAMING SAFE) // DOCUMENT INGEST
// ========================================================= // =========================================================
public function ingestDocumentVersion(DocumentVersion $version): void public function ingestDocumentVersion(DocumentVersion $version): void
{ {
$this->metaManager->validateAgainstCurrent(); $this->withLock(function () use ($version): void {
$this->guardrailValidator->validateOrThrow();
$version->setIngestStatus(DocumentVersion::INGEST_RUNNING); $version->setIngestStatus(DocumentVersion::INGEST_RUNNING);
$this->em->flush(); $this->em->flush();
try { try {
$documentId = $version->getDocument()->getId();
if (!$documentId instanceof Uuid) {
throw new \RuntimeException('Document ID must be a Uuid instance');
}
$this->chunkManager->compactByDocument($version->getDocument()->getId()); // 1) Alte Chunks entfernen
$this->chunkWriteService->compactByDocumentId($documentId);
$existing = $this->chunkManager->countAllChunks(); // 2) Existing Chunks nach Compaction zählen
$existing = $this->chunkWriteService->countAllChunks();
$incoming = 0; $incoming = 0;
$warned = false;
$generator = $this->knowledgeIngestService->buildChunkRecords($version); $generator = $this->knowledgeIngestService->buildChunkRecords($version);
$wrappedGenerator = (function () use ($generator, $existing, &$incoming) { $wrappedGenerator = (function () use ($generator, $existing, &$incoming, &$warned) {
foreach ($generator as $record) { foreach ($generator as $record) {
$incoming++; $incoming++;
$total = $existing + $incoming; $total = $existing + $incoming;
if ($total >= self::CHUNK_LIMIT_WARN) { if (!$warned && $total >= self::CHUNK_LIMIT_WARN) {
// Nur einmal warnen $warned = true;
if ($incoming === 1 || $total === self::CHUNK_LIMIT_WARN) {
// Logging erfolgt außerhalb des Streams final
}
} }
if ($total > self::CHUNK_LIMIT_HARD) { if ($total > self::CHUNK_LIMIT_HARD) {
@@ -68,40 +73,47 @@ final readonly class IngestFlow
yield $record; yield $record;
} }
})(); })();
$this->chunkManager->appendChunks($wrappedGenerator); // 3) Streaming Append
$this->chunkWriteService->appendChunks($wrappedGenerator);
$total = $existing + $incoming; $total = $existing + $incoming;
if ($total >= self::CHUNK_LIMIT_WARN) { if ($warned) {
$this->logger->warning('Chunk count approaching limit.', [ $this->logger->warning('Chunk count approaching limit.', [
'existing' => $existing, 'existing' => $existing,
'incoming' => $incoming, 'incoming' => $incoming,
'total' => $total, 'total' => $total,
'document' => (string)$documentId,
'version' => (string)$version->getId(),
]); ]);
} }
$this->rebuildIndex(false); // 4) Vector Rebuild + Runtime Update
$this->vectorRebuildService->rebuild();
$version->setIngestStatus(DocumentVersion::INGEST_INDEXED); $version->setIngestStatus(DocumentVersion::INGEST_INDEXED);
$this->em->flush(); $this->em->flush();
} catch (\Throwable $e) { } catch (\Throwable $e) {
$version->setIngestStatus(DocumentVersion::INGEST_FAILED); $version->setIngestStatus(DocumentVersion::INGEST_FAILED);
$this->em->flush(); $this->em->flush();
throw $e; throw $e;
} }
});
} }
// ========================================================= // =========================================================
// GLOBAL REINDEX (STREAMING SAFE) // GLOBAL REINDEX
// ========================================================= // =========================================================
public function globalReindex(): void public function globalReindex(): void
{ {
$this->withLock(function (): void {
// Global Reindex ist der Drift-Fix → keine Guardrail-Blockade hier.
$activeDocuments = $this->em $activeDocuments = $this->em
->getRepository(Document::class) ->getRepository(Document::class)
->createQueryBuilder('d') ->createQueryBuilder('d')
@@ -111,46 +123,70 @@ final readonly class IngestFlow
->getResult(); ->getResult();
if (empty($activeDocuments)) { if (empty($activeDocuments)) {
throw new \RuntimeException( throw new \RuntimeException('Global Reindex aborted: no active documents found.');
'Global Reindex abgebrochen: Es sind keine aktiven Dokumente vorhanden.'
);
} }
$existing = 0; // rewriteAll ersetzt alles
$incoming = 0; $incoming = 0;
$warned = false;
$generator = $this->knowledgeIngestService->buildAllActiveChunkRecords(); $generator = $this->knowledgeIngestService->buildAllActiveChunkRecords();
$wrappedGenerator = (function () use ($generator, &$incoming) { // 1) "Peek" ohne RAM: erstes Element ziehen
$first = null;
foreach ($generator as $record) {
$first = $record;
$incoming++;
break;
}
if ($first === null) {
throw new \RuntimeException('Global Reindex aborted: no chunks generated.');
}
// 2) Stream bauen, Limits prüfen
$stream = (function () use ($first, $generator, $existing, &$incoming, &$warned) {
// first
$total = $existing + $incoming;
if (!$warned && $total >= self::CHUNK_LIMIT_WARN) {
$warned = true;
}
if ($total > self::CHUNK_LIMIT_HARD) {
throw new \RuntimeException('Chunk limit exceeded.');
}
yield $first;
foreach ($generator as $record) { foreach ($generator as $record) {
$incoming++; $incoming++;
$total = $existing + $incoming;
if (!$warned && $total >= self::CHUNK_LIMIT_WARN) {
$warned = true;
}
if ($total > self::CHUNK_LIMIT_HARD) {
throw new \RuntimeException('Chunk limit exceeded.');
}
yield $record; yield $record;
} }
})(); })();
// Prüfen ob überhaupt etwas kommt (ohne alles in RAM zu ziehen) // 3) Rewrite + Rebuild
$peekIterator = $wrappedGenerator instanceof \Iterator $this->chunkWriteService->rewriteAll($stream);
? $wrappedGenerator
: (function () use ($wrappedGenerator) {
foreach ($wrappedGenerator as $item) {
yield $item;
}
})();
if (!$peekIterator->valid()) { if ($warned) {
$peekIterator->rewind(); $this->logger->warning('Chunk count approaching limit after global reindex.', [
'incoming' => $incoming,
'total' => $incoming,
]);
} }
if (!$peekIterator->valid()) { $this->vectorRebuildService->rebuild();
throw new \RuntimeException(
'Global Reindex abgebrochen: Es wurden keine Chunks erzeugt.'
);
}
$this->chunkManager->rewriteAll($peekIterator); // Governance: Version erhöhen
$this->metaManager->writeMetaForGlobalReindex();
$this->rebuildIndex(true); });
} }
// ========================================================= // =========================================================
@@ -159,8 +195,11 @@ final readonly class IngestFlow
public function deleteDocument(Uuid $documentId): void public function deleteDocument(Uuid $documentId): void
{ {
$this->metaManager->validateAgainstCurrent(); $this->withLock(function () use ($documentId): void {
$this->guardrailValidator->validateOrThrow();
/** @var Document|null $document */
$document = $this->em $document = $this->em
->getRepository(Document::class) ->getRepository(Document::class)
->find($documentId); ->find($documentId);
@@ -169,32 +208,41 @@ final readonly class IngestFlow
throw new \RuntimeException('Document not found.'); throw new \RuntimeException('Document not found.');
} }
$this->chunkManager->compactByDocument($documentId); // 1) Chunks entfernen
$this->chunkWriteService->compactByDocumentId($documentId);
// 2) FK-sicher löschen: currentVersion lösen (verhindert „Version zeigt noch auf DocumentVersion“)
if (method_exists($document, 'getCurrentVersion') && method_exists($document, 'setCurrentVersion')) {
if ($document->getCurrentVersion() !== null) {
$document->setCurrentVersion(null);
$this->em->flush();
}
}
// 3) Dokument entfernen
$this->em->remove($document); $this->em->remove($document);
$this->em->flush(); $this->em->flush();
$this->rebuildIndex(false); // 4) Vector rebuild + runtime update
$this->vectorRebuildService->rebuild();
});
} }
// ========================================================= // =========================================================
// CENTRAL REBUILD // INTERNALS
// ========================================================= // =========================================================
private function rebuildIndex(bool $isGlobal): void /**
* @param callable():void $fn
*/
private function withLock(callable $fn): void
{ {
$this->vectorBuilder->rebuildFromNdjson(); $this->lockService->acquire();
if ($isGlobal) { try {
$this->metaManager->writeMetaForGlobalReindex(); $fn();
} } finally {
$this->lockService->release();
$this->updateChunkCount(); }
}
private function updateChunkCount(): void
{
$chunkCount = $this->chunkManager->countAllChunks();
$this->metaManager->updateRuntimeStats($chunkCount);
} }
} }

View File

@@ -0,0 +1,51 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
/**
 * Non-blocking, exclusive, file-based lock for ingest operations.
 *
 * The lock is an flock() on "<projectDir>/var/knowledge/locks/ingest.lock".
 * The handle is stored on the instance only while the lock is actually held,
 * so release() and the destructor are always safe to call.
 */
final class IngestLockService
{
    private string $lockFilePath;

    /** @var resource|null Open lock-file handle while the lock is held, null otherwise. */
    private $handle = null;

    /**
     * @param string $projectDir Project root; a trailing "/" is stripped.
     */
    public function __construct(string $projectDir)
    {
        $this->lockFilePath = rtrim($projectDir, '/') . '/var/knowledge/locks/ingest.lock';
    }

    /**
     * Takes the exclusive lock without waiting.
     *
     * @throws \RuntimeException If this instance already holds the lock, the
     *                           lock directory or file cannot be created or
     *                           opened, or the lock is held elsewhere.
     */
    public function acquire(): void
    {
        if ($this->handle !== null) {
            // Opening a second handle here would overwrite the one that owns
            // the lock; dropping that handle would silently release the lock.
            throw new \RuntimeException('Ingest lock is already held by this instance.');
        }

        $dir = dirname($this->lockFilePath);
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            throw new \RuntimeException('Unable to create lock directory.');
        }

        // Keep the handle local until the lock is really ours, so a failure
        // never leaves `false` or an unlocked handle in $this->handle.
        $handle = fopen($this->lockFilePath, 'c');
        if ($handle === false) {
            throw new \RuntimeException('Unable to open ingest lock file.');
        }

        if (!flock($handle, LOCK_EX | LOCK_NB)) {
            fclose($handle);
            throw new \RuntimeException('Another ingest process is already running.');
        }

        $this->handle = $handle;
    }

    /**
     * Releases the lock if this instance holds it; otherwise does nothing.
     */
    public function release(): void
    {
        $handle = $this->handle;
        $this->handle = null;

        if (!is_resource($handle)) {
            return;
        }

        flock($handle, LOCK_UN);
        fclose($handle);
    }

    public function __destruct()
    {
        $this->release();
    }
}

View File

@@ -0,0 +1,38 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
use App\Index\IndexMetaManager;
use App\Knowledge\ChunkManager;
use App\Vector\VectorIndexBuilder;
/**
 * Rebuilds the vector index and then refreshes the runtime statistics.
 */
final readonly class VectorRebuildService
{
    public function __construct(
        private VectorIndexBuilder $vectorBuilder,
        private IndexMetaManager $metaManager,
        private ChunkManager $chunkManager,
    ) {}

    /**
     * Runs a full rebuild of the vector index (FAISS, per the original
     * author's description) and records the resulting chunk count.
     *
     * Steps:
     *  1. Rebuild the vector index via VectorIndexBuilder::rebuildFromNdjson().
     *  2. Count the chunks via the ChunkManager.
     *  3. Store that count through IndexMetaManager::updateRuntimeStats().
     *
     * @param string|null $logPath Passed through unchanged to the vector builder.
     */
    public function rebuild(?string $logPath = null): void
    {
        $this->vectorBuilder->rebuildFromNdjson($logPath);

        // The count is taken after the rebuild and before the stats write,
        // the same order as a separate local variable would give.
        $this->metaManager->updateRuntimeStats(
            $this->chunkManager->countAllChunks()
        );
    }
}

View File

@@ -13,7 +13,6 @@ final class VectorIndexBuilder
private string $pythonBin; private string $pythonBin;
private string $scriptPath; private string $scriptPath;
private string $indexNdjsonPath; private string $indexNdjsonPath;
private string $indexMetaPath;
private string $vectorIndexPath; private string $vectorIndexPath;
private string $vectorMetaPath; private string $vectorMetaPath;
private int $timeoutSeconds; private int $timeoutSeconds;
@@ -24,7 +23,6 @@ final class VectorIndexBuilder
string $pythonBin, string $pythonBin,
string $scriptPath, string $scriptPath,
string $indexNdjsonPath, string $indexNdjsonPath,
string $indexMetaPath,
string $vectorIndexPath, string $vectorIndexPath,
int $timeoutSeconds, int $timeoutSeconds,
IndexConfigurationProvider $configurationProvider IndexConfigurationProvider $configurationProvider
@@ -32,54 +30,30 @@ final class VectorIndexBuilder
$this->pythonBin = $pythonBin; $this->pythonBin = $pythonBin;
$this->scriptPath = $scriptPath; $this->scriptPath = $scriptPath;
$this->indexNdjsonPath = $indexNdjsonPath; $this->indexNdjsonPath = $indexNdjsonPath;
$this->indexMetaPath = $indexMetaPath;
$this->vectorIndexPath = $vectorIndexPath; $this->vectorIndexPath = $vectorIndexPath;
$this->vectorMetaPath = $vectorIndexPath . '.meta.json'; $this->vectorMetaPath = $vectorIndexPath . '.meta.json';
$this->timeoutSeconds = $timeoutSeconds; $this->timeoutSeconds = $timeoutSeconds;
$this->configurationProvider = $configurationProvider; $this->configurationProvider = $configurationProvider;
} }
/**
* Rebuild FAISS Index deterministisch aus index.ndjson.
*/
public function rebuildFromNdjson(?string $logPath = null): void public function rebuildFromNdjson(?string $logPath = null): void
{ {
$this->assertPreconditions(); $this->assertPreconditions();
// --------------------------------------------
// 🔵 FALL: NDJSON ist leer → kein Vector Index
// --------------------------------------------
if (!is_file($this->indexNdjsonPath) || filesize($this->indexNdjsonPath) === 0) { if (!is_file($this->indexNdjsonPath) || filesize($this->indexNdjsonPath) === 0) {
@unlink($this->vectorIndexPath); @unlink($this->vectorIndexPath);
@unlink($this->vectorMetaPath); @unlink($this->vectorMetaPath);
if ($logPath !== null) {
@file_put_contents(
$logPath,
"NDJSON empty → Vector index removed\n",
FILE_APPEND
);
}
return; return;
} }
// -------------------------------------------- $config = $this->configurationProvider->getConfiguration();
// 🟢 FALL: NDJSON enthält Chunks $embeddingModel = $config->getEmbeddingModel();
// --------------------------------------------
if (!is_file($this->indexMetaPath)) {
$this->initializeIndexMeta();
}
$indexMeta = $this->readIndexMeta();
$embeddingModel = $indexMeta['embedding_model'];
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp'; $tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
$tmpVectorMetaPath = $tmpVectorIndexPath . '.meta.json';
@unlink($tmpVectorIndexPath); @unlink($tmpVectorIndexPath);
@unlink($this->vectorMetaPath); @unlink($tmpVectorMetaPath);
$cmd = [ $cmd = [
$this->pythonBin, $this->pythonBin,
@@ -94,107 +68,51 @@ final class VectorIndexBuilder
$this->runProcess($process, $logPath); $this->runProcess($process, $logPath);
$this->validatePythonOutputs($tmpVectorIndexPath); $this->validateOutputs($tmpVectorIndexPath, $tmpVectorMetaPath);
$this->atomicSwitch($tmpVectorIndexPath); $this->atomicSwitchPair(
$tmpVectorIndexPath,
$tmpVectorMetaPath
);
} }
// -----------------------------------------------------
// Internals
// -----------------------------------------------------
private function assertPreconditions(): void private function assertPreconditions(): void
{ {
if (!is_file($this->scriptPath)) { if (!is_file($this->scriptPath)) {
throw new \RuntimeException( throw new \RuntimeException('Vector build script not found.');
'Vector build script not found at: ' . $this->scriptPath
);
} }
if (!is_file($this->indexNdjsonPath)) { if (!is_file($this->indexNdjsonPath)) {
throw new \RuntimeException( throw new \RuntimeException('index.ndjson not found.');
'index.ndjson not found at: ' . $this->indexNdjsonPath
);
} }
} }
private function readIndexMeta(): array private function validateOutputs(string $tmpIndex, string $tmpMeta): void
{ {
$meta = json_decode( if (!is_file($tmpIndex) || filesize($tmpIndex) === 0) {
(string) file_get_contents($this->indexMetaPath),
true
);
if (!is_array($meta) || empty($meta['embedding_model'])) {
throw new \RuntimeException('Invalid index_meta.json');
}
return $meta;
}
private function initializeIndexMeta(): void
{
$dir = dirname($this->indexMetaPath);
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
throw new \RuntimeException('Cannot create knowledge directory');
}
$config = $this->configurationProvider->getConfiguration();
$data = [
'index_version' => 1,
'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
'embedding_model' => $config->getEmbeddingModel(),
'embedding_dimension' => $config->getEmbeddingDimension(),
'chunk_size' => $config->getChunkSize(),
'chunk_overlap' => $config->getChunkOverlap(),
'scoring_version' => $config->getScoringVersion(),
'index_format' => 'ndjson',
'vector_backend' => 'faiss',
];
file_put_contents(
$this->indexMetaPath,
json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)
);
}
private function validatePythonOutputs(string $tmpVectorIndexPath): void
{
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
throw new \RuntimeException('Vector index tmp missing or empty'); throw new \RuntimeException('Vector index tmp missing or empty');
} }
if (!is_file($tmpMeta) || filesize($tmpMeta) === 0) {
if (!is_file($this->vectorMetaPath) || filesize($this->vectorMetaPath) === 0) { throw new \RuntimeException('Vector meta tmp missing or empty');
throw new \RuntimeException('Vector meta missing or empty');
} }
} }
private function atomicSwitch(string $tmpVectorIndexPath): void private function atomicSwitchPair(string $tmpIndex, string $tmpMeta): void
{ {
if (!rename($tmpVectorIndexPath, $this->vectorIndexPath)) { if (!rename($tmpIndex, $this->vectorIndexPath)) {
throw new \RuntimeException('Atomic switch failed for vector index'); throw new \RuntimeException('Atomic switch failed for vector index');
} }
if (!rename($tmpMeta, $this->vectorMetaPath)) {
throw new \RuntimeException('Atomic switch failed for vector meta');
}
} }
private function runProcess(Process $process, ?string $logPath): void private function runProcess(Process $process, ?string $logPath): void
{ {
if ($logPath !== null) {
@file_put_contents($logPath, "=== VectorIndexBuilder START ===\n", FILE_APPEND);
}
$process->run(); $process->run();
if (!$process->isSuccessful()) { if (!$process->isSuccessful()) {
if ($logPath !== null) {
@file_put_contents($logPath, $process->getErrorOutput(), FILE_APPEND);
}
throw new ProcessFailedException($process); throw new ProcessFailedException($process);
} }
if ($logPath !== null) {
@file_put_contents($logPath, "=== VectorIndexBuilder OK ===\n", FILE_APPEND);
}
} }
} }