phase a audit
This commit is contained in:
@@ -3,15 +3,9 @@
|
|||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
parameters:
|
parameters:
|
||||||
# ------------------------------------------------------------
|
|
||||||
# Root
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
mto.root: '%kernel.project_dir%'
|
mto.root: '%kernel.project_dir%'
|
||||||
mto.kernel.dir: '%mto.root%'
|
mto.kernel.dir: '%mto.root%'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
# Knowledge Root (ZENTRAL)
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
mto.knowledge.root: '%mto.root%/var/knowledge'
|
mto.knowledge.root: '%mto.root%/var/knowledge'
|
||||||
|
|
||||||
mto.knowledge.ndjson: '%mto.knowledge.root%/index.ndjson'
|
mto.knowledge.ndjson: '%mto.knowledge.root%/index.ndjson'
|
||||||
@@ -21,42 +15,25 @@ parameters:
|
|||||||
mto.runtime.meta: '%mto.knowledge.root%/index_runtime.json'
|
mto.runtime.meta: '%mto.knowledge.root%/index_runtime.json'
|
||||||
mto.knowledge.upload: '%mto.knowledge.root%/uploads'
|
mto.knowledge.upload: '%mto.knowledge.root%/uploads'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
# Tags (Document Routing)
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
mto.knowledge.tags_ndjson: '%mto.knowledge.root%/tags.ndjson'
|
mto.knowledge.tags_ndjson: '%mto.knowledge.root%/tags.ndjson'
|
||||||
|
|
||||||
# Tag vector index outputs
|
|
||||||
mto.knowledge.vector_tags_index: '%mto.knowledge.root%/vector_tags.index'
|
mto.knowledge.vector_tags_index: '%mto.knowledge.root%/vector_tags.index'
|
||||||
mto.knowledge.vector_tags_index_meta: '%mto.knowledge.root%/vector_tags.index.meta.json'
|
mto.knowledge.vector_tags_index_meta: '%mto.knowledge.root%/vector_tags.index.meta.json'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
# Vector Script Directory (A2)
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
mto.vector.script_dir: '%mto.root%/python/vector'
|
mto.vector.script_dir: '%mto.root%/python/vector'
|
||||||
|
|
||||||
# Tag vector scripts
|
|
||||||
mto.vector.ingest_tags_script: '%mto.vector.script_dir%/vector_ingest_tags.py'
|
mto.vector.ingest_tags_script: '%mto.vector.script_dir%/vector_ingest_tags.py'
|
||||||
mto.vector.search_tags_script: '%mto.vector.script_dir%/vector_search_tags.py'
|
mto.vector.search_tags_script: '%mto.vector.script_dir%/vector_search_tags.py'
|
||||||
|
|
||||||
# Lock for tag rebuild jobs
|
|
||||||
mto.tags.rebuild_lock: '%mto.knowledge.root%/locks/tag_rebuild.lock'
|
mto.tags.rebuild_lock: '%mto.knowledge.root%/locks/tag_rebuild.lock'
|
||||||
|
|
||||||
# Backward compatibility alias
|
|
||||||
mto.vector.data.upload.path: '%mto.knowledge.upload%'
|
mto.vector.data.upload.path: '%mto.knowledge.upload%'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
# Index Configuration (Fallback Guardrails)
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
mto.index.chunk_size: 800
|
mto.index.chunk_size: 800
|
||||||
mto.index.chunk_overlap: 100
|
mto.index.chunk_overlap: 100
|
||||||
mto.index.embedding_model: 'all-MiniLM-L6-v2'
|
mto.index.embedding_model: 'all-MiniLM-L6-v2'
|
||||||
mto.index.embedding_dimension: 768
|
mto.index.embedding_dimension: 768
|
||||||
mto.index.scoring_version: 1
|
mto.index.scoring_version: 1
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
# Python / Vector Runtime
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
mto.vector.python_bin: '/var/www/html/.venv/bin/python3'
|
mto.vector.python_bin: '/var/www/html/.venv/bin/python3'
|
||||||
mto.vector.ingest_script: '%mto.vector.script_dir%/vector_ingest.py'
|
mto.vector.ingest_script: '%mto.vector.script_dir%/vector_ingest.py'
|
||||||
mto.vector.search_script: '%mto.vector.script_dir%/vector_search.py'
|
mto.vector.search_script: '%mto.vector.script_dir%/vector_search.py'
|
||||||
@@ -131,7 +108,7 @@ services:
|
|||||||
alias: App\Knowledge\Retrieval\CachedRetriever
|
alias: App\Knowledge\Retrieval\CachedRetriever
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# Index Configuration Provider (DB + Fallback)
|
# Index Configuration Provider
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
App\Index\IndexConfigurationProvider:
|
App\Index\IndexConfigurationProvider:
|
||||||
@@ -144,7 +121,7 @@ services:
|
|||||||
$fallbackScoringVersion: '%mto.index.scoring_version%'
|
$fallbackScoringVersion: '%mto.index.scoring_version%'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# Index Meta Manager (uses Provider)
|
# Index Meta Manager
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
App\Index\IndexMetaManager:
|
App\Index\IndexMetaManager:
|
||||||
@@ -167,13 +144,24 @@ services:
|
|||||||
$pythonBin: '%mto.vector.python_bin%'
|
$pythonBin: '%mto.vector.python_bin%'
|
||||||
$scriptPath: '%mto.vector.ingest_script%'
|
$scriptPath: '%mto.vector.ingest_script%'
|
||||||
$indexNdjsonPath: '%mto.knowledge.ndjson%'
|
$indexNdjsonPath: '%mto.knowledge.ndjson%'
|
||||||
$indexMetaPath: '%mto.knowledge.index_meta%'
|
|
||||||
$vectorIndexPath: '%mto.knowledge.vector_index%'
|
$vectorIndexPath: '%mto.knowledge.vector_index%'
|
||||||
$timeoutSeconds: '%mto.vector.timeout%'
|
$timeoutSeconds: '%mto.vector.timeout%'
|
||||||
$configurationProvider: '@App\Index\IndexConfigurationProvider'
|
$configurationProvider: '@App\Index\IndexConfigurationProvider'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# Tags Export (Document Routing)
|
# Ingest Layer (Phase B Refactor)
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
App\Ingest\GuardrailValidator: ~
|
||||||
|
|
||||||
|
App\Ingest\ChunkWriteService: ~
|
||||||
|
|
||||||
|
App\Ingest\VectorRebuildService: ~
|
||||||
|
|
||||||
|
App\Ingest\IngestFlow: ~
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
# Tags Export
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
App\Tag\TagNdjsonExporter:
|
App\Tag\TagNdjsonExporter:
|
||||||
@@ -202,7 +190,7 @@ services:
|
|||||||
App\Tag\TagRoutingService: ~
|
App\Tag\TagRoutingService: ~
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# Tag Rebuild Jobs (8A)
|
# Tag Rebuild Jobs
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
App\Service\TagRebuildJobService:
|
App\Service\TagRebuildJobService:
|
||||||
|
|||||||
@@ -11,7 +11,8 @@ from pathlib import Path
|
|||||||
parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
|
parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
|
||||||
|
|
||||||
parser.add_argument("--index", required=True, help="Path to index.ndjson")
|
parser.add_argument("--index", required=True, help="Path to index.ndjson")
|
||||||
parser.add_argument("--out", required=True, help="Path to output vector.index")
|
parser.add_argument("--out", required=True, help="Path to output vector.index (tmp)")
|
||||||
|
|
||||||
parser.add_argument("--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model")
|
parser.add_argument("--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@@ -82,13 +83,7 @@ with open(index_path, "r", encoding="utf-8") as f:
|
|||||||
if not texts:
|
if not texts:
|
||||||
print("No chunks found. Removing vector index.")
|
print("No chunks found. Removing vector index.")
|
||||||
|
|
||||||
if out_path.exists():
|
# Entferne final erst später in PHP atomar
|
||||||
out_path.unlink()
|
|
||||||
|
|
||||||
meta_path = out_path.with_suffix(".meta.json")
|
|
||||||
if meta_path.exists():
|
|
||||||
meta_path.unlink()
|
|
||||||
|
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
print(f"Loaded {len(texts)} chunks.")
|
print(f"Loaded {len(texts)} chunks.")
|
||||||
@@ -119,15 +114,18 @@ index.add(embeddings)
|
|||||||
# Ensure output directory exists
|
# Ensure output directory exists
|
||||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------
|
||||||
|
# Write FAISS index (tmp)
|
||||||
|
# ---------------------------------------------------------
|
||||||
print(f"Writing FAISS index to {out_path}")
|
print(f"Writing FAISS index to {out_path}")
|
||||||
faiss.write_index(index, str(out_path))
|
faiss.write_index(index, str(out_path))
|
||||||
|
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
# Write ID mapping meta
|
# Write ID mapping meta (tmp)
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
meta_path = out_path.with_suffix(".meta.json")
|
meta_tmp_path = Path(str(out_path) + ".meta.json")
|
||||||
|
|
||||||
with open(meta_path, "w", encoding="utf-8") as f:
|
with open(meta_tmp_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(ids, f)
|
json.dump(ids, f)
|
||||||
|
|
||||||
print(f"Indexed {len(ids)} chunks successfully.")
|
print(f"Indexed {len(ids)} chunks successfully.")
|
||||||
|
|||||||
@@ -21,12 +21,10 @@ use Symfony\Component\HttpFoundation\Response;
|
|||||||
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
|
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
|
||||||
use Symfony\Component\Routing\Attribute\Route;
|
use Symfony\Component\Routing\Attribute\Route;
|
||||||
use Symfony\Component\Uid\Uuid;
|
use Symfony\Component\Uid\Uuid;
|
||||||
use function function_exists;
|
|
||||||
|
|
||||||
#[Route('/admin/documents')]
|
#[Route('/admin/documents')]
|
||||||
class DocumentController extends AbstractController
|
class DocumentController extends AbstractController
|
||||||
{
|
{
|
||||||
|
|
||||||
#[Route('', name: 'admin_documents')]
|
#[Route('', name: 'admin_documents')]
|
||||||
public function index(EntityManagerInterface $em): Response
|
public function index(EntityManagerInterface $em): Response
|
||||||
{
|
{
|
||||||
@@ -41,7 +39,7 @@ class DocumentController extends AbstractController
|
|||||||
->getResult();
|
->getResult();
|
||||||
|
|
||||||
return $this->render('admin/document/index.html.twig', [
|
return $this->render('admin/document/index.html.twig', [
|
||||||
'documents' => $documents
|
'documents' => $documents,
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -54,7 +52,7 @@ class DocumentController extends AbstractController
|
|||||||
{
|
{
|
||||||
try {
|
try {
|
||||||
$uuid = Uuid::fromString($id);
|
$uuid = Uuid::fromString($id);
|
||||||
} catch (\Exception $e) {
|
} catch (\Exception) {
|
||||||
throw new NotFoundHttpException();
|
throw new NotFoundHttpException();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -65,7 +63,7 @@ class DocumentController extends AbstractController
|
|||||||
}
|
}
|
||||||
|
|
||||||
return $this->render('admin/document/show.html.twig', [
|
return $this->render('admin/document/show.html.twig', [
|
||||||
'document' => $document
|
'document' => $document,
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -76,57 +74,51 @@ class DocumentController extends AbstractController
|
|||||||
FormatText $formatText,
|
FormatText $formatText,
|
||||||
IngestJobService $jobService,
|
IngestJobService $jobService,
|
||||||
ParameterBagInterface $params
|
ParameterBagInterface $params
|
||||||
): Response
|
): Response {
|
||||||
{
|
if (!$request->isMethod('POST')) {
|
||||||
if ($request->isMethod('POST')) {
|
return $this->render('admin/document/new.html.twig');
|
||||||
|
}
|
||||||
|
|
||||||
/** @var UploadedFile|null $file */
|
/** @var UploadedFile|null $file */
|
||||||
$file = $request->files->get('file');
|
$file = $request->files->get('file');
|
||||||
|
|
||||||
if (!$file instanceof UploadedFile) {
|
if (!$file instanceof UploadedFile) {
|
||||||
throw new \InvalidArgumentException('No valid file uploaded.');
|
throw new \InvalidArgumentException('No valid file uploaded.');
|
||||||
}
|
}
|
||||||
|
|
||||||
$rawTitle = $request->request->get('title');
|
$rawTitle = $request->request->get('title');
|
||||||
|
|
||||||
$title = is_string($rawTitle) && $rawTitle !== ''
|
$title = is_string($rawTitle) && $rawTitle !== ''
|
||||||
? $rawTitle
|
? $rawTitle
|
||||||
: $formatText->slugify($file->getClientOriginalName());
|
: $formatText->slugify($file->getClientOriginalName());
|
||||||
|
|
||||||
|
|
||||||
if (!$title) {
|
if (!$title) {
|
||||||
$this->addFlash('error', 'Titel ist erforderlich.');
|
$this->addFlash('error', 'Titel ist erforderlich.');
|
||||||
return $this->redirectToRoute('admin_document_new');
|
return $this->redirectToRoute('admin_document_new');
|
||||||
}
|
}
|
||||||
|
|
||||||
$uploadDir = $params->get('mto.vector.data.upload.path');
|
$uploadDir = (string)$params->get('mto.vector.data.upload.path');
|
||||||
|
$this->ensureDir($uploadDir);
|
||||||
|
|
||||||
if (!is_dir($uploadDir)) {
|
$newFilename = uniqid('', true) . '_' . $file->getClientOriginalName();
|
||||||
mkdir($uploadDir, 0777, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
$newFilename = uniqid() . '_' . $file->getClientOriginalName();
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
$file->move($uploadDir, $newFilename);
|
$file->move($uploadDir, $newFilename);
|
||||||
} catch (FileException $e) {
|
} catch (FileException) {
|
||||||
throw new \RuntimeException('File upload failed.');
|
throw new \RuntimeException('File upload failed.');
|
||||||
}
|
}
|
||||||
|
|
||||||
$filePath = $uploadDir . '/' . $newFilename;
|
$filePath = $uploadDir . '/' . $newFilename;
|
||||||
|
|
||||||
// Dokument erstellen
|
|
||||||
$document = $documentService->createDocument(
|
$document = $documentService->createDocument(
|
||||||
$title,
|
$title,
|
||||||
$filePath,
|
$filePath,
|
||||||
$this->getUser()
|
$this->getUser()
|
||||||
);
|
);
|
||||||
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
// AUTO-INTEGRATION: gleicher Flow wie "Version aktivieren"
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
|
|
||||||
$version = $document->getCurrentVersion();
|
$version = $document->getCurrentVersion();
|
||||||
|
if (!$version instanceof DocumentVersion) {
|
||||||
|
$this->addFlash('danger', 'Dokument erstellt, aber es wurde keine aktuelle Version erzeugt.');
|
||||||
|
return $this->redirectToRoute('admin_documents');
|
||||||
|
}
|
||||||
|
|
||||||
$job = $jobService->startJob(
|
$job = $jobService->startJob(
|
||||||
IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE,
|
IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE,
|
||||||
@@ -137,33 +129,19 @@ class DocumentController extends AbstractController
|
|||||||
IngestJob::STATUS_QUEUED
|
IngestJob::STATUS_QUEUED
|
||||||
);
|
);
|
||||||
|
|
||||||
$projectDir = (string)$this->getParameter('kernel.project_dir');
|
if (!$this->canExec()) {
|
||||||
$console = $projectDir . '/bin/console';
|
|
||||||
|
|
||||||
$cmd = sprintf(
|
|
||||||
'%s %s %s %s > /dev/null 2>&1 &',
|
|
||||||
escapeshellarg($console),
|
|
||||||
escapeshellarg('mto:agent:ingest:run'),
|
|
||||||
escapeshellarg((string)$job->getId()),
|
|
||||||
escapeshellarg('--no-interaction'),
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!function_exists('exec')) {
|
|
||||||
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
|
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
|
||||||
$this->addFlash('danger', 'Dokument erstellt, aber Ingest konnte nicht asynchron gestartet werden.');
|
$this->addFlash('danger', 'Dokument erstellt, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
|
||||||
return $this->redirectToRoute('admin_documents');
|
return $this->redirectToRoute('admin_documents');
|
||||||
}
|
}
|
||||||
|
|
||||||
exec($cmd);
|
$this->startIngestJob((string)$job->getId());
|
||||||
|
|
||||||
return $this->redirectToRoute('admin_job_show', [
|
return $this->redirectToRoute('admin_job_show', [
|
||||||
'id' => (string)$job->getId(),
|
'id' => (string)$job->getId(),
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
return $this->render('admin/document/new.html.twig');
|
|
||||||
}
|
|
||||||
|
|
||||||
#[Route('/{id}/version/new', name: 'admin_document_version_new', requirements: ['id' => '[0-9a-fA-F\-]{36}'])]
|
#[Route('/{id}/version/new', name: 'admin_document_version_new', requirements: ['id' => '[0-9a-fA-F\-]{36}'])]
|
||||||
public function newVersion(
|
public function newVersion(
|
||||||
string $id,
|
string $id,
|
||||||
@@ -171,35 +149,34 @@ class DocumentController extends AbstractController
|
|||||||
EntityManagerInterface $em,
|
EntityManagerInterface $em,
|
||||||
DocumentService $documentService,
|
DocumentService $documentService,
|
||||||
ParameterBagInterface $params
|
ParameterBagInterface $params
|
||||||
): Response
|
): Response {
|
||||||
{
|
|
||||||
|
|
||||||
$document = $em->getRepository(Document::class)->find($id);
|
$document = $em->getRepository(Document::class)->find($id);
|
||||||
|
|
||||||
if (!$document) {
|
if (!$document) {
|
||||||
throw $this->createNotFoundException();
|
throw $this->createNotFoundException();
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($request->isMethod('POST')) {
|
if (!$request->isMethod('POST')) {
|
||||||
|
return $this->render('admin/document/new_version.html.twig', [
|
||||||
|
'document' => $document,
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @var UploadedFile|null $file */
|
||||||
$file = $request->files->get('file');
|
$file = $request->files->get('file');
|
||||||
|
if (!$file instanceof UploadedFile) {
|
||||||
if (!$file) {
|
|
||||||
$this->addFlash('error', 'Datei ist erforderlich.');
|
$this->addFlash('error', 'Datei ist erforderlich.');
|
||||||
return $this->redirectToRoute('admin_document_version_new', ['id' => $id]);
|
return $this->redirectToRoute('admin_document_version_new', ['id' => $id]);
|
||||||
}
|
}
|
||||||
|
|
||||||
$uploadDir = $params->get('mto.vector.data.upload.path');
|
$uploadDir = (string)$params->get('mto.vector.data.upload.path');
|
||||||
|
$this->ensureDir($uploadDir);
|
||||||
|
|
||||||
if (!is_dir($uploadDir)) {
|
$newFilename = uniqid('', true) . '_' . $file->getClientOriginalName();
|
||||||
mkdir($uploadDir, 0777, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
$newFilename = uniqid() . '_' . $file->getClientOriginalName();
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
$file->move($uploadDir, $newFilename);
|
$file->move($uploadDir, $newFilename);
|
||||||
} catch (FileException $e) {
|
} catch (FileException) {
|
||||||
throw new \RuntimeException('File upload failed.');
|
throw new \RuntimeException('File upload failed.');
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -214,11 +191,6 @@ class DocumentController extends AbstractController
|
|||||||
return $this->redirectToRoute('admin_document_show', ['id' => $id]);
|
return $this->redirectToRoute('admin_document_show', ['id' => $id]);
|
||||||
}
|
}
|
||||||
|
|
||||||
return $this->render('admin/document/new_version.html.twig', [
|
|
||||||
'document' => $document
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[Route(
|
#[Route(
|
||||||
'/version/{versionId}/activate',
|
'/version/{versionId}/activate',
|
||||||
name: 'admin_document_version_activate',
|
name: 'admin_document_version_activate',
|
||||||
@@ -231,27 +203,18 @@ class DocumentController extends AbstractController
|
|||||||
EntityManagerInterface $em,
|
EntityManagerInterface $em,
|
||||||
DocumentService $documentService,
|
DocumentService $documentService,
|
||||||
IngestJobService $jobService,
|
IngestJobService $jobService,
|
||||||
): RedirectResponse
|
): RedirectResponse {
|
||||||
{
|
if (!$this->isCsrfTokenValid('activate_version_' . $versionId, (string)$request->request->get('_token'))) {
|
||||||
|
|
||||||
if (!$this->isCsrfTokenValid('activate_version_' . $versionId, $request->request->get('_token'))) {
|
|
||||||
throw $this->createAccessDeniedException();
|
throw $this->createAccessDeniedException();
|
||||||
}
|
}
|
||||||
|
|
||||||
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
|
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
|
||||||
|
|
||||||
if (!$version) {
|
if (!$version) {
|
||||||
throw $this->createNotFoundException();
|
throw $this->createNotFoundException();
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
$documentService->activateVersion($version);
|
$documentService->activateVersion($version);
|
||||||
// ---------------------------------------------------------
|
|
||||||
// Saubere IngestJob-Integration:
|
|
||||||
// 1) Job als QUEUED anlegen (spezieller Typ für Aktivierung)
|
|
||||||
// 2) Symfony-Command im Hintergrund starten
|
|
||||||
// 3) Direkt auf Job-Detailseite redirecten (Loader + Polling)
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
|
|
||||||
$job = $jobService->startJob(
|
$job = $jobService->startJob(
|
||||||
IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE,
|
IngestJob::TYPE_DOCUMENT_VERSION_ACTIVATE,
|
||||||
@@ -262,28 +225,15 @@ class DocumentController extends AbstractController
|
|||||||
IngestJob::STATUS_QUEUED
|
IngestJob::STATUS_QUEUED
|
||||||
);
|
);
|
||||||
|
|
||||||
// Hintergrundprozess starten (Provider-kompatibel, kein Worker/Daemon)
|
if (!$this->canExec()) {
|
||||||
$projectDir = (string)$this->getParameter('kernel.project_dir');
|
|
||||||
$console = $projectDir . '/bin/console';
|
|
||||||
|
|
||||||
$cmd = sprintf(
|
|
||||||
'%s %s %s %s > /dev/null 2>&1 &',
|
|
||||||
escapeshellarg($console),
|
|
||||||
escapeshellarg('mto:agent:ingest:run'),
|
|
||||||
escapeshellarg((string)$job->getId()),
|
|
||||||
escapeshellarg('--no-interaction'),
|
|
||||||
);
|
|
||||||
|
|
||||||
// Best effort: wenn exec deaktiviert ist, sauber abbrechen.
|
|
||||||
if (!function_exists('exec')) {
|
|
||||||
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
|
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
|
||||||
$this->addFlash('danger', 'Aktivierung ok, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
|
$this->addFlash('danger', 'Aktivierung ok, aber Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
|
||||||
return $this->redirectToRoute('admin_document_show', [
|
return $this->redirectToRoute('admin_document_show', [
|
||||||
'id' => $version->getDocument()->getId()
|
'id' => $version->getDocument()->getId(),
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
exec($cmd);
|
$this->startIngestJob((string)$job->getId());
|
||||||
|
|
||||||
$this->addFlash('success', 'Version aktiviert. Ingest-Job wurde erstellt und gestartet.');
|
$this->addFlash('success', 'Version aktiviert. Ingest-Job wurde erstellt und gestartet.');
|
||||||
|
|
||||||
@@ -295,7 +245,7 @@ class DocumentController extends AbstractController
|
|||||||
}
|
}
|
||||||
|
|
||||||
return $this->redirectToRoute('admin_document_show', [
|
return $this->redirectToRoute('admin_document_show', [
|
||||||
'id' => $version->getDocument()->getId()
|
'id' => $version->getDocument()->getId(),
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -310,19 +260,17 @@ class DocumentController extends AbstractController
|
|||||||
Request $request,
|
Request $request,
|
||||||
EntityManagerInterface $em,
|
EntityManagerInterface $em,
|
||||||
IngestJobService $jobService,
|
IngestJobService $jobService,
|
||||||
): ?RedirectResponse
|
): ?RedirectResponse {
|
||||||
{
|
if (!$this->isCsrfTokenValid('ingest_version_' . $versionId, (string)$request->request->get('_token'))) {
|
||||||
$dryRun = false;
|
|
||||||
if (!$this->isCsrfTokenValid('ingest_version_' . $versionId, $request->request->get('_token'))) {
|
|
||||||
throw $this->createAccessDeniedException();
|
throw $this->createAccessDeniedException();
|
||||||
}
|
}
|
||||||
|
|
||||||
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
|
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
|
||||||
|
|
||||||
if (!$version) {
|
if (!$version) {
|
||||||
throw $this->createNotFoundException();
|
throw $this->createNotFoundException();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @var IngestJob|null $existing */
|
||||||
$existing = $em->getRepository(IngestJob::class)
|
$existing = $em->getRepository(IngestJob::class)
|
||||||
->findOneBy(
|
->findOneBy(
|
||||||
['documentVersionId' => $version->getId()],
|
['documentVersionId' => $version->getId()],
|
||||||
@@ -333,13 +281,6 @@ class DocumentController extends AbstractController
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
// Asynchroner Ingest (ohne Messenger):
|
|
||||||
// 1) Job als QUEUED anlegen
|
|
||||||
// 2) Symfony-Command im Hintergrund starten
|
|
||||||
// 3) Direkt auf Job-Detailseite redirecten (Loader + Polling)
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
|
|
||||||
$job = $jobService->startJob(
|
$job = $jobService->startJob(
|
||||||
IngestJob::TYPE_DOCUMENT,
|
IngestJob::TYPE_DOCUMENT,
|
||||||
$this->getUser(),
|
$this->getUser(),
|
||||||
@@ -349,28 +290,15 @@ class DocumentController extends AbstractController
|
|||||||
IngestJob::STATUS_QUEUED
|
IngestJob::STATUS_QUEUED
|
||||||
);
|
);
|
||||||
|
|
||||||
// Hintergrundprozess starten (Provider-kompatibel, kein Worker/Daemon)
|
if (!$this->canExec()) {
|
||||||
$projectDir = (string)$this->getParameter('kernel.project_dir');
|
|
||||||
$console = $projectDir . '/bin/console';
|
|
||||||
|
|
||||||
$cmd = sprintf(
|
|
||||||
'%s %s %s %s > /dev/null 2>&1 &',
|
|
||||||
escapeshellarg($console),
|
|
||||||
escapeshellarg('mto:agent:ingest:run'),
|
|
||||||
escapeshellarg((string)$job->getId()),
|
|
||||||
escapeshellarg('--no-interaction'),
|
|
||||||
);
|
|
||||||
|
|
||||||
// Best effort: wenn exec deaktiviert ist, sauber abbrechen.
|
|
||||||
if (!function_exists('exec')) {
|
|
||||||
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
|
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
|
||||||
$this->addFlash('error', 'Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
|
$this->addFlash('error', 'Ingest konnte nicht asynchron gestartet werden (exec deaktiviert).');
|
||||||
return $this->redirectToRoute('admin_document_show', [
|
return $this->redirectToRoute('admin_document_show', [
|
||||||
'id' => $version->getDocument()->getId()
|
'id' => $version->getDocument()->getId(),
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
exec($cmd);
|
$this->startIngestJob((string)$job->getId());
|
||||||
|
|
||||||
return $this->redirectToRoute('admin_job_show', [
|
return $this->redirectToRoute('admin_job_show', [
|
||||||
'id' => (string)$job->getId(),
|
'id' => (string)$job->getId(),
|
||||||
@@ -384,17 +312,21 @@ class DocumentController extends AbstractController
|
|||||||
)]
|
)]
|
||||||
public function resetCompleteSystem(ParameterBagInterface $params, Connection $connection): ?RedirectResponse
|
public function resetCompleteSystem(ParameterBagInterface $params, Connection $connection): ?RedirectResponse
|
||||||
{
|
{
|
||||||
if (!function_exists('exec')) {
|
if (!$this->canExec()) {
|
||||||
$this->addFlash('danger', 'Der Reset konnte nicht gestartet werden (exec deaktiviert).');
|
$this->addFlash('danger', 'Der Reset konnte nicht gestartet werden (exec deaktiviert).');
|
||||||
return $this->redirectToRoute('admin_dashboard');
|
return $this->redirectToRoute('admin_dashboard');
|
||||||
}
|
}
|
||||||
|
|
||||||
@unlink($params->get('mto.knowledge.ndjson'));
|
@unlink((string)$params->get('mto.knowledge.ndjson'));
|
||||||
@unlink($params->get('mto.knowledge.vector_index'));
|
@unlink((string)$params->get('mto.knowledge.vector_index'));
|
||||||
@unlink($params->get('mto.knowledge.vector_index_meta'));
|
@unlink((string)$params->get('mto.knowledge.vector_index_meta'));
|
||||||
@unlink($params->get('mto.knowledge.index_meta'));
|
@unlink((string)$params->get('mto.knowledge.index_meta'));
|
||||||
@unlink($params->get('mto.runtime.meta'));
|
@unlink((string)$params->get('mto.runtime.meta'));
|
||||||
exec('rm -rf ' . $params->get('mto.knowledge.upload'));
|
|
||||||
|
$uploadDir = (string)$params->get('mto.knowledge.upload');
|
||||||
|
if ($uploadDir !== '' && is_dir($uploadDir)) {
|
||||||
|
exec('rm -rf ' . escapeshellarg($uploadDir));
|
||||||
|
}
|
||||||
|
|
||||||
$sql = '
|
$sql = '
|
||||||
SET FOREIGN_KEY_CHECKS = 0;
|
SET FOREIGN_KEY_CHECKS = 0;
|
||||||
@@ -425,39 +357,29 @@ class DocumentController extends AbstractController
|
|||||||
EntityManagerInterface $em,
|
EntityManagerInterface $em,
|
||||||
IngestJobService $jobService,
|
IngestJobService $jobService,
|
||||||
LockService $lockService,
|
LockService $lockService,
|
||||||
DocumentService $documentService
|
): RedirectResponse {
|
||||||
): RedirectResponse
|
if (!$this->isCsrfTokenValid('delete_document_' . $id, (string)$request->request->get('_token'))) {
|
||||||
{
|
|
||||||
if (!$this->isCsrfTokenValid('delete_document_' . $id, $request->request->get('_token'))) {
|
|
||||||
throw $this->createAccessDeniedException();
|
throw $this->createAccessDeniedException();
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
$uuid = Uuid::fromString($id);
|
$uuid = Uuid::fromString($id);
|
||||||
} catch (\Exception $e) {
|
} catch (\Exception) {
|
||||||
throw $this->createNotFoundException();
|
throw $this->createNotFoundException();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @var Document|null $document */
|
||||||
$document = $em->getRepository(Document::class)->find($uuid);
|
$document = $em->getRepository(Document::class)->find($uuid);
|
||||||
|
|
||||||
if (!$document) {
|
if (!$document) {
|
||||||
throw $this->createNotFoundException();
|
throw $this->createNotFoundException();
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
// 🔒 Delete nur erlauben wenn kein anderer Job läuft
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
if (!$lockService->acquire()) {
|
if (!$lockService->acquire()) {
|
||||||
$this->addFlash('danger', 'Ein Ingest-Job läuft bereits. Löschen derzeit nicht möglich.');
|
$this->addFlash('danger', 'Ein Ingest-Job läuft bereits. Löschen derzeit nicht möglich.');
|
||||||
return $this->redirectToRoute('admin_documents');
|
return $this->redirectToRoute('admin_documents');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Nur Test-Lock – echter Lock im Orchestrator
|
|
||||||
$lockService->release();
|
$lockService->release();
|
||||||
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
// 1) Delete-Job anlegen (QUEUED)
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
$job = $jobService->startJob(
|
$job = $jobService->startJob(
|
||||||
IngestJob::TYPE_DOCUMENT_DELETE,
|
IngestJob::TYPE_DOCUMENT_DELETE,
|
||||||
$this->getUser(),
|
$this->getUser(),
|
||||||
@@ -467,27 +389,13 @@ class DocumentController extends AbstractController
|
|||||||
IngestJob::STATUS_QUEUED
|
IngestJob::STATUS_QUEUED
|
||||||
);
|
);
|
||||||
|
|
||||||
// ---------------------------------------------------------
|
if (!$this->canExec()) {
|
||||||
// 2) Hintergrundprozess starten
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
$projectDir = (string)$this->getParameter('kernel.project_dir');
|
|
||||||
$console = $projectDir . '/bin/console';
|
|
||||||
|
|
||||||
$cmd = sprintf(
|
|
||||||
'%s %s %s %s > /dev/null 2>&1 &',
|
|
||||||
escapeshellarg($console),
|
|
||||||
escapeshellarg('mto:agent:ingest:run'),
|
|
||||||
escapeshellarg((string)$job->getId()),
|
|
||||||
escapeshellarg('--no-interaction'),
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!function_exists('exec')) {
|
|
||||||
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
|
$jobService->markFailed($job, 'Server configuration does not allow background execution (exec disabled).');
|
||||||
$this->addFlash('danger', 'Löschen konnte nicht gestartet werden (exec deaktiviert).');
|
$this->addFlash('danger', 'Löschen konnte nicht gestartet werden (exec deaktiviert).');
|
||||||
return $this->redirectToRoute('admin_documents');
|
return $this->redirectToRoute('admin_documents');
|
||||||
}
|
}
|
||||||
|
|
||||||
exec($cmd);
|
$this->startIngestJob((string)$job->getId());
|
||||||
|
|
||||||
$this->addFlash('success', 'Löschvorgang gestartet. Dokument wird nach Index-Rebuild entfernt.');
|
$this->addFlash('success', 'Löschvorgang gestartet. Dokument wird nach Index-Rebuild entfernt.');
|
||||||
|
|
||||||
@@ -495,4 +403,42 @@ class DocumentController extends AbstractController
|
|||||||
'id' => (string)$job->getId(),
|
'id' => (string)$job->getId(),
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =========================================================
|
||||||
|
// Helpers
|
||||||
|
// =========================================================
|
||||||
|
|
||||||
|
/**
 * Reports whether a function named `exec` is available in this PHP runtime.
 *
 * Callers use this as a pre-flight check before spawning a background
 * console process.
 *
 * @return bool the result of function_exists('exec')
 */
private function canExec(): bool
{
    $execAvailable = function_exists('exec');

    return $execAvailable;
}
|
||||||
|
|
||||||
|
/**
 * Makes sure the given directory exists, creating it recursively when needed.
 *
 * @param string $dir directory path; an empty string is rejected
 *
 * @throws \RuntimeException when $dir is empty or the directory cannot be created
 */
private function ensureDir(string $dir): void
{
    if ('' === $dir) {
        throw new \RuntimeException('Upload directory not configured.');
    }

    if (is_dir($dir)) {
        return;
    }

    // The trailing is_dir() check tolerates mkdir() failing because the
    // directory came into existence between the first check and the call.
    if (!mkdir($dir, 0777, true) && !is_dir($dir)) {
        throw new \RuntimeException('Unable to create upload directory.');
    }
}
|
||||||
|
|
||||||
|
/**
 * Launches the `mto:agent:ingest:run` console command for the given job as a
 * detached shell process (output discarded, trailing `&`).
 *
 * The exit status of the spawned command is not observed here.
 *
 * NOTE(review): PHP_BINARY is used as the interpreter. Under a non-CLI SAPI
 * this constant may not point at a CLI binary - confirm for the deployment.
 *
 * @param string $jobId identifier passed to the console command as its argument
 */
private function startIngestJob(string $jobId): void
{
    $consolePath = (string)$this->getParameter('kernel.project_dir') . '/bin/console';

    // IMPORTANT: --no-interaction is a global flag and has to come BEFORE the
    // command name.
    $commandParts = [
        escapeshellarg(PHP_BINARY),
        escapeshellarg($consolePath),
        '--no-interaction',
        escapeshellarg('mto:agent:ingest:run'),
        escapeshellarg($jobId),
    ];

    exec(implode(' ', $commandParts) . ' > /dev/null 2>&1 &');
}
|
||||||
}
|
}
|
||||||
@@ -16,14 +16,12 @@ final class IndexMetaManager
|
|||||||
IndexConfigurationProvider $provider
|
IndexConfigurationProvider $provider
|
||||||
) {
|
) {
|
||||||
$this->metaPath = $metaPath;
|
$this->metaPath = $metaPath;
|
||||||
$this->provider = $provider;
|
|
||||||
|
|
||||||
// runtime liegt im selben Verzeichnis
|
|
||||||
$this->runtimePath = $runTimePath;
|
$this->runtimePath = $runTimePath;
|
||||||
|
$this->provider = $provider;
|
||||||
}
|
}
|
||||||
|
|
||||||
// =====================================================
|
// =====================================================
|
||||||
// META (Governance – unverändert lassen!)
|
// META (Governance – unverändert inhaltlich)
|
||||||
// =====================================================
|
// =====================================================
|
||||||
|
|
||||||
public function ensureExists(): void
|
public function ensureExists(): void
|
||||||
@@ -39,10 +37,12 @@ final class IndexMetaManager
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return json_decode(
|
$data = json_decode(
|
||||||
(string) file_get_contents($this->metaPath),
|
(string) file_get_contents($this->metaPath),
|
||||||
true
|
true
|
||||||
);
|
);
|
||||||
|
|
||||||
|
return is_array($data) ? $data : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function validateAgainstCurrent(): void
|
public function validateAgainstCurrent(): void
|
||||||
@@ -85,18 +85,7 @@ final class IndexMetaManager
|
|||||||
$config->toStructureArray()
|
$config->toStructureArray()
|
||||||
);
|
);
|
||||||
|
|
||||||
$dir = dirname($this->metaPath);
|
$this->atomicWriteJson($this->metaPath, $payload);
|
||||||
if (!is_dir($dir)) {
|
|
||||||
mkdir($dir, 0777, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
file_put_contents(
|
|
||||||
$this->metaPath,
|
|
||||||
json_encode(
|
|
||||||
$payload,
|
|
||||||
JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES
|
|
||||||
)
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// =====================================================
|
// =====================================================
|
||||||
@@ -109,20 +98,12 @@ final class IndexMetaManager
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
$dir = dirname($this->runtimePath);
|
|
||||||
if (!is_dir($dir)) {
|
|
||||||
mkdir($dir, 0777, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
$payload = [
|
$payload = [
|
||||||
'chunk_count' => 0,
|
'chunk_count' => 0,
|
||||||
'last_rebuild_at' => null,
|
'last_rebuild_at' => null,
|
||||||
];
|
];
|
||||||
|
|
||||||
file_put_contents(
|
$this->atomicWriteJson($this->runtimePath, $payload);
|
||||||
$this->runtimePath,
|
|
||||||
json_encode($payload, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public function updateRuntimeStats(int $chunkCount): void
|
public function updateRuntimeStats(int $chunkCount): void
|
||||||
@@ -134,10 +115,7 @@ final class IndexMetaManager
|
|||||||
'last_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
|
'last_rebuild_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
|
||||||
];
|
];
|
||||||
|
|
||||||
file_put_contents(
|
$this->atomicWriteJson($this->runtimePath, $payload);
|
||||||
$this->runtimePath,
|
|
||||||
json_encode($payload, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getRuntimeChunkCount(): int
|
public function getRuntimeChunkCount(): int
|
||||||
@@ -151,5 +129,37 @@ final class IndexMetaManager
|
|||||||
|
|
||||||
return (int)($data['chunk_count'] ?? 0);
|
return (int)($data['chunk_count'] ?? 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =====================================================
|
||||||
|
// INTERNAL ATOMIC JSON WRITE
|
||||||
|
// =====================================================
|
||||||
|
|
||||||
|
/**
 * Serialises $payload as pretty-printed JSON and writes it to $path by first
 * writing a sibling "<path>.tmp" file and then renaming it over the target.
 *
 * The parent directory is created recursively if it does not exist.
 *
 * @param string $path    final destination of the JSON file
 * @param array  $payload data to encode
 *
 * @throws \RuntimeException when the directory cannot be created, the payload
 *                           cannot be encoded, the temp file cannot be written,
 *                           or the rename fails
 */
private function atomicWriteJson(string $path, array $payload): void
{
    $targetDir = dirname($path);

    if (!is_dir($targetDir)) {
        // Re-check after a failed mkdir(): the directory may have appeared in
        // the meantime.
        if (!mkdir($targetDir, 0777, true) && !is_dir($targetDir)) {
            throw new \RuntimeException('Unable to create directory for meta/runtime');
        }
    }

    $stagingPath = $path . '.tmp';

    $encoded = json_encode($payload, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);
    if (false === $encoded) {
        throw new \RuntimeException('Unable to encode JSON payload');
    }

    if (false === file_put_contents($stagingPath, $encoded)) {
        throw new \RuntimeException('Unable to write temporary JSON file');
    }

    if (rename($stagingPath, $path)) {
        return;
    }

    // Best-effort cleanup of the staging file before reporting the failure.
    @unlink($stagingPath);

    throw new \RuntimeException('Atomic switch failed for JSON file');
}
|
||||||
|
}
|
||||||
72
src/Ingest/ChunkWriteService.php
Normal file
72
src/Ingest/ChunkWriteService.php
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Ingest;
|
||||||
|
|
||||||
|
use App\Entity\DocumentVersion;
|
||||||
|
use App\Knowledge\ChunkManager;
|
||||||
|
use Symfony\Component\Uid\Uuid;
|
||||||
|
|
||||||
|
/**
 * Write-side facade over ChunkManager for the ingest pipeline.
 *
 * Every method forwards to the injected ChunkManager; this class holds no
 * state of its own.
 */
final readonly class ChunkWriteService
{
    public function __construct(
        private ChunkManager $chunkManager,
    ) {
    }

    /**
     * Returns the index path as reported by the ChunkManager.
     */
    public function getIndexPath(): string
    {
        $indexPath = $this->chunkManager->getIndexPath();

        return $indexPath;
    }

    /**
     * Returns the total chunk count as reported by the ChunkManager.
     */
    public function countAllChunks(): int
    {
        $chunkTotal = $this->chunkManager->countAllChunks();

        return $chunkTotal;
    }

    /**
     * Asks the ChunkManager to compact the index for the given document.
     */
    public function compactByDocumentId(Uuid $documentId): void
    {
        $this->chunkManager->compactByDocument($documentId);
    }

    /**
     * Appends the given chunk records via the ChunkManager.
     *
     * @param iterable<array<string,mixed>> $chunks
     */
    public function appendChunks(iterable $chunks): void
    {
        $this->chunkManager->appendChunks($chunks);
    }

    /**
     * Local ingest for a single DocumentVersion.
     *
     * Steps:
     *  1. Remove the existing chunks of the version's document.
     *  2. Append the new chunks.
     *
     * @param iterable<array<string,mixed>> $chunks
     *
     * @throws \RuntimeException when the document's ID is not a Uuid instance
     */
    public function writeForDocumentVersion(
        DocumentVersion $version,
        iterable $chunks
    ): void {
        $ownerId = $version->getDocument()->getId();

        if ($ownerId instanceof Uuid) {
            $this->chunkManager->compactByDocument($ownerId);
            $this->chunkManager->appendChunks($chunks);

            return;
        }

        throw new \RuntimeException('Document ID must be a Uuid instance');
    }

    /**
     * Complete rewrite of the NDJSON index (global reindex).
     *
     * @param iterable<array<string,mixed>> $allChunks
     */
    public function rewriteAll(iterable $allChunks): void
    {
        $this->chunkManager->rewriteAll($allChunks);
    }
}
|
||||||
23
src/Ingest/GuardrailValidator.php
Normal file
23
src/Ingest/GuardrailValidator.php
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Ingest;
|
||||||
|
|
||||||
|
use App\Index\IndexMetaManager;
|
||||||
|
|
||||||
|
/**
 * Pre-flight guard for local ingest operations; delegates the actual check to
 * IndexMetaManager::validateAgainstCurrent().
 */
final readonly class GuardrailValidator
{
    public function __construct(
        private IndexMetaManager $metaManager,
    ) {
    }

    /**
     * Throws an exception when a local ingest is not compatible and a global
     * reindex is required.
     *
     * The exception, if any, originates from the IndexMetaManager call and is
     * propagated unchanged.
     */
    public function validateOrThrow(): void
    {
        $manager = $this->metaManager;

        $manager->validateAgainstCurrent();
    }
}
|
||||||
@@ -7,9 +7,7 @@ namespace App\Ingest;
|
|||||||
use App\Entity\Document;
|
use App\Entity\Document;
|
||||||
use App\Entity\DocumentVersion;
|
use App\Entity\DocumentVersion;
|
||||||
use App\Index\IndexMetaManager;
|
use App\Index\IndexMetaManager;
|
||||||
use App\Knowledge\ChunkManager;
|
|
||||||
use App\Knowledge\Ingest\KnowledgeIngestService;
|
use App\Knowledge\Ingest\KnowledgeIngestService;
|
||||||
use App\Vector\VectorIndexBuilder;
|
|
||||||
use Doctrine\ORM\EntityManagerInterface;
|
use Doctrine\ORM\EntityManagerInterface;
|
||||||
use Psr\Log\LoggerInterface;
|
use Psr\Log\LoggerInterface;
|
||||||
use Symfony\Component\Uid\Uuid;
|
use Symfony\Component\Uid\Uuid;
|
||||||
@@ -21,45 +19,52 @@ final readonly class IngestFlow
|
|||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private KnowledgeIngestService $knowledgeIngestService,
|
private KnowledgeIngestService $knowledgeIngestService,
|
||||||
private ChunkManager $chunkManager,
|
private GuardrailValidator $guardrailValidator,
|
||||||
private VectorIndexBuilder $vectorBuilder,
|
private ChunkWriteService $chunkWriteService,
|
||||||
|
private VectorRebuildService $vectorRebuildService,
|
||||||
private IndexMetaManager $metaManager,
|
private IndexMetaManager $metaManager,
|
||||||
|
private IngestLockService $lockService,
|
||||||
private LoggerInterface $logger,
|
private LoggerInterface $logger,
|
||||||
private EntityManagerInterface $em,
|
private EntityManagerInterface $em,
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
// =========================================================
|
// =========================================================
|
||||||
// DOCUMENT INGEST (STREAMING SAFE)
|
// DOCUMENT INGEST
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
public function ingestDocumentVersion(DocumentVersion $version): void
|
public function ingestDocumentVersion(DocumentVersion $version): void
|
||||||
{
|
{
|
||||||
$this->metaManager->validateAgainstCurrent();
|
$this->withLock(function () use ($version): void {
|
||||||
|
|
||||||
|
$this->guardrailValidator->validateOrThrow();
|
||||||
|
|
||||||
$version->setIngestStatus(DocumentVersion::INGEST_RUNNING);
|
$version->setIngestStatus(DocumentVersion::INGEST_RUNNING);
|
||||||
$this->em->flush();
|
$this->em->flush();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
$documentId = $version->getDocument()->getId();
|
||||||
|
if (!$documentId instanceof Uuid) {
|
||||||
|
throw new \RuntimeException('Document ID must be a Uuid instance');
|
||||||
|
}
|
||||||
|
|
||||||
$this->chunkManager->compactByDocument($version->getDocument()->getId());
|
// 1) Alte Chunks entfernen
|
||||||
|
$this->chunkWriteService->compactByDocumentId($documentId);
|
||||||
|
|
||||||
$existing = $this->chunkManager->countAllChunks();
|
// 2) Existing Chunks nach Compaction zählen
|
||||||
|
$existing = $this->chunkWriteService->countAllChunks();
|
||||||
|
|
||||||
$incoming = 0;
|
$incoming = 0;
|
||||||
|
$warned = false;
|
||||||
|
|
||||||
$generator = $this->knowledgeIngestService->buildChunkRecords($version);
|
$generator = $this->knowledgeIngestService->buildChunkRecords($version);
|
||||||
|
|
||||||
$wrappedGenerator = (function () use ($generator, $existing, &$incoming) {
|
$wrappedGenerator = (function () use ($generator, $existing, &$incoming, &$warned) {
|
||||||
|
|
||||||
foreach ($generator as $record) {
|
foreach ($generator as $record) {
|
||||||
|
|
||||||
$incoming++;
|
$incoming++;
|
||||||
$total = $existing + $incoming;
|
$total = $existing + $incoming;
|
||||||
|
|
||||||
if ($total >= self::CHUNK_LIMIT_WARN) {
|
if (!$warned && $total >= self::CHUNK_LIMIT_WARN) {
|
||||||
// Nur einmal warnen
|
$warned = true;
|
||||||
if ($incoming === 1 || $total === self::CHUNK_LIMIT_WARN) {
|
|
||||||
// Logging erfolgt außerhalb des Streams final
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($total > self::CHUNK_LIMIT_HARD) {
|
if ($total > self::CHUNK_LIMIT_HARD) {
|
||||||
@@ -68,40 +73,47 @@ final readonly class IngestFlow
|
|||||||
|
|
||||||
yield $record;
|
yield $record;
|
||||||
}
|
}
|
||||||
|
|
||||||
})();
|
})();
|
||||||
|
|
||||||
$this->chunkManager->appendChunks($wrappedGenerator);
|
// 3) Streaming Append
|
||||||
|
$this->chunkWriteService->appendChunks($wrappedGenerator);
|
||||||
|
|
||||||
$total = $existing + $incoming;
|
$total = $existing + $incoming;
|
||||||
|
|
||||||
if ($total >= self::CHUNK_LIMIT_WARN) {
|
if ($warned) {
|
||||||
$this->logger->warning('Chunk count approaching limit.', [
|
$this->logger->warning('Chunk count approaching limit.', [
|
||||||
'existing' => $existing,
|
'existing' => $existing,
|
||||||
'incoming' => $incoming,
|
'incoming' => $incoming,
|
||||||
'total' => $total,
|
'total' => $total,
|
||||||
|
'document' => (string)$documentId,
|
||||||
|
'version' => (string)$version->getId(),
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->rebuildIndex(false);
|
// 4) Vector Rebuild + Runtime Update
|
||||||
|
$this->vectorRebuildService->rebuild();
|
||||||
|
|
||||||
$version->setIngestStatus(DocumentVersion::INGEST_INDEXED);
|
$version->setIngestStatus(DocumentVersion::INGEST_INDEXED);
|
||||||
$this->em->flush();
|
$this->em->flush();
|
||||||
|
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable $e) {
|
||||||
|
|
||||||
$version->setIngestStatus(DocumentVersion::INGEST_FAILED);
|
$version->setIngestStatus(DocumentVersion::INGEST_FAILED);
|
||||||
$this->em->flush();
|
$this->em->flush();
|
||||||
throw $e;
|
throw $e;
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// =========================================================
|
// =========================================================
|
||||||
// GLOBAL REINDEX (STREAMING SAFE)
|
// GLOBAL REINDEX
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
public function globalReindex(): void
|
public function globalReindex(): void
|
||||||
{
|
{
|
||||||
|
$this->withLock(function (): void {
|
||||||
|
|
||||||
|
// Global Reindex ist der Drift-Fix → keine Guardrail-Blockade hier.
|
||||||
|
|
||||||
$activeDocuments = $this->em
|
$activeDocuments = $this->em
|
||||||
->getRepository(Document::class)
|
->getRepository(Document::class)
|
||||||
->createQueryBuilder('d')
|
->createQueryBuilder('d')
|
||||||
@@ -111,46 +123,70 @@ final readonly class IngestFlow
|
|||||||
->getResult();
|
->getResult();
|
||||||
|
|
||||||
if (empty($activeDocuments)) {
|
if (empty($activeDocuments)) {
|
||||||
throw new \RuntimeException(
|
throw new \RuntimeException('Global Reindex aborted: no active documents found.');
|
||||||
'Global Reindex abgebrochen: Es sind keine aktiven Dokumente vorhanden.'
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$existing = 0; // rewriteAll ersetzt alles
|
||||||
$incoming = 0;
|
$incoming = 0;
|
||||||
|
$warned = false;
|
||||||
|
|
||||||
$generator = $this->knowledgeIngestService->buildAllActiveChunkRecords();
|
$generator = $this->knowledgeIngestService->buildAllActiveChunkRecords();
|
||||||
|
|
||||||
$wrappedGenerator = (function () use ($generator, &$incoming) {
|
// 1) "Peek" ohne RAM: erstes Element ziehen
|
||||||
|
$first = null;
|
||||||
|
foreach ($generator as $record) {
|
||||||
|
$first = $record;
|
||||||
|
$incoming++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($first === null) {
|
||||||
|
throw new \RuntimeException('Global Reindex aborted: no chunks generated.');
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2) Stream bauen, Limits prüfen
|
||||||
|
$stream = (function () use ($first, $generator, $existing, &$incoming, &$warned) {
|
||||||
|
// first
|
||||||
|
$total = $existing + $incoming;
|
||||||
|
if (!$warned && $total >= self::CHUNK_LIMIT_WARN) {
|
||||||
|
$warned = true;
|
||||||
|
}
|
||||||
|
if ($total > self::CHUNK_LIMIT_HARD) {
|
||||||
|
throw new \RuntimeException('Chunk limit exceeded.');
|
||||||
|
}
|
||||||
|
yield $first;
|
||||||
|
|
||||||
foreach ($generator as $record) {
|
foreach ($generator as $record) {
|
||||||
$incoming++;
|
$incoming++;
|
||||||
|
$total = $existing + $incoming;
|
||||||
|
|
||||||
|
if (!$warned && $total >= self::CHUNK_LIMIT_WARN) {
|
||||||
|
$warned = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($total > self::CHUNK_LIMIT_HARD) {
|
||||||
|
throw new \RuntimeException('Chunk limit exceeded.');
|
||||||
|
}
|
||||||
|
|
||||||
yield $record;
|
yield $record;
|
||||||
}
|
}
|
||||||
|
|
||||||
})();
|
})();
|
||||||
|
|
||||||
// Prüfen ob überhaupt etwas kommt (ohne alles in RAM zu ziehen)
|
// 3) Rewrite + Rebuild
|
||||||
$peekIterator = $wrappedGenerator instanceof \Iterator
|
$this->chunkWriteService->rewriteAll($stream);
|
||||||
? $wrappedGenerator
|
|
||||||
: (function () use ($wrappedGenerator) {
|
|
||||||
foreach ($wrappedGenerator as $item) {
|
|
||||||
yield $item;
|
|
||||||
}
|
|
||||||
})();
|
|
||||||
|
|
||||||
if (!$peekIterator->valid()) {
|
if ($warned) {
|
||||||
$peekIterator->rewind();
|
$this->logger->warning('Chunk count approaching limit after global reindex.', [
|
||||||
|
'incoming' => $incoming,
|
||||||
|
'total' => $incoming,
|
||||||
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!$peekIterator->valid()) {
|
$this->vectorRebuildService->rebuild();
|
||||||
throw new \RuntimeException(
|
|
||||||
'Global Reindex abgebrochen: Es wurden keine Chunks erzeugt.'
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
$this->chunkManager->rewriteAll($peekIterator);
|
// Governance: Version erhöhen
|
||||||
|
$this->metaManager->writeMetaForGlobalReindex();
|
||||||
$this->rebuildIndex(true);
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// =========================================================
|
// =========================================================
|
||||||
@@ -159,8 +195,11 @@ final readonly class IngestFlow
|
|||||||
|
|
||||||
public function deleteDocument(Uuid $documentId): void
|
public function deleteDocument(Uuid $documentId): void
|
||||||
{
|
{
|
||||||
$this->metaManager->validateAgainstCurrent();
|
$this->withLock(function () use ($documentId): void {
|
||||||
|
|
||||||
|
$this->guardrailValidator->validateOrThrow();
|
||||||
|
|
||||||
|
/** @var Document|null $document */
|
||||||
$document = $this->em
|
$document = $this->em
|
||||||
->getRepository(Document::class)
|
->getRepository(Document::class)
|
||||||
->find($documentId);
|
->find($documentId);
|
||||||
@@ -169,32 +208,41 @@ final readonly class IngestFlow
|
|||||||
throw new \RuntimeException('Document not found.');
|
throw new \RuntimeException('Document not found.');
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->chunkManager->compactByDocument($documentId);
|
// 1) Chunks entfernen
|
||||||
|
$this->chunkWriteService->compactByDocumentId($documentId);
|
||||||
|
|
||||||
|
// 2) FK-sicher löschen: currentVersion lösen (verhindert „Version zeigt noch auf DocumentVersion“)
|
||||||
|
if (method_exists($document, 'getCurrentVersion') && method_exists($document, 'setCurrentVersion')) {
|
||||||
|
if ($document->getCurrentVersion() !== null) {
|
||||||
|
$document->setCurrentVersion(null);
|
||||||
|
$this->em->flush();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3) Dokument entfernen
|
||||||
$this->em->remove($document);
|
$this->em->remove($document);
|
||||||
$this->em->flush();
|
$this->em->flush();
|
||||||
|
|
||||||
$this->rebuildIndex(false);
|
// 4) Vector rebuild + runtime update
|
||||||
|
$this->vectorRebuildService->rebuild();
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// =========================================================
|
// =========================================================
|
||||||
// CENTRAL REBUILD
|
// INTERNALS
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
private function rebuildIndex(bool $isGlobal): void
|
/**
|
||||||
|
* @param callable():void $fn
|
||||||
|
*/
|
||||||
|
private function withLock(callable $fn): void
|
||||||
{
|
{
|
||||||
$this->vectorBuilder->rebuildFromNdjson();
|
$this->lockService->acquire();
|
||||||
|
|
||||||
if ($isGlobal) {
|
try {
|
||||||
$this->metaManager->writeMetaForGlobalReindex();
|
$fn();
|
||||||
}
|
} finally {
|
||||||
|
$this->lockService->release();
|
||||||
$this->updateChunkCount();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
private function updateChunkCount(): void
|
|
||||||
{
|
|
||||||
$chunkCount = $this->chunkManager->countAllChunks();
|
|
||||||
$this->metaManager->updateRuntimeStats($chunkCount);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
51
src/Ingest/IngestLockService.php
Normal file
51
src/Ingest/IngestLockService.php
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Ingest;
|
||||||
|
|
||||||
|
/**
 * File-based lock for ingest operations.
 *
 * Takes a non-blocking exclusive flock() on
 * "<projectDir>/var/knowledge/locks/ingest.lock". The handle is kept open for
 * as long as this instance holds the lock and is released on release() or when
 * the instance is destroyed.
 */
final class IngestLockService
{
    private string $lockFilePath;

    /** @var resource|null open handle on the lock file; non-null only while the lock is held */
    private $handle = null;

    /**
     * @param string $projectDir project root; a trailing "/" is stripped
     */
    public function __construct(string $projectDir)
    {
        $this->lockFilePath = rtrim($projectDir, '/') . '/var/knowledge/locks/ingest.lock';
    }

    /**
     * Acquires the ingest lock without blocking.
     *
     * Calling this while the instance already holds the lock is a no-op.
     *
     * @throws \RuntimeException when the lock directory cannot be created, the
     *                           lock file cannot be opened, or the lock is
     *                           already held through another handle
     */
    public function acquire(): void
    {
        if ($this->handle !== null) {
            // Already held by this instance: opening a second handle would
            // only compete with our own lock.
            return;
        }

        $dir = dirname($this->lockFilePath);

        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            throw new \RuntimeException('Unable to create lock directory.');
        }

        // Work on a local variable so that a failed fopen() never leaves
        // `false` in $this->handle (release() would then pass it to flock()).
        $handle = fopen($this->lockFilePath, 'c');

        if ($handle === false) {
            throw new \RuntimeException('Unable to open ingest lock file.');
        }

        if (!flock($handle, LOCK_EX | LOCK_NB)) {
            // Do not keep a handle around for a lock we never obtained.
            fclose($handle);

            throw new \RuntimeException('Another ingest process is already running.');
        }

        $this->handle = $handle;
    }

    /**
     * Releases the lock if this instance holds it; otherwise does nothing.
     */
    public function release(): void
    {
        $handle = $this->handle;
        $this->handle = null;

        if (!is_resource($handle)) {
            return;
        }

        flock($handle, LOCK_UN);
        fclose($handle);
    }

    /**
     * Ensures the lock is not left held when the instance goes away.
     */
    public function __destruct()
    {
        $this->release();
    }
}
|
||||||
38
src/Ingest/VectorRebuildService.php
Normal file
38
src/Ingest/VectorRebuildService.php
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Ingest;
|
||||||
|
|
||||||
|
use App\Index\IndexMetaManager;
|
||||||
|
use App\Knowledge\ChunkManager;
|
||||||
|
use App\Vector\VectorIndexBuilder;
|
||||||
|
|
||||||
|
/**
 * Combines the vector index rebuild with the follow-up runtime statistics
 * update.
 */
final readonly class VectorRebuildService
{
    public function __construct(
        private VectorIndexBuilder $vectorBuilder,
        private IndexMetaManager $metaManager,
        private ChunkManager $chunkManager,
    ) {
    }

    /**
     * Runs a full rebuild of the vector index and refreshes the runtime stats.
     *
     * Steps:
     *  1. Rebuild the vector index from the NDJSON index.
     *  2. Count the chunks via ChunkManager.
     *  3. Store that count through IndexMetaManager::updateRuntimeStats().
     *
     * @param string|null $logPath optional log file path, forwarded unchanged
     *                             to the vector builder
     */
    public function rebuild(?string $logPath = null): void
    {
        // Step 1: rebuild first, so the count below is taken afterwards.
        $this->vectorBuilder->rebuildFromNdjson($logPath);

        // Steps 2 + 3: count the chunks and persist the figure.
        $this->metaManager->updateRuntimeStats(
            $this->chunkManager->countAllChunks()
        );
    }
}
|
||||||
@@ -13,7 +13,6 @@ final class VectorIndexBuilder
|
|||||||
private string $pythonBin;
|
private string $pythonBin;
|
||||||
private string $scriptPath;
|
private string $scriptPath;
|
||||||
private string $indexNdjsonPath;
|
private string $indexNdjsonPath;
|
||||||
private string $indexMetaPath;
|
|
||||||
private string $vectorIndexPath;
|
private string $vectorIndexPath;
|
||||||
private string $vectorMetaPath;
|
private string $vectorMetaPath;
|
||||||
private int $timeoutSeconds;
|
private int $timeoutSeconds;
|
||||||
@@ -24,7 +23,6 @@ final class VectorIndexBuilder
|
|||||||
string $pythonBin,
|
string $pythonBin,
|
||||||
string $scriptPath,
|
string $scriptPath,
|
||||||
string $indexNdjsonPath,
|
string $indexNdjsonPath,
|
||||||
string $indexMetaPath,
|
|
||||||
string $vectorIndexPath,
|
string $vectorIndexPath,
|
||||||
int $timeoutSeconds,
|
int $timeoutSeconds,
|
||||||
IndexConfigurationProvider $configurationProvider
|
IndexConfigurationProvider $configurationProvider
|
||||||
@@ -32,54 +30,30 @@ final class VectorIndexBuilder
|
|||||||
$this->pythonBin = $pythonBin;
|
$this->pythonBin = $pythonBin;
|
||||||
$this->scriptPath = $scriptPath;
|
$this->scriptPath = $scriptPath;
|
||||||
$this->indexNdjsonPath = $indexNdjsonPath;
|
$this->indexNdjsonPath = $indexNdjsonPath;
|
||||||
$this->indexMetaPath = $indexMetaPath;
|
|
||||||
$this->vectorIndexPath = $vectorIndexPath;
|
$this->vectorIndexPath = $vectorIndexPath;
|
||||||
$this->vectorMetaPath = $vectorIndexPath . '.meta.json';
|
$this->vectorMetaPath = $vectorIndexPath . '.meta.json';
|
||||||
$this->timeoutSeconds = $timeoutSeconds;
|
$this->timeoutSeconds = $timeoutSeconds;
|
||||||
$this->configurationProvider = $configurationProvider;
|
$this->configurationProvider = $configurationProvider;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Rebuild FAISS Index deterministisch aus index.ndjson.
|
|
||||||
*/
|
|
||||||
public function rebuildFromNdjson(?string $logPath = null): void
|
public function rebuildFromNdjson(?string $logPath = null): void
|
||||||
{
|
{
|
||||||
$this->assertPreconditions();
|
$this->assertPreconditions();
|
||||||
|
|
||||||
// --------------------------------------------
|
|
||||||
// 🔵 FALL: NDJSON ist leer → kein Vector Index
|
|
||||||
// --------------------------------------------
|
|
||||||
if (!is_file($this->indexNdjsonPath) || filesize($this->indexNdjsonPath) === 0) {
|
if (!is_file($this->indexNdjsonPath) || filesize($this->indexNdjsonPath) === 0) {
|
||||||
|
|
||||||
@unlink($this->vectorIndexPath);
|
@unlink($this->vectorIndexPath);
|
||||||
@unlink($this->vectorMetaPath);
|
@unlink($this->vectorMetaPath);
|
||||||
|
|
||||||
if ($logPath !== null) {
|
|
||||||
@file_put_contents(
|
|
||||||
$logPath,
|
|
||||||
"NDJSON empty → Vector index removed\n",
|
|
||||||
FILE_APPEND
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------
|
$config = $this->configurationProvider->getConfiguration();
|
||||||
// 🟢 FALL: NDJSON enthält Chunks
|
$embeddingModel = $config->getEmbeddingModel();
|
||||||
// --------------------------------------------
|
|
||||||
|
|
||||||
if (!is_file($this->indexMetaPath)) {
|
|
||||||
$this->initializeIndexMeta();
|
|
||||||
}
|
|
||||||
|
|
||||||
$indexMeta = $this->readIndexMeta();
|
|
||||||
$embeddingModel = $indexMeta['embedding_model'];
|
|
||||||
|
|
||||||
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
|
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
|
||||||
|
$tmpVectorMetaPath = $tmpVectorIndexPath . '.meta.json';
|
||||||
|
|
||||||
@unlink($tmpVectorIndexPath);
|
@unlink($tmpVectorIndexPath);
|
||||||
@unlink($this->vectorMetaPath);
|
@unlink($tmpVectorMetaPath);
|
||||||
|
|
||||||
$cmd = [
|
$cmd = [
|
||||||
$this->pythonBin,
|
$this->pythonBin,
|
||||||
@@ -94,107 +68,51 @@ final class VectorIndexBuilder
|
|||||||
|
|
||||||
$this->runProcess($process, $logPath);
|
$this->runProcess($process, $logPath);
|
||||||
|
|
||||||
$this->validatePythonOutputs($tmpVectorIndexPath);
|
$this->validateOutputs($tmpVectorIndexPath, $tmpVectorMetaPath);
|
||||||
|
|
||||||
$this->atomicSwitch($tmpVectorIndexPath);
|
$this->atomicSwitchPair(
|
||||||
|
$tmpVectorIndexPath,
|
||||||
|
$tmpVectorMetaPath
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// -----------------------------------------------------
|
|
||||||
// Internals
|
|
||||||
// -----------------------------------------------------
|
|
||||||
|
|
||||||
private function assertPreconditions(): void
|
private function assertPreconditions(): void
|
||||||
{
|
{
|
||||||
if (!is_file($this->scriptPath)) {
|
if (!is_file($this->scriptPath)) {
|
||||||
throw new \RuntimeException(
|
throw new \RuntimeException('Vector build script not found.');
|
||||||
'Vector build script not found at: ' . $this->scriptPath
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!is_file($this->indexNdjsonPath)) {
|
if (!is_file($this->indexNdjsonPath)) {
|
||||||
throw new \RuntimeException(
|
throw new \RuntimeException('index.ndjson not found.');
|
||||||
'index.ndjson not found at: ' . $this->indexNdjsonPath
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private function readIndexMeta(): array
|
private function validateOutputs(string $tmpIndex, string $tmpMeta): void
|
||||||
{
|
{
|
||||||
$meta = json_decode(
|
if (!is_file($tmpIndex) || filesize($tmpIndex) === 0) {
|
||||||
(string) file_get_contents($this->indexMetaPath),
|
|
||||||
true
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!is_array($meta) || empty($meta['embedding_model'])) {
|
|
||||||
throw new \RuntimeException('Invalid index_meta.json');
|
|
||||||
}
|
|
||||||
|
|
||||||
return $meta;
|
|
||||||
}
|
|
||||||
|
|
||||||
private function initializeIndexMeta(): void
|
|
||||||
{
|
|
||||||
$dir = dirname($this->indexMetaPath);
|
|
||||||
|
|
||||||
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
|
||||||
throw new \RuntimeException('Cannot create knowledge directory');
|
|
||||||
}
|
|
||||||
|
|
||||||
$config = $this->configurationProvider->getConfiguration();
|
|
||||||
|
|
||||||
$data = [
|
|
||||||
'index_version' => 1,
|
|
||||||
'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
|
|
||||||
'embedding_model' => $config->getEmbeddingModel(),
|
|
||||||
'embedding_dimension' => $config->getEmbeddingDimension(),
|
|
||||||
'chunk_size' => $config->getChunkSize(),
|
|
||||||
'chunk_overlap' => $config->getChunkOverlap(),
|
|
||||||
'scoring_version' => $config->getScoringVersion(),
|
|
||||||
'index_format' => 'ndjson',
|
|
||||||
'vector_backend' => 'faiss',
|
|
||||||
];
|
|
||||||
|
|
||||||
file_put_contents(
|
|
||||||
$this->indexMetaPath,
|
|
||||||
json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
private function validatePythonOutputs(string $tmpVectorIndexPath): void
|
|
||||||
{
|
|
||||||
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
|
|
||||||
throw new \RuntimeException('Vector index tmp missing or empty');
|
throw new \RuntimeException('Vector index tmp missing or empty');
|
||||||
}
|
}
|
||||||
|
if (!is_file($tmpMeta) || filesize($tmpMeta) === 0) {
|
||||||
if (!is_file($this->vectorMetaPath) || filesize($this->vectorMetaPath) === 0) {
|
throw new \RuntimeException('Vector meta tmp missing or empty');
|
||||||
throw new \RuntimeException('Vector meta missing or empty');
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private function atomicSwitch(string $tmpVectorIndexPath): void
|
private function atomicSwitchPair(string $tmpIndex, string $tmpMeta): void
|
||||||
{
|
{
|
||||||
if (!rename($tmpVectorIndexPath, $this->vectorIndexPath)) {
|
if (!rename($tmpIndex, $this->vectorIndexPath)) {
|
||||||
throw new \RuntimeException('Atomic switch failed for vector index');
|
throw new \RuntimeException('Atomic switch failed for vector index');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!rename($tmpMeta, $this->vectorMetaPath)) {
|
||||||
|
throw new \RuntimeException('Atomic switch failed for vector meta');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private function runProcess(Process $process, ?string $logPath): void
|
private function runProcess(Process $process, ?string $logPath): void
|
||||||
{
|
{
|
||||||
if ($logPath !== null) {
|
|
||||||
@file_put_contents($logPath, "=== VectorIndexBuilder START ===\n", FILE_APPEND);
|
|
||||||
}
|
|
||||||
|
|
||||||
$process->run();
|
$process->run();
|
||||||
|
|
||||||
if (!$process->isSuccessful()) {
|
if (!$process->isSuccessful()) {
|
||||||
if ($logPath !== null) {
|
|
||||||
@file_put_contents($logPath, $process->getErrorOutput(), FILE_APPEND);
|
|
||||||
}
|
|
||||||
throw new ProcessFailedException($process);
|
throw new ProcessFailedException($process);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($logPath !== null) {
|
|
||||||
@file_put_contents($logPath, "=== VectorIndexBuilder OK ===\n", FILE_APPEND);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user