stash light
This commit is contained in:
108
src/Command/CreateUserCommand.php
Normal file
108
src/Command/CreateUserCommand.php
Normal file
@@ -0,0 +1,108 @@
|
||||
<?php
|
||||
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use App\Entity\User;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
use Symfony\Component\Console\Attribute\AsCommand;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Helper\QuestionHelper;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use Symfony\Component\Console\Question\ChoiceQuestion;
|
||||
use Symfony\Component\Console\Question\Question;
|
||||
use Symfony\Component\PasswordHasher\Hasher\UserPasswordHasherInterface;
|
||||
|
||||
#[AsCommand(
|
||||
name: 'mto:agent:user:create',
|
||||
description: 'Creates a new admin user'
|
||||
)]
|
||||
class CreateUserCommand extends Command
|
||||
{
|
||||
public function __construct(
|
||||
private EntityManagerInterface $em,
|
||||
private UserPasswordHasherInterface $passwordHasher
|
||||
)
|
||||
{
|
||||
parent::__construct();
|
||||
}
|
||||
|
||||
protected function execute(InputInterface $input, OutputInterface $output): int
|
||||
{
|
||||
/** @var QuestionHelper $helper */
|
||||
$helper = $this->getHelper('question');
|
||||
|
||||
// =============================
|
||||
// Email
|
||||
// =============================
|
||||
$emailQuestion = new Question('E-Mail: ');
|
||||
$emailQuestion->setValidator(function ($value) {
|
||||
if (!filter_var($value, FILTER_VALIDATE_EMAIL)) {
|
||||
throw new \RuntimeException('Invalid email address.');
|
||||
}
|
||||
return strtolower(trim($value));
|
||||
});
|
||||
|
||||
$email = $helper->ask($input, $output, $emailQuestion);
|
||||
|
||||
// Prüfen ob User existiert
|
||||
$existingUser = $this->em
|
||||
->getRepository(User::class)
|
||||
->findOneBy(['email' => $email]);
|
||||
|
||||
if ($existingUser) {
|
||||
$output->writeln('<error>User already exists.</error>');
|
||||
return Command::FAILURE;
|
||||
}
|
||||
|
||||
// =============================
|
||||
// Passwort
|
||||
// =============================
|
||||
$passwordQuestion = new Question('Password: ');
|
||||
$passwordQuestion->setHidden(true);
|
||||
$passwordQuestion->setHiddenFallback(false);
|
||||
|
||||
$plainPassword = $helper->ask($input, $output, $passwordQuestion);
|
||||
|
||||
if (strlen($plainPassword) < 8) {
|
||||
$output->writeln('<error>Password must be at least 8 characters.</error>');
|
||||
return Command::FAILURE;
|
||||
}
|
||||
|
||||
// =============================
|
||||
// Rolle auswählen
|
||||
// =============================
|
||||
$roleQuestion = new ChoiceQuestion(
|
||||
'Select role:',
|
||||
[
|
||||
'ROLE_SUPER_ADMIN',
|
||||
'ROLE_KNOWLEDGE_ADMIN',
|
||||
'ROLE_EDITOR',
|
||||
'ROLE_USER',
|
||||
],
|
||||
0
|
||||
);
|
||||
|
||||
$role = $helper->ask($input, $output, $roleQuestion);
|
||||
|
||||
// =============================
|
||||
// User erzeugen
|
||||
// =============================
|
||||
$user = new User();
|
||||
$user->setEmail($email);
|
||||
$user->setRoles([$role]);
|
||||
|
||||
$hashedPassword = $this->passwordHasher->hashPassword($user, $plainPassword);
|
||||
$user->setPassword($hashedPassword);
|
||||
|
||||
$this->em->persist($user);
|
||||
$this->em->flush();
|
||||
|
||||
$output->writeln('<info>User created successfully.</info>');
|
||||
$output->writeln('Email: ' . $email);
|
||||
$output->writeln('Role: ' . $role);
|
||||
|
||||
return Command::SUCCESS;
|
||||
}
|
||||
}
|
||||
17
src/Controller/Admin/DashboardController.php
Normal file
17
src/Controller/Admin/DashboardController.php
Normal file
@@ -0,0 +1,17 @@
|
||||
<?php
|
||||
|
||||
|
||||
namespace App\Controller\Admin;
|
||||
|
||||
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
|
||||
use Symfony\Component\HttpFoundation\Response;
|
||||
use Symfony\Component\Routing\Attribute\Route;
|
||||
|
||||
final class DashboardController extends AbstractController
|
||||
{
|
||||
#[Route('/admin', name: 'admin_dashboard')]
|
||||
public function index(): Response
|
||||
{
|
||||
return $this->render('admin/dashboard/index.html.twig');
|
||||
}
|
||||
}
|
||||
217
src/Controller/Admin/DocumentController.php
Normal file
217
src/Controller/Admin/DocumentController.php
Normal file
@@ -0,0 +1,217 @@
|
||||
<?php
|
||||
|
||||
|
||||
namespace App\Controller\Admin;
|
||||
|
||||
use App\Entity\Document;
|
||||
use App\Entity\DocumentVersion;
|
||||
use App\Service\DocumentService;
|
||||
use App\Service\IngestOrchestrator;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
|
||||
use Symfony\Component\HttpFoundation\RedirectResponse;
|
||||
use Symfony\Component\HttpFoundation\Response;
|
||||
use Symfony\Component\Routing\Attribute\Route;
|
||||
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
use Symfony\Component\HttpFoundation\Request;
|
||||
use Symfony\Component\HttpFoundation\File\Exception\FileException;
|
||||
|
||||
|
||||
#[Route('/admin/documents')]
|
||||
class DocumentController extends AbstractController
|
||||
{
|
||||
#[Route('', name: 'admin_documents')]
|
||||
public function index(EntityManagerInterface $em): Response
|
||||
{
|
||||
$documents = $em->getRepository(Document::class)
|
||||
->findBy([], ['createdAt' => 'DESC']);
|
||||
|
||||
return $this->render('admin/document/index.html.twig', [
|
||||
'documents' => $documents
|
||||
]);
|
||||
}
|
||||
|
||||
#[Route(
|
||||
'/{id}',
|
||||
name: 'admin_document_show',
|
||||
requirements: ['id' => '[0-9a-fA-F\-]{36}']
|
||||
)]
|
||||
public function show(string $id, EntityManagerInterface $em): Response
|
||||
{
|
||||
try {
|
||||
$uuid = Uuid::fromString($id);
|
||||
} catch (\Exception $e) {
|
||||
throw new NotFoundHttpException();
|
||||
}
|
||||
|
||||
$document = $em->getRepository(Document::class)->find($uuid);
|
||||
|
||||
if (!$document) {
|
||||
throw new NotFoundHttpException();
|
||||
}
|
||||
|
||||
return $this->render('admin/document/show.html.twig', [
|
||||
'document' => $document
|
||||
]);
|
||||
}
|
||||
|
||||
#[Route('/new', name: 'admin_document_new')]
|
||||
public function new(Request $request, DocumentService $documentService): Response
|
||||
{
|
||||
if ($request->isMethod('POST')) {
|
||||
|
||||
$title = $request->request->get('title');
|
||||
$file = $request->files->get('file');
|
||||
|
||||
if (!$file || !$title) {
|
||||
$this->addFlash('error', 'Titel und Datei sind erforderlich.');
|
||||
return $this->redirectToRoute('admin_document_new');
|
||||
}
|
||||
|
||||
$uploadDir = $this->getParameter('kernel.project_dir') . '/var/knowledge/uploads';
|
||||
|
||||
if (!is_dir($uploadDir)) {
|
||||
mkdir($uploadDir, 0777, true);
|
||||
}
|
||||
|
||||
$newFilename = uniqid() . '_' . $file->getClientOriginalName();
|
||||
|
||||
try {
|
||||
$file->move($uploadDir, $newFilename);
|
||||
} catch (FileException $e) {
|
||||
throw new \RuntimeException('File upload failed.');
|
||||
}
|
||||
|
||||
$filePath = $uploadDir . '/' . $newFilename;
|
||||
|
||||
$documentService->createDocument(
|
||||
$title,
|
||||
$filePath,
|
||||
$this->getUser()
|
||||
);
|
||||
|
||||
return $this->redirectToRoute('admin_documents');
|
||||
}
|
||||
|
||||
return $this->render('admin/document/new.html.twig');
|
||||
}
|
||||
|
||||
#[Route('/{id}/version/new', name: 'admin_document_version_new', requirements: ['id' => '[0-9a-fA-F\-]{36}'])]
|
||||
public function newVersion(
|
||||
string $id,
|
||||
Request $request,
|
||||
EntityManagerInterface $em,
|
||||
DocumentService $documentService
|
||||
): Response {
|
||||
|
||||
$document = $em->getRepository(Document::class)->find($id);
|
||||
|
||||
if (!$document) {
|
||||
throw $this->createNotFoundException();
|
||||
}
|
||||
|
||||
if ($request->isMethod('POST')) {
|
||||
|
||||
$file = $request->files->get('file');
|
||||
|
||||
if (!$file) {
|
||||
$this->addFlash('error', 'Datei ist erforderlich.');
|
||||
return $this->redirectToRoute('admin_document_version_new', ['id' => $id]);
|
||||
}
|
||||
|
||||
$uploadDir = $this->getParameter('kernel.project_dir') . '/var/knowledge/uploads';
|
||||
|
||||
if (!is_dir($uploadDir)) {
|
||||
mkdir($uploadDir, 0777, true);
|
||||
}
|
||||
|
||||
$newFilename = uniqid() . '_' . $file->getClientOriginalName();
|
||||
|
||||
try {
|
||||
$file->move($uploadDir, $newFilename);
|
||||
} catch (FileException $e) {
|
||||
throw new \RuntimeException('File upload failed.');
|
||||
}
|
||||
|
||||
$filePath = $uploadDir . '/' . $newFilename;
|
||||
|
||||
$documentService->addVersion(
|
||||
$document,
|
||||
$filePath,
|
||||
$this->getUser()
|
||||
);
|
||||
|
||||
return $this->redirectToRoute('admin_document_show', ['id' => $id]);
|
||||
}
|
||||
|
||||
return $this->render('admin/document/new_version.html.twig', [
|
||||
'document' => $document
|
||||
]);
|
||||
}
|
||||
|
||||
#[Route(
|
||||
'/version/{versionId}/activate',
|
||||
name: 'admin_document_version_activate',
|
||||
requirements: ['versionId' => '[0-9a-fA-F\-]{36}'],
|
||||
methods: ['POST']
|
||||
)]
|
||||
public function activateVersion(
|
||||
string $versionId,
|
||||
Request $request,
|
||||
EntityManagerInterface $em,
|
||||
DocumentService $documentService
|
||||
): RedirectResponse {
|
||||
|
||||
if (!$this->isCsrfTokenValid('activate_version', $request->request->get('_token'))) {
|
||||
throw $this->createAccessDeniedException();
|
||||
}
|
||||
|
||||
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
|
||||
|
||||
if (!$version) {
|
||||
throw $this->createNotFoundException();
|
||||
}
|
||||
|
||||
$documentService->activateVersion($version);
|
||||
|
||||
return $this->redirectToRoute('admin_document_show', [
|
||||
'id' => $version->getDocument()->getId()
|
||||
]);
|
||||
}
|
||||
|
||||
#[Route(
|
||||
'/version/{versionId}/ingest',
|
||||
name: 'admin_document_version_ingest',
|
||||
methods: ['POST'],
|
||||
requirements: ['versionId' => '[0-9a-fA-F\-]{36}']
|
||||
)]
|
||||
public function ingestVersion(
|
||||
string $versionId,
|
||||
Request $request,
|
||||
EntityManagerInterface $em,
|
||||
IngestOrchestrator $orchestrator
|
||||
): RedirectResponse {
|
||||
|
||||
if (!$this->isCsrfTokenValid('ingest_version', $request->request->get('_token'))) {
|
||||
throw $this->createAccessDeniedException();
|
||||
}
|
||||
|
||||
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
|
||||
|
||||
if (!$version) {
|
||||
throw $this->createNotFoundException();
|
||||
}
|
||||
|
||||
$orchestrator->runForVersion(
|
||||
$version,
|
||||
$this->getUser(),
|
||||
true // erstmal DryRun
|
||||
);
|
||||
|
||||
return $this->redirectToRoute('admin_document_show', [
|
||||
'id' => $version->getDocument()->getId()
|
||||
]);
|
||||
}
|
||||
|
||||
}
|
||||
57
src/Controller/Admin/IngestJobController.php
Normal file
57
src/Controller/Admin/IngestJobController.php
Normal file
@@ -0,0 +1,57 @@
|
||||
<?php
|
||||
|
||||
|
||||
namespace App\Controller\Admin;
|
||||
|
||||
use App\Entity\IngestJob;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
|
||||
use Symfony\Component\HttpFoundation\Response;
|
||||
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
|
||||
use Symfony\Component\Routing\Attribute\Route;
|
||||
use App\Ingest\IngestFlow;
|
||||
use Symfony\Component\HttpFoundation\RedirectResponse;
|
||||
|
||||
#[Route('/admin/jobs')]
|
||||
class IngestJobController extends AbstractController
|
||||
{
|
||||
#[Route('', name: 'admin_jobs')]
|
||||
public function index(EntityManagerInterface $em): Response
|
||||
{
|
||||
$jobs = $em->getRepository(IngestJob::class)
|
||||
->findBy([], ['startedAt' => 'DESC']);
|
||||
|
||||
return $this->render('admin/job/index.html.twig', [
|
||||
'jobs' => $jobs
|
||||
]);
|
||||
}
|
||||
|
||||
#[Route(
|
||||
'/{id}',
|
||||
name: 'admin_job_show',
|
||||
requirements: ['id' => '[0-9a-fA-F\-]{36}']
|
||||
)]
|
||||
public function show(string $id, EntityManagerInterface $em): Response
|
||||
{
|
||||
$job = $em->getRepository(IngestJob::class)->find($id);
|
||||
|
||||
if (!$job) {
|
||||
throw new NotFoundHttpException();
|
||||
}
|
||||
|
||||
return $this->render('admin/job/show.html.twig', [
|
||||
'job' => $job
|
||||
]);
|
||||
}
|
||||
|
||||
#[Route('/global-reindex', name: 'admin_global_reindex', methods: ['POST'])]
|
||||
public function globalReindex(
|
||||
IngestFlow $flow
|
||||
): RedirectResponse {
|
||||
$this->denyAccessUnlessGranted('ROLE_SUPER_ADMIN');
|
||||
|
||||
$flow->globalReindex($this->getUser());
|
||||
|
||||
return $this->redirectToRoute('admin_jobs');
|
||||
}
|
||||
}
|
||||
33
src/Controller/Admin/SecurityController.php
Normal file
33
src/Controller/Admin/SecurityController.php
Normal file
@@ -0,0 +1,33 @@
|
||||
<?php
|
||||
|
||||
|
||||
namespace App\Controller\Admin;
|
||||
|
||||
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
|
||||
use Symfony\Component\HttpFoundation\Response;
|
||||
use Symfony\Component\Routing\Attribute\Route;
|
||||
use Symfony\Component\Security\Http\Authentication\AuthenticationUtils;
|
||||
|
||||
final class SecurityController extends AbstractController
|
||||
{
|
||||
#[Route('/admin/login', name: 'admin_login')]
|
||||
public function login(AuthenticationUtils $authUtils): Response
|
||||
{
|
||||
// Wenn bereits eingeloggt → direkt ins Dashboard
|
||||
if ($this->getUser()) {
|
||||
return $this->redirectToRoute('admin_dashboard');
|
||||
}
|
||||
|
||||
return $this->render('admin/security/login.html.twig', [
|
||||
'last_username' => $authUtils->getLastUsername(),
|
||||
'error' => $authUtils->getLastAuthenticationError(),
|
||||
]);
|
||||
}
|
||||
|
||||
#[Route('/admin/logout', name: 'admin_logout')]
|
||||
public function logout(): void
|
||||
{
|
||||
// Symfony interceptet diese Route, daher bleibt sie leer.
|
||||
throw new \LogicException('This method can be blank - it will be intercepted by the logout key on your firewall.');
|
||||
}
|
||||
}
|
||||
110
src/Entity/Document.php
Normal file
110
src/Entity/Document.php
Normal file
@@ -0,0 +1,110 @@
|
||||
<?php
|
||||
|
||||
|
||||
namespace App\Entity;
|
||||
|
||||
use Doctrine\ORM\Mapping as ORM;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
use Doctrine\Common\Collections\ArrayCollection;
|
||||
use Doctrine\Common\Collections\Collection;
|
||||
|
||||
#[ORM\Entity]
|
||||
class Document
|
||||
{
|
||||
public const STATUS_ACTIVE = 'ACTIVE';
|
||||
public const STATUS_ARCHIVED = 'ARCHIVED';
|
||||
|
||||
#[ORM\Id]
|
||||
#[ORM\Column(type: 'uuid', unique: true)]
|
||||
private Uuid $id;
|
||||
|
||||
#[ORM\Column(length: 255)]
|
||||
private string $title;
|
||||
|
||||
#[ORM\Column(length: 20)]
|
||||
private string $status = self::STATUS_ACTIVE;
|
||||
|
||||
#[ORM\ManyToOne]
|
||||
#[ORM\JoinColumn(nullable: false)]
|
||||
private User $createdBy;
|
||||
|
||||
#[ORM\Column]
|
||||
private \DateTimeImmutable $createdAt;
|
||||
|
||||
#[ORM\OneToMany(mappedBy: 'document', targetEntity: DocumentVersion::class, cascade: ['persist'], orphanRemoval: true)]
|
||||
private Collection $versions;
|
||||
|
||||
#[ORM\ManyToOne]
|
||||
private ?DocumentVersion $currentVersion = null;
|
||||
|
||||
public function __construct()
|
||||
{
|
||||
$this->id = Uuid::v4();
|
||||
$this->createdAt = new \DateTimeImmutable();
|
||||
$this->versions = new ArrayCollection();
|
||||
}
|
||||
|
||||
public function getId(): Uuid
|
||||
{
|
||||
return $this->id;
|
||||
}
|
||||
|
||||
public function getCreatedAt(): \DateTimeImmutable
|
||||
{
|
||||
return $this->createdAt;
|
||||
}
|
||||
|
||||
public function getTitle(): string
|
||||
{
|
||||
return $this->title;
|
||||
}
|
||||
|
||||
public function setTitle(string $title): static
|
||||
{
|
||||
$this->title = $title;
|
||||
return $this;
|
||||
}
|
||||
|
||||
|
||||
public function getStatus(): string
|
||||
{
|
||||
return $this->status;
|
||||
}
|
||||
|
||||
public function archive(): void
|
||||
{
|
||||
$this->status = self::STATUS_ARCHIVED;
|
||||
}
|
||||
|
||||
public function getCreatedBy(): User
|
||||
{
|
||||
return $this->createdBy;
|
||||
}
|
||||
|
||||
public function setCreatedBy(User $user): static
|
||||
{
|
||||
$this->createdBy = $user;
|
||||
return $this;
|
||||
}
|
||||
|
||||
public function getVersions(): Collection
|
||||
{
|
||||
return $this->versions;
|
||||
}
|
||||
|
||||
public function addVersion(DocumentVersion $version): void
|
||||
{
|
||||
$this->versions->add($version);
|
||||
$version->setDocument($this);
|
||||
}
|
||||
|
||||
public function setCurrentVersion(?DocumentVersion $version): void
|
||||
{
|
||||
$this->currentVersion = $version;
|
||||
}
|
||||
|
||||
public function getCurrentVersion(): ?DocumentVersion
|
||||
{
|
||||
return $this->currentVersion;
|
||||
}
|
||||
}
|
||||
184
src/Entity/DocumentVersion.php
Normal file
184
src/Entity/DocumentVersion.php
Normal file
@@ -0,0 +1,184 @@
|
||||
<?php
|
||||
|
||||
namespace App\Entity;
|
||||
|
||||
use Doctrine\ORM\Mapping as ORM;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
use App\Repository\DocumentVersionRepository;
|
||||
|
||||
#[ORM\Entity(repositoryClass: DocumentVersionRepository::class)]
|
||||
class DocumentVersion
|
||||
{
|
||||
public const INGEST_PENDING = 'PENDING';
|
||||
public const INGEST_RUNNING = 'RUNNING';
|
||||
public const INGEST_INDEXED = 'INDEXED';
|
||||
public const INGEST_FAILED = 'FAILED';
|
||||
|
||||
public const INGEST_STATUSES = [
|
||||
self::INGEST_PENDING,
|
||||
self::INGEST_RUNNING,
|
||||
self::INGEST_INDEXED,
|
||||
self::INGEST_FAILED,
|
||||
];
|
||||
|
||||
#[ORM\Id]
|
||||
#[ORM\Column(type: 'uuid', unique: true)]
|
||||
private Uuid $id;
|
||||
|
||||
#[ORM\ManyToOne(inversedBy: 'versions')]
|
||||
#[ORM\JoinColumn(nullable: false)]
|
||||
private Document $document;
|
||||
|
||||
#[ORM\Column]
|
||||
private int $versionNumber;
|
||||
|
||||
#[ORM\Column(length: 255)]
|
||||
private string $filePath;
|
||||
|
||||
#[ORM\Column(length: 64)]
|
||||
private string $checksum;
|
||||
|
||||
#[ORM\Column(length: 20)]
|
||||
private string $ingestStatus = self::INGEST_PENDING;
|
||||
|
||||
#[ORM\ManyToOne]
|
||||
#[ORM\JoinColumn(nullable: false)]
|
||||
private User $createdBy;
|
||||
|
||||
#[ORM\Column]
|
||||
private \DateTimeImmutable $createdAt;
|
||||
|
||||
#[ORM\Column]
|
||||
private bool $isActive = false;
|
||||
|
||||
public function __construct()
|
||||
{
|
||||
$this->id = Uuid::v4();
|
||||
$this->createdAt = new \DateTimeImmutable();
|
||||
}
|
||||
|
||||
// =========================
|
||||
// ID
|
||||
// =========================
|
||||
|
||||
public function getId(): Uuid
|
||||
{
|
||||
return $this->id;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Document Relation
|
||||
// =========================
|
||||
|
||||
public function setDocument(Document $document): void
|
||||
{
|
||||
$this->document = $document;
|
||||
}
|
||||
|
||||
public function getDocument(): Document
|
||||
{
|
||||
return $this->document;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Version Number
|
||||
// =========================
|
||||
|
||||
public function getVersionNumber(): int
|
||||
{
|
||||
return $this->versionNumber;
|
||||
}
|
||||
|
||||
public function setVersionNumber(int $number): void
|
||||
{
|
||||
$this->versionNumber = $number;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// File Path
|
||||
// =========================
|
||||
|
||||
public function setFilePath(string $path): void
|
||||
{
|
||||
$this->filePath = $path;
|
||||
}
|
||||
|
||||
public function getFilePath(): string
|
||||
{
|
||||
return $this->filePath;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Checksum
|
||||
// =========================
|
||||
|
||||
public function setChecksum(string $checksum): void
|
||||
{
|
||||
$this->checksum = $checksum;
|
||||
}
|
||||
|
||||
public function getChecksum(): string
|
||||
{
|
||||
return $this->checksum;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Ingest Status
|
||||
// =========================
|
||||
|
||||
public function setIngestStatus(string $status): void
|
||||
{
|
||||
if (!in_array($status, self::INGEST_STATUSES, true)) {
|
||||
throw new \InvalidArgumentException('Invalid ingest status.');
|
||||
}
|
||||
|
||||
$this->ingestStatus = $status;
|
||||
}
|
||||
|
||||
public function getIngestStatus(): string
|
||||
{
|
||||
return $this->ingestStatus;
|
||||
}
|
||||
|
||||
public function isIndexed(): bool
|
||||
{
|
||||
return $this->ingestStatus === self::INGEST_INDEXED;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Created By
|
||||
// =========================
|
||||
|
||||
public function setCreatedBy(User $user): void
|
||||
{
|
||||
$this->createdBy = $user;
|
||||
}
|
||||
|
||||
public function getCreatedBy(): User
|
||||
{
|
||||
return $this->createdBy;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Created At
|
||||
// =========================
|
||||
|
||||
public function getCreatedAt(): \DateTimeImmutable
|
||||
{
|
||||
return $this->createdAt;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Active Flag
|
||||
// =========================
|
||||
|
||||
public function setActive(bool $active): void
|
||||
{
|
||||
$this->isActive = $active;
|
||||
}
|
||||
|
||||
public function isActive(): bool
|
||||
{
|
||||
return $this->isActive;
|
||||
}
|
||||
}
|
||||
101
src/Entity/IngestJob.php
Normal file
101
src/Entity/IngestJob.php
Normal file
@@ -0,0 +1,101 @@
|
||||
<?php
|
||||
|
||||
namespace App\Entity;
|
||||
|
||||
use Doctrine\ORM\Mapping as ORM;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
#[ORM\Entity]
|
||||
class IngestJob
|
||||
{
|
||||
public const TYPE_DOCUMENT = 'DOCUMENT';
|
||||
public const TYPE_GLOBAL_REINDEX = 'GLOBAL_REINDEX';
|
||||
|
||||
public const STATUS_RUNNING = 'RUNNING';
|
||||
public const STATUS_COMPLETED = 'COMPLETED';
|
||||
public const STATUS_FAILED = 'FAILED';
|
||||
public const STATUS_ABORTED = 'ABORTED';
|
||||
|
||||
#[ORM\Id]
|
||||
#[ORM\Column(type: 'uuid', unique: true)]
|
||||
private Uuid $id;
|
||||
|
||||
#[ORM\Column(length: 30)]
|
||||
private string $type;
|
||||
|
||||
#[ORM\Column(length: 20)]
|
||||
private string $status = self::STATUS_RUNNING;
|
||||
|
||||
#[ORM\Column(type: 'uuid', nullable: true)]
|
||||
private ?Uuid $documentId = null;
|
||||
|
||||
#[ORM\Column(type: 'uuid', nullable: true)]
|
||||
private ?Uuid $documentVersionId = null;
|
||||
|
||||
#[ORM\Column]
|
||||
private \DateTimeImmutable $startedAt;
|
||||
|
||||
#[ORM\Column(nullable: true)]
|
||||
private ?\DateTimeImmutable $finishedAt = null;
|
||||
|
||||
#[ORM\ManyToOne]
|
||||
#[ORM\JoinColumn(nullable: true)]
|
||||
private ?User $startedBy = null;
|
||||
|
||||
#[ORM\Column(nullable: true)]
|
||||
private ?string $logPath = null;
|
||||
|
||||
#[ORM\Column(type: 'text', nullable: true)]
|
||||
private ?string $errorMessage = null;
|
||||
|
||||
public function __construct(string $type)
|
||||
{
|
||||
$this->id = Uuid::v4();
|
||||
$this->type = $type;
|
||||
$this->startedAt = new \DateTimeImmutable();
|
||||
$this->status = self::STATUS_RUNNING;
|
||||
}
|
||||
|
||||
public function getId(): Uuid { return $this->id; }
|
||||
public function getType(): string { return $this->type; }
|
||||
public function getStatus(): string { return $this->status; }
|
||||
|
||||
public function setDocumentId(?Uuid $id): void { $this->documentId = $id; }
|
||||
public function getDocumentId(): ?Uuid { return $this->documentId; }
|
||||
|
||||
public function setDocumentVersionId(?Uuid $id): void { $this->documentVersionId = $id; }
|
||||
public function getDocumentVersionId(): ?Uuid { return $this->documentVersionId; }
|
||||
|
||||
public function setStartedBy(?User $user): void { $this->startedBy = $user; }
|
||||
public function getStartedBy(): ?User { return $this->startedBy; }
|
||||
|
||||
public function setLogPath(?string $path): void { $this->logPath = $path; }
|
||||
public function getLogPath(): ?string { return $this->logPath; }
|
||||
|
||||
public function getStartedAt(): \DateTimeImmutable { return $this->startedAt; }
|
||||
public function getFinishedAt(): ?\DateTimeImmutable { return $this->finishedAt; }
|
||||
|
||||
public function markCompleted(): void
|
||||
{
|
||||
$this->status = self::STATUS_COMPLETED;
|
||||
$this->finishedAt = new \DateTimeImmutable();
|
||||
}
|
||||
|
||||
public function markFailed(string $message): void
|
||||
{
|
||||
$this->status = self::STATUS_FAILED;
|
||||
$this->errorMessage = $message;
|
||||
$this->finishedAt = new \DateTimeImmutable();
|
||||
}
|
||||
|
||||
public function markAborted(): void
|
||||
{
|
||||
$this->status = self::STATUS_ABORTED;
|
||||
$this->finishedAt = new \DateTimeImmutable();
|
||||
}
|
||||
|
||||
public function getErrorMessage(): ?string
|
||||
{
|
||||
return $this->errorMessage;
|
||||
}
|
||||
}
|
||||
161
src/Entity/User.php
Normal file
161
src/Entity/User.php
Normal file
@@ -0,0 +1,161 @@
|
||||
<?php
|
||||
|
||||
namespace App\Entity;
|
||||
|
||||
use App\Repository\UserRepository;
|
||||
use Doctrine\ORM\Mapping as ORM;
|
||||
use Symfony\Component\Security\Core\User\UserInterface;
|
||||
use Symfony\Component\Security\Core\User\PasswordAuthenticatedUserInterface;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
#[ORM\Entity(repositoryClass: UserRepository::class)]
|
||||
#[ORM\HasLifecycleCallbacks]
|
||||
class User implements UserInterface, PasswordAuthenticatedUserInterface
|
||||
{
|
||||
#[ORM\Id]
|
||||
#[ORM\Column(type: 'uuid', unique: true)]
|
||||
private Uuid $id;
|
||||
|
||||
#[ORM\Column(length: 180, unique: true)]
|
||||
private string $email;
|
||||
|
||||
#[ORM\Column]
|
||||
private string $password;
|
||||
|
||||
#[ORM\Column(type: 'json')]
|
||||
private array $roles = [];
|
||||
|
||||
#[ORM\Column]
|
||||
private bool $isActive = true;
|
||||
|
||||
#[ORM\Column]
|
||||
private \DateTimeImmutable $createdAt;
|
||||
|
||||
#[ORM\Column(nullable: true)]
|
||||
private ?\DateTimeImmutable $updatedAt = null;
|
||||
|
||||
public function __construct()
|
||||
{
|
||||
$this->id = Uuid::v4();
|
||||
$this->createdAt = new \DateTimeImmutable();
|
||||
$this->roles = ['ROLE_USER'];
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Security Identifier
|
||||
// =========================
|
||||
|
||||
public function getUserIdentifier(): string
|
||||
{
|
||||
return $this->email;
|
||||
}
|
||||
|
||||
// Symfony < 6 compatibility (optional)
|
||||
public function getUsername(): string
|
||||
{
|
||||
return $this->email;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// ID
|
||||
// =========================
|
||||
|
||||
public function getId(): Uuid
|
||||
{
|
||||
return $this->id;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Email
|
||||
// =========================
|
||||
|
||||
public function getEmail(): string
|
||||
{
|
||||
return $this->email;
|
||||
}
|
||||
|
||||
public function setEmail(string $email): static
|
||||
{
|
||||
$this->email = strtolower($email);
|
||||
return $this;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Password
|
||||
// =========================
|
||||
|
||||
public function getPassword(): string
|
||||
{
|
||||
return $this->password;
|
||||
}
|
||||
|
||||
public function setPassword(string $password): static
|
||||
{
|
||||
$this->password = $password;
|
||||
return $this;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Roles
|
||||
// =========================
|
||||
|
||||
public function getRoles(): array
|
||||
{
|
||||
$roles = $this->roles;
|
||||
|
||||
// Jeder User hat mindestens ROLE_USER
|
||||
$roles[] = 'ROLE_USER';
|
||||
|
||||
return array_unique($roles);
|
||||
}
|
||||
|
||||
public function setRoles(array $roles): static
|
||||
{
|
||||
$this->roles = $roles;
|
||||
return $this;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Active Status
|
||||
// =========================
|
||||
|
||||
public function isActive(): bool
|
||||
{
|
||||
return $this->isActive;
|
||||
}
|
||||
|
||||
public function setIsActive(bool $isActive): static
|
||||
{
|
||||
$this->isActive = $isActive;
|
||||
return $this;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Timestamps
|
||||
// =========================
|
||||
|
||||
public function getCreatedAt(): \DateTimeImmutable
|
||||
{
|
||||
return $this->createdAt;
|
||||
}
|
||||
|
||||
public function getUpdatedAt(): ?\DateTimeImmutable
|
||||
{
|
||||
return $this->updatedAt;
|
||||
}
|
||||
|
||||
#[ORM\PreUpdate]
|
||||
public function updateTimestamp(): void
|
||||
{
|
||||
$this->updatedAt = new \DateTimeImmutable();
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Erase Credentials (Pflicht)
|
||||
// =========================
|
||||
|
||||
public function eraseCredentials(): void
|
||||
{
|
||||
// Falls später sensible Daten gespeichert werden
|
||||
}
|
||||
}
|
||||
95
src/Index/IndexConfiguration.php
Normal file
95
src/Index/IndexConfiguration.php
Normal file
@@ -0,0 +1,95 @@
|
||||
<?php
|
||||
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Index;
|
||||
|
||||
/**
|
||||
* Beschreibt die "Struktur" des Index (nicht den Inhalt).
|
||||
* Diese Werte müssen bei lokalem Ingest mit index_meta.json kompatibel sein,
|
||||
* sonst muss ein Global Reindex erzwungen werden.
|
||||
*/
|
||||
final class IndexConfiguration
|
||||
{
|
||||
public function __construct(
|
||||
private readonly int $chunkSize,
|
||||
private readonly int $chunkOverlap,
|
||||
private readonly string $embeddingModel,
|
||||
private readonly int $embeddingDimension,
|
||||
private readonly int $scoringVersion,
|
||||
private readonly string $indexFormat = 'ndjson', // bindend: 'ndjson'
|
||||
private readonly string $vectorBackend = 'faiss', // informativ
|
||||
)
|
||||
{
|
||||
if ($this->chunkSize <= 0) {
|
||||
throw new \InvalidArgumentException('chunkSize must be > 0');
|
||||
}
|
||||
if ($this->chunkOverlap < 0) {
|
||||
throw new \InvalidArgumentException('chunkOverlap must be >= 0');
|
||||
}
|
||||
if ($this->chunkOverlap >= $this->chunkSize) {
|
||||
throw new \InvalidArgumentException('chunkOverlap must be < chunkSize');
|
||||
}
|
||||
if ($this->embeddingDimension <= 0) {
|
||||
throw new \InvalidArgumentException('embeddingDimension must be > 0');
|
||||
}
|
||||
if ($this->scoringVersion <= 0) {
|
||||
throw new \InvalidArgumentException('scoringVersion must be > 0');
|
||||
}
|
||||
if ($this->indexFormat !== 'ndjson') {
|
||||
throw new \InvalidArgumentException('indexFormat must be "ndjson"');
|
||||
}
|
||||
}
|
||||
|
||||
public function getChunkSize(): int
|
||||
{
|
||||
return $this->chunkSize;
|
||||
}
|
||||
|
||||
public function getChunkOverlap(): int
|
||||
{
|
||||
return $this->chunkOverlap;
|
||||
}
|
||||
|
||||
public function getEmbeddingModel(): string
|
||||
{
|
||||
return $this->embeddingModel;
|
||||
}
|
||||
|
||||
public function getEmbeddingDimension(): int
|
||||
{
|
||||
return $this->embeddingDimension;
|
||||
}
|
||||
|
||||
public function getScoringVersion(): int
|
||||
{
|
||||
return $this->scoringVersion;
|
||||
}
|
||||
|
||||
public function getIndexFormat(): string
|
||||
{
|
||||
return $this->indexFormat;
|
||||
}
|
||||
|
||||
public function getVectorBackend(): string
|
||||
{
|
||||
return $this->vectorBackend;
|
||||
}
|
||||
|
||||
/**
|
||||
* Canonical representation: nur strukturelle Felder (ohne created_at, index_version).
|
||||
*/
|
||||
public function toStructureArray(): array
|
||||
{
|
||||
return [
|
||||
'embedding_model' => $this->embeddingModel,
|
||||
'embedding_dimension' => $this->embeddingDimension,
|
||||
'chunk_size' => $this->chunkSize,
|
||||
'chunk_overlap' => $this->chunkOverlap,
|
||||
'scoring_version' => $this->scoringVersion,
|
||||
'index_format' => $this->indexFormat,
|
||||
'vector_backend' => $this->vectorBackend,
|
||||
];
|
||||
}
|
||||
}
|
||||
206
src/Index/IndexMetaManager.php
Normal file
206
src/Index/IndexMetaManager.php
Normal file
@@ -0,0 +1,206 @@
|
||||
<?php
|
||||
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Index;
|
||||
|
||||
final class IndexMetaManager
|
||||
{
|
||||
private string $metaPath;
|
||||
|
||||
public function __construct(
|
||||
string $projectDir,
|
||||
private readonly IndexConfiguration $config,
|
||||
string $relativeMetaPath = '/var/knowledge/index_meta.json'
|
||||
)
|
||||
{
|
||||
$this->metaPath = rtrim($projectDir, '/') . $relativeMetaPath;
|
||||
}
|
||||
|
||||
public function getMetaPath(): string
|
||||
{
|
||||
return $this->metaPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gibt null zurück, wenn noch kein Meta existiert (frisches System).
|
||||
*
|
||||
* @return array<string,mixed>|null
|
||||
*/
|
||||
public function readMeta(): ?array
|
||||
{
|
||||
if (!is_file($this->metaPath)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
$raw = file_get_contents($this->metaPath);
|
||||
if ($raw === false) {
|
||||
throw new \RuntimeException('Unable to read index_meta.json');
|
||||
}
|
||||
|
||||
$data = json_decode($raw, true);
|
||||
if (!is_array($data)) {
|
||||
throw new \RuntimeException('index_meta.json is invalid JSON');
|
||||
}
|
||||
|
||||
return $data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Erstellt Meta, falls nicht vorhanden (z. B. nach erstem Global Reindex).
|
||||
* Überschreibt NICHT automatisch, wenn vorhanden.
|
||||
*
|
||||
* @return array<string,mixed>
|
||||
*/
|
||||
public function createInitialMetaIfMissing(): array
|
||||
{
|
||||
$existing = $this->readMeta();
|
||||
if ($existing !== null) {
|
||||
return $existing;
|
||||
}
|
||||
|
||||
$meta = $this->buildMetaPayload(indexVersion: 1);
|
||||
$this->atomicWriteJson($meta);
|
||||
|
||||
return $meta;
|
||||
}
|
||||
|
||||
/**
|
||||
* Guardrail: Prüft, ob die aktuelle Config kompatibel zur gespeicherten Meta ist.
|
||||
* Wenn nicht: IndexStructureChangedException -> Global Reindex erzwingen.
|
||||
*/
|
||||
public function validateAgainstCurrent(): void
|
||||
{
|
||||
$meta = $this->readMeta();
|
||||
|
||||
// Wenn noch kein Meta existiert, lassen wir lokale Ingests NICHT einfach laufen.
|
||||
// Governance: Erst Global Reindex erzeugt Meta sauber.
|
||||
if ($meta === null) {
|
||||
throw new IndexStructureChangedException(
|
||||
'index_meta.json missing. Please run a Global Reindex to initialize index structure metadata.',
|
||||
['reason' => 'missing_meta']
|
||||
);
|
||||
}
|
||||
|
||||
$expected = $this->config->toStructureArray();
|
||||
|
||||
$diff = $this->diffStructure($meta, $expected);
|
||||
|
||||
if ($diff !== []) {
|
||||
throw new IndexStructureChangedException(
|
||||
'Index structure changed. Global Reindex required.',
|
||||
$diff
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Wird beim Global Reindex verwendet:
|
||||
* - index_version++ (oder initialisieren)
|
||||
* - Meta atomar schreiben
|
||||
*
|
||||
* @return array<string,mixed> new meta
|
||||
*/
|
||||
public function writeMetaForGlobalReindex(): array
|
||||
{
|
||||
$current = $this->readMeta();
|
||||
|
||||
$nextVersion = 1;
|
||||
if (is_array($current) && isset($current['index_version']) && is_int($current['index_version'])) {
|
||||
$nextVersion = $current['index_version'] + 1;
|
||||
}
|
||||
|
||||
$meta = $this->buildMetaPayload($nextVersion);
|
||||
$this->atomicWriteJson($meta);
|
||||
|
||||
return $meta;
|
||||
}
|
||||
|
||||
public function getConfig(): IndexConfiguration
|
||||
{
|
||||
return $this->config;
|
||||
}
|
||||
|
||||
// -------------------------
|
||||
// Internals
|
||||
// -------------------------
|
||||
|
||||
/**
|
||||
* @return array<string,mixed>
|
||||
*/
|
||||
private function buildMetaPayload(int $indexVersion): array
|
||||
{
|
||||
$structure = $this->config->toStructureArray();
|
||||
|
||||
return [
|
||||
'index_version' => $indexVersion,
|
||||
'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
|
||||
'embedding_model' => $structure['embedding_model'],
|
||||
'embedding_dimension' => $structure['embedding_dimension'],
|
||||
'chunk_size' => $structure['chunk_size'],
|
||||
'chunk_overlap' => $structure['chunk_overlap'],
|
||||
'scoring_version' => $structure['scoring_version'],
|
||||
'index_format' => $structure['index_format'],
|
||||
'vector_backend' => $structure['vector_backend'],
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array<string,mixed> $meta
|
||||
* @param array<string,mixed> $expected
|
||||
* @return array<string,mixed> diff
|
||||
*/
|
||||
private function diffStructure(array $meta, array $expected): array
|
||||
{
|
||||
$diff = [];
|
||||
|
||||
foreach ($expected as $key => $value) {
|
||||
$actual = $meta[$key] ?? null;
|
||||
if ($actual !== $value) {
|
||||
$diff[$key] = [
|
||||
'expected' => $value,
|
||||
'actual' => $actual,
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
// index_format ist zwingend
|
||||
if (($meta['index_format'] ?? null) !== 'ndjson') {
|
||||
$diff['index_format'] = [
|
||||
'expected' => 'ndjson',
|
||||
'actual' => $meta['index_format'] ?? null,
|
||||
];
|
||||
}
|
||||
|
||||
return $diff;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array<string,mixed> $payload
|
||||
*/
|
||||
private function atomicWriteJson(array $payload): void
|
||||
{
|
||||
$dir = \dirname($this->metaPath);
|
||||
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
||||
throw new \RuntimeException('Unable to create directory: ' . $dir);
|
||||
}
|
||||
|
||||
$tmp = $this->metaPath . '.tmp';
|
||||
|
||||
$json = json_encode($payload, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);
|
||||
if ($json === false) {
|
||||
throw new \RuntimeException('Unable to encode index_meta.json');
|
||||
}
|
||||
|
||||
if (file_put_contents($tmp, $json . PHP_EOL) === false) {
|
||||
throw new \RuntimeException('Unable to write temp meta file');
|
||||
}
|
||||
|
||||
// atomarer Switch
|
||||
if (!rename($tmp, $this->metaPath)) {
|
||||
@unlink($tmp);
|
||||
throw new \RuntimeException('Unable to switch meta file atomically');
|
||||
}
|
||||
}
|
||||
}
|
||||
34
src/Index/IndexStructureChangedException.php
Normal file
34
src/Index/IndexStructureChangedException.php
Normal file
@@ -0,0 +1,34 @@
|
||||
<?php
|
||||
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Index;
|
||||
|
||||
/**
|
||||
* Wird geworfen, wenn lokale Ingests nicht mehr kompatibel sind
|
||||
* und ein Global Reindex erzwungen werden muss.
|
||||
*/
|
||||
final class IndexStructureChangedException extends \RuntimeException
|
||||
{
|
||||
/**
|
||||
* @param array<string,mixed> $diff
|
||||
*/
|
||||
public function __construct(
|
||||
string $message,
|
||||
private readonly array $diff = [],
|
||||
int $code = 0,
|
||||
?\Throwable $previous = null
|
||||
)
|
||||
{
|
||||
parent::__construct($message, $code, $previous);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array<string,mixed>
|
||||
*/
|
||||
public function getDiff(): array
|
||||
{
|
||||
return $this->diff;
|
||||
}
|
||||
}
|
||||
182
src/Ingest/IngestFlow.php
Normal file
182
src/Ingest/IngestFlow.php
Normal file
@@ -0,0 +1,182 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Ingest;
|
||||
|
||||
use App\Entity\DocumentVersion;
|
||||
use App\Entity\IngestJob;
|
||||
use App\Entity\User;
|
||||
use App\Index\IndexMetaManager;
|
||||
use App\Index\IndexStructureChangedException;
|
||||
use App\Knowledge\ChunkManager;
|
||||
use App\Service\IngestJobService;
|
||||
use App\Service\LockService;
|
||||
use App\Knowledge\Ingest\KnowledgeIngestService;
|
||||
use App\Vector\VectorIndexBuilder;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
|
||||
final class IngestFlow
|
||||
{
|
||||
public function __construct(
|
||||
private readonly LockService $lockService,
|
||||
private readonly IngestJobService $jobService,
|
||||
private readonly KnowledgeIngestService $knowledgeIngestService,
|
||||
private readonly ChunkManager $chunkManager,
|
||||
private readonly VectorIndexBuilder $vectorBuilder,
|
||||
private readonly IndexMetaManager $metaManager,
|
||||
private readonly EntityManagerInterface $em,
|
||||
) {
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// LOCAL DOCUMENT INGEST
|
||||
// ============================================================
|
||||
|
||||
public function ingestDocumentVersion(
|
||||
DocumentVersion $version,
|
||||
User $user
|
||||
): IngestJob {
|
||||
|
||||
if (!$this->lockService->acquire()) {
|
||||
throw new \RuntimeException('Another ingest job is already running.');
|
||||
}
|
||||
|
||||
$job = null;
|
||||
|
||||
try {
|
||||
|
||||
$job = $this->jobService->startJob(
|
||||
IngestJob::TYPE_DOCUMENT,
|
||||
$user,
|
||||
$version->getDocument()->getId(),
|
||||
$version->getId(),
|
||||
);
|
||||
|
||||
$version->setIngestStatus(DocumentVersion::INGEST_RUNNING);
|
||||
$this->em->flush();
|
||||
|
||||
// --------------------------------------------------
|
||||
// Guardrail: Struktur prüfen
|
||||
// --------------------------------------------------
|
||||
$this->metaManager->validateAgainstCurrent();
|
||||
|
||||
// --------------------------------------------------
|
||||
// Alte Chunks dieses Dokuments entfernen (Streaming)
|
||||
// --------------------------------------------------
|
||||
$this->chunkManager->compactByDocument(
|
||||
$version->getDocument()->getId()
|
||||
);
|
||||
|
||||
// --------------------------------------------------
|
||||
// Neue Chunks erzeugen
|
||||
// --------------------------------------------------
|
||||
$records = $this->knowledgeIngestService
|
||||
->buildChunkRecords($version);
|
||||
|
||||
// --------------------------------------------------
|
||||
// Append in NDJSON
|
||||
// --------------------------------------------------
|
||||
$this->chunkManager->appendChunks($records);
|
||||
|
||||
// --------------------------------------------------
|
||||
// FAISS komplett neu bauen (deterministisch)
|
||||
// --------------------------------------------------
|
||||
$logPath = $job->getLogPath();
|
||||
$this->vectorBuilder->rebuildFromNdjson($logPath);
|
||||
|
||||
// --------------------------------------------------
|
||||
// Erfolg
|
||||
// --------------------------------------------------
|
||||
$version->setIngestStatus(DocumentVersion::INGEST_INDEXED);
|
||||
$this->jobService->markCompleted($job);
|
||||
|
||||
$this->em->flush();
|
||||
|
||||
} catch (IndexStructureChangedException $e) {
|
||||
|
||||
if ($job) {
|
||||
$this->jobService->markFailed($job, $e->getMessage());
|
||||
}
|
||||
|
||||
$version->setIngestStatus(DocumentVersion::INGEST_FAILED);
|
||||
$this->em->flush();
|
||||
|
||||
throw $e;
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
|
||||
if ($job) {
|
||||
$this->jobService->markFailed($job, $e->getMessage());
|
||||
}
|
||||
|
||||
$version->setIngestStatus(DocumentVersion::INGEST_FAILED);
|
||||
$this->em->flush();
|
||||
|
||||
throw $e;
|
||||
|
||||
} finally {
|
||||
$this->lockService->release();
|
||||
}
|
||||
|
||||
return $job;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// GLOBAL REINDEX
|
||||
// ============================================================
|
||||
|
||||
public function globalReindex(User $user): IngestJob
|
||||
{
|
||||
if (!$this->lockService->acquire()) {
|
||||
throw new \RuntimeException('Another ingest job is already running.');
|
||||
}
|
||||
|
||||
$job = null;
|
||||
|
||||
try {
|
||||
|
||||
$job = $this->jobService->startJob(
|
||||
IngestJob::TYPE_GLOBAL_REINDEX,
|
||||
$user
|
||||
);
|
||||
|
||||
// --------------------------------------------------
|
||||
// Alle aktiven Dokumente neu ingestieren
|
||||
// --------------------------------------------------
|
||||
$allRecords = $this->knowledgeIngestService
|
||||
->buildAllActiveChunkRecords();
|
||||
|
||||
// --------------------------------------------------
|
||||
// Komplettes NDJSON neu schreiben
|
||||
// --------------------------------------------------
|
||||
$this->chunkManager->rewriteAll($allRecords);
|
||||
|
||||
// --------------------------------------------------
|
||||
// FAISS komplett neu bauen
|
||||
// --------------------------------------------------
|
||||
$logPath = $job->getLogPath();
|
||||
$this->vectorBuilder->rebuildFromNdjson($logPath);
|
||||
|
||||
// --------------------------------------------------
|
||||
// Meta aktualisieren + index_version++
|
||||
// --------------------------------------------------
|
||||
$this->metaManager->writeMetaForGlobalReindex();
|
||||
|
||||
$this->jobService->markCompleted($job);
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
|
||||
if ($job) {
|
||||
$this->jobService->markFailed($job, $e->getMessage());
|
||||
}
|
||||
|
||||
throw $e;
|
||||
|
||||
} finally {
|
||||
$this->lockService->release();
|
||||
}
|
||||
|
||||
return $job;
|
||||
}
|
||||
}
|
||||
186
src/Knowledge/ChunkManager.php
Normal file
186
src/Knowledge/ChunkManager.php
Normal file
@@ -0,0 +1,186 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge;
|
||||
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
final class ChunkManager
|
||||
{
|
||||
private string $indexPath;
|
||||
|
||||
public function __construct(
|
||||
string $projectDir,
|
||||
string $relativeIndexPath = '/var/knowledge/index.ndjson'
|
||||
) {
|
||||
$this->indexPath = rtrim($projectDir, '/') . $relativeIndexPath;
|
||||
}
|
||||
|
||||
public function getIndexPath(): string
|
||||
{
|
||||
return $this->indexPath;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// APPEND
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* @param iterable<array<string,mixed>> $records
|
||||
*/
|
||||
public function appendChunks(iterable $records): void
|
||||
{
|
||||
$dir = \dirname($this->indexPath);
|
||||
|
||||
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
||||
throw new \RuntimeException('Unable to create index directory');
|
||||
}
|
||||
|
||||
$handle = fopen($this->indexPath, 'ab');
|
||||
if (!$handle) {
|
||||
throw new \RuntimeException('Unable to open index.ndjson for append');
|
||||
}
|
||||
|
||||
foreach ($records as $record) {
|
||||
$json = json_encode($record, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
|
||||
if ($json === false) {
|
||||
fclose($handle);
|
||||
throw new \RuntimeException('Unable to encode chunk record');
|
||||
}
|
||||
|
||||
fwrite($handle, $json . PHP_EOL);
|
||||
}
|
||||
|
||||
fclose($handle);
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// COMPACTION – Entfernt alle Chunks eines Dokuments
|
||||
// ============================================================
|
||||
|
||||
public function compactByDocument(Uuid $documentId): void
|
||||
{
|
||||
if (!is_file($this->indexPath)) {
|
||||
return; // nichts zu kompaktieren
|
||||
}
|
||||
|
||||
$tmpPath = $this->indexPath . '.tmp';
|
||||
|
||||
$in = fopen($this->indexPath, 'rb');
|
||||
$out = fopen($tmpPath, 'wb');
|
||||
|
||||
if (!$in || !$out) {
|
||||
throw new \RuntimeException('Unable to open index for compaction');
|
||||
}
|
||||
|
||||
$docIdString = $documentId->toRfc4122();
|
||||
|
||||
while (($line = fgets($in)) !== false) {
|
||||
$line = trim($line);
|
||||
if ($line === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$data = json_decode($line, true);
|
||||
if (!is_array($data)) {
|
||||
continue; // skip corrupted line
|
||||
}
|
||||
|
||||
if (($data['document_id'] ?? null) === $docIdString) {
|
||||
continue; // skip this document's chunks
|
||||
}
|
||||
|
||||
fwrite($out, $line . PHP_EOL);
|
||||
}
|
||||
|
||||
fclose($in);
|
||||
fclose($out);
|
||||
|
||||
$this->atomicSwitch($tmpPath, $this->indexPath);
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// FULL REWRITE (Global Reindex)
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* @param iterable<array<string,mixed>> $records
|
||||
*/
|
||||
public function rewriteAll(iterable $records): void
|
||||
{
|
||||
$dir = \dirname($this->indexPath);
|
||||
|
||||
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
||||
throw new \RuntimeException('Unable to create index directory');
|
||||
}
|
||||
|
||||
$tmpPath = $this->indexPath . '.tmp';
|
||||
|
||||
$handle = fopen($tmpPath, 'wb');
|
||||
if (!$handle) {
|
||||
throw new \RuntimeException('Unable to open temp index file');
|
||||
}
|
||||
|
||||
foreach ($records as $record) {
|
||||
$json = json_encode($record, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
|
||||
if ($json === false) {
|
||||
fclose($handle);
|
||||
throw new \RuntimeException('Unable to encode chunk record');
|
||||
}
|
||||
|
||||
fwrite($handle, $json . PHP_EOL);
|
||||
}
|
||||
|
||||
fclose($handle);
|
||||
|
||||
$this->atomicSwitch($tmpPath, $this->indexPath);
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// STREAM READ (für FAISS rebuild)
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* @return \Generator<array<string,mixed>>
|
||||
*/
|
||||
public function streamAll(): \Generator
|
||||
{
|
||||
if (!is_file($this->indexPath)) {
|
||||
return;
|
||||
}
|
||||
|
||||
$handle = fopen($this->indexPath, 'rb');
|
||||
if (!$handle) {
|
||||
throw new \RuntimeException('Unable to open index.ndjson for read');
|
||||
}
|
||||
|
||||
try {
|
||||
while (($line = fgets($handle)) !== false) {
|
||||
$line = trim($line);
|
||||
if ($line === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$data = json_decode($line, true);
|
||||
if (is_array($data)) {
|
||||
yield $data;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
fclose($handle);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// INTERNAL ATOMIC SWITCH
|
||||
// ============================================================
|
||||
|
||||
private function atomicSwitch(string $tmp, string $final): void
|
||||
{
|
||||
if (!rename($tmp, $final)) {
|
||||
@unlink($tmp);
|
||||
throw new \RuntimeException('Atomic switch failed for index.ndjson');
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,39 +1,93 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/KnowledgeIngestService.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
use App\Entity\DocumentVersion;
|
||||
use App\Repository\DocumentVersionRepository;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
final class KnowledgeIngestService
|
||||
{
|
||||
public function __construct(
|
||||
private DocumentLoader $loader,
|
||||
private SimpleChunker $chunker,
|
||||
private ChunkWriter $writer,
|
||||
private ChunkIndexWriter $indexWriter,
|
||||
)
|
||||
{
|
||||
private DocumentLoader $loader,
|
||||
private SimpleChunker $chunker,
|
||||
private DocumentVersionRepository $versionRepo,
|
||||
) {
|
||||
}
|
||||
|
||||
/** @return string[] written chunk filenames */
|
||||
public function ingestFile(string $path, bool $optimize = false): array
|
||||
/**
|
||||
* Lokaler Ingest: erzeugt NDJSON-Records für genau diese Version.
|
||||
*
|
||||
* @return iterable<array<string,mixed>>
|
||||
*/
|
||||
public function buildChunkRecords(DocumentVersion $version): iterable
|
||||
{
|
||||
$text = $this->loader->load($path);
|
||||
|
||||
if ($optimize) {
|
||||
$text = preg_replace("/\n{3,}/", "\n\n", $text);
|
||||
$text = preg_replace("/[ \t]+$/m", "", $text);
|
||||
}
|
||||
|
||||
$sourceHash = sha1($text);
|
||||
$sourceName = basename($path);
|
||||
|
||||
if ($this->indexWriter->hasSourceHash($sourceName, $sourceHash)) {
|
||||
return [];
|
||||
}
|
||||
$text = $this->loader->load($version->getFilePath());
|
||||
$text = $this->optimizeText($text);
|
||||
|
||||
$chunks = $this->chunker->chunk($text);
|
||||
return $this->writer->write($sourceName, $chunks, $sourceHash);
|
||||
|
||||
$documentId = $version->getDocument()->getId()->toRfc4122();
|
||||
$versionId = $version->getId()->toRfc4122();
|
||||
|
||||
$index = 0;
|
||||
|
||||
foreach ($chunks as $chunkText) {
|
||||
yield [
|
||||
'chunk_id' => Uuid::v4()->toRfc4122(),
|
||||
'document_id' => $documentId,
|
||||
'version_id' => $versionId,
|
||||
'chunk_index' => $index++,
|
||||
'text' => $chunkText,
|
||||
'checksum' => sha1($chunkText),
|
||||
'metadata' => $this->buildMetadata($version),
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Global Reindex: iteriert streamingfähig über alle aktiven Versionen.
|
||||
* Keine RAM-Explosion, da alles generatorbasiert bleibt.
|
||||
*
|
||||
* @return iterable<array<string,mixed>>
|
||||
*/
|
||||
public function buildAllActiveChunkRecords(): iterable
|
||||
{
|
||||
foreach ($this->versionRepo->iterateActiveVersions() as $version) {
|
||||
// yield from hält das Ganze streamingfähig (Generator-Kaskade)
|
||||
yield from $this->buildChunkRecords($version);
|
||||
}
|
||||
}
|
||||
|
||||
private function optimizeText(string $text): string
|
||||
{
|
||||
$text = preg_replace("/\n{3,}/", "\n\n", $text);
|
||||
$text = preg_replace("/[ \t]+$/m", "", $text);
|
||||
|
||||
return $text;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array<string,mixed>
|
||||
*/
|
||||
private function buildMetadata(DocumentVersion $version): array
|
||||
{
|
||||
$doc = $version->getDocument();
|
||||
|
||||
// Optional: Titel/Name, falls vorhanden
|
||||
$title = null;
|
||||
if (method_exists($doc, 'getTitle')) {
|
||||
$title = $doc->getTitle();
|
||||
} elseif (method_exists($doc, 'getName')) {
|
||||
$title = $doc->getName();
|
||||
}
|
||||
|
||||
return array_filter([
|
||||
'document_title' => $title,
|
||||
'version_number' => method_exists($version, 'getVersionNumber') ? $version->getVersionNumber() : null,
|
||||
'file_path' => $version->getFilePath(),
|
||||
], static fn($v) => $v !== null && $v !== '');
|
||||
}
|
||||
}
|
||||
|
||||
32
src/Repository/DocumentVersionRepository.php
Normal file
32
src/Repository/DocumentVersionRepository.php
Normal file
@@ -0,0 +1,32 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Repository;
|
||||
|
||||
use App\Entity\DocumentVersion;
|
||||
use Doctrine\Bundle\DoctrineBundle\Repository\ServiceEntityRepository;
|
||||
use Doctrine\Persistence\ManagerRegistry;
|
||||
|
||||
final class DocumentVersionRepository extends ServiceEntityRepository
|
||||
{
|
||||
public function __construct(ManagerRegistry $registry)
|
||||
{
|
||||
parent::__construct($registry, DocumentVersion::class);
|
||||
}
|
||||
|
||||
/**
|
||||
* Streamingfähige Iteration über alle aktiven Versionen (für Global Reindex).
|
||||
*
|
||||
* @return iterable<DocumentVersion>
|
||||
*/
|
||||
public function iterateActiveVersions(): iterable
|
||||
{
|
||||
$qb = $this->createQueryBuilder('v')
|
||||
->andWhere('v.isActive = :active')
|
||||
->setParameter('active', true)
|
||||
->orderBy('v.createdAt', 'ASC');
|
||||
|
||||
return $qb->getQuery()->toIterable();
|
||||
}
|
||||
}
|
||||
43
src/Repository/UserRepository.php
Normal file
43
src/Repository/UserRepository.php
Normal file
@@ -0,0 +1,43 @@
|
||||
<?php
|
||||
|
||||
namespace App\Repository;
|
||||
|
||||
use App\Entity\User;
|
||||
use Doctrine\Bundle\DoctrineBundle\Repository\ServiceEntityRepository;
|
||||
use Doctrine\Persistence\ManagerRegistry;
|
||||
|
||||
/**
|
||||
* @extends ServiceEntityRepository<User>
|
||||
*/
|
||||
class UserRepository extends ServiceEntityRepository
|
||||
{
|
||||
public function __construct(ManagerRegistry $registry)
|
||||
{
|
||||
parent::__construct($registry, User::class);
|
||||
}
|
||||
|
||||
// /**
|
||||
// * @return User[] Returns an array of User objects
|
||||
// */
|
||||
// public function findByExampleField($value): array
|
||||
// {
|
||||
// return $this->createQueryBuilder('u')
|
||||
// ->andWhere('u.exampleField = :val')
|
||||
// ->setParameter('val', $value)
|
||||
// ->orderBy('u.id', 'ASC')
|
||||
// ->setMaxResults(10)
|
||||
// ->getQuery()
|
||||
// ->getResult()
|
||||
// ;
|
||||
// }
|
||||
|
||||
// public function findOneBySomeField($value): ?User
|
||||
// {
|
||||
// return $this->createQueryBuilder('u')
|
||||
// ->andWhere('u.exampleField = :val')
|
||||
// ->setParameter('val', $value)
|
||||
// ->getQuery()
|
||||
// ->getOneOrNullResult()
|
||||
// ;
|
||||
// }
|
||||
}
|
||||
123
src/Service/DocumentService.php
Normal file
123
src/Service/DocumentService.php
Normal file
@@ -0,0 +1,123 @@
|
||||
<?php
|
||||
|
||||
namespace App\Service;
|
||||
|
||||
use App\Entity\Document;
|
||||
use App\Entity\DocumentVersion;
|
||||
use App\Entity\User;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
|
||||
class DocumentService
|
||||
{
|
||||
public function __construct(
|
||||
private EntityManagerInterface $em
|
||||
) {}
|
||||
|
||||
/**
|
||||
* Erstellt ein neues Dokument inkl. Version 1
|
||||
*/
|
||||
public function createDocument(
|
||||
string $title,
|
||||
string $filePath,
|
||||
User $user
|
||||
): Document {
|
||||
|
||||
$document = new Document();
|
||||
$document->setTitle($title);
|
||||
$document->setCreatedBy($user);
|
||||
|
||||
$version = new DocumentVersion();
|
||||
$version->setVersionNumber(1);
|
||||
$version->setFilePath($filePath);
|
||||
$version->setChecksum($this->calculateChecksum($filePath));
|
||||
$version->setCreatedBy($user);
|
||||
$version->setActive(true);
|
||||
|
||||
$document->addVersion($version);
|
||||
$document->setCurrentVersion($version);
|
||||
|
||||
$this->em->persist($document);
|
||||
$this->em->persist($version);
|
||||
$this->em->flush();
|
||||
|
||||
return $document;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fügt neue Version hinzu (immutable)
|
||||
*/
|
||||
public function addVersion(
|
||||
Document $document,
|
||||
string $filePath,
|
||||
User $user
|
||||
): DocumentVersion {
|
||||
|
||||
$nextVersionNumber = $this->getNextVersionNumber($document);
|
||||
|
||||
$version = new DocumentVersion();
|
||||
$version->setVersionNumber($nextVersionNumber);
|
||||
$version->setFilePath($filePath);
|
||||
$version->setChecksum($this->calculateChecksum($filePath));
|
||||
$version->setCreatedBy($user);
|
||||
$version->setActive(false);
|
||||
|
||||
$document->addVersion($version);
|
||||
|
||||
$this->em->persist($version);
|
||||
$this->em->flush();
|
||||
|
||||
return $version;
|
||||
}
|
||||
|
||||
/**
|
||||
* Aktiviert eine Version (setzt andere inaktiv)
|
||||
*/
|
||||
public function activateVersion(DocumentVersion $version): void
|
||||
{
|
||||
$document = $version->getDocument();
|
||||
|
||||
foreach ($document->getVersions() as $existingVersion) {
|
||||
$existingVersion->setActive(false);
|
||||
}
|
||||
|
||||
$version->setActive(true);
|
||||
$document->setCurrentVersion($version);
|
||||
|
||||
$this->em->flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Archiviert Dokument
|
||||
*/
|
||||
public function archive(Document $document): void
|
||||
{
|
||||
$document->archive();
|
||||
$this->em->flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Berechnet SHA256 Checksum
|
||||
*/
|
||||
private function calculateChecksum(string $filePath): string
|
||||
{
|
||||
if (!file_exists($filePath)) {
|
||||
throw new \RuntimeException('File not found for checksum.');
|
||||
}
|
||||
|
||||
return hash_file('sha256', $filePath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ermittelt nächste Versionsnummer
|
||||
*/
|
||||
private function getNextVersionNumber(Document $document): int
|
||||
{
|
||||
$max = 0;
|
||||
|
||||
foreach ($document->getVersions() as $version) {
|
||||
$max = max($max, $version->getVersionNumber());
|
||||
}
|
||||
|
||||
return $max + 1;
|
||||
}
|
||||
}
|
||||
54
src/Service/IngestJobService.php
Normal file
54
src/Service/IngestJobService.php
Normal file
@@ -0,0 +1,54 @@
|
||||
<?php
|
||||
|
||||
|
||||
namespace App\Service;
|
||||
|
||||
use App\Entity\IngestJob;
|
||||
use App\Entity\User;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
final class IngestJobService
|
||||
{
|
||||
public function __construct(private EntityManagerInterface $em)
|
||||
{
|
||||
}
|
||||
|
||||
public function startJob(
|
||||
string $type,
|
||||
?User $user = null,
|
||||
?Uuid $documentId = null,
|
||||
?Uuid $documentVersionId = null,
|
||||
?string $logPath = null
|
||||
): IngestJob
|
||||
{
|
||||
$job = new IngestJob($type);
|
||||
$job->setStartedBy($user);
|
||||
$job->setDocumentId($documentId);
|
||||
$job->setDocumentVersionId($documentVersionId);
|
||||
$job->setLogPath($logPath);
|
||||
|
||||
$this->em->persist($job);
|
||||
$this->em->flush();
|
||||
|
||||
return $job;
|
||||
}
|
||||
|
||||
public function markCompleted(IngestJob $job): void
|
||||
{
|
||||
$job->markCompleted();
|
||||
$this->em->flush();
|
||||
}
|
||||
|
||||
public function markFailed(IngestJob $job, string $message): void
|
||||
{
|
||||
$job->markFailed($message);
|
||||
$this->em->flush();
|
||||
}
|
||||
|
||||
public function markAborted(IngestJob $job): void
|
||||
{
|
||||
$job->markAborted();
|
||||
$this->em->flush();
|
||||
}
|
||||
}
|
||||
130
src/Service/IngestOrchestrator.php
Normal file
130
src/Service/IngestOrchestrator.php
Normal file
@@ -0,0 +1,130 @@
|
||||
<?php
|
||||
|
||||
namespace App\Service;
|
||||
|
||||
use App\Entity\DocumentVersion;
|
||||
use App\Entity\IngestJob;
|
||||
use App\Entity\User;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
|
||||
class IngestOrchestrator
|
||||
{
|
||||
public function __construct(
|
||||
private LockService $lockService,
|
||||
private IngestJobService $jobService,
|
||||
private EntityManagerInterface $em,
|
||||
) {}
|
||||
|
||||
/**
|
||||
* Startet Ingest für eine bestimmte DocumentVersion
|
||||
*/
|
||||
public function runForVersion(
|
||||
DocumentVersion $version,
|
||||
User $user,
|
||||
bool $dryRun = false
|
||||
): IngestJob {
|
||||
|
||||
if (!$this->lockService->acquire()) {
|
||||
throw new \RuntimeException('Another ingest job is already running.');
|
||||
}
|
||||
|
||||
$job = null;
|
||||
|
||||
try {
|
||||
|
||||
// --------------------------------------
|
||||
// Job anlegen
|
||||
// --------------------------------------
|
||||
$job = $this->jobService->startJob(
|
||||
IngestJob::TYPE_DOCUMENT,
|
||||
$user,
|
||||
$version->getDocument()->getId(),
|
||||
$version->getId(),
|
||||
);
|
||||
|
||||
// --------------------------------------
|
||||
// Version Status RUNNING
|
||||
// --------------------------------------
|
||||
$version->setIngestStatus(DocumentVersion::INGEST_RUNNING);
|
||||
$this->em->flush();
|
||||
|
||||
// --------------------------------------
|
||||
// Simulierter Ablauf (noch kein echter Ingest)
|
||||
// --------------------------------------
|
||||
if ($dryRun) {
|
||||
usleep(200000);
|
||||
} else {
|
||||
// Später:
|
||||
// - KnowledgeIngestService
|
||||
// - ChunkWriter
|
||||
// - VectorIngestCommand
|
||||
}
|
||||
|
||||
// --------------------------------------
|
||||
// Erfolg
|
||||
// --------------------------------------
|
||||
$version->setIngestStatus(DocumentVersion::INGEST_INDEXED);
|
||||
$this->jobService->markCompleted($job);
|
||||
$this->em->flush();
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
|
||||
if ($job) {
|
||||
$this->jobService->markFailed($job, $e->getMessage());
|
||||
}
|
||||
|
||||
$version->setIngestStatus(DocumentVersion::INGEST_FAILED);
|
||||
$this->em->flush();
|
||||
|
||||
throw $e;
|
||||
|
||||
} finally {
|
||||
$this->lockService->release();
|
||||
}
|
||||
|
||||
return $job;
|
||||
}
|
||||
|
||||
/**
|
||||
* Globaler Reindex
|
||||
*/
|
||||
public function runGlobal(User $user, bool $dryRun = false): IngestJob
|
||||
{
|
||||
if (!$this->lockService->acquire()) {
|
||||
throw new \RuntimeException('Another ingest job is already running.');
|
||||
}
|
||||
|
||||
$job = null;
|
||||
|
||||
try {
|
||||
|
||||
$job = $this->jobService->startJob(
|
||||
IngestJob::TYPE_GLOBAL_REINDEX,
|
||||
$user
|
||||
);
|
||||
|
||||
if ($dryRun) {
|
||||
usleep(200000);
|
||||
} else {
|
||||
// Später:
|
||||
// - Alle aktiven Dokumente neu ingestieren
|
||||
// - Global vector rebuild
|
||||
}
|
||||
|
||||
$this->jobService->markCompleted($job);
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
|
||||
if ($job) {
|
||||
$this->jobService->markFailed($job, $e->getMessage());
|
||||
}
|
||||
|
||||
throw $e;
|
||||
|
||||
} finally {
|
||||
$this->lockService->release();
|
||||
}
|
||||
|
||||
return $job;
|
||||
}
|
||||
}
|
||||
71
src/Service/LockService.php
Normal file
71
src/Service/LockService.php
Normal file
@@ -0,0 +1,71 @@
|
||||
<?php
|
||||
|
||||
namespace App\Service;
|
||||
|
||||
final class LockService
|
||||
{
|
||||
private $handle = null;
|
||||
private string $lockFile;
|
||||
|
||||
public function __construct(string $projectDir)
|
||||
{
|
||||
$dir = $projectDir . '/var/locks';
|
||||
if (!is_dir($dir)) {
|
||||
mkdir($dir, 0777, true);
|
||||
}
|
||||
|
||||
$this->lockFile = $dir . '/ingest.lock';
|
||||
}
|
||||
|
||||
/**
|
||||
* Gibt true zurück, wenn Lock erfolgreich gesetzt wurde.
|
||||
* Wenn false: ein anderer Prozess hält den Lock.
|
||||
*/
|
||||
public function acquire(): bool
|
||||
{
|
||||
$handle = fopen($this->lockFile, 'c+');
|
||||
if (!$handle) {
|
||||
throw new \RuntimeException('Could not open lock file.');
|
||||
}
|
||||
|
||||
// Nicht-blockierend: sofort true/false
|
||||
if (!flock($handle, LOCK_EX | LOCK_NB)) {
|
||||
fclose($handle);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Lock halten: Handle offen lassen
|
||||
$this->handle = $handle;
|
||||
|
||||
// Optional: Metainfo reinschreiben
|
||||
ftruncate($this->handle, 0);
|
||||
fwrite($this->handle, (string) time());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public function release(): void
|
||||
{
|
||||
if ($this->handle) {
|
||||
flock($this->handle, LOCK_UN);
|
||||
fclose($this->handle);
|
||||
$this->handle = null;
|
||||
}
|
||||
}
|
||||
|
||||
public function isLocked(): bool
|
||||
{
|
||||
$handle = fopen($this->lockFile, 'c+');
|
||||
if (!$handle) {
|
||||
return false;
|
||||
}
|
||||
|
||||
$locked = !flock($handle, LOCK_EX | LOCK_NB);
|
||||
if (!$locked) {
|
||||
flock($handle, LOCK_UN);
|
||||
}
|
||||
fclose($handle);
|
||||
|
||||
return $locked;
|
||||
}
|
||||
}
|
||||
164
src/Vector/VectorIndexBuilder.php
Normal file
164
src/Vector/VectorIndexBuilder.php
Normal file
@@ -0,0 +1,164 @@
|
||||
<?php
|
||||
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Vector;
|
||||
|
||||
use Symfony\Component\Process\Exception\ProcessFailedException;
|
||||
use Symfony\Component\Process\Process;
|
||||
|
||||
final class VectorIndexBuilder
|
||||
{
|
||||
private string $pythonBin;
|
||||
private string $scriptPath;
|
||||
private string $indexNdjsonPath;
|
||||
private string $vectorIndexPath;
|
||||
private int $timeoutSeconds;
|
||||
|
||||
public function __construct(
|
||||
string $projectDir,
|
||||
string $pythonBin = 'python3',
|
||||
string $relativeScriptPath = '/vector/vector_ingest.py',
|
||||
string $relativeIndexNdjsonPath = '/var/knowledge/index.ndjson',
|
||||
string $relativeVectorIndexPath = '/var/knowledge/vector.index',
|
||||
int $timeoutSeconds = 600
|
||||
)
|
||||
{
|
||||
$base = rtrim($projectDir, '/');
|
||||
|
||||
$this->pythonBin = $pythonBin;
|
||||
$this->scriptPath = $base . $relativeScriptPath;
|
||||
$this->indexNdjsonPath = $base . $relativeIndexNdjsonPath;
|
||||
$this->vectorIndexPath = $base . $relativeVectorIndexPath;
|
||||
$this->timeoutSeconds = $timeoutSeconds;
|
||||
}
|
||||
|
||||
public function getIndexNdjsonPath(): string
|
||||
{
|
||||
return $this->indexNdjsonPath;
|
||||
}
|
||||
|
||||
public function getVectorIndexPath(): string
|
||||
{
|
||||
return $this->vectorIndexPath;
|
||||
}
|
||||
|
||||
public function getScriptPath(): string
|
||||
{
|
||||
return $this->scriptPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Rebuild FAISS Index deterministisch aus index.ndjson.
|
||||
*
|
||||
* Erwartung: Python schreibt in $tmpVectorIndexPath, wir schalten atomar um.
|
||||
*
|
||||
* @param string|null $logPath Optional: stdout/stderr dorthin appenden
|
||||
*/
|
||||
public function rebuildFromNdjson(?string $logPath = null): void
|
||||
{
|
||||
if (!is_file($this->scriptPath)) {
|
||||
throw new \RuntimeException('vector_ingest.py not found at: ' . $this->scriptPath);
|
||||
}
|
||||
|
||||
if (!is_file($this->indexNdjsonPath)) {
|
||||
throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath);
|
||||
}
|
||||
|
||||
$dir = \dirname($this->vectorIndexPath);
|
||||
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
||||
throw new \RuntimeException('Unable to create vector index directory: ' . $dir);
|
||||
}
|
||||
|
||||
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
|
||||
|
||||
// Vorheriges tmp entfernen (Sicherheit)
|
||||
if (is_file($tmpVectorIndexPath)) {
|
||||
@unlink($tmpVectorIndexPath);
|
||||
}
|
||||
|
||||
// ----------------------------
|
||||
// Python-Aufruf (konservativ)
|
||||
// ----------------------------
|
||||
// Wir erwarten/standardisieren (ab jetzt) CLI-Args:
|
||||
// --index <path-to-index.ndjson>
|
||||
// --out <path-to-vector.index.tmp>
|
||||
//
|
||||
// Falls dein Python-Script aktuell andere Args hat,
|
||||
// passen wir es im nächsten Schritt konsistent an.
|
||||
$cmd = [
|
||||
$this->pythonBin,
|
||||
$this->scriptPath,
|
||||
'--index', $this->indexNdjsonPath,
|
||||
'--out', $tmpVectorIndexPath,
|
||||
];
|
||||
|
||||
$process = new Process($cmd);
|
||||
$process->setTimeout($this->timeoutSeconds);
|
||||
|
||||
$this->runProcess($process, $logPath);
|
||||
|
||||
// Python muss tmp erzeugt haben
|
||||
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
|
||||
throw new \RuntimeException('Vector index rebuild failed: tmp output missing or empty: ' . $tmpVectorIndexPath);
|
||||
}
|
||||
|
||||
// Atomarer Switch
|
||||
$this->atomicSwitch($tmpVectorIndexPath, $this->vectorIndexPath);
|
||||
}
|
||||
|
||||
// -------------------------
|
||||
// Internals
|
||||
// -------------------------
|
||||
|
||||
private function runProcess(Process $process, ?string $logPath): void
|
||||
{
|
||||
if ($logPath !== null) {
|
||||
$this->appendLog($logPath, "\n=== VectorIndexBuilder START " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n");
|
||||
$this->appendLog($logPath, "CMD: " . $process->getCommandLine() . "\n");
|
||||
}
|
||||
|
||||
$process->run(function (string $type, string $buffer) use ($logPath) {
|
||||
if ($logPath === null) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TYPE: Process::OUT / Process::ERR
|
||||
$this->appendLog($logPath, $buffer);
|
||||
});
|
||||
|
||||
if (!$process->isSuccessful()) {
|
||||
if ($logPath !== null) {
|
||||
$this->appendLog($logPath, "\n=== VectorIndexBuilder FAILED ===\n");
|
||||
$this->appendLog($logPath, "ExitCode: " . $process->getExitCode() . "\n");
|
||||
$this->appendLog($logPath, "STDERR:\n" . $process->getErrorOutput() . "\n");
|
||||
}
|
||||
|
||||
throw new ProcessFailedException($process);
|
||||
}
|
||||
|
||||
if ($logPath !== null) {
|
||||
$this->appendLog($logPath, "\n=== VectorIndexBuilder OK " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n");
|
||||
}
|
||||
}
|
||||
|
||||
private function appendLog(string $logPath, string $content): void
|
||||
{
|
||||
$dir = \dirname($logPath);
|
||||
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
|
||||
// Wenn Log nicht möglich ist: nicht hart scheitern (Build ist wichtiger)
|
||||
return;
|
||||
}
|
||||
|
||||
@file_put_contents($logPath, $content, FILE_APPEND);
|
||||
}
|
||||
|
||||
private function atomicSwitch(string $tmp, string $final): void
|
||||
{
|
||||
if (!rename($tmp, $final)) {
|
||||
@unlink($tmp);
|
||||
throw new \RuntimeException('Atomic switch failed for vector.index');
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2,88 +2,125 @@
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Argument handling
|
||||
# Argument parsing
|
||||
# ---------------------------------------------------------
|
||||
if len(sys.argv) < 3:
|
||||
print("ERROR: Missing arguments (vectorDir, knowledgeDir)")
|
||||
sys.exit(2)
|
||||
parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
|
||||
|
||||
vector_dir = Path(sys.argv[1]).resolve()
|
||||
knowledge_dir = Path(sys.argv[2]).resolve()
|
||||
parser.add_argument("--index", required=True, help="Path to index.ndjson")
|
||||
parser.add_argument("--out", required=True, help="Path to output vector.index")
|
||||
parser.add_argument("--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model")
|
||||
|
||||
index_json = knowledge_dir / "index.json"
|
||||
index_out = vector_dir / "vector.index"
|
||||
meta_out = vector_dir / "vector_meta.json"
|
||||
args = parser.parse_args()
|
||||
|
||||
index_path = Path(args.index).resolve()
|
||||
out_path = Path(args.out).resolve()
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Dependency checks
|
||||
# ---------------------------------------------------------
|
||||
try:
|
||||
import faiss # noqa
|
||||
import faiss
|
||||
except Exception:
|
||||
print("ERROR: Python module 'faiss' not found.")
|
||||
sys.exit(10)
|
||||
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer # noqa
|
||||
from sentence_transformers import SentenceTransformer
|
||||
except Exception:
|
||||
print("ERROR: Python module 'sentence-transformers' not found.")
|
||||
sys.exit(11)
|
||||
|
||||
import numpy as np
|
||||
import faiss
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# File checks
|
||||
# ---------------------------------------------------------
|
||||
if not index_json.is_file():
|
||||
print(f"ERROR: index.json not found at {index_json}")
|
||||
if not index_path.is_file():
|
||||
print(f"ERROR: index.ndjson not found at {index_path}")
|
||||
sys.exit(20)
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Load chunks from index.json
|
||||
# Load model
|
||||
# ---------------------------------------------------------
|
||||
with open(index_json, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
print(f"Loading embedding model: {args.model}")
|
||||
model = SentenceTransformer(args.model)
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Streaming read NDJSON
|
||||
# ---------------------------------------------------------
|
||||
texts = []
|
||||
ids = []
|
||||
|
||||
for entry in data:
|
||||
if "file" not in entry:
|
||||
continue
|
||||
print("Reading NDJSON...")
|
||||
|
||||
chunk_path = knowledge_dir / "chunks" / entry["file"]
|
||||
if not chunk_path.is_file():
|
||||
continue
|
||||
with open(index_path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
text = chunk_path.read_text(encoding="utf-8").strip()
|
||||
if not text:
|
||||
continue
|
||||
try:
|
||||
entry = json.loads(line)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
texts.append(text)
|
||||
ids.append(entry["file"])
|
||||
text = entry.get("text")
|
||||
chunk_id = entry.get("chunk_id")
|
||||
|
||||
if not text or not chunk_id:
|
||||
continue
|
||||
|
||||
texts.append(text)
|
||||
ids.append(chunk_id)
|
||||
|
||||
if not texts:
|
||||
print("ERROR: No chunks loaded from index.json")
|
||||
print("ERROR: No valid chunks found in index.ndjson")
|
||||
sys.exit(21)
|
||||
|
||||
print(f"Loaded {len(texts)} chunks.")
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Build vector index
|
||||
# Build embeddings
|
||||
# ---------------------------------------------------------
|
||||
model = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
embeddings = model.encode(texts, normalize_embeddings=True)
|
||||
print("Encoding embeddings...")
|
||||
embeddings = model.encode(
|
||||
texts,
|
||||
normalize_embeddings=True,
|
||||
show_progress_bar=True,
|
||||
batch_size=64
|
||||
)
|
||||
|
||||
embeddings = np.array(embeddings).astype("float32")
|
||||
|
||||
dim = embeddings.shape[1]
|
||||
print(f"Embedding dimension: {dim}")
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Build FAISS index
|
||||
# ---------------------------------------------------------
|
||||
print("Building FAISS index...")
|
||||
index = faiss.IndexFlatIP(dim)
|
||||
index.add(embeddings)
|
||||
|
||||
faiss.write_index(index, str(index_out))
|
||||
# Ensure output directory exists
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(meta_out, "w", encoding="utf-8") as f:
|
||||
print(f"Writing FAISS index to {out_path}")
|
||||
faiss.write_index(index, str(out_path))
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Write ID mapping meta
|
||||
# ---------------------------------------------------------
|
||||
meta_path = out_path.with_suffix(".meta.json")
|
||||
|
||||
with open(meta_path, "w", encoding="utf-8") as f:
|
||||
json.dump(ids, f)
|
||||
|
||||
print(f"Indexed {len(ids)} chunks.")
|
||||
print(f"Indexed {len(ids)} chunks successfully.")
|
||||
sys.exit(0)
|
||||
|
||||
Reference in New Issue
Block a user