stash light

This commit is contained in:
team 1
2026-02-12 10:03:52 +01:00
parent 5b650a8f28
commit 0bb0c0b42f
51 changed files with 6864 additions and 72 deletions

View File

@@ -0,0 +1,108 @@
<?php
namespace App\Command;
use App\Entity\User;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Helper\QuestionHelper;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Question\ChoiceQuestion;
use Symfony\Component\Console\Question\Question;
use Symfony\Component\PasswordHasher\Hasher\UserPasswordHasherInterface;
#[AsCommand(
name: 'mto:agent:user:create',
description: 'Creates a new admin user'
)]
class CreateUserCommand extends Command
{
public function __construct(
private EntityManagerInterface $em,
private UserPasswordHasherInterface $passwordHasher
)
{
parent::__construct();
}
protected function execute(InputInterface $input, OutputInterface $output): int
{
/** @var QuestionHelper $helper */
$helper = $this->getHelper('question');
// =============================
// Email
// =============================
$emailQuestion = new Question('E-Mail: ');
$emailQuestion->setValidator(function ($value) {
if (!filter_var($value, FILTER_VALIDATE_EMAIL)) {
throw new \RuntimeException('Invalid email address.');
}
return strtolower(trim($value));
});
$email = $helper->ask($input, $output, $emailQuestion);
// Prüfen ob User existiert
$existingUser = $this->em
->getRepository(User::class)
->findOneBy(['email' => $email]);
if ($existingUser) {
$output->writeln('<error>User already exists.</error>');
return Command::FAILURE;
}
// =============================
// Passwort
// =============================
$passwordQuestion = new Question('Password: ');
$passwordQuestion->setHidden(true);
$passwordQuestion->setHiddenFallback(false);
$plainPassword = $helper->ask($input, $output, $passwordQuestion);
if (strlen($plainPassword) < 8) {
$output->writeln('<error>Password must be at least 8 characters.</error>');
return Command::FAILURE;
}
// =============================
// Rolle auswählen
// =============================
$roleQuestion = new ChoiceQuestion(
'Select role:',
[
'ROLE_SUPER_ADMIN',
'ROLE_KNOWLEDGE_ADMIN',
'ROLE_EDITOR',
'ROLE_USER',
],
0
);
$role = $helper->ask($input, $output, $roleQuestion);
// =============================
// User erzeugen
// =============================
$user = new User();
$user->setEmail($email);
$user->setRoles([$role]);
$hashedPassword = $this->passwordHasher->hashPassword($user, $plainPassword);
$user->setPassword($hashedPassword);
$this->em->persist($user);
$this->em->flush();
$output->writeln('<info>User created successfully.</info>');
$output->writeln('Email: ' . $email);
$output->writeln('Role: ' . $role);
return Command::SUCCESS;
}
}

View File

@@ -0,0 +1,17 @@
<?php
namespace App\Controller\Admin;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\Routing\Attribute\Route;
final class DashboardController extends AbstractController
{
#[Route('/admin', name: 'admin_dashboard')]
public function index(): Response
{
return $this->render('admin/dashboard/index.html.twig');
}
}

View File

@@ -0,0 +1,217 @@
<?php
namespace App\Controller\Admin;
use App\Entity\Document;
use App\Entity\DocumentVersion;
use App\Service\DocumentService;
use App\Service\IngestOrchestrator;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\HttpFoundation\RedirectResponse;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\Routing\Attribute\Route;
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
use Symfony\Component\Uid\Uuid;
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\File\Exception\FileException;
#[Route('/admin/documents')]
class DocumentController extends AbstractController
{
#[Route('', name: 'admin_documents')]
public function index(EntityManagerInterface $em): Response
{
$documents = $em->getRepository(Document::class)
->findBy([], ['createdAt' => 'DESC']);
return $this->render('admin/document/index.html.twig', [
'documents' => $documents
]);
}
#[Route(
'/{id}',
name: 'admin_document_show',
requirements: ['id' => '[0-9a-fA-F\-]{36}']
)]
public function show(string $id, EntityManagerInterface $em): Response
{
try {
$uuid = Uuid::fromString($id);
} catch (\Exception $e) {
throw new NotFoundHttpException();
}
$document = $em->getRepository(Document::class)->find($uuid);
if (!$document) {
throw new NotFoundHttpException();
}
return $this->render('admin/document/show.html.twig', [
'document' => $document
]);
}
#[Route('/new', name: 'admin_document_new')]
public function new(Request $request, DocumentService $documentService): Response
{
if ($request->isMethod('POST')) {
$title = $request->request->get('title');
$file = $request->files->get('file');
if (!$file || !$title) {
$this->addFlash('error', 'Titel und Datei sind erforderlich.');
return $this->redirectToRoute('admin_document_new');
}
$uploadDir = $this->getParameter('kernel.project_dir') . '/var/knowledge/uploads';
if (!is_dir($uploadDir)) {
mkdir($uploadDir, 0777, true);
}
$newFilename = uniqid() . '_' . $file->getClientOriginalName();
try {
$file->move($uploadDir, $newFilename);
} catch (FileException $e) {
throw new \RuntimeException('File upload failed.');
}
$filePath = $uploadDir . '/' . $newFilename;
$documentService->createDocument(
$title,
$filePath,
$this->getUser()
);
return $this->redirectToRoute('admin_documents');
}
return $this->render('admin/document/new.html.twig');
}
#[Route('/{id}/version/new', name: 'admin_document_version_new', requirements: ['id' => '[0-9a-fA-F\-]{36}'])]
public function newVersion(
string $id,
Request $request,
EntityManagerInterface $em,
DocumentService $documentService
): Response {
$document = $em->getRepository(Document::class)->find($id);
if (!$document) {
throw $this->createNotFoundException();
}
if ($request->isMethod('POST')) {
$file = $request->files->get('file');
if (!$file) {
$this->addFlash('error', 'Datei ist erforderlich.');
return $this->redirectToRoute('admin_document_version_new', ['id' => $id]);
}
$uploadDir = $this->getParameter('kernel.project_dir') . '/var/knowledge/uploads';
if (!is_dir($uploadDir)) {
mkdir($uploadDir, 0777, true);
}
$newFilename = uniqid() . '_' . $file->getClientOriginalName();
try {
$file->move($uploadDir, $newFilename);
} catch (FileException $e) {
throw new \RuntimeException('File upload failed.');
}
$filePath = $uploadDir . '/' . $newFilename;
$documentService->addVersion(
$document,
$filePath,
$this->getUser()
);
return $this->redirectToRoute('admin_document_show', ['id' => $id]);
}
return $this->render('admin/document/new_version.html.twig', [
'document' => $document
]);
}
#[Route(
'/version/{versionId}/activate',
name: 'admin_document_version_activate',
requirements: ['versionId' => '[0-9a-fA-F\-]{36}'],
methods: ['POST']
)]
public function activateVersion(
string $versionId,
Request $request,
EntityManagerInterface $em,
DocumentService $documentService
): RedirectResponse {
if (!$this->isCsrfTokenValid('activate_version', $request->request->get('_token'))) {
throw $this->createAccessDeniedException();
}
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
if (!$version) {
throw $this->createNotFoundException();
}
$documentService->activateVersion($version);
return $this->redirectToRoute('admin_document_show', [
'id' => $version->getDocument()->getId()
]);
}
#[Route(
'/version/{versionId}/ingest',
name: 'admin_document_version_ingest',
methods: ['POST'],
requirements: ['versionId' => '[0-9a-fA-F\-]{36}']
)]
public function ingestVersion(
string $versionId,
Request $request,
EntityManagerInterface $em,
IngestOrchestrator $orchestrator
): RedirectResponse {
if (!$this->isCsrfTokenValid('ingest_version', $request->request->get('_token'))) {
throw $this->createAccessDeniedException();
}
$version = $em->getRepository(DocumentVersion::class)->find($versionId);
if (!$version) {
throw $this->createNotFoundException();
}
$orchestrator->runForVersion(
$version,
$this->getUser(),
true // erstmal DryRun
);
return $this->redirectToRoute('admin_document_show', [
'id' => $version->getDocument()->getId()
]);
}
}

View File

@@ -0,0 +1,57 @@
<?php
namespace App\Controller\Admin;
use App\Entity\IngestJob;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
use Symfony\Component\Routing\Attribute\Route;
use App\Ingest\IngestFlow;
use Symfony\Component\HttpFoundation\RedirectResponse;
#[Route('/admin/jobs')]
class IngestJobController extends AbstractController
{
#[Route('', name: 'admin_jobs')]
public function index(EntityManagerInterface $em): Response
{
$jobs = $em->getRepository(IngestJob::class)
->findBy([], ['startedAt' => 'DESC']);
return $this->render('admin/job/index.html.twig', [
'jobs' => $jobs
]);
}
#[Route(
'/{id}',
name: 'admin_job_show',
requirements: ['id' => '[0-9a-fA-F\-]{36}']
)]
public function show(string $id, EntityManagerInterface $em): Response
{
$job = $em->getRepository(IngestJob::class)->find($id);
if (!$job) {
throw new NotFoundHttpException();
}
return $this->render('admin/job/show.html.twig', [
'job' => $job
]);
}
#[Route('/global-reindex', name: 'admin_global_reindex', methods: ['POST'])]
public function globalReindex(
IngestFlow $flow
): RedirectResponse {
$this->denyAccessUnlessGranted('ROLE_SUPER_ADMIN');
$flow->globalReindex($this->getUser());
return $this->redirectToRoute('admin_jobs');
}
}

View File

@@ -0,0 +1,33 @@
<?php
namespace App\Controller\Admin;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\Routing\Attribute\Route;
use Symfony\Component\Security\Http\Authentication\AuthenticationUtils;
final class SecurityController extends AbstractController
{
#[Route('/admin/login', name: 'admin_login')]
public function login(AuthenticationUtils $authUtils): Response
{
// Wenn bereits eingeloggt → direkt ins Dashboard
if ($this->getUser()) {
return $this->redirectToRoute('admin_dashboard');
}
return $this->render('admin/security/login.html.twig', [
'last_username' => $authUtils->getLastUsername(),
'error' => $authUtils->getLastAuthenticationError(),
]);
}
#[Route('/admin/logout', name: 'admin_logout')]
public function logout(): void
{
// Symfony interceptet diese Route, daher bleibt sie leer.
throw new \LogicException('This method can be blank - it will be intercepted by the logout key on your firewall.');
}
}

110
src/Entity/Document.php Normal file
View File

@@ -0,0 +1,110 @@
<?php
namespace App\Entity;
use Doctrine\ORM\Mapping as ORM;
use Symfony\Component\Uid\Uuid;
use Doctrine\Common\Collections\ArrayCollection;
use Doctrine\Common\Collections\Collection;
#[ORM\Entity]
class Document
{
public const STATUS_ACTIVE = 'ACTIVE';
public const STATUS_ARCHIVED = 'ARCHIVED';
#[ORM\Id]
#[ORM\Column(type: 'uuid', unique: true)]
private Uuid $id;
#[ORM\Column(length: 255)]
private string $title;
#[ORM\Column(length: 20)]
private string $status = self::STATUS_ACTIVE;
#[ORM\ManyToOne]
#[ORM\JoinColumn(nullable: false)]
private User $createdBy;
#[ORM\Column]
private \DateTimeImmutable $createdAt;
#[ORM\OneToMany(mappedBy: 'document', targetEntity: DocumentVersion::class, cascade: ['persist'], orphanRemoval: true)]
private Collection $versions;
#[ORM\ManyToOne]
private ?DocumentVersion $currentVersion = null;
public function __construct()
{
$this->id = Uuid::v4();
$this->createdAt = new \DateTimeImmutable();
$this->versions = new ArrayCollection();
}
public function getId(): Uuid
{
return $this->id;
}
public function getCreatedAt(): \DateTimeImmutable
{
return $this->createdAt;
}
public function getTitle(): string
{
return $this->title;
}
public function setTitle(string $title): static
{
$this->title = $title;
return $this;
}
public function getStatus(): string
{
return $this->status;
}
public function archive(): void
{
$this->status = self::STATUS_ARCHIVED;
}
public function getCreatedBy(): User
{
return $this->createdBy;
}
public function setCreatedBy(User $user): static
{
$this->createdBy = $user;
return $this;
}
public function getVersions(): Collection
{
return $this->versions;
}
public function addVersion(DocumentVersion $version): void
{
$this->versions->add($version);
$version->setDocument($this);
}
public function setCurrentVersion(?DocumentVersion $version): void
{
$this->currentVersion = $version;
}
public function getCurrentVersion(): ?DocumentVersion
{
return $this->currentVersion;
}
}

View File

@@ -0,0 +1,184 @@
<?php
namespace App\Entity;
use Doctrine\ORM\Mapping as ORM;
use Symfony\Component\Uid\Uuid;
use App\Repository\DocumentVersionRepository;
#[ORM\Entity(repositoryClass: DocumentVersionRepository::class)]
class DocumentVersion
{
public const INGEST_PENDING = 'PENDING';
public const INGEST_RUNNING = 'RUNNING';
public const INGEST_INDEXED = 'INDEXED';
public const INGEST_FAILED = 'FAILED';
public const INGEST_STATUSES = [
self::INGEST_PENDING,
self::INGEST_RUNNING,
self::INGEST_INDEXED,
self::INGEST_FAILED,
];
#[ORM\Id]
#[ORM\Column(type: 'uuid', unique: true)]
private Uuid $id;
#[ORM\ManyToOne(inversedBy: 'versions')]
#[ORM\JoinColumn(nullable: false)]
private Document $document;
#[ORM\Column]
private int $versionNumber;
#[ORM\Column(length: 255)]
private string $filePath;
#[ORM\Column(length: 64)]
private string $checksum;
#[ORM\Column(length: 20)]
private string $ingestStatus = self::INGEST_PENDING;
#[ORM\ManyToOne]
#[ORM\JoinColumn(nullable: false)]
private User $createdBy;
#[ORM\Column]
private \DateTimeImmutable $createdAt;
#[ORM\Column]
private bool $isActive = false;
public function __construct()
{
$this->id = Uuid::v4();
$this->createdAt = new \DateTimeImmutable();
}
// =========================
// ID
// =========================
public function getId(): Uuid
{
return $this->id;
}
// =========================
// Document Relation
// =========================
public function setDocument(Document $document): void
{
$this->document = $document;
}
public function getDocument(): Document
{
return $this->document;
}
// =========================
// Version Number
// =========================
public function getVersionNumber(): int
{
return $this->versionNumber;
}
public function setVersionNumber(int $number): void
{
$this->versionNumber = $number;
}
// =========================
// File Path
// =========================
public function setFilePath(string $path): void
{
$this->filePath = $path;
}
public function getFilePath(): string
{
return $this->filePath;
}
// =========================
// Checksum
// =========================
public function setChecksum(string $checksum): void
{
$this->checksum = $checksum;
}
public function getChecksum(): string
{
return $this->checksum;
}
// =========================
// Ingest Status
// =========================
public function setIngestStatus(string $status): void
{
if (!in_array($status, self::INGEST_STATUSES, true)) {
throw new \InvalidArgumentException('Invalid ingest status.');
}
$this->ingestStatus = $status;
}
public function getIngestStatus(): string
{
return $this->ingestStatus;
}
public function isIndexed(): bool
{
return $this->ingestStatus === self::INGEST_INDEXED;
}
// =========================
// Created By
// =========================
public function setCreatedBy(User $user): void
{
$this->createdBy = $user;
}
public function getCreatedBy(): User
{
return $this->createdBy;
}
// =========================
// Created At
// =========================
public function getCreatedAt(): \DateTimeImmutable
{
return $this->createdAt;
}
// =========================
// Active Flag
// =========================
public function setActive(bool $active): void
{
$this->isActive = $active;
}
public function isActive(): bool
{
return $this->isActive;
}
}

101
src/Entity/IngestJob.php Normal file
View File

@@ -0,0 +1,101 @@
<?php
namespace App\Entity;
use Doctrine\ORM\Mapping as ORM;
use Symfony\Component\Uid\Uuid;
#[ORM\Entity]
class IngestJob
{
public const TYPE_DOCUMENT = 'DOCUMENT';
public const TYPE_GLOBAL_REINDEX = 'GLOBAL_REINDEX';
public const STATUS_RUNNING = 'RUNNING';
public const STATUS_COMPLETED = 'COMPLETED';
public const STATUS_FAILED = 'FAILED';
public const STATUS_ABORTED = 'ABORTED';
#[ORM\Id]
#[ORM\Column(type: 'uuid', unique: true)]
private Uuid $id;
#[ORM\Column(length: 30)]
private string $type;
#[ORM\Column(length: 20)]
private string $status = self::STATUS_RUNNING;
#[ORM\Column(type: 'uuid', nullable: true)]
private ?Uuid $documentId = null;
#[ORM\Column(type: 'uuid', nullable: true)]
private ?Uuid $documentVersionId = null;
#[ORM\Column]
private \DateTimeImmutable $startedAt;
#[ORM\Column(nullable: true)]
private ?\DateTimeImmutable $finishedAt = null;
#[ORM\ManyToOne]
#[ORM\JoinColumn(nullable: true)]
private ?User $startedBy = null;
#[ORM\Column(nullable: true)]
private ?string $logPath = null;
#[ORM\Column(type: 'text', nullable: true)]
private ?string $errorMessage = null;
public function __construct(string $type)
{
$this->id = Uuid::v4();
$this->type = $type;
$this->startedAt = new \DateTimeImmutable();
$this->status = self::STATUS_RUNNING;
}
public function getId(): Uuid { return $this->id; }
public function getType(): string { return $this->type; }
public function getStatus(): string { return $this->status; }
public function setDocumentId(?Uuid $id): void { $this->documentId = $id; }
public function getDocumentId(): ?Uuid { return $this->documentId; }
public function setDocumentVersionId(?Uuid $id): void { $this->documentVersionId = $id; }
public function getDocumentVersionId(): ?Uuid { return $this->documentVersionId; }
public function setStartedBy(?User $user): void { $this->startedBy = $user; }
public function getStartedBy(): ?User { return $this->startedBy; }
public function setLogPath(?string $path): void { $this->logPath = $path; }
public function getLogPath(): ?string { return $this->logPath; }
public function getStartedAt(): \DateTimeImmutable { return $this->startedAt; }
public function getFinishedAt(): ?\DateTimeImmutable { return $this->finishedAt; }
public function markCompleted(): void
{
$this->status = self::STATUS_COMPLETED;
$this->finishedAt = new \DateTimeImmutable();
}
public function markFailed(string $message): void
{
$this->status = self::STATUS_FAILED;
$this->errorMessage = $message;
$this->finishedAt = new \DateTimeImmutable();
}
public function markAborted(): void
{
$this->status = self::STATUS_ABORTED;
$this->finishedAt = new \DateTimeImmutable();
}
public function getErrorMessage(): ?string
{
return $this->errorMessage;
}
}

161
src/Entity/User.php Normal file
View File

@@ -0,0 +1,161 @@
<?php
namespace App\Entity;
use App\Repository\UserRepository;
use Doctrine\ORM\Mapping as ORM;
use Symfony\Component\Security\Core\User\UserInterface;
use Symfony\Component\Security\Core\User\PasswordAuthenticatedUserInterface;
use Symfony\Component\Uid\Uuid;
#[ORM\Entity(repositoryClass: UserRepository::class)]
#[ORM\HasLifecycleCallbacks]
class User implements UserInterface, PasswordAuthenticatedUserInterface
{
#[ORM\Id]
#[ORM\Column(type: 'uuid', unique: true)]
private Uuid $id;
#[ORM\Column(length: 180, unique: true)]
private string $email;
#[ORM\Column]
private string $password;
#[ORM\Column(type: 'json')]
private array $roles = [];
#[ORM\Column]
private bool $isActive = true;
#[ORM\Column]
private \DateTimeImmutable $createdAt;
#[ORM\Column(nullable: true)]
private ?\DateTimeImmutable $updatedAt = null;
public function __construct()
{
$this->id = Uuid::v4();
$this->createdAt = new \DateTimeImmutable();
$this->roles = ['ROLE_USER'];
}
// =========================
// Security Identifier
// =========================
public function getUserIdentifier(): string
{
return $this->email;
}
// Symfony < 6 compatibility (optional)
public function getUsername(): string
{
return $this->email;
}
// =========================
// ID
// =========================
public function getId(): Uuid
{
return $this->id;
}
// =========================
// Email
// =========================
public function getEmail(): string
{
return $this->email;
}
public function setEmail(string $email): static
{
$this->email = strtolower($email);
return $this;
}
// =========================
// Password
// =========================
public function getPassword(): string
{
return $this->password;
}
public function setPassword(string $password): static
{
$this->password = $password;
return $this;
}
// =========================
// Roles
// =========================
public function getRoles(): array
{
$roles = $this->roles;
// Jeder User hat mindestens ROLE_USER
$roles[] = 'ROLE_USER';
return array_unique($roles);
}
public function setRoles(array $roles): static
{
$this->roles = $roles;
return $this;
}
// =========================
// Active Status
// =========================
public function isActive(): bool
{
return $this->isActive;
}
public function setIsActive(bool $isActive): static
{
$this->isActive = $isActive;
return $this;
}
// =========================
// Timestamps
// =========================
public function getCreatedAt(): \DateTimeImmutable
{
return $this->createdAt;
}
public function getUpdatedAt(): ?\DateTimeImmutable
{
return $this->updatedAt;
}
#[ORM\PreUpdate]
public function updateTimestamp(): void
{
$this->updatedAt = new \DateTimeImmutable();
}
// =========================
// Erase Credentials (Pflicht)
// =========================
public function eraseCredentials(): void
{
// Falls später sensible Daten gespeichert werden
}
}

View File

@@ -0,0 +1,95 @@
<?php
declare(strict_types=1);
namespace App\Index;
/**
* Beschreibt die "Struktur" des Index (nicht den Inhalt).
* Diese Werte müssen bei lokalem Ingest mit index_meta.json kompatibel sein,
* sonst muss ein Global Reindex erzwungen werden.
*/
final class IndexConfiguration
{
public function __construct(
private readonly int $chunkSize,
private readonly int $chunkOverlap,
private readonly string $embeddingModel,
private readonly int $embeddingDimension,
private readonly int $scoringVersion,
private readonly string $indexFormat = 'ndjson', // bindend: 'ndjson'
private readonly string $vectorBackend = 'faiss', // informativ
)
{
if ($this->chunkSize <= 0) {
throw new \InvalidArgumentException('chunkSize must be > 0');
}
if ($this->chunkOverlap < 0) {
throw new \InvalidArgumentException('chunkOverlap must be >= 0');
}
if ($this->chunkOverlap >= $this->chunkSize) {
throw new \InvalidArgumentException('chunkOverlap must be < chunkSize');
}
if ($this->embeddingDimension <= 0) {
throw new \InvalidArgumentException('embeddingDimension must be > 0');
}
if ($this->scoringVersion <= 0) {
throw new \InvalidArgumentException('scoringVersion must be > 0');
}
if ($this->indexFormat !== 'ndjson') {
throw new \InvalidArgumentException('indexFormat must be "ndjson"');
}
}
public function getChunkSize(): int
{
return $this->chunkSize;
}
public function getChunkOverlap(): int
{
return $this->chunkOverlap;
}
public function getEmbeddingModel(): string
{
return $this->embeddingModel;
}
public function getEmbeddingDimension(): int
{
return $this->embeddingDimension;
}
public function getScoringVersion(): int
{
return $this->scoringVersion;
}
public function getIndexFormat(): string
{
return $this->indexFormat;
}
public function getVectorBackend(): string
{
return $this->vectorBackend;
}
/**
* Canonical representation: nur strukturelle Felder (ohne created_at, index_version).
*/
public function toStructureArray(): array
{
return [
'embedding_model' => $this->embeddingModel,
'embedding_dimension' => $this->embeddingDimension,
'chunk_size' => $this->chunkSize,
'chunk_overlap' => $this->chunkOverlap,
'scoring_version' => $this->scoringVersion,
'index_format' => $this->indexFormat,
'vector_backend' => $this->vectorBackend,
];
}
}

View File

@@ -0,0 +1,206 @@
<?php
declare(strict_types=1);
namespace App\Index;
final class IndexMetaManager
{
private string $metaPath;
public function __construct(
string $projectDir,
private readonly IndexConfiguration $config,
string $relativeMetaPath = '/var/knowledge/index_meta.json'
)
{
$this->metaPath = rtrim($projectDir, '/') . $relativeMetaPath;
}
public function getMetaPath(): string
{
return $this->metaPath;
}
/**
* Gibt null zurück, wenn noch kein Meta existiert (frisches System).
*
* @return array<string,mixed>|null
*/
public function readMeta(): ?array
{
if (!is_file($this->metaPath)) {
return null;
}
$raw = file_get_contents($this->metaPath);
if ($raw === false) {
throw new \RuntimeException('Unable to read index_meta.json');
}
$data = json_decode($raw, true);
if (!is_array($data)) {
throw new \RuntimeException('index_meta.json is invalid JSON');
}
return $data;
}
/**
* Erstellt Meta, falls nicht vorhanden (z. B. nach erstem Global Reindex).
* Überschreibt NICHT automatisch, wenn vorhanden.
*
* @return array<string,mixed>
*/
public function createInitialMetaIfMissing(): array
{
$existing = $this->readMeta();
if ($existing !== null) {
return $existing;
}
$meta = $this->buildMetaPayload(indexVersion: 1);
$this->atomicWriteJson($meta);
return $meta;
}
/**
* Guardrail: Prüft, ob die aktuelle Config kompatibel zur gespeicherten Meta ist.
* Wenn nicht: IndexStructureChangedException -> Global Reindex erzwingen.
*/
public function validateAgainstCurrent(): void
{
$meta = $this->readMeta();
// Wenn noch kein Meta existiert, lassen wir lokale Ingests NICHT einfach laufen.
// Governance: Erst Global Reindex erzeugt Meta sauber.
if ($meta === null) {
throw new IndexStructureChangedException(
'index_meta.json missing. Please run a Global Reindex to initialize index structure metadata.',
['reason' => 'missing_meta']
);
}
$expected = $this->config->toStructureArray();
$diff = $this->diffStructure($meta, $expected);
if ($diff !== []) {
throw new IndexStructureChangedException(
'Index structure changed. Global Reindex required.',
$diff
);
}
}
/**
* Wird beim Global Reindex verwendet:
* - index_version++ (oder initialisieren)
* - Meta atomar schreiben
*
* @return array<string,mixed> new meta
*/
public function writeMetaForGlobalReindex(): array
{
$current = $this->readMeta();
$nextVersion = 1;
if (is_array($current) && isset($current['index_version']) && is_int($current['index_version'])) {
$nextVersion = $current['index_version'] + 1;
}
$meta = $this->buildMetaPayload($nextVersion);
$this->atomicWriteJson($meta);
return $meta;
}
public function getConfig(): IndexConfiguration
{
return $this->config;
}
// -------------------------
// Internals
// -------------------------
/**
* @return array<string,mixed>
*/
private function buildMetaPayload(int $indexVersion): array
{
$structure = $this->config->toStructureArray();
return [
'index_version' => $indexVersion,
'created_at' => (new \DateTimeImmutable())->format(DATE_ATOM),
'embedding_model' => $structure['embedding_model'],
'embedding_dimension' => $structure['embedding_dimension'],
'chunk_size' => $structure['chunk_size'],
'chunk_overlap' => $structure['chunk_overlap'],
'scoring_version' => $structure['scoring_version'],
'index_format' => $structure['index_format'],
'vector_backend' => $structure['vector_backend'],
];
}
/**
* @param array<string,mixed> $meta
* @param array<string,mixed> $expected
* @return array<string,mixed> diff
*/
private function diffStructure(array $meta, array $expected): array
{
$diff = [];
foreach ($expected as $key => $value) {
$actual = $meta[$key] ?? null;
if ($actual !== $value) {
$diff[$key] = [
'expected' => $value,
'actual' => $actual,
];
}
}
// index_format ist zwingend
if (($meta['index_format'] ?? null) !== 'ndjson') {
$diff['index_format'] = [
'expected' => 'ndjson',
'actual' => $meta['index_format'] ?? null,
];
}
return $diff;
}
/**
* @param array<string,mixed> $payload
*/
private function atomicWriteJson(array $payload): void
{
$dir = \dirname($this->metaPath);
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
throw new \RuntimeException('Unable to create directory: ' . $dir);
}
$tmp = $this->metaPath . '.tmp';
$json = json_encode($payload, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);
if ($json === false) {
throw new \RuntimeException('Unable to encode index_meta.json');
}
if (file_put_contents($tmp, $json . PHP_EOL) === false) {
throw new \RuntimeException('Unable to write temp meta file');
}
// atomarer Switch
if (!rename($tmp, $this->metaPath)) {
@unlink($tmp);
throw new \RuntimeException('Unable to switch meta file atomically');
}
}
}

View File

@@ -0,0 +1,34 @@
<?php
declare(strict_types=1);
namespace App\Index;
/**
* Wird geworfen, wenn lokale Ingests nicht mehr kompatibel sind
* und ein Global Reindex erzwungen werden muss.
*/
final class IndexStructureChangedException extends \RuntimeException
{
/**
* @param array<string,mixed> $diff
*/
public function __construct(
string $message,
private readonly array $diff = [],
int $code = 0,
?\Throwable $previous = null
)
{
parent::__construct($message, $code, $previous);
}
/**
* @return array<string,mixed>
*/
public function getDiff(): array
{
return $this->diff;
}
}

182
src/Ingest/IngestFlow.php Normal file
View File

@@ -0,0 +1,182 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
use App\Entity\DocumentVersion;
use App\Entity\IngestJob;
use App\Entity\User;
use App\Index\IndexMetaManager;
use App\Index\IndexStructureChangedException;
use App\Knowledge\ChunkManager;
use App\Service\IngestJobService;
use App\Service\LockService;
use App\Knowledge\Ingest\KnowledgeIngestService;
use App\Vector\VectorIndexBuilder;
use Doctrine\ORM\EntityManagerInterface;
final class IngestFlow
{
public function __construct(
private readonly LockService $lockService,
private readonly IngestJobService $jobService,
private readonly KnowledgeIngestService $knowledgeIngestService,
private readonly ChunkManager $chunkManager,
private readonly VectorIndexBuilder $vectorBuilder,
private readonly IndexMetaManager $metaManager,
private readonly EntityManagerInterface $em,
) {
}
// ============================================================
// LOCAL DOCUMENT INGEST
// ============================================================
public function ingestDocumentVersion(
DocumentVersion $version,
User $user
): IngestJob {
if (!$this->lockService->acquire()) {
throw new \RuntimeException('Another ingest job is already running.');
}
$job = null;
try {
$job = $this->jobService->startJob(
IngestJob::TYPE_DOCUMENT,
$user,
$version->getDocument()->getId(),
$version->getId(),
);
$version->setIngestStatus(DocumentVersion::INGEST_RUNNING);
$this->em->flush();
// --------------------------------------------------
// Guardrail: Struktur prüfen
// --------------------------------------------------
$this->metaManager->validateAgainstCurrent();
// --------------------------------------------------
// Alte Chunks dieses Dokuments entfernen (Streaming)
// --------------------------------------------------
$this->chunkManager->compactByDocument(
$version->getDocument()->getId()
);
// --------------------------------------------------
// Neue Chunks erzeugen
// --------------------------------------------------
$records = $this->knowledgeIngestService
->buildChunkRecords($version);
// --------------------------------------------------
// Append in NDJSON
// --------------------------------------------------
$this->chunkManager->appendChunks($records);
// --------------------------------------------------
// FAISS komplett neu bauen (deterministisch)
// --------------------------------------------------
$logPath = $job->getLogPath();
$this->vectorBuilder->rebuildFromNdjson($logPath);
// --------------------------------------------------
// Erfolg
// --------------------------------------------------
$version->setIngestStatus(DocumentVersion::INGEST_INDEXED);
$this->jobService->markCompleted($job);
$this->em->flush();
} catch (IndexStructureChangedException $e) {
if ($job) {
$this->jobService->markFailed($job, $e->getMessage());
}
$version->setIngestStatus(DocumentVersion::INGEST_FAILED);
$this->em->flush();
throw $e;
} catch (\Throwable $e) {
if ($job) {
$this->jobService->markFailed($job, $e->getMessage());
}
$version->setIngestStatus(DocumentVersion::INGEST_FAILED);
$this->em->flush();
throw $e;
} finally {
$this->lockService->release();
}
return $job;
}
// ============================================================
// GLOBAL REINDEX
// ============================================================
public function globalReindex(User $user): IngestJob
{
if (!$this->lockService->acquire()) {
throw new \RuntimeException('Another ingest job is already running.');
}
$job = null;
try {
$job = $this->jobService->startJob(
IngestJob::TYPE_GLOBAL_REINDEX,
$user
);
// --------------------------------------------------
// Alle aktiven Dokumente neu ingestieren
// --------------------------------------------------
$allRecords = $this->knowledgeIngestService
->buildAllActiveChunkRecords();
// --------------------------------------------------
// Komplettes NDJSON neu schreiben
// --------------------------------------------------
$this->chunkManager->rewriteAll($allRecords);
// --------------------------------------------------
// FAISS komplett neu bauen
// --------------------------------------------------
$logPath = $job->getLogPath();
$this->vectorBuilder->rebuildFromNdjson($logPath);
// --------------------------------------------------
// Meta aktualisieren + index_version++
// --------------------------------------------------
$this->metaManager->writeMetaForGlobalReindex();
$this->jobService->markCompleted($job);
} catch (\Throwable $e) {
if ($job) {
$this->jobService->markFailed($job, $e->getMessage());
}
throw $e;
} finally {
$this->lockService->release();
}
return $job;
}
}

View File

@@ -0,0 +1,186 @@
<?php
declare(strict_types=1);
namespace App\Knowledge;
use Symfony\Component\Uid\Uuid;
final class ChunkManager
{
private string $indexPath;
public function __construct(
string $projectDir,
string $relativeIndexPath = '/var/knowledge/index.ndjson'
) {
$this->indexPath = rtrim($projectDir, '/') . $relativeIndexPath;
}
public function getIndexPath(): string
{
return $this->indexPath;
}
// ============================================================
// APPEND
// ============================================================
/**
* @param iterable<array<string,mixed>> $records
*/
public function appendChunks(iterable $records): void
{
$dir = \dirname($this->indexPath);
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
throw new \RuntimeException('Unable to create index directory');
}
$handle = fopen($this->indexPath, 'ab');
if (!$handle) {
throw new \RuntimeException('Unable to open index.ndjson for append');
}
foreach ($records as $record) {
$json = json_encode($record, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
if ($json === false) {
fclose($handle);
throw new \RuntimeException('Unable to encode chunk record');
}
fwrite($handle, $json . PHP_EOL);
}
fclose($handle);
}
// ============================================================
// COMPACTION Entfernt alle Chunks eines Dokuments
// ============================================================
public function compactByDocument(Uuid $documentId): void
{
if (!is_file($this->indexPath)) {
return; // nichts zu kompaktieren
}
$tmpPath = $this->indexPath . '.tmp';
$in = fopen($this->indexPath, 'rb');
$out = fopen($tmpPath, 'wb');
if (!$in || !$out) {
throw new \RuntimeException('Unable to open index for compaction');
}
$docIdString = $documentId->toRfc4122();
while (($line = fgets($in)) !== false) {
$line = trim($line);
if ($line === '') {
continue;
}
$data = json_decode($line, true);
if (!is_array($data)) {
continue; // skip corrupted line
}
if (($data['document_id'] ?? null) === $docIdString) {
continue; // skip this document's chunks
}
fwrite($out, $line . PHP_EOL);
}
fclose($in);
fclose($out);
$this->atomicSwitch($tmpPath, $this->indexPath);
}
// ============================================================
// FULL REWRITE (Global Reindex)
// ============================================================
/**
* @param iterable<array<string,mixed>> $records
*/
public function rewriteAll(iterable $records): void
{
$dir = \dirname($this->indexPath);
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
throw new \RuntimeException('Unable to create index directory');
}
$tmpPath = $this->indexPath . '.tmp';
$handle = fopen($tmpPath, 'wb');
if (!$handle) {
throw new \RuntimeException('Unable to open temp index file');
}
foreach ($records as $record) {
$json = json_encode($record, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
if ($json === false) {
fclose($handle);
throw new \RuntimeException('Unable to encode chunk record');
}
fwrite($handle, $json . PHP_EOL);
}
fclose($handle);
$this->atomicSwitch($tmpPath, $this->indexPath);
}
// ============================================================
// STREAM READ (für FAISS rebuild)
// ============================================================
/**
* @return \Generator<array<string,mixed>>
*/
public function streamAll(): \Generator
{
if (!is_file($this->indexPath)) {
return;
}
$handle = fopen($this->indexPath, 'rb');
if (!$handle) {
throw new \RuntimeException('Unable to open index.ndjson for read');
}
try {
while (($line = fgets($handle)) !== false) {
$line = trim($line);
if ($line === '') {
continue;
}
$data = json_decode($line, true);
if (is_array($data)) {
yield $data;
}
}
} finally {
fclose($handle);
}
}
// ============================================================
// INTERNAL ATOMIC SWITCH
// ============================================================
private function atomicSwitch(string $tmp, string $final): void
{
if (!rename($tmp, $final)) {
@unlink($tmp);
throw new \RuntimeException('Atomic switch failed for index.ndjson');
}
}
}

View File

@@ -1,39 +1,93 @@
<?php
// src/Knowledge/Ingest/KnowledgeIngestService.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
use App\Entity\DocumentVersion;
use App\Repository\DocumentVersionRepository;
use Symfony\Component\Uid\Uuid;
final class KnowledgeIngestService
{
public function __construct(
private DocumentLoader $loader,
private SimpleChunker $chunker,
private ChunkWriter $writer,
private ChunkIndexWriter $indexWriter,
)
{
private DocumentLoader $loader,
private SimpleChunker $chunker,
private DocumentVersionRepository $versionRepo,
) {
}
/** @return string[] written chunk filenames */
public function ingestFile(string $path, bool $optimize = false): array
/**
* Lokaler Ingest: erzeugt NDJSON-Records für genau diese Version.
*
* @return iterable<array<string,mixed>>
*/
public function buildChunkRecords(DocumentVersion $version): iterable
{
$text = $this->loader->load($path);
if ($optimize) {
$text = preg_replace("/\n{3,}/", "\n\n", $text);
$text = preg_replace("/[ \t]+$/m", "", $text);
}
$sourceHash = sha1($text);
$sourceName = basename($path);
if ($this->indexWriter->hasSourceHash($sourceName, $sourceHash)) {
return [];
}
$text = $this->loader->load($version->getFilePath());
$text = $this->optimizeText($text);
$chunks = $this->chunker->chunk($text);
return $this->writer->write($sourceName, $chunks, $sourceHash);
$documentId = $version->getDocument()->getId()->toRfc4122();
$versionId = $version->getId()->toRfc4122();
$index = 0;
foreach ($chunks as $chunkText) {
yield [
'chunk_id' => Uuid::v4()->toRfc4122(),
'document_id' => $documentId,
'version_id' => $versionId,
'chunk_index' => $index++,
'text' => $chunkText,
'checksum' => sha1($chunkText),
'metadata' => $this->buildMetadata($version),
];
}
}
/**
* Global Reindex: iteriert streamingfähig über alle aktiven Versionen.
* Keine RAM-Explosion, da alles generatorbasiert bleibt.
*
* @return iterable<array<string,mixed>>
*/
public function buildAllActiveChunkRecords(): iterable
{
foreach ($this->versionRepo->iterateActiveVersions() as $version) {
// yield from hält das Ganze streamingfähig (Generator-Kaskade)
yield from $this->buildChunkRecords($version);
}
}
private function optimizeText(string $text): string
{
$text = preg_replace("/\n{3,}/", "\n\n", $text);
$text = preg_replace("/[ \t]+$/m", "", $text);
return $text;
}
/**
* @return array<string,mixed>
*/
private function buildMetadata(DocumentVersion $version): array
{
$doc = $version->getDocument();
// Optional: Titel/Name, falls vorhanden
$title = null;
if (method_exists($doc, 'getTitle')) {
$title = $doc->getTitle();
} elseif (method_exists($doc, 'getName')) {
$title = $doc->getName();
}
return array_filter([
'document_title' => $title,
'version_number' => method_exists($version, 'getVersionNumber') ? $version->getVersionNumber() : null,
'file_path' => $version->getFilePath(),
], static fn($v) => $v !== null && $v !== '');
}
}

View File

@@ -0,0 +1,32 @@
<?php
declare(strict_types=1);
namespace App\Repository;
use App\Entity\DocumentVersion;
use Doctrine\Bundle\DoctrineBundle\Repository\ServiceEntityRepository;
use Doctrine\Persistence\ManagerRegistry;
final class DocumentVersionRepository extends ServiceEntityRepository
{
public function __construct(ManagerRegistry $registry)
{
parent::__construct($registry, DocumentVersion::class);
}
/**
* Streamingfähige Iteration über alle aktiven Versionen (für Global Reindex).
*
* @return iterable<DocumentVersion>
*/
public function iterateActiveVersions(): iterable
{
$qb = $this->createQueryBuilder('v')
->andWhere('v.isActive = :active')
->setParameter('active', true)
->orderBy('v.createdAt', 'ASC');
return $qb->getQuery()->toIterable();
}
}

View File

@@ -0,0 +1,43 @@
<?php
namespace App\Repository;
use App\Entity\User;
use Doctrine\Bundle\DoctrineBundle\Repository\ServiceEntityRepository;
use Doctrine\Persistence\ManagerRegistry;
/**
* @extends ServiceEntityRepository<User>
*/
class UserRepository extends ServiceEntityRepository
{
public function __construct(ManagerRegistry $registry)
{
parent::__construct($registry, User::class);
}
// /**
// * @return User[] Returns an array of User objects
// */
// public function findByExampleField($value): array
// {
// return $this->createQueryBuilder('u')
// ->andWhere('u.exampleField = :val')
// ->setParameter('val', $value)
// ->orderBy('u.id', 'ASC')
// ->setMaxResults(10)
// ->getQuery()
// ->getResult()
// ;
// }
// public function findOneBySomeField($value): ?User
// {
// return $this->createQueryBuilder('u')
// ->andWhere('u.exampleField = :val')
// ->setParameter('val', $value)
// ->getQuery()
// ->getOneOrNullResult()
// ;
// }
}

View File

@@ -0,0 +1,123 @@
<?php
namespace App\Service;
use App\Entity\Document;
use App\Entity\DocumentVersion;
use App\Entity\User;
use Doctrine\ORM\EntityManagerInterface;
class DocumentService
{
public function __construct(
private EntityManagerInterface $em
) {}
/**
* Erstellt ein neues Dokument inkl. Version 1
*/
public function createDocument(
string $title,
string $filePath,
User $user
): Document {
$document = new Document();
$document->setTitle($title);
$document->setCreatedBy($user);
$version = new DocumentVersion();
$version->setVersionNumber(1);
$version->setFilePath($filePath);
$version->setChecksum($this->calculateChecksum($filePath));
$version->setCreatedBy($user);
$version->setActive(true);
$document->addVersion($version);
$document->setCurrentVersion($version);
$this->em->persist($document);
$this->em->persist($version);
$this->em->flush();
return $document;
}
/**
* Fügt neue Version hinzu (immutable)
*/
public function addVersion(
Document $document,
string $filePath,
User $user
): DocumentVersion {
$nextVersionNumber = $this->getNextVersionNumber($document);
$version = new DocumentVersion();
$version->setVersionNumber($nextVersionNumber);
$version->setFilePath($filePath);
$version->setChecksum($this->calculateChecksum($filePath));
$version->setCreatedBy($user);
$version->setActive(false);
$document->addVersion($version);
$this->em->persist($version);
$this->em->flush();
return $version;
}
/**
* Aktiviert eine Version (setzt andere inaktiv)
*/
public function activateVersion(DocumentVersion $version): void
{
$document = $version->getDocument();
foreach ($document->getVersions() as $existingVersion) {
$existingVersion->setActive(false);
}
$version->setActive(true);
$document->setCurrentVersion($version);
$this->em->flush();
}
/**
* Archiviert Dokument
*/
public function archive(Document $document): void
{
$document->archive();
$this->em->flush();
}
/**
* Berechnet SHA256 Checksum
*/
private function calculateChecksum(string $filePath): string
{
if (!file_exists($filePath)) {
throw new \RuntimeException('File not found for checksum.');
}
return hash_file('sha256', $filePath);
}
/**
* Ermittelt nächste Versionsnummer
*/
private function getNextVersionNumber(Document $document): int
{
$max = 0;
foreach ($document->getVersions() as $version) {
$max = max($max, $version->getVersionNumber());
}
return $max + 1;
}
}

View File

@@ -0,0 +1,54 @@
<?php
namespace App\Service;
use App\Entity\IngestJob;
use App\Entity\User;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Component\Uid\Uuid;
final class IngestJobService
{
public function __construct(private EntityManagerInterface $em)
{
}
public function startJob(
string $type,
?User $user = null,
?Uuid $documentId = null,
?Uuid $documentVersionId = null,
?string $logPath = null
): IngestJob
{
$job = new IngestJob($type);
$job->setStartedBy($user);
$job->setDocumentId($documentId);
$job->setDocumentVersionId($documentVersionId);
$job->setLogPath($logPath);
$this->em->persist($job);
$this->em->flush();
return $job;
}
public function markCompleted(IngestJob $job): void
{
$job->markCompleted();
$this->em->flush();
}
public function markFailed(IngestJob $job, string $message): void
{
$job->markFailed($message);
$this->em->flush();
}
public function markAborted(IngestJob $job): void
{
$job->markAborted();
$this->em->flush();
}
}

View File

@@ -0,0 +1,130 @@
<?php
namespace App\Service;
use App\Entity\DocumentVersion;
use App\Entity\IngestJob;
use App\Entity\User;
use Doctrine\ORM\EntityManagerInterface;
class IngestOrchestrator
{
public function __construct(
private LockService $lockService,
private IngestJobService $jobService,
private EntityManagerInterface $em,
) {}
/**
* Startet Ingest für eine bestimmte DocumentVersion
*/
public function runForVersion(
DocumentVersion $version,
User $user,
bool $dryRun = false
): IngestJob {
if (!$this->lockService->acquire()) {
throw new \RuntimeException('Another ingest job is already running.');
}
$job = null;
try {
// --------------------------------------
// Job anlegen
// --------------------------------------
$job = $this->jobService->startJob(
IngestJob::TYPE_DOCUMENT,
$user,
$version->getDocument()->getId(),
$version->getId(),
);
// --------------------------------------
// Version Status RUNNING
// --------------------------------------
$version->setIngestStatus(DocumentVersion::INGEST_RUNNING);
$this->em->flush();
// --------------------------------------
// Simulierter Ablauf (noch kein echter Ingest)
// --------------------------------------
if ($dryRun) {
usleep(200000);
} else {
// Später:
// - KnowledgeIngestService
// - ChunkWriter
// - VectorIngestCommand
}
// --------------------------------------
// Erfolg
// --------------------------------------
$version->setIngestStatus(DocumentVersion::INGEST_INDEXED);
$this->jobService->markCompleted($job);
$this->em->flush();
} catch (\Throwable $e) {
if ($job) {
$this->jobService->markFailed($job, $e->getMessage());
}
$version->setIngestStatus(DocumentVersion::INGEST_FAILED);
$this->em->flush();
throw $e;
} finally {
$this->lockService->release();
}
return $job;
}
/**
* Globaler Reindex
*/
public function runGlobal(User $user, bool $dryRun = false): IngestJob
{
if (!$this->lockService->acquire()) {
throw new \RuntimeException('Another ingest job is already running.');
}
$job = null;
try {
$job = $this->jobService->startJob(
IngestJob::TYPE_GLOBAL_REINDEX,
$user
);
if ($dryRun) {
usleep(200000);
} else {
// Später:
// - Alle aktiven Dokumente neu ingestieren
// - Global vector rebuild
}
$this->jobService->markCompleted($job);
} catch (\Throwable $e) {
if ($job) {
$this->jobService->markFailed($job, $e->getMessage());
}
throw $e;
} finally {
$this->lockService->release();
}
return $job;
}
}

View File

@@ -0,0 +1,71 @@
<?php
namespace App\Service;
final class LockService
{
private $handle = null;
private string $lockFile;
public function __construct(string $projectDir)
{
$dir = $projectDir . '/var/locks';
if (!is_dir($dir)) {
mkdir($dir, 0777, true);
}
$this->lockFile = $dir . '/ingest.lock';
}
/**
* Gibt true zurück, wenn Lock erfolgreich gesetzt wurde.
* Wenn false: ein anderer Prozess hält den Lock.
*/
public function acquire(): bool
{
$handle = fopen($this->lockFile, 'c+');
if (!$handle) {
throw new \RuntimeException('Could not open lock file.');
}
// Nicht-blockierend: sofort true/false
if (!flock($handle, LOCK_EX | LOCK_NB)) {
fclose($handle);
return false;
}
// Lock halten: Handle offen lassen
$this->handle = $handle;
// Optional: Metainfo reinschreiben
ftruncate($this->handle, 0);
fwrite($this->handle, (string) time());
return true;
}
public function release(): void
{
if ($this->handle) {
flock($this->handle, LOCK_UN);
fclose($this->handle);
$this->handle = null;
}
}
public function isLocked(): bool
{
$handle = fopen($this->lockFile, 'c+');
if (!$handle) {
return false;
}
$locked = !flock($handle, LOCK_EX | LOCK_NB);
if (!$locked) {
flock($handle, LOCK_UN);
}
fclose($handle);
return $locked;
}
}

View File

@@ -0,0 +1,164 @@
<?php
declare(strict_types=1);
namespace App\Vector;
use Symfony\Component\Process\Exception\ProcessFailedException;
use Symfony\Component\Process\Process;
final class VectorIndexBuilder
{
private string $pythonBin;
private string $scriptPath;
private string $indexNdjsonPath;
private string $vectorIndexPath;
private int $timeoutSeconds;
public function __construct(
string $projectDir,
string $pythonBin = 'python3',
string $relativeScriptPath = '/vector/vector_ingest.py',
string $relativeIndexNdjsonPath = '/var/knowledge/index.ndjson',
string $relativeVectorIndexPath = '/var/knowledge/vector.index',
int $timeoutSeconds = 600
)
{
$base = rtrim($projectDir, '/');
$this->pythonBin = $pythonBin;
$this->scriptPath = $base . $relativeScriptPath;
$this->indexNdjsonPath = $base . $relativeIndexNdjsonPath;
$this->vectorIndexPath = $base . $relativeVectorIndexPath;
$this->timeoutSeconds = $timeoutSeconds;
}
public function getIndexNdjsonPath(): string
{
return $this->indexNdjsonPath;
}
public function getVectorIndexPath(): string
{
return $this->vectorIndexPath;
}
public function getScriptPath(): string
{
return $this->scriptPath;
}
/**
* Rebuild FAISS Index deterministisch aus index.ndjson.
*
* Erwartung: Python schreibt in $tmpVectorIndexPath, wir schalten atomar um.
*
* @param string|null $logPath Optional: stdout/stderr dorthin appenden
*/
public function rebuildFromNdjson(?string $logPath = null): void
{
if (!is_file($this->scriptPath)) {
throw new \RuntimeException('vector_ingest.py not found at: ' . $this->scriptPath);
}
if (!is_file($this->indexNdjsonPath)) {
throw new \RuntimeException('index.ndjson not found at: ' . $this->indexNdjsonPath);
}
$dir = \dirname($this->vectorIndexPath);
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
throw new \RuntimeException('Unable to create vector index directory: ' . $dir);
}
$tmpVectorIndexPath = $this->vectorIndexPath . '.tmp';
// Vorheriges tmp entfernen (Sicherheit)
if (is_file($tmpVectorIndexPath)) {
@unlink($tmpVectorIndexPath);
}
// ----------------------------
// Python-Aufruf (konservativ)
// ----------------------------
// Wir erwarten/standardisieren (ab jetzt) CLI-Args:
// --index <path-to-index.ndjson>
// --out <path-to-vector.index.tmp>
//
// Falls dein Python-Script aktuell andere Args hat,
// passen wir es im nächsten Schritt konsistent an.
$cmd = [
$this->pythonBin,
$this->scriptPath,
'--index', $this->indexNdjsonPath,
'--out', $tmpVectorIndexPath,
];
$process = new Process($cmd);
$process->setTimeout($this->timeoutSeconds);
$this->runProcess($process, $logPath);
// Python muss tmp erzeugt haben
if (!is_file($tmpVectorIndexPath) || filesize($tmpVectorIndexPath) === 0) {
throw new \RuntimeException('Vector index rebuild failed: tmp output missing or empty: ' . $tmpVectorIndexPath);
}
// Atomarer Switch
$this->atomicSwitch($tmpVectorIndexPath, $this->vectorIndexPath);
}
// -------------------------
// Internals
// -------------------------
private function runProcess(Process $process, ?string $logPath): void
{
if ($logPath !== null) {
$this->appendLog($logPath, "\n=== VectorIndexBuilder START " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n");
$this->appendLog($logPath, "CMD: " . $process->getCommandLine() . "\n");
}
$process->run(function (string $type, string $buffer) use ($logPath) {
if ($logPath === null) {
return;
}
// TYPE: Process::OUT / Process::ERR
$this->appendLog($logPath, $buffer);
});
if (!$process->isSuccessful()) {
if ($logPath !== null) {
$this->appendLog($logPath, "\n=== VectorIndexBuilder FAILED ===\n");
$this->appendLog($logPath, "ExitCode: " . $process->getExitCode() . "\n");
$this->appendLog($logPath, "STDERR:\n" . $process->getErrorOutput() . "\n");
}
throw new ProcessFailedException($process);
}
if ($logPath !== null) {
$this->appendLog($logPath, "\n=== VectorIndexBuilder OK " . (new \DateTimeImmutable())->format(DATE_ATOM) . " ===\n");
}
}
private function appendLog(string $logPath, string $content): void
{
$dir = \dirname($logPath);
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
// Wenn Log nicht möglich ist: nicht hart scheitern (Build ist wichtiger)
return;
}
@file_put_contents($logPath, $content, FILE_APPEND);
}
private function atomicSwitch(string $tmp, string $final): void
{
if (!rename($tmp, $final)) {
@unlink($tmp);
throw new \RuntimeException('Atomic switch failed for vector.index');
}
}
}

View File

@@ -2,88 +2,125 @@
import sys
import json
import argparse
from pathlib import Path
# ---------------------------------------------------------
# Argument handling
# Argument parsing
# ---------------------------------------------------------
if len(sys.argv) < 3:
print("ERROR: Missing arguments (vectorDir, knowledgeDir)")
sys.exit(2)
parser = argparse.ArgumentParser(description="Build FAISS index from NDJSON")
vector_dir = Path(sys.argv[1]).resolve()
knowledge_dir = Path(sys.argv[2]).resolve()
parser.add_argument("--index", required=True, help="Path to index.ndjson")
parser.add_argument("--out", required=True, help="Path to output vector.index")
parser.add_argument("--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model")
index_json = knowledge_dir / "index.json"
index_out = vector_dir / "vector.index"
meta_out = vector_dir / "vector_meta.json"
args = parser.parse_args()
index_path = Path(args.index).resolve()
out_path = Path(args.out).resolve()
# ---------------------------------------------------------
# Dependency checks
# ---------------------------------------------------------
try:
import faiss # noqa
import faiss
except Exception:
print("ERROR: Python module 'faiss' not found.")
sys.exit(10)
try:
from sentence_transformers import SentenceTransformer # noqa
from sentence_transformers import SentenceTransformer
except Exception:
print("ERROR: Python module 'sentence-transformers' not found.")
sys.exit(11)
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
if not index_json.is_file():
print(f"ERROR: index.json not found at {index_json}")
if not index_path.is_file():
print(f"ERROR: index.ndjson not found at {index_path}")
sys.exit(20)
# ---------------------------------------------------------
# Load chunks from index.json
# Load model
# ---------------------------------------------------------
with open(index_json, "r", encoding="utf-8") as f:
data = json.load(f)
print(f"Loading embedding model: {args.model}")
model = SentenceTransformer(args.model)
# ---------------------------------------------------------
# Streaming read NDJSON
# ---------------------------------------------------------
texts = []
ids = []
for entry in data:
if "file" not in entry:
continue
print("Reading NDJSON...")
chunk_path = knowledge_dir / "chunks" / entry["file"]
if not chunk_path.is_file():
continue
with open(index_path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
text = chunk_path.read_text(encoding="utf-8").strip()
if not text:
continue
try:
entry = json.loads(line)
except Exception:
continue
texts.append(text)
ids.append(entry["file"])
text = entry.get("text")
chunk_id = entry.get("chunk_id")
if not text or not chunk_id:
continue
texts.append(text)
ids.append(chunk_id)
if not texts:
print("ERROR: No chunks loaded from index.json")
print("ERROR: No valid chunks found in index.ndjson")
sys.exit(21)
print(f"Loaded {len(texts)} chunks.")
# ---------------------------------------------------------
# Build vector index
# Build embeddings
# ---------------------------------------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, normalize_embeddings=True)
print("Encoding embeddings...")
embeddings = model.encode(
texts,
normalize_embeddings=True,
show_progress_bar=True,
batch_size=64
)
embeddings = np.array(embeddings).astype("float32")
dim = embeddings.shape[1]
print(f"Embedding dimension: {dim}")
# ---------------------------------------------------------
# Build FAISS index
# ---------------------------------------------------------
print("Building FAISS index...")
index = faiss.IndexFlatIP(dim)
index.add(embeddings)
faiss.write_index(index, str(index_out))
# Ensure output directory exists
out_path.parent.mkdir(parents=True, exist_ok=True)
with open(meta_out, "w", encoding="utf-8") as f:
print(f"Writing FAISS index to {out_path}")
faiss.write_index(index, str(out_path))
# ---------------------------------------------------------
# Write ID mapping meta
# ---------------------------------------------------------
meta_path = out_path.with_suffix(".meta.json")
with open(meta_path, "w", encoding="utf-8") as f:
json.dump(ids, f)
print(f"Indexed {len(ids)} chunks.")
print(f"Indexed {len(ids)} chunks successfully.")
sys.exit(0)