From 812f2bf265770a79f9bf16cbc7a42aa4b28e9ab5 Mon Sep 17 00:00:00 2001 From: team 1 Date: Tue, 17 Feb 2026 14:12:24 +0100 Subject: [PATCH] optimize code and ingest docs --- src/Controller/Admin/DocumentController.php | 28 +++- .../Admin/IngestProfileController.php | 9 ++ src/Entity/DocumentVersion.php | 61 ++----- src/Knowledge/Ingest/ChunkIndexWriter.php | 58 ------- src/Knowledge/Ingest/ChunkWriter.php | 149 ------------------ .../Ingest/KnowledgeIngestService.php | 15 +- src/Repository/IngestProfileRepository.php | 14 ++ templates/admin/document/index.html.twig | 8 + templates/admin/document/new.html.twig | 7 +- .../admin/ingest_profile/create.html.twig | 82 ++++++++-- templates/admin/ingest_profile/list.html.twig | 7 +- 11 files changed, 156 insertions(+), 282 deletions(-) delete mode 100644 src/Knowledge/Ingest/ChunkIndexWriter.php delete mode 100644 src/Knowledge/Ingest/ChunkWriter.php diff --git a/src/Controller/Admin/DocumentController.php b/src/Controller/Admin/DocumentController.php index 23831b9..04fea2d 100644 --- a/src/Controller/Admin/DocumentController.php +++ b/src/Controller/Admin/DocumentController.php @@ -13,6 +13,7 @@ use Doctrine\ORM\EntityManagerInterface; use Symfony\Bundle\FrameworkBundle\Controller\AbstractController; use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface; use Symfony\Component\HttpFoundation\File\Exception\FileException; +use Symfony\Component\HttpFoundation\File\UploadedFile; use Symfony\Component\HttpFoundation\RedirectResponse; use Symfony\Component\HttpFoundation\Request; use Symfony\Component\HttpFoundation\Response; @@ -29,7 +30,14 @@ class DocumentController extends AbstractController public function index(EntityManagerInterface $em): Response { $documents = $em->getRepository(Document::class) - ->findBy([], ['createdAt' => 'DESC']); + ->createQueryBuilder('d') + ->leftJoin('d.versions', 'v') + ->addSelect('v') + ->leftJoin('d.currentVersion', 'cv') + ->addSelect('cv') + ->orderBy('d.createdAt', 'DESC') + ->getQuery() + ->getResult(); return $this->render('admin/document/index.html.twig', [ 'documents' => $documents @@ -71,12 +79,22 @@ class DocumentController extends AbstractController { if ($request->isMethod('POST')) { + /** @var UploadedFile|null $file */ $file = $request->files->get('file'); - $title = $request->request->get('title') ?: $file->getClientOriginalName(); - $title = $formatText->slugify($title); - if (!$file || !$title) { - $this->addFlash('error', 'Titel und Datei sind erforderlich.'); + if (!$file instanceof UploadedFile) { + throw new \InvalidArgumentException('No valid file uploaded.'); + } + + $rawTitle = $request->request->get('title'); + + $title = is_string($rawTitle) && $rawTitle !== '' + ? $rawTitle + : $formatText->slugify($file->getClientOriginalName()); + + + if (!$title) { + $this->addFlash('error', 'Titel ist erforderlich.'); return $this->redirectToRoute('admin_document_new'); } diff --git a/src/Controller/Admin/IngestProfileController.php b/src/Controller/Admin/IngestProfileController.php index e2c4f78..0c19c00 100644 --- a/src/Controller/Admin/IngestProfileController.php +++ b/src/Controller/Admin/IngestProfileController.php @@ -99,4 +99,13 @@ class IngestProfileController extends AbstractController return $this->redirectToRoute('admin_ingest_profile_list'); } + + #[Route('/remove/{id}', name: 'admin_ingest_profile_remove')] + public function remove( + IngestProfileRepository $repo, + string $id + ): Response { + $repo->remove($id); + return $this->redirectToRoute('admin_ingest_profile_list'); + } } diff --git a/src/Entity/DocumentVersion.php b/src/Entity/DocumentVersion.php index 880e039..e7f2140 100644 --- a/src/Entity/DocumentVersion.php +++ b/src/Entity/DocumentVersion.php @@ -1,5 +1,7 @@ createdAt = new \DateTimeImmutable(); } - // ========================= - // ID - // ========================= - public function getId(): Uuid { return $this->id; } - // ========================= - // Document Relation - // ========================= - public function setDocument(Document $document): void { $this->document = $document; + + if (!$document->getVersions()->contains($this)) { + $document->addVersion($this); + } } public function getDocument(): Document @@ -80,10 +78,6 @@ class DocumentVersion return $this->document; } - // ========================= - // Version Number - // ========================= - public function getVersionNumber(): int { return $this->versionNumber; @@ -94,10 +88,6 @@ class DocumentVersion $this->versionNumber = $number; } - // ========================= - // File Path - // ========================= - public function setFilePath(string $path): void { $this->filePath = $path; @@ -108,10 +98,6 @@ class DocumentVersion return $this->filePath; } - // ========================= - // Checksum - // ========================= - public function setChecksum(string $checksum): void { $this->checksum = $checksum; @@ -122,10 +108,6 @@ class DocumentVersion return $this->checksum; } - // ========================= - // Ingest Status - // ========================= - public function setIngestStatus(string $status): void { if (!in_array($status, self::INGEST_STATUSES, true)) { @@ -145,10 +127,6 @@ class DocumentVersion return $this->ingestStatus === self::INGEST_INDEXED; } - // ========================= - // Created By - // ========================= - public function setCreatedBy(User $user): void { $this->createdBy = $user; @@ -159,19 +137,11 @@ class DocumentVersion return $this->createdBy; } - // ========================= - // Created At - // ========================= - public function getCreatedAt(): \DateTimeImmutable { return $this->createdAt; } - // ========================= - // Active Flag - // ========================= - public function setActive(bool $active): void { $this->isActive = $active; @@ -182,15 +152,8 @@ class DocumentVersion return $this->isActive; } - //######################################################### - // Helper - //######################################################### public function getFileExtension(): string { - if (!$this->filePath) { - return ''; - } - return mb_strtolower(pathinfo($this->filePath, PATHINFO_EXTENSION)); } diff --git a/src/Knowledge/Ingest/ChunkIndexWriter.php b/src/Knowledge/Ingest/ChunkIndexWriter.php deleted file mode 100644 index 337f6d2..0000000 --- a/src/Knowledge/Ingest/ChunkIndexWriter.php +++ /dev/null @@ -1,58 +0,0 @@ -load(); - $index[] = $entry; - $this->save($index); - } - - private function load(): array - { - if (!is_file($this->indexPath)) { - return []; - } - - $json = file_get_contents($this->indexPath); - $data = $json ? json_decode($json, true) : null; - - return is_array($data) ? $data : []; - } - - private function save(array $index): void - { - $dir = dirname($this->indexPath); - if (!is_dir($dir)) { - mkdir($dir, 0775, true); - } - - file_put_contents( - $this->indexPath, - json_encode($index, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE) - ); - } - - public function hasSourceHash(string $source, string $hash): bool - { - foreach ($this->load() as $entry) { - if ( - ($entry['source'] ?? null) === $source && - ($entry['sourceHash'] ?? null) === $hash - ) { - return true; - } - } - return false; - } -} diff --git a/src/Knowledge/Ingest/ChunkWriter.php b/src/Knowledge/Ingest/ChunkWriter.php deleted file mode 100644 index 9ea9f5d..0000000 --- a/src/Knowledge/Ingest/ChunkWriter.php +++ /dev/null @@ -1,149 +0,0 @@ -chunksDir)) { - mkdir($this->chunksDir, 0775, true); - } - - $manifest = $this->loadManifest(); - $written = []; - - $base = $this->safeBase($sourceName); - $ts = date('Ymd_His'); - - foreach ($chunks as $i => $chunk) { - $filename = "{$base}__{$ts}__" . str_pad((string)$i, 4, '0', STR_PAD_LEFT) . ".txt"; - $path = rtrim($this->chunksDir, '/') . '/' . $filename; - - $header = $this->buildHeader( - source: $sourceName, - index: $i - ); - - file_put_contents($path, $header . "\n\n" . $chunk); - - $written[] = $filename; - - $manifest[] = [ - 'file' => $filename, - 'source' => $sourceName, - 'index' => $i, - 'chars' => mb_strlen($chunk), - 'createdAt' => date('c'), - ]; - - $this->indexWriter->add([ - 'file' => $filename, - 'source' => $sourceName, - 'sourceHash' => $sourceHash, - 'keywords' => $this->extractKeywords($chunk), - 'chars' => mb_strlen($chunk), - ]); - } - - - $this->saveManifest($manifest); - return $written; - } - - private function safeBase(string $name): string - { - $name = pathinfo($name, PATHINFO_FILENAME); - $name = mb_strtolower($name); - $name = preg_replace('/[^a-z0-9\-_]+/u', '-', $name); - return trim((string)$name, '-'); - } - - private function loadManifest(): array - { - if (!is_file($this->manifestPath)) { - return []; - } - $json = file_get_contents($this->manifestPath); - $data = $json ? json_decode($json, true) : null; - return is_array($data) ? $data : []; - } - - private function saveManifest(array $manifest): void - { - $dir = dirname($this->manifestPath); - if (!is_dir($dir)) { - mkdir($dir, 0775, true); - } - file_put_contents($this->manifestPath, json_encode($manifest, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE)); - } - - private function buildHeader(string $source, int $index): string - { - return sprintf( - '[Quelle: %s | Abschnitt: Chunk %d]', - $source, - $index + 1 - ); - } - - private function extractKeywords(string $text): array - { - // 1) Lowercase - $text = mb_strtolower($text); - - // 2) URLs entfernen (sehr wichtig) - $text = preg_replace('#https?://\S+#u', ' ', $text); - - // 3) Newlines & Tabs → Space - $text = str_replace(["\r", "\n", "\t"], ' ', $text); - - // 4) Trennzeichen → Space (NICHT löschen!) - $text = preg_replace('/[\/\.\,\:\;\-\_\(\)\[\]\{\}]/u', ' ', $text); - - // 5) Alles andere raus - $text = preg_replace('/[^\p{L}\p{N}\s]/u', '', $text); - - // 6) Whitespace normalisieren - $text = preg_replace('/\s+/u', ' ', $text); - $text = trim($text); - - // 7) Wörter extrahieren - $words = explode(' ', $text); - - // 8) Filtern + deduplizieren - $keywords = []; - - foreach ($words as $word) { - if (mb_strlen($word) < 4) { - continue; - } - if (in_array($word, $this->stopWords->getStopWords() ?? [], true)) { - continue; - } - $keywords[] = $word; - } - - return array_values(array_unique(array_slice($keywords, 0, 25))); - } -} diff --git a/src/Knowledge/Ingest/KnowledgeIngestService.php b/src/Knowledge/Ingest/KnowledgeIngestService.php index 647acad..78a93b3 100644 --- a/src/Knowledge/Ingest/KnowledgeIngestService.php +++ b/src/Knowledge/Ingest/KnowledgeIngestService.php @@ -29,18 +29,30 @@ final class KnowledgeIngestService $chunks = $this->chunker->chunk($text); - $documentId = $version->getDocument()->getId()->toRfc4122(); + $doc = $version->getDocument(); + + $documentId = $doc->getId()->toRfc4122(); $versionId = $version->getId()->toRfc4122(); + // ✅ Regel: Wenn title gefüllt ist, kommt er in jeden Chunk + $title = trim((string) $doc->getTitle()); + $index = 0; foreach ($chunks as $chunkText) { + + // ✅ Prefix nur wenn title vorhanden; keine Flags, keine Meta-Schalter + if ($title !== '' && !str_starts_with($chunkText, $title)) { + $chunkText = $title . "\n\n" . $chunkText; + } + yield [ 'chunk_id' => Uuid::v4()->toRfc4122(), 'document_id' => $documentId, 'version_id' => $versionId, 'chunk_index' => $index++, 'text' => $chunkText, + // ✅ checksum muss den finalen Text abbilden (inkl. Titel) 'checksum' => sha1($chunkText), 'metadata' => $this->buildMetadata($version), ]; @@ -56,7 +68,6 @@ final class KnowledgeIngestService public function buildAllActiveChunkRecords(): iterable { foreach ($this->versionRepo->iterateActiveVersions() as $version) { - // yield from hält das Ganze streamingfähig (Generator-Kaskade) yield from $this->buildChunkRecords($version); } } diff --git a/src/Repository/IngestProfileRepository.php b/src/Repository/IngestProfileRepository.php index 9ba1f80..40aeadc 100644 --- a/src/Repository/IngestProfileRepository.php +++ b/src/Repository/IngestProfileRepository.php @@ -7,6 +7,7 @@ namespace App\Repository; use App\Entity\IngestProfile; use Doctrine\Bundle\DoctrineBundle\Repository\ServiceEntityRepository; use Doctrine\Persistence\ManagerRegistry; +use Symfony\Component\Uid\Uuid; class IngestProfileRepository extends ServiceEntityRepository { @@ -28,4 +29,17 @@ class IngestProfileRepository extends ServiceEntityRepository { return $this->findOneBy(['active' => true]); } + + public function remove(string $id): void + { + $entity = $this->find($id); + + if (!$entity instanceof IngestProfile) { + return; + } + + $em = $this->getEntityManager(); + $em->remove($entity); + $em->flush(); + } } diff --git a/templates/admin/document/index.html.twig b/templates/admin/document/index.html.twig index 3393a07..9564a64 100644 --- a/templates/admin/document/index.html.twig +++ b/templates/admin/document/index.html.twig @@ -24,6 +24,7 @@ ID Typ Status + Indexiert Versionen Aktive Version Erstellt am @@ -58,6 +59,13 @@ Archiviert {% endif %} + + {% if document.currentVersion.ingestStatus == 'INDEXED' %} + {{ document.currentVersion.ingestStatus }} + {% else %} + {{ document.currentVersion.ingestStatus }} + {% endif %} + {{ document.versions|length }} {% if document.currentVersion %} diff --git a/templates/admin/document/new.html.twig b/templates/admin/document/new.html.twig index a57d8bb..69389e4 100644 --- a/templates/admin/document/new.html.twig +++ b/templates/admin/document/new.html.twig @@ -8,12 +8,15 @@
- + +
Bitte geben Sie einen aussagekräftigen Titel ein.
+ Der Titel ist entscheidend, damit in jedem Chunk ein sinnvoller thematischer Bezug hergestellt und eine saubere semantische Zuordnung ermöglicht werden kann.
+ Wenn kein Titel angegeben wird, wird automatisch der Dateiname als Titel verwendet (nicht empfohlen).
- +
diff --git a/templates/admin/ingest_profile/create.html.twig b/templates/admin/ingest_profile/create.html.twig index 5802047..5f5bd42 100644 --- a/templates/admin/ingest_profile/create.html.twig +++ b/templates/admin/ingest_profile/create.html.twig @@ -6,21 +6,71 @@

Create Ingest Profile

- -
- - -
- - -
- - -
- - -
- - + + + + + + + + + + + + + + + + + + + + + + + + + + +
Chunk Size (500-2500) + +
Chunk Overlap (50-150) + +
Embedding Model (default) + +
Embedding Dimension (default) + +
Scoring Version (default) + +
+ +
+ {% endblock %} diff --git a/templates/admin/ingest_profile/list.html.twig b/templates/admin/ingest_profile/list.html.twig index 6d6d9af..1a76fa6 100644 --- a/templates/admin/ingest_profile/list.html.twig +++ b/templates/admin/ingest_profile/list.html.twig @@ -44,10 +44,15 @@ {{ p.reindexRequired ? 'Yes' : 'No' }} {% if not p.active %} - + Aktivieren {% endif %} + {% if not p.active %} + + Löschen + + {% endif %} {% endfor %}