Optimize document ingestion

2026-02-28 22:48:01 +01:00
parent 54ce057ef0
commit 509ba83ac0
6 changed files with 335 additions and 21 deletions
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -7,14 +7,16 @@ namespace App\Knowledge\Ingest;
 use App\Entity\DocumentVersion;
 use App\Knowledge\Text\TextNormalizer;
 use App\Repository\DocumentVersionRepository;
+use App\Ingest\DocumentSanitizer;

-final readonly  class KnowledgeIngestService
+final readonly class KnowledgeIngestService
 {
    public function __construct(
        private DocumentLoader            $loader,
        private SimpleChunker             $chunker,
        private DocumentVersionRepository $versionRepo,
-        private TextNormalizer            $textNormalizer
+        private TextNormalizer            $textNormalizer,
+        private DocumentSanitizer         $documentSanitizer, // ✅ NEU
    )
    {
    }
@@ -26,8 +28,13 @@ final readonly  class KnowledgeIngestService
     */
    public function buildChunkRecords(DocumentVersion $version): iterable
    {
+        // 1️⃣ Rohtext laden
        $text = $this->loader->load($version->getFilePath());

+        // 2️⃣ 🔥 Deterministische Vorverarbeitung (NEU)
+        $text = $this->documentSanitizer->sanitize($text);
+
+        // 3️⃣ Chunking
        $chunks = $this->chunker->chunk($text);

        $doc = $version->getDocument();
@@ -41,7 +48,6 @@ final readonly  class KnowledgeIngestService
        foreach ($chunks as $chunkText) {

            if ($title !== '' && !str_starts_with($chunkText, $title)) {
-                //title with backticks
                $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
            }

@@ -57,13 +63,13 @@ final readonly  class KnowledgeIngestService
            );

            yield [
-                'chunk_id' => $chunkId,
+                'chunk_id'    => $chunkId,
                'document_id' => $documentId,
-                'version_id' => $versionId,
+                'version_id'  => $versionId,
                'chunk_index' => $index++,
-                'text' => $chunkText,
-                'checksum' => sha1($chunkText),
-                'metadata' => $this->buildMetadata($version),
+                'text'        => $chunkText,
+                'checksum'    => sha1($chunkText),
+                'metadata'    => $this->buildMetadata($version),
            ];
        }
    }
--- a/src/Knowledge/Ingest/SimpleChunker.php
+++ b/src/Knowledge/Ingest/SimpleChunker.php
@@ -10,10 +10,9 @@ use App\Knowledge\Text\TextNormalizer;

 final readonly class SimpleChunker
 {
-
    public function __construct(
        private IndexConfigurationProvider $configurationProvider,
-        private TextNormalizer             $textNormalizer
+        private TextNormalizer $textNormalizer
    )
    {
    }
@@ -23,7 +22,7 @@ final readonly class SimpleChunker
    {
        $config = $this->configurationProvider->getConfiguration();

-        $maxWords = $config->getChunkSize();
+        $maxWords     = $config->getChunkSize();
        $overlapWords = $config->getChunkOverlap();

        $text = $this->textNormalizer->normalize($text);
@@ -31,6 +30,74 @@ final readonly class SimpleChunker
            return [];
        }

+        // ======================================================
+        // HYBRID: Erst Absatzbasiert sammeln
+        // ======================================================
+
+        $paragraphs = preg_split('/\n{2,}/u', $text);
+        if (!$paragraphs) {
+            return [];
+        }
+
+        $chunks = [];
+        $currentChunk = '';
+        $currentWordCount = 0;
+
+        foreach ($paragraphs as $paragraph) {
+
+            $paragraph = trim($paragraph);
+            if ($paragraph === '') {
+                continue;
+            }
+
+            $paragraphWordCount = $this->countWords($paragraph);
+
+            // Falls einzelner Absatz größer als maxWords → Fallback
+            if ($paragraphWordCount > $maxWords) {
+
+                if ($currentChunk !== '') {
+                    $chunks[] = trim($currentChunk);
+                    $currentChunk = '';
+                    $currentWordCount = 0;
+                }
+
+                foreach ($this->chunkByWords($paragraph, $maxWords, $overlapWords) as $subChunk) {
+                    $chunks[] = $subChunk;
+                }
+
+                continue;
+            }
+
+            // Absatz passt noch in aktuellen Chunk
+            if ($currentWordCount + $paragraphWordCount <= $maxWords) {
+                $currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
+                $currentWordCount += $paragraphWordCount;
+                continue;
+            }
+
+            // Flush aktueller Chunk
+            if ($currentChunk !== '') {
+                $chunks[] = trim($currentChunk);
+            }
+
+            $currentChunk = $paragraph;
+            $currentWordCount = $paragraphWordCount;
+        }
+
+        if ($currentChunk !== '') {
+            $chunks[] = trim($currentChunk);
+        }
+
+        return $this->dedupe($chunks);
+    }
+
+    // ======================================================
+    // Wortbasierter Fallback (Original-Logik beibehalten)
+    // ======================================================
+
+    /** @return string[] */
+    private function chunkByWords(string $text, int $maxWords, int $overlapWords): array
+    {
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
@@ -61,7 +128,7 @@ final readonly class SimpleChunker
            $wordEnd = min($wordPos + $maxWords, $totalWords);

            $tokenStart = $wordTokenIndexes[$wordPos];
-            $tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
+            $tokenEnd   = $wordTokenIndexes[$wordEnd - 1] + 1;

            $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);

@@ -82,7 +149,7 @@ final readonly class SimpleChunker
            $wordPos = max(0, $wordEnd - $overlapWords);
        }

-        return $this->dedupe($chunks);
+        return $chunks;
    }

    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
@@ -110,11 +177,17 @@ final readonly class SimpleChunker
        return $end;
    }

+    private function countWords(string $text): int // Counts the whitespace-separated tokens in $text.
+    {
+        $parts = preg_split('/\s+/u', trim($text)); // split on whitespace runs; /u treats pattern and subject as UTF-8
+        return $parts ? count($parts) : 0; // false (PCRE failure) maps to 0. NOTE(review): '' splits to [''] and counts as 1; the visible caller skips empty paragraphs first
+    }
+
    /** @param string[] $chunks @return string[] */
    private function dedupe(array $chunks): array
    {
        $seen = [];
-        $out = [];
+        $out  = [];

        foreach ($chunks as $chunk) {
            $key = mb_strtolower(
@@ -131,4 +204,4 @@ final readonly class SimpleChunker

        return $out;
    }
-}
+}