optimize ingesting documents

2026-02-28 22:48:01 +01:00
parent 54ce057ef0
commit 509ba83ac0
6 changed files with 335 additions and 21 deletions
--- a/src/Ingest/ChunkWriteService.php
+++ b/src/Ingest/ChunkWriteService.php
@@ -11,8 +11,10 @@ use Symfony\Component\Uid\Uuid;
 final readonly class ChunkWriteService
 {
    public function __construct(
-        private ChunkManager $chunkManager,
-    ) {}
+        private ChunkManager $chunkManager
+    )
+    {
+    }

    public function countAllChunks(): int
    {
@@ -41,4 +43,5 @@ final readonly class ChunkWriteService
    {
        $this->chunkManager->rewriteAll($allChunks);
    }
+
 }
--- a/src/Ingest/DocumentSanitizer.php
+++ b/src/Ingest/DocumentSanitizer.php
@@ -0,0 +1,232 @@
+<?php
+
+declare(strict_types=1);
+
+namespace App\Ingest;
+
+/**
+ * DocumentSanitizer
+ *
+ * Deterministic, minimally invasive clean-up that runs BEFORE chunking and
+ * strips typical PDF/DOC extraction artefacts:
+ *   - table-of-contents (TOC) blocks
+ *   - page numbers / "Seite X von Y" / "Page X of Y"
+ *   - repeated header/footer lines
+ *   - dot-leader lines (".... 12")
+ *
+ * Guardrails:
+ * - no semantic rewriting
+ * - no randomness
+ * - real body-text paragraphs must survive
+ */
+final class DocumentSanitizer
+{
+    /** Lines with this many characters or more are never treated as a header or TOC heading. */
+    private const MAX_HEADER_LEN = 120;
+
+    /** How often an identical line must occur before it counts as a repeated header/footer. */
+    private const REPEAT_HEADER_MIN_COUNT = 3;
+
+    /** Inside a TOC, a line of at least this many characters that contains "." is body text. */
+    private const MIN_PARAGRAPH_LEN = 120;
+
+    /** Unrecognised lines tolerated inside a TOC (wrapped entries) before the TOC is considered over. */
+    private const MAX_TOC_PENDING_LINES = 3;
+
+    /**
+     * Runs all clean-up passes over $text.
+     *
+     * @param string $text raw document text; "" is returned unchanged
+     * @return string the cleaned text without leading/trailing whitespace
+     */
+    public function sanitize(string $text): string
+    {
+        if ($text === '') {
+            return '';
+        }
+
+        // Order matters: drop the coarse TOC block first, then the line-based
+        // artefacts (page numbers, dot leaders, repeated headers).
+        $text = $this->normalizeLineEndings($text);
+        $text = $this->removeToc($text);
+        $text = $this->removePageNumbers($text);
+        $text = $this->removeDotLeaderLines($text);
+        $text = $this->removeRepeatedHeaders($text);
+
+        return trim($this->cleanupWhitespace($text));
+    }
+
+    /**
+     * Converts \r\n and lone \r to \n; no other bytes are touched.
+     */
+    private function normalizeLineEndings(string $text): string
+    {
+        return str_replace(["\r\n", "\r"], "\n", $text);
+    }
+
+    /**
+     * Removes the TOC block that follows an "Inhaltsverzeichnis" heading.
+     *
+     * Heuristic:
+     * - Start: a heading-length line containing "Inhaltsverzeichnis" (case-insensitive).
+     * - Inside the TOC, dot-leader lines and "number + title + page" lines are dropped.
+     * - Other lines are only held back: they are dropped when a further TOC entry
+     *   follows (wrapped entry), otherwise they are restored once the TOC ends.
+     * - End: a paragraph-like line (long and containing "."), or more than
+     *   MAX_TOC_PENDING_LINES held-back lines in a row. Without the second rule a
+     *   document whose lines are all wrapped below MIN_PARAGRAPH_LEN would be
+     *   deleted completely after the TOC heading.
+     */
+    private function removeToc(string $text): string
+    {
+        $kept = [];
+        $pending = [];      // held-back lines (including blank lines between them)
+        $pendingCount = 0;  // non-blank lines in $pending
+        $inToc = false;
+
+        foreach (explode("\n", $text) as $line) {
+            $trim = trim($line);
+
+            if (!$inToc) {
+                $isTocHeading = $trim !== ''
+                    && mb_strlen($trim) < self::MAX_HEADER_LEN
+                    && stripos($trim, 'inhaltsverzeichnis') !== false;
+                if ($isTocHeading) {
+                    $inToc = true;
+                } else {
+                    $kept[] = $line;
+                }
+                continue;
+            }
+
+            if ($trim === '') {
+                // Blank lines directly inside the TOC vanish; blank lines after a
+                // held-back line stay with it so paragraph breaks can be restored.
+                if ($pending !== []) {
+                    $pending[] = $line;
+                }
+                continue;
+            }
+
+            if ($this->looksLikeDotLeaderLine($trim) || $this->looksLikeNumberedTocLine($trim)) {
+                // Still inside the TOC: whatever was held back belonged to it.
+                $pending = [];
+                $pendingCount = 0;
+                continue;
+            }
+
+            $pending[] = $line;
+            $pendingCount++;
+
+            $isParagraph = mb_strlen($trim) >= self::MIN_PARAGRAPH_LEN && str_contains($trim, '.');
+            if ($isParagraph || $pendingCount > self::MAX_TOC_PENDING_LINES) {
+                array_push($kept, ...$pending);
+                $pending = [];
+                $pendingCount = 0;
+                $inToc = false;
+            }
+        }
+
+        // Input ended while lines were still held back: restore them.
+        return implode("\n", [...$kept, ...$pending]);
+    }
+
+    /**
+     * Removes isolated page-number lines such as "Seite 3 von 20", "Page 12 of 34",
+     * "- 4 -" or a bare number of up to three digits. Only whole-line matches are
+     * removed, so numbers inside body text are untouched; blank lines are kept.
+     */
+    private function removePageNumbers(string $text): string
+    {
+        $patterns = [
+            '/^seite\s+\d+(\s+von\s+\d+)?$/iu', // "Seite 3" / "Seite 3 von 20"
+            '/^page\s+\d+(\s+of\s+\d+)?$/iu',   // "Page 12" / "Page 12 of 34"
+            '/^[-–]?\s?\d{1,3}\s?[-–]?$/u',     // "- 4 -" / "4" / "– 4 –"
+        ];
+
+        return $this->filterLines($text, static function (string $trim) use ($patterns): bool {
+            foreach ($patterns as $pattern) {
+                if (preg_match($pattern, $trim) === 1) {
+                    return false;
+                }
+            }
+
+            return true;
+        });
+    }
+
+    /**
+     * Removes dot-leader lines anywhere in the text (not only inside the TOC),
+     * e.g. "Kapitel ......... 12".
+     */
+    private function removeDotLeaderLines(string $text): string
+    {
+        return $this->filterLines($text, fn (string $trim): bool => !$this->looksLikeDotLeaderLine($trim));
+    }
+
+    /**
+     * Removes header/footer lines that repeat throughout the document.
+     *
+     * A line is dropped only when it is shorter than MAX_HEADER_LEN characters and
+     * its trimmed text occurs at least REPEAT_HEADER_MIN_COUNT times; counting the
+     * trimmed text means differing indentation does not hide a repetition.
+     * Blank lines are always kept.
+     */
+    private function removeRepeatedHeaders(string $text): string
+    {
+        $counts = array_count_values(array_map(trim(...), explode("\n", $text)));
+
+        return $this->filterLines($text, static fn (string $trim): bool =>
+            mb_strlen($trim) >= self::MAX_HEADER_LEN
+            || ($counts[$trim] ?? 0) < self::REPEAT_HEADER_MIN_COUNT);
+    }
+
+    /**
+     * Collapses runs of three or more newlines into two, i.e. at most one empty
+     * line between paragraphs.
+     * If PCRE fails the text is returned unchanged rather than emptied.
+     */
+    private function cleanupWhitespace(string $text): string
+    {
+        return preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;
+    }
+
+    // =========================================================
+    // Heuristics and helpers (isolated, testable)
+    // =========================================================
+
+    /**
+     * Applies $keep to the trimmed form of every non-blank line and drops the lines
+     * it rejects; blank lines are always preserved.
+     *
+     * @param callable(string): bool $keep
+     */
+    private function filterLines(string $text, callable $keep): string
+    {
+        $kept = [];
+        foreach (explode("\n", $text) as $line) {
+            $trim = trim($line);
+            if ($trim === '' || $keep($trim)) {
+                $kept[] = $line;
+            }
+        }
+
+        return implode("\n", $kept);
+    }
+
+    /**
+     * True for "Text ..... 12": at least five dots followed by a trailing page number.
+     */
+    private function looksLikeDotLeaderLine(string $trimmedLine): bool
+    {
+        return preg_match('/^.+\.{5,}\s*\d+$/u', $trimmedLine) === 1;
+    }
+
+    /**
+     * True for "2.1 Kapitelname 12" / "3 Kapitelname 7": chapter number, title, page number.
+     */
+    private function looksLikeNumberedTocLine(string $trimmedLine): bool
+    {
+        return preg_match('/^\d+(\.\d+)*\s+.+\s+\d+$/u', $trimmedLine) === 1;
+    }
+}
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -7,14 +7,16 @@ namespace App\Knowledge\Ingest;
 use App\Entity\DocumentVersion;
 use App\Knowledge\Text\TextNormalizer;
 use App\Repository\DocumentVersionRepository;
+use App\Ingest\DocumentSanitizer;

-final readonly  class KnowledgeIngestService
+final readonly class KnowledgeIngestService
 {
    public function __construct(
        private DocumentLoader            $loader,
        private SimpleChunker             $chunker,
        private DocumentVersionRepository $versionRepo,
-        private TextNormalizer            $textNormalizer
+        private TextNormalizer            $textNormalizer,
+        private DocumentSanitizer         $documentSanitizer, // ✅ NEU
    )
    {
    }
@@ -26,8 +28,13 @@ final readonly  class KnowledgeIngestService
     */
    public function buildChunkRecords(DocumentVersion $version): iterable
    {
+        // 1️⃣ Rohtext laden
        $text = $this->loader->load($version->getFilePath());

+        // 2️⃣ 🔥 Deterministische Vorverarbeitung (NEU)
+        $text = $this->documentSanitizer->sanitize($text);
+
+        // 3️⃣ Chunking
        $chunks = $this->chunker->chunk($text);

        $doc = $version->getDocument();
@@ -41,7 +48,6 @@ final readonly  class KnowledgeIngestService
        foreach ($chunks as $chunkText) {

            if ($title !== '' && !str_starts_with($chunkText, $title)) {
-                //title with backticks
                $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
            }

@@ -57,13 +63,13 @@ final readonly  class KnowledgeIngestService
            );

            yield [
-                'chunk_id' => $chunkId,
+                'chunk_id'    => $chunkId,
                'document_id' => $documentId,
-                'version_id' => $versionId,
+                'version_id'  => $versionId,
                'chunk_index' => $index++,
-                'text' => $chunkText,
-                'checksum' => sha1($chunkText),
-                'metadata' => $this->buildMetadata($version),
+                'text'        => $chunkText,
+                'checksum'    => sha1($chunkText),
+                'metadata'    => $this->buildMetadata($version),
            ];
        }
    }
--- a/src/Knowledge/Ingest/SimpleChunker.php
+++ b/src/Knowledge/Ingest/SimpleChunker.php
@@ -10,10 +10,9 @@ use App\Knowledge\Text\TextNormalizer;

 final readonly class SimpleChunker
 {
-
    public function __construct(
        private IndexConfigurationProvider $configurationProvider,
-        private TextNormalizer             $textNormalizer
+        private TextNormalizer $textNormalizer
    )
    {
    }
@@ -23,7 +22,7 @@ final readonly class SimpleChunker
    {
        $config = $this->configurationProvider->getConfiguration();

-        $maxWords = $config->getChunkSize();
+        $maxWords     = $config->getChunkSize();
        $overlapWords = $config->getChunkOverlap();

        $text = $this->textNormalizer->normalize($text);
@@ -31,6 +30,74 @@ final readonly class SimpleChunker
            return [];
        }

+        // ======================================================
+        // HYBRID: Erst Absatzbasiert sammeln
+        // ======================================================
+
+        $paragraphs = preg_split('/\n{2,}/u', $text);
+        if (!$paragraphs) {
+            return [];
+        }
+
+        $chunks = [];
+        $currentChunk = '';
+        $currentWordCount = 0;
+
+        foreach ($paragraphs as $paragraph) {
+
+            $paragraph = trim($paragraph);
+            if ($paragraph === '') {
+                continue;
+            }
+
+            $paragraphWordCount = $this->countWords($paragraph);
+
+            // Falls einzelner Absatz größer als maxWords → Fallback
+            if ($paragraphWordCount > $maxWords) {
+
+                if ($currentChunk !== '') {
+                    $chunks[] = trim($currentChunk);
+                    $currentChunk = '';
+                    $currentWordCount = 0;
+                }
+
+                foreach ($this->chunkByWords($paragraph, $maxWords, $overlapWords) as $subChunk) {
+                    $chunks[] = $subChunk;
+                }
+
+                continue;
+            }
+
+            // Absatz passt noch in aktuellen Chunk
+            if ($currentWordCount + $paragraphWordCount <= $maxWords) {
+                $currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
+                $currentWordCount += $paragraphWordCount;
+                continue;
+            }
+
+            // Flush aktueller Chunk
+            if ($currentChunk !== '') {
+                $chunks[] = trim($currentChunk);
+            }
+
+            $currentChunk = $paragraph;
+            $currentWordCount = $paragraphWordCount;
+        }
+
+        if ($currentChunk !== '') {
+            $chunks[] = trim($currentChunk);
+        }
+
+        return $this->dedupe($chunks);
+    }
+
+    // ======================================================
+    // Wortbasierter Fallback (Original-Logik beibehalten)
+    // ======================================================
+
+    /** @return string[] */
+    private function chunkByWords(string $text, int $maxWords, int $overlapWords): array
+    {
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
@@ -61,7 +128,7 @@ final readonly class SimpleChunker
            $wordEnd = min($wordPos + $maxWords, $totalWords);

            $tokenStart = $wordTokenIndexes[$wordPos];
-            $tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
+            $tokenEnd   = $wordTokenIndexes[$wordEnd - 1] + 1;

            $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);

@@ -82,7 +149,7 @@ final readonly class SimpleChunker
            $wordPos = max(0, $wordEnd - $overlapWords);
        }

-        return $this->dedupe($chunks);
+        return $chunks;
    }

    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
@@ -110,11 +177,17 @@ final readonly class SimpleChunker
        return $end;
    }

+    /** Counts whitespace-separated words; empty text or a failed split yields 0. */
+    private function countWords(string $text): int
+    {
+        return count(preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY) ?: []);
+    }
+
    /** @param string[] $chunks @return string[] */
    private function dedupe(array $chunks): array
    {
        $seen = [];
-        $out = [];
+        $out  = [];

        foreach ($chunks as $chunk) {
            $key = mb_strtolower(
@@ -131,4 +204,4 @@ final readonly class SimpleChunker

        return $out;
    }
-}
+}