optimize ingesting documents

2026-02-28 22:48:01 +01:00
parent 54ce057ef0
commit 509ba83ac0
6 changed files with 335 additions and 21 deletions
--- a/python/vector/vector_ingest.py
+++ b/python/vector/vector_ingest.py
@@ -96,8 +96,8 @@ print("Encoding embeddings...")
 embeddings = model.encode(
    texts,
    normalize_embeddings=True,
-    show_progress_bar=True,
+    show_progress_bar=False,
-    batch_size=64
+    batch_size=128
 )
 embeddings = np.array(embeddings).astype("float32")
--- a/python/vector/vector_ingest_tags.py
+++ b/python/vector/vector_ingest_tags.py
@@ -109,8 +109,8 @@ if not texts:
 embeddings = model.encode(
    texts,
    normalize_embeddings=True,
-    show_progress_bar=False,
+    show_progress_bar=True,
-    batch_size=64
+    batch_size=128
 )
 embeddings = np.array(embeddings).astype("float32")
--- a/src/Ingest/ChunkWriteService.php
+++ b/src/Ingest/ChunkWriteService.php
@@ -11,8 +11,10 @@ use Symfony\Component\Uid\Uuid;
 final readonly class ChunkWriteService
 {
    public function __construct(
-        private ChunkManager $chunkManager,
+        private ChunkManager $chunkManager
-    ) {}
+    )
    {
    }
    public function countAllChunks(): int
    {
@@ -41,4 +43,5 @@ final readonly class ChunkWriteService
    {
        $this->chunkManager->rewriteAll($allChunks);
    }
 }
--- a/src/Ingest/DocumentSanitizer.php
+++ b/src/Ingest/DocumentSanitizer.php
@@ -0,0 +1,232 @@
 <?php
 declare(strict_types=1);
 namespace App\Ingest;
 /**
 * DocumentSanitizer
 *
 * Goal (deterministic, minimally invasive):
 * - Removes typical PDF/DOC extraction artifacts BEFORE chunking:
 *   - table-of-contents (TOC) blocks
 *   - page numbers / "Seite X von Y" / "Page X of Y"
 *   - recurring header/footer lines
 *   - dot-leader lines (".... 12")
 *
 * Guardrails:
 * - No semantic rewriting
 * - No randomness
 * - No removal of real body paragraphs; when a heuristic is unsure,
 *   the line is kept rather than dropped
 *
 * All length limits are measured in bytes (strlen), not characters.
 */
 final class DocumentSanitizer
 {
    /** Repeated lines are only treated as header/footer when shorter than this. */
    private const MAX_HEADER_LEN = 120;
    /** Minimum number of identical (trimmed) lines to count as a recurring header/footer. */
    private const REPEAT_HEADER_MIN_COUNT = 3;
    /** A line at least this long that contains a full stop is considered body text. */
    private const MIN_PARAGRAPH_LEN = 120;
    /** Only lines up to this length can be a TOC heading (prose that merely mentions the word is longer). */
    private const MAX_TOC_HEADING_LEN = 60;
    /** Consecutive non-entry lines tolerated inside a TOC (e.g. wrapped titles) before the TOC is considered over. */
    private const MAX_TOC_STRAY_LINES = 2;

    /**
     * Runs all cleanup passes over the raw document text.
     *
     * @param string $text raw extracted document text
     * @return string sanitized text, trimmed; '' for empty input
     */
    public function sanitize(string $text): string
    {
        if ($text === '') {
            return '';
        }
        $text = $this->normalizeLineEndings($text);
        // Order matters: remove the coarse block (TOC) first, then the
        // line-based artifacts (page numbers, dot leaders, repeated headers).
        $text = $this->removeToc($text);
        $text = $this->removePageNumbers($text);
        $text = $this->removeDotLeaderLines($text);
        $text = $this->removeRepeatedHeaders($text);
        $text = $this->cleanupWhitespace($text);
        return trim($text);
    }

    /**
     * Normalizes CRLF and lone CR to LF (no encoding change).
     */
    private function normalizeLineEndings(string $text): string
    {
        return str_replace(["\r\n", "\r"], "\n", $text);
    }

    /**
     * Removes a TOC block that follows an "Inhaltsverzeichnis" heading.
     *
     * Heuristic:
     * - Start: a short line (<= MAX_TOC_HEADING_LEN bytes) containing
     *   "inhaltsverzeichnis" (case-insensitive).
     * - Inside the TOC, lines that look like entries are dropped
     *   (dot leader + page number, or chapter number + text + page number).
     * - Lines that do not look like entries are held back: if another TOC
     *   entry follows, they were part of the TOC (e.g. a wrapped title) and
     *   are dropped together with it.
     * - End: a paragraph-like line (long and containing a full stop), or more
     *   than MAX_TOC_STRAY_LINES consecutive non-entry lines. The held-back
     *   lines are then emitted unchanged, so body text after the TOC is never
     *   lost, even when it is hard-wrapped into short lines.
     */
    private function removeToc(string $text): string
    {
        $kept = [];
        // Lines seen inside the TOC whose fate is not decided yet.
        $pending = [];
        $strayCount = 0;
        $inToc = false;
        foreach (explode("\n", $text) as $line) {
            $trimmed = trim($line);
            if (!$inToc) {
                if ($this->looksLikeTocHeading($trimmed)) {
                    $inToc = true;
                    $pending = [];
                    $strayCount = 0;
                    continue;
                }
                $kept[] = $line;
                continue;
            }
            // Blank lines are held back too, so paragraph breaks survive if
            // the surrounding lines turn out to be body text.
            if ($trimmed === '') {
                $pending[] = $line;
                continue;
            }
            if ($this->looksLikeDotLeaderLine($trimmed) || $this->looksLikeNumberedTocLine($trimmed)) {
                // Still inside the TOC: everything held back belonged to it.
                $pending = [];
                $strayCount = 0;
                continue;
            }
            $pending[] = $line;
            $strayCount++;
            if ($this->looksLikeParagraph($trimmed) || $strayCount > self::MAX_TOC_STRAY_LINES) {
                array_push($kept, ...$pending);
                $pending = [];
                $strayCount = 0;
                $inToc = false;
            }
        }
        // Document ended while still undecided: keep rather than drop.
        return implode("\n", [...$kept, ...$pending]);
    }

    /**
     * Removes isolated page-number lines; blank lines and body text are kept.
     */
    private function removePageNumbers(string $text): string
    {
        return $this->dropLines(
            $text,
            fn (string $trimmed): bool => $this->looksLikePageNumberLine($trimmed)
        );
    }

    /**
     * Removes dot-leader lines anywhere in the text (not only inside a TOC),
     * e.g. "Kapitel ......... 12".
     */
    private function removeDotLeaderLines(string $text): string
    {
        return $this->dropLines(
            $text,
            fn (string $trimmed): bool => $this->looksLikeDotLeaderLine($trimmed)
        );
    }

    /**
     * Removes recurring header/footer lines.
     *
     * Guardrails:
     * - Only lines shorter than MAX_HEADER_LEN bytes
     * - Only when the trimmed line occurs at least REPEAT_HEADER_MIN_COUNT times
     * - Blank lines are always kept
     */
    private function removeRepeatedHeaders(string $text): string
    {
        // Count on trimmed lines so differing indentation does not hide repeats.
        $counts = array_count_values(array_map('trim', explode("\n", $text)));
        return $this->dropLines(
            $text,
            static fn (string $trimmed): bool =>
                strlen($trimmed) < self::MAX_HEADER_LEN
                && ($counts[$trimmed] ?? 0) >= self::REPEAT_HEADER_MIN_COUNT
        );
    }

    /**
     * Collapses runs of 3+ newlines to exactly 2 (deliberately not more aggressive).
     * If the regex engine reports an error, the text is returned unchanged
     * instead of being discarded.
     */
    private function cleanupWhitespace(string $text): string
    {
        $collapsed = preg_replace("/\n{3,}/", "\n\n", $text);
        return $collapsed ?? $text;
    }

    /**
     * Shared line filter: splits on "\n", drops every non-blank line for which
     * $shouldDrop returns true, and joins the rest unchanged.
     *
     * @param callable(string): bool $shouldDrop receives the trimmed, non-empty line
     */
    private function dropLines(string $text, callable $shouldDrop): string
    {
        $kept = [];
        foreach (explode("\n", $text) as $line) {
            $trimmed = trim($line);
            if ($trimmed !== '' && $shouldDrop($trimmed)) {
                continue;
            }
            $kept[] = $line;
        }
        return implode("\n", $kept);
    }

    // =========================================================
    // Heuristics (isolated, testable)
    // =========================================================

    /**
     * True for a short heading-like line containing "inhaltsverzeichnis".
     */
    private function looksLikeTocHeading(string $trimmedLine): bool
    {
        return $trimmedLine !== ''
            && strlen($trimmedLine) <= self::MAX_TOC_HEADING_LEN
            && stripos($trimmedLine, 'inhaltsverzeichnis') !== false;
    }

    /**
     * True for a line that reads like running text: long and containing a full stop.
     */
    private function looksLikeParagraph(string $trimmedLine): bool
    {
        return strlen($trimmedLine) >= self::MIN_PARAGRAPH_LEN && str_contains($trimmedLine, '.');
    }

    /**
     * True for "Seite 3" / "Seite 3 von 20", "Page 12" / "Page 12 of 34",
     * and bare numbers such as "4", "- 4 -" or "– 4 –".
     */
    private function looksLikePageNumberLine(string $trimmedLine): bool
    {
        return (bool)preg_match('/^seite\s+\d+(\s+von\s+\d+)?$/iu', $trimmedLine)
            || (bool)preg_match('/^page\s+\d+(\s+of\s+\d+)?$/iu', $trimmedLine)
            || (bool)preg_match('/^[-–]?\s?\d{1,3}\s?[-–]?$/u', $trimmedLine);
    }

    /**
     * True for "Text ..... 12" (at least 5 dots, page number at the end).
     */
    private function looksLikeDotLeaderLine(string $trimmedLine): bool
    {
        return (bool)preg_match('/^.+\.{5,}\s*\d+$/u', $trimmedLine);
    }

    /**
     * True for "2.1 Kapitelname 12" / "3 Kapitelname 7"
     * (chapter number + text + page number at the end).
     */
    private function looksLikeNumberedTocLine(string $trimmedLine): bool
    {
        return (bool)preg_match('/^\d+(\.\d+)*\s+.+\s+\d+$/u', $trimmedLine);
    }
 }
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -7,6 +7,7 @@ namespace App\Knowledge\Ingest;
 use App\Entity\DocumentVersion;
 use App\Knowledge\Text\TextNormalizer;
 use App\Repository\DocumentVersionRepository;
 use App\Ingest\DocumentSanitizer;
 final readonly class KnowledgeIngestService
 {
@@ -14,7 +15,8 @@ final readonly  class KnowledgeIngestService
        private DocumentLoader            $loader,
        private SimpleChunker             $chunker,
        private DocumentVersionRepository $versionRepo,
-        private TextNormalizer            $textNormalizer
+        private TextNormalizer            $textNormalizer,
        private DocumentSanitizer         $documentSanitizer, // ✅ NEU
    )
    {
    }
@@ -26,8 +28,13 @@ final readonly  class KnowledgeIngestService
     */
    public function buildChunkRecords(DocumentVersion $version): iterable
    {
        // 1️⃣ Rohtext laden
        $text = $this->loader->load($version->getFilePath());
        // 2️⃣ 🔥 Deterministische Vorverarbeitung (NEU)
        $text = $this->documentSanitizer->sanitize($text);
        // 3️⃣ Chunking
        $chunks = $this->chunker->chunk($text);
        $doc = $version->getDocument();
@@ -41,7 +48,6 @@ final readonly  class KnowledgeIngestService
        foreach ($chunks as $chunkText) {
            if ($title !== '' && !str_starts_with($chunkText, $title)) {
                //title with backticks
                $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
            }
--- a/src/Knowledge/Ingest/SimpleChunker.php
+++ b/src/Knowledge/Ingest/SimpleChunker.php
@@ -10,7 +10,6 @@ use App\Knowledge\Text\TextNormalizer;
 final readonly class SimpleChunker
 {
    public function __construct(
        private IndexConfigurationProvider $configurationProvider,
        private TextNormalizer $textNormalizer
@@ -31,6 +30,74 @@ final readonly class SimpleChunker
            return [];
        }
        // ======================================================
        // HYBRID: Erst Absatzbasiert sammeln
        // ======================================================
        $paragraphs = preg_split('/\n{2,}/u', $text);
        if (!$paragraphs) {
            return [];
        }
        $chunks = [];
        $currentChunk = '';
        $currentWordCount = 0;
        foreach ($paragraphs as $paragraph) {
            $paragraph = trim($paragraph);
            if ($paragraph === '') {
                continue;
            }
            $paragraphWordCount = $this->countWords($paragraph);
            // Falls einzelner Absatz größer als maxWords → Fallback
            if ($paragraphWordCount > $maxWords) {
                if ($currentChunk !== '') {
                    $chunks[] = trim($currentChunk);
                    $currentChunk = '';
                    $currentWordCount = 0;
                }
                foreach ($this->chunkByWords($paragraph, $maxWords, $overlapWords) as $subChunk) {
                    $chunks[] = $subChunk;
                }
                continue;
            }
            // Absatz passt noch in aktuellen Chunk
            if ($currentWordCount + $paragraphWordCount <= $maxWords) {
                $currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
                $currentWordCount += $paragraphWordCount;
                continue;
            }
            // Flush aktueller Chunk
            if ($currentChunk !== '') {
                $chunks[] = trim($currentChunk);
            }
            $currentChunk = $paragraph;
            $currentWordCount = $paragraphWordCount;
        }
        if ($currentChunk !== '') {
            $chunks[] = trim($currentChunk);
        }
        return $this->dedupe($chunks);
    }
    // ======================================================
    // Wortbasierter Fallback (Original-Logik beibehalten)
    // ======================================================
    /** @return string[] */
    private function chunkByWords(string $text, int $maxWords, int $overlapWords): array
    {
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
@@ -82,7 +149,7 @@ final readonly class SimpleChunker
            $wordPos = max(0, $wordEnd - $overlapWords);
        }
-        return $this->dedupe($chunks);
+        return $chunks;
    }
    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
@@ -110,6 +177,12 @@ final readonly class SimpleChunker
        return $end;
    }
    /**
     * Counts the whitespace-separated words in $text.
     *
     * Empty fragments are discarded by the split itself, so an empty or
     * whitespace-only string yields 0, and leading/trailing whitespace that
     * trim() would not strip cannot inflate the count.
     *
     * @return int number of words; 0 when the split fails (e.g. invalid UTF-8)
     */
    private function countWords(string $text): int
    {
        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
        return $words === false ? 0 : count($words);
    }
    /** @param string[] $chunks @return string[] */
    private function dedupe(array $chunks): array
    {