optimize system and cleanup

2026-03-02 21:27:20 +01:00
parent 6b8d1b1936
commit e7047cd885
10 changed files with 459 additions and 346 deletions
--- a/src/Ingest/DocumentSanitizer.php
+++ b/src/Ingest/DocumentSanitizer.php
@@ -7,68 +7,65 @@ namespace App\Ingest;
 /**
 * DocumentSanitizer
 *
- * Ziel (deterministisch, minimal-invasiv):
- * - Entfernt typische PDF-/DOC-Artefakte VOR dem Chunking:
- *   - Inhaltsverzeichnis-Blöcke (TOC)
- *   - Seitenzahlen / "Seite X von Y"
- *   - wiederkehrende Header/Footer-Zeilen
- *   - Dot-Leader-Zeilen (".... 12")
+ * Deterministic, minimal-invasive preprocessing BEFORE chunking.
 *
- * Guardrails:
- * - Keine semantische Umschreibung
- * - Keine Zufälligkeit
- * - Kein Entfernen echter Fließtext-Absätze
+ * Removes typical PDF/DOC artefacts:
+ *  - Table of contents blocks
+ *  - Page numbers
+ *  - Repeated headers/footers
+ *  - Dot-leader lines (e.g. "...... 12")
+ *
+ * Design principles:
+ *  - No semantic rewriting
+ *  - No randomness
+ *  - No removal of real paragraphs
+ *  - Type-aware sanitizing (PDF/DOC != MD/TXT)
 */
 final class DocumentSanitizer
 {
    private const MAX_HEADER_LEN = 120;
    private const REPEAT_HEADER_MIN_COUNT = 3;

-    public function sanitize(
-        string $text,
-        string $fileExtension
-    ): string
+    public function sanitize(string $text, string $fileExtension): string
    {
        if ($text === '') {
            return '';
        }

        $text = $this->normalizeLineEndings($text);
-
        $fileExtension = strtolower($fileExtension);

+        // Only PDF/DOC-like formats go through the artefact-removal pipeline
        if (in_array($fileExtension, ['pdf', 'doc', 'docx'], true)) {
-            $text = $this->removeToc($text);
-            $text = $this->removePageNumbers($text);
-            $text = $this->removeDotLeaderLines($text);
-            $text = $this->removeRepeatedHeaders($text);
+            $text = $this->sanitizePdfLike($text);
        }

-        $text = $this->cleanupWhitespace($text);
+        return trim($this->cleanupWhitespace($text));
+    }

-        return trim($text);
+    // =========================================================
+    // PIPELINE (applied to pdf/doc/docx input only)
+    // =========================================================
+
+    private function sanitizePdfLike(string $text): string
+    {
+        $text = $this->removeToc($text);
+        $text = $this->removePageNumbers($text);
+        $text = $this->removeDotLeaderLines($text);
+        $text = $this->removeRepeatedHeaders($text);
+
+        return $text;
    }

    private function normalizeLineEndings(string $text): string
    {
-        // Vereinheitlichen auf \n (deterministisch, kein Encoding-Change)
        return str_replace(["\r\n", "\r"], "\n", $text);
    }

-    /**
-     * Entfernt TOC-Block nach "Inhaltsverzeichnis" bis zum ersten "echten" Absatz.
-     *
-     * Heuristik:
-     * - Start: Zeile enthält "Inhaltsverzeichnis" (case-insensitive)
-     * - Innerhalb TOC werden Zeilen entfernt, die wie TOC-Einträge aussehen:
-     *   - Dot-Leader + Seitenzahl
-     *   - Kapitelnummern + Text + Seitenzahl
-     * - Ende: sobald eine Zeile "absatzartig" wirkt:
-     *   - ausreichend lang UND enthält Satzpunkt (.)
-     *
-     * Guardrail:
-     * - Leere Zeilen innerhalb TOC werden verworfen (damit TOC-Block wirklich weg ist)
-     */
+    // =========================================================
+    // TOC REMOVAL
+    // =========================================================
+
    private function removeToc(string $text): string
    {
        $lines = explode("\n", $text);
@@ -86,24 +83,24 @@ final class DocumentSanitizer
            }

            if ($inToc) {
-                // Innerhalb TOC: leere Zeilen weg (Block entfernen)
                if ($trim === '') {
                    continue;
                }

-                // typische TOC-Zeilen (Leader / Kapitelnummern)
-                if ($this->looksLikeDotLeaderLine($trim) || $this->looksLikeNumberedTocLine($trim)) {
+                if (
+                    $this->looksLikeDotLeaderLine($trim) ||
+                    $this->looksLikeNumberedTocLine($trim)
+                ) {
                    continue;
                }

-                // Ende TOC, wenn "echter Absatz" beginnt (lang + Punkt)
-                if (strlen($trim) >= 120 && str_contains($trim, '.')) {
+                // Ende TOC sobald normale Satzstruktur erkannt wird
+                if (preg_match('/[a-zäöüß]\.\s*$/iu', $trim)) {
                    $inToc = false;
                    $filtered[] = $line;
                    continue;
                }

-                // sonst: solange wir im TOC sind, ignorieren
                continue;
            }

@@ -113,13 +110,10 @@ final class DocumentSanitizer
        return implode("\n", $filtered);
    }

-    /**
-     * Entfernt typische Seitenzahl-Zeilen.
-     *
-     * Guardrails:
-     * - Nur kurze, "isolierte" Zeilen (trim != '')
-     * - Lässt Fließtext unangetastet
-     */
+    // =========================================================
+    // PAGE NUMBERS
+    // =========================================================
+
    private function removePageNumbers(string $text): string
    {
        $lines = explode("\n", $text);
@@ -134,17 +128,22 @@ final class DocumentSanitizer
            }

            // "Seite 3" / "Seite 3 von 20"
-            if (preg_match('/^seite\s+\d+(\s+von\s+\d+)?$/iu', $trim)) {
+            if (preg_match('/^seite\s+\d{1,4}(\s+von\s+\d{1,4})?$/iu', $trim)) {
                continue;
            }

            // "Page 12" / "Page 12 of 34"
-            if (preg_match('/^page\s+\d+(\s+of\s+\d+)?$/iu', $trim)) {
+            if (preg_match('/^page\s+\d{1,4}(\s+of\s+\d{1,4})?$/iu', $trim)) {
                continue;
            }

-            // "- 4 -" / "4" / "– 4 –"
-            if (preg_match('/^[-–]?\s?\d{1,3}\s?[-–]?$/u', $trim)) {
+            // Isolierte Seitenmarker: "- 4 -" oder "– 4 –"
+            if (preg_match('/^[-–]\s?\d{1,4}\s?[-–]$/u', $trim)) {
+                continue;
+            }
+
+            // Nur reine Zahl (max 3 Stellen, um IDs nicht zu killen)
+            if (preg_match('/^\d{1,3}$/u', $trim)) {
                continue;
            }

@@ -154,10 +153,10 @@ final class DocumentSanitizer
        return implode("\n", $filtered);
    }

-    /**
-     * Entfernt Dot-Leader-Zeilen überall (nicht nur im TOC),
-     * z.B.: "Kapitel ......... 12"
-     */
+    // =========================================================
+    // DOT LEADER
+    // =========================================================
+
    private function removeDotLeaderLines(string $text): string
    {
        $lines = explode("\n", $text);
@@ -176,19 +175,14 @@ final class DocumentSanitizer
        return implode("\n", $filtered);
    }

-    /**
-     * Entfernt wiederkehrende Header/Footer-Zeilen.
-     *
-     * Guardrails:
-     * - Nur relativ kurze Zeilen (unter MAX_HEADER_LEN)
-     * - Nur wenn identisch (trim) >= REPEAT_HEADER_MIN_COUNT
-     * - Leere Zeilen bleiben erhalten
-     */
+    // =========================================================
+    // REPEATED HEADERS
+    // =========================================================
+
    private function removeRepeatedHeaders(string $text): string
    {
        $lines = explode("\n", $text);

-        // counts basiert auf trim (damit z.B. unterschiedliche Einrückung nicht zählt)
        $trimmed = array_map('trim', $lines);
        $counts = array_count_values($trimmed);

@@ -211,27 +205,27 @@ final class DocumentSanitizer
        return implode("\n", $filtered);
    }

+    // =========================================================
+    // WHITESPACE
+    // =========================================================
+
    private function cleanupWhitespace(string $text): string
    {
-        // nicht zu aggressiv: nur 3+ Leerzeilen auf 2 reduzieren
-        $text = preg_replace("/\n{3,}/", "\n\n", $text);
-        return $text ?? '';
+        // Collapse runs of 3+ newlines to 2 (one blank line); preg_replace() yields null on a PCRE error, so keep the input then
+        return preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;
    }

    // =========================================================
-    // Heuristics (isoliert, testbar)
+    // HEURISTICS
    // =========================================================

    private function looksLikeDotLeaderLine(string $trimmedLine): bool
    {
-        // "Text ..... 12" (mind. 5 Punkte, Seitenzahl am Ende)
-        return (bool)preg_match('/^.+\.{5,}\s*\d+$/u', $trimmedLine);
+        return (bool)preg_match('/^.+\.{4,}\s*\d+$/u', $trimmedLine); // "Text .... 12": some text, 4+ dots, digits at line end
    }

    private function looksLikeNumberedTocLine(string $trimmedLine): bool
    {
-        // "2.1 Kapitelname 12" / "3 Kapitelname 7"
-        // Kapitelnummern + Text + Seitenzahl am Ende
        return (bool)preg_match('/^\d+(\.\d+)*\s+.+\s+\d+$/u', $trimmedLine);
    }
 }
--- a/src/Ingest/StructureEnhancer.php
+++ b/src/Ingest/StructureEnhancer.php
@@ -4,8 +4,27 @@ declare(strict_types=1);

 namespace App\Ingest;

+/**
+ * StructureEnhancer
+ *
+ * Minimal, deterministic structure hints BEFORE chunking.
+ *
+ * Adds:
+ *  - Heading markers ("## ") for isolated short title lines
+ *  - Bullet markers ("- ") for obvious list runs
+ *
+ * Non-goals:
+ *  - No semantic rewriting
+ *  - No sentence merging
+ *  - No aggressive list guessing
+ */
 final class StructureEnhancer
 {
+    private const MAX_HEADING_LEN = 80;
+
+    private const MAX_LIST_ITEM_LEN = 140;
+    private const MIN_LIST_RUN = 2;
+
    public function enhance(string $text): string
    {
        if ($text === '') {
@@ -13,6 +32,8 @@ final class StructureEnhancer
        }

        $text = $this->normalizeLineEndings($text);
+
+        // Reihenfolge: erst Headings, dann Listen (stabiler fürs Chunking)
        $text = $this->detectHeadings($text);
        $text = $this->detectSimpleLists($text);

@@ -24,6 +45,10 @@ final class StructureEnhancer
        return str_replace(["\r\n", "\r"], "\n", $text);
    }

+    // =========================================================
+    // HEADINGS
+    // =========================================================
+
    private function detectHeadings(string $text): string
    {
        $lines = explode("\n", $text);
@@ -52,22 +77,31 @@ final class StructureEnhancer
            return false;
        }

-        if (strlen($line) > 80) {
+        // Schon Markdown-Heading? Dann nicht anfassen.
+        if (preg_match('/^#{1,6}\s+/u', $line)) {
            return false;
        }

-        if (str_ends_with($line, '.')) {
+        if (mb_strlen($line) > self::MAX_HEADING_LEN) {
            return false;
        }

+        // Heading soll kein "Satz" sein
+        if (preg_match('/[.!?]\s*$/u', $line)) {
+            return false;
+        }
+
+        // Keine typischen Satz-Kommas (zu risky)
        if (str_contains($line, ',')) {
            return false;
        }

-        if (preg_match('/\d+\.\d+/', $line)) {
+        // Nummerierte Kapitel "1.2" / "2.3.4" nicht zwangs-heading-en
+        if (preg_match('/\b\d+\.\d+(\.\d+)*\b/u', $line)) {
            return false;
        }

+        // Muss "isoliert" stehen (leerzeile davor und danach)
        $prev = $lines[$index - 1] ?? '';
        $next = $lines[$index + 1] ?? '';

@@ -75,48 +109,81 @@ final class StructureEnhancer
            return false;
        }

+        // Guardrail: mindestens ein Buchstabe
+        if (!preg_match('/\p{L}/u', $line)) {
+            return false;
+        }
+
+        // Klassiker: UPPERCASE oder Title Case
        $uppercaseRatio = $this->uppercaseRatio($line);
-        if ($uppercaseRatio > 0.6) {
+        if ($uppercaseRatio >= 0.65) {
            return true;
        }

-        if ($this->isTitleCase($line)) {
-            return true;
-        }
-
-        return false;
+        return $this->isTitleCase($line);
    }

    private function uppercaseRatio(string $line): float
    {
-        $letters = preg_replace('/[^a-zA-ZÄÖÜäöü]/u', '', $line);
-        if ($letters === '') {
-            return 0;
+        $letters = preg_replace('/[^\p{L}]/u', '', $line); // keep Unicode letters only
+        if ($letters === '' || $letters === null) { // no letters, or preg_replace() failed
+            return 0.0;
        }

-        $upper = preg_replace('/[^A-ZÄÖÜ]/u', '', $letters);
+        $upper = preg_replace('/[^\p{Lu}]/u', '', $letters); // keep uppercase letters only
+        if ($upper === null) {
+            return 0.0;
+        }

-        return mb_strlen($upper) / mb_strlen($letters);
+        $lettersLen = mb_strlen($letters);
+        if ($lettersLen === 0) {
+            return 0.0;
+        }
+
+        return mb_strlen($upper) / $lettersLen; // share of uppercase letters among all letters
    }

    private function isTitleCase(string $line): bool
    {
-        $words = explode(' ', $line);
-        $count = 0;
+        $words = preg_split('/\s+/u', trim($line)); // split on any whitespace run
+        if (!$words) {
+            return false;
+        }
+
+        $wordCount = 0;
+        $capCount = 0;

        foreach ($words as $word) {
+            $word = trim($word);
            if ($word === '') {
                continue;
            }

-            if (mb_strtoupper(mb_substr($word, 0, 1)) === mb_substr($word, 0, 1)) {
-                $count++;
+            // Ignore tokens without any letter (numbers, bullets, punctuation)
+            if (!preg_match('/\p{L}/u', $word)) {
+                continue;
+            }
+
+            $wordCount++;
+
+            // Judge the first *letter*, not the first character: "(draft)" or "3rd" must not count as capitalised
+            if (preg_match('/^\P{L}*(?!\p{Ll})\p{L}/u', $word) === 1) {
+                $capCount++;
            }
        }

-        return $count >= max(1, intdiv(count($words), 2));
+        if ($wordCount === 0) {
+            return false;
+        }
+
+        // At least half of the letter-bearing words (rounded up) must start with a non-lowercase letter
+        return $capCount >= max(1, intdiv($wordCount + 1, 2));
    }

+    // =========================================================
+    // LISTS
+    // =========================================================
+
    private function detectSimpleLists(string $text): string
    {
        $lines = explode("\n", $text);
@@ -127,36 +194,45 @@ final class StructureEnhancer
        foreach ($lines as $line) {
            $trim = trim($line);

+            // Bereits echte Liste? → nicht anfassen
+            if (preg_match('/^-\s+/u', $trim) || preg_match('/^\d+\.\s+/u', $trim)) {
+                $this->flushListBuffer($buffer, $out);
+                $out[] = $line;
+                continue;
+            }
+
            if ($this->isListCandidate($trim)) {
                $buffer[] = $trim;
                continue;
            }

-            if (count($buffer) >= 2) {
-                foreach ($buffer as $item) {
-                    $out[] = '- ' . $item;
-                }
-            } else {
-                foreach ($buffer as $item) {
-                    $out[] = $item;
-                }
-            }
-
-            $buffer = [];
+            $this->flushListBuffer($buffer, $out);
            $out[] = $line;
        }

-        if (count($buffer) >= 2) {
+        $this->flushListBuffer($buffer, $out);
+
+        return implode("\n", $out);
+    }
+
+    private function flushListBuffer(array &$buffer, array &$out): void
+    {
+        if ($buffer === []) {
+            return;
+        }
+
+        if (count($buffer) >= self::MIN_LIST_RUN) {
            foreach ($buffer as $item) {
                $out[] = '- ' . $item;
            }
        } else {
+            // Run shorter than MIN_LIST_RUN: emit the buffered lines without a bullet prefix (no guessing)
            foreach ($buffer as $item) {
                $out[] = $item;
            }
        }

-        return implode("\n", $out);
+        $buffer = []; // reset the caller's buffer (passed by reference)
    }

    private function isListCandidate(string $line): bool
@@ -165,18 +241,32 @@ final class StructureEnhancer
            return false;
        }

-        if (strlen($line) > 120) {
+        // zu lang = ziemlich sicher Absatz/Satz
+        if (mb_strlen($line) > self::MAX_LIST_ITEM_LEN) {
            return false;
        }

-        if (str_ends_with($line, '.')) {
+        // wenn es wie ein Satz endet, nicht als Liste
+        if (preg_match('/[.!?]\s*$/u', $line)) {
            return false;
        }

+        // "Key: Value" ist typischerweise keine Liste
        if (str_contains($line, ':')) {
            return false;
        }

+        // Wenn es ein kompletter Satz sein könnte (Verb/Artikel), nicht raten:
+        // -> minimaler Guardrail: beginnt mit Großbuchstabe UND enthält mindestens 5 Wörter => eher Satz/Absatz
+        $words = preg_split('/\s+/u', trim($line));
+        if ($words && count($words) >= 5) {
+            $first = mb_substr($line, 0, 1);
+            if ($first !== '' && mb_strtoupper($first) === $first) {
+                return false;
+            }
+        }
+
+        // nur "kurze, stichpunktartige" Zeilen als Kandidat akzeptieren
        return true;
    }
 }