optimize ingesting documents

2026-02-28 23:19:47 +01:00
parent 509ba83ac0
commit e7a315c147
3 changed files with 223 additions and 19 deletions
--- a/src/Ingest/StructureEnhancer.php
+++ b/src/Ingest/StructureEnhancer.php
@@ -0,0 +1,182 @@
+<?php
+
+declare(strict_types=1);
+
+namespace App\Ingest;
+
+/**
+ * Adds lightweight Markdown structure (headings and bullet lists) to plain text.
+ *
+ * Detection is heuristic and purely line based. Lines that already start with a
+ * Markdown heading or bullet marker are left untouched, so Markdown input does
+ * not end up with stacked markers such as "## # Title" or "- - item".
+ */
+final class StructureEnhancer
+{
+    /** Longest line (in characters) that may still be promoted to a heading. */
+    private const MAX_HEADING_LENGTH = 80;
+
+    /** Longest line (in characters) that may still be treated as a list item. */
+    private const MAX_LIST_ITEM_LENGTH = 120;
+
+    /** Minimum number of consecutive candidate lines that form a list. */
+    private const MIN_LIST_RUN = 2;
+
+    /** Share of uppercase letters above which a line counts as an "ALL CAPS" heading. */
+    private const UPPERCASE_HEADING_RATIO = 0.6;
+
+    /**
+     * Runs the full enhancement pipeline: line-ending normalisation, heading
+     * detection, then simple list detection.
+     *
+     * @param string $text Raw document text.
+     *
+     * @return string The text with "## " heading and "- " bullet prefixes added;
+     *                an empty input yields an empty string.
+     */
+    public function enhance(string $text): string
+    {
+        if ($text === '') {
+            return '';
+        }
+
+        $text = $this->normalizeLineEndings($text);
+        $text = $this->detectHeadings($text);
+
+        return $this->detectSimpleLists($text);
+    }
+
+    /**
+     * Converts CRLF and lone CR line endings to LF.
+     */
+    private function normalizeLineEndings(string $text): string
+    {
+        return str_replace(["\r\n", "\r"], "\n", $text);
+    }
+
+    /**
+     * Prefixes every line that qualifies as a heading with "## ".
+     *
+     * Promoted lines are emitted trimmed; all other lines are passed through unchanged.
+     */
+    private function detectHeadings(string $text): string
+    {
+        $lines = explode("\n", $text);
+        $out = [];
+
+        foreach ($lines as $index => $line) {
+            $trimmed = trim($line);
+
+            $out[] = $this->isHeadingCandidate($trimmed, $lines, $index)
+                ? '## ' . $trimmed
+                : $line;
+        }
+
+        return implode("\n", $out);
+    }
+
+    /**
+     * Decides whether a trimmed line looks like a heading.
+     *
+     * A heading must be short, must not look like a sentence (no trailing period,
+     * no comma, no decimal number), must be surrounded by blank lines (or the
+     * document boundary) and must be either mostly uppercase or title cased.
+     *
+     * @param string       $line  The trimmed line under test.
+     * @param list<string> $lines All lines of the document (untrimmed).
+     * @param int          $index Position of $line within $lines.
+     */
+    private function isHeadingCandidate(string $line, array $lines, int $index): bool
+    {
+        if ($line === '' || $this->isAlreadyMarkedUp($line)) {
+            return false;
+        }
+
+        // Character count, not byte count: umlauts must not shorten the limit.
+        if (mb_strlen($line) > self::MAX_HEADING_LENGTH) {
+            return false;
+        }
+
+        if (str_ends_with($line, '.') || str_contains($line, ',')) {
+            return false;
+        }
+
+        if (preg_match('/\d+\.\d+/', $line) === 1) {
+            return false;
+        }
+
+        // Missing neighbours (first/last line) count as blank.
+        $prev = $lines[$index - 1] ?? '';
+        $next = $lines[$index + 1] ?? '';
+
+        if (trim($prev) !== '' || trim($next) !== '') {
+            return false;
+        }
+
+        return $this->uppercaseRatio($line) > self::UPPERCASE_HEADING_RATIO
+            || $this->isTitleCase($line);
+    }
+
+    /**
+     * Returns the share of uppercase letters among all letters of the line.
+     *
+     * Uses Unicode letter properties, so letters outside a-z and the German
+     * umlauts are counted too. Lines without letters, or lines that cannot be
+     * matched as UTF-8, yield 0.0.
+     */
+    private function uppercaseRatio(string $line): float
+    {
+        $letterCount = preg_match_all('/\p{L}/u', $line);
+        if ($letterCount === false || $letterCount === 0) {
+            return 0.0;
+        }
+
+        $upperCount = preg_match_all('/\p{Lu}/u', $line);
+
+        return (float) $upperCount / $letterCount;
+    }
+
+    /**
+     * Checks whether at least half (rounded down, minimum one) of the words
+     * start with an uppercase letter.
+     *
+     * Only real uppercase letters count; digits and punctuation do not, so an
+     * isolated page number such as "42" is not mistaken for a title.
+     */
+    private function isTitleCase(string $line): bool
+    {
+        $words = preg_split('/\s+/', $line, -1, PREG_SPLIT_NO_EMPTY);
+        if ($words === false || $words === []) {
+            return false;
+        }
+
+        $capitalised = 0;
+
+        foreach ($words as $word) {
+            if (preg_match('/^\p{Lu}/u', $word) === 1) {
+                $capitalised++;
+            }
+        }
+
+        return $capitalised >= max(1, intdiv(count($words), 2));
+    }
+
+    /**
+     * Turns runs of consecutive list-candidate lines into "- " bullet items.
+     *
+     * A run shorter than MIN_LIST_RUN is emitted exactly as it appeared in the
+     * input, including its original indentation.
+     *
+     * NOTE(review): any short line without a trailing period or a colon is a
+     * candidate, so consecutive hard-wrapped prose lines are bulleted as well.
+     * Confirm that upstream normalisation joins such lines before this step.
+     */
+    private function detectSimpleLists(string $text): string
+    {
+        $out = [];
+        $run = [];
+
+        foreach (explode("\n", $text) as $line) {
+            if ($this->isListCandidate(trim($line))) {
+                $run[] = $line;
+                continue;
+            }
+
+            array_push($out, ...$this->renderRun($run));
+            $out[] = $line;
+            $run = [];
+        }
+
+        // Flush a run that reaches the end of the document.
+        array_push($out, ...$this->renderRun($run));
+
+        return implode("\n", $out);
+    }
+
+    /**
+     * Renders a run of candidate lines: as trimmed bullet items when the run is
+     * long enough to be a list, otherwise unchanged.
+     *
+     * @param list<string> $run Consecutive candidate lines (untrimmed).
+     *
+     * @return list<string>
+     */
+    private function renderRun(array $run): array
+    {
+        if (count($run) < self::MIN_LIST_RUN) {
+            return $run;
+        }
+
+        return array_map(
+            static fn (string $line): string => '- ' . trim($line),
+            $run,
+        );
+    }
+
+    /**
+     * Decides whether a trimmed line may be part of a simple list: non-empty,
+     * short, not already marked up, not ending with a period and free of colons.
+     */
+    private function isListCandidate(string $line): bool
+    {
+        if ($line === '' || $this->isAlreadyMarkedUp($line)) {
+            return false;
+        }
+
+        if (mb_strlen($line) > self::MAX_LIST_ITEM_LENGTH) {
+            return false;
+        }
+
+        return !str_ends_with($line, '.') && !str_contains($line, ':');
+    }
+
+    /**
+     * Tells whether a trimmed line already starts with a Markdown heading marker
+     * ("#" to "######") or a bullet marker ("-", "*", "+", "•") followed by whitespace.
+     */
+    private function isAlreadyMarkedUp(string $line): bool
+    {
+        return preg_match('/^(?:#{1,6}|[-*+]|\x{2022})\s/u', $line) === 1;
+    }
+}
--- a/src/Knowledge/Ingest/DocumentLoader.php
+++ b/src/Knowledge/Ingest/DocumentLoader.php
@@ -20,10 +20,6 @@ final class DocumentLoader
        return match ($ext) {
            'txt', 'md' => $this->loadText($path),
            'pdf'       => $this->loadPdf($path),
-
-            // vorbereitet für später:
-            // 'docx'   => $this->loadDocx($path),
-
            default     => throw new \RuntimeException("Unsupported file type: .{$ext}"),
        };
    }
@@ -46,27 +42,48 @@ final class DocumentLoader
            $pdf  = $parser->parseFile($path);
            $text = $pdf->getText();
        } catch (\Throwable $e) {
-            throw new \RuntimeException("Failed to parse PDF: {$path}. Error: " . $e->getMessage(), 0, $e);
+            throw new \RuntimeException(
+                "Failed to parse PDF: {$path}. Error: " . $e->getMessage(),
+                0,
+                $e
+            );
        }

        return $this->normalize($text);
    }

    /**
-     * Zentraler Normalizer für alle Dokumenttypen
+     * Zentraler Normalizer für alle Dokumenttypen.
+     * Rein formal – keine Domain-Logik.
     */
    private function normalize(string $text): string
    {
-        // Silbentrennung entfernen
+        if ($text === '') {
+            return '';
+        }
+
+        // 1️⃣ Silbentrennung entfernen (Wort-\nFortsetzung)
        $text = preg_replace('/-\n/', '', $text);

-        // Windows-Zeilenumbrüche
-        $text = str_replace("\r\n", "\n", $text);
+        // 2️⃣ Einheitliche Zeilenumbrüche
+        $text = str_replace(["\r\n", "\r"], "\n", $text);

-        // Mehrfache Leerzeichen
+        // 3️⃣ Harte PDF-Zeilenumbrüche reparieren:
+        // Wenn Zeile nicht mit Punkt endet und nächste mit Kleinbuchstabe beginnt → zusammenführen
+        $text = preg_replace(
+            '/([^\.\!\?\:\n])\n([a-zäöü])/u',
+            '$1 $2',
+            $text
+        );
+
+        // 4️⃣ Inline-Listen stabilisieren:
+        // " - Punkt - Punkt" → echte neue Zeile
+        $text = preg_replace('/\s-\s/', "\n- ", $text);
+
+        // 5️⃣ Mehrfache Leerzeichen reduzieren
        $text = preg_replace('/[ \t]+/', ' ', $text);

-        // Mehrfache Leerzeilen
+        // 6️⃣ Mehrfache Leerzeilen reduzieren
        $text = preg_replace('/\n{3,}/', "\n\n", $text);

        return trim($text);
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -8,6 +8,7 @@ use App\Entity\DocumentVersion;
 use App\Knowledge\Text\TextNormalizer;
 use App\Repository\DocumentVersionRepository;
 use App\Ingest\DocumentSanitizer;
+use App\Ingest\StructureEnhancer;

 final readonly class KnowledgeIngestService
 {
@@ -16,7 +17,8 @@ final readonly class KnowledgeIngestService
        private SimpleChunker             $chunker,
        private DocumentVersionRepository $versionRepo,
        private TextNormalizer            $textNormalizer,
-        private DocumentSanitizer         $documentSanitizer, // ✅ NEU
+        private DocumentSanitizer         $documentSanitizer,
+        private StructureEnhancer         $structureEnhancer, // ✅ NEU
    )
    {
    }
@@ -31,10 +33,13 @@ final readonly class KnowledgeIngestService
        // 1️⃣ Rohtext laden
        $text = $this->loader->load($version->getFilePath());

-        // 2️⃣ 🔥 Deterministische Vorverarbeitung (NEU)
+        // 2️⃣ Deterministische Textbereinigung
        $text = $this->documentSanitizer->sanitize($text);

-        // 3️⃣ Chunking
+        // 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU)
+        $text = $this->structureEnhancer->enhance($text);
+
+        // 4️⃣ Chunking
        $chunks = $this->chunker->chunk($text);

        $doc = $version->getDocument();