optimize ingesting documents

2026-02-28 23:19:47 +01:00
parent 509ba83ac0
commit e7a315c147
3 changed files with 223 additions and 19 deletions
--- a/src/Ingest/StructureEnhancer.php
+++ b/src/Ingest/StructureEnhancer.php
@@ -0,0 +1,182 @@
 <?php
 declare(strict_types=1);
 namespace App\Ingest;
 /**
  * Adds lightweight Markdown structure (headings and bullet lists) to plain text.
  *
  * The enhancer works purely on line-level heuristics: isolated short lines that
  * look like titles become `## ` headings, and runs of consecutive short lines
  * become `- ` list items. Lines that already carry a Markdown heading or bullet
  * marker are left untouched so the markers are never duplicated.
  */
 final class StructureEnhancer
 {
    /** Maximum number of characters (not bytes) a heading line may have. */
    private const MAX_HEADING_LENGTH = 80;

    /** Maximum number of characters (not bytes) a list item line may have. */
    private const MAX_LIST_ITEM_LENGTH = 120;

    /** Share of uppercase letters above which a line counts as a heading. */
    private const UPPERCASE_RATIO_THRESHOLD = 0.6;

    /** Minimum number of consecutive candidate lines that form a list. */
    private const MIN_LIST_ITEMS = 2;

    /**
     * Runs all structure heuristics over the given text.
     *
     * @param string $text Raw document text; any line-ending style is accepted.
     *
     * @return string The text with "\n" line endings, detected headings prefixed
     *                with "## " and detected list items prefixed with "- ".
     */
    public function enhance(string $text): string
    {
        if ($text === '') {
            return '';
        }
        $text = $this->normalizeLineEndings($text);
        $text = $this->detectHeadings($text);
        return $this->detectSimpleLists($text);
    }

    /**
     * Converts Windows ("\r\n") and old Mac ("\r") line endings to "\n".
     */
    private function normalizeLineEndings(string $text): string
    {
        return str_replace(["\r\n", "\r"], "\n", $text);
    }

    /**
     * Prefixes every line that qualifies as a heading with "## ".
     *
     * Heading lines are emitted trimmed; all other lines are passed through
     * unchanged.
     */
    private function detectHeadings(string $text): string
    {
        $lines = explode("\n", $text);
        $out = [];
        foreach ($lines as $i => $line) {
            $trim = trim($line);
            $out[] = $this->isHeadingCandidate($trim, $lines, $i)
                ? '## ' . $trim
                : $line;
        }
        return implode("\n", $out);
    }

    /**
     * Decides whether a trimmed line looks like a heading.
     *
     * @param string       $line  The trimmed line under inspection.
     * @param list<string> $lines All lines of the document (untrimmed).
     * @param int          $index Position of $line within $lines.
     */
    private function isHeadingCandidate(string $line, array $lines, int $index): bool
    {
        if ($line === '') {
            return false;
        }
        // Already marked up (e.g. Markdown input): never add a second marker.
        if ($this->isAlreadyStructured($line)) {
            return false;
        }
        // Character count, not byte count, so umlauts do not shrink the limit.
        if (mb_strlen($line) > self::MAX_HEADING_LENGTH) {
            return false;
        }
        // Sentences and enumerations are not headings.
        if (str_ends_with($line, '.') || str_contains($line, ',')) {
            return false;
        }
        // Decimal numbers / version strings indicate data rather than a title.
        if (preg_match('/\d+\.\d+/', $line) === 1) {
            return false;
        }
        // A heading must stand alone: blank (or missing) line before and after.
        $prev = $lines[$index - 1] ?? '';
        $next = $lines[$index + 1] ?? '';
        if (trim($prev) !== '' || trim($next) !== '') {
            return false;
        }
        return $this->uppercaseRatio($line) > self::UPPERCASE_RATIO_THRESHOLD
            || $this->isTitleCase($line);
    }

    /**
     * Returns the share of uppercase letters among all letters of the line.
     *
     * @return float A value between 0.0 and 1.0; 0.0 when the line contains no
     *               letters or is not valid UTF-8.
     */
    private function uppercaseRatio(string $line): float
    {
        // preg_replace() yields null on malformed UTF-8; treat that as "no letters".
        $letters = preg_replace('/\P{L}+/u', '', $line) ?? '';
        if ($letters === '') {
            return 0.0;
        }
        $upper = preg_replace('/\P{Lu}+/u', '', $letters) ?? '';
        return mb_strlen($upper) / mb_strlen($letters);
    }

    /**
     * Checks whether at least half of the words (rounded down, minimum one)
     * start with an uppercase letter.
     *
     * Digits and punctuation do not count as capitalised, and empty tokens
     * produced by repeated spaces are not counted as words.
     */
    private function isTitleCase(string $line): bool
    {
        $words = array_filter(
            explode(' ', $line),
            static fn (string $word): bool => $word !== '',
        );
        $capitalised = 0;
        foreach ($words as $word) {
            if (preg_match('/^\p{Lu}/u', $word) === 1) {
                $capitalised++;
            }
        }
        return $capitalised >= max(1, intdiv(count($words), 2));
    }

    /**
     * Turns runs of at least two consecutive list candidates into "- " items.
     *
     * Lines that are not converted are emitted exactly as they came in,
     * including their original indentation.
     */
    private function detectSimpleLists(string $text): string
    {
        $out = [];
        $buffer = [];
        foreach (explode("\n", $text) as $line) {
            $trim = trim($line);
            if ($this->isListCandidate($trim)) {
                // Keep the raw line too so it can be restored if no list forms.
                $buffer[] = [$line, $trim];
                continue;
            }
            foreach ($this->renderListBuffer($buffer) as $rendered) {
                $out[] = $rendered;
            }
            $buffer = [];
            $out[] = $line;
        }
        // Flush a run that reaches the end of the text.
        foreach ($this->renderListBuffer($buffer) as $rendered) {
            $out[] = $rendered;
        }
        return implode("\n", $out);
    }

    /**
     * Renders a run of buffered candidate lines.
     *
     * @param list<array{0: string, 1: string}> $buffer Pairs of [raw line, trimmed line].
     *
     * @return list<string> "- "-prefixed trimmed lines when the run is long
     *                      enough to be a list, otherwise the raw lines.
     */
    private function renderListBuffer(array $buffer): array
    {
        $isList = count($buffer) >= self::MIN_LIST_ITEMS;
        return array_map(
            static fn (array $entry): string => $isList ? '- ' . $entry[1] : $entry[0],
            $buffer,
        );
    }

    /**
     * Decides whether a trimmed line may be part of a simple list.
     */
    private function isListCandidate(string $line): bool
    {
        if ($line === '') {
            return false;
        }
        // Existing headings and bullets must not receive an extra "- " prefix.
        if ($this->isAlreadyStructured($line)) {
            return false;
        }
        // Character count, not byte count, so umlauts do not shrink the limit.
        if (mb_strlen($line) > self::MAX_LIST_ITEM_LENGTH) {
            return false;
        }
        // Full sentences and "label:" lines are not list items.
        return !str_ends_with($line, '.') && !str_contains($line, ':');
    }

    /**
     * Tells whether a trimmed line already starts with a Markdown heading
     * marker ("#" to "######") or a bullet marker ("-", "*", "+", "•")
     * followed by whitespace.
     */
    private function isAlreadyStructured(string $line): bool
    {
        return preg_match('/^(?:#{1,6}|[-*+•])\s/u', $line) === 1;
    }
 }
--- a/src/Knowledge/Ingest/DocumentLoader.php
+++ b/src/Knowledge/Ingest/DocumentLoader.php
@@ -20,11 +20,7 @@ final class DocumentLoader
        return match ($ext) {
            'txt', 'md' => $this->loadText($path),
            'pdf'       => $this->loadPdf($path),
-
+            default     => throw new \RuntimeException("Unsupported file type: .{$ext}"),
            // vorbereitet für später:
            // 'docx'   => $this->loadDocx($path),
            default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
        };
    }
@@ -43,30 +39,51 @@ final class DocumentLoader
        $parser = new Parser();
        try {
-            $pdf = $parser->parseFile($path);
+            $pdf  = $parser->parseFile($path);
            $text = $pdf->getText();
        } catch (\Throwable $e) {
-            throw new \RuntimeException("Failed to parse PDF: {$path}. Error: " . $e->getMessage(), 0, $e);
+            throw new \RuntimeException(
                "Failed to parse PDF: {$path}. Error: " . $e->getMessage(),
                0,
                $e
            );
        }
        return $this->normalize($text);
    }
    /**
-     * Zentraler Normalizer für alle Dokumenttypen
+     * Zentraler Normalizer für alle Dokumenttypen.
     * Rein formal – keine Domain-Logik.
     */
    private function normalize(string $text): string
    {
-        // Silbentrennung entfernen
+        if ($text === '') {
            return '';
        }
        // 1️⃣ Silbentrennung entfernen (Wort-\nFortsetzung)
        $text = preg_replace('/-\n/', '', $text);
-        // Windows-Zeilenumbrüche
+        // 2️⃣ Einheitliche Zeilenumbrüche
-        $text = str_replace("\r\n", "\n", $text);
+        $text = str_replace(["\r\n", "\r"], "\n", $text);
-        // Mehrfache Leerzeichen
+        // 3️⃣ Harte PDF-Zeilenumbrüche reparieren:
        // Wenn Zeile nicht mit Punkt endet und nächste mit Kleinbuchstabe beginnt → zusammenführen
        $text = preg_replace(
            '/([^\.\!\?\:\n])\n([a-zäöü])/u',
            '$1 $2',
            $text
        );
        // 4️⃣ Inline-Listen stabilisieren:
        // " - Punkt - Punkt" → echte neue Zeile
        $text = preg_replace('/\s-\s/', "\n- ", $text);
        // 5️⃣ Mehrfache Leerzeichen reduzieren
        $text = preg_replace('/[ \t]+/', ' ', $text);
-        // Mehrfache Leerzeilen
+        // 6️⃣ Mehrfache Leerzeilen reduzieren
        $text = preg_replace('/\n{3,}/', "\n\n", $text);
        return trim($text);
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -8,6 +8,7 @@ use App\Entity\DocumentVersion;
 use App\Knowledge\Text\TextNormalizer;
 use App\Repository\DocumentVersionRepository;
 use App\Ingest\DocumentSanitizer;
 use App\Ingest\StructureEnhancer;
 final readonly class KnowledgeIngestService
 {
@@ -16,7 +17,8 @@ final readonly class KnowledgeIngestService
        private SimpleChunker             $chunker,
        private DocumentVersionRepository $versionRepo,
        private TextNormalizer            $textNormalizer,
-        private DocumentSanitizer         $documentSanitizer, // ✅ NEU
+        private DocumentSanitizer         $documentSanitizer,
        private StructureEnhancer         $structureEnhancer, // ✅ NEU
    )
    {
    }
@@ -31,15 +33,18 @@ final readonly class KnowledgeIngestService
        // 1️⃣ Rohtext laden
        $text = $this->loader->load($version->getFilePath());
-        // 2️⃣ 🔥 Deterministische Vorverarbeitung (NEU)
+        // 2️⃣ Deterministische Textbereinigung
        $text = $this->documentSanitizer->sanitize($text);
-        // 3️⃣ Chunking
+        // 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU)
        $text = $this->structureEnhancer->enhance($text);
        // 4️⃣ Chunking
        $chunks = $this->chunker->chunk($text);
        $doc = $version->getDocument();
        $documentId = $doc->getId()->toRfc4122();
-        $versionId = $version->getId()->toRfc4122();
+        $versionId  = $version->getId()->toRfc4122();
        $title = trim((string)$doc->getTitle());
@@ -58,7 +63,7 @@ final readonly class KnowledgeIngestService
            $chunkId = sha1(
                $documentId . '|' .
-                $versionId . '|' .
+                $versionId  . '|' .
                $normalizedForId
            );