From e7a315c1478898be2d225c77ccfb79b1734531f1 Mon Sep 17 00:00:00 2001
From: team2 <team2@mitho-media.de>
Date: Sat, 28 Feb 2026 23:19:47 +0100
Subject: [PATCH] optimize ingesting documents

---
 src/Ingest/StructureEnhancer.php              | 182 ++++++++++++++++++
 src/Knowledge/Ingest/DocumentLoader.php       |  45 +++--
 .../Ingest/KnowledgeIngestService.php         |  15 +-
 3 files changed, 223 insertions(+), 19 deletions(-)
 create mode 100644 src/Ingest/StructureEnhancer.php

diff --git a/src/Ingest/StructureEnhancer.php b/src/Ingest/StructureEnhancer.php
new file mode 100644
index 0000000..6cdaf49
--- /dev/null
+++ b/src/Ingest/StructureEnhancer.php
@@ -0,0 +1,182 @@
+<?php
+
+declare(strict_types=1);
+
+namespace App\Ingest;
+
+/**
+ * Adds lightweight Markdown structure (headings and bullet lists) to plain text.
+ *
+ * All detection is heuristic and deterministic: an isolated short line that is
+ * mostly upper case or title case becomes a "## " heading, and a run of two or
+ * more short consecutive lines becomes a "- " bullet list. Lines that already
+ * carry a Markdown heading or bullet marker are never marked a second time.
+ */
+final class StructureEnhancer
+{
+    /** Longest line, in characters, that may still become a heading. */
+    private const MAX_HEADING_LENGTH = 80;
+
+    /** Longest line, in characters, that may still become a list item. */
+    private const MAX_LIST_ITEM_LENGTH = 120;
+
+    /** Shortest run of consecutive candidate lines that is treated as a list. */
+    private const MIN_LIST_ITEMS = 2;
+
+    /**
+     * Runs the full pipeline: unify line endings, mark headings, mark lists.
+     *
+     * @param string $text Plain text; LF, CRLF and CR line breaks are accepted.
+     *
+     * @return string Text with LF line breaks and added "## " / "- " markers.
+     */
+    public function enhance(string $text): string
+    {
+        if ($text === '') {
+            return '';
+        }
+
+        $text = str_replace(["\r\n", "\r"], "\n", $text);
+        $text = $this->detectHeadings($text);
+
+        return $this->detectSimpleLists($text);
+    }
+
+    /**
+     * Prefixes every heading candidate with "## "; other lines stay untouched.
+     *
+     * A line only qualifies when it is isolated, i.e. the lines directly before
+     * and after it are blank or missing (start/end of the text).
+     */
+    private function detectHeadings(string $text): string
+    {
+        $lines = explode("\n", $text);
+        $out = [];
+
+        foreach ($lines as $i => $line) {
+            $trim = trim($line);
+            $isolated = trim($lines[$i - 1] ?? '') === '' && trim($lines[$i + 1] ?? '') === '';
+
+            $out[] = $isolated && $this->isHeadingCandidate($trim) ? '## ' . $trim : $line;
+        }
+
+        return implode("\n", $out);
+    }
+
+    /** Decides whether an already trimmed, isolated line looks like a heading. */
+    private function isHeadingCandidate(string $line): bool
+    {
+        if ($line === '' || $this->hasMarkdownMarker($line)) {
+            return false;
+        }
+
+        // mb_strlen: the limit is meant in characters, not in UTF-8 bytes.
+        if (mb_strlen($line) > self::MAX_HEADING_LENGTH) {
+            return false;
+        }
+
+        // Sentence-like lines and lines with decimal numbers count as body text.
+        if (str_ends_with($line, '.') || str_contains($line, ',') || preg_match('/\d+\.\d+/', $line) === 1) {
+            return false;
+        }
+
+        // Mostly upper case (more than 60 % of the letters) or title case.
+        return $this->uppercaseRatio($line) > 0.6 || $this->isTitleCase($line);
+    }
+
+    /**
+     * Tells whether a trimmed line already starts with a Markdown heading marker
+     * ("#" to "######") or bullet marker ("-", "*", "+") followed by whitespace.
+     */
+    private function hasMarkdownMarker(string $line): bool
+    {
+        return preg_match('/^(?:#{1,6}|[-*+])\s/', $line) === 1;
+    }
+
+    /**
+     * Returns the share of upper-case letters among all letters of the line.
+     *
+     * @return float Value in [0, 1]; 0.0 for lines without letters or invalid UTF-8.
+     */
+    private function uppercaseRatio(string $line): float
+    {
+        // preg_match_all() yields false on invalid UTF-8; the cast maps that to 0.
+        $letters = (int) preg_match_all('/\p{L}/u', $line);
+        if ($letters === 0) {
+            return 0.0;
+        }
+
+        return (int) preg_match_all('/\p{Lu}/u', $line) / $letters;
+    }
+
+    /**
+     * True when at least half of the words (rounded down, but at least one)
+     * start with an upper-case letter; digits and punctuation do not count.
+     */
+    private function isTitleCase(string $line): bool
+    {
+        // NO_EMPTY keeps repeated spaces from inflating the word count.
+        $words = preg_split('/\s+/', $line, -1, PREG_SPLIT_NO_EMPTY) ?: [];
+        $capitalized = preg_grep('/^\p{Lu}/u', $words) ?: [];
+
+        return count($capitalized) >= max(1, intdiv(count($words), 2));
+    }
+
+    /**
+     * Turns every run of at least MIN_LIST_ITEMS consecutive list candidates
+     * into "- " bullets; shorter runs and all other lines are kept verbatim.
+     */
+    private function detectSimpleLists(string $text): string
+    {
+        $out = [];
+        $run = [];
+
+        foreach (explode("\n", $text) as $line) {
+            if ($this->isListCandidate(trim($line))) {
+                $run[] = $line;
+                continue;
+            }
+
+            // A non-candidate line ends the current run.
+            array_push($out, ...$this->renderRun($run));
+            $out[] = $line;
+            $run = [];
+        }
+
+        // Flush a run that reaches the end of the text.
+        array_push($out, ...$this->renderRun($run));
+
+        return implode("\n", $out);
+    }
+
+    /**
+     * Renders a buffered run: bullets when long enough, otherwise the lines as read.
+     *
+     * @param list<string> $run Original (untrimmed) lines of the run.
+     *
+     * @return list<string> Lines to append to the output.
+     */
+    private function renderRun(array $run): array
+    {
+        if (count($run) < self::MIN_LIST_ITEMS) {
+            return $run;
+        }
+
+        return array_map(static fn (string $line): string => '- ' . trim($line), $run);
+    }
+
+    /** Decides whether a trimmed line may be part of a plain, unmarked list. */
+    private function isListCandidate(string $line): bool
+    {
+        if ($line === '' || $this->hasMarkdownMarker($line)) {
+            return false;
+        }
+
+        // mb_strlen: the limit is meant in characters, not in UTF-8 bytes.
+        if (mb_strlen($line) > self::MAX_LIST_ITEM_LENGTH) {
+            return false;
+        }
+
+        return !str_ends_with($line, '.') && !str_contains($line, ':');
+    }
+}
\ No newline at end of file
diff --git a/src/Knowledge/Ingest/DocumentLoader.php b/src/Knowledge/Ingest/DocumentLoader.php
index f2b4704..33d2ea0 100644
--- a/src/Knowledge/Ingest/DocumentLoader.php
+++ b/src/Knowledge/Ingest/DocumentLoader.php
@@ -20,11 +20,7 @@ final class DocumentLoader
         return match ($ext) {
             'txt', 'md' => $this->loadText($path),
             'pdf'       => $this->loadPdf($path),
-
-            // vorbereitet für später:
-            // 'docx'   => $this->loadDocx($path),
-
-            default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
+            default     => throw new \RuntimeException("Unsupported file type: .{$ext}"),
         };
     }
 
@@ -43,32 +39,53 @@ final class DocumentLoader
         $parser = new Parser();
 
         try {
-            $pdf = $parser->parseFile($path);
+            $pdf  = $parser->parseFile($path);
             $text = $pdf->getText();
         } catch (\Throwable $e) {
-            throw new \RuntimeException("Failed to parse PDF: {$path}. Error: " . $e->getMessage(), 0, $e);
+            throw new \RuntimeException(
+                "Failed to parse PDF: {$path}. Error: " . $e->getMessage(),
+                0,
+                $e
+            );
         }
 
         return $this->normalize($text);
     }
 
     /**
-     * Zentraler Normalizer für alle Dokumenttypen
+     * Central normalizer for all document types.
+     * Purely formal: no domain logic.
+     *
+     * Line endings are unified before any newline-based step so that input
+     * with CRLF or CR line breaks gets the same hyphenation repair as LF
+     * input.
      */
     private function normalize(string $text): string
     {
-        // Silbentrennung entfernen
+        if ($text === '') {
+            return '';
+        }
+
+        // 1) Unify line endings first, so "-\r\n" is seen as "-\n" by step 2.
+        $text = str_replace(["\r\n", "\r"], "\n", $text);
+
+        // 2) Remove end-of-line hyphenation (word-\ncontinuation).
         $text = preg_replace('/-\n/', '', $text);
 
-        // Windows-Zeilenumbrüche
-        $text = str_replace("\r\n", "\n", $text);
+        // 3) Repair hard PDF line breaks: join when a line does not end a sentence
+        // and the next one starts with a lowercase letter. On a PCRE error (e.g.
+        // invalid UTF-8 under /u) preg_replace() yields null, so keep the input.
+        $text = preg_replace('/([^\.\!\?\:\n])\n([a-zäöü])/u', '$1 $2', $text) ?? $text;
 
-        // Mehrfache Leerzeichen
+        // 4) Stabilize inline lists: " - item - item" becomes one "- item" per line.
+        $text = preg_replace('/\s-\s/', "\n- ", $text) ?? $text;
+
+        // 5) Collapse runs of spaces and tabs into a single space.
         $text = preg_replace('/[ \t]+/', ' ', $text);
 
-        // Mehrfache Leerzeilen
+        // 6) Collapse three or more newlines into a single blank line.
         $text = preg_replace('/\n{3,}/', "\n\n", $text);
 
         return trim($text);
     }
-}
+}
\ No newline at end of file
diff --git a/src/Knowledge/Ingest/KnowledgeIngestService.php b/src/Knowledge/Ingest/KnowledgeIngestService.php
index 1b133c2..41da161 100644
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -8,6 +8,7 @@ use App\Entity\DocumentVersion;
 use App\Knowledge\Text\TextNormalizer;
 use App\Repository\DocumentVersionRepository;
 use App\Ingest\DocumentSanitizer;
+use App\Ingest\StructureEnhancer;
 
 final readonly class KnowledgeIngestService
 {
@@ -16,7 +17,8 @@ final readonly class KnowledgeIngestService
         private SimpleChunker             $chunker,
         private DocumentVersionRepository $versionRepo,
         private TextNormalizer            $textNormalizer,
-        private DocumentSanitizer         $documentSanitizer, // ✅ NEU
+        private DocumentSanitizer         $documentSanitizer,
+        private StructureEnhancer         $structureEnhancer, // ✅ NEU
     )
     {
     }
@@ -31,15 +33,18 @@ final readonly class KnowledgeIngestService
         // 1️⃣ Rohtext laden
         $text = $this->loader->load($version->getFilePath());
 
-        // 2️⃣ 🔥 Deterministische Vorverarbeitung (NEU)
+        // 2️⃣ Deterministische Textbereinigung
         $text = $this->documentSanitizer->sanitize($text);
 
-        // 3️⃣ Chunking
+        // 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU)
+        $text = $this->structureEnhancer->enhance($text);
+
+        // 4️⃣ Chunking
         $chunks = $this->chunker->chunk($text);
 
         $doc = $version->getDocument();
         $documentId = $doc->getId()->toRfc4122();
-        $versionId = $version->getId()->toRfc4122();
+        $versionId  = $version->getId()->toRfc4122();
 
         $title = trim((string)$doc->getTitle());
 
@@ -58,7 +63,7 @@ final readonly class KnowledgeIngestService
 
             $chunkId = sha1(
                 $documentId . '|' .
-                $versionId . '|' .
+                $versionId  . '|' .
                 $normalizedForId
             );