diff --git a/src/Knowledge/Ingest/DocumentLoader.php b/src/Knowledge/Ingest/DocumentLoader.php
index 33d2ea0..b8d3f9e 100644
--- a/src/Knowledge/Ingest/DocumentLoader.php
+++ b/src/Knowledge/Ingest/DocumentLoader.php
@@ -52,40 +52,124 @@ final class DocumentLoader
         return $this->normalize($text);
     }
 
-    /**
-     * Zentraler Normalizer für alle Dokumenttypen.
-     * Rein formal – keine Domain-Logik.
-     */
+    /**
+     * Central normalizer for the text of all document types.
+     * Purely formal clean-up – no domain logic.
+     */
     private function normalize(string $text): string
     {
         if ($text === '') {
             return '';
         }
 
-        // 1️⃣ Silbentrennung entfernen (Wort-\nFortsetzung)
-        $text = preg_replace('/-\n/', '', $text);
-
-        // 2️⃣ Einheitliche Zeilenumbrüche
+        // 1. Unify line endings first so that every "\n" pattern below also covers CRLF/CR input
         $text = str_replace(["\r\n", "\r"], "\n", $text);
 
-        // 3️⃣ Harte PDF-Zeilenumbrüche reparieren:
-        // Wenn Zeile nicht mit Punkt endet und nächste mit Kleinbuchstabe beginnt → zusammenführen
-        $text = preg_replace(
-            '/([^\.\!\?\:\n])\n([a-zäöü])/u',
-            '$1 $2',
-            $text
-        );
+        // 2. Remove end-of-line hyphenation ("Wort-\nFortsetzung").
+        //    preg_replace() yields null on failure; "?? $text" then keeps the text unchanged.
+        $text = preg_replace('/-\n/', '', $text) ?? $text;
+
+        // 3. Strip junk symbols and invisible characters
+        $text = $this->removeUnwantedSymbols($text);
 
-        // 4️⃣ Inline-Listen stabilisieren:
-        // " - Punkt - Punkt" → echte neue Zeile
-        $text = preg_replace('/\s-\s/', "\n- ", $text);
+        // 4. Repair hard line breaks and pseudo list items
+        $text = $this->repairStructure($text);
+
+        // 5. Put inline list items (" - a - b") on lines of their own
+        $text = preg_replace('/\s-\s/', "\n- ", $text) ?? $text;
 
-        // 5️⃣ Mehrfache Leerzeichen reduzieren
-        $text = preg_replace('/[ \t]+/', ' ', $text);
-
-        // 6️⃣ Mehrfache Leerzeilen reduzieren
-        $text = preg_replace('/\n{3,}/', "\n\n", $text);
+        // 6. Collapse runs of spaces/tabs and runs of blank lines
+        $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
+        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;
 
         return trim($text);
     }
+
+    /**
+     * Removes trademark/copyright symbols and invisible characters from the text.
+     */
+    private function removeUnwantedSymbols(string $text): string
+    {
+        $text = str_replace(['©', '®', '™', '℠'], '', $text);
+
+        // Zero-width characters and the byte order mark
+        $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text) ?? $text;
+
+        // Remaining characters of Unicode category C. "\n" and "\t" are kept: the caller
+        // turns tabs into spaces afterwards, deleting them here would glue words together.
+        return preg_replace('/[^\P{C}\n\t]+/u', '', $text) ?? $text;
+    }
+
+    /**
+     * Consolidated structure repair: joins wrapped lines and unwraps pseudo list items.
+     */
+    private function repairStructure(string $text): string
+    {
+        $lines = explode("\n", $text);
+        $out = [];
+        $count = count($lines);
+
+        for ($i = 0; $i < $count; $i++) {
+            $current = trim($lines[$i]);
+
+            if ($current === '') {
+                $out[] = '';
+                continue;
+            }
+
+            $currentIsItem = preg_match('/^- /', $current) === 1;
+            $merged = false;
+
+            // Absorb following lines for as long as the (merged) line does not end in . : ? or !
+            while ($i < $count - 1 && !preg_match('/[\.:\?!]$/', $current)) {
+                $next = trim($lines[$i + 1]);
+                $nextIsItem = preg_match('/^- /', $next) === 1;
+
+                // --- 1./2. Plain text continued by a number or a lowercase word ---
+                if (
+                    !$currentIsItem &&
+                    !$nextIsItem &&
+                    (preg_match('/^\d/', $next) || preg_match('/^[a-zäöü]/u', $next))
+                ) {
+                    $current .= ' ' . $next;
+                    $i++;
+                    $merged = true;
+                    continue; // a paragraph may wrap over more than two lines
+                }
+
+                // --- 3. False list continuation (joined once, not chained) ---
+                if ($currentIsItem && preg_match('/^- [a-zäöü]/u', $next)) {
+                    $current .= ' ' . ltrim(substr($next, 2));
+                    $i++;
+                    $merged = true;
+                }
+
+                break;
+            }
+
+            if ($merged) {
+                $out[] = $current;
+                continue;
+            }
+
+            // --- 4. Pseudo list such as "- 808 festlegen": drop the leading "- " ---
+            if (preg_match('/^- \d+[A-Za-z ]{0,25}$/', $current)) {
+                $out[] = substr($current, 2);
+                continue;
+            }
+
+            // --- 5. Lowercase "list item" whose previous line is not a list item: drop the "- " ---
+            if (
+                preg_match('/^- [a-zäöü]/u', $current) &&
+                ($i === 0 || !preg_match('/^- /', trim($lines[$i - 1])))
+            ) {
+                $out[] = substr($current, 2);
+                continue;
+            }
+
+            $out[] = $current;
+        }
+
+        return implode("\n", $out);
+    }
 }
\ No newline at end of file
diff --git a/templates/admin/model_config/test_retrieval.html.twig b/templates/admin/model_config/test_retrieval.html.twig
index 61c453e..2d35cba 100644
--- a/templates/admin/model_config/test_retrieval.html.twig
+++ b/templates/admin/model_config/test_retrieval.html.twig
@@ -80,7 +80,7 @@
 {% for chunk in results %}
-{{ chunk }}
+{{ chunk|nl2br }}
 {% endfor %}