optimize document loader
This commit is contained in:
@@ -52,40 +52,117 @@ final class DocumentLoader
|
||||
return $this->normalize($text);
|
||||
}
|
||||
|
||||
/**
 * Central normalizer applied to the text of every document type.
 * Purely formal clean-up – no domain logic.
 *
 * Steps, in order: unify line endings, remove end-of-line hyphenation,
 * strip unwanted symbols, repair line structure, move inline list items
 * onto their own lines, collapse whitespace, trim.
 *
 * @param string $text Raw document text.
 *
 * @return string Normalized text; an empty string for empty input.
 */
private function normalize(string $text): string
{
    if ($text === '') {
        return '';
    }

    // 1. Unify line endings first, so every later step only has to deal
    //    with "\n" (otherwise "-\r\n" hyphenation would never match).
    $text = str_replace(["\r\n", "\r"], "\n", $text);

    // 2. Remove hyphenation at line ends ("word-\ncontinuation").
    //    preg_replace() returns null on a PCRE error; keep the text then.
    $text = preg_replace('/-\n/', '', $text) ?? $text;

    // 3. Remove symbol clutter.
    $text = $this->removeUnwantedSymbols($text);

    // 4. Structure repair (line merging, pseudo list items).
    $text = $this->repairStructure($text);

    // 5. Stabilize inline lists: " - item - item" becomes one item per line.
    $text = preg_replace('/\s-\s/', "\n- ", $text) ?? $text;

    // 6. Normalize whitespace: collapse runs of spaces/tabs and
    //    reduce three or more newlines to a single blank line.
    $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
    $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

    return trim($text);
}
|
||||
|
||||
/**
 * Removes characters that carry no textual content.
 *
 * Deleted are: the legal marks ©, ®, ™ and ℠, zero-width characters
 * (U+200B–U+200D, U+FEFF) and every character of Unicode category C
 * except newline and tab.
 *
 * @param string $text Text to clean.
 *
 * @return string Cleaned text. If the input is not valid UTF-8 the regex
 *                steps are skipped and only the legal marks are removed.
 */
private function removeUnwantedSymbols(string $text): string
{
    $text = str_replace(['©', '®', '™', '℠'], '', $text);

    // With the /u modifier preg_replace() returns null for invalid UTF-8;
    // fall back to the unchanged text instead of violating the return type.
    $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text) ?? $text;

    // Double negation: "not (non-control, newline or tab)" = control/format
    // characters other than "\n" and "\t". Tabs are kept because deleting
    // them would glue the neighbouring words together.
    $text = preg_replace('/[^\P{C}\n\t]+/u', '', $text) ?? $text;

    return $text;
}
|
||||
|
||||
/**
 * Consolidated structure repair, working line by line on trimmed lines.
 *
 * Merge rules (never applied when the current line ends with . : ? or !);
 * they are repeated until no rule matches, so a text wrapped across
 * several lines is joined completely:
 *  1. neither line is a "- " list item and the next line starts with a digit;
 *  2. neither line is a "- " list item and the next line starts lowercase;
 *  3. the current line is a list item and the next is a list item starting
 *     lowercase – the next item's "- " marker is dropped.
 *
 * Rules for lines that were not merged:
 *  4. "- <digits><up to 25 letters/spaces>" loses its "- " marker;
 *  5. a list item starting lowercase whose preceding line is not a list
 *     item loses its "- " marker.
 *
 * @param string $text Text with "\n" line endings.
 *
 * @return string Repaired text, lines joined with "\n".
 */
private function repairStructure(string $text): string
{
    $lines = explode("\n", $text);
    $count = count($lines);
    $out = [];

    for ($i = 0; $i < $count; $i++) {
        $current = trim($lines[$i]);

        if ($current === '') {
            $out[] = '';
            continue;
        }

        // Keep absorbing following lines while a merge rule applies.
        $merged = false;

        while ($i < $count - 1) {
            // A line that ends a sentence is never continued.
            if (preg_match('/[\.:\?!]$/', $current) === 1) {
                break;
            }

            $next = trim($lines[$i + 1]);
            $currentIsItem = preg_match('/^- /', $current) === 1;
            $nextIsItem = preg_match('/^- /', $next) === 1;

            if (
                !$currentIsItem &&
                !$nextIsItem &&
                (
                    preg_match('/^\d+/', $next) === 1 ||       // rule 1: number continuation
                    preg_match('/^[a-zäöü]/u', $next) === 1    // rule 2: sentence continuation
                )
            ) {
                $current .= ' ' . $next;
            } elseif ($currentIsItem && preg_match('/^- [a-zäöü]/u', $next) === 1) {
                // rule 3: false list continuation – drop the "- " marker.
                $current .= ' ' . ltrim(substr($next, 2));
            } else {
                break;
            }

            $i++; // the next line has been consumed
            $merged = true;
        }

        if ($merged) {
            // Merged lines are emitted as-is; rules 4 and 5 do not apply.
            $out[] = $current;
            continue;
        }

        // Rule 4: pseudo list item such as "- 808 festlegen".
        if (preg_match('/^- \d+[A-Za-z ]{0,25}$/', $current) === 1) {
            $out[] = substr($current, 2);
            continue;
        }

        // Rule 5: pseudo list item such as "- im eingeschalteten Zustand ..."
        // that does not follow another list item.
        if (
            preg_match('/^- [a-zäöü]/u', $current) === 1 &&
            ($i === 0 || preg_match('/^- /', trim($lines[$i - 1])) !== 1)
        ) {
            $out[] = substr($current, 2);
            continue;
        }

        $out[] = $current;
    }

    return implode("\n", $out);
}
|
||||
}
|
||||
Reference in New Issue
Block a user