optimize system and cleanup
This commit is contained in:
@@ -7,68 +7,65 @@ namespace App\Ingest;
|
||||
/**
|
||||
* DocumentSanitizer
|
||||
*
|
||||
* Ziel (deterministisch, minimal-invasiv):
|
||||
* - Entfernt typische PDF-/DOC-Artefakte VOR dem Chunking:
|
||||
* - Inhaltsverzeichnis-Blöcke (TOC)
|
||||
* - Seitenzahlen / "Seite X von Y"
|
||||
* - wiederkehrende Header/Footer-Zeilen
|
||||
* - Dot-Leader-Zeilen (".... 12")
|
||||
* Deterministic, minimal-invasive preprocessing BEFORE chunking.
|
||||
*
|
||||
* Guardrails:
|
||||
* - Keine semantische Umschreibung
|
||||
* - Keine Zufälligkeit
|
||||
* - Kein Entfernen echter Fließtext-Absätze
|
||||
* Removes typical PDF/DOC artefacts:
|
||||
* - Table of contents blocks
|
||||
* - Page numbers
|
||||
* - Repeated headers/footers
|
||||
* - Dot-leader lines (e.g. "...... 12")
|
||||
*
|
||||
* Design principles:
|
||||
* - No semantic rewriting
|
||||
* - No randomness
|
||||
* - No removal of real paragraphs
|
||||
* - Type-aware sanitizing (PDF/DOC != MD/TXT)
|
||||
*/
|
||||
final class DocumentSanitizer
|
||||
{
|
||||
private const MAX_HEADER_LEN = 120;
|
||||
private const REPEAT_HEADER_MIN_COUNT = 3;
|
||||
|
||||
/**
 * Sanitizes extracted document text before it is chunked.
 *
 * Line endings are normalized to "\n" first. The artefact-removal pipeline
 * only runs for PDF/DOC-like extensions; every other type only receives the
 * whitespace cleanup and the final trim.
 *
 * @param string $text          Raw extracted text.
 * @param string $fileExtension File extension, compared case-insensitively
 *                              against 'pdf', 'doc' and 'docx'.
 *
 * @return string Sanitized, trimmed text ('' for empty input).
 */
public function sanitize(string $text, string $fileExtension): string
{
    if ($text === '') {
        return '';
    }

    $text = $this->normalizeLineEndings($text);

    // Only PDF/DOC-like formats get the more aggressive artefact removal.
    if (in_array(strtolower($fileExtension), ['pdf', 'doc', 'docx'], true)) {
        $text = $this->sanitizePdfLike($text);
    }

    return trim($this->cleanupWhitespace($text));
}
|
||||
// =========================================================
|
||||
// PIPELINE
|
||||
// =========================================================
|
||||
|
||||
/**
 * Runs the artefact-removal passes used for PDF/DOC-like text.
 *
 * Pass order: table of contents, page numbers, dot-leader lines,
 * repeated header/footer lines.
 */
private function sanitizePdfLike(string $text): string
{
    $withoutToc = $this->removeToc($text);
    $withoutPageNumbers = $this->removePageNumbers($withoutToc);
    $withoutDotLeaders = $this->removeDotLeaderLines($withoutPageNumbers);

    return $this->removeRepeatedHeaders($withoutDotLeaders);
}
|
||||
|
||||
/**
 * Converts "\r\n" and lone "\r" line endings to "\n".
 *
 * This is a plain string replacement; the text encoding is not changed.
 */
private function normalizeLineEndings(string $text): string
{
    // strtr() tries the longest key first, so "\r\n" becomes a single "\n"
    // rather than being handled as "\r" followed by "\n".
    $lineEndingMap = [
        "\r\n" => "\n",
        "\r" => "\n",
    ];

    return strtr($text, $lineEndingMap);
}
|
||||
|
||||
/**
|
||||
* Entfernt TOC-Block nach "Inhaltsverzeichnis" bis zum ersten "echten" Absatz.
|
||||
*
|
||||
* Heuristik:
|
||||
* - Start: Zeile enthält "Inhaltsverzeichnis" (case-insensitive)
|
||||
* - Innerhalb TOC werden Zeilen entfernt, die wie TOC-Einträge aussehen:
|
||||
* - Dot-Leader + Seitenzahl
|
||||
* - Kapitelnummern + Text + Seitenzahl
|
||||
* - Ende: sobald eine Zeile "absatzartig" wirkt:
|
||||
* - ausreichend lang UND enthält Satzpunkt (.)
|
||||
*
|
||||
* Guardrail:
|
||||
* - Leere Zeilen innerhalb TOC werden verworfen (damit TOC-Block wirklich weg ist)
|
||||
*/
|
||||
// =========================================================
|
||||
// TOC REMOVAL
|
||||
// =========================================================
|
||||
|
||||
private function removeToc(string $text): string
|
||||
{
|
||||
$lines = explode("\n", $text);
|
||||
@@ -86,24 +83,24 @@ final class DocumentSanitizer
|
||||
}
|
||||
|
||||
if ($inToc) {
|
||||
// Innerhalb TOC: leere Zeilen weg (Block entfernen)
|
||||
if ($trim === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// typische TOC-Zeilen (Leader / Kapitelnummern)
|
||||
if ($this->looksLikeDotLeaderLine($trim) || $this->looksLikeNumberedTocLine($trim)) {
|
||||
if (
|
||||
$this->looksLikeDotLeaderLine($trim) ||
|
||||
$this->looksLikeNumberedTocLine($trim)
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Ende TOC, wenn "echter Absatz" beginnt (lang + Punkt)
|
||||
if (strlen($trim) >= 120 && str_contains($trim, '.')) {
|
||||
// Ende TOC sobald normale Satzstruktur erkannt wird
|
||||
if (preg_match('/[a-zäöüß]\.\s*$/iu', $trim)) {
|
||||
$inToc = false;
|
||||
$filtered[] = $line;
|
||||
continue;
|
||||
}
|
||||
|
||||
// sonst: solange wir im TOC sind, ignorieren
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -113,13 +110,10 @@ final class DocumentSanitizer
|
||||
return implode("\n", $filtered);
|
||||
}
|
||||
|
||||
/**
|
||||
* Entfernt typische Seitenzahl-Zeilen.
|
||||
*
|
||||
* Guardrails:
|
||||
* - Nur kurze, "isolierte" Zeilen (trim != '')
|
||||
* - Lässt Fließtext unangetastet
|
||||
*/
|
||||
// =========================================================
|
||||
// PAGE NUMBERS
|
||||
// =========================================================
|
||||
|
||||
private function removePageNumbers(string $text): string
|
||||
{
|
||||
$lines = explode("\n", $text);
|
||||
@@ -134,17 +128,22 @@ final class DocumentSanitizer
|
||||
}
|
||||
|
||||
// "Seite 3" / "Seite 3 von 20"
|
||||
if (preg_match('/^seite\s+\d+(\s+von\s+\d+)?$/iu', $trim)) {
|
||||
if (preg_match('/^seite\s+\d{1,4}(\s+von\s+\d{1,4})?$/iu', $trim)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// "Page 12" / "Page 12 of 34"
|
||||
if (preg_match('/^page\s+\d+(\s+of\s+\d+)?$/iu', $trim)) {
|
||||
if (preg_match('/^page\s+\d{1,4}(\s+of\s+\d{1,4})?$/iu', $trim)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// "- 4 -" / "4" / "– 4 –"
|
||||
if (preg_match('/^[-–]?\s?\d{1,3}\s?[-–]?$/u', $trim)) {
|
||||
// Isolierte Seitenmarker: "- 4 -" oder "– 4 –"
|
||||
if (preg_match('/^[-–]\s?\d{1,4}\s?[-–]$/u', $trim)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Nur reine Zahl (max 3 Stellen, um IDs nicht zu killen)
|
||||
if (preg_match('/^\d{1,3}$/u', $trim)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -154,10 +153,10 @@ final class DocumentSanitizer
|
||||
return implode("\n", $filtered);
|
||||
}
|
||||
|
||||
/**
|
||||
* Entfernt Dot-Leader-Zeilen überall (nicht nur im TOC),
|
||||
* z.B.: "Kapitel ......... 12"
|
||||
*/
|
||||
// =========================================================
|
||||
// DOT LEADER
|
||||
// =========================================================
|
||||
|
||||
private function removeDotLeaderLines(string $text): string
|
||||
{
|
||||
$lines = explode("\n", $text);
|
||||
@@ -176,19 +175,14 @@ final class DocumentSanitizer
|
||||
return implode("\n", $filtered);
|
||||
}
|
||||
|
||||
/**
|
||||
* Entfernt wiederkehrende Header/Footer-Zeilen.
|
||||
*
|
||||
* Guardrails:
|
||||
* - Nur relativ kurze Zeilen (unter MAX_HEADER_LEN)
|
||||
* - Nur wenn identisch (trim) >= REPEAT_HEADER_MIN_COUNT
|
||||
* - Leere Zeilen bleiben erhalten
|
||||
*/
|
||||
// =========================================================
|
||||
// REPEATED HEADERS
|
||||
// =========================================================
|
||||
|
||||
private function removeRepeatedHeaders(string $text): string
|
||||
{
|
||||
$lines = explode("\n", $text);
|
||||
|
||||
// counts basiert auf trim (damit z.B. unterschiedliche Einrückung nicht zählt)
|
||||
$trimmed = array_map('trim', $lines);
|
||||
$counts = array_count_values($trimmed);
|
||||
|
||||
@@ -211,27 +205,27 @@ final class DocumentSanitizer
|
||||
return implode("\n", $filtered);
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// WHITESPACE
|
||||
// =========================================================
|
||||
|
||||
/**
 * Collapses every run of three or more newlines into exactly two,
 * i.e. at most one blank line remains between text lines.
 *
 * @return string The cleaned text, or the unchanged input if the regex
 *                engine reports an error.
 */
private function cleanupWhitespace(string $text): string
{
    // preg_replace() returns null on a PCRE error. Returning that directly
    // would violate the declared string return type, and substituting ''
    // would silently drop the whole document, so keep the input instead.
    return preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;
}
|
||||
|
||||
// =========================================================
|
||||
// Heuristics (isoliert, testbar)
|
||||
// HEURISTICS
|
||||
// =========================================================
|
||||
|
||||
/**
 * Detects dot-leader lines such as "Chapter .... 12".
 *
 * A match requires some leading text, a run of at least four dots and a
 * number at the end of the line, optionally separated from the dots by
 * whitespace.
 *
 * @param string $trimmedLine Line with surrounding whitespace already removed.
 */
private function looksLikeDotLeaderLine(string $trimmedLine): bool
{
    return preg_match('/^.+\.{4,}\s*\d+$/u', $trimmedLine) === 1;
}
|
||||
|
||||
/**
 * Detects numbered TOC entries: a chapter number, some title text and a
 * number at the end of the line, e.g. "2.1 Chapter name 12" or
 * "3 Chapter name 7".
 *
 * @param string $trimmedLine Line with surrounding whitespace already removed.
 */
private function looksLikeNumberedTocLine(string $trimmedLine): bool
{
    $numberedEntryPattern = '/^\d+(\.\d+)*\s+.+\s+\d+$/u';

    return preg_match($numberedEntryPattern, $trimmedLine) === 1;
}
|
||||
}
|
||||
@@ -4,8 +4,27 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Ingest;
|
||||
|
||||
/**
|
||||
* StructureEnhancer
|
||||
*
|
||||
* Minimal, deterministic structure hints BEFORE chunking.
|
||||
*
|
||||
* Adds:
|
||||
* - Heading markers ("## ") for isolated short title lines
|
||||
* - Bullet markers ("- ") for obvious list runs
|
||||
*
|
||||
* Non-goals:
|
||||
* - No semantic rewriting
|
||||
* - No sentence merging
|
||||
* - No aggressive list guessing
|
||||
*/
|
||||
final class StructureEnhancer
|
||||
{
|
||||
private const MAX_HEADING_LEN = 80;
|
||||
|
||||
private const MAX_LIST_ITEM_LEN = 140;
|
||||
private const MIN_LIST_RUN = 2;
|
||||
|
||||
public function enhance(string $text): string
|
||||
{
|
||||
if ($text === '') {
|
||||
@@ -13,6 +32,8 @@ final class StructureEnhancer
|
||||
}
|
||||
|
||||
$text = $this->normalizeLineEndings($text);
|
||||
|
||||
// Reihenfolge: erst Headings, dann Listen (stabiler fürs Chunking)
|
||||
$text = $this->detectHeadings($text);
|
||||
$text = $this->detectSimpleLists($text);
|
||||
|
||||
@@ -24,6 +45,10 @@ final class StructureEnhancer
|
||||
return str_replace(["\r\n", "\r"], "\n", $text);
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// HEADINGS
|
||||
// =========================================================
|
||||
|
||||
private function detectHeadings(string $text): string
|
||||
{
|
||||
$lines = explode("\n", $text);
|
||||
@@ -52,22 +77,31 @@ final class StructureEnhancer
|
||||
return false;
|
||||
}
|
||||
|
||||
if (strlen($line) > 80) {
|
||||
// Schon Markdown-Heading? Dann nicht anfassen.
|
||||
if (preg_match('/^#{1,6}\s+/u', $line)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (str_ends_with($line, '.')) {
|
||||
if (mb_strlen($line) > self::MAX_HEADING_LEN) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Heading soll kein "Satz" sein
|
||||
if (preg_match('/[.!?]\s*$/u', $line)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Keine typischen Satz-Kommas (zu risky)
|
||||
if (str_contains($line, ',')) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (preg_match('/\d+\.\d+/', $line)) {
|
||||
// Nummerierte Kapitel "1.2" / "2.3.4" nicht zwangs-heading-en
|
||||
if (preg_match('/\b\d+\.\d+(\.\d+)*\b/u', $line)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Muss "isoliert" stehen (leerzeile davor und danach)
|
||||
$prev = $lines[$index - 1] ?? '';
|
||||
$next = $lines[$index + 1] ?? '';
|
||||
|
||||
@@ -75,48 +109,81 @@ final class StructureEnhancer
|
||||
return false;
|
||||
}
|
||||
|
||||
// Guardrail: mindestens ein Buchstabe
|
||||
if (!preg_match('/\p{L}/u', $line)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Klassiker: UPPERCASE oder Title Case
|
||||
$uppercaseRatio = $this->uppercaseRatio($line);
|
||||
if ($uppercaseRatio > 0.6) {
|
||||
if ($uppercaseRatio >= 0.65) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ($this->isTitleCase($line)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
return $this->isTitleCase($line);
|
||||
}
|
||||
|
||||
/**
 * Returns the share of uppercase letters among all letters of a line.
 *
 * Letters are counted with Unicode properties (\p{L} / \p{Lu}), so the
 * result is not limited to ASCII or German umlauts.
 *
 * @return float Value between 0.0 and 1.0; 0.0 when the line contains no
 *               letters or the regex engine reports an error.
 */
private function uppercaseRatio(string $line): float
{
    $letterCount = preg_match_all('/\p{L}/u', $line);
    if ($letterCount === false || $letterCount === 0) {
        return 0.0;
    }

    $upperCount = preg_match_all('/\p{Lu}/u', $line);
    if ($upperCount === false) {
        return 0.0;
    }

    return (float) ($upperCount / $letterCount);
}
|
||||
|
||||
/**
 * Heuristic title-case check: at least half of the words (rounded up)
 * must start with a letter that is not lowercase.
 *
 * Tokens without any letter (numbers, dashes, ...) are ignored. Leading
 * digits or punctuation inside a word are skipped, so "(Draft)" counts as
 * capitalized while "1st" does not.
 */
private function isTitleCase(string $line): bool
{
    $words = preg_split('/\s+/u', trim($line), -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false || $words === []) {
        return false;
    }

    $wordCount = 0;
    $capCount = 0;

    foreach ($words as $word) {
        // Ignore tokens that contain no letter at all.
        if (preg_match('/\p{L}/u', $word) !== 1) {
            continue;
        }

        $wordCount++;

        // Judge the first *letter*, not the first character: comparing the
        // first character with its uppercase form would treat digits and
        // punctuation as "capitalized".
        if (preg_match('/^\P{L}*\p{Ll}/u', $word) !== 1) {
            $capCount++;
        }
    }

    if ($wordCount === 0) {
        return false;
    }

    // At least half of the words (rounded up) must start uppercase.
    return $capCount >= intdiv($wordCount + 1, 2);
}
|
||||
|
||||
// =========================================================
|
||||
// LISTS
|
||||
// =========================================================
|
||||
|
||||
private function detectSimpleLists(string $text): string
|
||||
{
|
||||
$lines = explode("\n", $text);
|
||||
@@ -127,36 +194,45 @@ final class StructureEnhancer
|
||||
foreach ($lines as $line) {
|
||||
$trim = trim($line);
|
||||
|
||||
// Bereits echte Liste? → nicht anfassen
|
||||
if (preg_match('/^-\s+/u', $trim) || preg_match('/^\d+\.\s+/u', $trim)) {
|
||||
$this->flushListBuffer($buffer, $out);
|
||||
$out[] = $line;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($this->isListCandidate($trim)) {
|
||||
$buffer[] = $trim;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (count($buffer) >= 2) {
|
||||
foreach ($buffer as $item) {
|
||||
$out[] = '- ' . $item;
|
||||
}
|
||||
} else {
|
||||
foreach ($buffer as $item) {
|
||||
$out[] = $item;
|
||||
}
|
||||
}
|
||||
|
||||
$buffer = [];
|
||||
$this->flushListBuffer($buffer, $out);
|
||||
$out[] = $line;
|
||||
}
|
||||
|
||||
if (count($buffer) >= 2) {
|
||||
$this->flushListBuffer($buffer, $out);
|
||||
|
||||
return implode("\n", $out);
|
||||
}
|
||||
|
||||
/**
 * Appends the buffered list candidates to $out and empties the buffer.
 *
 * Runs of at least MIN_LIST_RUN lines are emitted with a "- " bullet
 * prefix; shorter runs are emitted unchanged, so a single line is never
 * guessed to be a list.
 *
 * @param array $buffer Pending candidate lines; reset to [] afterwards.
 * @param array $out    Output lines, appended to in place.
 */
private function flushListBuffer(array &$buffer, array &$out): void
{
    if ($buffer === []) {
        return;
    }

    $prefix = count($buffer) >= self::MIN_LIST_RUN ? '- ' : '';

    foreach ($buffer as $item) {
        $out[] = $prefix . $item;
    }

    $buffer = [];
}
|
||||
|
||||
private function isListCandidate(string $line): bool
|
||||
@@ -165,18 +241,32 @@ final class StructureEnhancer
|
||||
return false;
|
||||
}
|
||||
|
||||
if (strlen($line) > 120) {
|
||||
// zu lang = ziemlich sicher Absatz/Satz
|
||||
if (mb_strlen($line) > self::MAX_LIST_ITEM_LEN) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (str_ends_with($line, '.')) {
|
||||
// wenn es wie ein Satz endet, nicht als Liste
|
||||
if (preg_match('/[.!?]\s*$/u', $line)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// "Key: Value" ist typischerweise keine Liste
|
||||
if (str_contains($line, ':')) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Wenn es ein kompletter Satz sein könnte (Verb/Artikel), nicht raten:
|
||||
// -> minimaler Guardrail: beginnt mit Großbuchstabe UND enthält mindestens 5 Wörter => eher Satz/Absatz
|
||||
$words = preg_split('/\s+/u', trim($line));
|
||||
if ($words && count($words) >= 5) {
|
||||
$first = mb_substr($line, 0, 1);
|
||||
if ($first !== '' && mb_strtoupper($first) === $first) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// nur "kurze, stichpunktartige" Zeilen als Kandidat akzeptieren
|
||||
return true;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user