Optimize sanitizer: run TOC, page-number, dot-leader and repeated-header removal only for pdf/doc/docx files

This commit is contained in:
team2
2026-03-02 20:25:54 +01:00
parent 4784ea7f02
commit 6b8d1b1936
2 changed files with 17 additions and 8 deletions

View File

@@ -24,7 +24,10 @@ final class DocumentSanitizer
private const MAX_HEADER_LEN = 120;
private const REPEAT_HEADER_MIN_COUNT = 3;
public function sanitize(string $text): string
public function sanitize(
string $text,
string $fileExtension
): string
{
if ($text === '') {
return '';
@@ -32,12 +35,14 @@ final class DocumentSanitizer
$text = $this->normalizeLineEndings($text);
// Wichtig: Reihenfolge so, dass wir erst "grobe Blöcke" (TOC) entfernen,
// danach zeilenbasierte Artefakte (PageNumbers/Headers/DotLeaders).
$text = $this->removeToc($text);
$text = $this->removePageNumbers($text);
$text = $this->removeDotLeaderLines($text);
$text = $this->removeRepeatedHeaders($text);
$fileExtension = strtolower($fileExtension);
if (in_array($fileExtension, ['pdf', 'doc', 'docx'], true)) {
$text = $this->removeToc($text);
$text = $this->removePageNumbers($text);
$text = $this->removeDotLeaderLines($text);
$text = $this->removeRepeatedHeaders($text);
}
$text = $this->cleanupWhitespace($text);