diff --git a/src/Ingest/DocumentSanitizer.php b/src/Ingest/DocumentSanitizer.php index 9a89f38..cc802c1 100644 --- a/src/Ingest/DocumentSanitizer.php +++ b/src/Ingest/DocumentSanitizer.php @@ -24,7 +24,10 @@ final class DocumentSanitizer private const MAX_HEADER_LEN = 120; private const REPEAT_HEADER_MIN_COUNT = 3; - public function sanitize(string $text): string + public function sanitize( + string $text, + string $fileExtension + ): string { if ($text === '') { return ''; @@ -32,12 +35,14 @@ final class DocumentSanitizer $text = $this->normalizeLineEndings($text); - // Wichtig: Reihenfolge so, dass wir erst "grobe Blöcke" (TOC) entfernen, - // danach zeilenbasierte Artefakte (PageNumbers/Headers/DotLeaders). - $text = $this->removeToc($text); - $text = $this->removePageNumbers($text); - $text = $this->removeDotLeaderLines($text); - $text = $this->removeRepeatedHeaders($text); + $fileExtension = strtolower($fileExtension); + + if (in_array($fileExtension, ['pdf', 'doc', 'docx'], true)) { + $text = $this->removeToc($text); + $text = $this->removePageNumbers($text); + $text = $this->removeDotLeaderLines($text); + $text = $this->removeRepeatedHeaders($text); + } $text = $this->cleanupWhitespace($text); diff --git a/src/Knowledge/Ingest/KnowledgeIngestService.php b/src/Knowledge/Ingest/KnowledgeIngestService.php index 41da161..f72ac82 100644 --- a/src/Knowledge/Ingest/KnowledgeIngestService.php +++ b/src/Knowledge/Ingest/KnowledgeIngestService.php @@ -32,9 +32,13 @@ final readonly class KnowledgeIngestService { // 1️⃣ Rohtext laden $text = $this->loader->load($version->getFilePath()); + $extension = $version->getFileExtension() ?? 'txt'; // 2️⃣ Deterministische Textbereinigung - $text = $this->documentSanitizer->sanitize($text); + $text = $this->documentSanitizer->sanitize( + $text, + $extension + ); // 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU) $text = $this->structureEnhancer->enhance($text);