Optimize sanitizer: run TOC, page-number, dot-leader and repeated-header removal only for pdf, doc and docx files

This commit is contained in:
team2
2026-03-02 20:25:54 +01:00
parent 4784ea7f02
commit 6b8d1b1936
2 changed files with 17 additions and 8 deletions

View File

@@ -24,7 +24,10 @@ final class DocumentSanitizer
private const MAX_HEADER_LEN = 120; private const MAX_HEADER_LEN = 120;
private const REPEAT_HEADER_MIN_COUNT = 3; private const REPEAT_HEADER_MIN_COUNT = 3;
public function sanitize(string $text): string public function sanitize(
string $text,
string $fileExtension
): string
{ {
if ($text === '') { if ($text === '') {
return ''; return '';
@@ -32,12 +35,14 @@ final class DocumentSanitizer
$text = $this->normalizeLineEndings($text); $text = $this->normalizeLineEndings($text);
// Wichtig: Reihenfolge so, dass wir erst "grobe Blöcke" (TOC) entfernen, $fileExtension = strtolower($fileExtension);
// danach zeilenbasierte Artefakte (PageNumbers/Headers/DotLeaders).
if (in_array($fileExtension, ['pdf', 'doc', 'docx'], true)) {
$text = $this->removeToc($text); $text = $this->removeToc($text);
$text = $this->removePageNumbers($text); $text = $this->removePageNumbers($text);
$text = $this->removeDotLeaderLines($text); $text = $this->removeDotLeaderLines($text);
$text = $this->removeRepeatedHeaders($text); $text = $this->removeRepeatedHeaders($text);
}
$text = $this->cleanupWhitespace($text); $text = $this->cleanupWhitespace($text);

View File

@@ -32,9 +32,13 @@ final readonly class KnowledgeIngestService
{ {
// 1⃣ Rohtext laden // 1⃣ Rohtext laden
$text = $this->loader->load($version->getFilePath()); $text = $this->loader->load($version->getFilePath());
$extension = $version->getFileExtension() ?? 'txt';
// 2⃣ Deterministische Textbereinigung // 2⃣ Deterministische Textbereinigung
$text = $this->documentSanitizer->sanitize($text); $text = $this->documentSanitizer->sanitize(
$text,
$extension
);
// 3⃣ 🔥 Deterministische Struktur-Anreicherung (NEU) // 3⃣ 🔥 Deterministische Struktur-Anreicherung (NEU)
$text = $this->structureEnhancer->enhance($text); $text = $this->structureEnhancer->enhance($text);