Optimize sanitizer: run TOC, page-number, dot-leader and repeated-header removal only for pdf, doc and docx files
This commit is contained in:
@@ -24,7 +24,10 @@ final class DocumentSanitizer
|
||||
private const MAX_HEADER_LEN = 120;
|
||||
private const REPEAT_HEADER_MIN_COUNT = 3;
|
||||
|
||||
public function sanitize(string $text): string
|
||||
public function sanitize(
|
||||
string $text,
|
||||
string $fileExtension
|
||||
): string
|
||||
{
|
||||
if ($text === '') {
|
||||
return '';
|
||||
@@ -32,12 +35,14 @@ final class DocumentSanitizer
|
||||
|
||||
$text = $this->normalizeLineEndings($text);
|
||||
|
||||
// Wichtig: Reihenfolge so, dass wir erst "grobe Blöcke" (TOC) entfernen,
|
||||
// danach zeilenbasierte Artefakte (PageNumbers/Headers/DotLeaders).
|
||||
$fileExtension = strtolower($fileExtension);
|
||||
|
||||
if (in_array($fileExtension, ['pdf', 'doc', 'docx'], true)) {
|
||||
$text = $this->removeToc($text);
|
||||
$text = $this->removePageNumbers($text);
|
||||
$text = $this->removeDotLeaderLines($text);
|
||||
$text = $this->removeRepeatedHeaders($text);
|
||||
}
|
||||
|
||||
$text = $this->cleanupWhitespace($text);
|
||||
|
||||
|
||||
@@ -32,9 +32,13 @@ final readonly class KnowledgeIngestService
|
||||
{
|
||||
// 1️⃣ Rohtext laden
|
||||
$text = $this->loader->load($version->getFilePath());
|
||||
$extension = $version->getFileExtension() ?? 'txt';
|
||||
|
||||
// 2️⃣ Deterministische Textbereinigung
|
||||
$text = $this->documentSanitizer->sanitize($text);
|
||||
$text = $this->documentSanitizer->sanitize(
|
||||
$text,
|
||||
$extension
|
||||
);
|
||||
|
||||
// 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU)
|
||||
$text = $this->structureEnhancer->enhance($text);
|
||||
|
||||
Reference in New Issue
Block a user