optimize sanitizer
This commit is contained in:
@@ -24,7 +24,10 @@ final class DocumentSanitizer
|
|||||||
private const MAX_HEADER_LEN = 120;
|
private const MAX_HEADER_LEN = 120;
|
||||||
private const REPEAT_HEADER_MIN_COUNT = 3;
|
private const REPEAT_HEADER_MIN_COUNT = 3;
|
||||||
|
|
||||||
public function sanitize(string $text): string
|
public function sanitize(
|
||||||
|
string $text,
|
||||||
|
string $fileExtension
|
||||||
|
): string
|
||||||
{
|
{
|
||||||
if ($text === '') {
|
if ($text === '') {
|
||||||
return '';
|
return '';
|
||||||
@@ -32,12 +35,14 @@ final class DocumentSanitizer
|
|||||||
|
|
||||||
$text = $this->normalizeLineEndings($text);
|
$text = $this->normalizeLineEndings($text);
|
||||||
|
|
||||||
// Wichtig: Reihenfolge so, dass wir erst "grobe Blöcke" (TOC) entfernen,
|
$fileExtension = strtolower($fileExtension);
|
||||||
// danach zeilenbasierte Artefakte (PageNumbers/Headers/DotLeaders).
|
|
||||||
|
if (in_array($fileExtension, ['pdf', 'doc', 'docx'], true)) {
|
||||||
$text = $this->removeToc($text);
|
$text = $this->removeToc($text);
|
||||||
$text = $this->removePageNumbers($text);
|
$text = $this->removePageNumbers($text);
|
||||||
$text = $this->removeDotLeaderLines($text);
|
$text = $this->removeDotLeaderLines($text);
|
||||||
$text = $this->removeRepeatedHeaders($text);
|
$text = $this->removeRepeatedHeaders($text);
|
||||||
|
}
|
||||||
|
|
||||||
$text = $this->cleanupWhitespace($text);
|
$text = $this->cleanupWhitespace($text);
|
||||||
|
|
||||||
|
|||||||
@@ -32,9 +32,13 @@ final readonly class KnowledgeIngestService
|
|||||||
{
|
{
|
||||||
// 1️⃣ Rohtext laden
|
// 1️⃣ Rohtext laden
|
||||||
$text = $this->loader->load($version->getFilePath());
|
$text = $this->loader->load($version->getFilePath());
|
||||||
|
$extension = $version->getFileExtension() ?? 'txt';
|
||||||
|
|
||||||
// 2️⃣ Deterministische Textbereinigung
|
// 2️⃣ Deterministische Textbereinigung
|
||||||
$text = $this->documentSanitizer->sanitize($text);
|
$text = $this->documentSanitizer->sanitize(
|
||||||
|
$text,
|
||||||
|
$extension
|
||||||
|
);
|
||||||
|
|
||||||
// 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU)
|
// 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU)
|
||||||
$text = $this->structureEnhancer->enhance($text);
|
$text = $this->structureEnhancer->enhance($text);
|
||||||
|
|||||||
Reference in New Issue
Block a user