Optimize document ingestion

This commit is contained in:
team2
2026-02-28 23:19:47 +01:00
parent 509ba83ac0
commit e7a315c147
3 changed files with 223 additions and 19 deletions

View File

@@ -20,11 +20,7 @@ final class DocumentLoader
return match ($ext) {
'txt', 'md' => $this->loadText($path),
'pdf' => $this->loadPdf($path),
// vorbereitet für später:
// 'docx' => $this->loadDocx($path),
default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
};
}
@@ -43,32 +39,53 @@ final class DocumentLoader
$parser = new Parser();
try {
$pdf = $parser->parseFile($path);
$pdf = $parser->parseFile($path);
$text = $pdf->getText();
} catch (\Throwable $e) {
throw new \RuntimeException("Failed to parse PDF: {$path}. Error: " . $e->getMessage(), 0, $e);
throw new \RuntimeException(
"Failed to parse PDF: {$path}. Error: " . $e->getMessage(),
0,
$e
);
}
return $this->normalize($text);
}
/**
 * Central normalizer applied to the extracted text of every document type.
 *
 * Purely formal clean-up; no domain logic. The steps run in this order:
 *  1. unify line endings to "\n",
 *  2. remove end-of-line hyphenation ("Wort-\nFortsetzung" becomes "WortFortsetzung"),
 *  3. re-join hard line breaks when the line does not end in . ! ? : and
 *     the next line starts with a lowercase letter (a-z, ä, ö, ü),
 *  4. move inline list items (" - item - item") onto their own lines,
 *  5. collapse runs of spaces/tabs into a single space,
 *  6. collapse three or more consecutive newlines into one blank line.
 *
 * @param string $text Raw text as produced by a loader.
 *
 * @return string The normalized text, trimmed on both ends.
 */
private function normalize(string $text): string
{
    if ($text === '') {
        return '';
    }

    // Line endings are unified first so that every pattern below only has to
    // deal with "\n". Doing this after the hyphenation step would leave
    // "-\r\n" and "-\r" sequences untouched.
    $text = str_replace(["\r\n", "\r"], "\n", $text);

    // Ordered rewrite rules: pattern => replacement.
    $rules = [
        // Hyphenation at the end of a line.
        '/-\n/' => '',
        // Hard line break inside a sentence.
        '/([^\.\!\?\:\n])\n([a-zäöü])/u' => '$1 $2',
        // Inline list separator. NOTE: this matches any whitespace-hyphen-whitespace
        // sequence, including a dash used inside running text.
        '/\s-\s/' => "\n- ",
        // Runs of spaces and tabs.
        '/[ \t]+/' => ' ',
        // More than one blank line in a row.
        '/\n{3,}/' => "\n\n",
    ];

    foreach ($rules as $pattern => $replacement) {
        // preg_replace() returns null on a PCRE error (e.g. invalid UTF-8 with
        // the "u" modifier). Keep the text from the previous step in that case
        // instead of losing the whole document.
        $text = preg_replace($pattern, $replacement, $text) ?? $text;
    }

    return trim($text);
}
}
}

View File

@@ -8,6 +8,7 @@ use App\Entity\DocumentVersion;
use App\Knowledge\Text\TextNormalizer;
use App\Repository\DocumentVersionRepository;
use App\Ingest\DocumentSanitizer;
use App\Ingest\StructureEnhancer;
final readonly class KnowledgeIngestService
{
@@ -16,7 +17,8 @@ final readonly class KnowledgeIngestService
private SimpleChunker $chunker,
private DocumentVersionRepository $versionRepo,
private TextNormalizer $textNormalizer,
private DocumentSanitizer $documentSanitizer, // ✅ NEU
private DocumentSanitizer $documentSanitizer,
private StructureEnhancer $structureEnhancer, // ✅ NEU
)
{
}
@@ -31,15 +33,18 @@ final readonly class KnowledgeIngestService
// 1⃣ Rohtext laden
$text = $this->loader->load($version->getFilePath());
// 2 🔥 Deterministische Vorverarbeitung (NEU)
// 2⃣ Deterministische Textbereinigung
$text = $this->documentSanitizer->sanitize($text);
// 3Chunking
// 3🔥 Deterministische Struktur-Anreicherung (NEU)
$text = $this->structureEnhancer->enhance($text);
// 4⃣ Chunking
$chunks = $this->chunker->chunk($text);
$doc = $version->getDocument();
$documentId = $doc->getId()->toRfc4122();
$versionId = $version->getId()->toRfc4122();
$versionId = $version->getId()->toRfc4122();
$title = trim((string)$doc->getTitle());
@@ -58,7 +63,7 @@ final readonly class KnowledgeIngestService
$chunkId = sha1(
$documentId . '|' .
$versionId . '|' .
$versionId . '|' .
$normalizedForId
);