optimize ingesting documents
This commit is contained in:
@@ -20,11 +20,7 @@ final class DocumentLoader
|
||||
return match ($ext) {
|
||||
'txt', 'md' => $this->loadText($path),
|
||||
'pdf' => $this->loadPdf($path),
|
||||
|
||||
// vorbereitet für später:
|
||||
// 'docx' => $this->loadDocx($path),
|
||||
|
||||
default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
|
||||
default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -43,32 +39,53 @@ final class DocumentLoader
|
||||
$parser = new Parser();
|
||||
|
||||
try {
|
||||
$pdf = $parser->parseFile($path);
|
||||
$pdf = $parser->parseFile($path);
|
||||
$text = $pdf->getText();
|
||||
} catch (\Throwable $e) {
|
||||
throw new \RuntimeException("Failed to parse PDF: {$path}. Error: " . $e->getMessage(), 0, $e);
|
||||
throw new \RuntimeException(
|
||||
"Failed to parse PDF: {$path}. Error: " . $e->getMessage(),
|
||||
0,
|
||||
$e
|
||||
);
|
||||
}
|
||||
|
||||
return $this->normalize($text);
|
||||
}
|
||||
|
||||
/**
 * Central, purely formal text normalizer shared by all document types.
 *
 * Contains no domain logic. The steps run in a fixed order:
 *  1. unify line endings to "\n",
 *  2. remove end-of-line hyphenation ("Wort-\nfortsetzung" => "Wortfortsetzung"),
 *  3. join hard line breaks when the line does not end in . ! ? : and the
 *     next line starts with a lowercase letter (a-z, ä, ö, ü),
 *  4. put inline list items (" - item - item") on their own lines,
 *  5. collapse runs of spaces/tabs into a single space,
 *  6. collapse three or more consecutive newlines into one blank line.
 *
 * @param string $text Raw text as extracted from the document.
 *
 * @return string The normalized text, trimmed of surrounding whitespace.
 *
 * @throws \RuntimeException If a regular-expression step fails, e.g. because
 *                           $text is not valid UTF-8.
 */
private function normalize(string $text): string
{
    if ($text === '') {
        return '';
    }

    // Line endings must be unified BEFORE any "\n"-based pattern runs;
    // otherwise "Wort-\r\nfortsetzung" is never de-hyphenated and ends up
    // as "Wort- fortsetzung" after the hard-line-break repair.
    $text = str_replace(["\r\n", "\r"], "\n", $text);

    // Ordered pattern => replacement pipeline (PHP arrays keep insertion order).
    $steps = [
        // Remove hyphenation at line ends.
        '/-\n/' => '',
        // Repair hard PDF line breaks inside a sentence.
        '/([^\.\!\?\:\n])\n([a-zäöü])/u' => '$1 $2',
        // Stabilize inline lists: " - item" becomes a real new line.
        // NOTE(review): \s also matches "\n" and the pattern hits any
        // free-standing dash in prose ("10 - 5") — confirm this is intended.
        '/\s-\s/' => "\n- ",
        // Collapse repeated spaces and tabs.
        '/[ \t]+/' => ' ',
        // Collapse repeated blank lines.
        '/\n{3,}/' => "\n\n",
    ];

    foreach ($steps as $pattern => $replacement) {
        $result = preg_replace($pattern, $replacement, $text);

        // preg_replace() returns null on PCRE errors (e.g. invalid UTF-8
        // with the /u modifier). Passing that on would silently turn the
        // whole document into an empty string.
        if ($result === null) {
            throw new \RuntimeException(
                'Failed to normalize document text: ' . preg_last_error_msg()
            );
        }

        $text = $result;
    }

    return trim($text);
}
|
||||
}
|
||||
}
|
||||
@@ -8,6 +8,7 @@ use App\Entity\DocumentVersion;
|
||||
use App\Knowledge\Text\TextNormalizer;
|
||||
use App\Repository\DocumentVersionRepository;
|
||||
use App\Ingest\DocumentSanitizer;
|
||||
use App\Ingest\StructureEnhancer;
|
||||
|
||||
final readonly class KnowledgeIngestService
|
||||
{
|
||||
@@ -16,7 +17,8 @@ final readonly class KnowledgeIngestService
|
||||
private SimpleChunker $chunker,
|
||||
private DocumentVersionRepository $versionRepo,
|
||||
private TextNormalizer $textNormalizer,
|
||||
private DocumentSanitizer $documentSanitizer, // ✅ NEU
|
||||
private DocumentSanitizer $documentSanitizer,
|
||||
private StructureEnhancer $structureEnhancer, // ✅ NEU
|
||||
)
|
||||
{
|
||||
}
|
||||
@@ -31,15 +33,18 @@ final readonly class KnowledgeIngestService
|
||||
// 1️⃣ Rohtext laden
|
||||
$text = $this->loader->load($version->getFilePath());
|
||||
|
||||
// 2️⃣ 🔥 Deterministische Vorverarbeitung (NEU)
|
||||
// 2️⃣ Deterministische Textbereinigung
|
||||
$text = $this->documentSanitizer->sanitize($text);
|
||||
|
||||
// 3️⃣ Chunking
|
||||
// 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU)
|
||||
$text = $this->structureEnhancer->enhance($text);
|
||||
|
||||
// 4️⃣ Chunking
|
||||
$chunks = $this->chunker->chunk($text);
|
||||
|
||||
$doc = $version->getDocument();
|
||||
$documentId = $doc->getId()->toRfc4122();
|
||||
$versionId = $version->getId()->toRfc4122();
|
||||
$versionId = $version->getId()->toRfc4122();
|
||||
|
||||
$title = trim((string)$doc->getTitle());
|
||||
|
||||
@@ -58,7 +63,7 @@ final readonly class KnowledgeIngestService
|
||||
|
||||
$chunkId = sha1(
|
||||
$documentId . '|' .
|
||||
$versionId . '|' .
|
||||
$versionId . '|' .
|
||||
$normalizedForId
|
||||
);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user