optimize ingesting documents

This commit is contained in:
team2
2026-02-28 23:19:47 +01:00
parent 509ba83ac0
commit e7a315c147
3 changed files with 223 additions and 19 deletions

View File

@@ -0,0 +1,182 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
/**
 * Adds lightweight Markdown structure (headings and bullet lists) to plain text
 * using purely formal, deterministic heuristics.
 *
 * Lines that already carry a Markdown heading marker ("#" .. "######") or a
 * bullet marker ("-", "*", "+") are never re-marked, so running the enhancer on
 * Markdown input or on its own output does not stack markers.
 */
final class StructureEnhancer
{
    /** Maximum length (in characters, not bytes) of a line that may become a heading. */
    private const HEADING_MAX_LENGTH = 80;

    /** Maximum length (in characters, not bytes) of a line that may become a list item. */
    private const LIST_ITEM_MAX_LENGTH = 120;

    /** Minimum number of consecutive candidate lines required to form a list. */
    private const MIN_LIST_ITEMS = 2;

    /** A line whose share of uppercase letters exceeds this value counts as a heading. */
    private const UPPERCASE_HEADING_RATIO = 0.6;

    /** Matches a trimmed line that already starts with a Markdown heading or bullet marker. */
    private const MARKDOWN_MARKER_PATTERN = '/^(?:#{1,6}|[-*+])\s/';

    /**
     * Runs the full pipeline: line-ending normalisation, heading detection,
     * then list detection.
     *
     * @param string $text Raw document text.
     *
     * @return string Text with "\n" line endings and added "## " / "- " markers.
     */
    public function enhance(string $text): string
    {
        if ($text === '') {
            return '';
        }

        $text = $this->normalizeLineEndings($text);
        $text = $this->detectHeadings($text);

        return $this->detectSimpleLists($text);
    }

    /**
     * Converts Windows ("\r\n") and old Mac ("\r") line endings to "\n".
     */
    private function normalizeLineEndings(string $text): string
    {
        return str_replace(["\r\n", "\r"], "\n", $text);
    }

    /**
     * Prefixes every heading candidate with "## "; all other lines are kept verbatim.
     */
    private function detectHeadings(string $text): string
    {
        $lines = explode("\n", $text);
        $out = [];

        foreach ($lines as $index => $line) {
            $trimmed = trim($line);
            $out[] = $this->isHeadingCandidate($trimmed, $lines, $index)
                ? '## ' . $trimmed
                : $line;
        }

        return implode("\n", $out);
    }

    /**
     * Decides whether a trimmed line looks like a stand-alone heading.
     *
     * A heading must be short, must not look like a sentence (trailing ".",
     * any ","), must not contain a decimal/section-like number ("1.2"), must be
     * surrounded by blank lines (or the document edges), and must be either
     * mostly uppercase or title-cased.
     *
     * @param string       $line  The trimmed line under test.
     * @param list<string> $lines All lines of the document (untrimmed).
     * @param int          $index Position of $line within $lines.
     */
    private function isHeadingCandidate(string $line, array $lines, int $index): bool
    {
        if ($line === '' || $this->hasMarkdownMarker($line)) {
            // Existing "# Title" / "- item" lines must not become "## # Title" / "## - item".
            return false;
        }

        // Character count, not byte count: umlauts must not shorten the limit.
        if (mb_strlen($line) > self::HEADING_MAX_LENGTH) {
            return false;
        }

        if (str_ends_with($line, '.') || str_contains($line, ',')) {
            return false;
        }

        if (preg_match('/\d+\.\d+/', $line) === 1) {
            return false;
        }

        // Out-of-range neighbours (first/last line) count as blank.
        $previous = $lines[$index - 1] ?? '';
        $next = $lines[$index + 1] ?? '';
        if (trim($previous) !== '' || trim($next) !== '') {
            return false;
        }

        return $this->uppercaseRatio($line) > self::UPPERCASE_HEADING_RATIO
            || $this->isTitleCase($line);
    }

    /**
     * Returns the share of uppercase letters among all letters of the line.
     *
     * Uses Unicode letter classes so that any script is handled, not only
     * ASCII plus German umlauts.
     *
     * @return float Value in [0.0, 1.0]; 0.0 when the line has no letters or is
     *               not valid UTF-8.
     */
    private function uppercaseRatio(string $line): float
    {
        $letterCount = preg_match_all('/\p{L}/u', $line);
        if ($letterCount === false || $letterCount === 0) {
            return 0.0;
        }

        $upperCount = preg_match_all('/\p{Lu}/u', $line);
        if ($upperCount === false) {
            return 0.0;
        }

        return (float) $upperCount / $letterCount;
    }

    /**
     * Checks whether at least half of the words (minimum one) start with an
     * uppercase letter.
     *
     * Leading punctuation such as quotes or brackets is skipped, but a word
     * starting with a digit or consisting only of symbols does not count as
     * capitalised.
     */
    private function isTitleCase(string $line): bool
    {
        $words = preg_split('/\s+/u', $line, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false || $words === []) {
            return false;
        }

        $capitalised = 0;
        foreach ($words as $word) {
            if (preg_match('/^\p{P}*\p{Lu}/u', $word) === 1) {
                $capitalised++;
            }
        }

        return $capitalised >= max(1, intdiv(count($words), 2));
    }

    /**
     * Turns runs of at least two consecutive list candidates into "- " bullet
     * items. Shorter runs and all other lines are passed through unchanged.
     */
    private function detectSimpleLists(string $text): string
    {
        $out = [];
        $buffer = [];

        foreach (explode("\n", $text) as $line) {
            if ($this->isListCandidate(trim($line))) {
                $buffer[] = $line;
                continue;
            }

            foreach ($this->renderListBuffer($buffer) as $rendered) {
                $out[] = $rendered;
            }
            $buffer = [];
            $out[] = $line;
        }

        // A run that reaches the end of the document still has to be emitted.
        foreach ($this->renderListBuffer($buffer) as $rendered) {
            $out[] = $rendered;
        }

        return implode("\n", $out);
    }

    /**
     * Renders a run of buffered candidate lines.
     *
     * @param list<string> $buffer Original (untrimmed) candidate lines.
     *
     * @return list<string> Trimmed "- " items when the run is long enough to be
     *                      a list, otherwise the original lines untouched.
     */
    private function renderListBuffer(array $buffer): array
    {
        if (count($buffer) < self::MIN_LIST_ITEMS) {
            return $buffer;
        }

        return array_map(
            static fn (string $item): string => '- ' . trim($item),
            $buffer,
        );
    }

    /**
     * Decides whether a trimmed line may be part of an implicit list: non-empty,
     * not too long, not a sentence (trailing "."), not a "key: value" line, and
     * not already carrying a Markdown marker.
     */
    private function isListCandidate(string $line): bool
    {
        if ($line === '' || $this->hasMarkdownMarker($line)) {
            return false;
        }

        if (mb_strlen($line) > self::LIST_ITEM_MAX_LENGTH) {
            return false;
        }

        return !str_ends_with($line, '.') && !str_contains($line, ':');
    }

    /**
     * Tells whether a trimmed line already begins with a Markdown heading
     * ("#" to "######") or bullet ("-", "*", "+") marker followed by whitespace.
     */
    private function hasMarkdownMarker(string $line): bool
    {
        return preg_match(self::MARKDOWN_MARKER_PATTERN, $line) === 1;
    }
}

View File

@@ -20,10 +20,6 @@ final class DocumentLoader
return match ($ext) {
'txt', 'md' => $this->loadText($path),
'pdf' => $this->loadPdf($path),
// vorbereitet für später:
// 'docx' => $this->loadDocx($path),
default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
};
}
@@ -46,27 +42,48 @@ final class DocumentLoader
$pdf = $parser->parseFile($path);
$text = $pdf->getText();
} catch (\Throwable $e) {
throw new \RuntimeException("Failed to parse PDF: {$path}. Error: " . $e->getMessage(), 0, $e);
throw new \RuntimeException(
"Failed to parse PDF: {$path}. Error: " . $e->getMessage(),
0,
$e
);
}
return $this->normalize($text);
}
/**
* Zentraler Normalizer für alle Dokumenttypen
* Zentraler Normalizer für alle Dokumenttypen.
* Rein formal keine Domain-Logik.
*/
private function normalize(string $text): string
{
// Silbentrennung entfernen
if ($text === '') {
return '';
}
// 1⃣ Silbentrennung entfernen (Wort-\nFortsetzung)
$text = preg_replace('/-\n/', '', $text);
// Windows-Zeilenumbrüche
$text = str_replace("\r\n", "\n", $text);
// 2⃣ Einheitliche Zeilenumbrüche
$text = str_replace(["\r\n", "\r"], "\n", $text);
// Mehrfache Leerzeichen
// 3⃣ Harte PDF-Zeilenumbrüche reparieren:
// Wenn Zeile nicht mit Punkt endet und nächste mit Kleinbuchstabe beginnt → zusammenführen
$text = preg_replace(
'/([^\.\!\?\:\n])\n([a-zäöü])/u',
'$1 $2',
$text
);
// 4⃣ Inline-Listen stabilisieren:
// " - Punkt - Punkt" → echte neue Zeile
$text = preg_replace('/\s-\s/', "\n- ", $text);
// 5⃣ Mehrfache Leerzeichen reduzieren
$text = preg_replace('/[ \t]+/', ' ', $text);
// Mehrfache Leerzeilen
// 6 Mehrfache Leerzeilen reduzieren
$text = preg_replace('/\n{3,}/', "\n\n", $text);
return trim($text);

View File

@@ -8,6 +8,7 @@ use App\Entity\DocumentVersion;
use App\Knowledge\Text\TextNormalizer;
use App\Repository\DocumentVersionRepository;
use App\Ingest\DocumentSanitizer;
use App\Ingest\StructureEnhancer;
final readonly class KnowledgeIngestService
{
@@ -16,7 +17,8 @@ final readonly class KnowledgeIngestService
private SimpleChunker $chunker,
private DocumentVersionRepository $versionRepo,
private TextNormalizer $textNormalizer,
private DocumentSanitizer $documentSanitizer, // ✅ NEU
private DocumentSanitizer $documentSanitizer,
private StructureEnhancer $structureEnhancer, // ✅ NEU
)
{
}
@@ -31,10 +33,13 @@ final readonly class KnowledgeIngestService
// 1⃣ Rohtext laden
$text = $this->loader->load($version->getFilePath());
// 2 🔥 Deterministische Vorverarbeitung (NEU)
// 2⃣ Deterministische Textbereinigung
$text = $this->documentSanitizer->sanitize($text);
// 3Chunking
// 3🔥 Deterministische Struktur-Anreicherung (NEU)
$text = $this->structureEnhancer->enhance($text);
// 4⃣ Chunking
$chunks = $this->chunker->chunk($text);
$doc = $version->getDocument();