From e7a315c1478898be2d225c77ccfb79b1734531f1 Mon Sep 17 00:00:00 2001
From: team2
Date: Sat, 28 Feb 2026 23:19:47 +0100
Subject: [PATCH] optimize ingesting documents

---
Review notes (everything between "---" and the first "diff --git" is
commentary and does not become part of the commit):

* NOTE(review): the reviewed copy of this patch had lost its line structure,
  and the head of StructureEnhancer.php (opening tag up to the first statement
  of enhance()) was missing. Namespace, class name and the enhance() signature
  are reconstructed from the call site in KnowledgeIngestService; "final" and
  declare(strict_types=1) are assumptions - confirm.
* NOTE(review): indentation of context lines is reconstructed as well. If a
  hunk does not apply, retry with "git apply --ignore-whitespace". The "index"
  lines were dropped because the blob hashes no longer match.
* StructureEnhancer: lines that already start with a Markdown heading or
  bullet marker are skipped. DocumentLoader::normalize() emits "- item" lines,
  which detectSimpleLists() used to turn into "- - item".
* StructureEnhancer: isTitleCase() no longer counts digits and punctuation as
  capitalised words; uppercaseRatio() uses Unicode letter classes; length
  limits use mb_strlen(); lines that are not converted keep their whitespace.
* DocumentLoader::normalize(): line endings are unified before hyphenation is
  removed (the "-\n" rule never matched CRLF text), and a failed preg_replace()
  now raises \RuntimeException instead of passing null to the next step.
* Both PHP files keep their trailing newline. Whitespace-only realignment
  hunks were dropped as unrelated churn.

 src/Ingest/StructureEnhancer.php              | 197 ++++++++++++++++++
 src/Knowledge/Ingest/DocumentLoader.php       |  58 ++++--
 .../Ingest/KnowledgeIngestService.php         |  11 +-
 3 files changed, 249 insertions(+), 17 deletions(-)
 create mode 100644 src/Ingest/StructureEnhancer.php

diff --git a/src/Ingest/StructureEnhancer.php b/src/Ingest/StructureEnhancer.php
new file mode 100644
--- /dev/null
+++ b/src/Ingest/StructureEnhancer.php
@@ -0,0 +1,197 @@
+<?php
+
+declare(strict_types=1);
+
+namespace App\Ingest;
+
+/**
+ * Adds lightweight Markdown structure to plain text before chunking.
+ *
+ * Isolated short lines become "## " headings and runs of short lines become
+ * "- " list items. All rules are deterministic and purely formal; lines that
+ * already start with a Markdown heading or bullet marker are left untouched.
+ */
+final class StructureEnhancer
+{
+    private const MAX_HEADING_LENGTH = 80;
+    private const MAX_LIST_ITEM_LENGTH = 120;
+    private const MIN_LIST_ITEMS = 2;
+    private const UPPERCASE_HEADING_RATIO = 0.6;
+
+    /** Matches a line that already starts with a Markdown heading or bullet marker. */
+    private const MARKDOWN_PREFIX = '/^(?:#{1,6}|[-*+])\s/';
+
+    /**
+     * Runs all structure heuristics over the given text.
+     *
+     * @return string The text with "\n" line endings, headings prefixed with "## "
+     *                and list items prefixed with "- ".
+     */
+    public function enhance(string $text): string
+    {
+        $text = $this->normalizeLineEndings($text);
+        $text = $this->detectHeadings($text);
+
+        return $this->detectSimpleLists($text);
+    }
+
+    private function normalizeLineEndings(string $text): string
+    {
+        return str_replace(["\r\n", "\r"], "\n", $text);
+    }
+
+    /**
+     * Prefixes every heading candidate with "## "; all other lines stay as they are.
+     */
+    private function detectHeadings(string $text): string
+    {
+        $lines = explode("\n", $text);
+        $out = [];
+
+        foreach ($lines as $index => $line) {
+            $trimmed = trim($line);
+
+            $out[] = $this->isHeadingCandidate($trimmed, $lines, $index)
+                ? '## ' . $trimmed
+                : $line;
+        }
+
+        return implode("\n", $out);
+    }
+
+    /**
+     * A heading is a short, isolated line (blank line or text boundary on both
+     * sides) that is mostly uppercase or mostly made of capitalised words.
+     *
+     * @param string       $line  Trimmed content of the line under test.
+     * @param list<string> $lines All lines of the text.
+     * @param int          $index Position of $line within $lines.
+     */
+    private function isHeadingCandidate(string $line, array $lines, int $index): bool
+    {
+        // mb_strlen(): the limit is meant in characters, not bytes.
+        if ($line === '' || mb_strlen($line) > self::MAX_HEADING_LENGTH) {
+            return false;
+        }
+
+        // Never mark up a line twice (e.g. headings or bullets from Markdown input).
+        if (preg_match(self::MARKDOWN_PREFIX, $line) === 1) {
+            return false;
+        }
+
+        // Lines ending in "." or containing "," read like sentences, not titles.
+        if (str_ends_with($line, '.') || str_contains($line, ',')) {
+            return false;
+        }
+
+        // Lines containing a number with a decimal point (e.g. "3.14") are rejected.
+        if (preg_match('/\d+\.\d+/', $line) === 1) {
+            return false;
+        }
+
+        $prev = $lines[$index - 1] ?? '';
+        $next = $lines[$index + 1] ?? '';
+        if (trim($prev) !== '' || trim($next) !== '') {
+            return false;
+        }
+
+        return $this->uppercaseRatio($line) > self::UPPERCASE_HEADING_RATIO
+            || $this->isTitleCase($line);
+    }
+
+    /**
+     * Share of uppercase letters among all letters of the line (0.0 if it has none).
+     */
+    private function uppercaseRatio(string $line): float
+    {
+        // preg_match_all() yields false on invalid UTF-8; treat that like "no letters".
+        $letters = preg_match_all('/\p{L}/u', $line);
+        if ($letters === false || $letters === 0) {
+            return 0.0;
+        }
+
+        $upper = (int) preg_match_all('/\p{Lu}/u', $line);
+
+        return $upper / $letters;
+    }
+
+    /**
+     * True if at least half of the words (rounded down, minimum one) start with
+     * an uppercase letter. Digits and punctuation do not count as capitalised.
+     */
+    private function isTitleCase(string $line): bool
+    {
+        $words = preg_split('/\s+/u', $line, -1, PREG_SPLIT_NO_EMPTY);
+        if ($words === false || $words === []) {
+            return false;
+        }
+
+        $capitalised = 0;
+        foreach ($words as $word) {
+            if (preg_match('/^\p{Lu}/u', $word) === 1) {
+                $capitalised++;
+            }
+        }
+
+        return $capitalised >= max(1, intdiv(count($words), 2));
+    }
+
+    /**
+     * Turns every run of at least MIN_LIST_ITEMS consecutive list candidates into
+     * "- " items; shorter runs and all other lines are passed through unchanged.
+     */
+    private function detectSimpleLists(string $text): string
+    {
+        $out = [];
+        $buffer = [];
+
+        foreach (explode("\n", $text) as $line) {
+            if ($this->isListCandidate(trim($line))) {
+                $buffer[] = $line;
+                continue;
+            }
+
+            array_push($out, ...$this->flushListBuffer($buffer));
+            $out[] = $line;
+            $buffer = [];
+        }
+
+        array_push($out, ...$this->flushListBuffer($buffer));
+
+        return implode("\n", $out);
+    }
+
+    /**
+     * Renders a finished run of candidate lines.
+     *
+     * @param list<string> $buffer Consecutive candidate lines, untrimmed.
+     *
+     * @return list<string> Trimmed "- " items if the run is long enough,
+     *                      otherwise the lines exactly as given.
+     */
+    private function flushListBuffer(array $buffer): array
+    {
+        if (count($buffer) < self::MIN_LIST_ITEMS) {
+            return $buffer;
+        }
+
+        return array_map(static fn (string $item): string => '- ' . trim($item), $buffer);
+    }
+
+    /**
+     * A list candidate is a short, non-empty line that neither ends with "."
+     * nor contains ":" and is not Markdown already.
+     */
+    private function isListCandidate(string $line): bool
+    {
+        if ($line === '' || mb_strlen($line) > self::MAX_LIST_ITEM_LENGTH) {
+            return false;
+        }
+
+        if (preg_match(self::MARKDOWN_PREFIX, $line) === 1) {
+            return false;
+        }
+
+        return !str_ends_with($line, '.') && !str_contains($line, ':');
+    }
+}
diff --git a/src/Knowledge/Ingest/DocumentLoader.php b/src/Knowledge/Ingest/DocumentLoader.php
--- a/src/Knowledge/Ingest/DocumentLoader.php
+++ b/src/Knowledge/Ingest/DocumentLoader.php
@@ -20,11 +20,7 @@ final class DocumentLoader
         return match ($ext) {
             'txt', 'md' => $this->loadText($path),
             'pdf' => $this->loadPdf($path),
-
-            // vorbereitet für später:
-            // 'docx' => $this->loadDocx($path),
-
             default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
         };
     }
 
@@ -43,32 +39,66 @@ final class DocumentLoader
         $parser = new Parser();
 
         try {
             $pdf = $parser->parseFile($path);
             $text = $pdf->getText();
         } catch (\Throwable $e) {
-            throw new \RuntimeException("Failed to parse PDF: {$path}. Error: " . $e->getMessage(), 0, $e);
+            throw new \RuntimeException(
+                "Failed to parse PDF: {$path}. Error: " . $e->getMessage(),
+                0,
+                $e
+            );
         }
 
         return $this->normalize($text);
     }
 
     /**
-     * Zentraler Normalizer für alle Dokumenttypen
+     * Central normalizer for all document types.
+     * Purely formal – no domain logic.
+     *
+     * @throws \RuntimeException if one of the regex passes fails (e.g. on invalid UTF-8)
      */
     private function normalize(string $text): string
     {
-        // Silbentrennung entfernen
-        $text = preg_replace('/-\n/', '', $text);
+        if ($text === '') {
+            return '';
+        }
 
-        // Windows-Zeilenumbrüche
-        $text = str_replace("\r\n", "\n", $text);
+        // 1️⃣ Unify line endings first so that every later rule only sees "\n"
+        $text = str_replace(["\r\n", "\r"], "\n", $text);
 
-        // Mehrfache Leerzeichen
-        $text = preg_replace('/[ \t]+/', ' ', $text);
+        // 2️⃣ Remove hyphenation at line breaks (word-\ncontinuation)
+        $text = $this->replace('/-\n/', '', $text);
+
+        // 3️⃣ Repair hard PDF line breaks: if a line does not end with . ! ? or :
+        //    and the next one starts with a lowercase letter, join them
+        $text = $this->replace('/([^\.\!\?\:\n])\n([a-zäöü])/u', '$1 $2', $text);
+
+        // 4️⃣ Stabilise inline lists: " - item - item" → one item per line
+        $text = $this->replace('/\s-\s/', "\n- ", $text);
+
+        // 5️⃣ Collapse runs of spaces and tabs
+        $text = $this->replace('/[ \t]+/', ' ', $text);
 
-        // Mehrfache Leerzeilen
-        $text = preg_replace('/\n{3,}/', "\n\n", $text);
+        // 6️⃣ Collapse runs of blank lines
+        $text = $this->replace('/\n{3,}/', "\n\n", $text);
 
         return trim($text);
     }
+
+    /**
+     * preg_replace() that fails loudly: a null result (PCRE error, e.g. invalid
+     * UTF-8 with the /u modifier) must not be handed on to the next step.
+     *
+     * @throws \RuntimeException if PCRE reports an error
+     */
+    private function replace(string $pattern, string $replacement, string $text): string
+    {
+        $result = preg_replace($pattern, $replacement, $text);
+        if ($result === null) {
+            throw new \RuntimeException('Failed to normalize text: ' . preg_last_error_msg());
+        }
+
+        return $result;
+    }
 }
diff --git a/src/Knowledge/Ingest/KnowledgeIngestService.php b/src/Knowledge/Ingest/KnowledgeIngestService.php
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -8,6 +8,7 @@ use App\Entity\DocumentVersion;
 use App\Knowledge\Text\TextNormalizer;
 use App\Repository\DocumentVersionRepository;
 use App\Ingest\DocumentSanitizer;
+use App\Ingest\StructureEnhancer;
 
 final readonly class KnowledgeIngestService
 {
@@ -16,7 +17,8 @@ final readonly class KnowledgeIngestService
         private SimpleChunker $chunker,
         private DocumentVersionRepository $versionRepo,
         private TextNormalizer $textNormalizer,
-        private DocumentSanitizer $documentSanitizer, // ✅ NEU
+        private DocumentSanitizer $documentSanitizer,
+        private StructureEnhancer $structureEnhancer,
     ) {
     }
 
@@ -31,10 +33,13 @@ final readonly class KnowledgeIngestService
         // 1️⃣ Rohtext laden
         $text = $this->loader->load($version->getFilePath());
 
-        // 2️⃣ 🔥 Deterministische Vorverarbeitung (NEU)
+        // 2️⃣ Deterministic text clean-up
         $text = $this->documentSanitizer->sanitize($text);
 
-        // 3️⃣ Chunking
+        // 3️⃣ Deterministic structure enrichment (headings, lists)
+        $text = $this->structureEnhancer->enhance($text);
+
+        // 4️⃣ Chunking
         $chunks = $this->chunker->chunk($text);
 
         $doc = $version->getDocument();