diff --git a/python/vector/vector_ingest.py b/python/vector/vector_ingest.py index 2fefdc7..05a4bfa 100644 --- a/python/vector/vector_ingest.py +++ b/python/vector/vector_ingest.py @@ -96,8 +96,8 @@ print("Encoding embeddings...") embeddings = model.encode( texts, normalize_embeddings=True, - show_progress_bar=True, - batch_size=64 + show_progress_bar=False, + batch_size=128 ) embeddings = np.array(embeddings).astype("float32") diff --git a/python/vector/vector_ingest_tags.py b/python/vector/vector_ingest_tags.py index 0e52995..2048dc9 100644 --- a/python/vector/vector_ingest_tags.py +++ b/python/vector/vector_ingest_tags.py @@ -109,8 +109,8 @@ if not texts: embeddings = model.encode( texts, normalize_embeddings=True, - show_progress_bar=False, - batch_size=64 + show_progress_bar=True, + batch_size=128 ) embeddings = np.array(embeddings).astype("float32") diff --git a/src/Ingest/ChunkWriteService.php b/src/Ingest/ChunkWriteService.php index 2967418..ed7d2b6 100644 --- a/src/Ingest/ChunkWriteService.php +++ b/src/Ingest/ChunkWriteService.php @@ -11,8 +11,10 @@ use Symfony\Component\Uid\Uuid; final readonly class ChunkWriteService { public function __construct( - private ChunkManager $chunkManager, - ) {} + private ChunkManager $chunkManager + ) + { + } public function countAllChunks(): int { @@ -41,4 +43,5 @@ final readonly class ChunkWriteService { $this->chunkManager->rewriteAll($allChunks); } + } \ No newline at end of file diff --git a/src/Ingest/DocumentSanitizer.php b/src/Ingest/DocumentSanitizer.php new file mode 100644 index 0000000..9a89f38 --- /dev/null +++ b/src/Ingest/DocumentSanitizer.php @@ -0,0 +1,232 @@ +normalizeLineEndings($text); + + // Wichtig: Reihenfolge so, dass wir erst "grobe Blöcke" (TOC) entfernen, + // danach zeilenbasierte Artefakte (PageNumbers/Headers/DotLeaders). + $text = $this->removeToc($text); + $text = $this->removePageNumbers($text); + $text = $this->removeDotLeaderLines($text); + $text = $this->removeRepeatedHeaders($text); + + $text = $this->cleanupWhitespace($text); + + return trim($text); + } + + private function normalizeLineEndings(string $text): string + { + // Vereinheitlichen auf \n (deterministisch, kein Encoding-Change) + return str_replace(["\r\n", "\r"], "\n", $text); + } + + /** + * Entfernt TOC-Block nach "Inhaltsverzeichnis" bis zum ersten "echten" Absatz. + * + * Heuristik: + * - Start: Zeile enthält "Inhaltsverzeichnis" (case-insensitive) + * - Innerhalb TOC werden Zeilen entfernt, die wie TOC-Einträge aussehen: + * - Dot-Leader + Seitenzahl + * - Kapitelnummern + Text + Seitenzahl + * - Ende: sobald eine Zeile "absatzartig" wirkt: + * - ausreichend lang UND enthält Satzpunkt (.) + * + * Guardrail: + * - Leere Zeilen innerhalb TOC werden verworfen (damit TOC-Block wirklich weg ist) + */ + private function removeToc(string $text): string + { + $lines = explode("\n", $text); + $filtered = []; + + $inToc = false; + + foreach ($lines as $line) { + $trim = trim($line); + + // TOC Start + if (!$inToc && $trim !== '' && stripos($trim, 'inhaltsverzeichnis') !== false) { + $inToc = true; + continue; + } + + if ($inToc) { + // Innerhalb TOC: leere Zeilen weg (Block entfernen) + if ($trim === '') { + continue; + } + + // typische TOC-Zeilen (Leader / Kapitelnummern) + if ($this->looksLikeDotLeaderLine($trim) || $this->looksLikeNumberedTocLine($trim)) { + continue; + } + + // Ende TOC, wenn "echter Absatz" beginnt (lang + Punkt) + if (strlen($trim) >= 120 && str_contains($trim, '.')) { + $inToc = false; + $filtered[] = $line; + continue; + } + + // sonst: solange wir im TOC sind, ignorieren + continue; + } + + $filtered[] = $line; + } + + return implode("\n", $filtered); + } + + /** + * Entfernt typische Seitenzahl-Zeilen. + * + * Guardrails: + * - Nur kurze, "isolierte" Zeilen (trim != '') + * - Lässt Fließtext unangetastet + */ + private function removePageNumbers(string $text): string + { + $lines = explode("\n", $text); + $filtered = []; + + foreach ($lines as $line) { + $trim = trim($line); + + if ($trim === '') { + $filtered[] = $line; + continue; + } + + // "Seite 3" / "Seite 3 von 20" + if (preg_match('/^seite\s+\d+(\s+von\s+\d+)?$/iu', $trim)) { + continue; + } + + // "Page 12" / "Page 12 of 34" + if (preg_match('/^page\s+\d+(\s+of\s+\d+)?$/iu', $trim)) { + continue; + } + + // "- 4 -" / "4" / "– 4 –" + if (preg_match('/^[-–]?\s?\d{1,3}\s?[-–]?$/u', $trim)) { + continue; + } + + $filtered[] = $line; + } + + return implode("\n", $filtered); + } + + /** + * Entfernt Dot-Leader-Zeilen überall (nicht nur im TOC), + * z.B.: "Kapitel ......... 12" + */ + private function removeDotLeaderLines(string $text): string + { + $lines = explode("\n", $text); + $filtered = []; + + foreach ($lines as $line) { + $trim = trim($line); + + if ($trim !== '' && $this->looksLikeDotLeaderLine($trim)) { + continue; + } + + $filtered[] = $line; + } + + return implode("\n", $filtered); + } + + /** + * Entfernt wiederkehrende Header/Footer-Zeilen. + * + * Guardrails: + * - Nur relativ kurze Zeilen (unter MAX_HEADER_LEN) + * - Nur wenn identisch (trim) >= REPEAT_HEADER_MIN_COUNT + * - Leere Zeilen bleiben erhalten + */ + private function removeRepeatedHeaders(string $text): string + { + $lines = explode("\n", $text); + + // counts basiert auf trim (damit z.B. unterschiedliche Einrückung nicht zählt) + $trimmed = array_map('trim', $lines); + $counts = array_count_values($trimmed); + + $filtered = []; + + foreach ($lines as $line) { + $trim = trim($line); + + if ( + $trim !== '' && + strlen($trim) < self::MAX_HEADER_LEN && + ($counts[$trim] ?? 0) >= self::REPEAT_HEADER_MIN_COUNT + ) { + continue; + } + + $filtered[] = $line; + } + + return implode("\n", $filtered); + } + + private function cleanupWhitespace(string $text): string + { + // nicht zu aggressiv: nur 3+ Leerzeilen auf 2 reduzieren + $text = preg_replace("/\n{3,}/", "\n\n", $text); + return $text ?? ''; + } + + // ========================================================= + // Heuristics (isoliert, testbar) + // ========================================================= + + private function looksLikeDotLeaderLine(string $trimmedLine): bool + { + // "Text ..... 12" (mind. 5 Punkte, Seitenzahl am Ende) + return (bool)preg_match('/^.+\.{5,}\s*\d+$/u', $trimmedLine); + } + + private function looksLikeNumberedTocLine(string $trimmedLine): bool + { + // "2.1 Kapitelname 12" / "3 Kapitelname 7" + // Kapitelnummern + Text + Seitenzahl am Ende + return (bool)preg_match('/^\d+(\.\d+)*\s+.+\s+\d+$/u', $trimmedLine); + } +} \ No newline at end of file diff --git a/src/Knowledge/Ingest/KnowledgeIngestService.php b/src/Knowledge/Ingest/KnowledgeIngestService.php index d2f05ad..1b133c2 100644 --- a/src/Knowledge/Ingest/KnowledgeIngestService.php +++ b/src/Knowledge/Ingest/KnowledgeIngestService.php @@ -7,14 +7,16 @@ namespace App\Knowledge\Ingest; use App\Entity\DocumentVersion; use App\Knowledge\Text\TextNormalizer; use App\Repository\DocumentVersionRepository; +use App\Ingest\DocumentSanitizer; -final readonly class KnowledgeIngestService +final readonly class KnowledgeIngestService { public function __construct( private DocumentLoader $loader, private SimpleChunker $chunker, private DocumentVersionRepository $versionRepo, - private TextNormalizer $textNormalizer + private TextNormalizer $textNormalizer, + private DocumentSanitizer $documentSanitizer, // ✅ NEU ) { } @@ -26,8 +28,13 @@ final readonly class KnowledgeIngestService */ public function buildChunkRecords(DocumentVersion $version): iterable { + // 1️⃣ Rohtext laden $text = $this->loader->load($version->getFilePath()); + // 2️⃣ 🔥 Deterministische Vorverarbeitung (NEU) + $text = $this->documentSanitizer->sanitize($text); + + // 3️⃣ Chunking $chunks = $this->chunker->chunk($text); $doc = $version->getDocument(); @@ -41,7 +48,6 @@ final readonly class KnowledgeIngestService foreach ($chunks as $chunkText) { if ($title !== '' && !str_starts_with($chunkText, $title)) { - //title with backticks $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText; } @@ -57,13 +63,13 @@ final readonly class KnowledgeIngestService ); yield [ - 'chunk_id' => $chunkId, + 'chunk_id' => $chunkId, 'document_id' => $documentId, - 'version_id' => $versionId, + 'version_id' => $versionId, 'chunk_index' => $index++, - 'text' => $chunkText, - 'checksum' => sha1($chunkText), - 'metadata' => $this->buildMetadata($version), + 'text' => $chunkText, + 'checksum' => sha1($chunkText), + 'metadata' => $this->buildMetadata($version), ]; } } diff --git a/src/Knowledge/Ingest/SimpleChunker.php b/src/Knowledge/Ingest/SimpleChunker.php index 043e6f5..e75466d 100644 --- a/src/Knowledge/Ingest/SimpleChunker.php +++ b/src/Knowledge/Ingest/SimpleChunker.php @@ -10,10 +10,9 @@ use App\Knowledge\Text\TextNormalizer; final readonly class SimpleChunker { - public function __construct( private IndexConfigurationProvider $configurationProvider, - private TextNormalizer $textNormalizer + private TextNormalizer $textNormalizer ) { } @@ -23,7 +22,7 @@ final readonly class SimpleChunker { $config = $this->configurationProvider->getConfiguration(); - $maxWords = $config->getChunkSize(); + $maxWords = $config->getChunkSize(); $overlapWords = $config->getChunkOverlap(); $text = $this->textNormalizer->normalize($text); @@ -31,6 +30,74 @@ final readonly class SimpleChunker return []; } + // ====================================================== + // HYBRID: Erst Absatzbasiert sammeln + // ====================================================== + + $paragraphs = preg_split('/\n{2,}/u', $text); + if (!$paragraphs) { + return []; + } + + $chunks = []; + $currentChunk = ''; + $currentWordCount = 0; + + foreach ($paragraphs as $paragraph) { + + $paragraph = trim($paragraph); + if ($paragraph === '') { + continue; + } + + $paragraphWordCount = $this->countWords($paragraph); + + // Falls einzelner Absatz größer als maxWords → Fallback + if ($paragraphWordCount > $maxWords) { + + if ($currentChunk !== '') { + $chunks[] = trim($currentChunk); + $currentChunk = ''; + $currentWordCount = 0; + } + + foreach ($this->chunkByWords($paragraph, $maxWords, $overlapWords) as $subChunk) { + $chunks[] = $subChunk; + } + + continue; + } + + // Absatz passt noch in aktuellen Chunk + if ($currentWordCount + $paragraphWordCount <= $maxWords) { + $currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph; + $currentWordCount += $paragraphWordCount; + continue; + } + + // Flush aktueller Chunk + if ($currentChunk !== '') { + $chunks[] = trim($currentChunk); + } + + $currentChunk = $paragraph; + $currentWordCount = $paragraphWordCount; + } + + if ($currentChunk !== '') { + $chunks[] = trim($currentChunk); + } + + return $this->dedupe($chunks); + } + + // ====================================================== + // Wortbasierter Fallback (Original-Logik beibehalten) + // ====================================================== + + /** @return string[] */ + private function chunkByWords(string $text, int $maxWords, int $overlapWords): array + { $tokens = preg_split( '/(\s+)/u', $text, @@ -61,7 +128,7 @@ final readonly class SimpleChunker $wordEnd = min($wordPos + $maxWords, $totalWords); $tokenStart = $wordTokenIndexes[$wordPos]; - $tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1; + $tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1; $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd); @@ -82,7 +149,7 @@ final readonly class SimpleChunker $wordPos = max(0, $wordEnd - $overlapWords); } - return $this->dedupe($chunks); + return $chunks; } private function adjustCutToBoundary(array $tokens, int $start, int $end): int @@ -110,11 +177,17 @@ final readonly class SimpleChunker return $end; } + private function countWords(string $text): int + { + $parts = preg_split('/\s+/u', trim($text)); + return $parts ? count($parts) : 0; + } + /** @param string[] $chunks @return string[] */ private function dedupe(array $chunks): array { $seen = []; - $out = []; + $out = []; foreach ($chunks as $chunk) { $key = mb_strtolower( @@ -131,4 +204,4 @@ final readonly class SimpleChunker return $out; } -} +} \ No newline at end of file