From a5a6f466f32022378e578c7d17fa62d57eef2826 Mon Sep 17 00:00:00 2001 From: team2 Date: Fri, 27 Feb 2026 15:37:05 +0100 Subject: [PATCH] optimize chunk text normalizer --- .../Ingest/KnowledgeIngestService.php | 28 ++-------- .../Retrieval/NdjsonHybridRetriever.php | 2 +- src/Knowledge/Text/TextNormalizer.php | 52 ++++++++++++++++--- 3 files changed, 52 insertions(+), 30 deletions(-) diff --git a/src/Knowledge/Ingest/KnowledgeIngestService.php b/src/Knowledge/Ingest/KnowledgeIngestService.php index acf3b14..5a11732 100644 --- a/src/Knowledge/Ingest/KnowledgeIngestService.php +++ b/src/Knowledge/Ingest/KnowledgeIngestService.php @@ -5,14 +5,16 @@ declare(strict_types=1); namespace App\Knowledge\Ingest; use App\Entity\DocumentVersion; +use App\Knowledge\Text\TextNormalizer; use App\Repository\DocumentVersionRepository; -final class KnowledgeIngestService +final readonly class KnowledgeIngestService { public function __construct( private DocumentLoader $loader, private SimpleChunker $chunker, private DocumentVersionRepository $versionRepo, + private TextNormalizer $textNormalizer ) { } @@ -25,12 +27,10 @@ final class KnowledgeIngestService public function buildChunkRecords(DocumentVersion $version): iterable { $text = $this->loader->load($version->getFilePath()); - $text = $this->optimizeText($text); $chunks = $this->chunker->chunk($text); $doc = $version->getDocument(); - $documentId = $doc->getId()->toRfc4122(); $versionId = $version->getId()->toRfc4122(); @@ -41,13 +41,13 @@ final class KnowledgeIngestService foreach ($chunks as $chunkText) { if ($title !== '' && !str_starts_with($chunkText, $title)) { - $chunkText = "# Produkt Titel: " . $title . "\n\n --- " . $chunkText; + $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText; } $chunkText = trim($chunkText); // 🔥 deterministische Chunk-ID - $normalizedForId = $this->normalizeForId($chunkText); + $normalizedForId = $this->textNormalizer->normalize($chunkText); $chunkId = sha1( $documentId . '|' . @@ -77,24 +77,6 @@ final class KnowledgeIngestService } } - private function optimizeText(string $text): string - { - $text = preg_replace("/\n{3,}/", "\n\n", $text); - $text = preg_replace("/[ \t]+$/m", "", $text); - return trim($text); - } - - /** - * Normalisierung für stabile ID-Berechnung. - * Wichtig: ID darf nicht durch Whitespace minimal variieren. - */ - private function normalizeForId(string $text): string - { - $text = mb_strtolower($text); - $text = preg_replace('/\s+/u', ' ', $text); - return trim($text); - } - /** * @return array */ diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 43e616f..c9ca74d 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -23,7 +23,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface * Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter). * Enterprise Default: klein halten, sonst dominieren Tags wieder. */ - private const TAG_SCORE_BONUS = 0.25; + private const TAG_SCORE_BONUS = 0.5; public function __construct( private readonly NdjsonChunkLookup $lookup, diff --git a/src/Knowledge/Text/TextNormalizer.php b/src/Knowledge/Text/TextNormalizer.php index 66289dd..acd07b4 100644 --- a/src/Knowledge/Text/TextNormalizer.php +++ b/src/Knowledge/Text/TextNormalizer.php @@ -8,18 +8,58 @@ final class TextNormalizer { public function normalize(string $text): string { - // Silbentrennungen entfernen - $text = preg_replace('/-\n/', '', $text); + if ($text === '') { + return ''; + } + + // ------------------------------------------------- + // 1. Encoding-Artefakte & Sonderzeichen + // ------------------------------------------------- + + // Word/PDF Bullet-Artefakte (häufiges Problemzeichen) + $text = str_replace('', '-', $text); + + // Unicode Bullets vereinheitlichen → "-" + $text = preg_replace( + '/[\x{2022}\x{25CF}\x{2219}\x{2023}\x{2043}]/u', + '-', + $text + ); + + $text = preg_replace('/[\x{E000}-\x{F8FF}]/u', '', $text); + + // Non-breaking space → normales Leerzeichen + $text = str_replace("\xC2\xA0", ' ', $text); + + // Zero-width characters entfernen + $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text); + + // ------------------------------------------------- + // 2. Zeilenumbrüche vereinheitlichen + // ------------------------------------------------- - // Windows-Zeilenumbrüche vereinheitlichen $text = str_replace("\r\n", "\n", $text); + $text = str_replace("\r", "\n", $text); + + // ------------------------------------------------- + // 3. Silbentrennung über Zeilen entfernen + // Beispiel: + // Testo- + // mat → Testomat + // ------------------------------------------------- + + $text = preg_replace('/-\n(\p{L})/u', '$1', $text); + + // ------------------------------------------------- + // 4. Whitespace normalisieren + // ------------------------------------------------- // Mehrfache Leerzeichen reduzieren - $text = preg_replace('/[ \t]+/', ' ', $text); + $text = preg_replace('/[ \t]+/u', ' ', $text); // Mehrfache Leerzeilen reduzieren - $text = preg_replace('/\n{3,}/', "\n\n", $text); + $text = preg_replace('/\n{3,}/u', "\n\n", $text); return trim($text); } -} +} \ No newline at end of file