optimize chunk text normalizer

2026-02-27 15:37:05 +01:00
parent 4761648836
commit a5a6f466f3
3 changed files with 52 additions and 30 deletions
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -5,14 +5,16 @@ declare(strict_types=1);
 namespace App\Knowledge\Ingest;

 use App\Entity\DocumentVersion;
+use App\Knowledge\Text\TextNormalizer;
 use App\Repository\DocumentVersionRepository;

-final class KnowledgeIngestService
+final readonly  class KnowledgeIngestService
 {
    public function __construct(
        private DocumentLoader            $loader,
        private SimpleChunker             $chunker,
        private DocumentVersionRepository $versionRepo,
+        private TextNormalizer            $textNormalizer
    )
    {
    }
@@ -25,12 +27,10 @@ final class KnowledgeIngestService
    public function buildChunkRecords(DocumentVersion $version): iterable
    {
        $text = $this->loader->load($version->getFilePath());
-        $text = $this->optimizeText($text);

        $chunks = $this->chunker->chunk($text);

        $doc = $version->getDocument();
-
        $documentId = $doc->getId()->toRfc4122();
        $versionId = $version->getId()->toRfc4122();

@@ -41,13 +41,13 @@ final class KnowledgeIngestService
        foreach ($chunks as $chunkText) {

            if ($title !== '' && !str_starts_with($chunkText, $title)) {
-                $chunkText = "# Produkt Titel: " . $title . "\n\n --- " . $chunkText;
+                $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
            }

            $chunkText = trim($chunkText);

            // 🔥 deterministische Chunk-ID
-            $normalizedForId = $this->normalizeForId($chunkText);
+            $normalizedForId = $this->textNormalizer->normalize($chunkText);

            $chunkId = sha1(
                $documentId . '|' .
@@ -77,24 +77,6 @@ final class KnowledgeIngestService
        }
    }

-    private function optimizeText(string $text): string
-    {
-        $text = preg_replace("/\n{3,}/", "\n\n", $text);
-        $text = preg_replace("/[ \t]+$/m", "", $text);
-        return trim($text);
-    }
-
-    /**
-     * Normalisierung für stabile ID-Berechnung.
-     * Wichtig: ID darf nicht durch Whitespace minimal variieren.
-     */
-    private function normalizeForId(string $text): string
-    {
-        $text = mb_strtolower($text);
-        $text = preg_replace('/\s+/u', ' ', $text);
-        return trim($text);
-    }
-
    /**
     * @return array<string,mixed>
     */
--- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
+++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
@@ -23,7 +23,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
     * Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter).
     * Enterprise Default: klein halten, sonst dominieren Tags wieder.
     */
-    private const TAG_SCORE_BONUS = 0.25;
+    private const TAG_SCORE_BONUS = 0.5;

    public function __construct(
        private readonly NdjsonChunkLookup               $lookup,
--- a/src/Knowledge/Text/TextNormalizer.php
+++ b/src/Knowledge/Text/TextNormalizer.php
@@ -8,18 +8,58 @@ final class TextNormalizer
 {
    public function normalize(string $text): string
    {
-        // Silbentrennungen entfernen
-        $text = preg_replace('/-\n/', '', $text);
+        if ($text === '') {
+            return '';
+        }
+
+        // -------------------------------------------------
+        // 1. Encoding-Artefakte & Sonderzeichen
+        // -------------------------------------------------
+
+        // Word/PDF Bullet-Artefakte (häufiges Problemzeichen)
+        $text = str_replace("\u{F0B7}", '-', $text); // presumably U+F0B7 (Symbol-font bullet); literal glyph was lost in extraction - verify
+
+        // Unicode Bullets vereinheitlichen → "-"
+        $text = preg_replace(
+            '/[\x{2022}\x{25CF}\x{2219}\x{2023}\x{2043}]/u',
+            '-',
+            $text
+        );
+
+        $text = preg_replace('/[\x{E000}-\x{F8FF}]/u', '', $text);
+
+        // Non-breaking space → normales Leerzeichen
+        $text = str_replace("\xC2\xA0", ' ', $text);
+
+        // Zero-width characters entfernen
+        $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text);
+
+        // -------------------------------------------------
+        // 2. Zeilenumbrüche vereinheitlichen
+        // -------------------------------------------------

-        // Windows-Zeilenumbrüche vereinheitlichen
        $text = str_replace("\r\n", "\n", $text);
+        $text = str_replace("\r", "\n", $text);
+
+        // -------------------------------------------------
+        // 3. Silbentrennung über Zeilen entfernen
+        // Beispiel:
+        // Testo-
+        // mat → Testomat
+        // -------------------------------------------------
+
+        $text = preg_replace('/-\n(\p{L})/u', '$1', $text);
+
+        // -------------------------------------------------
+        // 4. Whitespace normalisieren
+        // -------------------------------------------------

        // Mehrfache Leerzeichen reduzieren
-        $text = preg_replace('/[ \t]+/', ' ', $text);
+        $text = preg_replace('/[ \t]+/u', ' ', $text);

        // Mehrfache Leerzeilen reduzieren
-        $text = preg_replace('/\n{3,}/', "\n\n", $text);
+        $text = preg_replace('/\n{3,}/u', "\n\n", $text);

        return trim($text);
    }
-}
+}