optimize system and cleanup

2026-03-02 21:27:20 +01:00
parent 6b8d1b1936
commit e7047cd885
10 changed files with 459 additions and 346 deletions
--- a/config/services.yaml
+++ b/config/services.yaml
@@ -99,14 +99,9 @@ services:
  App\Knowledge\Retrieval\NdjsonHybridRetriever: ~
-  App\Knowledge\Retrieval\CachedRetriever:
+  # CachedRetriever entfernt: war Interface-inkompatibel und erzeugt Drift/Chaos
    arguments:
      $inner: '@App\Knowledge\Retrieval\NdjsonHybridRetriever'
      $cache: '@cache.app'
      $ttlSeconds: 600
  App\Knowledge\Retrieval\RetrieverInterface:
-    alias: App\Knowledge\Retrieval\CachedRetriever
+    alias: App\Knowledge\Retrieval\NdjsonHybridRetriever
  # ------------------------------------------------------------
  # Index Configuration Provider
--- a/src/Ingest/DocumentSanitizer.php
+++ b/src/Ingest/DocumentSanitizer.php
@@ -7,68 +7,65 @@ namespace App\Ingest;
 /**
 * DocumentSanitizer
 *
- * Ziel (deterministisch, minimal-invasiv):
+ * Deterministic, minimal-invasive preprocessing BEFORE chunking.
 * - Entfernt typische PDF-/DOC-Artefakte VOR dem Chunking:
 *   - Inhaltsverzeichnis-Blöcke (TOC)
 *   - Seitenzahlen / "Seite X von Y"
 *   - wiederkehrende Header/Footer-Zeilen
 *   - Dot-Leader-Zeilen (".... 12")
 *
- * Guardrails:
+ * Removes typical PDF/DOC artefacts:
- * - Keine semantische Umschreibung
+ *  - Table of contents blocks
- * - Keine Zufälligkeit
+ *  - Page numbers
- * - Kein Entfernen echter Fließtext-Absätze
+ *  - Repeated headers/footers
 *  - Dot-leader lines (e.g. "...... 12")
 *
 * Design principles:
 *  - No semantic rewriting
 *  - No randomness
 *  - No removal of real paragraphs
 *  - Type-aware sanitizing (PDF/DOC != MD/TXT)
 */
 final class DocumentSanitizer
 {
    private const MAX_HEADER_LEN = 120;
    private const REPEAT_HEADER_MIN_COUNT = 3;
-    public function sanitize(
+    public function sanitize(string $text, string $fileExtension): string
        string $text,
        string $fileExtension
    ): string
    {
        if ($text === '') {
            return '';
        }
        $text = $this->normalizeLineEndings($text);
        $fileExtension = strtolower($fileExtension);
        // Nur PDF-/DOC-artige Formate aggressiver behandeln
        if (in_array($fileExtension, ['pdf', 'doc', 'docx'], true)) {
            $text = $this->sanitizePdfLike($text);
        }
        return trim($this->cleanupWhitespace($text));
    }
    // =========================================================
    // PIPELINE
    // =========================================================
    private function sanitizePdfLike(string $text): string
    {
        $text = $this->removeToc($text);
        $text = $this->removePageNumbers($text);
        $text = $this->removeDotLeaderLines($text);
        $text = $this->removeRepeatedHeaders($text);
        }
-        $text = $this->cleanupWhitespace($text);
+        return $text;
        return trim($text);
    }
    /**
     * Converts every line break in the text to a single "\n".
     *
     * Purely a byte-level replacement: no encoding conversion takes place.
     *
     * @param string $text Text with "\r\n", "\r" and/or "\n" line breaks.
     * @return string The same text using "\n" only.
     */
    private function normalizeLineEndings(string $text): string
    {
        // "\r\n" has to come before "\r": str_replace() works through the list
        // in order, so the reverse order would turn one CRLF into two "\n".
        $lineBreakVariants = ["\r\n", "\r"];

        return str_replace($lineBreakVariants, "\n", $text);
    }
-    /**
+    // =========================================================
-     * Entfernt TOC-Block nach "Inhaltsverzeichnis" bis zum ersten "echten" Absatz.
+    // TOC REMOVAL
-     *
+    // =========================================================
-     * Heuristik:
+
     * - Start: Zeile enthält "Inhaltsverzeichnis" (case-insensitive)
     * - Innerhalb TOC werden Zeilen entfernt, die wie TOC-Einträge aussehen:
     *   - Dot-Leader + Seitenzahl
     *   - Kapitelnummern + Text + Seitenzahl
     * - Ende: sobald eine Zeile "absatzartig" wirkt:
     *   - ausreichend lang UND enthält Satzpunkt (.)
     *
     * Guardrail:
     * - Leere Zeilen innerhalb TOC werden verworfen (damit TOC-Block wirklich weg ist)
     */
    private function removeToc(string $text): string
    {
        $lines = explode("\n", $text);
@@ -86,24 +83,24 @@ final class DocumentSanitizer
            }
            if ($inToc) {
                // Innerhalb TOC: leere Zeilen weg (Block entfernen)
                if ($trim === '') {
                    continue;
                }
-                // typische TOC-Zeilen (Leader / Kapitelnummern)
+                if (
-                if ($this->looksLikeDotLeaderLine($trim) || $this->looksLikeNumberedTocLine($trim)) {
+                    $this->looksLikeDotLeaderLine($trim) ||
                    $this->looksLikeNumberedTocLine($trim)
                ) {
                    continue;
                }
-                // Ende TOC, wenn "echter Absatz" beginnt (lang + Punkt)
+                // Ende TOC sobald normale Satzstruktur erkannt wird
-                if (strlen($trim) >= 120 && str_contains($trim, '.')) {
+                if (preg_match('/[a-zäöüß]\.\s*$/iu', $trim)) {
                    $inToc = false;
                    $filtered[] = $line;
                    continue;
                }
                // sonst: solange wir im TOC sind, ignorieren
                continue;
            }
@@ -113,13 +110,10 @@ final class DocumentSanitizer
        return implode("\n", $filtered);
    }
-    /**
+    // =========================================================
-     * Entfernt typische Seitenzahl-Zeilen.
+    // PAGE NUMBERS
-     *
+    // =========================================================
-     * Guardrails:
+
     * - Nur kurze, "isolierte" Zeilen (trim != '')
     * - Lässt Fließtext unangetastet
     */
    private function removePageNumbers(string $text): string
    {
        $lines = explode("\n", $text);
@@ -134,17 +128,22 @@ final class DocumentSanitizer
            }
            // "Seite 3" / "Seite 3 von 20"
-            if (preg_match('/^seite\s+\d+(\s+von\s+\d+)?$/iu', $trim)) {
+            if (preg_match('/^seite\s+\d{1,4}(\s+von\s+\d{1,4})?$/iu', $trim)) {
                continue;
            }
            // "Page 12" / "Page 12 of 34"
-            if (preg_match('/^page\s+\d+(\s+of\s+\d+)?$/iu', $trim)) {
+            if (preg_match('/^page\s+\d{1,4}(\s+of\s+\d{1,4})?$/iu', $trim)) {
                continue;
            }
-            // "- 4 -" / "4" / "– 4 –"
+            // Isolierte Seitenmarker: "- 4 -" oder "– 4 –"
-            if (preg_match('/^[-–]?\s?\d{1,3}\s?[-–]?$/u', $trim)) {
+            if (preg_match('/^[-–]\s?\d{1,4}\s?[-–]$/u', $trim)) {
                continue;
            }
            // Nur reine Zahl (max 3 Stellen, um IDs nicht zu killen)
            if (preg_match('/^\d{1,3}$/u', $trim)) {
                continue;
            }
@@ -154,10 +153,10 @@ final class DocumentSanitizer
        return implode("\n", $filtered);
    }
-    /**
+    // =========================================================
-     * Entfernt Dot-Leader-Zeilen überall (nicht nur im TOC),
+    // DOT LEADER
-     * z.B.: "Kapitel ......... 12"
+    // =========================================================
-     */
+
    private function removeDotLeaderLines(string $text): string
    {
        $lines = explode("\n", $text);
@@ -176,19 +175,14 @@ final class DocumentSanitizer
        return implode("\n", $filtered);
    }
-    /**
+    // =========================================================
-     * Entfernt wiederkehrende Header/Footer-Zeilen.
+    // REPEATED HEADERS
-     *
+    // =========================================================
-     * Guardrails:
+
     * - Nur relativ kurze Zeilen (unter MAX_HEADER_LEN)
     * - Nur wenn identisch (trim) >= REPEAT_HEADER_MIN_COUNT
     * - Leere Zeilen bleiben erhalten
     */
    private function removeRepeatedHeaders(string $text): string
    {
        $lines = explode("\n", $text);
        // counts basiert auf trim (damit z.B. unterschiedliche Einrückung nicht zählt)
        $trimmed = array_map('trim', $lines);
        $counts = array_count_values($trimmed);
@@ -211,27 +205,27 @@ final class DocumentSanitizer
        return implode("\n", $filtered);
    }
    // =========================================================
    // WHITESPACE
    // =========================================================
    private function cleanupWhitespace(string $text): string
    {
-        // nicht zu aggressiv: nur 3+ Leerzeilen auf 2 reduzieren
+        // Maximal 2 Leerzeilen
-        $text = preg_replace("/\n{3,}/", "\n\n", $text);
+        return preg_replace("/\n{3,}/", "\n\n", $text);
        return $text ?? '';
    }
    // =========================================================
-    // Heuristics (isoliert, testbar)
+    // HEURISTICS
    // =========================================================
    private function looksLikeDotLeaderLine(string $trimmedLine): bool
    {
-        // "Text ..... 12" (mind. 5 Punkte, Seitenzahl am Ende)
+        return (bool)preg_match('/^.+\.{4,}\s*\d+$/u', $trimmedLine);
        return (bool)preg_match('/^.+\.{5,}\s*\d+$/u', $trimmedLine);
    }
    /**
     * Heuristic: does the line look like a numbered table-of-contents entry?
     *
     * Expected shape: chapter number ("3", "2.1", "2.3.4"), whitespace, some
     * text, whitespace, and digits at the very end, e.g. "2.1 Kapitelname 12".
     *
     * @param string $trimmedLine Line to test; the pattern is anchored at both ends.
     * @return bool True when the pattern matches; false otherwise, including on a PCRE error.
     */
    private function looksLikeNumberedTocLine(string $trimmedLine): bool
    {
        $numberedEntryPattern = '/^\d+(\.\d+)*\s+.+\s+\d+$/u';

        return preg_match($numberedEntryPattern, $trimmedLine) === 1;
    }
 }
--- a/src/Ingest/StructureEnhancer.php
+++ b/src/Ingest/StructureEnhancer.php
@@ -4,8 +4,27 @@ declare(strict_types=1);
 namespace App\Ingest;
 /**
 * StructureEnhancer
 *
 * Minimal, deterministic structure hints BEFORE chunking.
 *
 * Adds:
 *  - Heading markers ("## ") for isolated short title lines
 *  - Bullet markers ("- ") for obvious list runs
 *
 * Non-goals:
 *  - No semantic rewriting
 *  - No sentence merging
 *  - No aggressive list guessing
 */
 final class StructureEnhancer
 {
    private const MAX_HEADING_LEN = 80;
    private const MAX_LIST_ITEM_LEN = 140;
    private const MIN_LIST_RUN = 2;
    public function enhance(string $text): string
    {
        if ($text === '') {
@@ -13,6 +32,8 @@ final class StructureEnhancer
        }
        $text = $this->normalizeLineEndings($text);
        // Reihenfolge: erst Headings, dann Listen (stabiler fürs Chunking)
        $text = $this->detectHeadings($text);
        $text = $this->detectSimpleLists($text);
@@ -24,6 +45,10 @@ final class StructureEnhancer
        return str_replace(["\r\n", "\r"], "\n", $text);
    }
    // =========================================================
    // HEADINGS
    // =========================================================
    private function detectHeadings(string $text): string
    {
        $lines = explode("\n", $text);
@@ -52,22 +77,31 @@ final class StructureEnhancer
            return false;
        }
-        if (strlen($line) > 80) {
+        // Schon Markdown-Heading? Dann nicht anfassen.
        if (preg_match('/^#{1,6}\s+/u', $line)) {
            return false;
        }
-        if (str_ends_with($line, '.')) {
+        if (mb_strlen($line) > self::MAX_HEADING_LEN) {
            return false;
        }
        // Heading soll kein "Satz" sein
        if (preg_match('/[.!?]\s*$/u', $line)) {
            return false;
        }
        // Keine typischen Satz-Kommas (zu risky)
        if (str_contains($line, ',')) {
            return false;
        }
-        if (preg_match('/\d+\.\d+/', $line)) {
+        // Nummerierte Kapitel "1.2" / "2.3.4" nicht zwangs-heading-en
        if (preg_match('/\b\d+\.\d+(\.\d+)*\b/u', $line)) {
            return false;
        }
        // Muss "isoliert" stehen (leerzeile davor und danach)
        $prev = $lines[$index - 1] ?? '';
        $next = $lines[$index + 1] ?? '';
@@ -75,48 +109,81 @@ final class StructureEnhancer
            return false;
        }
-        $uppercaseRatio = $this->uppercaseRatio($line);
+        // Guardrail: mindestens ein Buchstabe
-        if ($uppercaseRatio > 0.6) {
+        if (!preg_match('/\p{L}/u', $line)) {
            return true;
        }
        if ($this->isTitleCase($line)) {
            return true;
        }
            return false;
        }
        // Klassiker: UPPERCASE oder Title Case
        $uppercaseRatio = $this->uppercaseRatio($line);
        if ($uppercaseRatio >= 0.65) {
            return true;
        }
        return $this->isTitleCase($line);
    }
    private function uppercaseRatio(string $line): float
    {
-        $letters = preg_replace('/[^a-zA-ZÄÖÜäöü]/u', '', $line);
+        $letters = preg_replace('/[^\p{L}]/u', '', $line);
-        if ($letters === '') {
+        if ($letters === '' || $letters === null) {
-            return 0;
+            return 0.0;
        }
-        $upper = preg_replace('/[^A-ZÄÖÜ]/u', '', $letters);
+        $upper = preg_replace('/[^\p{Lu}]/u', '', $letters);
        if ($upper === null) {
            return 0.0;
        }
-        return mb_strlen($upper) / mb_strlen($letters);
+        $lettersLen = mb_strlen($letters);
        if ($lettersLen === 0) {
            return 0.0;
        }
        return mb_strlen($upper) / $lettersLen;
    }
    private function isTitleCase(string $line): bool
    {
-        $words = explode(' ', $line);
+        $words = preg_split('/\s+/u', trim($line));
-        $count = 0;
+        if (!$words) {
            return false;
        }
        $wordCount = 0;
        $capCount = 0;
        foreach ($words as $word) {
            $word = trim($word);
            if ($word === '') {
                continue;
            }
-            if (mb_strtoupper(mb_substr($word, 0, 1)) === mb_substr($word, 0, 1)) {
+            // Wörter ohne Buchstaben ignorieren
-                $count++;
+            if (!preg_match('/\p{L}/u', $word)) {
                continue;
            }
            $wordCount++;
            $first = mb_substr($word, 0, 1);
            if ($first !== '' && mb_strtoupper($first) === $first) {
                $capCount++;
            }
        }
-        return $count >= max(1, intdiv(count($words), 2));
+        if ($wordCount === 0) {
            return false;
        }
        // mindestens die Hälfte der Wörter beginnt groß
        return $capCount >= max(1, intdiv($wordCount + 1, 2));
    }
    // =========================================================
    // LISTS
    // =========================================================
    private function detectSimpleLists(string $text): string
    {
        $lines = explode("\n", $text);
@@ -127,36 +194,45 @@ final class StructureEnhancer
        foreach ($lines as $line) {
            $trim = trim($line);
            // Bereits echte Liste? → nicht anfassen
            if (preg_match('/^-\s+/u', $trim) || preg_match('/^\d+\.\s+/u', $trim)) {
                $this->flushListBuffer($buffer, $out);
                $out[] = $line;
                continue;
            }
            if ($this->isListCandidate($trim)) {
                $buffer[] = $trim;
                continue;
            }
-            if (count($buffer) >= 2) {
+            $this->flushListBuffer($buffer, $out);
            $out[] = $line;
        }
        $this->flushListBuffer($buffer, $out);
        return implode("\n", $out);
    }
    private function flushListBuffer(array &$buffer, array &$out): void
    {
        if ($buffer === []) {
            return;
        }
        if (count($buffer) >= self::MIN_LIST_RUN) {
            foreach ($buffer as $item) {
                $out[] = '- ' . $item;
            }
        } else {
            // single line: unverändert lassen (kein "erraten"!)
            foreach ($buffer as $item) {
                $out[] = $item;
            }
        }
        $buffer = [];
            $out[] = $line;
        }
        if (count($buffer) >= 2) {
            foreach ($buffer as $item) {
                $out[] = '- ' . $item;
            }
        } else {
            foreach ($buffer as $item) {
                $out[] = $item;
            }
        }
        return implode("\n", $out);
    }
    private function isListCandidate(string $line): bool
@@ -165,18 +241,32 @@ final class StructureEnhancer
            return false;
        }
-        if (strlen($line) > 120) {
+        // zu lang = ziemlich sicher Absatz/Satz
        if (mb_strlen($line) > self::MAX_LIST_ITEM_LEN) {
            return false;
        }
-        if (str_ends_with($line, '.')) {
+        // wenn es wie ein Satz endet, nicht als Liste
        if (preg_match('/[.!?]\s*$/u', $line)) {
            return false;
        }
        // "Key: Value" ist typischerweise keine Liste
        if (str_contains($line, ':')) {
            return false;
        }
        // Wenn es ein kompletter Satz sein könnte (Verb/Artikel), nicht raten:
        // -> minimaler Guardrail: beginnt mit Großbuchstabe UND enthält mindestens 5 Wörter => eher Satz/Absatz
        $words = preg_split('/\s+/u', trim($line));
        if ($words && count($words) >= 5) {
            $first = mb_substr($line, 0, 1);
            if ($first !== '' && mb_strtoupper($first) === $first) {
                return false;
            }
        }
        // nur "kurze, stichpunktartige" Zeilen als Kandidat akzeptieren
        return true;
    }
 }
--- a/src/Intent/CatalogIntentLite.php
+++ b/src/Intent/CatalogIntentLite.php
@@ -11,9 +11,7 @@ use App\Tag\TagTypes;
 /**
 * CatalogIntentLite
 *
- * Reiner Entity-Detector.
+ * Verantwortlich ausschließlich für:
 *
 * Verantwortlich nur für:
 * - Vector-Tag-Erkennung
 * - Score-Gate
 * - Ambiguity-Check
@@ -26,16 +24,7 @@ use App\Tag\TagTypes;
 */
 final class CatalogIntentLite
 {
    /**
     * Minimaler Similarity-Score.
     * Verhindert Rauschen.
     */
    private const MIN_SCORE = 0.72;
    /**
     * Differenz zwischen Top1 und Top2,
     * damit kein unsicherer Treffer akzeptiert wird.
     */
    private const AMBIGUITY_DELTA = 0.03;
    public function __construct(
@@ -43,10 +32,6 @@ final class CatalogIntentLite
        private readonly QueryCleaner $queryCleaner,
    ) {}
    /**
     * Gibt das canonical Label der erkannten catalog_entity zurück
     * oder null, wenn kein sauberer Treffer.
     */
    public function detect(string $prompt): ?string
    {
        $prompt = trim($prompt);
@@ -54,10 +39,82 @@ final class CatalogIntentLite
            return null;
        }
-        $promptTag = $this->queryCleaner->clean($prompt);
+        $clean = $this->queryCleaner->clean($prompt);
        if ($clean === '') {
            $clean = $prompt;
        }
-        // 1) Tag-Vector-Suche
+        // ----------------------------------------------------
-        $hits = $this->tagVectorClient->search($promptTag, 3);
+        // 1️⃣ Primär: Vollquery testen
        // ----------------------------------------------------
        $label = $this->detectFromQuery($clean);
        if ($label !== null) {
            return $label;
        }
        // ----------------------------------------------------
        // 2️⃣ Fallback: Tokenweise testen
        //    (wichtig für "geräteliste testomat")
        // ----------------------------------------------------
        $tokens = $this->tokenize($clean);
        $bestLabel = null;
        $bestScore = 0.0;
        foreach ($tokens as $token) {
            // sehr kurze Tokens ignorieren (Noise)
            if (mb_strlen($token) < 3) {
                continue;
            }
            $hits = $this->tagVectorClient->search($token, 3);
            if ($hits === []) {
                continue;
            }
            $top = $hits[0] ?? null;
            if (!is_array($top)) {
                continue;
            }
            $score = (float)($top['score'] ?? 0.0);
            if ($score < self::MIN_SCORE) {
                continue;
            }
            // Ambiguity-Check
            if (isset($hits[1])) {
                $secondScore = (float)($hits[1]['score'] ?? 0.0);
                if (abs($score - $secondScore) < self::AMBIGUITY_DELTA) {
                    continue;
                }
            }
            if (($top['tag_type'] ?? null) !== TagTypes::CATALOG_ENTITY) {
                continue;
            }
            if ($score > $bestScore) {
                $bestScore = $score;
                $bestLabel = trim((string)($top['label'] ?? ''));
            }
        }
        if ($bestLabel === null || $bestLabel === '') {
            return null;
        }
        return mb_strtolower($bestLabel);
    }
    private function detectFromQuery(string $query): ?string
    {
        $hits = $this->tagVectorClient->search($query, 3);
        if ($hits === []) {
            return null;
@@ -66,26 +123,21 @@ final class CatalogIntentLite
        $best = $hits[0];
        $bestScore = (float)($best['score'] ?? 0.0);
        // 2) Score-Tags
        if ($bestScore < self::MIN_SCORE) {
            return null;
        }
        // 3) Ambiguity-Check
        if (isset($hits[1])) {
            $secondScore = (float)($hits[1]['score'] ?? 0.0);
            if (abs($bestScore - $secondScore) < self::AMBIGUITY_DELTA) {
                return null;
            }
        }
        // 4) Nur catalog_entity zulassen
        if (($best['tag_type'] ?? null) !== TagTypes::CATALOG_ENTITY) {
            return null;
        }
        // 5) Canonical Label
        $label = trim((string)($best['label'] ?? ''));
        if ($label === '') {
@@ -94,4 +146,31 @@ final class CatalogIntentLite
        return mb_strtolower($label);
    }
    /**
     * Splits text on whitespace into distinct tokens.
     *
     * Tokens keep the order of their first occurrence; duplicates are
     * compared case-sensitively and empty tokens are dropped.
     *
     * @param string $text Text to split.
     * @return list<string> Unique tokens; empty when the split fails or yields nothing.
     */
    private function tokenize(string $text): array
    {
        $rawTokens = preg_split('/\s+/u', trim($text));
        if (!$rawTokens) {
            return [];
        }

        // Keyed by token for O(1) duplicate detection; the value carries the
        // original string because PHP casts numeric-looking keys to int.
        $unique = [];
        foreach ($rawTokens as $rawToken) {
            $token = trim($rawToken);
            if ($token !== '' && !isset($unique[$token])) {
                $unique[$token] = $token;
            }
        }

        return array_values($unique);
    }
 }
--- a/src/Knowledge/Ingest/DocumentLoader.php
+++ b/src/Knowledge/Ingest/DocumentLoader.php
@@ -27,11 +27,12 @@ final class DocumentLoader
    private function loadText(string $path): string
    {
        $content = file_get_contents($path);
        if ($content === false) {
            throw new \RuntimeException("Could not read file: {$path}");
        }
-        return $this->normalize($content);
+        return $this->normalizeLineEndings($content);
    }
    private function loadPdf(string $path): string
@@ -49,120 +50,31 @@ final class DocumentLoader
            );
        }
-        return $this->normalize($text);
+        return $this->normalizeLineEndings($text);
    }
-    private function normalize(string $text): string
+    /**
     * Loader ist bewusst minimal.
     *
     * KEINE:
     * - Silbentrennung
     * - Listen-Reparatur
     * - Struktur-Merges
     * - Regex-Orgie
     *
     * Nur:
     * - Zeilenumbrüche vereinheitlichen
     * - trim
     */
    private function normalizeLineEndings(string $text): string
    {
        if ($text === '') {
            return '';
        }
-        // 1. Silbentrennung entfernen
+        // Einheitliche Zeilenumbrüche
        $text = preg_replace('/-\n/', '', $text);
        // 2. Einheitliche Zeilenumbrüche
        $text = str_replace(["\r\n", "\r"], "\n", $text);
        // 3. Symbolmüll entfernen
        $text = $this->removeUnwantedSymbols($text);
        // 4. Struktur-Reparatur
        $text = $this->repairStructure($text);
        // 5. Inline-Listen stabilisieren
        $text = preg_replace('/\s-\s/', "\n- ", $text);
        // 6. Whitespace normalisieren
        $text = preg_replace('/[ \t]+/', ' ', $text);
        $text = preg_replace('/\n{3,}/', "\n\n", $text);
        return trim($text);
    }
    /**
     * Strips symbols that add nothing to the extracted text.
     *
     * Removes, in this order:
     *  - the glyphs ©, ®, ™ and ℠
     *  - zero-width characters U+200B to U+200D and the BOM U+FEFF
     *  - every character of Unicode category \p{C} except "\n"
     *    (this includes tabs and "\r")
     *
     * If a PCRE step fails (preg_replace() returns null, e.g. on malformed
     * UTF-8), that step is skipped and the text from the previous step is
     * kept, so the declared string return type always holds.
     *
     * @param string $text Text to clean; the regex steps expect valid UTF-8.
     * @return string Cleaned text.
     */
    private function removeUnwantedSymbols(string $text): string
    {
        $text = str_replace(['©', '®', '™', '℠'], '', $text);

        // preg_replace() yields null on failure; keep the input of that step instead.
        $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text) ?? $text;

        // [^\P{C}\n] reads as "in \p{C} and not a newline": line breaks survive.
        return preg_replace('/[^\P{C}\n]+/u', '', $text) ?? $text;
    }
    /**
     * Consolidated structure repair for extracted text.
     *
     * Works line by line on the "\n"-split input. Every emitted line is
     * trimmed; blank lines are kept as empty strings. A line is merged with
     * its successor, or has a leading "- " removed, according to the five
     * rules marked in the body. The first rule that applies wins.
     *
     * Merging is strictly pairwise: after a merge the successor is skipped
     * and the merged result is not examined again, so a run of three broken
     * lines is not collapsed into one.
     *
     * @param string $text Text using "\n" line breaks.
     * @return string Repaired text, lines joined with "\n".
     */
    private function repairStructure(string $text): string
    {
        $lines = explode("\n", $text);
        $out   = [];
        $count = count($lines);
        for ($i = 0; $i < $count; $i++) {
            $current = trim($lines[$i]);
            if ($current === '') {
                $out[] = '';
                continue;
            }
            if ($i < $count - 1) {
                $next = trim($lines[$i + 1]);
                // --- 1. Model numbers / numeric continuation ---
                // Neither line is a "- " list item, the current line does not end
                // in . : ? or !, and the next line starts with a digit: join them.
                if (
                    !preg_match('/^- /', $current) &&
                    !preg_match('/^- /', $next) &&
                    !preg_match('/[\.:\?!]$/', $current) &&
                    preg_match('/^\d+/', $next) // starts with a digit
                ) {
                    $out[] = $current . ' ' . $next;
                    $i++;
                    continue;
                }
                // --- 2. Sentence continuation (next line starts lowercase) ---
                // Same preconditions as rule 1, but the next line starts with a
                // lowercase a-z, ä, ö or ü.
                if (
                    !preg_match('/^- /', $current) &&
                    !preg_match('/^- /', $next) &&
                    !preg_match('/[\.:\?!]$/', $current) &&
                    preg_match('/^[a-zäöü]/u', $next)
                ) {
                    $out[] = $current . ' ' . $next;
                    $i++;
                    continue;
                }
                // --- 3. False list continuation ---
                // A list item without closing punctuation followed by a list item
                // that starts lowercase: append the second item's text (without
                // its "- ") to the first.
                if (
                    preg_match('/^- /', $current) &&
                    preg_match('/^- [a-zäöü]/u', $next) &&
                    !preg_match('/[\.:\?!]$/', $current)
                ) {
                    $merged = rtrim($current) . ' ' . ltrim(substr($next, 2));
                    $out[] = $merged;
                    $i++;
                    continue;
                }
            }
            // --- 4. Pseudo list item such as "- 808 festlegen" ---
            // "- " followed by digits and at most 25 ASCII letters/spaces: drop the "- ".
            if (preg_match('/^- \d+[A-Za-z ]{0,25}$/', $current)) {
                $out[] = substr($current, 2);
                continue;
            }
            // --- 5. Pseudo list item such as "- im eingeschalteten Zustand ..." ---
            // A lowercase "- " item whose preceding INPUT line (not the output so
            // far) is not a list item, or that is the very first line: drop the "- ".
            if (
                preg_match('/^- [a-zäöü]/u', $current) &&
                ($i === 0 || !preg_match('/^- /', trim($lines[$i - 1])))
            ) {
                $out[] = substr($current, 2);
                continue;
            }
            $out[] = $current;
        }
        return implode("\n", $out);
    }
 }
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -18,10 +18,8 @@ final readonly class KnowledgeIngestService
        private DocumentVersionRepository $versionRepo,
        private TextNormalizer            $textNormalizer,
        private DocumentSanitizer         $documentSanitizer,
-        private StructureEnhancer         $structureEnhancer, // ✅ NEU
+        private StructureEnhancer         $structureEnhancer,
-    )
+    ) {}
    {
    }
    /**
     * Lokaler Ingest: erzeugt deterministische NDJSON-Records.
@@ -34,16 +32,13 @@ final readonly class KnowledgeIngestService
        $text = $this->loader->load($version->getFilePath());
        $extension = $version->getFileExtension() ?? 'txt';
-        // 2️⃣ Deterministische Textbereinigung
+        // 2️⃣ Artefakt-Sanitizing
-        $text = $this->documentSanitizer->sanitize(
+        $text = $this->documentSanitizer->sanitize($text, $extension);
            $text,
            $extension
        );
-        // 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU)
+        // 3️⃣ Struktur-Hints (deterministisch, minimal)
        $text = $this->structureEnhancer->enhance($text);
-        // 4️⃣ Chunking
+        // 4️⃣ Chunking (inkl. TextNormalizer)
        $chunks = $this->chunker->chunk($text);
        $doc = $version->getDocument();
@@ -56,13 +51,15 @@ final readonly class KnowledgeIngestService
        foreach ($chunks as $chunkText) {
-            if ($title !== '' && !str_starts_with($chunkText, $title)) {
+            // 🔥 Titel nur im ersten Chunk einfügen
            if ($index === 0 && $title !== '') {
                $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
            }
            $chunkText = trim($chunkText);
-            // 🔥 deterministische Chunk-ID
+            // 🔥 Deterministische Chunk-ID
            // Wichtig: Normalisierung NUR für ID-Bildung
            $normalizedForId = $this->textNormalizer->normalize($chunkText);
            $chunkId = sha1(
@@ -75,11 +72,13 @@ final readonly class KnowledgeIngestService
                'chunk_id'    => $chunkId,
                'document_id' => $documentId,
                'version_id'  => $versionId,
-                'chunk_index' => $index++,
+                'chunk_index' => $index,
                'text'        => $chunkText,
                'checksum'    => sha1($chunkText),
                'metadata'    => $this->buildMetadata($version),
            ];
            $index++;
        }
    }
@@ -101,6 +100,7 @@ final readonly class KnowledgeIngestService
        $doc = $version->getDocument();
        $title = null;
        if (method_exists($doc, 'getTitle')) {
            $title = $doc->getTitle();
        } elseif (method_exists($doc, 'getName')) {
--- a/src/Knowledge/Ingest/SimpleChunker.php
+++ b/src/Knowledge/Ingest/SimpleChunker.php
@@ -13,27 +13,22 @@ final readonly class SimpleChunker
    public function __construct(
        private IndexConfigurationProvider $configurationProvider,
        private TextNormalizer $textNormalizer
-    )
+    ) {}
    {
    }
    /** @return string[] */
    public function chunk(string $text): array
    {
        $config = $this->configurationProvider->getConfiguration();
-        $maxWords     = $config->getChunkSize();
+        $maxWords     = max(1, $config->getChunkSize());
-        $overlapWords = $config->getChunkOverlap();
+        $overlapWords = max(0, $config->getChunkOverlap());
        $text = $this->textNormalizer->normalize($text);
        if ($text === '') {
            return [];
        }
-        // ======================================================
+        // Absatzbasierte Vorstruktur
        // HYBRID: Erst Absatzbasiert sammeln
        // ======================================================
        $paragraphs = preg_split('/\n{2,}/u', $text);
        if (!$paragraphs) {
            return [];
@@ -52,7 +47,7 @@ final readonly class SimpleChunker
            $paragraphWordCount = $this->countWords($paragraph);
-            // Falls einzelner Absatz größer als maxWords → Fallback
+            // Absatz größer als maxWords → Wort-Fallback
            if ($paragraphWordCount > $maxWords) {
                if ($currentChunk !== '') {
@@ -68,14 +63,14 @@ final readonly class SimpleChunker
                continue;
            }
-            // Absatz passt noch in aktuellen Chunk
+            // Absatz passt in aktuellen Chunk
            if ($currentWordCount + $paragraphWordCount <= $maxWords) {
                $currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
                $currentWordCount += $paragraphWordCount;
                continue;
            }
-            // Flush aktueller Chunk
+            // Flush
            if ($currentChunk !== '') {
                $chunks[] = trim($currentChunk);
            }
@@ -92,7 +87,7 @@ final readonly class SimpleChunker
    }
    // ======================================================
-    // Wortbasierter Fallback (Original-Logik beibehalten)
+    // Wortbasierter Fallback
    // ======================================================
    /** @return string[] */
@@ -125,6 +120,7 @@ final readonly class SimpleChunker
        $wordPos = 0;
        while ($wordPos < $totalWords) {
            $wordEnd = min($wordPos + $maxWords, $totalWords);
            $tokenStart = $wordTokenIndexes[$wordPos];
@@ -154,11 +150,13 @@ final readonly class SimpleChunker
    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
    {
        // Schutz für Listenanfänge
        $startToken = $tokens[$start] ?? '';
-        if (preg_match('/^- /u', ltrim($startToken))) {
+        if (preg_match('/^\s*-\s+/u', $startToken)) {
            return $end;
        }
        // Rückwärts prüfen auf Absatz- oder Satzende
        for ($i = $end - 1; $i > $start; $i--) {
            if ($tokens[$i] === "\n\n") {
@@ -190,9 +188,13 @@ final readonly class SimpleChunker
        $out  = [];
        foreach ($chunks as $chunk) {
-            $key = mb_strtolower(
+
-                preg_replace('/\s+/u', ' ', trim($chunk))
+            $normalized = preg_replace('/\s+/u', ' ', trim($chunk));
-            );
+            if ($normalized === null) {
                continue;
            }
            $key = mb_strtolower($normalized);
            if (isset($seen[$key])) {
                continue;
--- a/src/Knowledge/Retrieval/CachedRetriever.php
+++ b/src/Knowledge/Retrieval/CachedRetriever.php
@@ -1,48 +0,0 @@
 <?php
 declare(strict_types=1);
 namespace App\Knowledge\Retrieval;
 use Psr\Cache\CacheItemPoolInterface;
 use Psr\Cache\InvalidArgumentException;
 /**
  * Caching decorator for a retriever.
  *
  * Results of the wrapped retriever are stored in a PSR-6 cache pool, keyed by
  * the normalized prompt and the requested limit, and expire after the
  * configured number of seconds.
  */
 final readonly class CachedRetriever implements RetrieverInterface
 {
    /**
     * @param RetrieverInterface     $inner      Retriever that is called on a cache miss.
     * @param CacheItemPoolInterface $cache      PSR-6 pool used to store retrieval results.
     * @param int                    $ttlSeconds Lifetime of a cached result in seconds.
     */
    public function __construct(
        private RetrieverInterface     $inner,
        private CacheItemPoolInterface $cache,
        private int                    $ttlSeconds
    )
    {
    }

    /**
     * Returns the retrieval result for the prompt, served from cache when possible.
     *
     * @param string $prompt User prompt; case and whitespace differences share one cache entry.
     * @param int    $limit  Limit forwarded to the inner retriever; part of the cache key.
     *
     * @return array Result of the inner retriever (fresh or cached).
     *
     * @throws InvalidArgumentException
     */
    public function retrieve(string $prompt, int $limit = 10): array
    {
        $item = $this->cache->getItem($this->buildCacheKey($prompt, $limit));

        if ($item->isHit()) {
            $cached = $item->get();

            // The pool returns mixed. A payload that is not an array (e.g. written by
            // other code under the same key) is treated as a miss instead of breaking
            // the declared array return type.
            if (is_array($cached)) {
                return $cached;
            }
        }

        $result = $this->inner->retrieve($prompt, $limit);

        $item->set($result);
        $item->expiresAfter($this->ttlSeconds);
        $this->cache->save($item);

        return $result;
    }

    /**
     * Builds a deterministic cache key from the prompt and the limit.
     *
     * The prompt is trimmed, lower-cased and has whitespace runs collapsed so that
     * trivially different spellings map to the same entry. The value is hashed to
     * keep the key free of characters a cache pool may reject.
     */
    private function buildCacheKey(string $prompt, int $limit): string
    {
        $lowered = mb_strtolower(trim($prompt));

        // preg_replace() returns null on a PCRE error; without the fallback every
        // failing prompt would collapse to the same key and share one cache entry.
        $normalized = preg_replace('/\s+/u', ' ', $lowered) ?? $lowered;

        return 'rag_retrieval_' . sha1($normalized . '|' . $limit);
    }
 }
--- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
+++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
@@ -57,6 +57,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface
            return [$result['catalogBlock']];
        }
        if ($result['selectedChunkIds'] === []) {
            return [];
        }
        return $this->collectTextsFromIds(
            $result['selectedChunkIds'],
            $result['rows']
@@ -84,10 +88,15 @@ final class NdjsonHybridRetriever implements RetrieverInterface
            ]];
        }
        if ($result['selectedChunkIds'] === []) {
            return [];
        }
        $out = [];
        $rank = 0;
        foreach ($result['selectedChunkIds'] as $chunkId) {
            if (!isset($result['rows'][$chunkId])) {
                continue;
            }
@@ -127,6 +136,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $route = $this->routeResolver->resolve($salesIntent, $entityLabel);
        if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
            $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);
            if ($catalogBlock !== null) {
@@ -147,6 +157,21 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $core = $this->runCore($prompt, $config, $withScores, $salesIntent);
        if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
            return [
                'route' => $route,
                'entityLabel' => $entityLabel,
                'intent' => $salesIntent,
                'isListQuery' => $core['is_list_query'],
                'selectedChunkIds' => [],
                'rows' => [],
                'rrfScores' => [],
                'rawScores' => [],
                'threshold' => $core['threshold'],
                'catalogBlock' => null,
            ];
        }
        $selectedChunkIds = $core['is_list_query']
            ? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
            : $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
@@ -182,8 +207,17 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $isListQuery = $this->intentLite->isListQuery($prompt);
        $cleanQuery = $this->queryCleaner->clean($prompt);
        if ($cleanQuery === '') {
-            $cleanQuery = $prompt;
+            return [
                'limit' => $limit,
                'is_list_query' => $isListQuery,
                'threshold' => self::VECTOR_SCORE_THRESHOLD,
                'ranked_chunk_ids' => [],
                'rows' => [],
                'rrf_scores' => [],
                'raw_scores' => [],
            ];
        }
        [$threshold, $topK] = $this->computeThresholdAndTopK(
@@ -200,10 +234,22 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $globalHits = $this->vectorClient->search($cleanQuery, $topK);
        $scopedHits = [];
-        if (!empty($candidateDocIds)) {
+        if ($candidateDocIds !== []) {
            $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
        }
        if ($globalHits === [] && $scopedHits === []) {
            return [
                'limit' => $limit,
                'is_list_query' => $isListQuery,
                'threshold' => $threshold,
                'ranked_chunk_ids' => [],
                'rows' => [],
                'rrf_scores' => [],
                'raw_scores' => [],
            ];
        }
        $fused = $this->fuseHits(
            $globalHits,
            $scopedHits,
@@ -216,11 +262,25 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $rawScores = $fused['raw_scores'];
        if ($rrfScores === [] && $globalHits !== []) {
-            $rrfScores = $this->fallbackRrfFromHits($globalHits, self::EMPTY_RRF_FALLBACK_TOPN);
+            $rrfScores = $this->fallbackRrfFromHits(
                $globalHits,
                self::EMPTY_RRF_FALLBACK_TOPN
            );
        }
        if ($rrfScores === []) {
            return [
                'limit' => $limit,
                'is_list_query' => $isListQuery,
                'threshold' => $threshold,
                'ranked_chunk_ids' => [],
                'rows' => [],
                'rrf_scores' => [],
                'raw_scores' => $rawScores,
            ];
        }
        arsort($rrfScores);
        $rankedChunkIds = array_keys($rrfScores);
        $rows = $this->lookup->findByChunkIds($rankedChunkIds);
@@ -254,13 +314,19 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY);
    }
-    private function computeThresholdAndTopK(string $salesIntent, bool $isListQuery, int $vectorTopKBase): array
+    private function computeThresholdAndTopK(
-    {
+        string $salesIntent,
        bool $isListQuery,
        int $vectorTopKBase
    ): array {
        $threshold = self::VECTOR_SCORE_THRESHOLD;
        $topK = $vectorTopKBase;
-        if ($salesIntent === SalesIntentLite::OBJECTION ||
+        if (
-            $salesIntent === SalesIntentLite::PRICING) {
+            $salesIntent === SalesIntentLite::OBJECTION ||
            $salesIntent === SalesIntentLite::PRICING
        ) {
            $threshold += 0.02;
        }
@@ -333,6 +399,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $rank = 0;
        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'])) {
                continue;
            }
@@ -354,6 +421,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $out = [];
        foreach ($chunkIds as $id) {
            if (!isset($rows[$id]['text'])) {
                continue;
            }
@@ -433,11 +501,13 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $out = [];
        foreach ($chunkIds as $id) {
            if (!isset($rows[$id]['text'])) {
                continue;
            }
            $text = trim((string)$rows[$id]['text']);
            if ($text !== '') {
                $out[] = $text;
            }
--- a/src/Knowledge/Text/TextNormalizer.php
+++ b/src/Knowledge/Text/TextNormalizer.php
@@ -13,7 +13,15 @@ final class TextNormalizer
        }
        // -------------------------------------------------
-        // 1. Encoding-Artefakte & Sonderzeichen
+        // 1. Unicode-Normalisierung (wichtig für Stabilität)
        // -------------------------------------------------
        if (class_exists(\Normalizer::class)) {
            $text = \Normalizer::normalize($text, \Normalizer::FORM_C) ?? $text;
        }
        // -------------------------------------------------
        // 2. Encoding-Artefakte & Sonderzeichen
        // -------------------------------------------------
        // Word/PDF Bullet-Artefakte (häufiges Problemzeichen)
@@ -26,38 +34,49 @@ final class TextNormalizer
            $text
        );
        // Private-Use-Area entfernen
        $text = preg_replace('/[\x{E000}-\x{F8FF}]/u', '', $text);
        // Non-breaking space → normales Leerzeichen
        $text = str_replace("\xC2\xA0", ' ', $text);
        // Zero-width characters entfernen
        $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text);
-        // -------------------------------------------------
+        // Geschützte Leerzeichen & ähnliche Varianten vereinheitlichen
-        // 2. Zeilenumbrüche vereinheitlichen
+        $text = str_replace(
-        // -------------------------------------------------
+            [
-
+                "\xC2\xA0", // NBSP
-        $text = str_replace("\r\n", "\n", $text);
+                "\xE2\x80\xAF", // Narrow NBSP
-        $text = str_replace("\r", "\n", $text);
+                "\xE2\x80\x89", // Thin space
            ],
            ' ',
            $text
        );
        // -------------------------------------------------
-        // 3. Silbentrennung über Zeilen entfernen
+        // 3. Zeilenumbrüche vereinheitlichen
        // -------------------------------------------------
        $text = str_replace(["\r\n", "\r"], "\n", $text);
        // -------------------------------------------------
        // 4. Silbentrennung über Zeilen entfernen
        //
        // Beispiel:
        // Testo-
        // mat → Testomat
        //
        // Nur wenn direkt Buchstabe folgt
        // -------------------------------------------------
        $text = preg_replace('/-\n(\p{L})/u', '$1', $text);
        // -------------------------------------------------
-        // 4. Whitespace normalisieren
+        // 5. Whitespace normalisieren
        // -------------------------------------------------
        // Mehrfache Leerzeichen reduzieren
        $text = preg_replace('/[ \t]+/u', ' ', $text);
-        // Mehrfache Leerzeilen reduzieren
+        // Collapse runs of 3+ newlines (two or more blank lines) into a single blank line
        $text = preg_replace('/\n{3,}/u', "\n\n", $text);
        return trim($text);