optimize system and cleanup

2026-03-02 21:27:20 +01:00
parent 6b8d1b1936
commit e7047cd885
10 changed files with 459 additions and 346 deletions
--- a/src/Knowledge/Ingest/DocumentLoader.php
+++ b/src/Knowledge/Ingest/DocumentLoader.php
@@ -27,11 +27,12 @@ final class DocumentLoader
    private function loadText(string $path): string
    {
        $content = file_get_contents($path);
+
        if ($content === false) {
            throw new \RuntimeException("Could not read file: {$path}");
        }

-        return $this->normalize($content);
+        return $this->normalizeLineEndings($content);
    }

    private function loadPdf(string $path): string
@@ -49,120 +50,31 @@ final class DocumentLoader
            );
        }

-        return $this->normalize($text);
+        return $this->normalizeLineEndings($text);
    }

-    private function normalize(string $text): string
+    /**
+     * Loader ist bewusst minimal.
+     *
+     * KEINE:
+     * - Silbentrennung
+     * - Listen-Reparatur
+     * - Struktur-Merges
+     * - Regex-Orgie
+     *
+     * Nur:
+     * - Zeilenumbrüche vereinheitlichen
+     * - trim
+     */
+    private function normalizeLineEndings(string $text): string
    {
        if ($text === '') {
            return '';
        }

-        // 1. Silbentrennung entfernen
-        $text = preg_replace('/-\n/', '', $text);
-
-        // 2. Einheitliche Zeilenumbrüche
+        // Einheitliche Zeilenumbrüche
        $text = str_replace(["\r\n", "\r"], "\n", $text);

-        // 3. Symbolmüll entfernen
-        $text = $this->removeUnwantedSymbols($text);
-
-        // 4. Struktur-Reparatur
-        $text = $this->repairStructure($text);
-
-        // 5. Inline-Listen stabilisieren
-        $text = preg_replace('/\s-\s/', "\n- ", $text);
-
-        // 6. Whitespace normalisieren
-        $text = preg_replace('/[ \t]+/', ' ', $text);
-        $text = preg_replace('/\n{3,}/', "\n\n", $text);
-
        return trim($text);
    }
-
-    private function removeUnwantedSymbols(string $text): string
-    {
-        $text = str_replace(['©', '®', '™', '℠'], '', $text);
-        $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text);
-        $text = preg_replace('/[^\P{C}\n]+/u', '', $text);
-        return $text;
-    }
-
-    /**
-     * Konsolidierte Struktur-Reparatur
-     */
-    private function repairStructure(string $text): string
-    {
-        $lines = explode("\n", $text);
-        $out   = [];
-        $count = count($lines);
-
-        for ($i = 0; $i < $count; $i++) {
-            $current = trim($lines[$i]);
-
-            if ($current === '') {
-                $out[] = '';
-                continue;
-            }
-
-            if ($i < $count - 1) {
-                $next = trim($lines[$i + 1]);
-
-                // --- 1. Modellnummern / Zahlfortsetzung ---
-                if (
-                    !preg_match('/^- /', $current) &&
-                    !preg_match('/^- /', $next) &&
-                    !preg_match('/[\.:\?!]$/', $current) &&
-                    preg_match('/^\d+/', $next) // beginnt mit Zahl
-                ) {
-                    $out[] = $current . ' ' . $next;
-                    $i++;
-                    continue;
-                }
-
-                // --- 2. Satzfortsetzung (Zeile beginnt klein) ---
-                if (
-                    !preg_match('/^- /', $current) &&
-                    !preg_match('/^- /', $next) &&
-                    !preg_match('/[\.:\?!]$/', $current) &&
-                    preg_match('/^[a-zäöü]/u', $next)
-                ) {
-                    $out[] = $current . ' ' . $next;
-                    $i++;
-                    continue;
-                }
-
-                // --- 3. Falsche Listenfortsetzung ---
-                if (
-                    preg_match('/^- /', $current) &&
-                    preg_match('/^- [a-zäöü]/u', $next) &&
-                    !preg_match('/[\.:\?!]$/', $current)
-                ) {
-                    $merged = rtrim($current) . ' ' . ltrim(substr($next, 2));
-                    $out[] = $merged;
-                    $i++;
-                    continue;
-                }
-            }
-
-            // --- 4. Pseudo-Liste wie "- 808 festlegen" ---
-            if (preg_match('/^- \d+[A-Za-z ]{0,25}$/', $current)) {
-                $out[] = substr($current, 2);
-                continue;
-            }
-
-            // --- 5. Pseudo-Liste wie "- im eingeschalteten Zustand ..." ---
-            if (
-                preg_match('/^- [a-zäöü]/u', $current) &&
-                ($i === 0 || !preg_match('/^- /', trim($lines[$i - 1])))
-            ) {
-                $out[] = substr($current, 2);
-                continue;
-            }
-
-            $out[] = $current;
-        }
-
-        return implode("\n", $out);
-    }
 }
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -18,10 +18,8 @@ final readonly class KnowledgeIngestService
        private DocumentVersionRepository $versionRepo,
        private TextNormalizer            $textNormalizer,
        private DocumentSanitizer         $documentSanitizer,
-        private StructureEnhancer         $structureEnhancer, // ✅ NEU
-    )
-    {
-    }
+        private StructureEnhancer         $structureEnhancer,
+    ) {}

    /**
     * Lokaler Ingest: erzeugt deterministische NDJSON-Records.
@@ -34,16 +32,13 @@ final readonly class KnowledgeIngestService
        $text = $this->loader->load($version->getFilePath());
        $extension = $version->getFileExtension() ?? 'txt';

-        // 2️⃣ Deterministische Textbereinigung
-        $text = $this->documentSanitizer->sanitize(
-            $text,
-            $extension
-        );
+        // 2️⃣ Artefakt-Sanitizing
+        $text = $this->documentSanitizer->sanitize($text, $extension);

-        // 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU)
+        // 3️⃣ Struktur-Hints (deterministisch, minimal)
        $text = $this->structureEnhancer->enhance($text);

-        // 4️⃣ Chunking
+        // 4️⃣ Chunking (inkl. TextNormalizer)
        $chunks = $this->chunker->chunk($text);

        $doc = $version->getDocument();
@@ -56,13 +51,15 @@ final readonly class KnowledgeIngestService

        foreach ($chunks as $chunkText) {

-            if ($title !== '' && !str_starts_with($chunkText, $title)) {
+            // 🔥 Titel nur im ersten Chunk einfügen
+            if ($index === 0 && $title !== '') {
                $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
            }

            $chunkText = trim($chunkText);

-            // 🔥 deterministische Chunk-ID
+            // 🔥 Deterministische Chunk-ID
+            // Wichtig: Normalisierung NUR für ID-Bildung
            $normalizedForId = $this->textNormalizer->normalize($chunkText);

            $chunkId = sha1(
@@ -75,11 +72,13 @@ final readonly class KnowledgeIngestService
                'chunk_id'    => $chunkId,
                'document_id' => $documentId,
                'version_id'  => $versionId,
-                'chunk_index' => $index++,
+                'chunk_index' => $index,
                'text'        => $chunkText,
                'checksum'    => sha1($chunkText),
                'metadata'    => $this->buildMetadata($version),
            ];
+
+            $index++;
        }
    }

@@ -101,6 +100,7 @@ final readonly class KnowledgeIngestService
        $doc = $version->getDocument();

        $title = null;
+
        if (method_exists($doc, 'getTitle')) {
            $title = $doc->getTitle();
        } elseif (method_exists($doc, 'getName')) {
--- a/src/Knowledge/Ingest/SimpleChunker.php
+++ b/src/Knowledge/Ingest/SimpleChunker.php
@@ -13,27 +13,22 @@ final readonly class SimpleChunker
    public function __construct(
        private IndexConfigurationProvider $configurationProvider,
        private TextNormalizer $textNormalizer
-    )
-    {
-    }
+    ) {}

    /** @return string[] */
    public function chunk(string $text): array
    {
        $config = $this->configurationProvider->getConfiguration();

-        $maxWords     = $config->getChunkSize();
-        $overlapWords = $config->getChunkOverlap();
+        $maxWords     = max(1, $config->getChunkSize());
+        $overlapWords = max(0, $config->getChunkOverlap());

        $text = $this->textNormalizer->normalize($text);
        if ($text === '') {
            return [];
        }

-        // ======================================================
-        // HYBRID: Erst Absatzbasiert sammeln
-        // ======================================================
-
+        // Absatzbasierte Vorstruktur
        $paragraphs = preg_split('/\n{2,}/u', $text);
        if (!$paragraphs) {
            return [];
@@ -52,7 +47,7 @@ final readonly class SimpleChunker

            $paragraphWordCount = $this->countWords($paragraph);

-            // Falls einzelner Absatz größer als maxWords → Fallback
+            // Absatz größer als maxWords → Wort-Fallback
            if ($paragraphWordCount > $maxWords) {

                if ($currentChunk !== '') {
@@ -68,14 +63,14 @@ final readonly class SimpleChunker
                continue;
            }

-            // Absatz passt noch in aktuellen Chunk
+            // Absatz passt in aktuellen Chunk
            if ($currentWordCount + $paragraphWordCount <= $maxWords) {
                $currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
                $currentWordCount += $paragraphWordCount;
                continue;
            }

-            // Flush aktueller Chunk
+            // Flush
            if ($currentChunk !== '') {
                $chunks[] = trim($currentChunk);
            }
@@ -92,7 +87,7 @@ final readonly class SimpleChunker
    }

    // ======================================================
-    // Wortbasierter Fallback (Original-Logik beibehalten)
+    // Wortbasierter Fallback
    // ======================================================

    /** @return string[] */
@@ -125,6 +120,7 @@ final readonly class SimpleChunker
        $wordPos = 0;

        while ($wordPos < $totalWords) {
+
            $wordEnd = min($wordPos + $maxWords, $totalWords);

            $tokenStart = $wordTokenIndexes[$wordPos];
@@ -154,11 +150,13 @@ final readonly class SimpleChunker

    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
    {
+        // Schutz für Listenanfänge
        $startToken = $tokens[$start] ?? '';
-        if (preg_match('/^- /u', ltrim($startToken))) {
+        if (preg_match('/^\s*-\s+/u', $startToken)) {
            return $end;
        }

+        // Rückwärts prüfen auf Absatz- oder Satzende
        for ($i = $end - 1; $i > $start; $i--) {

            if ($tokens[$i] === "\n\n") {
@@ -190,9 +188,13 @@ final readonly class SimpleChunker
        $out  = [];

        foreach ($chunks as $chunk) {
-            $key = mb_strtolower(
-                preg_replace('/\s+/u', ' ', trim($chunk))
-            );
+
+            $normalized = preg_replace('/\s+/u', ' ', trim($chunk));
+            if ($normalized === null) {
+                continue;
+            }
+
+            $key = mb_strtolower($normalized);

            if (isset($seen[$key])) {
                continue;
--- a/src/Knowledge/Retrieval/CachedRetriever.php
+++ b/src/Knowledge/Retrieval/CachedRetriever.php
@@ -1,48 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace App\Knowledge\Retrieval;
-
-use Psr\Cache\CacheItemPoolInterface;
-use Psr\Cache\InvalidArgumentException;
-
-final readonly class CachedRetriever implements RetrieverInterface
-{
-    public function __construct(
-        private RetrieverInterface     $inner,
-        private CacheItemPoolInterface $cache,
-        private int                    $ttlSeconds
-    )
-    {
-    }
-
-    /**
-     * @throws InvalidArgumentException
-     */
-    public function retrieve(string $prompt, int $limit = 10): array
-    {
-        $key = $this->buildCacheKey($prompt, $limit);
-
-        $item = $this->cache->getItem($key);
-        if ($item->isHit()) {
-            return $item->get();
-        }
-
-        $result = $this->inner->retrieve($prompt, $limit);
-
-        $item->set($result);
-        $item->expiresAfter($this->ttlSeconds);
-        $this->cache->save($item);
-
-        return $result;
-    }
-
-    private function buildCacheKey(string $prompt, int $limit): string
-    {
-        $normalized = mb_strtolower(trim($prompt));
-        $normalized = preg_replace('/\s+/u', ' ', $normalized);
-
-        return 'rag_retrieval_' . sha1($normalized . '|' . $limit);
-    }
-}
--- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
+++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
@@ -57,6 +57,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface
            return [$result['catalogBlock']];
        }

+        if ($result['selectedChunkIds'] === []) {
+            return [];
+        }
+
        return $this->collectTextsFromIds(
            $result['selectedChunkIds'],
            $result['rows']
@@ -84,10 +88,15 @@ final class NdjsonHybridRetriever implements RetrieverInterface
            ]];
        }

+        if ($result['selectedChunkIds'] === []) {
+            return [];
+        }
+
        $out = [];
        $rank = 0;

        foreach ($result['selectedChunkIds'] as $chunkId) {
+
            if (!isset($result['rows'][$chunkId])) {
                continue;
            }
@@ -127,6 +136,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $route = $this->routeResolver->resolve($salesIntent, $entityLabel);

        if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
+
            $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);

            if ($catalogBlock !== null) {
@@ -147,6 +157,21 @@ final class NdjsonHybridRetriever implements RetrieverInterface

        $core = $this->runCore($prompt, $config, $withScores, $salesIntent);

+        if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
+            return [
+                'route' => $route,
+                'entityLabel' => $entityLabel,
+                'intent' => $salesIntent,
+                'isListQuery' => $core['is_list_query'],
+                'selectedChunkIds' => [],
+                'rows' => [],
+                'rrfScores' => [],
+                'rawScores' => [],
+                'threshold' => $core['threshold'],
+                'catalogBlock' => null,
+            ];
+        }
+
        $selectedChunkIds = $core['is_list_query']
            ? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
            : $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
@@ -182,8 +207,17 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $isListQuery = $this->intentLite->isListQuery($prompt);

        $cleanQuery = $this->queryCleaner->clean($prompt);
+
        if ($cleanQuery === '') {
-            $cleanQuery = $prompt;
+            return [
+                'limit' => $limit,
+                'is_list_query' => $isListQuery,
+                'threshold' => self::VECTOR_SCORE_THRESHOLD,
+                'ranked_chunk_ids' => [],
+                'rows' => [],
+                'rrf_scores' => [],
+                'raw_scores' => [],
+            ];
        }

        [$threshold, $topK] = $this->computeThresholdAndTopK(
@@ -200,10 +234,22 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $globalHits = $this->vectorClient->search($cleanQuery, $topK);

        $scopedHits = [];
-        if (!empty($candidateDocIds)) {
+        if ($candidateDocIds !== []) {
            $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
        }

+        if ($globalHits === [] && $scopedHits === []) {
+            return [
+                'limit' => $limit,
+                'is_list_query' => $isListQuery,
+                'threshold' => $threshold,
+                'ranked_chunk_ids' => [],
+                'rows' => [],
+                'rrf_scores' => [],
+                'raw_scores' => [],
+            ];
+        }
+
        $fused = $this->fuseHits(
            $globalHits,
            $scopedHits,
@@ -216,11 +262,25 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $rawScores = $fused['raw_scores'];

        if ($rrfScores === [] && $globalHits !== []) {
-            $rrfScores = $this->fallbackRrfFromHits($globalHits, self::EMPTY_RRF_FALLBACK_TOPN);
+            $rrfScores = $this->fallbackRrfFromHits(
+                $globalHits,
+                self::EMPTY_RRF_FALLBACK_TOPN
+            );
+        }
+
+        if ($rrfScores === []) {
+            return [
+                'limit' => $limit,
+                'is_list_query' => $isListQuery,
+                'threshold' => $threshold,
+                'ranked_chunk_ids' => [],
+                'rows' => [],
+                'rrf_scores' => [],
+                'raw_scores' => $rawScores,
+            ];
        }

        arsort($rrfScores);
-
        $rankedChunkIds = array_keys($rrfScores);
        $rows = $this->lookup->findByChunkIds($rankedChunkIds);

@@ -254,13 +314,19 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY);
    }

-    private function computeThresholdAndTopK(string $salesIntent, bool $isListQuery, int $vectorTopKBase): array
-    {
+    private function computeThresholdAndTopK(
+        string $salesIntent,
+        bool $isListQuery,
+        int $vectorTopKBase
+    ): array {
+
        $threshold = self::VECTOR_SCORE_THRESHOLD;
        $topK = $vectorTopKBase;

-        if ($salesIntent === SalesIntentLite::OBJECTION ||
-            $salesIntent === SalesIntentLite::PRICING) {
+        if (
+            $salesIntent === SalesIntentLite::OBJECTION ||
+            $salesIntent === SalesIntentLite::PRICING
+        ) {
            $threshold += 0.02;
        }

@@ -333,6 +399,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $rank = 0;

        foreach ($hits as $hit) {
+
            if (!isset($hit['chunk_id'])) {
                continue;
            }
@@ -354,6 +421,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $out = [];

        foreach ($chunkIds as $id) {
+
            if (!isset($rows[$id]['text'])) {
                continue;
            }
@@ -433,11 +501,13 @@ final class NdjsonHybridRetriever implements RetrieverInterface
        $out = [];

        foreach ($chunkIds as $id) {
+
            if (!isset($rows[$id]['text'])) {
                continue;
            }

            $text = trim((string)$rows[$id]['text']);
+
            if ($text !== '') {
                $out[] = $text;
            }
--- a/src/Knowledge/Text/TextNormalizer.php
+++ b/src/Knowledge/Text/TextNormalizer.php
@@ -13,7 +13,15 @@ final class TextNormalizer
        }

        // -------------------------------------------------
-        // 1. Encoding-Artefakte & Sonderzeichen
+        // 1. Unicode-Normalisierung (wichtig für Stabilität)
+        // -------------------------------------------------
+
+        if (class_exists(\Normalizer::class)) {
+            $text = \Normalizer::normalize($text, \Normalizer::FORM_C) ?: $text; // normalize() returns false (never null) on failure, so `??` would let false through; keep the input instead
+        }
+
+        // -------------------------------------------------
+        // 2. Encoding-Artefakte & Sonderzeichen
        // -------------------------------------------------

        // Word/PDF Bullet-Artefakte (häufiges Problemzeichen)
@@ -26,38 +34,49 @@ final class TextNormalizer
            $text
        );

+        // Private-Use-Area entfernen
        $text = preg_replace('/[\x{E000}-\x{F8FF}]/u', '', $text);

-        // Non-breaking space → normales Leerzeichen
-        $text = str_replace("\xC2\xA0", ' ', $text);
-
        // Zero-width characters entfernen
        $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text);

-        // -------------------------------------------------
-        // 2. Zeilenumbrüche vereinheitlichen
-        // -------------------------------------------------
-
-        $text = str_replace("\r\n", "\n", $text);
-        $text = str_replace("\r", "\n", $text);
+        // Geschützte Leerzeichen & ähnliche Varianten vereinheitlichen
+        $text = str_replace(
+            [
+                "\xC2\xA0", // NBSP
+                "\xE2\x80\xAF", // Narrow NBSP
+                "\xE2\x80\x89", // Thin space
+            ],
+            ' ',
+            $text
+        );

        // -------------------------------------------------
-        // 3. Silbentrennung über Zeilen entfernen
+        // 3. Zeilenumbrüche vereinheitlichen
+        // -------------------------------------------------
+
+        $text = str_replace(["\r\n", "\r"], "\n", $text);
+
+        // -------------------------------------------------
+        // 4. Silbentrennung über Zeilen entfernen
+        //
        // Beispiel:
        // Testo-
        // mat → Testomat
+        //
+        // Nur wenn direkt Buchstabe folgt
        // -------------------------------------------------

        $text = preg_replace('/-\n(\p{L})/u', '$1', $text);

        // -------------------------------------------------
-        // 4. Whitespace normalisieren
+        // 5. Whitespace normalisieren
        // -------------------------------------------------

        // Mehrfache Leerzeichen reduzieren
        $text = preg_replace('/[ \t]+/u', ' ', $text);

-        // Mehrfache Leerzeilen reduzieren
+        // Mehr als 2 Leerzeilen reduzieren
        $text = preg_replace('/\n{3,}/u', "\n\n", $text);

        return trim($text);