diff --git a/config/services.yaml b/config/services.yaml index d577bb1..45addee 100644 --- a/config/services.yaml +++ b/config/services.yaml @@ -99,14 +99,9 @@ services: App\Knowledge\Retrieval\NdjsonHybridRetriever: ~ - App\Knowledge\Retrieval\CachedRetriever: - arguments: - $inner: '@App\Knowledge\Retrieval\NdjsonHybridRetriever' - $cache: '@cache.app' - $ttlSeconds: 600 - + # CachedRetriever entfernt: war Interface-inkompatibel und erzeugt Drift/Chaos App\Knowledge\Retrieval\RetrieverInterface: - alias: App\Knowledge\Retrieval\CachedRetriever + alias: App\Knowledge\Retrieval\NdjsonHybridRetriever # ------------------------------------------------------------ # Index Configuration Provider diff --git a/src/Ingest/DocumentSanitizer.php b/src/Ingest/DocumentSanitizer.php index cc802c1..63a97f3 100644 --- a/src/Ingest/DocumentSanitizer.php +++ b/src/Ingest/DocumentSanitizer.php @@ -7,68 +7,65 @@ namespace App\Ingest; /** * DocumentSanitizer * - * Ziel (deterministisch, minimal-invasiv): - * - Entfernt typische PDF-/DOC-Artefakte VOR dem Chunking: - * - Inhaltsverzeichnis-Blöcke (TOC) - * - Seitenzahlen / "Seite X von Y" - * - wiederkehrende Header/Footer-Zeilen - * - Dot-Leader-Zeilen (".... 12") + * Deterministic, minimal-invasive preprocessing BEFORE chunking. * - * Guardrails: - * - Keine semantische Umschreibung - * - Keine Zufälligkeit - * - Kein Entfernen echter Fließtext-Absätze + * Removes typical PDF/DOC artefacts: + * - Table of contents blocks + * - Page numbers + * - Repeated headers/footers + * - Dot-leader lines (e.g. "...... 12") + * + * Design principles: + * - No semantic rewriting + * - No randomness + * - No removal of real paragraphs + * - Type-aware sanitizing (PDF/DOC != MD/TXT) */ final class DocumentSanitizer { private const MAX_HEADER_LEN = 120; private const REPEAT_HEADER_MIN_COUNT = 3; - public function sanitize( - string $text, - string $fileExtension - ): string + public function sanitize(string $text, string $fileExtension): string { if ($text === '') { return ''; } $text = $this->normalizeLineEndings($text); - $fileExtension = strtolower($fileExtension); + // Nur PDF-/DOC-artige Formate aggressiver behandeln if (in_array($fileExtension, ['pdf', 'doc', 'docx'], true)) { - $text = $this->removeToc($text); - $text = $this->removePageNumbers($text); - $text = $this->removeDotLeaderLines($text); - $text = $this->removeRepeatedHeaders($text); + $text = $this->sanitizePdfLike($text); } - $text = $this->cleanupWhitespace($text); + return trim($this->cleanupWhitespace($text)); + } - return trim($text); + // ========================================================= + // PIPELINE + // ========================================================= + + private function sanitizePdfLike(string $text): string + { + $text = $this->removeToc($text); + $text = $this->removePageNumbers($text); + $text = $this->removeDotLeaderLines($text); + $text = $this->removeRepeatedHeaders($text); + + return $text; } private function normalizeLineEndings(string $text): string { - // Vereinheitlichen auf \n (deterministisch, kein Encoding-Change) return str_replace(["\r\n", "\r"], "\n", $text); } - /** - * Entfernt TOC-Block nach "Inhaltsverzeichnis" bis zum ersten "echten" Absatz. - * - * Heuristik: - * - Start: Zeile enthält "Inhaltsverzeichnis" (case-insensitive) - * - Innerhalb TOC werden Zeilen entfernt, die wie TOC-Einträge aussehen: - * - Dot-Leader + Seitenzahl - * - Kapitelnummern + Text + Seitenzahl - * - Ende: sobald eine Zeile "absatzartig" wirkt: - * - ausreichend lang UND enthält Satzpunkt (.) - * - * Guardrail: - * - Leere Zeilen innerhalb TOC werden verworfen (damit TOC-Block wirklich weg ist) - */ + // ========================================================= + // TOC REMOVAL + // ========================================================= + private function removeToc(string $text): string { $lines = explode("\n", $text); @@ -86,24 +83,24 @@ final class DocumentSanitizer } if ($inToc) { - // Innerhalb TOC: leere Zeilen weg (Block entfernen) if ($trim === '') { continue; } - // typische TOC-Zeilen (Leader / Kapitelnummern) - if ($this->looksLikeDotLeaderLine($trim) || $this->looksLikeNumberedTocLine($trim)) { + if ( + $this->looksLikeDotLeaderLine($trim) || + $this->looksLikeNumberedTocLine($trim) + ) { continue; } - // Ende TOC, wenn "echter Absatz" beginnt (lang + Punkt) - if (strlen($trim) >= 120 && str_contains($trim, '.')) { + // Ende TOC sobald normale Satzstruktur erkannt wird + if (preg_match('/[a-zäöüß]\.\s*$/iu', $trim)) { $inToc = false; $filtered[] = $line; continue; } - // sonst: solange wir im TOC sind, ignorieren continue; } @@ -113,13 +110,10 @@ final class DocumentSanitizer return implode("\n", $filtered); } - /** - * Entfernt typische Seitenzahl-Zeilen. - * - * Guardrails: - * - Nur kurze, "isolierte" Zeilen (trim != '') - * - Lässt Fließtext unangetastet - */ + // ========================================================= + // PAGE NUMBERS + // ========================================================= + private function removePageNumbers(string $text): string { $lines = explode("\n", $text); @@ -134,17 +128,22 @@ final class DocumentSanitizer } // "Seite 3" / "Seite 3 von 20" - if (preg_match('/^seite\s+\d+(\s+von\s+\d+)?$/iu', $trim)) { + if (preg_match('/^seite\s+\d{1,4}(\s+von\s+\d{1,4})?$/iu', $trim)) { continue; } // "Page 12" / "Page 12 of 34" - if (preg_match('/^page\s+\d+(\s+of\s+\d+)?$/iu', $trim)) { + if (preg_match('/^page\s+\d{1,4}(\s+of\s+\d{1,4})?$/iu', $trim)) { continue; } - // "- 4 -" / "4" / "– 4 –" - if (preg_match('/^[-–]?\s?\d{1,3}\s?[-–]?$/u', $trim)) { + // Isolierte Seitenmarker: "- 4 -" oder "– 4 –" + if (preg_match('/^[-–]\s?\d{1,4}\s?[-–]$/u', $trim)) { + continue; + } + + // Nur reine Zahl (max 3 Stellen, um IDs nicht zu killen) + if (preg_match('/^\d{1,3}$/u', $trim)) { continue; } @@ -154,10 +153,10 @@ final class DocumentSanitizer return implode("\n", $filtered); } - /** - * Entfernt Dot-Leader-Zeilen überall (nicht nur im TOC), - * z.B.: "Kapitel ......... 12" - */ + // ========================================================= + // DOT LEADER + // ========================================================= + private function removeDotLeaderLines(string $text): string { $lines = explode("\n", $text); @@ -176,19 +175,14 @@ final class DocumentSanitizer return implode("\n", $filtered); } - /** - * Entfernt wiederkehrende Header/Footer-Zeilen. - * - * Guardrails: - * - Nur relativ kurze Zeilen (unter MAX_HEADER_LEN) - * - Nur wenn identisch (trim) >= REPEAT_HEADER_MIN_COUNT - * - Leere Zeilen bleiben erhalten - */ + // ========================================================= + // REPEATED HEADERS + // ========================================================= + private function removeRepeatedHeaders(string $text): string { $lines = explode("\n", $text); - // counts basiert auf trim (damit z.B. unterschiedliche Einrückung nicht zählt) $trimmed = array_map('trim', $lines); $counts = array_count_values($trimmed); @@ -211,27 +205,27 @@ final class DocumentSanitizer return implode("\n", $filtered); } + // ========================================================= + // WHITESPACE + // ========================================================= + private function cleanupWhitespace(string $text): string { - // nicht zu aggressiv: nur 3+ Leerzeilen auf 2 reduzieren - $text = preg_replace("/\n{3,}/", "\n\n", $text); - return $text ?? ''; + // Maximal 2 Leerzeilen + return preg_replace("/\n{3,}/", "\n\n", $text); } // ========================================================= - // Heuristics (isoliert, testbar) + // HEURISTICS // ========================================================= private function looksLikeDotLeaderLine(string $trimmedLine): bool { - // "Text ..... 12" (mind. 5 Punkte, Seitenzahl am Ende) - return (bool)preg_match('/^.+\.{5,}\s*\d+$/u', $trimmedLine); + return (bool)preg_match('/^.+\.{4,}\s*\d+$/u', $trimmedLine); } private function looksLikeNumberedTocLine(string $trimmedLine): bool { - // "2.1 Kapitelname 12" / "3 Kapitelname 7" - // Kapitelnummern + Text + Seitenzahl am Ende return (bool)preg_match('/^\d+(\.\d+)*\s+.+\s+\d+$/u', $trimmedLine); } } \ No newline at end of file diff --git a/src/Ingest/StructureEnhancer.php b/src/Ingest/StructureEnhancer.php index 6cdaf49..1b26b4c 100644 --- a/src/Ingest/StructureEnhancer.php +++ b/src/Ingest/StructureEnhancer.php @@ -4,8 +4,27 @@ declare(strict_types=1); namespace App\Ingest; +/** + * StructureEnhancer + * + * Minimal, deterministic structure hints BEFORE chunking. + * + * Adds: + * - Heading markers ("## ") for isolated short title lines + * - Bullet markers ("- ") for obvious list runs + * + * Non-goals: + * - No semantic rewriting + * - No sentence merging + * - No aggressive list guessing + */ final class StructureEnhancer { + private const MAX_HEADING_LEN = 80; + + private const MAX_LIST_ITEM_LEN = 140; + private const MIN_LIST_RUN = 2; + public function enhance(string $text): string { if ($text === '') { @@ -13,6 +32,8 @@ final class StructureEnhancer } $text = $this->normalizeLineEndings($text); + + // Reihenfolge: erst Headings, dann Listen (stabiler fürs Chunking) $text = $this->detectHeadings($text); $text = $this->detectSimpleLists($text); @@ -24,6 +45,10 @@ final class StructureEnhancer return str_replace(["\r\n", "\r"], "\n", $text); } + // ========================================================= + // HEADINGS + // ========================================================= + private function detectHeadings(string $text): string { $lines = explode("\n", $text); @@ -52,22 +77,31 @@ final class StructureEnhancer return false; } - if (strlen($line) > 80) { + // Schon Markdown-Heading? Dann nicht anfassen. + if (preg_match('/^#{1,6}\s+/u', $line)) { return false; } - if (str_ends_with($line, '.')) { + if (mb_strlen($line) > self::MAX_HEADING_LEN) { return false; } + // Heading soll kein "Satz" sein + if (preg_match('/[.!?]\s*$/u', $line)) { + return false; + } + + // Keine typischen Satz-Kommas (zu risky) if (str_contains($line, ',')) { return false; } - if (preg_match('/\d+\.\d+/', $line)) { + // Nummerierte Kapitel "1.2" / "2.3.4" nicht zwangs-heading-en + if (preg_match('/\b\d+\.\d+(\.\d+)*\b/u', $line)) { return false; } + // Muss "isoliert" stehen (leerzeile davor und danach) $prev = $lines[$index - 1] ?? ''; $next = $lines[$index + 1] ?? ''; @@ -75,48 +109,81 @@ final class StructureEnhancer return false; } + // Guardrail: mindestens ein Buchstabe + if (!preg_match('/\p{L}/u', $line)) { + return false; + } + + // Klassiker: UPPERCASE oder Title Case $uppercaseRatio = $this->uppercaseRatio($line); - if ($uppercaseRatio > 0.6) { + if ($uppercaseRatio >= 0.65) { return true; } - if ($this->isTitleCase($line)) { - return true; - } - - return false; + return $this->isTitleCase($line); } private function uppercaseRatio(string $line): float { - $letters = preg_replace('/[^a-zA-ZÄÖÜäöü]/u', '', $line); - if ($letters === '') { - return 0; + $letters = preg_replace('/[^\p{L}]/u', '', $line); + if ($letters === '' || $letters === null) { + return 0.0; } - $upper = preg_replace('/[^A-ZÄÖÜ]/u', '', $letters); + $upper = preg_replace('/[^\p{Lu}]/u', '', $letters); + if ($upper === null) { + return 0.0; + } - return mb_strlen($upper) / mb_strlen($letters); + $lettersLen = mb_strlen($letters); + if ($lettersLen === 0) { + return 0.0; + } + + return mb_strlen($upper) / $lettersLen; } private function isTitleCase(string $line): bool { - $words = explode(' ', $line); - $count = 0; + $words = preg_split('/\s+/u', trim($line)); + if (!$words) { + return false; + } + + $wordCount = 0; + $capCount = 0; foreach ($words as $word) { + $word = trim($word); if ($word === '') { continue; } - if (mb_strtoupper(mb_substr($word, 0, 1)) === mb_substr($word, 0, 1)) { - $count++; + // Wörter ohne Buchstaben ignorieren + if (!preg_match('/\p{L}/u', $word)) { + continue; + } + + $wordCount++; + + $first = mb_substr($word, 0, 1); + if ($first !== '' && mb_strtoupper($first) === $first) { + $capCount++; } } - return $count >= max(1, intdiv(count($words), 2)); + if ($wordCount === 0) { + return false; + } + + // mindestens die Hälfte der Wörter beginnt groß + return $capCount >= max(1, intdiv($wordCount + 1, 2)); } + // ========================================================= + // LISTS + // ========================================================= + private function detectSimpleLists(string $text): string { $lines = explode("\n", $text); @@ -127,36 +194,45 @@ final class StructureEnhancer foreach ($lines as $line) { $trim = trim($line); + // Bereits echte Liste? → nicht anfassen + if (preg_match('/^-\s+/u', $trim) || preg_match('/^\d+\.\s+/u', $trim)) { + $this->flushListBuffer($buffer, $out); + $out[] = $line; + continue; + } + if ($this->isListCandidate($trim)) { $buffer[] = $trim; continue; } - if (count($buffer) >= 2) { - foreach ($buffer as $item) { - $out[] = '- ' . $item; - } - } else { - foreach ($buffer as $item) { - $out[] = $item; - } - } - - $buffer = []; + $this->flushListBuffer($buffer, $out); $out[] = $line; } - if (count($buffer) >= 2) { + $this->flushListBuffer($buffer, $out); + + return implode("\n", $out); + } + + private function flushListBuffer(array &$buffer, array &$out): void + { + if ($buffer === []) { + return; + } + + if (count($buffer) >= self::MIN_LIST_RUN) { foreach ($buffer as $item) { $out[] = '- ' . $item; } } else { + // single line: unverändert lassen (kein "erraten"!) foreach ($buffer as $item) { $out[] = $item; } } - return implode("\n", $out); + $buffer = []; } private function isListCandidate(string $line): bool @@ -165,18 +241,32 @@ final class StructureEnhancer return false; } - if (strlen($line) > 120) { + // zu lang = ziemlich sicher Absatz/Satz + if (mb_strlen($line) > self::MAX_LIST_ITEM_LEN) { return false; } - if (str_ends_with($line, '.')) { + // wenn es wie ein Satz endet, nicht als Liste + if (preg_match('/[.!?]\s*$/u', $line)) { return false; } + // "Key: Value" ist typischerweise keine Liste if (str_contains($line, ':')) { return false; } + // Wenn es ein kompletter Satz sein könnte (Verb/Artikel), nicht raten: + // -> minimaler Guardrail: beginnt mit Großbuchstabe UND enthält mindestens 5 Wörter => eher Satz/Absatz + $words = preg_split('/\s+/u', trim($line)); + if ($words && count($words) >= 5) { + $first = mb_substr($line, 0, 1); + if ($first !== '' && mb_strtoupper($first) === $first) { + return false; + } + } + + // nur "kurze, stichpunktartige" Zeilen als Kandidat akzeptieren return true; } } \ No newline at end of file diff --git a/src/Intent/CatalogIntentLite.php b/src/Intent/CatalogIntentLite.php index b8004c3..8ca2267 100644 --- a/src/Intent/CatalogIntentLite.php +++ b/src/Intent/CatalogIntentLite.php @@ -11,9 +11,7 @@ use App\Tag\TagTypes; /** * CatalogIntentLite * - * Reiner Entity-Detector. - * - * Verantwortlich nur für: + * Verantwortlich ausschließlich für: * - Vector-Tag-Erkennung * - Score-Gate * - Ambiguity-Check @@ -26,27 +24,14 @@ use App\Tag\TagTypes; */ final class CatalogIntentLite { - /** - * Minimaler Similarity-Score. - * Verhindert Rauschen. - */ private const MIN_SCORE = 0.72; - - /** - * Differenz zwischen Top1 und Top2, - * damit kein unsicherer Treffer akzeptiert wird. - */ private const AMBIGUITY_DELTA = 0.03; public function __construct( private readonly TagVectorSearchClient $tagVectorClient, - private readonly QueryCleaner $queryCleaner, + private readonly QueryCleaner $queryCleaner, ) {} - /** - * Gibt das canonical Label der erkannten catalog_entity zurück - * oder null, wenn kein sauberer Treffer. - */ public function detect(string $prompt): ?string { $prompt = trim($prompt); @@ -54,10 +39,82 @@ final class CatalogIntentLite return null; } - $promptTag = $this->queryCleaner->clean($prompt); + $clean = $this->queryCleaner->clean($prompt); + if ($clean === '') { + $clean = $prompt; + } - // 1) Tag-Vector-Suche - $hits = $this->tagVectorClient->search($promptTag, 3); + // ---------------------------------------------------- + // 1️⃣ Primär: Vollquery testen + // ---------------------------------------------------- + + $label = $this->detectFromQuery($clean); + if ($label !== null) { + return $label; + } + + // ---------------------------------------------------- + // 2️⃣ Fallback: Tokenweise testen + // (wichtig für "geräteliste testomat") + // ---------------------------------------------------- + + $tokens = $this->tokenize($clean); + + $bestLabel = null; + $bestScore = 0.0; + + foreach ($tokens as $token) { + + // sehr kurze Tokens ignorieren (Noise) + if (mb_strlen($token) < 3) { + continue; + } + + $hits = $this->tagVectorClient->search($token, 3); + + if ($hits === []) { + continue; + } + + $top = $hits[0] ?? null; + if (!is_array($top)) { + continue; + } + + $score = (float)($top['score'] ?? 0.0); + + if ($score < self::MIN_SCORE) { + continue; + } + + // Ambiguity-Check + if (isset($hits[1])) { + $secondScore = (float)($hits[1]['score'] ?? 0.0); + if (abs($score - $secondScore) < self::AMBIGUITY_DELTA) { + continue; + } + } + + if (($top['tag_type'] ?? null) !== TagTypes::CATALOG_ENTITY) { + continue; + } + + if ($score > $bestScore) { + $bestScore = $score; + $bestLabel = trim((string)($top['label'] ?? '')); + } + } + + if ($bestLabel === null || $bestLabel === '') { + return null; + } + + return mb_strtolower($bestLabel); + } + + private function detectFromQuery(string $query): ?string + { + $hits = $this->tagVectorClient->search($query, 3); if ($hits === []) { return null; @@ -66,26 +123,21 @@ final class CatalogIntentLite $best = $hits[0]; $bestScore = (float)($best['score'] ?? 0.0); - // 2) Score-Tags if ($bestScore < self::MIN_SCORE) { return null; } - // 3) Ambiguity-Check if (isset($hits[1])) { $secondScore = (float)($hits[1]['score'] ?? 0.0); - if (abs($bestScore - $secondScore) < self::AMBIGUITY_DELTA) { return null; } } - // 4) Nur catalog_entity zulassen if (($best['tag_type'] ?? null) !== TagTypes::CATALOG_ENTITY) { return null; } - // 5) Canonical Label $label = trim((string)($best['label'] ?? '')); if ($label === '') { @@ -94,4 +146,31 @@ final class CatalogIntentLite return mb_strtolower($label); } + + private function tokenize(string $text): array + { + $parts = preg_split('/\s+/u', trim($text)); + if (!$parts) { + return []; + } + + $seen = []; + $out = []; + + foreach ($parts as $p) { + $p = trim($p); + if ($p === '') { + continue; + } + + if (isset($seen[$p])) { + continue; + } + + $seen[$p] = true; + $out[] = $p; + } + + return $out; + } } \ No newline at end of file diff --git a/src/Knowledge/Ingest/DocumentLoader.php b/src/Knowledge/Ingest/DocumentLoader.php index b8d3f9e..487a3f6 100644 --- a/src/Knowledge/Ingest/DocumentLoader.php +++ b/src/Knowledge/Ingest/DocumentLoader.php @@ -27,11 +27,12 @@ final class DocumentLoader private function loadText(string $path): string { $content = file_get_contents($path); + if ($content === false) { throw new \RuntimeException("Could not read file: {$path}"); } - return $this->normalize($content); + return $this->normalizeLineEndings($content); } private function loadPdf(string $path): string @@ -49,120 +50,31 @@ final class DocumentLoader ); } - return $this->normalize($text); + return $this->normalizeLineEndings($text); } - private function normalize(string $text): string + /** + * Loader ist bewusst minimal. + * + * KEINE: + * - Silbentrennung + * - Listen-Reparatur + * - Struktur-Merges + * - Regex-Orgie + * + * Nur: + * - Zeilenumbrüche vereinheitlichen + * - trim + */ + private function normalizeLineEndings(string $text): string { if ($text === '') { return ''; } - // 1. Silbentrennung entfernen - $text = preg_replace('/-\n/', '', $text); - - // 2. Einheitliche Zeilenumbrüche + // Einheitliche Zeilenumbrüche $text = str_replace(["\r\n", "\r"], "\n", $text); - // 3. Symbolmüll entfernen - $text = $this->removeUnwantedSymbols($text); - - // 4. Struktur-Reparatur - $text = $this->repairStructure($text); - - // 5. Inline-Listen stabilisieren - $text = preg_replace('/\s-\s/', "\n- ", $text); - - // 6. Whitespace normalisieren - $text = preg_replace('/[ \t]+/', ' ', $text); - $text = preg_replace('/\n{3,}/', "\n\n", $text); - return trim($text); } - - private function removeUnwantedSymbols(string $text): string - { - $text = str_replace(['©', '®', '™', '℠'], '', $text); - $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text); - $text = preg_replace('/[^\P{C}\n]+/u', '', $text); - return $text; - } - - /** - * Konsolidierte Struktur-Reparatur - */ - private function repairStructure(string $text): string - { - $lines = explode("\n", $text); - $out = []; - $count = count($lines); - - for ($i = 0; $i < $count; $i++) { - $current = trim($lines[$i]); - - if ($current === '') { - $out[] = ''; - continue; - } - - if ($i < $count - 1) { - $next = trim($lines[$i + 1]); - - // --- 1. Modellnummern / Zahlfortsetzung --- - if ( - !preg_match('/^- /', $current) && - !preg_match('/^- /', $next) && - !preg_match('/[\.:\?!]$/', $current) && - preg_match('/^\d+/', $next) // beginnt mit Zahl - ) { - $out[] = $current . ' ' . $next; - $i++; - continue; - } - - // --- 2. Satzfortsetzung (Zeile beginnt klein) --- - if ( - !preg_match('/^- /', $current) && - !preg_match('/^- /', $next) && - !preg_match('/[\.:\?!]$/', $current) && - preg_match('/^[a-zäöü]/u', $next) - ) { - $out[] = $current . ' ' . $next; - $i++; - continue; - } - - // --- 3. Falsche Listenfortsetzung --- - if ( - preg_match('/^- /', $current) && - preg_match('/^- [a-zäöü]/u', $next) && - !preg_match('/[\.:\?!]$/', $current) - ) { - $merged = rtrim($current) . ' ' . ltrim(substr($next, 2)); - $out[] = $merged; - $i++; - continue; - } - } - - // --- 4. Pseudo-Liste wie "- 808 festlegen" --- - if (preg_match('/^- \d+[A-Za-z ]{0,25}$/', $current)) { - $out[] = substr($current, 2); - continue; - } - - // --- 5. Pseudo-Liste wie "- im eingeschalteten Zustand ..." --- - if ( - preg_match('/^- [a-zäöü]/u', $current) && - ($i === 0 || !preg_match('/^- /', trim($lines[$i - 1]))) - ) { - $out[] = substr($current, 2); - continue; - } - - $out[] = $current; - } - - return implode("\n", $out); - } } \ No newline at end of file diff --git a/src/Knowledge/Ingest/KnowledgeIngestService.php b/src/Knowledge/Ingest/KnowledgeIngestService.php index f72ac82..603126d 100644 --- a/src/Knowledge/Ingest/KnowledgeIngestService.php +++ b/src/Knowledge/Ingest/KnowledgeIngestService.php @@ -18,10 +18,8 @@ final readonly class KnowledgeIngestService private DocumentVersionRepository $versionRepo, private TextNormalizer $textNormalizer, private DocumentSanitizer $documentSanitizer, - private StructureEnhancer $structureEnhancer, // ✅ NEU - ) - { - } + private StructureEnhancer $structureEnhancer, + ) {} /** * Lokaler Ingest: erzeugt deterministische NDJSON-Records. @@ -34,16 +32,13 @@ final readonly class KnowledgeIngestService $text = $this->loader->load($version->getFilePath()); $extension = $version->getFileExtension() ?? 'txt'; - // 2️⃣ Deterministische Textbereinigung - $text = $this->documentSanitizer->sanitize( - $text, - $extension - ); + // 2️⃣ Artefakt-Sanitizing + $text = $this->documentSanitizer->sanitize($text, $extension); - // 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU) + // 3️⃣ Struktur-Hints (deterministisch, minimal) $text = $this->structureEnhancer->enhance($text); - // 4️⃣ Chunking + // 4️⃣ Chunking (inkl. TextNormalizer) $chunks = $this->chunker->chunk($text); $doc = $version->getDocument(); @@ -56,13 +51,15 @@ final readonly class KnowledgeIngestService foreach ($chunks as $chunkText) { - if ($title !== '' && !str_starts_with($chunkText, $title)) { + // 🔥 Titel nur im ersten Chunk einfügen + if ($index === 0 && $title !== '') { $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText; } $chunkText = trim($chunkText); - // 🔥 deterministische Chunk-ID + // 🔥 Deterministische Chunk-ID + // Wichtig: Normalisierung NUR für ID-Bildung $normalizedForId = $this->textNormalizer->normalize($chunkText); $chunkId = sha1( @@ -75,11 +72,13 @@ final readonly class KnowledgeIngestService 'chunk_id' => $chunkId, 'document_id' => $documentId, 'version_id' => $versionId, - 'chunk_index' => $index++, + 'chunk_index' => $index, 'text' => $chunkText, 'checksum' => sha1($chunkText), 'metadata' => $this->buildMetadata($version), ]; + + $index++; } } @@ -101,6 +100,7 @@ final readonly class KnowledgeIngestService $doc = $version->getDocument(); $title = null; + if (method_exists($doc, 'getTitle')) { $title = $doc->getTitle(); } elseif (method_exists($doc, 'getName')) { diff --git a/src/Knowledge/Ingest/SimpleChunker.php b/src/Knowledge/Ingest/SimpleChunker.php index e75466d..1fb7b9a 100644 --- a/src/Knowledge/Ingest/SimpleChunker.php +++ b/src/Knowledge/Ingest/SimpleChunker.php @@ -13,27 +13,22 @@ final readonly class SimpleChunker public function __construct( private IndexConfigurationProvider $configurationProvider, private TextNormalizer $textNormalizer - ) - { - } + ) {} /** @return string[] */ public function chunk(string $text): array { $config = $this->configurationProvider->getConfiguration(); - $maxWords = $config->getChunkSize(); - $overlapWords = $config->getChunkOverlap(); + $maxWords = max(1, $config->getChunkSize()); + $overlapWords = max(0, $config->getChunkOverlap()); $text = $this->textNormalizer->normalize($text); if ($text === '') { return []; } - // ====================================================== - // HYBRID: Erst Absatzbasiert sammeln - // ====================================================== - + // Absatzbasierte Vorstruktur $paragraphs = preg_split('/\n{2,}/u', $text); if (!$paragraphs) { return []; @@ -52,7 +47,7 @@ final readonly class SimpleChunker $paragraphWordCount = $this->countWords($paragraph); - // Falls einzelner Absatz größer als maxWords → Fallback + // Absatz größer als maxWords → Wort-Fallback if ($paragraphWordCount > $maxWords) { if ($currentChunk !== '') { @@ -68,14 +63,14 @@ final readonly class SimpleChunker continue; } - // Absatz passt noch in aktuellen Chunk + // Absatz passt in aktuellen Chunk if ($currentWordCount + $paragraphWordCount <= $maxWords) { $currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph; $currentWordCount += $paragraphWordCount; continue; } - // Flush aktueller Chunk + // Flush if ($currentChunk !== '') { $chunks[] = trim($currentChunk); } @@ -92,7 +87,7 @@ final readonly class SimpleChunker } // ====================================================== - // Wortbasierter Fallback (Original-Logik beibehalten) + // Wortbasierter Fallback // ====================================================== /** @return string[] */ @@ -125,6 +120,7 @@ final readonly class SimpleChunker $wordPos = 0; while ($wordPos < $totalWords) { + $wordEnd = min($wordPos + $maxWords, $totalWords); $tokenStart = $wordTokenIndexes[$wordPos]; @@ -154,11 +150,13 @@ final readonly class SimpleChunker private function adjustCutToBoundary(array $tokens, int $start, int $end): int { + // Schutz für Listenanfänge $startToken = $tokens[$start] ?? ''; - if (preg_match('/^- /u', ltrim($startToken))) { + if (preg_match('/^\s*-\s+/u', $startToken)) { return $end; } + // Rückwärts prüfen auf Absatz- oder Satzende for ($i = $end - 1; $i > $start; $i--) { if ($tokens[$i] === "\n\n") { @@ -190,9 +188,13 @@ final readonly class SimpleChunker $out = []; foreach ($chunks as $chunk) { - $key = mb_strtolower( - preg_replace('/\s+/u', ' ', trim($chunk)) - ); + + $normalized = preg_replace('/\s+/u', ' ', trim($chunk)); + if ($normalized === null) { + continue; + } + + $key = mb_strtolower($normalized); if (isset($seen[$key])) { continue; diff --git a/src/Knowledge/Retrieval/CachedRetriever.php b/src/Knowledge/Retrieval/CachedRetriever.php deleted file mode 100644 index 301de3c..0000000 --- a/src/Knowledge/Retrieval/CachedRetriever.php +++ /dev/null @@ -1,48 +0,0 @@ -buildCacheKey($prompt, $limit); - - $item = $this->cache->getItem($key); - if ($item->isHit()) { - return $item->get(); - } - - $result = $this->inner->retrieve($prompt, $limit); - - $item->set($result); - $item->expiresAfter($this->ttlSeconds); - $this->cache->save($item); - - return $result; - } - - private function buildCacheKey(string $prompt, int $limit): string - { - $normalized = mb_strtolower(trim($prompt)); - $normalized = preg_replace('/\s+/u', ' ', $normalized); - - return 'rag_retrieval_' . sha1($normalized . '|' . $limit); - } -} diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index a9f9ad3..5a8aed5 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -57,6 +57,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface return [$result['catalogBlock']]; } + if ($result['selectedChunkIds'] === []) { + return []; + } + return $this->collectTextsFromIds( $result['selectedChunkIds'], $result['rows'] @@ -84,10 +88,15 @@ final class NdjsonHybridRetriever implements RetrieverInterface ]]; } + if ($result['selectedChunkIds'] === []) { + return []; + } + $out = []; $rank = 0; foreach ($result['selectedChunkIds'] as $chunkId) { + if (!isset($result['rows'][$chunkId])) { continue; } @@ -127,6 +136,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $route = $this->routeResolver->resolve($salesIntent, $entityLabel); if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) { + $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel); if ($catalogBlock !== null) { @@ -147,6 +157,21 @@ final class NdjsonHybridRetriever implements RetrieverInterface $core = $this->runCore($prompt, $config, $withScores, $salesIntent); + if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) { + return [ + 'route' => $route, + 'entityLabel' => $entityLabel, + 'intent' => $salesIntent, + 'isListQuery' => $core['is_list_query'], + 'selectedChunkIds' => [], + 'rows' => [], + 'rrfScores' => [], + 'rawScores' => [], + 'threshold' => $core['threshold'], + 'catalogBlock' => null, + ]; + } + $selectedChunkIds = $core['is_list_query'] ? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']) : $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']); @@ -182,8 +207,17 @@ final class NdjsonHybridRetriever implements RetrieverInterface $isListQuery = $this->intentLite->isListQuery($prompt); $cleanQuery = $this->queryCleaner->clean($prompt); + if ($cleanQuery === '') { - $cleanQuery = $prompt; + return [ + 'limit' => $limit, + 'is_list_query' => $isListQuery, + 'threshold' => self::VECTOR_SCORE_THRESHOLD, + 'ranked_chunk_ids' => [], + 'rows' => [], + 'rrf_scores' => [], + 'raw_scores' => [], + ]; } [$threshold, $topK] = $this->computeThresholdAndTopK( @@ -200,10 +234,22 @@ final class NdjsonHybridRetriever implements RetrieverInterface $globalHits = $this->vectorClient->search($cleanQuery, $topK); $scopedHits = []; - if (!empty($candidateDocIds)) { + if ($candidateDocIds !== []) { $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds); } + if ($globalHits === [] && $scopedHits === []) { + return [ + 'limit' => $limit, + 'is_list_query' => $isListQuery, + 'threshold' => $threshold, + 'ranked_chunk_ids' => [], + 'rows' => [], + 'rrf_scores' => [], + 'raw_scores' => [], + ]; + } + $fused = $this->fuseHits( $globalHits, $scopedHits, @@ -216,11 +262,25 @@ final class NdjsonHybridRetriever implements RetrieverInterface $rawScores = $fused['raw_scores']; if ($rrfScores === [] && $globalHits !== []) { - $rrfScores = $this->fallbackRrfFromHits($globalHits, self::EMPTY_RRF_FALLBACK_TOPN); + $rrfScores = $this->fallbackRrfFromHits( + $globalHits, + self::EMPTY_RRF_FALLBACK_TOPN + ); + } + + if ($rrfScores === []) { + return [ + 'limit' => $limit, + 'is_list_query' => $isListQuery, + 'threshold' => $threshold, + 'ranked_chunk_ids' => [], + 'rows' => [], + 'rrf_scores' => [], + 'raw_scores' => $rawScores, + ]; } arsort($rrfScores); - $rankedChunkIds = array_keys($rrfScores); $rows = $this->lookup->findByChunkIds($rankedChunkIds); @@ -254,13 +314,19 @@ final class NdjsonHybridRetriever implements RetrieverInterface return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY); } - private function computeThresholdAndTopK(string $salesIntent, bool $isListQuery, int $vectorTopKBase): array - { + private function computeThresholdAndTopK( + string $salesIntent, + bool $isListQuery, + int $vectorTopKBase + ): array { + $threshold = self::VECTOR_SCORE_THRESHOLD; $topK = $vectorTopKBase; - if ($salesIntent === SalesIntentLite::OBJECTION || - $salesIntent === SalesIntentLite::PRICING) { + if ( + $salesIntent === SalesIntentLite::OBJECTION || + $salesIntent === SalesIntentLite::PRICING + ) { $threshold += 0.02; } @@ -333,6 +399,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $rank = 0; foreach ($hits as $hit) { + if (!isset($hit['chunk_id'])) { continue; } @@ -354,6 +421,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $out = []; foreach ($chunkIds as $id) { + if (!isset($rows[$id]['text'])) { continue; } @@ -433,11 +501,13 @@ final class NdjsonHybridRetriever implements RetrieverInterface $out = []; foreach ($chunkIds as $id) { + if (!isset($rows[$id]['text'])) { continue; } $text = trim((string)$rows[$id]['text']); + if ($text !== '') { $out[] = $text; } diff --git a/src/Knowledge/Text/TextNormalizer.php b/src/Knowledge/Text/TextNormalizer.php index acd07b4..5e09ffa 100644 --- a/src/Knowledge/Text/TextNormalizer.php +++ b/src/Knowledge/Text/TextNormalizer.php @@ -13,7 +13,15 @@ final class TextNormalizer } // ------------------------------------------------- - // 1. Encoding-Artefakte & Sonderzeichen + // 1. Unicode-Normalisierung (wichtig für Stabilität) + // ------------------------------------------------- + + if (class_exists(\Normalizer::class)) { + $text = \Normalizer::normalize($text, \Normalizer::FORM_C) ?? $text; + } + + // ------------------------------------------------- + // 2. Encoding-Artefakte & Sonderzeichen // ------------------------------------------------- // Word/PDF Bullet-Artefakte (häufiges Problemzeichen) @@ -26,38 +34,49 @@ final class TextNormalizer $text ); + // Private-Use-Area entfernen $text = preg_replace('/[\x{E000}-\x{F8FF}]/u', '', $text); - // Non-breaking space → normales Leerzeichen - $text = str_replace("\xC2\xA0", ' ', $text); - // Zero-width characters entfernen $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text); - // ------------------------------------------------- - // 2. Zeilenumbrüche vereinheitlichen - // ------------------------------------------------- - - $text = str_replace("\r\n", "\n", $text); - $text = str_replace("\r", "\n", $text); + // Geschützte Leerzeichen & ähnliche Varianten vereinheitlichen + $text = str_replace( + [ + "\xC2\xA0", // NBSP + "\xE2\x80\xAF", // Narrow NBSP + "\xE2\x80\x89", // Thin space + ], + ' ', + $text + ); // ------------------------------------------------- - // 3. Silbentrennung über Zeilen entfernen + // 3. Zeilenumbrüche vereinheitlichen + // ------------------------------------------------- + + $text = str_replace(["\r\n", "\r"], "\n", $text); + + // ------------------------------------------------- + // 4. Silbentrennung über Zeilen entfernen + // // Beispiel: // Testo- // mat → Testomat + // + // Nur wenn direkt Buchstabe folgt // ------------------------------------------------- $text = preg_replace('/-\n(\p{L})/u', '$1', $text); // ------------------------------------------------- - // 4. Whitespace normalisieren + // 5. Whitespace normalisieren // ------------------------------------------------- // Mehrfache Leerzeichen reduzieren $text = preg_replace('/[ \t]+/u', ' ', $text); - // Mehrfache Leerzeilen reduzieren + // Mehr als 2 Leerzeilen reduzieren $text = preg_replace('/\n{3,}/u', "\n\n", $text); return trim($text);