diff --git a/src/Knowledge/QueryCleaner.php b/src/Knowledge/QueryCleaner.php index bf38e31..c6c2f1d 100644 --- a/src/Knowledge/QueryCleaner.php +++ b/src/Knowledge/QueryCleaner.php @@ -4,17 +4,41 @@ declare(strict_types=1); namespace App\Knowledge; -final readonly class QueryCleaner +final class QueryCleaner { - public function __construct( - private StopWords $stopWords - ) { - } - + /** + * Bereinigt eine Query ausschließlich für Retrieval-Zwecke. + * + * Wichtig: + * - Unicode-sicher + * - Zahlen bleiben erhalten + * - Negationen bleiben erhalten + * - Keine aggressive Token-Längen-Filterung + * - StopWords werden entfernt + */ public function clean(string $query): string { - $query = mb_strtolower($query); + if ($query === '') { + return ''; + } + + // 1. Unicode-sicher lowercase + $query = mb_strtolower($query, 'UTF-8'); + + // 2. Bindestriche & Slashes als Worttrenner behandeln + $query = str_replace(['-', '/'], ' ', $query); + + // 3. Sonderzeichen entfernen, aber: + // - Buchstaben behalten + // - Zahlen behalten + // - Umlaute behalten $query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query); + + if ($query === null) { + return ''; + } + + // 4. Mehrfache Whitespaces normalisieren $query = preg_replace('/\s+/u', ' ', $query); $query = trim($query); @@ -22,18 +46,35 @@ final readonly class QueryCleaner return ''; } - $tokens = explode(' ', $query); - $stopWords = $this->stopWords->getStopWords(); // <-- wichtig: nutzt deine Klasse + // 5. Tokenisierung + $tokens = preg_split('/\s+/u', $query); - $filtered = array_filter( - $tokens, - function (string $word) use ($stopWords): bool { - return $word !== '' - && mb_strlen($word) > 2 - && !in_array($word, $stopWords, true); + if ($tokens === false) { + return ''; + } + + $cleanTokens = []; + + foreach ($tokens as $token) { + + $token = trim($token); + + if ($token === '') { + continue; } - ); - return implode(' ', $filtered); + // StopWords entfernen + if (StopWords::isStopWord($token)) { + continue; + } + + $cleanTokens[] = $token; + } + + if ($cleanTokens === []) { + return ''; + } + + return implode(' ', $cleanTokens); } -} +} \ No newline at end of file diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 8c7588d..c7e83f1 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -6,6 +6,7 @@ namespace App\Knowledge\Retrieval; use App\Entity\ModelGenerationConfig; use App\Knowledge\ChunkManager; +use App\Knowledge\QueryCleaner; use App\Repository\ModelGenerationConfigRepository; use App\Tag\TagRoutingService; use App\Vector\VectorSearchClient; @@ -24,6 +25,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface private readonly VectorSearchClient $vectorClient, private readonly TagRoutingService $tagRouting, private readonly ModelGenerationConfigRepository $configRepository, + private readonly QueryCleaner $queryCleaner, ) {} public function retrieve(string $prompt): array @@ -47,12 +49,21 @@ final class NdjsonHybridRetriever implements RetrieverInterface $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK)); + // Wichtig: List-Detection bleibt auf Originalprompt (sonst entfernst du "zeige/liste" etc.) $isListQuery = $this->isListQuery($prompt); // ------------------------------------------------- - // 1) Tag Routing + // CLEAN QUERY (nur für Retrieval: Tags + Vector) // ------------------------------------------------- - $candidateDocIds = $this->tagRouting->route($prompt); + $cleanQuery = $this->queryCleaner->clean($prompt); + if ($cleanQuery === '') { + $cleanQuery = $prompt; + } + + // ------------------------------------------------- + // 1) Tag Routing (bereinigte Query) + // ------------------------------------------------- + $candidateDocIds = $this->tagRouting->route($cleanQuery); $candidateSet = null; if (is_array($candidateDocIds) && $candidateDocIds !== []) { @@ -76,22 +87,22 @@ final class NdjsonHybridRetriever implements RetrieverInterface } // ------------------------------------------------- - // 3) Vector Search (Scoped wenn möglich) + // 3) Vector Search (bereinigte Query; scoped wenn möglich) // ------------------------------------------------- if ($candidateSet !== null) { $hits = $this->vectorClient->searchScoped( - $prompt, + $cleanQuery, $topK, array_keys($candidateSet) ); // Wenn scoped nichts liefert → global fallback if ($hits === []) { - $hits = $this->vectorClient->search($prompt, $vectorTopKBase); + $hits = $this->vectorClient->search($cleanQuery, $vectorTopKBase); } } else { - $hits = $this->vectorClient->search($prompt, $topK); + $hits = $this->vectorClient->search($cleanQuery, $topK); } if ($hits === []) { @@ -294,7 +305,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface } $chunk = trim($rows[$id]['text']); - $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)); if (isset($seen[$key])) { diff --git a/src/Knowledge/StopWords.php b/src/Knowledge/StopWords.php index 7675329..5f4185b 100644 --- a/src/Knowledge/StopWords.php +++ b/src/Knowledge/StopWords.php @@ -1,1863 +1,64 @@ stopWords); + return self::STOP_WORDS; + } + + /** + * Prüft, ob ein Wort ein Stopwort ist. + */ + public static function isStopWord(string $word): bool + { + return in_array($word, self::STOP_WORDS, true); } } \ No newline at end of file