From 2713da1afbca5cc0fd1e6e792d023212cffd4e2b Mon Sep 17 00:00:00 2001 From: team2 Date: Fri, 27 Feb 2026 07:12:36 +0100 Subject: [PATCH] optimize intents --- src/Intent/IntentLite.php | 144 ++++++++++++++++++ .../Retrieval/NdjsonHybridRetriever.php | 29 ++-- 2 files changed, 155 insertions(+), 18 deletions(-) create mode 100644 src/Intent/IntentLite.php diff --git a/src/Intent/IntentLite.php b/src/Intent/IntentLite.php new file mode 100644 index 0000000..4eea2c1 --- /dev/null +++ b/src/Intent/IntentLite.php @@ -0,0 +1,144 @@ +normalize($originalPrompt); + + $score = 0; + $signals = []; + + // -------------------------------------------------------- + // 1. Starke explizite Listen-Trigger (hohes Gewicht) + // -------------------------------------------------------- + $strongPatterns = [ + '/\bliste(n)?\b/u', + '/\bauflisten\b/u', + '/\baufz(a|ä)hl(en)?\b/u', + '/\bnenn(e)?\b/u', + '/\bzeig(e)?\b/u', + '/\bwelche\s+sind\b/u', + '/\bwelche\s+gibt\s+es\b/u', + '/\bwas\s+sind\b/u', + '/\bwie\s+viele\b/u', + '/\branking\b/u', + '/\btop\s*\d+\b/u', + ]; + + foreach ($strongPatterns as $pattern) { + if (preg_match($pattern, $p) === 1) { + $score += 3; + $signals[] = "strong:$pattern"; + } + } + + // -------------------------------------------------------- + // 2. Mengen- / Mehrzahl-Indikatoren + // -------------------------------------------------------- + $quantityWords = [ + 'alle', + 'sämtliche', + 'saemtliche', + 'mehrere', + 'verschiedene', + 'einige', + 'viele', + 'optionen', + 'möglichkeiten', + 'moeglichkeiten', + 'varianten', + 'arten', + 'modelle', + 'funktionen', + 'punkte', + 'schritte', + 'kategorien', + 'übersicht', + 'uebersicht', + ]; + + foreach ($quantityWords as $word) { + if (preg_match('/\b' . preg_quote($word, '/') . '\b/u', $p) === 1) { + $score += 2; + $signals[] = "quantity:$word"; + } + } + + // -------------------------------------------------------- + // 3. Explizite Zahlen (z. B. "5 Vorteile") + // -------------------------------------------------------- + if (preg_match('/\b\d+\b/u', $p) === 1) { + $score += 2; + $signals[] = 'number'; + } + + // -------------------------------------------------------- + // 4. Enumeration-Hinweise (1., -, *, etc.) + // -------------------------------------------------------- + if ( + preg_match('/(^|\s)(\d+\)|\d+\.|-\s|\*\s)/u', $originalPrompt) === 1 + ) { + $score += 1; + $signals[] = 'enumeration_hint'; + } + + // -------------------------------------------------------- + // Entscheidung + // -------------------------------------------------------- + $isList = $score >= self::LIST_THRESHOLD; + + return [ + 'is_list' => $isList, + 'score' => $score, + 'signals' => $signals, + ]; + } + + public function isListQuery(string $originalPrompt): bool + { + return $this->detectList($originalPrompt)['is_list']; + } + + // ------------------------------------------------------------ + // Interne Normalisierung (ohne Stopword-Entfernung!) + // ------------------------------------------------------------ + private function normalize(string $s): string + { + $s = mb_strtolower($s); + + // Umlaute zusätzlich absichern (falls QueryCleaner das tut) + $replacements = [ + 'ä' => 'ae', + 'ö' => 'oe', + 'ü' => 'ue', + 'ß' => 'ss', + ]; + + // Nur als Zusatzform speichern (nicht ersetzen!) + foreach ($replacements as $umlaut => $alt) { + if (str_contains($s, $umlaut)) { + $s .= ' ' . str_replace($umlaut, $alt, $s); + break; + } + } + + return $s; + } +} \ No newline at end of file diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index b535f4d..64d3244 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -5,6 +5,7 @@ declare(strict_types=1); namespace App\Knowledge\Retrieval; use App\Entity\ModelGenerationConfig; +use App\Intent\IntentLite; use App\Knowledge\ChunkManager; use App\Knowledge\QueryCleaner; use App\Repository\ModelGenerationConfigRepository; @@ -25,12 +26,15 @@ final class NdjsonHybridRetriever implements RetrieverInterface private const TAG_SCORE_BONUS = 0.08; public function __construct( - private readonly NdjsonChunkLookup $lookup, - private readonly VectorSearchClient $vectorClient, - private readonly TagRoutingService $tagRouting, + private readonly NdjsonChunkLookup $lookup, + private readonly VectorSearchClient $vectorClient, + private readonly TagRoutingService $tagRouting, private readonly ModelGenerationConfigRepository $configRepository, - private readonly QueryCleaner $queryCleaner, - ) {} + private readonly QueryCleaner $queryCleaner, + private readonly IntentLite $intentLite + ) + { + } public function retrieve(string $prompt): array { @@ -54,7 +58,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK)); // Wichtig: List-Detection bleibt auf Originalprompt (sonst entfernst du "zeige/liste" etc.) - $isListQuery = $this->isListQuery($prompt); + $isListQuery = $this->intentLite->isListQuery($prompt); // ------------------------------------------------- // CLEAN QUERY (nur für Retrieval: Tags + Vector) @@ -81,7 +85,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface // List mode: höhere Abdeckung, um mehr Dokumente zu ranken if ($isListQuery) { - $topK = max($vectorTopKBase * 3, 80); + $topK = (int)round($vectorTopKBase * 2.5); } $topK = max(1, min($topK, self::HARD_MAX_VECTORK)); @@ -194,17 +198,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface // LIST QUERY DETECTION // ========================================================= - private function isListQuery(string $prompt): bool - { - $prompt = mb_strtolower($prompt); - - return str_contains($prompt, 'liste') - || str_contains($prompt, 'zeige') - || str_contains($prompt, 'nenn') - || str_contains($prompt, 'welche') - || preg_match('/\b\d+\b/', $prompt) === 1; - } - // ========================================================= // DOCUMENT RANKING (Adjusted Scores incl. Tag Bonus) // =========================================================