optimize intents
This commit is contained in:
144
src/Intent/IntentLite.php
Normal file
144
src/Intent/IntentLite.php
Normal file
@@ -0,0 +1,144 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Intent;
|
||||
|
||||
/**
 * IntentLite
 *
 * Deterministic, LLM-agnostic intent detection.
 * Focus: the LIST intent, which is used to steer retrieval.
 *
 * IMPORTANT:
 * - Always call it with the ORIGINAL prompt.
 * - Never call it with the QueryCleaner result.
 */
final class IntentLite
{
    /** Minimum accumulated score at which a prompt is classified as a list query. */
    private const LIST_THRESHOLD = 4;

    /** ASCII transliterations of the German special characters. */
    private const UMLAUT_MAP = [
        'ä' => 'ae',
        'ö' => 'oe',
        'ü' => 'ue',
        'ß' => 'ss',
    ];

    /**
     * Scores the prompt against a fixed set of list-intent signals.
     *
     * Weights: strong trigger pattern +3 each, quantity word +2 each,
     * any standalone number +2, enumeration marker +1.
     *
     * @param string $originalPrompt The unmodified user prompt.
     *
     * @return array{is_list: bool, score: int, signals: list<string>}
     *         `signals` names every rule that fired, in evaluation order.
     */
    public function detectList(string $originalPrompt): array
    {
        $p = $this->normalize($originalPrompt);

        $score = 0;
        $signals = [];

        // --------------------------------------------------------
        // 1. Strong explicit list triggers (high weight)
        // --------------------------------------------------------
        $strongPatterns = [
            '/\bliste(n)?\b/u',
            '/\bauflisten\b/u',
            '/\baufz(a|ä)hl(en)?\b/u',
            '/\bnenn(e)?\b/u',
            '/\bzeig(e)?\b/u',
            '/\bwelche\s+sind\b/u',
            '/\bwelche\s+gibt\s+es\b/u',
            '/\bwas\s+sind\b/u',
            '/\bwie\s+viele\b/u',
            '/\branking\b/u',
            '/\btop\s*\d+\b/u',
        ];

        foreach ($strongPatterns as $pattern) {
            if (preg_match($pattern, $p) === 1) {
                $score += 3;
                $signals[] = "strong:$pattern";
            }
        }

        // --------------------------------------------------------
        // 2. Quantity / plural indicators
        // --------------------------------------------------------
        $quantityWords = [
            'alle',
            'sämtliche',
            'saemtliche',
            'mehrere',
            'verschiedene',
            'einige',
            'viele',
            'optionen',
            'möglichkeiten',
            'moeglichkeiten',
            'varianten',
            'arten',
            'modelle',
            'funktionen',
            'punkte',
            'schritte',
            'kategorien',
            'übersicht',
            'uebersicht',
        ];

        // The list holds both the umlaut and the ASCII spelling of some words,
        // and normalize() appends an ASCII copy of the prompt. Track the
        // transliterated form so one word in the prompt is scored only once.
        $countedQuantities = [];

        foreach ($quantityWords as $word) {
            $canonical = strtr($word, self::UMLAUT_MAP);

            if (isset($countedQuantities[$canonical])) {
                continue;
            }

            if (preg_match('/\b' . preg_quote($word, '/') . '\b/u', $p) === 1) {
                $countedQuantities[$canonical] = true;
                $score += 2;
                $signals[] = "quantity:$word";
            }
        }

        // --------------------------------------------------------
        // 3. Explicit numbers (e.g. "5 Vorteile")
        // --------------------------------------------------------
        if (preg_match('/\b\d+\b/u', $p) === 1) {
            $score += 2;
            $signals[] = 'number';
        }

        // --------------------------------------------------------
        // 4. Enumeration hints (1., 1), -, *), checked on the raw prompt
        // --------------------------------------------------------
        if (preg_match('/(^|\s)(\d+\)|\d+\.|-\s|\*\s)/u', $originalPrompt) === 1) {
            $score += 1;
            $signals[] = 'enumeration_hint';
        }

        // --------------------------------------------------------
        // Decision
        // --------------------------------------------------------
        return [
            'is_list' => $score >= self::LIST_THRESHOLD,
            'score' => $score,
            'signals' => $signals,
        ];
    }

    /**
     * Convenience wrapper around detectList() that returns only the verdict.
     *
     * @param string $originalPrompt The unmodified user prompt.
     */
    public function isListQuery(string $originalPrompt): bool
    {
        return $this->detectList($originalPrompt)['is_list'];
    }

    /**
     * Internal normalisation (no stop-word removal).
     *
     * Lower-cases the prompt. If it contains ä, ö, ü or ß, a copy with ALL of
     * them transliterated is appended after a space; the original spelling is
     * kept, so patterns in either spelling can match.
     */
    private function normalize(string $s): string
    {
        $lower = mb_strtolower($s, 'UTF-8');
        $ascii = strtr($lower, self::UMLAUT_MAP);

        // Additional form only, never a replacement of the original text.
        return $ascii === $lower ? $lower : $lower . ' ' . $ascii;
    }
}
|
||||
@@ -5,6 +5,7 @@ declare(strict_types=1);
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Entity\ModelGenerationConfig;
|
||||
use App\Intent\IntentLite;
|
||||
use App\Knowledge\ChunkManager;
|
||||
use App\Knowledge\QueryCleaner;
|
||||
use App\Repository\ModelGenerationConfigRepository;
|
||||
@@ -30,7 +31,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
private readonly TagRoutingService $tagRouting,
|
||||
private readonly ModelGenerationConfigRepository $configRepository,
|
||||
private readonly QueryCleaner $queryCleaner,
|
||||
) {}
|
||||
private readonly IntentLite $intentLite
|
||||
)
|
||||
{
|
||||
}
|
||||
|
||||
public function retrieve(string $prompt): array
|
||||
{
|
||||
@@ -54,7 +58,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
|
||||
|
||||
// Wichtig: List-Detection bleibt auf Originalprompt (sonst entfernst du "zeige/liste" etc.)
|
||||
$isListQuery = $this->isListQuery($prompt);
|
||||
$isListQuery = $this->intentLite->isListQuery($prompt);
|
||||
|
||||
// -------------------------------------------------
|
||||
// CLEAN QUERY (nur für Retrieval: Tags + Vector)
|
||||
@@ -81,7 +85,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
// List mode: höhere Abdeckung, um mehr Dokumente zu ranken
|
||||
if ($isListQuery) {
|
||||
$topK = max($vectorTopKBase * 3, 80);
|
||||
$topK = (int)round($vectorTopKBase * 2.5);
|
||||
}
|
||||
|
||||
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
|
||||
@@ -194,17 +198,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
// LIST QUERY DETECTION
|
||||
// =========================================================
|
||||
|
||||
private function isListQuery(string $prompt): bool
|
||||
{
|
||||
$prompt = mb_strtolower($prompt);
|
||||
|
||||
return str_contains($prompt, 'liste')
|
||||
|| str_contains($prompt, 'zeige')
|
||||
|| str_contains($prompt, 'nenn')
|
||||
|| str_contains($prompt, 'welche')
|
||||
|| preg_match('/\b\d+\b/', $prompt) === 1;
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// DOCUMENT RANKING (Adjusted Scores incl. Tag Bonus)
|
||||
// =========================================================
|
||||
|
||||
Reference in New Issue
Block a user