diff --git a/MATRIX_PARAMS.md b/MATRIX_PARAMS.md new file mode 100644 index 0000000..d4d1706 --- /dev/null +++ b/MATRIX_PARAMS.md @@ -0,0 +1,38 @@ +# Tabelle 1: Alle Parameter, die Retrieval beeinflussen (mit Kurz-Erklärung) + +| Ebene | Ort | Parameter | Standard / aktuell | Zweck / Einfluss | +|---|---|---|---:|---| +| **Config** | ModelGenerationConfig | retrievalMaxChunks | (dein Wert) | Wie viele Chunks maximal ans LLM gehen (Output-Limit). | +| **Config** | ModelGenerationConfig | retrievalVectorTopK | (dein Wert) | Wie viele Vector-Hits initial geholt werden (Recall-Breite). | +| **Retriever** | NdjsonHybridRetriever | HARD_MAX_CHUNKS | 200 | Harte Obergrenze für retrievalMaxChunks (Safety-Limit). | +| **Retriever** | NdjsonHybridRetriever | HARD_MAX_VECTORK | 200 | Harte Obergrenze für retrievalVectorTopK/topK (Safety-Limit). | +| **Retriever** | NdjsonHybridRetriever | VECTOR_SCORE_THRESHOLD | 0.40 | Qualitäts-Gate: Vector-Treffer darunter werden verworfen (stärkster Präzisionshebel). | +| **Retriever** | NdjsonHybridRetriever | List-Mode TopK | max(vectorTopKBase*3, 80) | Bei Listenfragen wird TopK stark erhöht für bessere Dokumentabdeckung. | +| **Retriever** | NdjsonHybridRetriever | isListQuery() | Heuristik | Aktiviert Dokument-Ranking statt reinem Chunk-Ranking. | +| **Retriever** | NdjsonHybridRetriever | Dedup-Normalisierung | whitespace-normalized | Entfernt Duplikate im finalen Chunk-Set. | +| **Tags** | TagRoutingService | DEFAULT_TOPK | 8 | Anzahl der geprüften Tag-Vector-Hits. | +| **Tags** | TagRoutingService | MIN_BEST_SCORE | 0.25 (zuvor 0.10) | Ab welchem Tag-Score ein Bonus aktiviert wird. | +| **Tags** | TagRoutingService | MAX_CANDIDATE_DOCS | 200 | Maximale Anzahl Dokumente, die als Tag-Kandidaten gelten dürfen. | +| **Tags** | NdjsonHybridRetriever | TAG_SCORE_BONUS | 0.08 | Bonus auf Vector-Score bei Tag-Match (nur Ranking, kein Gate). 
| +| **Query** | QueryCleaner | clean($prompt) | implizit | Beeinflusst Embedding stark (Token-Normalisierung/Entfernung). | +| **Vector** | VectorSearchClient | search($query, topK) | implizit | Liefert Roh-Scores und Trefferverteilung (Basis des Rankings). | +| **Tag Vector** | TagVectorSearchClient | search($query, DEFAULT_TOPK) | implizit | Bestimmt, ob und welche Tags matchen (Bonus-Aktivierung). | + + +# Tabelle 2: Auswirkungen bei Änderung der Parameter + +| Parameter | Wenn erhöht | Wenn gesenkt | Typischer Effekt / Risiko | +|---|---|---|---| +| retrievalMaxChunks | Mehr Kontext, höhere Antworttiefe | Kompaktere Antworten, evtl. Wissensverlust | Zu hoch → Token/Noise-Risiko | +| HARD_MAX_CHUNKS | Erlaubt größere Kontexte | Strengeres Kontext-Limit | Sicherheitsparameter | +| retrievalVectorTopK | Mehr Recall, breitere Kandidatenbasis | Weniger Recall, präziser, aber evtl. Lücken | Zu hoch → mehr Noise | +| HARD_MAX_VECTORK | Größere Suchräume möglich | Strenger begrenzt | Sicherheitsparameter | +| VECTOR_SCORE_THRESHOLD | Höhere Präzision, weniger schwache Treffer | Mehr Treffer, aber mehr Rauschen | Zu niedrig → Bonus wirkt stärker | +| List-Mode TopK | Bessere Listenabdeckung | Listen evtl. 
unvollständig | Zu hoch → Noise | +| isListQuery | Häufigerer Dokumentmodus | Seltener Dokumentmodus | Fehlklassifikation möglich | +| QueryCleaner Aggressivität | Stabilere Suche, weniger Noise | Mehr Originalbegriffe | Zu aggressiv → Informationsverlust | +| DEFAULT_TOPK (Tags) | Mehr Tag-Kandidaten | Weniger Tag-Kandidaten | Zu hoch → Bonus häufiger aktiv | +| MIN_BEST_SCORE | Bonus seltener (nur starke Tag-Matches) | Bonus häufiger (auch schwache Matches) | Haupthebel gegen „Tags zu mächtig“ | +| MAX_CANDIDATE_DOCS | Mehr Dokumente erhalten Bonus | Weniger Dokumente erhalten Bonus | Zu hoch → Bonus verwässert | +| TAG_SCORE_BONUS | Tags pushen Ranking stärker | Tags pushen kaum | Zu hoch → Dominanz-Risiko | +| Dedup-Normalisierung | Weniger Dopplungen | Mehr Redundanz | Beeinflusst Vielfalt, nicht Relevanz | \ No newline at end of file diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index cb979a4..e1d2e78 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -14,11 +14,16 @@ use App\Vector\VectorSearchClient; final class NdjsonHybridRetriever implements RetrieverInterface { private const VECTOR_SCORE_THRESHOLD = 0.4; - private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 3; private const HARD_MAX_CHUNKS = 200; private const HARD_MAX_VECTORK = 200; + /** + * Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter). + * Enterprise Default: klein halten, sonst dominieren Tags wieder. 
+ */ + private const TAG_SCORE_BONUS = 0.08; + public function __construct( private readonly ChunkManager $chunkManager, private readonly NdjsonChunkLookup $lookup, @@ -61,7 +66,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface } // ------------------------------------------------- - // 1) Tag Routing (bereinigte Query) + // 1) Tag Routing (bereinigte Query) -> NUR Bonus // ------------------------------------------------- $candidateDocIds = $this->tagRouting->route($cleanQuery); $candidateSet = null; @@ -75,87 +80,115 @@ final class NdjsonHybridRetriever implements RetrieverInterface // ------------------------------------------------- $topK = $vectorTopKBase; + // List mode: höhere Abdeckung, um mehr Dokumente zu ranken if ($isListQuery) { $topK = max($vectorTopKBase * 3, 80); } - if ($candidateSet !== null) { - $topK = min( - max($topK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $topK), - self::HARD_MAX_VECTORK - ); - } + $topK = max(1, min($topK, self::HARD_MAX_VECTORK)); // ------------------------------------------------- - // 3) Vector Search (bereinigte Query; scoped wenn möglich) + // 3) Vector Search (immer GLOBAL; Tags sind KEIN Filter) // ------------------------------------------------- - if ($candidateSet !== null) { - $hits = $this->vectorClient->searchScoped( - $cleanQuery, - $topK, - array_keys($candidateSet) - ); - - // Wenn scoped nichts liefert → global fallback - if ($hits === []) { - $hits = $this->vectorClient->search($cleanQuery, $vectorTopKBase); - } - - } else { - $hits = $this->vectorClient->search($cleanQuery, $topK); - } + $hits = $this->vectorClient->search($cleanQuery, $topK); if ($hits === []) { - return $candidateSet !== null - ? 
$this->fallbackChunksFromCandidateDocs($candidateSet, $limit) - : []; + // Tags dürfen NICHT als Fallback wirken (sonst wieder zu mächtig) + return []; } // ------------------------------------------------- - // 4) ChunkIds + Lookup + // 4) ChunkIds + Scores sammeln (raw) // ------------------------------------------------- - $chunkIds = []; + /** @var array $rawScoreByChunkId */ + $rawScoreByChunkId = []; foreach ($hits as $hit) { if (!isset($hit['chunk_id'], $hit['score'])) { continue; } - if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) { + $raw = (float)$hit['score']; + + // Threshold wird auf RAW Score angewendet (Qualitätsgate) + if ($raw < self::VECTOR_SCORE_THRESHOLD) { continue; } - $chunkIds[] = (string)$hit['chunk_id']; + $chunkId = (string)$hit['chunk_id']; + + // Falls mehrfach: den besten raw score behalten + if (!isset($rawScoreByChunkId[$chunkId]) || $raw > $rawScoreByChunkId[$chunkId]) { + $rawScoreByChunkId[$chunkId] = $raw; + } } - if ($chunkIds === []) { - return $candidateSet !== null - ? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit) - : []; + if ($rawScoreByChunkId === []) { + return []; } - $rows = $this->lookup->findByChunkIds($chunkIds); + // Lookup liefert docId + Text etc. + $rows = $this->lookup->findByChunkIds(array_keys($rawScoreByChunkId)); // ------------------------------------------------- - // 5) Listenmodus → Dokument-Ranking + // 5) Adjusted Score (Tag Bonus) + Ranking // ------------------------------------------------- - if ($isListQuery && $candidateSet !== null) { + /** @var array $adjScoreByChunkId */ + $adjScoreByChunkId = []; - $rankedDocIds = $this->rankDocumentsFromHits($hits, $rows, $candidateSet); + foreach ($rawScoreByChunkId as $chunkId => $rawScore) { + if (!isset($rows[$chunkId])) { + continue; + } + + $adj = $rawScore; + + if ($candidateSet !== null) { + $docId = $rows[$chunkId]['document_id'] ?? 
null; + if (is_string($docId) && isset($candidateSet[$docId])) { + $adj += self::TAG_SCORE_BONUS; + } + } + + $adjScoreByChunkId[$chunkId] = $adj; + } + + if ($adjScoreByChunkId === []) { + return []; + } + + // Sort: adjusted desc, deterministic tie-break by chunkId + uksort($adjScoreByChunkId, static function (string $a, string $b) use ($adjScoreByChunkId): int { + $sa = $adjScoreByChunkId[$a]; + $sb = $adjScoreByChunkId[$b]; + + if ($sa === $sb) { + return $a <=> $b; + } + return ($sb <=> $sa); + }); + + $rankedChunkIds = array_keys($adjScoreByChunkId); + + // ------------------------------------------------- + // 6) Listenmodus → Dokument-Ranking (mit Tag-Bonus in Scores) + // ------------------------------------------------- + if ($isListQuery) { + $rankedDocIds = $this->rankDocumentsFromAdjustedScores($adjScoreByChunkId, $rows); if ($rankedDocIds === []) { - return $this->fallbackChunksFromCandidateDocs($candidateSet, $limit); + return []; } $topDocIds = array_slice($rankedDocIds, 0, $limit); - return $this->collectBestChunkPerDocument($topDocIds, $hits, $rows); + return $this->collectBestChunkPerDocumentAdjusted($topDocIds, $adjScoreByChunkId, $rows); } // ------------------------------------------------- - // 6) Normaler Chunk-Modus + // 7) Normaler Chunk-Modus (nach adjusted Ranking) // ------------------------------------------------- - return $this->collectTexts($chunkIds, $rows, $limit); + return $this->collectTexts($rankedChunkIds, $rows, $limit); } // ========================================================= @@ -174,30 +207,29 @@ final class NdjsonHybridRetriever implements RetrieverInterface } // ========================================================= - // DOCUMENT RANKING + // DOCUMENT RANKING (Adjusted Scores incl. 
Tag Bonus) // ========================================================= - private function rankDocumentsFromHits( - array $hits, - array $rows, - array $candidateSet - ): array { + /** + * @param array $adjScoreByChunkId + * @param array> $rows + * @return string[] + */ + private function rankDocumentsFromAdjustedScores(array $adjScoreByChunkId, array $rows): array + { $documentScores = []; - foreach ($hits as $hit) { - $chunkId = (string)($hit['chunk_id'] ?? ''); - + foreach ($adjScoreByChunkId as $chunkId => $score) { if (!isset($rows[$chunkId])) { continue; } $docId = $rows[$chunkId]['document_id'] ?? null; - - if (!is_string($docId) || !isset($candidateSet[$docId])) { + if (!is_string($docId) || $docId === '') { continue; } - $documentScores[$docId][] = (float)$hit['score']; + $documentScores[$docId][] = (float)$score; } if ($documentScores === []) { @@ -217,21 +249,21 @@ final class NdjsonHybridRetriever implements RetrieverInterface return array_keys($ranked); } - private function collectBestChunkPerDocument( - array $docIds, - array $hits, - array $rows - ): array { + /** + * @param string[] $docIds + * @param array $adjScoreByChunkId + * @param array> $rows + * @return string[] + */ + private function collectBestChunkPerDocumentAdjusted(array $docIds, array $adjScoreByChunkId, array $rows): array + { $result = []; foreach ($docIds as $docId) { - $bestScore = -INF; $bestText = null; - foreach ($hits as $hit) { - $chunkId = (string)($hit['chunk_id'] ?? ''); - + foreach ($adjScoreByChunkId as $chunkId => $score) { if (!isset($rows[$chunkId])) { continue; } @@ -240,8 +272,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface continue; } - if ((float)$hit['score'] > $bestScore) { - $bestScore = (float)$hit['score']; + if ((float)$score > $bestScore) { + $bestScore = (float)$score; $bestText = $rows[$chunkId]['text'] ?? 
null; } } @@ -255,45 +287,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface } // ========================================================= - // FALLBACK + NORMAL MODE + // NORMAL MODE // ========================================================= - private function fallbackChunksFromCandidateDocs(array $candidateSet, int $limit): array - { - $seen = []; - $out = []; - - foreach ($this->chunkManager->streamAll() as $row) { - $docId = $row['document_id'] ?? null; - - if (!is_string($docId) || !isset($candidateSet[$docId])) { - continue; - } - - $text = $row['text'] ?? null; - - if (!is_string($text) || $text === '') { - continue; - } - - $chunk = trim($text); - $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)); - - if (isset($seen[$key])) { - continue; - } - - $seen[$key] = true; - $out[] = $chunk; - - if (\count($out) >= $limit) { - break; - } - } - - return $out; - } - private function collectTexts(array $chunkIds, array $rows, int $limit): array { $seen = []; @@ -304,7 +300,11 @@ final class NdjsonHybridRetriever implements RetrieverInterface continue; } - $chunk = trim($rows[$id]['text']); + $chunk = trim((string)$rows[$id]['text']); + if ($chunk === '') { + continue; + } + $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)); if (isset($seen[$key])) { diff --git a/src/Tag/TagRoutingService.php b/src/Tag/TagRoutingService.php index fb0fcf5..f8e3a89 100644 --- a/src/Tag/TagRoutingService.php +++ b/src/Tag/TagRoutingService.php @@ -12,7 +12,7 @@ use Symfony\Component\Uid\Uuid; final class TagRoutingService { private const DEFAULT_TOPK = 8; - private const MIN_BEST_SCORE = 0.10; + private const MIN_BEST_SCORE = 0.25; private const MAX_CANDIDATE_DOCS = 200; public function __construct(