optimize tag logic boost
This commit is contained in:
38
MATRIX_PARAMS.md
Normal file
38
MATRIX_PARAMS.md
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
# Tabelle 1: Alle Parameter, die Retrieval beeinflussen (mit Kurz-Erklärung)
|
||||||
|
|
||||||
|
| Ebene | Ort | Parameter | Standard / aktuell | Zweck / Einfluss |
|
||||||
|
|---|---|---|---:|---|
|
||||||
|
| **Config** | ModelGenerationConfig | retrievalMaxChunks | (dein Wert) | Wie viele Chunks maximal ans LLM gehen (Output-Limit). |
|
||||||
|
| **Config** | ModelGenerationConfig | retrievalVectorTopK | (dein Wert) | Wie viele Vector-Hits initial geholt werden (Recall-Breite). |
|
||||||
|
| **Retriever** | NdjsonHybridRetriever | HARD_MAX_CHUNKS | 200 | Harte Obergrenze für retrievalMaxChunks (Safety-Limit). |
|
||||||
|
| **Retriever** | NdjsonHybridRetriever | HARD_MAX_VECTORK | 200 | Harte Obergrenze für retrievalVectorTopK/topK (Safety-Limit). |
|
||||||
|
| **Retriever** | NdjsonHybridRetriever | VECTOR_SCORE_THRESHOLD | 0.40 | Qualitäts-Gate: Vector-Treffer darunter werden verworfen (stärkster Präzisionshebel). |
|
||||||
|
| **Retriever** | NdjsonHybridRetriever | List-Mode TopK | max(vectorTopKBase*3, 80) | Bei Listenfragen wird TopK stark erhöht für bessere Dokumentabdeckung (anschließend durch HARD_MAX_VECTORK gedeckelt). |
|
||||||
|
| **Retriever** | NdjsonHybridRetriever | isListQuery() | Heuristik | Aktiviert Dokument-Ranking statt reinem Chunk-Ranking. |
|
||||||
|
| **Retriever** | NdjsonHybridRetriever | Dedup-Normalisierung | whitespace-normalized | Entfernt Duplikate im finalen Chunk-Set. |
|
||||||
|
| **Tags** | TagRoutingService | DEFAULT_TOPK | 8 | Anzahl der geprüften Tag-Vector-Hits. |
|
||||||
|
| **Tags** | TagRoutingService | MIN_BEST_SCORE | 0.25 (zuvor 0.10) | Ab welchem Tag-Score ein Bonus aktiviert wird. |
|
||||||
|
| **Tags** | TagRoutingService | MAX_CANDIDATE_DOCS | 200 | Maximale Anzahl Dokumente, die als Tag-Kandidaten gelten dürfen. |
|
||||||
|
| **Tags** | NdjsonHybridRetriever | TAG_SCORE_BONUS | 0.08 | Bonus auf Vector-Score bei Tag-Match (nur Ranking, kein Gate). |
|
||||||
|
| **Query** | QueryCleaner | clean($prompt) | implizit | Beeinflusst Embedding stark (Token-Normalisierung/Entfernung). |
|
||||||
|
| **Vector** | VectorSearchClient | search($query, topK) | implizit | Liefert Roh-Scores und Trefferverteilung (Basis des Rankings). |
|
||||||
|
| **Tag Vector** | TagVectorSearchClient | search($query, DEFAULT_TOPK) | implizit | Bestimmt, ob und welche Tags matchen (Bonus-Aktivierung). |
|
||||||
|
|
||||||
|
|
||||||
|
# Tabelle 2: Auswirkungen bei Änderung der Parameter
|
||||||
|
|
||||||
|
| Parameter | Wenn erhöht | Wenn gesenkt | Typischer Effekt / Risiko |
|
||||||
|
|---|---|---|---|
|
||||||
|
| retrievalMaxChunks | Mehr Kontext, höhere Antworttiefe | Kompaktere Antworten, evtl. Wissensverlust | Zu hoch → Token/Noise-Risiko |
|
||||||
|
| HARD_MAX_CHUNKS | Erlaubt größere Kontexte | Strengeres Kontext-Limit | Sicherheitsparameter |
|
||||||
|
| retrievalVectorTopK | Mehr Recall, breitere Kandidatenbasis | Weniger Recall, präziser aber evtl. Lücken | Zu hoch → mehr Noise |
|
||||||
|
| HARD_MAX_VECTORK | Größere Suchräume möglich | Strenger begrenzt | Sicherheitsparameter |
|
||||||
|
| VECTOR_SCORE_THRESHOLD | Höhere Präzision, weniger schwache Treffer | Mehr Treffer, aber mehr Rauschen | Zu niedrig → Bonus wirkt stärker |
|
||||||
|
| List-Mode TopK | Bessere Listenabdeckung | Listen evtl. unvollständig | Zu hoch → Noise |
|
||||||
|
| isListQuery | Häufigerer Dokumentmodus | Seltener Dokumentmodus | Fehlklassifikation möglich |
|
||||||
|
| QueryCleaner Aggressivität | Stabilere Suche, weniger Noise | Mehr Originalbegriffe | Zu aggressiv → Informationsverlust |
|
||||||
|
| DEFAULT_TOPK (Tags) | Mehr Tag-Kandidaten | Weniger Tag-Kandidaten | Zu hoch → Bonus häufiger aktiv |
|
||||||
|
| MIN_BEST_SCORE | Bonus seltener (nur starke Tag-Matches) | Bonus häufiger (auch schwache Matches) | Haupthebel gegen „Tags zu mächtig“ |
|
||||||
|
| MAX_CANDIDATE_DOCS | Mehr Dokumente erhalten Bonus | Weniger Dokumente erhalten Bonus | Zu hoch → Bonus verwässert |
|
||||||
|
| TAG_SCORE_BONUS | Tags pushen Ranking stärker | Tags pushen kaum | Zu hoch → Dominanz-Risiko |
|
||||||
|
| Dedup-Normalisierung | Weniger Dopplungen | Mehr Redundanz | Beeinflusst Vielfalt, nicht Relevanz |
|
||||||
@@ -14,11 +14,16 @@ use App\Vector\VectorSearchClient;
|
|||||||
final class NdjsonHybridRetriever implements RetrieverInterface
|
final class NdjsonHybridRetriever implements RetrieverInterface
|
||||||
{
|
{
|
||||||
private const VECTOR_SCORE_THRESHOLD = 0.4;
|
private const VECTOR_SCORE_THRESHOLD = 0.4;
|
||||||
private const VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED = 3;
|
|
||||||
|
|
||||||
private const HARD_MAX_CHUNKS = 200;
|
private const HARD_MAX_CHUNKS = 200;
|
||||||
private const HARD_MAX_VECTORK = 200;
|
private const HARD_MAX_VECTORK = 200;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter).
|
||||||
|
* Enterprise Default: klein halten, sonst dominieren Tags wieder.
|
||||||
|
*/
|
||||||
|
private const TAG_SCORE_BONUS = 0.08;
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly ChunkManager $chunkManager,
|
private readonly ChunkManager $chunkManager,
|
||||||
private readonly NdjsonChunkLookup $lookup,
|
private readonly NdjsonChunkLookup $lookup,
|
||||||
@@ -61,7 +66,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// 1) Tag Routing (bereinigte Query)
|
// 1) Tag Routing (bereinigte Query) -> NUR Bonus
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
||||||
$candidateSet = null;
|
$candidateSet = null;
|
||||||
@@ -75,87 +80,115 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
$topK = $vectorTopKBase;
|
$topK = $vectorTopKBase;
|
||||||
|
|
||||||
|
// List mode: höhere Abdeckung, um mehr Dokumente zu ranken
|
||||||
if ($isListQuery) {
|
if ($isListQuery) {
|
||||||
$topK = max($vectorTopKBase * 3, 80);
|
$topK = max($vectorTopKBase * 3, 80);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($candidateSet !== null) {
|
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
|
||||||
$topK = min(
|
|
||||||
max($topK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $topK),
|
|
||||||
self::HARD_MAX_VECTORK
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// 3) Vector Search (bereinigte Query; scoped wenn möglich)
|
// 3) Vector Search (immer GLOBAL; Tags sind KEIN Filter)
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
if ($candidateSet !== null) {
|
$hits = $this->vectorClient->search($cleanQuery, $topK);
|
||||||
$hits = $this->vectorClient->searchScoped(
|
|
||||||
$cleanQuery,
|
|
||||||
$topK,
|
|
||||||
array_keys($candidateSet)
|
|
||||||
);
|
|
||||||
|
|
||||||
// Wenn scoped nichts liefert → global fallback
|
|
||||||
if ($hits === []) {
|
|
||||||
$hits = $this->vectorClient->search($cleanQuery, $vectorTopKBase);
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
|
||||||
$hits = $this->vectorClient->search($cleanQuery, $topK);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($hits === []) {
|
if ($hits === []) {
|
||||||
return $candidateSet !== null
|
// Tags dürfen NICHT als Fallback wirken (sonst wieder zu mächtig)
|
||||||
? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit)
|
return [];
|
||||||
: [];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// 4) ChunkIds + Lookup
|
// 4) ChunkIds + Scores sammeln (raw)
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
$chunkIds = [];
|
/** @var array<string,float> $rawScoreByChunkId */
|
||||||
|
$rawScoreByChunkId = [];
|
||||||
|
|
||||||
foreach ($hits as $hit) {
|
foreach ($hits as $hit) {
|
||||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
if (!isset($hit['chunk_id'], $hit['score'])) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
|
$raw = (float)$hit['score'];
|
||||||
|
|
||||||
|
// Threshold wird auf RAW Score angewendet (Qualitätsgate)
|
||||||
|
if ($raw < self::VECTOR_SCORE_THRESHOLD) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$chunkIds[] = (string)$hit['chunk_id'];
|
$chunkId = (string)$hit['chunk_id'];
|
||||||
|
|
||||||
|
// Falls mehrfach: den besten raw score behalten
|
||||||
|
if (!isset($rawScoreByChunkId[$chunkId]) || $raw > $rawScoreByChunkId[$chunkId]) {
|
||||||
|
$rawScoreByChunkId[$chunkId] = $raw;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($chunkIds === []) {
|
if ($rawScoreByChunkId === []) {
|
||||||
return $candidateSet !== null
|
return [];
|
||||||
? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit)
|
|
||||||
: [];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$rows = $this->lookup->findByChunkIds($chunkIds);
|
// Lookup liefert docId + Text etc.
|
||||||
|
$rows = $this->lookup->findByChunkIds(array_keys($rawScoreByChunkId));
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// 5) Listenmodus → Dokument-Ranking
|
// 5) Adjusted Score (Tag Bonus) + Ranking
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
if ($isListQuery && $candidateSet !== null) {
|
/** @var array<string,float> $adjScoreByChunkId */
|
||||||
|
$adjScoreByChunkId = [];
|
||||||
|
|
||||||
$rankedDocIds = $this->rankDocumentsFromHits($hits, $rows, $candidateSet);
|
foreach ($rawScoreByChunkId as $chunkId => $rawScore) {
|
||||||
|
if (!isset($rows[$chunkId])) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
$adj = $rawScore;
|
||||||
|
|
||||||
|
if ($candidateSet !== null) {
|
||||||
|
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||||
|
if (is_string($docId) && isset($candidateSet[$docId])) {
|
||||||
|
$adj += self::TAG_SCORE_BONUS;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$adjScoreByChunkId[$chunkId] = $adj;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($adjScoreByChunkId === []) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort: adjusted desc, deterministic tie-break by chunkId
|
||||||
|
uksort($adjScoreByChunkId, static function (string $a, string $b) use ($adjScoreByChunkId): int {
|
||||||
|
$sa = $adjScoreByChunkId[$a];
|
||||||
|
$sb = $adjScoreByChunkId[$b];
|
||||||
|
|
||||||
|
if ($sa === $sb) {
|
||||||
|
return $a <=> $b;
|
||||||
|
}
|
||||||
|
return ($sb <=> $sa);
|
||||||
|
});
|
||||||
|
|
||||||
|
$rankedChunkIds = array_keys($adjScoreByChunkId);
|
||||||
|
|
||||||
|
// -------------------------------------------------
|
||||||
|
// 6) Listenmodus → Dokument-Ranking (mit Tag-Bonus in Scores)
|
||||||
|
// -------------------------------------------------
|
||||||
|
if ($isListQuery) {
|
||||||
|
$rankedDocIds = $this->rankDocumentsFromAdjustedScores($adjScoreByChunkId, $rows);
|
||||||
|
|
||||||
if ($rankedDocIds === []) {
|
if ($rankedDocIds === []) {
|
||||||
return $this->fallbackChunksFromCandidateDocs($candidateSet, $limit);
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
$topDocIds = array_slice($rankedDocIds, 0, $limit);
|
$topDocIds = array_slice($rankedDocIds, 0, $limit);
|
||||||
|
|
||||||
return $this->collectBestChunkPerDocument($topDocIds, $hits, $rows);
|
return $this->collectBestChunkPerDocumentAdjusted($topDocIds, $adjScoreByChunkId, $rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// 6) Normaler Chunk-Modus
|
// 7) Normaler Chunk-Modus (nach adjusted Ranking)
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
return $this->collectTexts($chunkIds, $rows, $limit);
|
return $this->collectTexts($rankedChunkIds, $rows, $limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
// =========================================================
|
// =========================================================
|
||||||
@@ -174,30 +207,29 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
}
|
}
|
||||||
|
|
||||||
// =========================================================
|
// =========================================================
|
||||||
// DOCUMENT RANKING
|
// DOCUMENT RANKING (Adjusted Scores incl. Tag Bonus)
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
private function rankDocumentsFromHits(
|
/**
|
||||||
array $hits,
|
* @param array<string,float> $adjScoreByChunkId
|
||||||
array $rows,
|
* @param array<string,array<string,mixed>> $rows
|
||||||
array $candidateSet
|
* @return string[]
|
||||||
): array {
|
*/
|
||||||
|
private function rankDocumentsFromAdjustedScores(array $adjScoreByChunkId, array $rows): array
|
||||||
|
{
|
||||||
$documentScores = [];
|
$documentScores = [];
|
||||||
|
|
||||||
foreach ($hits as $hit) {
|
foreach ($adjScoreByChunkId as $chunkId => $score) {
|
||||||
$chunkId = (string)($hit['chunk_id'] ?? '');
|
|
||||||
|
|
||||||
if (!isset($rows[$chunkId])) {
|
if (!isset($rows[$chunkId])) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||||
|
if (!is_string($docId) || $docId === '') {
|
||||||
if (!is_string($docId) || !isset($candidateSet[$docId])) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$documentScores[$docId][] = (float)$hit['score'];
|
$documentScores[$docId][] = (float)$score;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($documentScores === []) {
|
if ($documentScores === []) {
|
||||||
@@ -217,21 +249,21 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return array_keys($ranked);
|
return array_keys($ranked);
|
||||||
}
|
}
|
||||||
|
|
||||||
private function collectBestChunkPerDocument(
|
/**
|
||||||
array $docIds,
|
* @param string[] $docIds
|
||||||
array $hits,
|
* @param array<string,float> $adjScoreByChunkId
|
||||||
array $rows
|
* @param array<string,array<string,mixed>> $rows
|
||||||
): array {
|
* @return string[]
|
||||||
|
*/
|
||||||
|
private function collectBestChunkPerDocumentAdjusted(array $docIds, array $adjScoreByChunkId, array $rows): array
|
||||||
|
{
|
||||||
$result = [];
|
$result = [];
|
||||||
|
|
||||||
foreach ($docIds as $docId) {
|
foreach ($docIds as $docId) {
|
||||||
|
|
||||||
$bestScore = -INF;
|
$bestScore = -INF;
|
||||||
$bestText = null;
|
$bestText = null;
|
||||||
|
|
||||||
foreach ($hits as $hit) {
|
foreach ($adjScoreByChunkId as $chunkId => $score) {
|
||||||
$chunkId = (string)($hit['chunk_id'] ?? '');
|
|
||||||
|
|
||||||
if (!isset($rows[$chunkId])) {
|
if (!isset($rows[$chunkId])) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -240,8 +272,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((float)$hit['score'] > $bestScore) {
|
if ((float)$score > $bestScore) {
|
||||||
$bestScore = (float)$hit['score'];
|
$bestScore = (float)$score;
|
||||||
$bestText = $rows[$chunkId]['text'] ?? null;
|
$bestText = $rows[$chunkId]['text'] ?? null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -255,45 +287,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
}
|
}
|
||||||
|
|
||||||
// =========================================================
|
// =========================================================
|
||||||
// FALLBACK + NORMAL MODE
|
// NORMAL MODE
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
private function fallbackChunksFromCandidateDocs(array $candidateSet, int $limit): array
|
|
||||||
{
|
|
||||||
$seen = [];
|
|
||||||
$out = [];
|
|
||||||
|
|
||||||
foreach ($this->chunkManager->streamAll() as $row) {
|
|
||||||
$docId = $row['document_id'] ?? null;
|
|
||||||
|
|
||||||
if (!is_string($docId) || !isset($candidateSet[$docId])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$text = $row['text'] ?? null;
|
|
||||||
|
|
||||||
if (!is_string($text) || $text === '') {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$chunk = trim($text);
|
|
||||||
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
|
|
||||||
|
|
||||||
if (isset($seen[$key])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$seen[$key] = true;
|
|
||||||
$out[] = $chunk;
|
|
||||||
|
|
||||||
if (\count($out) >= $limit) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return $out;
|
|
||||||
}
|
|
||||||
|
|
||||||
private function collectTexts(array $chunkIds, array $rows, int $limit): array
|
private function collectTexts(array $chunkIds, array $rows, int $limit): array
|
||||||
{
|
{
|
||||||
$seen = [];
|
$seen = [];
|
||||||
@@ -304,7 +300,11 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$chunk = trim($rows[$id]['text']);
|
$chunk = trim((string)$rows[$id]['text']);
|
||||||
|
if ($chunk === '') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
|
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
|
||||||
|
|
||||||
if (isset($seen[$key])) {
|
if (isset($seen[$key])) {
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ use Symfony\Component\Uid\Uuid;
|
|||||||
final class TagRoutingService
|
final class TagRoutingService
|
||||||
{
|
{
|
||||||
private const DEFAULT_TOPK = 8;
|
private const DEFAULT_TOPK = 8;
|
||||||
private const MIN_BEST_SCORE = 0.10;
|
private const MIN_BEST_SCORE = 0.25;
|
||||||
private const MAX_CANDIDATE_DOCS = 200;
|
private const MAX_CANDIDATE_DOCS = 200;
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
|
|||||||
Reference in New Issue
Block a user