optimize as sales rag
This commit is contained in:
@@ -37,6 +37,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// PRODUCTION (UNVERÄNDERTES VERHALTEN)
|
||||
// =========================================================
|
||||
|
||||
public function retrieve(string $prompt): array
|
||||
{
|
||||
$config = $this->configRepository->findActiveForModel();
|
||||
@@ -49,6 +53,120 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
/**
 * Runs the shared core pipeline for an explicitly supplied config and
 * returns the selected result of the matching collector.
 *
 * The core pipeline is invoked with score capture switched off. When it
 * yields no ranked chunk ids or no rows, an empty array is returned.
 * List queries are handed to collectTexts(); every other query goes
 * through collectSalesOptimized().
 *
 * @param string                $prompt Prompt passed through to runCore().
 * @param ModelGenerationConfig $config Config passed through to runCore().
 *
 * @return array Whatever the chosen collector returns (presumably the chunk
 *               texts; confirm against collectTexts()/collectSalesOptimized()).
 */
public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
{
    $pipeline = $this->runCore($prompt, $config, false);

    // Both the ranking and the row lookup must have produced something.
    $hasCandidates = $pipeline['ranked_chunk_ids'] !== [] && $pipeline['rows'] !== [];
    if (!$hasCandidates) {
        return [];
    }

    // List mode: plain text collection.
    if ($pipeline['is_list_query']) {
        return $this->collectTexts(
            $pipeline['ranked_chunk_ids'],
            $pipeline['rows'],
            $pipeline['limit']
        );
    }

    // Normal mode: sales-optimized selection.
    return $this->collectSalesOptimized(
        $pipeline['ranked_chunk_ids'],
        $pipeline['rows'],
        $pipeline['limit']
    );
}
|
||||
|
||||
// =========================================================
|
||||
// DEBUG (NEU, ABER NICHT IM PRODUKTIONS-PFAD)
|
||||
// =========================================================
|
||||
|
||||
/**
 * Debug variant of the retrieval path: returns the selected chunks together
 * with their scores and pipeline metadata.
 *
 * The shared core pipeline is run with score capture enabled. Chunk ids are
 * then chosen by selectChunkIdsListMode() for list queries and by
 * selectChunkIdsSalesMode() otherwise; those helpers are intended to mirror
 * the production collectors. An empty array is returned when the pipeline
 * yields no ranked ids or rows, or when the selector picks nothing.
 *
 * @return array<int, array{
 *   rank:int,
 *   chunk_id:string,
 *   document_id: (string|null),
 *   raw_score:(float|null),
 *   rrf_score:(float|null),
 *   threshold:float,
 *   intent:string,
 *   is_list_query:bool,
 *   text:string
 * }>
 *
 * @throws \RuntimeException When the config repository returns no active config.
 */
public function retrieveDebug(string $prompt): array
{
    $config = $this->configRepository->findActiveForModel();
    if ($config === null) {
        throw new \RuntimeException('No active ModelGenerationConfig found.');
    }

    $core = $this->runCore($prompt, $config, true);

    $hasCandidates = $core['ranked_chunk_ids'] !== [] && $core['rows'] !== [];
    if (!$hasCandidates) {
        return [];
    }

    // Step 1: pick chunk ids with the selector that matches the query mode.
    if ($core['is_list_query']) {
        $selected = $this->selectChunkIdsListMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
    } else {
        $selected = $this->selectChunkIdsSalesMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
    }

    if ($selected === []) {
        return [];
    }

    // Step 2: build one entry per selected chunk, including its scores.
    $entries = [];

    foreach ($selected as $chunkId) {
        $row = $core['rows'][$chunkId] ?? null;
        if ($row === null) {
            continue;
        }

        $documentId = $row['document_id'] ?? null;
        $rawScore = $core['raw_scores'][$chunkId] ?? null;
        $rrfScore = $core['rrf_scores'][$chunkId] ?? null;

        $entries[] = [
            // Ranks are 1-based and count only the entries actually emitted.
            'rank' => \count($entries) + 1,
            'chunk_id' => $chunkId,
            'document_id' => $documentId === null ? null : (string)$documentId,
            'raw_score' => $rawScore === null ? null : (float)$rawScore,
            'rrf_score' => $rrfScore === null ? null : (float)$rrfScore,
            'threshold' => (float)$core['threshold'],
            'intent' => (string)$core['sales_intent'],
            'is_list_query' => (bool)$core['is_list_query'],
            'text' => trim((string)($row['text'] ?? '')),
        ];
    }

    return $entries;
}
|
||||
|
||||
// =========================================================
|
||||
// CORE PIPELINE (einmalig, shared)
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* @return array{
|
||||
* limit:int,
|
||||
* is_list_query:bool,
|
||||
* sales_intent:string,
|
||||
* threshold:float,
|
||||
* topk:int,
|
||||
* ranked_chunk_ids: string[],
|
||||
* rows: array<string, array<string,mixed>>,
|
||||
* rrf_scores: array<string,float>,
|
||||
* raw_scores: array<string,float>
|
||||
* }
|
||||
*/
|
||||
private function runCore(string $prompt, ModelGenerationConfig $config, bool $withScores): array
|
||||
{
|
||||
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
|
||||
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
|
||||
@@ -61,16 +179,13 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$cleanQuery = $prompt;
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// Intent-based adjustments
|
||||
// -------------------------------------------------
|
||||
|
||||
// Intent-based adjustments (identisch zur Produktionslogik)
|
||||
$threshold = self::VECTOR_SCORE_THRESHOLD;
|
||||
$topK = $vectorTopKBase;
|
||||
|
||||
switch ($salesIntent) {
|
||||
case SalesIntentLite::PRICING:
|
||||
$threshold += 0.02; // more precision
|
||||
$threshold += 0.02;
|
||||
break;
|
||||
|
||||
case SalesIntentLite::COMPARISON:
|
||||
@@ -101,10 +216,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
|
||||
|
||||
// -------------------------------------------------
|
||||
// Tag routing
|
||||
// -------------------------------------------------
|
||||
|
||||
// Tag routing (identisch)
|
||||
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
||||
$candidateSet = null;
|
||||
|
||||
@@ -112,63 +224,228 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$candidateSet = array_fill_keys($candidateDocIds, true);
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// Dual search
|
||||
// -------------------------------------------------
|
||||
|
||||
// Dual search (identisch)
|
||||
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
|
||||
|
||||
$scopedHits = [];
|
||||
if ($candidateSet !== null) {
|
||||
$scopedHits = $this->vectorClient->searchScoped(
|
||||
$cleanQuery,
|
||||
$topK,
|
||||
array_keys($candidateSet)
|
||||
);
|
||||
$scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, array_keys($candidateSet));
|
||||
}
|
||||
|
||||
if ($globalHits === [] && $scopedHits === []) {
|
||||
return [];
|
||||
return [
|
||||
'limit' => $limit,
|
||||
'is_list_query' => $isListQuery,
|
||||
'sales_intent' => (string)$salesIntent,
|
||||
'threshold' => $threshold,
|
||||
'topk' => $topK,
|
||||
'ranked_chunk_ids' => [],
|
||||
'rows' => [],
|
||||
'rrf_scores' => [],
|
||||
'raw_scores' => [],
|
||||
];
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// RRF Fusion
|
||||
// -------------------------------------------------
|
||||
|
||||
$rrfScores = [];
|
||||
$rawScores = [];
|
||||
|
||||
$this->applyRrf($globalHits, $rrfScores, $threshold);
|
||||
$this->applyRrf($scopedHits, $rrfScores, $threshold, $salesIntent === SalesIntentLite::OBJECTION);
|
||||
// RRF (identisch) + optional raw capture
|
||||
$this->applyRrfWithOptionalRaw($globalHits, $rrfScores, $rawScores, $threshold, false, $withScores);
|
||||
$this->applyRrfWithOptionalRaw(
|
||||
$scopedHits,
|
||||
$rrfScores,
|
||||
$rawScores,
|
||||
$threshold,
|
||||
$salesIntent === SalesIntentLite::OBJECTION,
|
||||
$withScores
|
||||
);
|
||||
|
||||
if ($rrfScores === []) {
|
||||
return [];
|
||||
return [
|
||||
'limit' => $limit,
|
||||
'is_list_query' => $isListQuery,
|
||||
'sales_intent' => (string)$salesIntent,
|
||||
'threshold' => $threshold,
|
||||
'topk' => $topK,
|
||||
'ranked_chunk_ids' => [],
|
||||
'rows' => [],
|
||||
'rrf_scores' => [],
|
||||
'raw_scores' => $rawScores,
|
||||
];
|
||||
}
|
||||
|
||||
arsort($rrfScores);
|
||||
$rankedChunkIds = array_keys($rrfScores);
|
||||
|
||||
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
|
||||
if ($rows === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
if (!$isListQuery) {
|
||||
return $this->collectSalesOptimized(
|
||||
$rankedChunkIds,
|
||||
$rows,
|
||||
$limit
|
||||
);
|
||||
}
|
||||
|
||||
return $this->collectTexts($rankedChunkIds, $rows, $limit);
|
||||
return [
|
||||
'limit' => $limit,
|
||||
'is_list_query' => $isListQuery,
|
||||
'sales_intent' => (string)$salesIntent,
|
||||
'threshold' => $threshold,
|
||||
'topk' => $topK,
|
||||
'ranked_chunk_ids' => $rankedChunkIds,
|
||||
'rows' => $rows,
|
||||
'rrf_scores' => $rrfScores,
|
||||
'raw_scores' => $rawScores,
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Accumulates reciprocal-rank-fusion scores for a hit list and, on request,
 * records the best raw score seen per chunk.
 *
 * Hits lacking `chunk_id` or `score`, and hits whose score is below the
 * threshold, are ignored and do not consume a rank. Each remaining hit adds
 * 1 / (RRF_K + rank) to its chunk's entry in $rrfScores, multiplied by 1.2
 * when $boost is set.
 *
 * @param array<int, array{chunk_id:string, score:float}> $hits
 * @param array<string,float> $rrfScores Updated in place.
 * @param array<string,float> $rawScores Updated in place when $captureRaw is true.
 * @param float $threshold  Minimum raw score a hit needs to be counted.
 * @param bool  $boost      Whether to scale each contribution by 1.2.
 * @param bool  $captureRaw Whether to track raw scores in $rawScores.
 */
private function applyRrfWithOptionalRaw(
    array $hits,
    array &$rrfScores,
    array &$rawScores,
    float $threshold,
    bool $boost = false,
    bool $captureRaw = false
): void {
    $position = 0;

    foreach ($hits as $hit) {
        if (!isset($hit['chunk_id'], $hit['score'])) {
            continue;
        }

        $score = (float)$hit['score'];
        if ($score < $threshold) {
            continue;
        }

        $id = (string)$hit['chunk_id'];

        if ($captureRaw) {
            // A chunk may appear in several hit lists: keep its highest raw score.
            $known = $rawScores[$id] ?? null;
            if ($known === null || $score > $known) {
                $rawScores[$id] = $score;
            }
        }

        // Only hits that passed the filters above advance the rank.
        $position++;

        $base = 1 / (self::RRF_K + $position);
        $contribution = $boost ? $base * 1.2 : $base;

        $rrfScores[$id] = ($rrfScores[$id] ?? 0.0) + $contribution;
    }
}
|
||||
|
||||
// =========================================================
|
||||
// DEBUG SELECTION HELPERS (identisch zu Produktionsregeln)
|
||||
// =========================================================
|
||||
|
||||
/**
 * List-mode selection that returns chunk ids instead of texts.
 *
 * Walks the ids in the given order and keeps an id when its row has a
 * non-empty (after trim) `text` whose normalized form has not been seen
 * before. Normalization collapses whitespace runs to a single space and
 * lower-cases the text. The loop stops as soon as the output holds at least
 * $limit ids; the check runs after appending, so at least one id is
 * returned whenever any candidate qualifies.
 *
 * Intended to follow the same rules as collectTexts().
 *
 * @return string[]
 */
private function selectChunkIdsListMode(array $chunkIds, array $rows, int $limit): array
{
    $selected = [];
    $fingerprints = [];

    foreach ($chunkIds as $candidate) {
        if (!isset($rows[$candidate]['text'])) {
            continue;
        }

        $text = trim((string)$rows[$candidate]['text']);
        if ($text === '') {
            continue;
        }

        // Duplicate detection ignores case and whitespace differences.
        $fingerprint = mb_strtolower((string)preg_replace('/\s+/u', ' ', $text));

        if (!isset($fingerprints[$fingerprint])) {
            $fingerprints[$fingerprint] = true;
            $selected[] = (string)$candidate;

            if (\count($selected) >= $limit) {
                break;
            }
        }
    }

    return $selected;
}
|
||||
|
||||
/**
 * Normal-mode (sales) selection that returns chunk ids instead of texts.
 *
 * Walks the ids in the given order and keeps an id when all of these hold:
 *  - its row has a `text` entry and a string `document_id`;
 *  - fewer than MAX_CHUNKS_PER_DOC chunks of that document were kept so far;
 *  - if `chunk_index` is an int, it is at least MIN_CHUNK_DISTANCE away from
 *    every index already recorded for that document;
 *  - the trimmed text is not empty.
 * The loop stops once the output holds at least $limit ids.
 *
 * Intended to follow the same rules as collectSalesOptimized().
 *
 * @return string[]
 */
private function selectChunkIdsSalesMode(array $chunkIds, array $rows, int $limit): array
{
    $picked = [];
    $perDocCount = [];
    $perDocIndexes = [];

    foreach ($chunkIds as $chunkId) {
        if (!isset($rows[$chunkId]['text'])) {
            continue;
        }

        $row = $rows[$chunkId];
        $docId = $row['document_id'] ?? null;
        $position = $row['chunk_index'] ?? null;

        if (!is_string($docId) || ($perDocCount[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) {
            continue;
        }

        if (is_int($position)) {
            $tooClose = false;
            foreach ($perDocIndexes[$docId] ?? [] as $taken) {
                if (abs($taken - $position) < self::MIN_CHUNK_DISTANCE) {
                    $tooClose = true;
                    break;
                }
            }

            if ($tooClose) {
                continue;
            }

            // Note: the index is recorded here, before the empty-text check
            // below, so a chunk later rejected for empty text still blocks
            // its neighbours.
            $perDocIndexes[$docId][] = $position;
        }

        $text = trim((string)$row['text']);
        if ($text === '') {
            continue;
        }

        $picked[] = (string)$chunkId;
        $perDocCount[$docId] = ($perDocCount[$docId] ?? 0) + 1;

        if (\count($picked) >= $limit) {
            break;
        }
    }

    return $picked;
}
|
||||
|
||||
// =========================================================
|
||||
// ORIGINAL METHODS (UNVERÄNDERT)
|
||||
// =========================================================
|
||||
|
||||
private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void
|
||||
{
|
||||
$rank = 0;
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
|
||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
||||
continue;
|
||||
}
|
||||
@@ -184,7 +461,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$rrf = 1 / (self::RRF_K + $rank);
|
||||
|
||||
if ($boost) {
|
||||
$rrf *= 1.2; // scoped boost for objections
|
||||
$rrf *= 1.2;
|
||||
}
|
||||
|
||||
if (!isset($rrfScores[$chunkId])) {
|
||||
@@ -202,7 +479,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$docChunkPositions = [];
|
||||
|
||||
foreach ($chunkIds as $chunkId) {
|
||||
|
||||
if (!isset($rows[$chunkId]['text'])) {
|
||||
continue;
|
||||
}
|
||||
@@ -236,7 +512,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$out[] = $text;
|
||||
$docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
|
||||
|
||||
if (count($out) >= $limit) {
|
||||
if (\count($out) >= $limit) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -250,7 +526,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$out = [];
|
||||
|
||||
foreach ($chunkIds as $id) {
|
||||
|
||||
if (!isset($rows[$id]['text'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user