optimize as sales rag
This commit is contained in:
@@ -41,6 +41,7 @@ final readonly class KnowledgeIngestService
|
||||
foreach ($chunks as $chunkText) {
|
||||
|
||||
if ($title !== '' && !str_starts_with($chunkText, $title)) {
|
||||
//title with backticks
|
||||
$chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Entity\ModelGenerationConfig;
|
||||
use App\Intent\IntentLite;
|
||||
use App\Knowledge\ChunkManager;
|
||||
use App\Intent\SalesIntentLite;
|
||||
use App\Knowledge\QueryCleaner;
|
||||
use App\Repository\ModelGenerationConfigRepository;
|
||||
use App\Tag\TagRoutingService;
|
||||
@@ -21,11 +21,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
private const LIST_BONUS = 1.5;
|
||||
|
||||
/**
|
||||
* Tags must only provide a small bonus (never act as a gate/filter).
|
||||
* Enterprise default: keep it low, otherwise tags will dominate ranking again.
|
||||
*/
|
||||
private const TAG_SCORE_BONUS = 0.1 * (1 - self::VECTOR_SCORE_THRESHOLD);
|
||||
private const MAX_CHUNKS_PER_DOC = 2;
|
||||
private const MIN_CHUNK_DISTANCE = 2;
|
||||
private const RRF_K = 60;
|
||||
|
||||
public function __construct(
|
||||
private readonly NdjsonChunkLookup $lookup,
|
||||
@@ -33,7 +31,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
private readonly TagRoutingService $tagRouting,
|
||||
private readonly ModelGenerationConfigRepository $configRepository,
|
||||
private readonly QueryCleaner $queryCleaner,
|
||||
private readonly IntentLite $intentLite
|
||||
private readonly IntentLite $intentLite,
|
||||
private readonly SalesIntentLite $salesIntentLite
|
||||
)
|
||||
{
|
||||
}
|
||||
@@ -49,27 +48,63 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $this->retrieveInternal($prompt, $config);
|
||||
}
|
||||
|
||||
|
||||
public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
|
||||
{
|
||||
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
|
||||
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
|
||||
|
||||
// Important: list-intent detection must run on the original prompt
|
||||
// (cleaning might remove "show/list" etc.).
|
||||
$isListQuery = $this->intentLite->isListQuery($prompt);
|
||||
$salesIntent = $this->salesIntentLite->detect($prompt)['intent'];
|
||||
|
||||
// -------------------------------------------------
|
||||
// CLEAN QUERY (retrieval-only: tag routing + vector search)
|
||||
// -------------------------------------------------
|
||||
$cleanQuery = $this->queryCleaner->clean($prompt);
|
||||
if ($cleanQuery === '') {
|
||||
$cleanQuery = $prompt;
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// 1) Tag routing (cleaned query) -> bonus only
|
||||
// Intent-based adjustments
|
||||
// -------------------------------------------------
|
||||
|
||||
$threshold = self::VECTOR_SCORE_THRESHOLD;
|
||||
$topK = $vectorTopKBase;
|
||||
|
||||
switch ($salesIntent) {
|
||||
case SalesIntentLite::PRICING:
|
||||
$threshold += 0.02; // more precision
|
||||
break;
|
||||
|
||||
case SalesIntentLite::COMPARISON:
|
||||
$topK = (int)round($vectorTopKBase * 1.4);
|
||||
break;
|
||||
|
||||
case SalesIntentLite::OBJECTION:
|
||||
$threshold -= 0.02;
|
||||
break;
|
||||
|
||||
case SalesIntentLite::IMPLEMENTATION:
|
||||
$topK = (int)round($vectorTopKBase * 1.3);
|
||||
break;
|
||||
|
||||
case SalesIntentLite::ROI:
|
||||
$topK = (int)round($vectorTopKBase * 1.2);
|
||||
break;
|
||||
|
||||
case SalesIntentLite::DISCOVERY:
|
||||
default:
|
||||
$threshold -= 0.03;
|
||||
break;
|
||||
}
|
||||
|
||||
if ($isListQuery) {
|
||||
$topK = (int)round($topK * self::LIST_BONUS);
|
||||
}
|
||||
|
||||
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
|
||||
|
||||
// -------------------------------------------------
|
||||
// Tag routing
|
||||
// -------------------------------------------------
|
||||
|
||||
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
||||
$candidateSet = null;
|
||||
|
||||
@@ -78,215 +113,144 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// 2) Determine TopK
|
||||
// Dual search
|
||||
// -------------------------------------------------
|
||||
$topK = $vectorTopKBase;
|
||||
|
||||
// List mode: increase coverage to rank more documents
|
||||
if ($isListQuery) {
|
||||
$topK = (int)round($vectorTopKBase * self::LIST_BONUS);
|
||||
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
|
||||
|
||||
$scopedHits = [];
|
||||
if ($candidateSet !== null) {
|
||||
$scopedHits = $this->vectorClient->searchScoped(
|
||||
$cleanQuery,
|
||||
$topK,
|
||||
array_keys($candidateSet)
|
||||
);
|
||||
}
|
||||
|
||||
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
|
||||
|
||||
// -------------------------------------------------
|
||||
// 3) Vector search (always GLOBAL; tags are NOT a filter)
|
||||
// -------------------------------------------------
|
||||
$hits = $this->vectorClient->search($cleanQuery, $topK);
|
||||
|
||||
if ($hits === []) {
|
||||
// Tags must NOT act as a fallback (otherwise they become too powerful again).
|
||||
if ($globalHits === [] && $scopedHits === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// 4) Collect chunkIds + scores (raw)
|
||||
// RRF Fusion
|
||||
// -------------------------------------------------
|
||||
/** @var array<string,float> $rawScoreByChunkId */
|
||||
$rawScoreByChunkId = [];
|
||||
|
||||
$rrfScores = [];
|
||||
|
||||
$this->applyRrf($globalHits, $rrfScores, $threshold);
|
||||
$this->applyRrf($scopedHits, $rrfScores, $threshold, $salesIntent === SalesIntentLite::OBJECTION);
|
||||
|
||||
if ($rrfScores === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
arsort($rrfScores);
|
||||
$rankedChunkIds = array_keys($rrfScores);
|
||||
|
||||
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
|
||||
if ($rows === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
if (!$isListQuery) {
|
||||
return $this->collectSalesOptimized(
|
||||
$rankedChunkIds,
|
||||
$rows,
|
||||
$limit
|
||||
);
|
||||
}
|
||||
|
||||
return $this->collectTexts($rankedChunkIds, $rows, $limit);
|
||||
}
|
||||
|
||||
private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void
|
||||
{
|
||||
$rank = 0;
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
|
||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$raw = (float)$hit['score'];
|
||||
|
||||
// Apply the threshold to the RAW score (quality gate)
|
||||
if ($raw < self::VECTOR_SCORE_THRESHOLD) {
|
||||
if ($raw < $threshold) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$chunkId = (string)$hit['chunk_id'];
|
||||
|
||||
// If a chunk appears multiple times, keep the best raw score
|
||||
if (!isset($rawScoreByChunkId[$chunkId]) || $raw > $rawScoreByChunkId[$chunkId]) {
|
||||
$rawScoreByChunkId[$chunkId] = $raw;
|
||||
}
|
||||
}
|
||||
$rank++;
|
||||
$rrf = 1 / (self::RRF_K + $rank);
|
||||
|
||||
if ($rawScoreByChunkId === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
// Lookup returns document_id + text etc.
|
||||
$rows = $this->lookup->findByChunkIds(array_keys($rawScoreByChunkId));
|
||||
|
||||
// -------------------------------------------------
|
||||
// 5) Adjusted score (tag bonus) + ranking
|
||||
// -------------------------------------------------
|
||||
/** @var array<string,float> $adjScoreByChunkId */
|
||||
$adjScoreByChunkId = [];
|
||||
|
||||
foreach ($rawScoreByChunkId as $chunkId => $rawScore) {
|
||||
if (!isset($rows[$chunkId])) {
|
||||
continue;
|
||||
if ($boost) {
|
||||
$rrf *= 1.2; // scoped boost for objections
|
||||
}
|
||||
|
||||
$adj = $rawScore;
|
||||
|
||||
if ($candidateSet !== null) {
|
||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||
if (is_string($docId) && isset($candidateSet[$docId])) {
|
||||
$adj += self::TAG_SCORE_BONUS;
|
||||
}
|
||||
if (!isset($rrfScores[$chunkId])) {
|
||||
$rrfScores[$chunkId] = 0.0;
|
||||
}
|
||||
|
||||
$adjScoreByChunkId[$chunkId] = $adj;
|
||||
$rrfScores[$chunkId] += $rrf;
|
||||
}
|
||||
|
||||
if ($adjScoreByChunkId === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
// Sort: adjusted desc, deterministic tie-break by chunkId
|
||||
uksort($adjScoreByChunkId, static function (string $a, string $b) use ($adjScoreByChunkId): int {
|
||||
$sa = $adjScoreByChunkId[$a];
|
||||
$sb = $adjScoreByChunkId[$b];
|
||||
|
||||
if ($sa === $sb) {
|
||||
return $a <=> $b;
|
||||
}
|
||||
return ($sb <=> $sa);
|
||||
});
|
||||
|
||||
$rankedChunkIds = array_keys($adjScoreByChunkId);
|
||||
|
||||
// -------------------------------------------------
|
||||
// 6) List mode -> document ranking (with tag bonus in scores)
|
||||
// -------------------------------------------------
|
||||
if ($isListQuery) {
|
||||
$rankedDocIds = $this->rankDocumentsFromAdjustedScores($adjScoreByChunkId, $rows);
|
||||
|
||||
if ($rankedDocIds === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
$topDocIds = array_slice($rankedDocIds, 0, $limit);
|
||||
|
||||
return $this->collectBestChunkPerDocumentAdjusted($topDocIds, $adjScoreByChunkId, $rows);
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// 7) Normal chunk mode (by adjusted ranking)
|
||||
// -------------------------------------------------
|
||||
return $this->collectTexts($rankedChunkIds, $rows, $limit);
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// LIST QUERY DETECTION
|
||||
// =========================================================
|
||||
|
||||
// =========================================================
|
||||
// DOCUMENT RANKING (Adjusted scores incl. tag bonus)
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* @param array<string,float> $adjScoreByChunkId
|
||||
* @param array<string,array<string,mixed>> $rows
|
||||
* @return string[]
|
||||
*/
|
||||
private function rankDocumentsFromAdjustedScores(array $adjScoreByChunkId, array $rows): array
|
||||
private function collectSalesOptimized(array $chunkIds, array $rows, int $limit): array
|
||||
{
|
||||
$documentScores = [];
|
||||
$out = [];
|
||||
$docCounter = [];
|
||||
$docChunkPositions = [];
|
||||
|
||||
foreach ($adjScoreByChunkId as $chunkId => $score) {
|
||||
if (!isset($rows[$chunkId])) {
|
||||
foreach ($chunkIds as $chunkId) {
|
||||
|
||||
if (!isset($rows[$chunkId]['text'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||
if (!is_string($docId) || $docId === '') {
|
||||
$chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
|
||||
|
||||
if (!is_string($docId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$documentScores[$docId][] = (float)$score;
|
||||
}
|
||||
|
||||
if ($documentScores === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
$ranked = [];
|
||||
|
||||
foreach ($documentScores as $docId => $scores) {
|
||||
rsort($scores);
|
||||
$topScores = array_slice($scores, 0, 3);
|
||||
$ranked[$docId] = array_sum($topScores) / count($topScores);
|
||||
}
|
||||
|
||||
arsort($ranked);
|
||||
|
||||
return array_keys($ranked);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string[] $docIds
|
||||
* @param array<string,float> $adjScoreByChunkId
|
||||
* @param array<string,array<string,mixed>> $rows
|
||||
* @return string[]
|
||||
*/
|
||||
private function collectBestChunkPerDocumentAdjusted(array $docIds, array $adjScoreByChunkId, array $rows): array
|
||||
{
|
||||
$result = [];
|
||||
|
||||
foreach ($docIds as $docId) {
|
||||
$bestScore = -INF;
|
||||
$bestText = null;
|
||||
|
||||
foreach ($adjScoreByChunkId as $chunkId => $score) {
|
||||
if (!isset($rows[$chunkId])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (($rows[$chunkId]['document_id'] ?? null) !== $docId) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((float)$score > $bestScore) {
|
||||
$bestScore = (float)$score;
|
||||
$bestText = $rows[$chunkId]['text'] ?? null;
|
||||
}
|
||||
if (($docCounter[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (is_string($bestText) && $bestText !== '') {
|
||||
$result[] = trim($bestText);
|
||||
if (is_int($chunkIndex)) {
|
||||
$prev = $docChunkPositions[$docId] ?? [];
|
||||
foreach ($prev as $prevIdx) {
|
||||
if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) {
|
||||
continue 2;
|
||||
}
|
||||
}
|
||||
$docChunkPositions[$docId][] = $chunkIndex;
|
||||
}
|
||||
|
||||
$text = trim((string)$rows[$chunkId]['text']);
|
||||
if ($text === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$out[] = $text;
|
||||
$docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
|
||||
|
||||
if (count($out) >= $limit) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return $result;
|
||||
return $out;
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// NORMAL MODE
|
||||
// =========================================================
|
||||
|
||||
private function collectTexts(array $chunkIds, array $rows, int $limit): array
|
||||
{
|
||||
$seen = [];
|
||||
$out = [];
|
||||
|
||||
foreach ($chunkIds as $id) {
|
||||
|
||||
if (!isset($rows[$id]['text'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user