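Add new configs: move NdjsonHybridRetriever tuning constants into NdjsonHybridRetrieverConfig and declare the class readonly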
This commit is contained in:
@@ -5,6 +5,7 @@ declare(strict_types=1);
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Catalog\EntityCatalogService;
|
||||
use App\Config\NdjsonHybridRetrieverConfig;
|
||||
use App\Entity\ModelGenerationConfig;
|
||||
use App\Intent\CatalogIntentLite;
|
||||
use App\Intent\IntentLite;
|
||||
@@ -14,35 +15,20 @@ use App\Routing\IntentRouteResolver;
|
||||
use App\Tag\TagRoutingService;
|
||||
use App\Vector\VectorSearchClient;
|
||||
|
||||
final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
private const VECTOR_SCORE_THRESHOLD = 0.75;
|
||||
|
||||
private const HARD_MAX_CHUNKS = 90;
|
||||
private const HARD_MAX_VECTORK = 250;
|
||||
|
||||
private const LIST_BONUS = 1.25;
|
||||
|
||||
private const MAX_CHUNKS_PER_DOC = 2;
|
||||
private const MIN_CHUNK_DISTANCE = 2.5;
|
||||
private const RRF_K = 60;
|
||||
|
||||
private const THRESHOLD_FLOOR = 0.83;
|
||||
private const THRESHOLD_CEIL = 0.92;
|
||||
private const EMPTY_RRF_FALLBACK_TOPN = 1;
|
||||
|
||||
public function __construct(
|
||||
private readonly NdjsonChunkLookup $lookup,
|
||||
private readonly VectorSearchClient $vectorClient,
|
||||
private readonly TagRoutingService $tagRouting,
|
||||
private readonly ModelGenerationConfigRepository $configRepository,
|
||||
private readonly QueryCleaner $queryCleaner,
|
||||
private readonly IntentLite $intentLite,
|
||||
private readonly SalesIntentLite $salesIntentLite,
|
||||
private readonly CatalogIntentLite $catalogIntent,
|
||||
private readonly IntentRouteResolver $routeResolver,
|
||||
private readonly EntityCatalogService $entityCatalogService,
|
||||
private readonly QueryEnricher $queryEnricher,
|
||||
private NdjsonChunkLookup $lookup,
|
||||
private VectorSearchClient $vectorClient,
|
||||
private TagRoutingService $tagRouting,
|
||||
private ModelGenerationConfigRepository $configRepository,
|
||||
private QueryCleaner $queryCleaner,
|
||||
private IntentLite $intentLite,
|
||||
private SalesIntentLite $salesIntentLite,
|
||||
private CatalogIntentLite $catalogIntent,
|
||||
private IntentRouteResolver $routeResolver,
|
||||
private EntityCatalogService $entityCatalogService,
|
||||
private QueryEnricher $queryEnricher,
|
||||
)
|
||||
{
|
||||
}
|
||||
@@ -206,8 +192,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
): array
|
||||
{
|
||||
|
||||
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
|
||||
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
|
||||
$limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS));
|
||||
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
|
||||
|
||||
$isListQuery = $this->intentLite->isListQuery($prompt);
|
||||
|
||||
@@ -218,7 +204,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return [
|
||||
'limit' => $limit,
|
||||
'is_list_query' => $isListQuery,
|
||||
'threshold' => self::VECTOR_SCORE_THRESHOLD,
|
||||
'threshold' => NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD,
|
||||
'ranked_chunk_ids' => [],
|
||||
'rows' => [],
|
||||
'rrf_scores' => [],
|
||||
@@ -270,7 +256,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
if ($rrfScores === [] && $globalHits !== []) {
|
||||
$rrfScores = $this->fallbackRrfFromHits(
|
||||
$globalHits,
|
||||
self::EMPTY_RRF_FALLBACK_TOPN
|
||||
NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN
|
||||
);
|
||||
}
|
||||
|
||||
@@ -327,7 +313,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
): array
|
||||
{
|
||||
|
||||
$threshold = self::VECTOR_SCORE_THRESHOLD;
|
||||
$threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD;
|
||||
$topK = $vectorTopKBase;
|
||||
|
||||
if (
|
||||
@@ -338,11 +324,11 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
if ($isListQuery) {
|
||||
$topK = (int)round($topK * self::LIST_BONUS);
|
||||
$topK = (int)round($topK * NdjsonHybridRetrieverConfig::LIST_BONUS);
|
||||
}
|
||||
|
||||
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
|
||||
$threshold = max(self::THRESHOLD_FLOOR, min(self::THRESHOLD_CEIL, $threshold));
|
||||
$topK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
|
||||
$threshold = max(NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR, min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold));
|
||||
|
||||
return [$threshold, $topK];
|
||||
}
|
||||
@@ -382,7 +368,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
$rank++;
|
||||
$rrf = 1.0 / (self::RRF_K + $rank);
|
||||
$rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
|
||||
|
||||
if ($boost) {
|
||||
$rrf *= 1.2;
|
||||
@@ -413,7 +399,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
$rank++;
|
||||
$rrf[(string)$hit['chunk_id']] = 1.0 / (self::RRF_K + $rank);
|
||||
$rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
|
||||
|
||||
if ($rank >= $topN) {
|
||||
break;
|
||||
@@ -475,13 +461,13 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
continue;
|
||||
}
|
||||
|
||||
if (($docCounter[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) {
|
||||
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (is_int($chunkIndex)) {
|
||||
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
|
||||
if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) {
|
||||
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
|
||||
continue 2;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user