448 lines
13 KiB
PHP
448 lines
13 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Knowledge\Retrieval;
|
|
|
|
use App\Catalog\EntityCatalogService;
|
|
use App\Entity\ModelGenerationConfig;
|
|
use App\Intent\CatalogIntentLite;
|
|
use App\Intent\IntentLite;
|
|
use App\Intent\SalesIntentLite;
|
|
use App\Repository\ModelGenerationConfigRepository;
|
|
use App\Routing\IntentRouteResolver;
|
|
use App\Tag\TagRoutingService;
|
|
use App\Vector\VectorSearchClient;
|
|
|
|
final class NdjsonHybridRetriever implements RetrieverInterface
|
|
{
|
|
// --- Vector score threshold -------------------------------------------

/** Base minimum raw vector score; adjusted per intent in computeThresholdAndTopK(). */
private const VECTOR_SCORE_THRESHOLD = 0.75;

/** Lower clamp applied to the effective threshold. */
private const THRESHOLD_FLOOR = 0.65;

/** Upper clamp applied to the effective threshold. */
private const THRESHOLD_CEIL = 0.90;

// --- Hard caps on configured values -----------------------------------

/** Upper bound for the configured maximum number of returned chunks. */
private const HARD_MAX_CHUNKS = 90;

/** Upper bound for the vector search top-K, before and after the list bonus. */
private const HARD_MAX_VECTORK = 250;

/** Multiplier applied to top-K when the prompt is detected as a list query. */
private const LIST_BONUS = 1.25;

// --- Fusion and selection ---------------------------------------------

/** Constant added to the rank in the reciprocal-rank score 1 / (RRF_K + rank). */
private const RRF_K = 60;

/** Maximum chunks selectSalesChunkIds() keeps from a single document. */
private const MAX_CHUNKS_PER_DOC = 2;

/** Minimum chunk_index gap between two kept chunks of the same document. */
private const MIN_CHUNK_DISTANCE = 2;

/** Number of global hits used when no hit passes the threshold. */
private const EMPTY_RRF_FALLBACK_TOPN = 5;
|
|
|
|
/**
 * Receives every collaborator of the retrieval pipeline through promoted,
 * read-only constructor properties.
 */
public function __construct(
    private readonly NdjsonChunkLookup $lookup,
    private readonly VectorSearchClient $vectorClient,
    private readonly TagRoutingService $tagRouting,
    private readonly ModelGenerationConfigRepository $configRepository,
    private readonly QueryCleaner $queryCleaner,
    private readonly IntentLite $intentLite,
    private readonly SalesIntentLite $salesIntentLite,
    private readonly CatalogIntentLite $catalogIntent,
    private readonly IntentRouteResolver $routeResolver,
    private readonly EntityCatalogService $entityCatalogService,
) {
}
|
|
|
|
// =========================================================
|
|
// PUBLIC API
|
|
// =========================================================
|
|
|
|
/**
 * Returns the context texts for a prompt using the active generation config.
 *
 * When the pipeline produced a catalog block, that block is the only element
 * returned; otherwise the trimmed, non-empty texts of the selected chunks are
 * returned in selection order.
 *
 * @return list<string>
 *
 * @throws \RuntimeException when no active configuration exists
 */
public function retrieve(string $prompt): array
{
    $outcome = $this->execute($prompt, $this->requireConfig(), false);
    $catalogBlock = $outcome['catalogBlock'];

    if ($catalogBlock === null) {
        return $this->collectTextsFromIds($outcome['selectedChunkIds'], $outcome['rows']);
    }

    return [$catalogBlock];
}
|
|
|
|
/**
 * Same pipeline as retrieve(), but returns one diagnostic record per result.
 *
 * Each record carries rank, chunk id, document id, raw and fused scores, the
 * effective threshold, the detected intent/route/entity label, the list-query
 * flag and the trimmed text. A catalog result yields a single synthetic record
 * with chunk id "__CATALOG_LIST__" and null scores.
 *
 * @param ModelGenerationConfig|null $config explicit config; the active one is loaded when null
 *
 * @return list<array<string, mixed>>
 *
 * @throws \RuntimeException when no config is given and no active one exists
 */
public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
{
    $config ??= $this->requireConfig();
    $result = $this->execute($prompt, $config, true);

    $intent = $result['intent'];
    $route = $result['route'];
    $entityLabel = $result['entityLabel'];

    $catalogBlock = $result['catalogBlock'];
    if ($catalogBlock !== null) {
        $catalogEntry = [
            'rank' => 1,
            'chunk_id' => '__CATALOG_LIST__',
            'document_id' => null,
            'raw_score' => null,
            'rrf_score' => null,
            'threshold' => 0.0,
            'intent' => $intent,
            'route' => $route,
            'entity_label' => $entityLabel,
            'is_list_query' => true,
            'text' => $catalogBlock,
        ];

        return [$catalogEntry];
    }

    $rows = $result['rows'];
    $entries = [];

    foreach ($result['selectedChunkIds'] as $chunkId) {
        // Selected ids without a looked-up row are skipped and do not consume a rank.
        if (!isset($rows[$chunkId])) {
            continue;
        }

        $entries[] = [
            'rank' => count($entries) + 1,
            'chunk_id' => $chunkId,
            'document_id' => $rows[$chunkId]['document_id'] ?? null,
            'raw_score' => $result['rawScores'][$chunkId] ?? null,
            'rrf_score' => $result['rrfScores'][$chunkId] ?? null,
            'threshold' => $result['threshold'],
            'intent' => $intent,
            'route' => $route,
            'entity_label' => $entityLabel,
            'is_list_query' => $result['isListQuery'],
            'text' => trim((string)$rows[$chunkId]['text']),
        ];
    }

    return $entries;
}
|
|
|
|
// =========================================================
|
|
// CENTRAL ORCHESTRATION
|
|
// =========================================================
|
|
|
|
/**
 * Central orchestration shared by retrieve() and retrieveDebug().
 *
 * Detects the catalog entity and sales intent, resolves the route, and either
 * short-circuits with a catalog listing or runs the vector pipeline and
 * selects chunk ids with the list or sales strategy.
 *
 * @param bool $withScores forwarded to runCore() to request raw score capture
 *
 * @return array{
 *     route: mixed,
 *     entityLabel: mixed,
 *     intent: string,
 *     isListQuery: bool,
 *     selectedChunkIds: array,
 *     rows: array,
 *     rrfScores: array,
 *     rawScores: array,
 *     threshold: float,
 *     catalogBlock: ?string
 * }
 */
private function execute(
    string $prompt,
    ModelGenerationConfig $config,
    bool $withScores
): array {
    $entityLabel = $this->catalogIntent->detect($prompt);
    $salesIntent = $this->detectSalesIntent($prompt);
    $route = $this->routeResolver->resolve($salesIntent, $entityLabel);

    // Fields that are identical on both exit paths.
    $routing = [
        'route' => $route,
        'entityLabel' => $entityLabel,
        'intent' => $salesIntent,
    ];

    $wantsCatalog = $route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null;
    $catalogBlock = $wantsCatalog
        ? $this->entityCatalogService->listByTerm($entityLabel)
        : null;

    // A null listing falls through to the regular vector pipeline.
    if ($catalogBlock !== null) {
        return $routing + [
            'isListQuery' => true,
            'selectedChunkIds' => [],
            'rows' => [],
            'rrfScores' => [],
            'rawScores' => [],
            'threshold' => 0.0,
            'catalogBlock' => trim($catalogBlock),
        ];
    }

    $core = $this->runCore($prompt, $config, $withScores, $salesIntent);

    if ($core['is_list_query']) {
        $selectedChunkIds = $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
    } else {
        $selectedChunkIds = $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
    }

    return $routing + [
        'isListQuery' => $core['is_list_query'],
        'selectedChunkIds' => $selectedChunkIds,
        'rows' => $core['rows'],
        'rrfScores' => $core['rrf_scores'],
        'rawScores' => $core['raw_scores'],
        'threshold' => $core['threshold'],
        'catalogBlock' => null,
    ];
}
|
|
|
|
// =========================================================
|
|
// CORE PIPELINE
|
|
// =========================================================
|
|
|
|
/**
 * Runs the vector retrieval pipeline and returns ranked candidates.
 *
 * Steps: clamp the configured limits, clean the prompt (falling back to the
 * raw prompt when cleaning yields an empty string), derive threshold and
 * top-K, run a global search plus a search scoped to tag-routed documents,
 * fuse both hit lists, and load the rows for the ranked chunk ids.
 *
 * @param bool $withScores whether fuseHits() should record raw scores
 *
 * @return array{
 *     limit: int,
 *     is_list_query: bool,
 *     threshold: float,
 *     ranked_chunk_ids: array,
 *     rows: array,
 *     rrf_scores: array,
 *     raw_scores: array
 * }
 */
private function runCore(
    string $prompt,
    ModelGenerationConfig $config,
    bool $withScores,
    string $salesIntent
): array {
    $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
    $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));

    $isListQuery = $this->intentLite->isListQuery($prompt);

    $cleaned = $this->queryCleaner->clean($prompt);
    $query = $cleaned === '' ? $prompt : $cleaned;

    [$threshold, $topK] = $this->computeThresholdAndTopK($salesIntent, $isListQuery, $vectorTopKBase);

    // Keep only distinct string document ids from the tag router.
    $routed = $this->tagRouting->route($query);
    $candidateDocIds = [];
    if (is_array($routed)) {
        $stringIds = array_filter($routed, static fn ($docId): bool => is_string($docId));
        $candidateDocIds = array_values(array_unique($stringIds));
    }

    $globalHits = $this->vectorClient->search($query, $topK);
    $scopedHits = $candidateDocIds === []
        ? []
        : $this->vectorClient->searchScoped($query, $topK, $candidateDocIds);

    $fused = $this->fuseHits(
        $globalHits,
        $scopedHits,
        $threshold,
        $salesIntent === SalesIntentLite::OBJECTION,
        $withScores
    );

    $rawScores = $fused['raw_scores'];
    $rrfScores = $fused['rrf_scores'];

    // Nothing passed the threshold: fall back to the leading global hits.
    if ($rrfScores === [] && $globalHits !== []) {
        $rrfScores = $this->fallbackRrfFromHits($globalHits, self::EMPTY_RRF_FALLBACK_TOPN);
    }

    arsort($rrfScores);
    $rankedChunkIds = array_keys($rrfScores);

    return [
        'limit' => $limit,
        'is_list_query' => $isListQuery,
        'threshold' => $threshold,
        'ranked_chunk_ids' => $rankedChunkIds,
        'rows' => $this->lookup->findByChunkIds($rankedChunkIds),
        'rrf_scores' => $rrfScores,
        'raw_scores' => $rawScores,
    ];
}
|
|
|
|
// =========================================================
|
|
// SUPPORT
|
|
// =========================================================
|
|
|
|
/**
 * Loads the active generation config from the repository.
 *
 * @throws \RuntimeException when the repository returns null
 */
private function requireConfig(): ModelGenerationConfig
{
    return $this->configRepository->findActiveForModel()
        ?? throw new \RuntimeException('No active ModelGenerationConfig found.');
}
|
|
|
|
/**
 * Returns the detector's 'intent' entry as a string, defaulting to
 * SalesIntentLite::DISCOVERY when the entry is absent or null.
 */
private function detectSalesIntent(string $prompt): string
{
    $detection = $this->salesIntentLite->detect($prompt);
    $intent = $detection['intent'] ?? SalesIntentLite::DISCOVERY;

    return (string)$intent;
}
|
|
|
|
/**
 * Derives the effective score threshold and vector top-K for a query.
 *
 * Objection and pricing intents raise the base threshold by 0.02; list
 * queries scale top-K by LIST_BONUS. Both values are clamped afterwards.
 *
 * @return array{0: float, 1: int} [threshold, topK]
 */
private function computeThresholdAndTopK(string $salesIntent, bool $isListQuery, int $vectorTopKBase): array
{
    $stricterIntents = [SalesIntentLite::OBJECTION, SalesIntentLite::PRICING];

    $threshold = self::VECTOR_SCORE_THRESHOLD;
    if (in_array($salesIntent, $stricterIntents, true)) {
        $threshold += 0.02;
    }

    $topK = $isListQuery
        ? (int)round($vectorTopKBase * self::LIST_BONUS)
        : $vectorTopKBase;

    $clampedThreshold = max(self::THRESHOLD_FLOOR, min(self::THRESHOLD_CEIL, $threshold));
    $clampedTopK = max(1, min($topK, self::HARD_MAX_VECTORK));

    return [$clampedThreshold, $clampedTopK];
}
|
|
|
|
/**
 * Fuses the global and scoped hit lists with reciprocal-rank scoring.
 *
 * Each list is ranked independently. Hits lacking 'chunk_id' or 'score', or
 * scoring below the threshold, are skipped and do not consume a rank. A
 * chunk's fused score is the sum of its per-list contributions.
 *
 * @param bool $boostScoped multiply scoped-list contributions by 1.2
 * @param bool $captureRaw  record the highest raw score seen per chunk
 *
 * @return array{rrf_scores: array<int|string, float>, raw_scores: array<int|string, float>}
 */
private function fuseHits(
    array $globalHits,
    array $scopedHits,
    float $threshold,
    bool $boostScoped,
    bool $captureRaw
): array {
    $rrfScores = [];
    $rawScores = [];

    // The global list is never boosted; the scoped list only on request.
    $passes = [
        [$globalHits, false],
        [$scopedHits, $boostScoped],
    ];

    foreach ($passes as [$hits, $boosted]) {
        $position = 0;

        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }

            $score = (float)$hit['score'];
            if ($score < $threshold) {
                continue;
            }

            $chunkId = (string)$hit['chunk_id'];

            if ($captureRaw) {
                $rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $score);
            }

            $position++;
            $contribution = 1.0 / (self::RRF_K + $position);
            if ($boosted) {
                $contribution *= 1.2;
            }

            $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $contribution;
        }
    }

    return [
        'rrf_scores' => $rrfScores,
        'raw_scores' => $rawScores,
    ];
}
|
|
|
|
/**
 * Builds reciprocal-rank scores from the leading hits, ignoring raw scores.
 *
 * Hits without 'chunk_id' are skipped and do not consume a rank. Iteration
 * stops once $topN ranks have been assigned; the stop check runs after a
 * hit is recorded.
 *
 * @return array<int|string, float> chunk id => 1 / (RRF_K + rank)
 */
private function fallbackRrfFromHits(array $hits, int $topN): array
{
    $scores = [];
    $position = 0;

    foreach ($hits as $hit) {
        if (isset($hit['chunk_id'])) {
            $position++;
            $scores[(string)$hit['chunk_id']] = 1.0 / (self::RRF_K + $position);

            if ($position >= $topN) {
                break;
            }
        }
    }

    return $scores;
}
|
|
|
|
/**
 * Picks up to $limit chunk ids for a list query, dropping chunks whose text
 * duplicates an earlier one.
 *
 * Two texts count as duplicates when they are equal after trimming,
 * collapsing whitespace runs to a single space and lower-casing. Chunks with
 * no row text or blank text are skipped.
 *
 * @param array $chunkIds ranked chunk ids, best first
 * @param array $rows     rows keyed by chunk id; only 'text' is read
 *
 * @return list<string>
 */
private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
{
    $seen = [];
    $out = [];

    foreach ($chunkIds as $id) {
        if (!isset($rows[$id]['text'])) {
            continue;
        }

        $chunk = trim((string)$rows[$id]['text']);
        if ($chunk === '') {
            continue;
        }

        // preg_replace() returns null on a PCRE failure, e.g. malformed UTF-8
        // under the /u modifier. Casting that null to '' would give every such
        // chunk the same hash and discard all but the first as "duplicates",
        // so fall back to the unnormalised text instead.
        $normalized = preg_replace('/\s+/u', ' ', $chunk) ?? $chunk;
        $key = md5(mb_strtolower($normalized));

        if (isset($seen[$key])) {
            continue;
        }

        $seen[$key] = true;
        $out[] = (string)$id;

        if (count($out) >= $limit) {
            break;
        }
    }

    return $out;
}
|
|
|
|
/**
 * Picks up to $limit chunk ids for a non-list query while diversifying
 * across documents.
 *
 * A chunk is kept only when its row has non-blank text and a string
 * document_id, its document has fewer than MAX_CHUNKS_PER_DOC kept chunks,
 * and (when chunk_index is an int) it lies at least MIN_CHUNK_DISTANCE
 * positions away from every chunk already kept for that document.
 *
 * @param array $chunkIds ranked chunk ids, best first
 * @param array $rows     rows keyed by chunk id; reads 'text', 'document_id', 'chunk_index'
 *
 * @return list<string>
 */
private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
{
    $out = [];
    $docCounter = [];
    $docChunkPositions = [];

    foreach ($chunkIds as $chunkId) {
        if (!isset($rows[$chunkId]['text'])) {
            continue;
        }

        // Reject blank chunks before any bookkeeping, so a chunk that is never
        // selected cannot block its neighbours through the distance rule.
        $text = trim((string)$rows[$chunkId]['text']);
        if ($text === '') {
            continue;
        }

        $docId = $rows[$chunkId]['document_id'] ?? null;
        $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;

        if (!is_string($docId)) {
            continue;
        }

        if (($docCounter[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) {
            continue;
        }

        if (is_int($chunkIndex)) {
            foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
                if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) {
                    continue 2;
                }
            }
        }

        $out[] = (string)$chunkId;
        $docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;

        // Positions are recorded for selected chunks only.
        if (is_int($chunkIndex)) {
            $docChunkPositions[$docId][] = $chunkIndex;
        }

        if (count($out) >= $limit) {
            break;
        }
    }

    return $out;
}
|
|
|
|
/**
 * Maps chunk ids to their trimmed row texts, preserving the given order and
 * omitting ids whose row has no text or only whitespace.
 *
 * @param array $chunkIds chunk ids to resolve
 * @param array $rows     rows keyed by chunk id; only 'text' is read
 *
 * @return list<string>
 */
private function collectTextsFromIds(array $chunkIds, array $rows): array
{
    $texts = [];

    foreach ($chunkIds as $chunkId) {
        $raw = $rows[$chunkId]['text'] ?? null;
        if ($raw === null) {
            continue;
        }

        $trimmed = trim((string)$raw);
        if ($trimmed === '') {
            continue;
        }

        $texts[] = $trimmed;
    }

    return $texts;
}
|
|
} |