Files
MtoRagSystem/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
2026-03-02 15:42:03 +01:00

448 lines
13 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Catalog\EntityCatalogService;
use App\Entity\ModelGenerationConfig;
use App\Intent\CatalogIntentLite;
use App\Intent\IntentLite;
use App\Intent\SalesIntentLite;
use App\Repository\ModelGenerationConfigRepository;
use App\Routing\IntentRouteResolver;
use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient;
final class NdjsonHybridRetriever implements RetrieverInterface
{
// Base minimum raw vector score a hit needs to take part in fusion; computeThresholdAndTopK() starts from this value.
private const VECTOR_SCORE_THRESHOLD = 0.75;
// Upper cap applied in runCore() to the configured maximum number of returned chunks.
private const HARD_MAX_CHUNKS = 90;
// Upper cap on the vector-search top-K, applied in runCore() and again after the list bonus.
private const HARD_MAX_VECTORK = 250;
// Multiplier applied to top-K when the prompt is classified as a list query.
private const LIST_BONUS = 1.25;
// Maximum number of chunks selectSalesChunkIds() accepts from a single document.
private const MAX_CHUNKS_PER_DOC = 2;
// Minimum chunk_index gap required between two selected chunks of the same document.
private const MIN_CHUNK_DISTANCE = 2;
// Constant added to the rank in the reciprocal-rank term 1 / (RRF_K + rank).
private const RRF_K = 60;
// Lower clamp for the effective score threshold.
private const THRESHOLD_FLOOR = 0.65;
// Upper clamp for the effective score threshold.
private const THRESHOLD_CEIL = 0.90;
// Number of global hits ranked by fallbackRrfFromHits() when fusion produced no scores at all.
private const EMPTY_RRF_FALLBACK_TOPN = 5;
/**
 * Wires the collaborators used by the retrieval pipeline.
 *
 * @param NdjsonChunkLookup               $lookup               Loads chunk rows for a list of chunk IDs (findByChunkIds()).
 * @param VectorSearchClient              $vectorClient         Runs the global and the document-scoped vector searches.
 * @param TagRoutingService               $tagRouting           Routes the cleaned query to candidate document IDs.
 * @param ModelGenerationConfigRepository $configRepository     Source of the active ModelGenerationConfig.
 * @param QueryCleaner                    $queryCleaner         Cleans the prompt before routing and vector search.
 * @param IntentLite                      $intentLite           Decides whether the prompt is a list query.
 * @param SalesIntentLite                 $salesIntentLite      Detects the sales intent of the prompt.
 * @param CatalogIntentLite               $catalogIntent        Detects the catalog entity label of the prompt, if any.
 * @param IntentRouteResolver             $routeResolver        Resolves the route from sales intent and entity label.
 * @param EntityCatalogService            $entityCatalogService Produces the catalog listing for an entity label.
 */
public function __construct(
private readonly NdjsonChunkLookup $lookup,
private readonly VectorSearchClient $vectorClient,
private readonly TagRoutingService $tagRouting,
private readonly ModelGenerationConfigRepository $configRepository,
private readonly QueryCleaner $queryCleaner,
private readonly IntentLite $intentLite,
private readonly SalesIntentLite $salesIntentLite,
private readonly CatalogIntentLite $catalogIntent,
private readonly IntentRouteResolver $routeResolver,
private readonly EntityCatalogService $entityCatalogService
) {}
// =========================================================
// PUBLIC API
// =========================================================
/**
 * Retrieve the context texts for a prompt using the active generation config.
 *
 * If the pipeline answered with a catalog listing, that listing is the only
 * element of the result. Otherwise the trimmed, non-empty texts of the
 * selected chunks are returned in selection order.
 *
 * @param string $prompt Raw user prompt.
 *
 * @return array List of context strings.
 *
 * @throws \RuntimeException When no active ModelGenerationConfig exists.
 */
public function retrieve(string $prompt): array
{
    $outcome = $this->execute($prompt, $this->requireConfig(), false);
    $catalog = $outcome['catalogBlock'];

    return $catalog === null
        ? $this->collectTextsFromIds($outcome['selectedChunkIds'], $outcome['rows'])
        : [$catalog];
}
/**
 * Same pipeline as retrieve(), but returns one diagnostic record per result.
 *
 * Each record carries: rank, chunk_id, document_id, raw_score, rrf_score,
 * threshold, intent, route, entity_label, is_list_query and text.
 * A catalog answer is reported as a single synthetic record whose chunk_id is
 * '__CATALOG_LIST__' and whose scores are null.
 *
 * @param string                     $prompt Raw user prompt.
 * @param ModelGenerationConfig|null $config Config to use; the active one is loaded when null.
 *
 * @return array List of diagnostic records, ranked from 1.
 *
 * @throws \RuntimeException When $config is null and no active config exists.
 */
public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
{
    $outcome = $this->execute($prompt, $config ?? $this->requireConfig(), true);

    if ($outcome['catalogBlock'] !== null) {
        return [
            [
                'rank' => 1,
                'chunk_id' => '__CATALOG_LIST__',
                'document_id' => null,
                'raw_score' => null,
                'rrf_score' => null,
                'threshold' => 0.0,
                'intent' => $outcome['intent'],
                'route' => $outcome['route'],
                'entity_label' => $outcome['entityLabel'],
                'is_list_query' => true,
                'text' => $outcome['catalogBlock'],
            ],
        ];
    }

    $rows = $outcome['rows'];
    $entries = [];
    $position = 0;

    foreach ($outcome['selectedChunkIds'] as $chunkId) {
        $row = $rows[$chunkId] ?? null;
        if ($row === null) {
            // Selected ID without a loaded row: it does not consume a rank.
            continue;
        }

        $entries[] = [
            'rank' => ++$position,
            'chunk_id' => $chunkId,
            'document_id' => $row['document_id'] ?? null,
            'raw_score' => $outcome['rawScores'][$chunkId] ?? null,
            'rrf_score' => $outcome['rrfScores'][$chunkId] ?? null,
            'threshold' => $outcome['threshold'],
            'intent' => $outcome['intent'],
            'route' => $outcome['route'],
            'entity_label' => $outcome['entityLabel'],
            'is_list_query' => $outcome['isListQuery'],
            'text' => trim((string)$row['text']),
        ];
    }

    return $entries;
}
// =========================================================
// CENTRAL ORCHESTRATION
// =========================================================
/**
 * Orchestrates one retrieval: intent detection, routing, then either the
 * catalog shortcut or the vector pipeline followed by chunk selection.
 *
 * The catalog shortcut is taken only when the route is ROUTE_CATALOG_LIST, an
 * entity label was detected and the catalog service returns a non-null block;
 * in every other case the vector pipeline runs.
 *
 * @param string                $prompt     Raw user prompt.
 * @param ModelGenerationConfig $config     Retrieval limits.
 * @param bool                  $withScores Whether raw vector scores should be captured.
 *
 * @return array Map with keys route, entityLabel, intent, isListQuery,
 *               selectedChunkIds, rows, rrfScores, rawScores, threshold, catalogBlock.
 */
private function execute(
    string $prompt,
    ModelGenerationConfig $config,
    bool $withScores
): array {
    $entityLabel = $this->catalogIntent->detect($prompt);
    $salesIntent = $this->detectSalesIntent($prompt);
    $route = $this->routeResolver->resolve($salesIntent, $entityLabel);

    // Leading entries shared by both result shapes.
    $head = [
        'route' => $route,
        'entityLabel' => $entityLabel,
        'intent' => $salesIntent,
    ];

    $wantsCatalog = $route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null;
    $catalogBlock = $wantsCatalog
        ? $this->entityCatalogService->listByTerm($entityLabel)
        : null;

    if ($catalogBlock !== null) {
        return $head + [
            'isListQuery' => true,
            'selectedChunkIds' => [],
            'rows' => [],
            'rrfScores' => [],
            'rawScores' => [],
            'threshold' => 0.0,
            'catalogBlock' => trim($catalogBlock),
        ];
    }

    $core = $this->runCore($prompt, $config, $withScores, $salesIntent);

    if ($core['is_list_query']) {
        $selected = $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
    } else {
        $selected = $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
    }

    return $head + [
        'isListQuery' => $core['is_list_query'],
        'selectedChunkIds' => $selected,
        'rows' => $core['rows'],
        'rrfScores' => $core['rrf_scores'],
        'rawScores' => $core['raw_scores'],
        'threshold' => $core['threshold'],
        'catalogBlock' => null,
    ];
}
// =========================================================
// CORE PIPELINE
// =========================================================
/**
 * Vector pipeline: clean the query, search globally and within tag-routed
 * documents, fuse both hit lists into reciprocal-rank scores and load the
 * rows of the ranked chunks.
 *
 * When fusion yields no score although the global search returned hits, the
 * first EMPTY_RRF_FALLBACK_TOPN global hits are ranked instead, regardless of
 * the threshold. Raw scores are not filled in for that fallback.
 *
 * @param string                $prompt      Raw user prompt.
 * @param ModelGenerationConfig $config      Source of the configured chunk limit and top-K.
 * @param bool                  $withScores  Whether raw vector scores are captured during fusion.
 * @param string                $salesIntent Detected sales intent.
 *
 * @return array Map with keys limit, is_list_query, threshold, ranked_chunk_ids,
 *               rows, rrf_scores, raw_scores.
 */
private function runCore(
    string $prompt,
    ModelGenerationConfig $config,
    bool $withScores,
    string $salesIntent
): array {
    // Configured limits, clamped to [1, hard maximum].
    $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
    $baseTopK = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));

    $isListQuery = $this->intentLite->isListQuery($prompt);

    // An empty cleaned query falls back to the raw prompt.
    $cleaned = $this->queryCleaner->clean($prompt);
    $query = $cleaned === '' ? $prompt : $cleaned;

    [$threshold, $topK] = $this->computeThresholdAndTopK($salesIntent, $isListQuery, $baseTopK);

    // Keep only unique string document IDs from the tag router.
    $routed = $this->tagRouting->route($query);
    $candidateDocIds = [];
    if (is_array($routed)) {
        $candidateDocIds = array_values(array_unique(array_filter($routed, is_string(...))));
    }

    $globalHits = $this->vectorClient->search($query, $topK);
    $scopedHits = $candidateDocIds !== []
        ? $this->vectorClient->searchScoped($query, $topK, $candidateDocIds)
        : [];

    $fused = $this->fuseHits(
        $globalHits,
        $scopedHits,
        $threshold,
        $salesIntent === SalesIntentLite::OBJECTION,
        $withScores
    );

    $rrfScores = $fused['rrf_scores'];
    if ($rrfScores === [] && $globalHits !== []) {
        $rrfScores = $this->fallbackRrfFromHits($globalHits, self::EMPTY_RRF_FALLBACK_TOPN);
    }

    // Highest fused score first.
    arsort($rrfScores);

    // NOTE(review): PHP stores numeric-string array keys as ints, so array_keys()
    // yields ints for purely numeric chunk IDs. Confirm findByChunkIds() accepts that.
    $rankedChunkIds = array_keys($rrfScores);

    return [
        'limit' => $limit,
        'is_list_query' => $isListQuery,
        'threshold' => $threshold,
        'ranked_chunk_ids' => $rankedChunkIds,
        'rows' => $this->lookup->findByChunkIds($rankedChunkIds),
        'rrf_scores' => $rrfScores,
        'raw_scores' => $fused['raw_scores'],
    ];
}
// =========================================================
// SUPPORT
// =========================================================
/**
 * Returns the active generation config or fails loudly.
 *
 * @throws \RuntimeException When the repository has no active config.
 */
private function requireConfig(): ModelGenerationConfig
{
    return $this->configRepository->findActiveForModel()
        ?? throw new \RuntimeException('No active ModelGenerationConfig found.');
}
/**
 * Returns the 'intent' entry of the sales-intent detection result as a string,
 * using SalesIntentLite::DISCOVERY when that entry is missing or null.
 */
private function detectSalesIntent(string $prompt): string
{
    $detected = $this->salesIntentLite->detect($prompt)['intent'] ?? SalesIntentLite::DISCOVERY;

    return (string)$detected;
}
/**
 * Derives the effective score threshold and vector top-K for one query.
 *
 * OBJECTION and PRICING intents raise the base threshold by 0.02; list queries
 * scale top-K by LIST_BONUS (rounded). Both values are then clamped: the
 * threshold to [THRESHOLD_FLOOR, THRESHOLD_CEIL], top-K to [1, HARD_MAX_VECTORK].
 *
 * @param string $salesIntent    Detected sales intent.
 * @param bool   $isListQuery    Whether the prompt is a list query.
 * @param int    $vectorTopKBase Top-K before the list bonus.
 *
 * @return array Two-element list: [float threshold, int topK].
 */
private function computeThresholdAndTopK(string $salesIntent, bool $isListQuery, int $vectorTopKBase): array
{
    $intentRaise = match ($salesIntent) {
        SalesIntentLite::OBJECTION, SalesIntentLite::PRICING => 0.02,
        default => 0.0,
    };
    $threshold = self::VECTOR_SCORE_THRESHOLD + $intentRaise;

    $requestedTopK = $isListQuery
        ? (int)round($vectorTopKBase * self::LIST_BONUS)
        : $vectorTopKBase;

    return [
        max(self::THRESHOLD_FLOOR, min(self::THRESHOLD_CEIL, $threshold)),
        max(1, min($requestedTopK, self::HARD_MAX_VECTORK)),
    ];
}
/**
 * Fuses the global and the scoped hit list into reciprocal-rank scores.
 *
 * Within each list, hits lacking 'chunk_id' or 'score' and hits scoring below
 * $threshold are ignored and do not consume a rank. Every remaining hit adds
 * 1 / (RRF_K + rank) to its chunk's score; scoped hits are multiplied by 1.2
 * when $boostScoped is set. Global hits are processed first.
 *
 * @param array $globalHits  Hits of the unrestricted search.
 * @param array $scopedHits  Hits of the document-scoped search.
 * @param float $threshold   Minimum raw score for a hit to count.
 * @param bool  $boostScoped Whether scoped contributions get the 1.2 factor.
 * @param bool  $captureRaw  Whether the best raw score per chunk is recorded.
 *
 * @return array Map with 'rrf_scores' and 'raw_scores', both keyed by chunk ID.
 */
private function fuseHits(
    array $globalHits,
    array $scopedHits,
    float $threshold,
    bool $boostScoped,
    bool $captureRaw
): array {
    $fusedScores = [];
    $bestRawScores = [];

    $passes = [
        [$globalHits, false],
        [$scopedHits, $boostScoped],
    ];

    foreach ($passes as [$hits, $boosted]) {
        // Rank restarts for every list and counts accepted hits only.
        $position = 0;

        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }

            $score = (float)$hit['score'];
            if ($score < $threshold) {
                continue;
            }

            $id = (string)$hit['chunk_id'];
            if ($captureRaw) {
                $bestRawScores[$id] = max($bestRawScores[$id] ?? 0.0, $score);
            }

            $position++;
            $share = 1.0 / (self::RRF_K + $position);
            if ($boosted) {
                $share *= 1.2;
            }

            $fusedScores[$id] = ($fusedScores[$id] ?? 0.0) + $share;
        }
    }

    return [
        'rrf_scores' => $fusedScores,
        'raw_scores' => $bestRawScores,
    ];
}
/**
 * Builds reciprocal-rank scores from the leading hits without any score threshold.
 *
 * Hits lacking 'chunk_id' are skipped and do not consume a rank. Iteration
 * stops once $topN hits have been ranked.
 *
 * @param array $hits Hits in the order returned by the search.
 * @param int   $topN Number of hits to rank.
 *
 * @return array Map of chunk ID => 1 / (RRF_K + rank).
 */
private function fallbackRrfFromHits(array $hits, int $topN): array
{
    $scores = [];
    $position = 0;

    foreach ($hits as $hit) {
        if (isset($hit['chunk_id'])) {
            $position++;
            $scores[(string)$hit['chunk_id']] = 1.0 / (self::RRF_K + $position);

            if ($position >= $topN) {
                break;
            }
        }
    }

    return $scores;
}
/**
 * Selects up to $limit chunk IDs for a list query, dropping chunks whose text
 * duplicates an earlier one.
 *
 * Two texts count as duplicates when they are equal after trimming, collapsing
 * whitespace runs to a single space and lower-casing. Chunks without text or
 * with blank text are skipped.
 *
 * @param array $chunkIds Ranked chunk IDs, best first.
 * @param array $rows     Chunk rows keyed by chunk ID; only 'text' is read.
 * @param int   $limit    Maximum number of IDs to return.
 *
 * @return array Selected chunk IDs as strings, in ranking order.
 */
private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
{
    $seenFingerprints = [];
    $selected = [];

    foreach ($chunkIds as $id) {
        if (!isset($rows[$id]['text'])) {
            continue;
        }

        $chunk = trim((string)$rows[$id]['text']);
        if ($chunk === '') {
            continue;
        }

        // preg_replace() returns null on failure, which the /u modifier triggers
        // for invalid UTF-8. Fall back to the trimmed text so that such chunks are
        // not all reduced to the same empty fingerprint and dropped as duplicates.
        $normalized = preg_replace('/\s+/u', ' ', $chunk) ?? $chunk;

        $fingerprint = md5(mb_strtolower($normalized));
        if (isset($seenFingerprints[$fingerprint])) {
            continue;
        }
        $seenFingerprints[$fingerprint] = true;

        $selected[] = (string)$id;
        if (count($selected) >= $limit) {
            break;
        }
    }

    return $selected;
}
/**
 * Selects up to $limit chunk IDs for a non-list query while spreading the
 * selection across documents.
 *
 * A chunk is accepted only if it has non-blank text and a string document_id,
 * its document has fewer than MAX_CHUNKS_PER_DOC accepted chunks, and (when
 * chunk_index is an int) it lies at least MIN_CHUNK_DISTANCE positions away
 * from every chunk already accepted for the same document.
 *
 * @param array $chunkIds Ranked chunk IDs, best first.
 * @param array $rows     Chunk rows keyed by chunk ID; reads 'text', 'document_id', 'chunk_index'.
 * @param int   $limit    Maximum number of IDs to return.
 *
 * @return array Selected chunk IDs as strings, in ranking order.
 */
private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
{
    $selected = [];
    $acceptedPerDoc = [];
    $acceptedPositions = [];

    foreach ($chunkIds as $chunkId) {
        if (!isset($rows[$chunkId]['text'])) {
            continue;
        }

        // Reject blank chunks before any bookkeeping. Previously the position of
        // a blank chunk was recorded although it was never selected, which made
        // the distance rule discard its valid neighbours.
        $text = trim((string)$rows[$chunkId]['text']);
        if ($text === '') {
            continue;
        }

        $docId = $rows[$chunkId]['document_id'] ?? null;
        if (!is_string($docId)) {
            continue;
        }

        if (($acceptedPerDoc[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) {
            continue;
        }

        $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
        if (is_int($chunkIndex)) {
            foreach ($acceptedPositions[$docId] ?? [] as $takenIndex) {
                if (abs($takenIndex - $chunkIndex) < self::MIN_CHUNK_DISTANCE) {
                    continue 2;
                }
            }
            // Nothing below can reject the chunk, so the position is recorded
            // only for chunks that really end up in the selection.
            $acceptedPositions[$docId][] = $chunkIndex;
        }

        $selected[] = (string)$chunkId;
        $acceptedPerDoc[$docId] = ($acceptedPerDoc[$docId] ?? 0) + 1;

        if (count($selected) >= $limit) {
            break;
        }
    }

    return $selected;
}
/**
 * Maps chunk IDs to their trimmed texts, preserving order.
 *
 * IDs without a row, rows without 'text', and texts that are blank after
 * trimming are left out.
 *
 * @param array $chunkIds Chunk IDs in the desired output order.
 * @param array $rows     Chunk rows keyed by chunk ID; only 'text' is read.
 *
 * @return array List of non-empty, trimmed texts.
 */
private function collectTextsFromIds(array $chunkIds, array $rows): array
{
    $texts = [];

    foreach ($chunkIds as $id) {
        $candidate = isset($rows[$id]['text']) ? trim((string)$rows[$id]['text']) : '';
        if ($candidate === '') {
            continue;
        }
        $texts[] = $candidate;
    }

    return $texts;
}
}