Files
MtoRagSystem/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
2026-02-28 07:45:34 +01:00

554 lines
16 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Entity\ModelGenerationConfig;
use App\Intent\IntentLite;
use App\Intent\SalesIntentLite;
use App\Knowledge\QueryCleaner;
use App\Repository\ModelGenerationConfigRepository;
use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient;
/**
 * Hybrid retriever: runs a global and (optionally) a tag-scoped vector search,
 * fuses both hit lists with Reciprocal Rank Fusion (RRF), resolves the ranked
 * chunk ids through the NDJSON lookup and finally selects the chunk texts
 * according to the detected query mode (list query vs. normal "sales" query).
 *
 * Production ({@see retrieve()}, {@see retrieveInternal()}) and debug
 * ({@see retrieveDebug()}) share one pipeline and one selection routine, so
 * the debug output always reflects exactly what production would return.
 */
final class NdjsonHybridRetriever implements RetrieverInterface
{
    /** Base minimum raw vector score a hit needs; shifted per sales intent in runCore(). */
    private const VECTOR_SCORE_THRESHOLD = 0.72;

    /** Upper bound applied to the configured maximum number of returned chunks. */
    private const HARD_MAX_CHUNKS = 90;

    /** Upper bound applied to the vector top-k, both before and after intent scaling. */
    private const HARD_MAX_VECTORK = 250;

    /** Top-k multiplier applied when the prompt is detected as a list query. */
    private const LIST_BONUS = 1.5;

    /** Sales mode: maximum number of chunks taken from a single document. */
    private const MAX_CHUNKS_PER_DOC = 2;

    /** Sales mode: minimum chunk_index distance between two chunks of the same document. */
    private const MIN_CHUNK_DISTANCE = 2;

    /** The constant k in the RRF contribution 1 / (k + rank). */
    private const RRF_K = 60;

    /** Factor applied to a hit's RRF contribution when boosting is requested. */
    private const RRF_BOOST_FACTOR = 1.2;

    public function __construct(
        private readonly NdjsonChunkLookup $lookup,
        private readonly VectorSearchClient $vectorClient,
        private readonly TagRoutingService $tagRouting,
        private readonly ModelGenerationConfigRepository $configRepository,
        private readonly QueryCleaner $queryCleaner,
        private readonly IntentLite $intentLite,
        private readonly SalesIntentLite $salesIntentLite,
    ) {
    }

    // =========================================================
    // PRODUCTION
    // =========================================================

    /**
     * Retrieves the context chunk texts for a prompt using the active model config.
     *
     * @return string[] trimmed chunk texts in ranking order
     *
     * @throws \RuntimeException when no active ModelGenerationConfig exists
     */
    public function retrieve(string $prompt): array
    {
        return $this->retrieveInternal($prompt, $this->requireActiveConfig());
    }

    /**
     * Same as {@see retrieve()} but with an explicitly supplied configuration.
     *
     * @return string[] trimmed chunk texts in ranking order
     */
    public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
    {
        $core = $this->runCore($prompt, $config, false);
        if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
            return [];
        }

        $texts = [];
        foreach ($this->selectChunkIds($core) as $chunkId) {
            // Selection guarantees the row exists and its trimmed text is non-empty.
            $texts[] = trim((string) $core['rows'][$chunkId]['text']);
        }

        return $texts;
    }

    // =========================================================
    // DEBUG (not part of the production path)
    // =========================================================

    /**
     * Returns exactly the hits production would select, plus scores and
     * metadata for every selected chunk.
     *
     * @return array<int, array{
     *     rank:int,
     *     chunk_id:string,
     *     document_id:(string|null),
     *     raw_score:(float|null),
     *     rrf_score:(float|null),
     *     threshold:float,
     *     intent:string,
     *     is_list_query:bool,
     *     text:string
     * }>
     *
     * @throws \RuntimeException when no active ModelGenerationConfig exists
     */
    public function retrieveDebug(string $prompt): array
    {
        $core = $this->runCore($prompt, $this->requireActiveConfig(), true);
        if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
            return [];
        }

        $out = [];
        foreach ($this->selectChunkIds($core) as $position => $chunkId) {
            $row = $core['rows'][$chunkId];

            $out[] = [
                'rank' => $position + 1,
                'chunk_id' => $chunkId,
                'document_id' => isset($row['document_id']) ? (string) $row['document_id'] : null,
                'raw_score' => isset($core['raw_scores'][$chunkId]) ? (float) $core['raw_scores'][$chunkId] : null,
                'rrf_score' => isset($core['rrf_scores'][$chunkId]) ? (float) $core['rrf_scores'][$chunkId] : null,
                'threshold' => (float) $core['threshold'],
                'intent' => (string) $core['sales_intent'],
                'is_list_query' => (bool) $core['is_list_query'],
                'text' => trim((string) ($row['text'] ?? '')),
            ];
        }

        return $out;
    }

    // =========================================================
    // CORE PIPELINE (shared by production and debug)
    // =========================================================

    /**
     * Loads the active configuration or fails loudly.
     *
     * @throws \RuntimeException when the repository returns null
     */
    private function requireActiveConfig(): ModelGenerationConfig
    {
        return $this->configRepository->findActiveForModel()
            ?? throw new \RuntimeException('No active ModelGenerationConfig found.');
    }

    /**
     * Runs intent detection, query cleaning, tag routing, both vector searches
     * and the RRF fusion, then resolves the ranked chunk ids to rows.
     *
     * @return array{
     *     limit:int,
     *     is_list_query:bool,
     *     sales_intent:string,
     *     threshold:float,
     *     topk:int,
     *     ranked_chunk_ids: string[],
     *     rows: array<string, array<string,mixed>>,
     *     rrf_scores: array<string,float>,
     *     raw_scores: array<string,float>
     * }
     */
    private function runCore(string $prompt, ModelGenerationConfig $config, bool $withScores): array
    {
        $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
        $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));

        $isListQuery = $this->intentLite->isListQuery($prompt);
        $salesIntent = $this->salesIntentLite->detect($prompt)['intent'];

        $cleanQuery = $this->queryCleaner->clean($prompt);
        if ($cleanQuery === '') {
            // Cleaning removed everything; search with the raw prompt instead.
            $cleanQuery = $prompt;
        }

        // Intent-based adjustments as [threshold delta, top-k factor].
        // DISCOVERY and every unknown intent share the default arm.
        [$thresholdDelta, $topKFactor] = match ($salesIntent) {
            SalesIntentLite::PRICING => [0.02, 1.0],
            SalesIntentLite::COMPARISON => [0.0, 1.4],
            SalesIntentLite::OBJECTION => [-0.02, 1.0],
            SalesIntentLite::IMPLEMENTATION => [0.0, 1.3],
            SalesIntentLite::ROI => [0.0, 1.2],
            default => [-0.03, 1.0],
        };

        $threshold = self::VECTOR_SCORE_THRESHOLD + $thresholdDelta;
        $topK = (int) round($vectorTopKBase * $topKFactor);
        if ($isListQuery) {
            $topK = (int) round($topK * self::LIST_BONUS);
        }
        $topK = max(1, min($topK, self::HARD_MAX_VECTORK));

        // Tag routing: a non-empty result narrows the second (scoped) search.
        // Going through array keys de-duplicates the routed document ids.
        $candidateDocIds = $this->tagRouting->route($cleanQuery);
        $candidateSet = null;
        if (is_array($candidateDocIds) && $candidateDocIds !== []) {
            $candidateSet = array_fill_keys($candidateDocIds, true);
        }

        // Dual search: always global, scoped only when routing produced candidates.
        $globalHits = $this->vectorClient->search($cleanQuery, $topK);
        $scopedHits = [];
        if ($candidateSet !== null) {
            $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, array_keys($candidateSet));
        }

        // Result skeleton; the early returns below hand it back with empty rankings.
        $result = [
            'limit' => $limit,
            'is_list_query' => $isListQuery,
            'sales_intent' => (string) $salesIntent,
            'threshold' => $threshold,
            'topk' => $topK,
            'ranked_chunk_ids' => [],
            'rows' => [],
            'rrf_scores' => [],
            'raw_scores' => [],
        ];

        if ($globalHits === [] && $scopedHits === []) {
            return $result;
        }

        $rrfScores = [];
        $rawScores = [];
        $this->applyRrfWithOptionalRaw($globalHits, $rrfScores, $rawScores, $threshold, false, $withScores);
        // Scoped hits are boosted only for the OBJECTION intent.
        $this->applyRrfWithOptionalRaw(
            $scopedHits,
            $rrfScores,
            $rawScores,
            $threshold,
            $salesIntent === SalesIntentLite::OBJECTION,
            $withScores
        );

        $result['raw_scores'] = $rawScores;
        if ($rrfScores === []) {
            // Every hit fell below the threshold.
            return $result;
        }

        arsort($rrfScores);
        $rankedChunkIds = array_keys($rrfScores);

        $result['ranked_chunk_ids'] = $rankedChunkIds;
        $result['rows'] = $this->lookup->findByChunkIds($rankedChunkIds);
        $result['rrf_scores'] = $rrfScores;

        return $result;
    }

    /**
     * Adds the RRF contribution of one hit list to the running score map and
     * optionally records the best raw score seen per chunk.
     *
     * Hits below $threshold (or lacking chunk_id/score) are ignored and do not
     * consume a rank; the rank therefore counts accepted hits only.
     *
     * @param array<int, array{chunk_id:string, score:float}> $hits
     * @param array<string,float> $rrfScores accumulated RRF score per chunk id (modified in place)
     * @param array<string,float> $rawScores best raw score per chunk id (modified in place when $captureRaw)
     * @param float $threshold minimum raw score a hit must reach
     * @param bool $boost multiply each contribution by RRF_BOOST_FACTOR
     * @param bool $captureRaw record raw scores into $rawScores
     */
    private function applyRrfWithOptionalRaw(
        array $hits,
        array &$rrfScores,
        array &$rawScores,
        float $threshold,
        bool $boost = false,
        bool $captureRaw = false
    ): void {
        $rank = 0;

        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }

            $raw = (float) $hit['score'];
            if ($raw < $threshold) {
                continue;
            }

            $chunkId = (string) $hit['chunk_id'];

            // A chunk can appear in both the global and the scoped list: keep the best raw score.
            if ($captureRaw && (!isset($rawScores[$chunkId]) || $raw > $rawScores[$chunkId])) {
                $rawScores[$chunkId] = $raw;
            }

            $rank++;
            $contribution = 1 / (self::RRF_K + $rank);
            if ($boost) {
                $contribution *= self::RRF_BOOST_FACTOR;
            }

            $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $contribution;
        }
    }

    // =========================================================
    // SELECTION (single source of truth for production and debug)
    // =========================================================

    /**
     * Picks the chunk ids to return for a core result, dispatching on the query mode.
     *
     * @param array{is_list_query:bool, ranked_chunk_ids:array, rows:array, limit:int} $core
     *
     * @return string[]
     */
    private function selectChunkIds(array $core): array
    {
        return $core['is_list_query']
            ? $this->selectChunkIdsListMode($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
            : $this->selectChunkIdsSalesMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
    }

    /**
     * List mode: takes ranked chunks in order, skipping empty texts and texts
     * that equal an already selected one after whitespace/case normalisation.
     *
     * @param array<int, int|string> $chunkIds ranked chunk ids
     * @param array<int|string, array<string,mixed>> $rows rows keyed by chunk id
     *
     * @return string[] at most $limit chunk ids
     */
    private function selectChunkIdsListMode(array $chunkIds, array $rows, int $limit): array
    {
        $seen = [];
        $selected = [];

        foreach ($chunkIds as $id) {
            if (!isset($rows[$id]['text'])) {
                continue;
            }

            $text = trim((string) $rows[$id]['text']);
            if ($text === '') {
                continue;
            }

            // preg_replace() yields null on failure (e.g. invalid UTF-8 with /u).
            // Fall back to the raw text so such chunks do not all collapse onto
            // the same empty dedup key and get dropped as "duplicates".
            $key = mb_strtolower(preg_replace('/\s+/u', ' ', $text) ?? $text);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;

            $selected[] = (string) $id;
            if (\count($selected) >= $limit) {
                break;
            }
        }

        return $selected;
    }

    /**
     * Sales (normal) mode: takes ranked chunks in order while limiting each
     * document to MAX_CHUNKS_PER_DOC chunks and rejecting chunks whose
     * chunk_index is closer than MIN_CHUNK_DISTANCE to an already selected
     * chunk of the same document.
     *
     * Rows without a string document_id are skipped. The distance rule is only
     * applied when chunk_index is an int.
     *
     * @param array<int, int|string> $chunkIds ranked chunk ids
     * @param array<int|string, array<string,mixed>> $rows rows keyed by chunk id
     *
     * @return string[] at most $limit chunk ids
     */
    private function selectChunkIdsSalesMode(array $chunkIds, array $rows, int $limit): array
    {
        $selected = [];
        $perDocCount = [];
        $perDocPositions = [];

        foreach ($chunkIds as $chunkId) {
            if (!isset($rows[$chunkId]['text'])) {
                continue;
            }

            // Reject empty chunks before any bookkeeping: a chunk that is never
            // returned must not reserve its position and block its neighbours.
            if (trim((string) $rows[$chunkId]['text']) === '') {
                continue;
            }

            $docId = $rows[$chunkId]['document_id'] ?? null;
            if (!is_string($docId)) {
                continue;
            }

            if (($perDocCount[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) {
                continue;
            }

            $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
            if (is_int($chunkIndex)) {
                foreach ($perDocPositions[$docId] ?? [] as $takenIndex) {
                    if (abs($takenIndex - $chunkIndex) < self::MIN_CHUNK_DISTANCE) {
                        continue 2;
                    }
                }
                $perDocPositions[$docId][] = $chunkIndex;
            }

            $selected[] = (string) $chunkId;
            $perDocCount[$docId] = ($perDocCount[$docId] ?? 0) + 1;

            if (\count($selected) >= $limit) {
                break;
            }
        }

        return $selected;
    }
}