fix retrieve final technical questions v4

This commit is contained in:
team 1
2026-04-24 11:49:02 +02:00
parent 8a7cb89c5d
commit 66f09e83ca
4 changed files with 356 additions and 13 deletions

View File

@@ -211,7 +211,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
if ($exactDocumentMatch !== null) {
$selectedChunkIds = $this->selectExactDocumentChunkIds(
$exactDocumentMatch['rows'],
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS))
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)),
$prompt
);
if ($selectedChunkIds !== []) {
@@ -631,15 +632,127 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
}
/**
 * Selects a coherent chunk window from one exact document-title match.
 *
 * For exact product questions we prefer a pure document slice over
 * cross-document fusion to avoid mixing neighbouring product families.
 * A pure first-N slice is too weak for follow-up questions: the title may
 * identify the right document, while the current follow-up asks for a
 * specific detail from a later chunk (for example an indicator, range,
 * threshold, interface, relay, or error code).
 *
 * Therefore this method stays inside the matched document, but ranks its
 * chunks by overlap with the effective retrieval query before sorting the
 * final selection back into document order for prompt readability.
 *
 * Scoring per chunk: every distinct query token that also occurs in the
 * chunk's normalized "title + text" adds 6.0 when the token contains a
 * digit, 5.0 when isExactDetailToken() accepts it, and 2.0 otherwise. A
 * position bonus of max(0, 1 - 0.05 * position) is added on top.
 *
 * Rows without a non-empty string `chunk_id` or with blank `text` are
 * ignored. When the prompt yields no usable tokens, the first chunks of
 * the document are returned instead.
 *
 * @param array<string,array<string,mixed>> $rows Rows of the matched document
 *                                                (reads `chunk_id`, `text`, `chunk_index`).
 * @param int $limit Requested maximum; additionally capped by
 *                   NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS.
 * @param string $prompt Query text whose tokens drive the chunk ranking.
 * @return string[] Selected chunk ids, ordered by chunk index (rows without
 *                  an integer chunk index come last).
 */
private function selectExactDocumentChunkIds(array $rows, int $limit, string $prompt): array
{
    $orderedRows = $this->sortRowsByChunkIndex($rows);
    $max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
    if ($orderedRows === [] || $max <= 0) {
        return [];
    }

    $queryTokens = $this->buildExactDocumentSelectionTokens($prompt);
    if ($queryTokens === []) {
        return $this->firstChunkIdsFromRows($orderedRows, $max);
    }

    // A token's weight depends only on the token itself, so classify each
    // query token once instead of once per row.
    $tokenWeights = [];
    foreach ($queryTokens as $token) {
        $tokenWeights[$token] = match (true) {
            preg_match('/\d/u', $token) === 1 => 6.0,
            $this->isExactDetailToken($token) => 5.0,
            default => 2.0,
        };
    }

    $scored = [];
    foreach ($orderedRows as $order => $row) {
        $chunkId = $row['chunk_id'] ?? null;
        $text = trim((string)($row['text'] ?? ''));
        if (!is_string($chunkId) || $chunkId === '' || $text === '') {
            continue;
        }

        $haystack = $this->normalizeText($this->extractDocumentTitle($row) . ' ' . $text);
        // Flip the token list into a set for O(1) membership checks.
        $haystackTokens = array_fill_keys($this->tokenizeText($haystack), true);

        $score = 0.0;
        foreach ($tokenWeights as $token => $weight) {
            if (isset($haystackTokens[$token])) {
                $score += $weight;
            }
        }

        // Keep early chunks slightly competitive for overview facts,
        // without letting them hide strongly matching detail chunks.
        $score += max(0.0, 1.0 - ($order * 0.05));

        $chunkIndex = $row['chunk_index'] ?? null;
        $scored[] = [
            'id' => $chunkId,
            'score' => $score,
            'order' => $order,
            'chunk_index' => is_int($chunkIndex) ? $chunkIndex : null,
        ];
    }

    if ($scored === []) {
        return [];
    }

    // Best score first; equal scores keep document order.
    usort(
        $scored,
        static fn (array $a, array $b): int => [$b['score'], $a['order']] <=> [$a['score'], $b['order']],
    );

    $selected = array_slice($scored, 0, $max);

    // Restore document order for the final selection: by chunk index,
    // entries without an integer chunk index last, ties by row position.
    usort($selected, static function (array $a, array $b): int {
        $aIndex = $a['chunk_index'];
        $bIndex = $b['chunk_index'];

        if ($aIndex === null && $bIndex === null) {
            return $a['order'] <=> $b['order'];
        }
        if ($aIndex === null) {
            return 1;
        }
        if ($bIndex === null) {
            return -1;
        }

        return [$aIndex, $a['order']] <=> [$bIndex, $b['order']];
    });

    // Ids were verified to be non-empty strings when the entries were built.
    return array_column($selected, 'id');
}
/**
* @param array<string,array<string,mixed>> $rows
* @return array<int,array<string,mixed>>
*/
private function sortRowsByChunkIndex(array $rows): array
{
uasort($rows, static function (array $a, array $b): int {
$aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX;
@@ -652,8 +765,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? ''));
});
return array_values($rows);
}
/**
* @param array<int,array<string,mixed>> $rows
* @return string[]
*/
private function firstChunkIdsFromRows(array $rows, int $limit): array
{
$selected = [];
$max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
foreach ($rows as $row) {
$chunkId = $row['chunk_id'] ?? null;
@@ -665,7 +786,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$selected[] = $chunkId;
if (count($selected) >= $max) {
if (count($selected) >= $limit) {
break;
}
}
@@ -673,6 +794,50 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return $selected;
}
/**
 * Derives the distinct prompt tokens used to rank chunks of an exact
 * document match.
 *
 * The prompt is normalized and tokenized. Tokens accepted by
 * isGenericExactSelectionToken() are discarded; of the rest, only tokens
 * that contain a digit or are at least three characters long are kept.
 *
 * @return string[] Distinct tokens in first-occurrence order, reindexed from 0.
 */
private function buildExactDocumentSelectionTokens(string $prompt): array
{
    $candidates = $this->tokenizeText($this->normalizeText($prompt));

    $informative = array_filter(
        $candidates,
        fn ($candidate): bool => !$this->isGenericExactSelectionToken($candidate)
            && (preg_match('/\d/u', $candidate) === 1 || mb_strlen($candidate, 'UTF-8') >= 3),
    );

    return array_values(array_unique($informative));
}
/**
 * Tells whether a token is one of the fixed detail keywords (indicator,
 * reagent, threshold, range, hardness, resolution, interface, relay,
 * error code, value) in the exact spellings listed below.
 *
 * The comparison is exact and case-sensitive; no normalization happens here.
 */
private function isExactDetailToken(string $token): bool
{
    $detailVocabulary = array_fill_keys([
        'indikator',
        'indikatortyp',
        'reagenz',
        'reagens',
        'grenzwert',
        'messbereich',
        'bereich',
        'wasserhaerte',
        'wasserhärte',
        'resthaerte',
        'resthärte',
        'haerte',
        'härte',
        'aufloesung',
        'auflösung',
        'schnittstelle',
        'relais',
        'fehlercode',
        'code',
        'wert',
        'werte',
    ], true);

    return isset($detailVocabulary[$token]);
}
/**
 * Tells whether a token is on the fixed stop list of filler words
 * (conversation-scaffolding terms, question words, auxiliaries, articles,
 * prepositions and conjunctions) in the exact spellings listed below.
 *
 * The comparison is exact and case-sensitive; no normalization happens here.
 */
private function isGenericExactSelectionToken(string $token): bool
{
    $stopWords = [
        // Conversation scaffolding.
        'vorherige', 'vorheriger', 'nutzerfrage', 'aktuelle',
        'folgefrage', 'frage', 'antwort',
        // Question words.
        'welche', 'welcher', 'welches', 'welchem', 'welchen',
        // Auxiliaries and modal verbs.
        'wird', 'werden', 'wurde', 'kann', 'koennen', 'können',
        // Articles, copulas and short function words.
        'mit', 'der', 'die', 'das', 'den', 'dem', 'ein', 'eine',
        'einer', 'eines', 'ist', 'sind', 'was', 'wie', 'wo', 'zum', 'zur',
        // Prepositions and conjunctions.
        'fuer', 'für', 'durch', 'von', 'vom', 'und', 'oder', 'auch',
    ];

    foreach ($stopWords as $stopWord) {
        if ($stopWord === $token) {
            return true;
        }
    }

    return false;
}
/**
* Builds synthetic scores for exact-title fast-path selections.
*