fix retrieve final technical questions v4
This commit is contained in:
@@ -211,7 +211,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
if ($exactDocumentMatch !== null) {
|
||||
$selectedChunkIds = $this->selectExactDocumentChunkIds(
|
||||
$exactDocumentMatch['rows'],
|
||||
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS))
|
||||
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)),
|
||||
$prompt
|
||||
);
|
||||
|
||||
if ($selectedChunkIds !== []) {
|
||||
@@ -631,15 +632,127 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
/**
|
||||
* Selects a coherent chunk window from one exact document title match.
|
||||
* Selects a coherent chunk window from one exact document-title match.
|
||||
*
|
||||
* For exact product questions we prefer a pure document slice over
|
||||
* cross-document fusion to avoid mixing neighbouring product families.
|
||||
* A pure first-N slice is too weak for follow-up questions: the title may
|
||||
* identify the right document, while the current follow-up asks for a
|
||||
* specific detail from a later chunk (for example an indicator, range,
|
||||
* threshold, interface, relay, or error code).
|
||||
*
|
||||
* Therefore this method stays inside the matched document, but ranks its
|
||||
* chunks by overlap with the effective retrieval query before sorting the
|
||||
* final selection back into document order for prompt readability.
|
||||
*
|
||||
* @param array<string,array<string,mixed>> $rows
|
||||
* @return string[]
|
||||
*/
|
||||
private function selectExactDocumentChunkIds(array $rows, int $limit): array
|
||||
private function selectExactDocumentChunkIds(array $rows, int $limit, string $prompt): array
{
    // Rank the chunks of one matched document against the prompt, keep the
    // best ones (capped by $limit and EXACT_DOCUMENT_MAX_CHUNKS) and return
    // their ids sorted back by chunk index.
    $orderedRows = $this->sortRowsByChunkIndex($rows);
    $max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);

    if ($orderedRows === [] || $max <= 0) {
        return [];
    }

    $queryTokens = $this->buildExactDocumentSelectionTokens($prompt);

    if ($queryTokens === []) {
        // The prompt yields nothing to rank by: fall back to the leading chunks.
        return $this->firstChunkIdsFromRows($orderedRows, $max);
    }

    // A token's weight depends only on the token itself, so classify each
    // query token once here instead of once per chunk inside the loop below.
    // Tokens containing a digit weigh most, known detail vocabulary next,
    // every other token least.
    $weightedTokens = [];

    foreach ($queryTokens as $token) {
        $weightedTokens[] = [
            $token,
            match (true) {
                preg_match('/\d/u', $token) === 1 => 6.0,
                $this->isExactDetailToken($token) => 5.0,
                default => 2.0,
            },
        ];
    }

    $scored = [];

    foreach ($orderedRows as $order => $row) {
        $chunkId = $row['chunk_id'] ?? null;
        $text = trim((string)($row['text'] ?? ''));

        // Rows without a usable id or without text cannot be selected.
        if (!is_string($chunkId) || $chunkId === '' || $text === '') {
            continue;
        }

        // Token set of "title + text" for O(1) membership checks.
        $haystack = $this->normalizeText($this->extractDocumentTitle($row) . ' ' . $text);
        $haystackTokens = array_fill_keys($this->tokenizeText($haystack), true);
        $score = 0.0;

        foreach ($weightedTokens as [$token, $weight]) {
            if (isset($haystackTokens[$token])) {
                $score += $weight;
            }
        }

        // Keep early chunks slightly competitive for overview facts,
        // without letting them hide strongly matching detail chunks.
        // The bonus starts at 1.0 and reaches 0.0 from position 20 onwards.
        $score += max(0.0, 1.0 - ($order * 0.05));

        $chunkIndex = $row['chunk_index'] ?? null;

        $scored[] = [
            'id' => $chunkId,
            'score' => $score,
            'order' => $order,
            'chunk_index' => is_int($chunkIndex) ? $chunkIndex : null,
        ];
    }

    if ($scored === []) {
        return [];
    }

    // Best score first; equal scores keep their position in $orderedRows.
    usort(
        $scored,
        static fn (array $a, array $b): int => ($b['score'] <=> $a['score']) ?: ($a['order'] <=> $b['order'])
    );

    $selected = array_slice($scored, 0, $max);

    // Restore reading order: ascending chunk_index, entries without an
    // integer chunk_index last, position in $orderedRows as tie-breaker.
    usort($selected, static function (array $a, array $b): int {
        $aIndex = $a['chunk_index'];
        $bIndex = $b['chunk_index'];

        if ($aIndex === null || $bIndex === null) {
            return match (true) {
                $aIndex !== null => -1,
                $bIndex !== null => 1,
                default => $a['order'] <=> $b['order'],
            };
        }

        return ($aIndex <=> $bIndex) ?: ($a['order'] <=> $b['order']);
    });

    // Every 'id' was verified to be a non-empty string when it was collected.
    return array_column($selected, 'id');
}
|
||||
|
||||
/**
|
||||
* @param array<string,array<string,mixed>> $rows
|
||||
* @return array<int,array<string,mixed>>
|
||||
*/
|
||||
private function sortRowsByChunkIndex(array $rows): array
|
||||
{
|
||||
uasort($rows, static function (array $a, array $b): int {
|
||||
$aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX;
|
||||
@@ -652,8 +765,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? ''));
|
||||
});
|
||||
|
||||
return array_values($rows);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array<int,array<string,mixed>> $rows
|
||||
* @return string[]
|
||||
*/
|
||||
private function firstChunkIdsFromRows(array $rows, int $limit): array
|
||||
{
|
||||
$selected = [];
|
||||
$max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
|
||||
|
||||
foreach ($rows as $row) {
|
||||
$chunkId = $row['chunk_id'] ?? null;
|
||||
@@ -665,7 +786,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$selected[] = $chunkId;
|
||||
|
||||
if (count($selected) >= $max) {
|
||||
if (count($selected) >= $limit) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -673,6 +794,50 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $selected;
|
||||
}
|
||||
|
||||
/**
 * Derives the query tokens used to rank chunks inside an exact document match.
 *
 * The prompt is normalized and tokenized. A token is kept when it is not
 * flagged by isGenericExactSelectionToken() and either contains a digit or
 * is at least three characters long. Duplicates are removed, keeping the
 * first occurrence, and the result is re-indexed as a list.
 *
 * @return string[]
 */
private function buildExactDocumentSelectionTokens(string $prompt): array
{
    $candidates = $this->tokenizeText($this->normalizeText($prompt));

    // Generic words never count; short tokens only count when they carry a digit.
    $isSelective = fn (string $candidate): bool => !$this->isGenericExactSelectionToken($candidate)
        && (preg_match('/\d/u', $candidate) === 1 || mb_strlen($candidate, 'UTF-8') >= 3);

    $kept = array_filter($candidates, $isSelective);

    return array_values(array_unique($kept));
}
|
||||
|
||||
/**
 * Reports whether a token belongs to the fixed list of German detail terms
 * (for example indicator, reagent, threshold, measuring range, hardness,
 * resolution, interface, relay, error code, value).
 *
 * The comparison is exact and case-sensitive; both the umlaut spelling and
 * the "ae"/"oe" transliteration are listed for the affected words.
 */
private function isExactDetailToken(string $token): bool
{
    return match ($token) {
        'indikator', 'indikatortyp',
        'reagenz', 'reagens',
        'grenzwert',
        'messbereich', 'bereich',
        'wasserhaerte', 'wasserhärte',
        'resthaerte', 'resthärte',
        'haerte', 'härte',
        'aufloesung', 'auflösung',
        'schnittstelle', 'relais',
        'fehlercode', 'code',
        'wert', 'werte' => true,
        default => false,
    };
}
|
||||
|
||||
/**
 * Reports whether a token is on the fixed list of generic German words
 * (question/answer framing, interrogatives, auxiliary verbs, articles,
 * prepositions and conjunctions).
 *
 * The comparison is exact and case-sensitive.
 */
private function isGenericExactSelectionToken(string $token): bool
{
    // Words such as "previous", "user question", "follow-up question", "answer".
    $framing = ['vorherige', 'vorheriger', 'nutzerfrage', 'aktuelle', 'folgefrage', 'frage', 'antwort'];
    $interrogatives = ['welche', 'welcher', 'welches', 'welchem', 'welchen', 'was', 'wie', 'wo'];
    $verbs = ['wird', 'werden', 'wurde', 'kann', 'koennen', 'können', 'ist', 'sind'];
    $articles = ['der', 'die', 'das', 'den', 'dem', 'ein', 'eine', 'einer', 'eines'];
    $connectors = ['mit', 'zum', 'zur', 'fuer', 'für', 'durch', 'von', 'vom', 'und', 'oder', 'auch'];

    return in_array(
        $token,
        [...$framing, ...$interrogatives, ...$verbs, ...$articles, ...$connectors],
        true
    );
}
|
||||
|
||||
/**
|
||||
* Builds synthetic scores for exact-title fast-path selections.
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user