fix retrieve final

This commit is contained in:
team 1
2026-04-24 12:02:34 +02:00
parent 66f09e83ca
commit 868f9a8857
3 changed files with 252 additions and 5 deletions

View File

@@ -655,12 +655,15 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return [];
}
$queryTokens = $this->buildExactDocumentSelectionTokens($prompt);
$queryTokens = $this->expandExactSelectionTokenVariants(
$this->buildExactDocumentSelectionTokens($prompt)
);
if ($queryTokens === []) {
return $this->firstChunkIdsFromRows($orderedRows, $max);
}
$detailFocus = $this->buildExactDocumentDetailFocus($prompt);
$scored = [];
foreach ($orderedRows as $order => $row) {
@@ -672,7 +675,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
}
$haystack = $this->normalizeText($this->extractDocumentTitle($row) . ' ' . $text);
$haystackTokens = array_fill_keys($this->tokenizeText($haystack), true);
$haystackTokens = array_fill_keys(
$this->expandExactSelectionTokenVariants($this->tokenizeText($haystack)),
true
);
$score = 0.0;
foreach ($queryTokens as $token) {
@@ -693,6 +699,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$score += 2.0;
}
$score += $this->scoreExactDocumentDetailFocus($detailFocus, $haystack, $text);
// Keep early chunks slightly competitive for overview facts,
// without letting them hide strongly matching detail chunks.
$score += max(0.0, 1.0 - ($order * 0.05));
@@ -815,10 +823,143 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return array_values(array_unique($out));
}
/**
 * Expands a token list into the tokens plus every variant derived for them.
 *
 * Each input token is passed through exactSelectionTokenVariants() and the
 * results are concatenated in input order. Empty strings are dropped,
 * duplicates are removed (first occurrence wins) and the result is
 * re-indexed as a list.
 *
 * @param string[] $tokens
 * @return string[]
 */
private function expandExactSelectionTokenVariants(array $tokens): array
{
    $expanded = [];

    foreach ($tokens as $candidate) {
        // Append all variants of this token while keeping their order.
        array_push($expanded, ...$this->exactSelectionTokenVariants($candidate));
    }

    $nonEmpty = array_filter(
        $expanded,
        static fn(string $variant): bool => $variant !== ''
    );

    return array_values(array_unique($nonEmpty));
}
/**
 * Derives lookup variants for a single token.
 *
 * The trimmed token itself always comes first. For tokens of at least five
 * characters, every matching suffix from a fixed list is stripped and the
 * remaining stem is added when it is at least three characters long. Tokens
 * starting with one of a few fixed prefixes additionally get that prefix's
 * base form(s). The result is de-duplicated and re-indexed.
 *
 * @return string[] Empty when the trimmed token is empty.
 */
private function exactSelectionTokenVariants(string $token): array
{
    $token = trim($token);
    if ($token === '') {
        return [];
    }

    $collected = [$token];
    $tokenLength = mb_strlen($token, 'UTF-8');

    if ($tokenLength >= 5) {
        $suffixes = ['typen', 'innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'];

        // No early exit: every suffix that matches contributes its own stem.
        foreach ($suffixes as $ending) {
            if (str_ends_with($token, $ending)) {
                $base = mb_substr($token, 0, $tokenLength - mb_strlen($ending, 'UTF-8'), 'UTF-8');
                if (mb_strlen($base, 'UTF-8') >= 3) {
                    $collected[] = $base;
                }
            }
        }
    }

    // Prefix => base forms appended when the token starts with that prefix.
    $prefixForms = [
        'indikator' => ['indikator', 'indikatortyp'],
        'grenzwert' => ['grenzwert'],
        'messbereich' => ['messbereich'],
        'testomat' => ['testomat'],
    ];

    foreach ($prefixForms as $prefix => $forms) {
        if (str_starts_with($token, $prefix)) {
            array_push($collected, ...$forms);
        }
    }

    return array_values(array_unique($collected));
}
/**
 * Determines which kind of detail the prompt is asking about.
 *
 * The prompt is normalized, tokenized and expanded into token variants.
 * "asks_indicator" is true when one of the indicator/reagent terms is among
 * those variants, or when the normalized prompt contains the phrase
 * "mit welchem" or "womit".
 *
 * @return array{asks_indicator:bool}
 */
private function buildExactDocumentDetailFocus(string $prompt): array
{
    $normalizedPrompt = $this->normalizeText($prompt);
    $variants = $this->expandExactSelectionTokenVariants(
        $this->tokenizeText($normalizedPrompt)
    );
    $variantLookup = array_fill_keys($variants, true);

    $mentionsIndicatorTerm = false;
    foreach (['indikator', 'indikatortyp', 'reagenz', 'reagens'] as $term) {
        if (isset($variantLookup[$term])) {
            $mentionsIndicatorTerm = true;
            break;
        }
    }

    $usesIndicatorPhrase = str_contains($normalizedPrompt, 'mit welchem')
        || str_contains($normalizedPrompt, 'womit');

    return [
        'asks_indicator' => $mentionsIndicatorTerm || $usesIndicatorPhrase,
    ];
}
/**
 * Computes a bonus score for chunks that look like indicator detail content.
 *
 * Returns 0.0 unless the focus has "asks_indicator" set. Otherwise the
 * following independent signals are summed:
 *  - +14 when the raw text mentions "indikatortypen" / "indikatorvarianten";
 *  - +10 when the raw text has a pipe-delimited "typ" or "indikator" cell
 *    followed by a cell starting with "grenzwert", "messbereich" or "bereich";
 *  - +8 when the raw text has a pipe-delimited cell of up to four letters,
 *    two to four digits and an optional letter, followed by a cell starting
 *    with a digit;
 *  - +5 when the normalized haystack contains "indikator" together with
 *    "grenzwert", "messbereich" or "bereich".
 *
 * The intent is to favour detail chunks for follow-up questions such as
 * "which indicator measures that value".
 *
 * @param array{asks_indicator:bool} $detailFocus
 */
private function scoreExactDocumentDetailFocus(array $detailFocus, string $normalizedHaystack, string $rawText): float
{
    if (!$detailFocus['asks_indicator']) {
        return 0.0;
    }

    // Raw-text pattern => bonus granted when the pattern matches.
    $rawTextSignals = [
        '/verf(?:ü|ue)gbare\s+indikatortypen|indikatortypen|indikatorvarianten/iu' => 14.0,
        '/\|\s*(?:typ|indikator)\s*\|\s*(?:grenzwert|messbereich|bereich)/iu' => 10.0,
        '/\|\s*[A-Z]{0,4}\s*\d{2,4}\s*[A-Z]?\s*\|\s*\d/iu' => 8.0,
    ];

    $total = 0.0;
    foreach ($rawTextSignals as $pattern => $bonus) {
        if (preg_match($pattern, $rawText) === 1) {
            $total += $bonus;
        }
    }

    $mentionsRange = str_contains($normalizedHaystack, 'grenzwert')
        || str_contains($normalizedHaystack, 'messbereich')
        || str_contains($normalizedHaystack, 'bereich');

    if ($mentionsRange && str_contains($normalizedHaystack, 'indikator')) {
        $total += 5.0;
    }

    return $total;
}
private function isExactDetailToken(string $token): bool
{
return in_array($token, [
'indikator', 'indikatortyp', 'reagenz', 'reagens', 'grenzwert',
'indikator', 'indikatoren', 'indikatortyp', 'indikatortypen', 'reagenz', 'reagens', 'grenzwert',
'messbereich', 'bereich', 'wasserhaerte', 'wasserhärte',
'resthaerte', 'resthärte', 'haerte', 'härte', 'aufloesung',
'auflösung', 'schnittstelle', 'relais', 'fehlercode', 'code',
@@ -830,7 +971,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
{
return in_array($token, [
'vorherige', 'vorheriger', 'nutzerfrage', 'aktuelle',
'folgefrage', 'frage', 'antwort', 'welche', 'welcher', 'welches',
'folgefrage', 'frage', 'antwort', 'technische', 'referenzanker',
'referenzaufloesung', 'referenzauflösung', 'faktenquelle', 'keine',
'welche', 'welcher', 'welches',
'welchem', 'welchen', 'wird', 'werden', 'wurde', 'kann', 'koennen',
'können', 'mit', 'der', 'die', 'das', 'den', 'dem', 'ein', 'eine',
'einer', 'eines', 'ist', 'sind', 'was', 'wie', 'wo', 'zum', 'zur',