fix retrieve final technical questions v4
This commit is contained in:
@@ -54,6 +54,8 @@ final readonly class AgentRunner
|
|||||||
$optimizedShopQuery = '';
|
$optimizedShopQuery = '';
|
||||||
$shopSearchQuery = '';
|
$shopSearchQuery = '';
|
||||||
$commerceIntent = CommerceIntentLite::NONE;
|
$commerceIntent = CommerceIntentLite::NONE;
|
||||||
|
$knowledgeRetrievalPrompt = $prompt;
|
||||||
|
$usedFollowUpRetrievalContext = false;
|
||||||
$commerceHistoryContext = '';
|
$commerceHistoryContext = '';
|
||||||
$attemptedShopRepair = false;
|
$attemptedShopRepair = false;
|
||||||
$usedShopRepair = false;
|
$usedShopRepair = false;
|
||||||
@@ -77,14 +79,30 @@ final readonly class AgentRunner
|
|||||||
$this->addSource($sources, $this->agentRunnerConfig->getExternalUrlSourceLabel());
|
$this->addSource($sources, $this->agentRunnerConfig->getExternalUrlSourceLabel());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$commerceIntent = $this->detectCommerceIntent($prompt);
|
||||||
|
|
||||||
yield $this->systemMsg($this->agentRunnerConfig->getRetrieveKnowledgeMessage(), 'think');
|
yield $this->systemMsg($this->agentRunnerConfig->getRetrieveKnowledgeMessage(), 'think');
|
||||||
|
|
||||||
$knowledgeChunks = $this->retriever->retrieve($prompt);
|
$knowledgeRetrievalPrompt = $this->buildKnowledgeRetrievalPrompt(
|
||||||
|
prompt: $prompt,
|
||||||
|
userId: $userId,
|
||||||
|
commerceIntent: $commerceIntent
|
||||||
|
);
|
||||||
|
$usedFollowUpRetrievalContext = $knowledgeRetrievalPrompt !== $prompt;
|
||||||
|
|
||||||
|
$knowledgeChunks = $this->retriever->retrieve($knowledgeRetrievalPrompt);
|
||||||
if ($knowledgeChunks !== []) {
|
if ($knowledgeChunks !== []) {
|
||||||
$this->addSource($sources, $this->agentRunnerConfig->getRagKnowledgeSourceLabel());
|
$this->addSource($sources, $this->agentRunnerConfig->getRagKnowledgeSourceLabel());
|
||||||
}
|
}
|
||||||
|
|
||||||
$commerceIntent = $this->detectCommerceIntent($prompt);
|
if ($usedFollowUpRetrievalContext) {
|
||||||
|
$this->agentLogger->info('Knowledge retrieval used follow-up context', [
|
||||||
|
'userId' => $userId,
|
||||||
|
'prompt' => $prompt,
|
||||||
|
'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt,
|
||||||
|
'commerceIntent' => $commerceIntent,
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
|
||||||
if ($this->isCommerceIntent($commerceIntent)) {
|
if ($this->isCommerceIntent($commerceIntent)) {
|
||||||
yield $this->systemMsg($this->agentRunnerConfig->getOptimizeSearchMessage(), 'think');
|
yield $this->systemMsg($this->agentRunnerConfig->getOptimizeSearchMessage(), 'think');
|
||||||
@@ -171,6 +189,8 @@ final readonly class AgentRunner
|
|||||||
'finalPrompt' => $finalPrompt,
|
'finalPrompt' => $finalPrompt,
|
||||||
'optimizedShopQuery' => $optimizedShopQuery,
|
'optimizedShopQuery' => $optimizedShopQuery,
|
||||||
'shopSearchQuery' => $shopSearchQuery,
|
'shopSearchQuery' => $shopSearchQuery,
|
||||||
|
'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt,
|
||||||
|
'usedFollowUpRetrievalContext' => $usedFollowUpRetrievalContext,
|
||||||
'primaryShopResultsCount' => count($primaryShopResults),
|
'primaryShopResultsCount' => count($primaryShopResults),
|
||||||
'shopResultsCount' => count($shopResults),
|
'shopResultsCount' => count($shopResults),
|
||||||
'attemptedShopRepair' => $attemptedShopRepair,
|
'attemptedShopRepair' => $attemptedShopRepair,
|
||||||
@@ -228,6 +248,8 @@ final readonly class AgentRunner
|
|||||||
'usedShopRepair' => $usedShopRepair,
|
'usedShopRepair' => $usedShopRepair,
|
||||||
'shopRepairQueries' => $shopRepairQueries,
|
'shopRepairQueries' => $shopRepairQueries,
|
||||||
'knowledgeChunkCount' => count($knowledgeChunks),
|
'knowledgeChunkCount' => count($knowledgeChunks),
|
||||||
|
'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt,
|
||||||
|
'usedFollowUpRetrievalContext' => $usedFollowUpRetrievalContext,
|
||||||
'hasUrlContent' => $urlContent !== '',
|
'hasUrlContent' => $urlContent !== '',
|
||||||
'usedOptimizedShopQuery' => $optimizedShopQuery !== '',
|
'usedOptimizedShopQuery' => $optimizedShopQuery !== '',
|
||||||
'optimizedShopQuery' => $optimizedShopQuery,
|
'optimizedShopQuery' => $optimizedShopQuery,
|
||||||
@@ -258,6 +280,159 @@ final readonly class AgentRunner
|
|||||||
|| $commerceIntent === CommerceIntentLite::ADVISORY_PRODUCT_SEARCH;
|
|| $commerceIntent === CommerceIntentLite::ADVISORY_PRODUCT_SEARCH;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds the query text that is handed to the knowledge retriever.
 *
 * The prompt is returned unchanged unless it qualifies as a follow-up
 * question (see shouldUseFollowUpContextForKnowledgeRetrieval()) and at
 * least one earlier user question can be extracted from the stored user
 * context. In that case up to two earlier questions are prepended, one per
 * line, followed by the current prompt.
 *
 * @param string $prompt         Current user prompt.
 * @param string $userId         User whose stored context is consulted.
 * @param string $commerceIntent Commerce intent detected for the prompt.
 * @return string The original prompt or the context-enriched retrieval query.
 */
private function buildKnowledgeRetrievalPrompt(
    string $prompt,
    string $userId,
    string $commerceIntent
): string {
    if (!$this->shouldUseFollowUpContextForKnowledgeRetrieval($prompt, $commerceIntent)) {
        return $prompt;
    }

    $earlierQuestions = $this->extractRecentUserQuestions(
        $this->contextService->buildUserContextWithinBudget($userId, 3000),
        2
    );

    if ($earlierQuestions === []) {
        return $prompt;
    }

    // The labels are part of the retrieval query itself and therefore stay German.
    $parts = array_map(
        static fn(string $question): string => 'Vorherige Nutzerfrage: ' . $question,
        $earlierQuestions
    );
    $parts[] = 'Aktuelle Folgefrage: ' . $prompt;

    return implode("\n", $parts);
}
|
||||||
|
|
||||||
|
/**
 * Decides whether earlier user questions should be added to the
 * knowledge-retrieval query for this prompt.
 *
 * Follow-up context is used only when all of the following hold:
 *  - the detected intent is not a commerce intent,
 *  - the normalized prompt is not empty,
 *  - the normalized prompt carries no explicit commercial signal,
 *  - the normalized prompt contains a strong follow-up reference.
 *
 * Prompt length does not influence the result: a strong follow-up reference
 * is required for short and long prompts alike, so it is evaluated exactly
 * once.
 *
 * @param string $prompt         Raw user prompt.
 * @param string $commerceIntent Commerce intent detected for the prompt.
 * @return bool True when the retrieval query should include earlier questions.
 */
private function shouldUseFollowUpContextForKnowledgeRetrieval(string $prompt, string $commerceIntent): bool
{
    if ($this->isCommerceIntent($commerceIntent)) {
        return false;
    }

    $normalized = $this->normalizeFollowUpText($prompt);

    if ($normalized === '' || $this->containsExplicitCommercialFollowUpSignal($normalized)) {
        return false;
    }

    return $this->containsStrongFollowUpReference($normalized);
}
|
||||||
|
|
||||||
|
/**
 * Checks whether the text contains a phrase that points back to something
 * mentioned earlier in the conversation.
 *
 * The patterns cover German back-references such as "der wert", "damit",
 * "dazu" and "welcher indikator". They are written in lower case, so the
 * caller is expected to pass text that was lower-cased beforehand (the
 * visible caller passes the output of normalizeFollowUpText()).
 *
 * @param string $normalized Lower-cased, punctuation-free prompt text.
 * @return bool True as soon as one pattern matches.
 */
private function containsStrongFollowUpReference(string $normalized): bool
{
    $referencePatterns = [
        '/\bder\s+wert\b/u',
        '/\bdieser\s+wert\b/u',
        '/\bdiesen\s+wert\b/u',
        '/\bdem\s+wert\b/u',
        '/\bmit\s+welche(?:m|n|r)?\b/u',
        '/\bwomit\b/u',
        '/\bdamit\b/u',
        '/\bdafuer\b/u',
        '/\bdafür\b/u',
        '/\bdazu\b/u',
        '/\bdaraus\b/u',
        '/\bwelche(?:r|s|m|n)?\s+indikator\b/u',
        '/\bwelche(?:r|s|m|n)?\s+indikatortyp\b/u',
        '/\bindikator\s+(?:dafuer|dafür|dazu|hierfuer|hierfür)\b/u',
        '/\bwelche(?:r|s|m|n)?\s+bereich\b/u',
        '/\bwelche(?:r|s|m|n)?\s+messbereich\b/u',
        '/\bwelche(?:r|s|m|n)?\s+grenzwert\b/u',
    ];

    $found = false;

    foreach ($referencePatterns as $regex) {
        // preg_match() yields 1 only on a real match; 0 and false both mean "no".
        if (preg_match($regex, $normalized) === 1) {
            $found = true;
            break;
        }
    }

    return $found;
}
|
||||||
|
|
||||||
|
/**
 * Checks whether the text contains a keyword that marks the prompt as a
 * shop / purchase question.
 *
 * Matching is a plain substring search, so a keyword also matches when it
 * is embedded in a longer word (for example "sku" inside "diskussion").
 *
 * @param string $normalized Lower-cased prompt text.
 * @return bool True when at least one commercial keyword occurs.
 */
private function containsExplicitCommercialFollowUpSignal(string $normalized): bool
{
    $keywords = [
        'shop', 'preis', 'preise', 'kostet', 'kosten', 'kaufen', 'bestellen',
        'warenkorb', 'lieferzeit', 'verfuegbar', 'verfügbar', 'lager', 'url',
        'link', 'artikelnummer', 'sku', 'produktnummer',
    ];

    $hits = array_filter(
        $keywords,
        static fn(string $keyword): bool => str_contains($normalized, $keyword)
    );

    return $hits !== [];
}
|
||||||
|
|
||||||
|
/**
 * Extracts the most recent user questions from a rendered conversation
 * history.
 *
 * Every line that starts with "Question:" (case-insensitive) contributes
 * one question. Each question is cleaned with sanitizeHistoryQuestion();
 * questions that end up empty are dropped.
 *
 * @param string $history Rendered history text containing "Question:" lines.
 * @param int    $limit   Maximum number of questions to return.
 * @return string[] The last $limit questions in the order they appear in
 *                  $history; empty when none are found or $limit <= 0.
 */
private function extractRecentUserQuestions(string $history, int $limit): array
{
    $history = trim($history);

    if ($history === '' || $limit <= 0) {
        return [];
    }

    // preg_match_all() returns the NUMBER of matches (or false on error), so a
    // history holding two or more questions is a success, not a failure.
    $matchCount = preg_match_all('/^Question:\s*(.+)$/mi', $history, $matches);

    if ($matchCount === false || $matchCount === 0) {
        return [];
    }

    $questions = array_values(array_filter(
        array_map(
            fn(string $question): string => $this->sanitizeHistoryQuestion($question),
            $matches[1]
        ),
        static fn(string $question): bool => $question !== ''
    ));

    // A negative offset keeps the tail of the list; an empty list stays empty.
    return array_slice($questions, -$limit);
}
|
||||||
|
|
||||||
|
/**
 * Cleans one question taken from the conversation history.
 *
 * Runs of whitespace are collapsed to a single space and the result is
 * trimmed. Questions longer than 500 characters are cut to 497 characters
 * (trailing whitespace removed) and suffixed with "...".
 *
 * @param string $question Raw question text.
 * @return string Cleaned question; empty string when nothing remains.
 */
private function sanitizeHistoryQuestion(string $question): string
{
    $collapsed = preg_replace('/\s+/u', ' ', $question);
    $clean = trim((string) $collapsed);

    // An empty string is shorter than the limit and is returned as-is.
    return mb_strlen($clean, 'UTF-8') > 500
        ? rtrim(mb_substr($clean, 0, 497, 'UTF-8')) . '...'
        : $clean;
}
|
||||||
|
|
||||||
|
/**
 * Normalizes prompt text for keyword and pattern matching.
 *
 * The text is trimmed and lower-cased, "-", "/" and "_" become spaces,
 * every remaining character that is neither a letter, a digit nor
 * whitespace becomes a space, and runs of whitespace are collapsed.
 *
 * @param string $value Raw text.
 * @return string Normalized text without leading or trailing whitespace.
 */
private function normalizeFollowUpText(string $value): string
{
    $text = strtr(
        mb_strtolower(trim($value), 'UTF-8'),
        ['-' => ' ', '/' => ' ', '_' => ' ']
    );

    // If a replacement fails (null), keep the text from the previous step.
    foreach (['/[^\p{L}\p{N}\s]+/u', '/\s+/u'] as $pattern) {
        $text = preg_replace($pattern, ' ', $text) ?? $text;
    }

    return trim($text);
}
|
||||||
|
|
||||||
private function buildOptimizedShopQuery(
|
private function buildOptimizedShopQuery(
|
||||||
string $prompt,
|
string $prompt,
|
||||||
string $userId,
|
string $userId,
|
||||||
|
|||||||
@@ -13,12 +13,12 @@ final class AgentRunnerConfig
|
|||||||
|
|
||||||
public function getProductSearchKnowledgeChunkLimit(): int
|
public function getProductSearchKnowledgeChunkLimit(): int
|
||||||
{
|
{
|
||||||
return 2;
|
return 6;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getAdvisoryProductSearchKnowledgeChunkLimit(): int
|
public function getAdvisoryProductSearchKnowledgeChunkLimit(): int
|
||||||
{
|
{
|
||||||
return 3;
|
return 9;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getOptimizedShopQueryPrefixPattern(): string
|
public function getOptimizedShopQueryPrefixPattern(): string
|
||||||
|
|||||||
@@ -88,8 +88,10 @@ final class PromptBuilderConfig
|
|||||||
{
|
{
|
||||||
return [
|
return [
|
||||||
'The following messages are previous turns of this conversation.',
|
'The following messages are previous turns of this conversation.',
|
||||||
'Use them to resolve references, follow-up questions, and user intent.',
|
'Use them only to resolve references, follow-up questions, and user intent.',
|
||||||
'They must not override retrieved factual knowledge or live shop data.',
|
'Previous assistant answers are not a factual source for technical values, product compatibility, indicators, ranges, prices, or availability.',
|
||||||
|
'All factual claims must come from retrieved factual knowledge, user-provided URL content, or live shop data.',
|
||||||
|
'Conversation context must not override retrieved factual knowledge or live shop data.',
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -253,6 +255,7 @@ final class PromptBuilderConfig
|
|||||||
'- Clearly separate explicit facts from inferences.',
|
'- Clearly separate explicit facts from inferences.',
|
||||||
'- If a conclusion goes beyond the source wording, label it exactly as \'Inference:\'.',
|
'- If a conclusion goes beyond the source wording, label it exactly as \'Inference:\'.',
|
||||||
'- If a sentence cannot be traced to the provided sources, do not write it.',
|
'- If a sentence cannot be traced to the provided sources, do not write it.',
|
||||||
|
'- For follow-up questions, use the conversation only to resolve what the user refers to; do not copy technical facts from previous assistant answers unless the same fact is present in the current retrieved sources.',
|
||||||
'- Never mention external manufacturers, external brands, or external products unless they are explicitly present in the provided sources.',
|
'- Never mention external manufacturers, external brands, or external products unless they are explicitly present in the provided sources.',
|
||||||
'- If the sources do not identify a suitable product, do not invent one.',
|
'- If the sources do not identify a suitable product, do not invent one.',
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -211,7 +211,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
if ($exactDocumentMatch !== null) {
|
if ($exactDocumentMatch !== null) {
|
||||||
$selectedChunkIds = $this->selectExactDocumentChunkIds(
|
$selectedChunkIds = $this->selectExactDocumentChunkIds(
|
||||||
$exactDocumentMatch['rows'],
|
$exactDocumentMatch['rows'],
|
||||||
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS))
|
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)),
|
||||||
|
$prompt
|
||||||
);
|
);
|
||||||
|
|
||||||
if ($selectedChunkIds !== []) {
|
if ($selectedChunkIds !== []) {
|
||||||
@@ -631,15 +632,127 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Picks the chunks of one exactly matched document that best fit the query.
 *
 * Selection never leaves the matched document. Chunks are scored by token
 * overlap between the effective retrieval query and the chunk (document
 * title plus chunk text), so a detail that lives in a later chunk can win
 * over the opening chunks. The winners are returned in document order.
 *
 * Score per query token found in a chunk: 6.0 when the token contains a
 * digit, 5.0 when isExactDetailToken() accepts it, otherwise 2.0. Every
 * chunk additionally receives a position bonus that starts at 1.0 for the
 * first chunk and shrinks by 0.05 per position, never dropping below 0.
 *
 * When the query yields no usable tokens, the first chunks in document
 * order are returned instead.
 *
 * @param array<string,array<string,mixed>> $rows   Chunk rows of the matched document.
 * @param int                               $limit  Requested maximum; additionally capped by EXACT_DOCUMENT_MAX_CHUNKS.
 * @param string                            $prompt Effective retrieval query.
 * @return string[] Selected chunk ids.
 */
private function selectExactDocumentChunkIds(array $rows, int $limit, string $prompt): array
{
    $documentRows = $this->sortRowsByChunkIndex($rows);
    $quota = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);

    if ($quota <= 0 || $documentRows === []) {
        return [];
    }

    $wantedTokens = $this->buildExactDocumentSelectionTokens($prompt);

    if ($wantedTokens === []) {
        return $this->firstChunkIdsFromRows($documentRows, $quota);
    }

    $candidates = [];

    foreach ($documentRows as $position => $row) {
        $chunkId = $row['chunk_id'] ?? null;
        $body = trim((string)($row['text'] ?? ''));

        // Rows without a usable id or without text cannot be selected.
        if (!is_string($chunkId) || $chunkId === '' || $body === '') {
            continue;
        }

        $chunkTokens = array_fill_keys(
            $this->tokenizeText(
                $this->normalizeText($this->extractDocumentTitle($row) . ' ' . $body)
            ),
            true
        );

        $points = 0.0;

        foreach ($wantedTokens as $token) {
            if (isset($chunkTokens[$token])) {
                $points += match (true) {
                    preg_match('/\d/u', $token) === 1 => 6.0,
                    $this->isExactDetailToken($token) => 5.0,
                    default => 2.0,
                };
            }
        }

        // Small head start for early chunks so overview facts stay in reach
        // without outranking chunks that match the query strongly.
        $points += max(0.0, 1.0 - ($position * 0.05));

        $candidates[] = [
            'id' => $chunkId,
            'score' => $points,
            'order' => $position,
            'chunk_index' => is_int($row['chunk_index'] ?? null) ? (int)$row['chunk_index'] : null,
        ];
    }

    if ($candidates === []) {
        return [];
    }

    // Best score first; equal scores keep document order.
    usort(
        $candidates,
        static fn(array $left, array $right): int
            => [$right['score'], $left['order']] <=> [$left['score'], $right['order']]
    );

    $winners = array_slice($candidates, 0, $quota);

    // Back into document order: known chunk indexes ascending, rows without
    // an index last, remaining ties by original position.
    usort(
        $winners,
        static fn(array $left, array $right): int
            => [$left['chunk_index'] === null, $left['chunk_index'] ?? 0, $left['order']]
                <=> [$right['chunk_index'] === null, $right['chunk_index'] ?? 0, $right['order']]
    );

    return array_column($winners, 'id');
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param array<string,array<string,mixed>> $rows
|
||||||
|
* @return array<int,array<string,mixed>>
|
||||||
|
*/
|
||||||
|
private function sortRowsByChunkIndex(array $rows): array
|
||||||
{
|
{
|
||||||
uasort($rows, static function (array $a, array $b): int {
|
uasort($rows, static function (array $a, array $b): int {
|
||||||
$aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX;
|
$aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX;
|
||||||
@@ -652,8 +765,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? ''));
|
return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? ''));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
return array_values($rows);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param array<int,array<string,mixed>> $rows
|
||||||
|
* @return string[]
|
||||||
|
*/
|
||||||
|
private function firstChunkIdsFromRows(array $rows, int $limit): array
|
||||||
|
{
|
||||||
$selected = [];
|
$selected = [];
|
||||||
$max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
|
|
||||||
|
|
||||||
foreach ($rows as $row) {
|
foreach ($rows as $row) {
|
||||||
$chunkId = $row['chunk_id'] ?? null;
|
$chunkId = $row['chunk_id'] ?? null;
|
||||||
@@ -665,7 +786,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
|
|
||||||
$selected[] = $chunkId;
|
$selected[] = $chunkId;
|
||||||
|
|
||||||
if (count($selected) >= $max) {
|
if (count($selected) >= $limit) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -673,6 +794,50 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return $selected;
|
return $selected;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Derives the tokens used to score chunks of an exactly matched document.
 *
 * The prompt is normalized and tokenized; tokens rejected by
 * isGenericExactSelectionToken() are dropped. Of the rest, a token is kept
 * when it contains a digit or is at least three characters long.
 *
 * @param string $prompt Effective retrieval query.
 * @return string[] Unique tokens in order of first appearance.
 */
private function buildExactDocumentSelectionTokens(string $prompt): array
{
    $kept = array_filter(
        $this->tokenizeText($this->normalizeText($prompt)),
        fn(string $token): bool => !$this->isGenericExactSelectionToken($token)
            && (preg_match('/\d/u', $token) === 1 || mb_strlen($token, 'UTF-8') >= 3)
    );

    return array_values(array_unique($kept));
}
|
||||||
|
|
||||||
|
/**
 * Tells whether a token names a technical detail (indicator, range,
 * threshold, hardness, resolution, interface, relay, error code, value).
 *
 * The comparison is exact and case-sensitive; the list holds lower-case
 * German terms in both umlaut and transliterated spelling.
 *
 * @param string $token Single query token.
 * @return bool True when the token is one of the known detail terms.
 */
private function isExactDetailToken(string $token): bool
{
    $detailTerms = [
        'indikator', 'indikatortyp', 'reagenz', 'reagens', 'grenzwert',
        'messbereich', 'bereich', 'wasserhaerte', 'wasserhärte',
        'resthaerte', 'resthärte', 'haerte', 'härte', 'aufloesung',
        'auflösung', 'schnittstelle', 'relais', 'fehlercode', 'code',
        'wert', 'werte',
    ];

    return array_search($token, $detailTerms, true) !== false;
}
|
||||||
|
|
||||||
|
/**
 * Tells whether a token is too generic to help rank chunks.
 *
 * The list contains German function words, question words and the label
 * words "vorherige", "nutzerfrage", "aktuelle", "folgefrage". The
 * comparison is exact and case-sensitive.
 *
 * @param string $token Single query token.
 * @return bool True when the token should be ignored for chunk scoring.
 */
private function isGenericExactSelectionToken(string $token): bool
{
    $ignored = array_fill_keys([
        'vorherige', 'vorheriger', 'nutzerfrage', 'aktuelle',
        'folgefrage', 'frage', 'antwort', 'welche', 'welcher', 'welches',
        'welchem', 'welchen', 'wird', 'werden', 'wurde', 'kann', 'koennen',
        'können', 'mit', 'der', 'die', 'das', 'den', 'dem', 'ein', 'eine',
        'einer', 'eines', 'ist', 'sind', 'was', 'wie', 'wo', 'zum', 'zur',
        'fuer', 'für', 'durch', 'von', 'vom', 'und', 'oder', 'auch',
    ], true);

    return isset($ignored[$token]);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds synthetic scores for exact-title fast-path selections.
|
* Builds synthetic scores for exact-title fast-path selections.
|
||||||
*
|
*
|
||||||
|
|||||||
Reference in New Issue
Block a user