fix retrieve final technical questions v4
This commit is contained in:
@@ -54,6 +54,8 @@ final readonly class AgentRunner
|
||||
$optimizedShopQuery = '';
|
||||
$shopSearchQuery = '';
|
||||
$commerceIntent = CommerceIntentLite::NONE;
|
||||
$knowledgeRetrievalPrompt = $prompt;
|
||||
$usedFollowUpRetrievalContext = false;
|
||||
$commerceHistoryContext = '';
|
||||
$attemptedShopRepair = false;
|
||||
$usedShopRepair = false;
|
||||
@@ -77,14 +79,30 @@ final readonly class AgentRunner
|
||||
$this->addSource($sources, $this->agentRunnerConfig->getExternalUrlSourceLabel());
|
||||
}
|
||||
|
||||
$commerceIntent = $this->detectCommerceIntent($prompt);
|
||||
|
||||
yield $this->systemMsg($this->agentRunnerConfig->getRetrieveKnowledgeMessage(), 'think');
|
||||
|
||||
$knowledgeChunks = $this->retriever->retrieve($prompt);
|
||||
$knowledgeRetrievalPrompt = $this->buildKnowledgeRetrievalPrompt(
|
||||
prompt: $prompt,
|
||||
userId: $userId,
|
||||
commerceIntent: $commerceIntent
|
||||
);
|
||||
$usedFollowUpRetrievalContext = $knowledgeRetrievalPrompt !== $prompt;
|
||||
|
||||
$knowledgeChunks = $this->retriever->retrieve($knowledgeRetrievalPrompt);
|
||||
if ($knowledgeChunks !== []) {
|
||||
$this->addSource($sources, $this->agentRunnerConfig->getRagKnowledgeSourceLabel());
|
||||
}
|
||||
|
||||
$commerceIntent = $this->detectCommerceIntent($prompt);
|
||||
if ($usedFollowUpRetrievalContext) {
|
||||
$this->agentLogger->info('Knowledge retrieval used follow-up context', [
|
||||
'userId' => $userId,
|
||||
'prompt' => $prompt,
|
||||
'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt,
|
||||
'commerceIntent' => $commerceIntent,
|
||||
]);
|
||||
}
|
||||
|
||||
if ($this->isCommerceIntent($commerceIntent)) {
|
||||
yield $this->systemMsg($this->agentRunnerConfig->getOptimizeSearchMessage(), 'think');
|
||||
@@ -171,6 +189,8 @@ final readonly class AgentRunner
|
||||
'finalPrompt' => $finalPrompt,
|
||||
'optimizedShopQuery' => $optimizedShopQuery,
|
||||
'shopSearchQuery' => $shopSearchQuery,
|
||||
'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt,
|
||||
'usedFollowUpRetrievalContext' => $usedFollowUpRetrievalContext,
|
||||
'primaryShopResultsCount' => count($primaryShopResults),
|
||||
'shopResultsCount' => count($shopResults),
|
||||
'attemptedShopRepair' => $attemptedShopRepair,
|
||||
@@ -228,6 +248,8 @@ final readonly class AgentRunner
|
||||
'usedShopRepair' => $usedShopRepair,
|
||||
'shopRepairQueries' => $shopRepairQueries,
|
||||
'knowledgeChunkCount' => count($knowledgeChunks),
|
||||
'knowledgeRetrievalPrompt' => $knowledgeRetrievalPrompt,
|
||||
'usedFollowUpRetrievalContext' => $usedFollowUpRetrievalContext,
|
||||
'hasUrlContent' => $urlContent !== '',
|
||||
'usedOptimizedShopQuery' => $optimizedShopQuery !== '',
|
||||
'optimizedShopQuery' => $optimizedShopQuery,
|
||||
@@ -258,6 +280,159 @@ final readonly class AgentRunner
|
||||
|| $commerceIntent === CommerceIntentLite::ADVISORY_PRODUCT_SEARCH;
|
||||
}
|
||||
|
||||
/**
 * Builds the query text that is handed to the knowledge retriever.
 *
 * The prompt is returned unchanged unless it qualifies as a follow-up
 * question (see shouldUseFollowUpContextForKnowledgeRetrieval()) and the
 * conversation history yields at least one earlier user question. In that
 * case up to two earlier questions are placed in front of the prompt, one
 * labelled line each, so the retriever sees what the follow-up refers to.
 *
 * @param string $prompt         Current user prompt.
 * @param string $userId         User whose conversation history is read.
 * @param string $commerceIntent Commerce intent detected for the prompt.
 *
 * @return string The original prompt, or a newline-separated block of
 *                labelled earlier questions followed by the labelled prompt.
 */
private function buildKnowledgeRetrievalPrompt(
    string $prompt,
    string $userId,
    string $commerceIntent
): string {
    $isFollowUp = $this->shouldUseFollowUpContextForKnowledgeRetrieval($prompt, $commerceIntent);

    if ($isFollowUp === false) {
        return $prompt;
    }

    // NOTE(review): 3000 is the budget passed to the context service; its
    // unit (characters/tokens) is not visible from here - confirm.
    $recentQuestions = $this->extractRecentUserQuestions(
        $this->contextService->buildUserContextWithinBudget($userId, 3000),
        2
    );

    if ($recentQuestions === []) {
        return $prompt;
    }

    $segments = array_map(
        static fn (string $earlierQuestion): string => 'Vorherige Nutzerfrage: ' . $earlierQuestion,
        $recentQuestions
    );
    $segments[] = 'Aktuelle Folgefrage: ' . $prompt;

    return implode("\n", $segments);
}
|
||||
|
||||
/**
 * Decides whether knowledge retrieval should be enriched with earlier user questions.
 *
 * Follow-up context is used only when all of the following hold:
 *  - the detected intent is not a commerce intent,
 *  - the normalized prompt is not empty,
 *  - the prompt carries no explicit commercial keyword,
 *  - the prompt contains a strong back-reference (e.g. "der wert", "damit").
 *
 * @param string $prompt         Raw user prompt.
 * @param string $commerceIntent Commerce intent detected for the prompt.
 *
 * @return bool True when earlier questions should be prepended to the retrieval query.
 */
private function shouldUseFollowUpContextForKnowledgeRetrieval(string $prompt, string $commerceIntent): bool
{
    if ($this->isCommerceIntent($commerceIntent)) {
        return false;
    }

    $normalized = $this->normalizeFollowUpText($prompt);

    if ($normalized === '' || $this->containsExplicitCommercialFollowUpSignal($normalized)) {
        return false;
    }

    // No separate length cut-off is needed: a prompt of any length qualifies
    // exactly when it contains a strong follow-up reference, so the pattern
    // list is evaluated once and its result is the answer.
    return $this->containsStrongFollowUpReference($normalized);
}
|
||||
|
||||
/**
 * Tells whether the normalized prompt refers back to something said earlier.
 *
 * The check is a fixed list of German patterns: references to "the value",
 * pronominal adverbs (damit, dafür, dazu, ...), and "which indicator /
 * range / threshold" style questions.
 *
 * @param string $normalized Prompt text as produced by normalizeFollowUpText().
 *
 * @return bool True as soon as one pattern matches.
 */
private function containsStrongFollowUpReference(string $normalized): bool
{
    $referencePatterns = [
        // "the value" in its different cases
        '/\bder\s+wert\b/u',
        '/\bdieser\s+wert\b/u',
        '/\bdiesen\s+wert\b/u',
        '/\bdem\s+wert\b/u',
        // "with which ..." and pronominal adverbs
        '/\bmit\s+welche(?:m|n|r)?\b/u',
        '/\bwomit\b/u',
        '/\bdamit\b/u',
        '/\bdafuer\b/u',
        '/\bdafür\b/u',
        '/\bdazu\b/u',
        '/\bdaraus\b/u',
        // "which indicator / range / threshold"
        '/\bwelche(?:r|s|m|n)?\s+indikator\b/u',
        '/\bwelche(?:r|s|m|n)?\s+indikatortyp\b/u',
        '/\bindikator\s+(?:dafuer|dafür|dazu|hierfuer|hierfür)\b/u',
        '/\bwelche(?:r|s|m|n)?\s+bereich\b/u',
        '/\bwelche(?:r|s|m|n)?\s+messbereich\b/u',
        '/\bwelche(?:r|s|m|n)?\s+grenzwert\b/u',
    ];

    $hasReference = false;

    foreach ($referencePatterns as $regex) {
        if (preg_match($regex, $normalized) === 1) {
            $hasReference = true;
            break;
        }
    }

    return $hasReference;
}
|
||||
|
||||
/**
 * Tells whether the normalized prompt contains a commercial keyword.
 *
 * Matching is a plain substring test, so a keyword also hits inside longer
 * words (e.g. "kosten" inside "kostenlos", "link" inside "blinkt").
 *
 * @param string $normalized Prompt text as produced by normalizeFollowUpText().
 *
 * @return bool True when at least one keyword occurs in the text.
 */
private function containsExplicitCommercialFollowUpSignal(string $normalized): bool
{
    $keywords = [
        'shop', 'preis', 'preise', 'kostet', 'kosten', 'kaufen', 'bestellen',
        'warenkorb', 'lieferzeit', 'verfuegbar', 'verfügbar', 'lager', 'url',
        'link', 'artikelnummer', 'sku', 'produktnummer',
    ];

    $hits = array_filter(
        $keywords,
        static fn (string $keyword): bool => str_contains($normalized, $keyword)
    );

    return $hits !== [];
}
|
||||
|
||||
/**
 * Extracts the most recent user questions from a rendered conversation history.
 *
 * Every line starting with "Question:" (case-insensitive) contributes one
 * candidate. Candidates are cleaned with sanitizeHistoryQuestion(), empty
 * ones are dropped, and the last $limit remaining entries are returned in
 * their original (oldest-first) order.
 *
 * @param string $history Conversation history text.
 * @param int    $limit   Maximum number of questions to return; values <= 0 yield [].
 *
 * @return string[]
 */
private function extractRecentUserQuestions(string $history, int $limit): array
{
    $history = trim($history);

    if ($history === '' || $limit <= 0) {
        return [];
    }

    // NOTE(review): "\s*" may also consume a line break, so an empty
    // "Question:" line captures the following line - confirm against the
    // history format produced by the context service.
    $matchCount = preg_match_all('/^Question:\s*(.+)$/mi', $history, $matches);

    // preg_match_all() returns the NUMBER of matches (or false on failure),
    // so every positive count is a hit - not only exactly one.
    if ($matchCount === false || $matchCount === 0) {
        return [];
    }

    $questions = array_values(array_filter(
        array_map($this->sanitizeHistoryQuestion(...), $matches[1]),
        static fn (string $question): bool => $question !== ''
    ));

    // A negative offset keeps the newest entries; slicing an empty list yields [].
    return array_slice($questions, -$limit);
}
|
||||
|
||||
/**
 * Cleans one question taken from the conversation history.
 *
 * Runs of whitespace are collapsed to a single space and the text is
 * trimmed. Results longer than 500 characters are cut to 497 characters
 * (trailing whitespace removed) and suffixed with "...".
 *
 * @param string $question Raw question text.
 *
 * @return string Cleaned question; '' when nothing but whitespace was given.
 */
private function sanitizeHistoryQuestion(string $question): string
{
    $collapsed = trim((string) preg_replace('/\s+/u', ' ', $question));

    if ($collapsed === '' || mb_strlen($collapsed, 'UTF-8') <= 500) {
        return $collapsed;
    }

    $head = mb_substr($collapsed, 0, 497, 'UTF-8');

    return rtrim($head) . '...';
}
|
||||
|
||||
/**
 * Normalizes a prompt for the follow-up keyword and pattern checks.
 *
 * The text is trimmed and lower-cased, "-", "/" and "_" become spaces,
 * every run of characters that is neither a letter, a digit nor whitespace
 * becomes a space, and whitespace runs are collapsed to one space.
 *
 * @param string $value Raw text.
 *
 * @return string Normalized text without leading or trailing whitespace.
 */
private function normalizeFollowUpText(string $value): string
{
    $text = str_replace(
        ['-', '/', '_'],
        ' ',
        mb_strtolower(trim($value), 'UTF-8')
    );

    // If a replacement fails (preg_replace() returns null) the text is kept as it was.
    foreach (['/[^\p{L}\p{N}\s]+/u', '/\s+/u'] as $pattern) {
        $text = preg_replace($pattern, ' ', $text) ?? $text;
    }

    return trim($text);
}
|
||||
|
||||
private function buildOptimizedShopQuery(
|
||||
string $prompt,
|
||||
string $userId,
|
||||
|
||||
@@ -13,12 +13,12 @@ final class AgentRunnerConfig
|
||||
|
||||
/**
 * Knowledge chunk limit for product-search prompts.
 *
 * NOTE(review): the consumer of this limit is not visible in this excerpt.
 *
 * @return int Number of knowledge chunks.
 */
public function getProductSearchKnowledgeChunkLimit(): int
{
    return 6;
}
|
||||
|
||||
/**
 * Knowledge chunk limit for advisory product-search prompts.
 *
 * NOTE(review): the consumer of this limit is not visible in this excerpt.
 *
 * @return int Number of knowledge chunks.
 */
public function getAdvisoryProductSearchKnowledgeChunkLimit(): int
{
    return 9;
}
|
||||
|
||||
public function getOptimizedShopQueryPrefixPattern(): string
|
||||
|
||||
@@ -88,8 +88,10 @@ final class PromptBuilderConfig
|
||||
{
|
||||
return [
|
||||
'The following messages are previous turns of this conversation.',
|
||||
'Use them to resolve references, follow-up questions, and user intent.',
|
||||
'They must not override retrieved factual knowledge or live shop data.',
|
||||
'Use them only to resolve references, follow-up questions, and user intent.',
|
||||
'Previous assistant answers are not a factual source for technical values, product compatibility, indicators, ranges, prices, or availability.',
|
||||
'All factual claims must come from retrieved factual knowledge, user-provided URL content, or live shop data.',
|
||||
'Conversation context must not override retrieved factual knowledge or live shop data.',
|
||||
];
|
||||
}
|
||||
|
||||
@@ -253,6 +255,7 @@ final class PromptBuilderConfig
|
||||
'- Clearly separate explicit facts from inferences.',
|
||||
'- If a conclusion goes beyond the source wording, label it exactly as \'Inference:\'.',
|
||||
'- If a sentence cannot be traced to the provided sources, do not write it.',
|
||||
'- For follow-up questions, use the conversation only to resolve what the user refers to; do not copy technical facts from previous assistant answers unless the same fact is present in the current retrieved sources.',
|
||||
'- Never mention external manufacturers, external brands, or external products unless they are explicitly present in the provided sources.',
|
||||
'- If the sources do not identify a suitable product, do not invent one.',
|
||||
];
|
||||
|
||||
@@ -211,7 +211,8 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
if ($exactDocumentMatch !== null) {
|
||||
$selectedChunkIds = $this->selectExactDocumentChunkIds(
|
||||
$exactDocumentMatch['rows'],
|
||||
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS))
|
||||
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS)),
|
||||
$prompt
|
||||
);
|
||||
|
||||
if ($selectedChunkIds !== []) {
|
||||
@@ -631,15 +632,127 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
/**
 * Selects a coherent chunk window from one exact document-title match.
 *
 * For exact product questions we prefer a pure document slice over
 * cross-document fusion to avoid mixing neighbouring product families.
 * A pure first-N slice is too weak for follow-up questions: the title may
 * identify the right document, while the current follow-up asks for a
 * specific detail from a later chunk (for example an indicator, range,
 * threshold, interface, relay, or error code).
 *
 * Therefore this method stays inside the matched document, but ranks its
 * chunks by overlap with the effective retrieval query before sorting the
 * final selection back into document order for prompt readability.
 *
 * Score of a chunk (title plus text, normalized and tokenized):
 *  - 6.0 for each matched query token that contains a digit,
 *  - 5.0 for each matched detail token (see isExactDetailToken()),
 *  - 2.0 for each other matched query token,
 *  - plus max(0, 1 - 0.05 * position) so early chunks stay competitive.
 *
 * Rows without a non-empty string 'chunk_id' or without text are ignored.
 * When the query yields no usable tokens, the first chunks in document
 * order are returned instead.
 *
 * @param array<string,array<string,mixed>> $rows   Rows of the matched document.
 * @param int                               $limit  Upper bound; additionally capped by
 *                                                  NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS.
 * @param string                            $prompt Retrieval query used for ranking.
 *
 * @return string[] Selected chunk ids ordered by chunk_index; rows without
 *                  an integer chunk_index come last.
 */
private function selectExactDocumentChunkIds(array $rows, int $limit, string $prompt): array
{
    $orderedRows = $this->sortRowsByChunkIndex($rows);
    $max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);

    if ($orderedRows === [] || $max <= 0) {
        return [];
    }

    $queryTokens = $this->buildExactDocumentSelectionTokens($prompt);

    if ($queryTokens === []) {
        return $this->firstChunkIdsFromRows($orderedRows, $max);
    }

    // A token's weight depends only on the token itself, so classify each
    // query token once instead of once per document row.
    $tokenWeights = [];

    foreach ($queryTokens as $token) {
        $tokenWeights[$token] = match (true) {
            preg_match('/\d/u', $token) === 1 => 6.0,
            $this->isExactDetailToken($token) => 5.0,
            default => 2.0,
        };
    }

    $scored = [];

    foreach ($orderedRows as $order => $row) {
        $chunkId = $row['chunk_id'] ?? null;
        $text = trim((string) ($row['text'] ?? ''));

        if (!is_string($chunkId) || $chunkId === '' || $text === '') {
            continue;
        }

        $haystack = $this->normalizeText($this->extractDocumentTitle($row) . ' ' . $text);
        $haystackTokens = array_fill_keys($this->tokenizeText($haystack), true);
        $score = 0.0;

        foreach ($tokenWeights as $token => $weight) {
            if (isset($haystackTokens[$token])) {
                $score += $weight;
            }
        }

        // Keep early chunks slightly competitive for overview facts,
        // without letting them hide strongly matching detail chunks.
        $score += max(0.0, 1.0 - ($order * 0.05));

        $scored[] = [
            'id' => $chunkId,
            'score' => $score,
            'order' => $order,
            'chunk_index' => is_int($row['chunk_index'] ?? null) ? $row['chunk_index'] : null,
        ];
    }

    if ($scored === []) {
        return [];
    }

    // Best score first; equal scores keep document order.
    usort(
        $scored,
        static fn (array $a, array $b): int => ($b['score'] <=> $a['score']) ?: ($a['order'] <=> $b['order'])
    );

    $selected = array_slice($scored, 0, $max);

    // Back into document order: by chunk_index, entries without one last,
    // ties resolved by the position in the sorted row list.
    usort($selected, static function (array $a, array $b): int {
        $aIndex = $a['chunk_index'];
        $bIndex = $b['chunk_index'];

        if ($aIndex === null || $bIndex === null) {
            if ($aIndex === $bIndex) {
                return $a['order'] <=> $b['order'];
            }

            return $aIndex === null ? 1 : -1;
        }

        return ($aIndex <=> $bIndex) ?: ($a['order'] <=> $b['order']);
    });

    return array_column($selected, 'id');
}
|
||||
|
||||
/**
|
||||
* @param array<string,array<string,mixed>> $rows
|
||||
* @return array<int,array<string,mixed>>
|
||||
*/
|
||||
private function sortRowsByChunkIndex(array $rows): array
|
||||
{
|
||||
uasort($rows, static function (array $a, array $b): int {
|
||||
$aIndex = is_int($a['chunk_index'] ?? null) ? (int)$a['chunk_index'] : PHP_INT_MAX;
|
||||
@@ -652,8 +765,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? ''));
|
||||
});
|
||||
|
||||
return array_values($rows);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array<int,array<string,mixed>> $rows
|
||||
* @return string[]
|
||||
*/
|
||||
private function firstChunkIdsFromRows(array $rows, int $limit): array
|
||||
{
|
||||
$selected = [];
|
||||
$max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
|
||||
|
||||
foreach ($rows as $row) {
|
||||
$chunkId = $row['chunk_id'] ?? null;
|
||||
@@ -665,7 +786,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$selected[] = $chunkId;
|
||||
|
||||
if (count($selected) >= $max) {
|
||||
if (count($selected) >= $limit) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -673,6 +794,50 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $selected;
|
||||
}
|
||||
|
||||
/**
 * Derives the tokens used to rank chunks of an exactly matched document.
 *
 * The prompt is normalized and tokenized; generic tokens (see
 * isGenericExactSelectionToken()) are dropped, and of the rest only tokens
 * that contain a digit or are at least three characters long are kept.
 *
 * @param string $prompt Retrieval query.
 *
 * @return string[] Unique tokens in order of first occurrence.
 */
private function buildExactDocumentSelectionTokens(string $prompt): array
{
    $candidates = $this->tokenizeText($this->normalizeText($prompt));

    $kept = array_filter(
        $candidates,
        fn (string $token): bool => !$this->isGenericExactSelectionToken($token)
            && (mb_strlen($token, 'UTF-8') >= 3 || preg_match('/\d/u', $token) === 1)
    );

    return array_values(array_unique($kept));
}
|
||||
|
||||
/**
 * Tells whether a token names a technical detail (indicator, reagent,
 * threshold, range, hardness, resolution, interface, relay, error code, value).
 *
 * Umlaut words are listed both with umlauts and in their "ae/oe" spelling.
 *
 * @param string $token Single token, compared strictly against the list.
 *
 * @return bool True when the token is one of the listed detail words.
 */
private function isExactDetailToken(string $token): bool
{
    return match ($token) {
        'indikator', 'indikatortyp', 'reagenz', 'reagens', 'grenzwert',
        'messbereich', 'bereich', 'wasserhaerte', 'wasserhärte',
        'resthaerte', 'resthärte', 'haerte', 'härte', 'aufloesung',
        'auflösung', 'schnittstelle', 'relais', 'fehlercode', 'code',
        'wert', 'werte' => true,
        default => false,
    };
}
|
||||
|
||||
/**
 * Tells whether a token is too generic to rank document chunks by.
 *
 * The list holds German question words, articles, auxiliaries,
 * prepositions and conjunctions, plus words such as "vorherige",
 * "nutzerfrage", "aktuelle" and "folgefrage".
 *
 * @param string $token Single token, compared strictly against the list.
 *
 * @return bool True when the token should be ignored for chunk selection.
 */
private function isGenericExactSelectionToken(string $token): bool
{
    return match ($token) {
        'vorherige', 'vorheriger', 'nutzerfrage', 'aktuelle',
        'folgefrage', 'frage', 'antwort', 'welche', 'welcher', 'welches',
        'welchem', 'welchen', 'wird', 'werden', 'wurde', 'kann', 'koennen',
        'können', 'mit', 'der', 'die', 'das', 'den', 'dem', 'ein', 'eine',
        'einer', 'eines', 'ist', 'sind', 'was', 'wie', 'wo', 'zum', 'zur',
        'fuer', 'für', 'durch', 'von', 'vom', 'und', 'oder', 'auch' => true,
        default => false,
    };
}
|
||||
|
||||
/**
|
||||
* Builds synthetic scores for exact-title fast-path selections.
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user