Fix retrieval of final technical questions

This commit is contained in:
team 1
2026-04-24 11:32:54 +02:00
parent b800c1fc8f
commit 8a7cb89c5d
3 changed files with 190 additions and 7 deletions

View File

@@ -306,6 +306,9 @@ final class PromptBuilderConfig
'- If the source states signal logic such as green/red, output that signal logic only and do not expand it into operational recommendations or alarm semantics unless explicitly stated.',
'- If the source lists application areas, repeat only those areas and do not broaden them.',
'- If the source names an indicator and threshold, reproduce that exactly without extrapolation.',
'- For lowest, highest, smallest, largest, minimum, maximum, Grenzwert, Messbereich or Aufloesung questions, first identify the exact numeric extreme from the retrieved knowledge and answer that value directly.',
'- For numeric extreme questions, do not combine a value, device name, indicator name, range or product variant from different chunks unless the same retrieved entry explicitly connects them.',
'- If several devices or indicators are present, keep each device-indicator-range assignment separate and do not transfer an indicator from one product to another.',
'- If the source states only a threshold function, do not expand it into broader control logic.',
'- If a detail is not explicitly stated in the provided sources, say so plainly.',
'- Prefer short, source-close sentences over explanatory expansion.',

View File

@@ -40,6 +40,14 @@ final class CommerceIntentLite
);
}
if ($this->isTechnicalFactualKnowledgeQuery($prompt) && !$this->hasExplicitCommerceIntent($prompt)) {
return $this->buildDetectionResult(
intent: self::NONE,
score: 0,
signals: ['technical_factual_knowledge_query']
);
}
$score = 0;
$signals = [];
@@ -87,6 +95,49 @@ final class CommerceIntentLite
return $this->matchesAnyPattern($prompt, $this->config->getExplicitCommerceIntentPatterns());
}
/**
 * Detects factual technical knowledge questions that must stay in RAG retrieval.
 *
 * Product names such as Testomat can look like commerce queries, but questions
 * about limits, measuring ranges, thresholds, resolution or monitoring values
 * must not trigger shop search. Shop search may still run when the user uses
 * explicit commerce wording such as price, buy, order, shop, article or SKU.
 *
 * A prompt qualifies only when BOTH conditions hold:
 *  1. it contains a question/superlative marker (e.g. "was ist", "welche",
 *     "wie hoch", "niedrigste", "höchste"), and
 *  2. it contains at least one technical topic term (e.g. "Grenzwert",
 *     "Messbereich", "Resthärte", "Auflösung", "Testomat").
 *
 * All patterns are case-insensitive, so the result does not depend on the
 * caller having lower-cased the prompt beforehand (German nouns and sentence
 * starts are capitalised in natural input).
 *
 * @param string $prompt User prompt to classify.
 *
 * @return bool True when the prompt looks like a factual technical question.
 */
private function isTechnicalFactualKnowledgeQuery(string $prompt): bool
{
    $questionMarkerPatterns = [
        '/\bwas\s+ist\b/iu',
        // Covers welch, welche, welchem, welchen, welcher and welches. The
        // previous form "welche?r?s?" missed the accusative/dative forms
        // "welchen" / "welchem" ("Welchen Grenzwert ...").
        '/\bwelch(?:e[mnrs]?)?\b/iu',
        '/\bwie\s+(hoch|niedrig|klein|gross|groß)\b/iu',
        '/\bniedrigste[rsn]?\b/iu',
        '/\bkleinste[rsn]?\b/iu',
        '/\bhöchste[rsn]?\b/iu',
        '/\bhoechste[rsn]?\b/iu',
    ];

    // Without a question or superlative marker the prompt is not treated as a
    // factual question, regardless of which technical terms it mentions.
    if (!$this->matchesAnyPattern($prompt, $questionMarkerPatterns)) {
        return false;
    }

    // Umlaut terms are listed twice: native spelling and ASCII transliteration.
    $technicalTopicPatterns = [
        '/\bgrenzwert(?:e|en|es)?\b/iu',
        '/\bmessbereich(?:e|en|s)?\b/iu',
        '/\bwasserhärte\b/iu',
        '/\bwasserhaerte\b/iu',
        '/\bresthärte\b/iu',
        '/\bresthaerte\b/iu',
        '/\bgesamthärte\b/iu',
        '/\bgesamthaerte\b/iu',
        '/\bauflösung\b/iu',
        '/\baufloesung\b/iu',
        '/\bindikator(?:en|s)?\b/iu',
        '/\btestomat(?:en|s)?\b/iu',
        '/\büberwach(?:t|en|ung)\b/iu',
        '/\bueberwach(?:t|en|ung)\b/iu',
        '/\bmess(?:en|ung|bar|wert)\b/iu',
    ];

    return $this->matchesAnyPattern($prompt, $technicalTopicPatterns);
}
/**
* @param string[] $patterns
*/

View File

@@ -10,8 +10,7 @@ final readonly class NdjsonChunkLookup
{
public function __construct(
private ChunkManager $chunkManager
)
{
) {
}
/**
@@ -64,12 +63,12 @@ final readonly class NdjsonChunkLookup
}
/**
* Resolves the best exact document title match from the user prompt.
* Resolves the best document-title match from the user prompt.
*
* Matching rules:
* - the normalized prompt must contain the full normalized document title
* - titles containing digits are preferred, e.g. "Testomat 808"
* - longer exact titles win over shorter generic titles
* - exact contiguous title matches still win
* - if no exact match exists, all significant title tokens may match in any order
* - token fallback is intentionally conservative and handles light German suffixes
*
* @return array{
* document_id:string,
@@ -135,6 +134,27 @@ final readonly class NdjsonChunkLookup
}
}
if ($best === null) {
foreach ($documents as $document) {
$normalizedTitle = $document['normalized_title'];
if (!$this->isConfidentTitleTokenMatch($normalizedPrompt, $normalizedTitle)) {
continue;
}
$score = 500 + mb_strlen($normalizedTitle, 'UTF-8');
if (preg_match('/\d/u', $normalizedTitle) === 1) {
$score += 1000;
}
if ($best === null || $score > $bestScore) {
$best = $document;
$bestScore = $score;
}
}
}
if ($best === null) {
return null;
}
@@ -192,6 +212,115 @@ final readonly class NdjsonChunkLookup
return count($significantTokens) >= 2 || preg_match('/\d/u', $normalizedTitle) === 1;
}
/**
 * Token-based fallback for prompts that mention every term of a document
 * title without quoting the title as one contiguous phrase.
 *
 * A title is only eligible when it contains a digit or yields at least three
 * significant tokens. Every significant title token (or one of its suffix
 * variants) must then appear among the prompt's token variants.
 *
 * @param string $normalizedPrompt Already normalized user prompt.
 * @param string $normalizedTitle  Already normalized document title.
 *
 * @return bool True when all significant title tokens are found in the prompt.
 */
private function isConfidentTitleTokenMatch(string $normalizedPrompt, string $normalizedTitle): bool
{
    if ($normalizedPrompt === '' || $normalizedTitle === '') {
        return false;
    }

    $requiredTokens = $this->significantTitleTokens($normalizedTitle);

    // Short, digit-free titles are too generic for an order-independent match.
    if (count($requiredTokens) < 3) {
        $titleHasDigit = preg_match('/\d/u', $normalizedTitle) === 1;
        if (!$titleHasDigit) {
            return false;
        }
    }

    $promptVariants = $this->tokenVariantLookup($normalizedPrompt);

    foreach ($requiredTokens as $requiredToken) {
        foreach ($this->tokenVariants($requiredToken) as $candidate) {
            if (isset($promptVariants[$candidate])) {
                // This title token is covered; move on to the next one.
                continue 2;
            }
        }

        // No variant of this title token occurs in the prompt.
        return false;
    }

    return true;
}
/**
 * Splits a normalized title on whitespace and keeps only the tokens that are
 * at least three characters long or contain a digit. Duplicates are removed
 * and the result is re-indexed.
 *
 * @return string[]
 */
private function significantTitleTokens(string $normalizedTitle): array
{
    $parts = preg_split('/\s+/u', $normalizedTitle, -1, PREG_SPLIT_NO_EMPTY);
    if ($parts === false || $parts === []) {
        return [];
    }

    $significant = [];
    foreach ($parts as $part) {
        $part = trim($part);
        if ($part === '') {
            continue;
        }

        $isLongEnough = mb_strlen($part, 'UTF-8') >= 3;
        // Short tokens survive only when they carry a digit.
        if ($isLongEnough || preg_match('/\d/u', $part) === 1) {
            $significant[] = $part;
        }
    }

    return array_values(array_unique($significant));
}
/**
 * Builds a set-like lookup of every whitespace-separated token of the given
 * text together with its suffix variants, keyed by variant.
 *
 * @return array<string,bool>
 */
private function tokenVariantLookup(string $normalizedText): array
{
    $words = preg_split('/\s+/u', $normalizedText, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        $words = [];
    }

    $lookup = [];
    foreach ($words as $word) {
        // Array union keeps keys already present; every value is true anyway.
        $lookup += array_fill_keys($this->tokenVariants($word), true);
    }

    return $lookup;
}
/**
 * Returns the trimmed token plus stems obtained by stripping one of a fixed
 * list of suffixes. Stripping is only attempted for tokens of at least five
 * characters, and a stem is kept only when at least three characters remain.
 *
 * @return string[]
 */
private function tokenVariants(string $token): array
{
    $token = trim($token);
    if ($token === '') {
        return [];
    }

    $tokenLength = mb_strlen($token, 'UTF-8');
    if ($tokenLength < 5) {
        // Too short to strip anything: the token is its only variant.
        return [$token];
    }

    $suffixes = ['innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'];
    $variants = [$token];

    foreach ($suffixes as $suffix) {
        if (str_ends_with($token, $suffix)) {
            $stemLength = $tokenLength - mb_strlen($suffix, 'UTF-8');
            $stem = mb_substr($token, 0, $stemLength, 'UTF-8');
            if (mb_strlen($stem, 'UTF-8') >= 3) {
                $variants[] = $stem;
            }
        }
    }

    return array_values(array_unique($variants));
}
private function normalizeText(string $value): string
{
$value = mb_strtolower(trim($value), 'UTF-8');