fix retrieve final technical questions
This commit is contained in:
@@ -306,6 +306,9 @@ final class PromptBuilderConfig
|
||||
'- If the source states signal logic such as green/red, output that signal logic only and do not expand it into operational recommendations or alarm semantics unless explicitly stated.',
|
||||
'- If the source lists application areas, repeat only those areas and do not broaden them.',
|
||||
'- If the source names an indicator and threshold, reproduce that exactly without extrapolation.',
|
||||
'- For lowest, highest, smallest, largest, minimum, maximum, Grenzwert, Messbereich or Aufloesung questions, first identify the exact numeric extreme from the retrieved knowledge and answer that value directly.',
|
||||
'- For numeric extreme questions, do not combine a value, device name, indicator name, range or product variant from different chunks unless the same retrieved entry explicitly connects them.',
|
||||
'- If several devices or indicators are present, keep each device-indicator-range assignment separate and do not transfer an indicator from one product to another.',
|
||||
'- If the source states only a threshold function, do not expand it into broader control logic.',
|
||||
'- If a detail is not explicitly stated in the provided sources, say so plainly.',
|
||||
'- Prefer short, source-close sentences over explanatory expansion.',
|
||||
|
||||
@@ -40,6 +40,14 @@ final class CommerceIntentLite
|
||||
);
|
||||
}
|
||||
|
||||
if ($this->isTechnicalFactualKnowledgeQuery($prompt) && !$this->hasExplicitCommerceIntent($prompt)) {
|
||||
return $this->buildDetectionResult(
|
||||
intent: self::NONE,
|
||||
score: 0,
|
||||
signals: ['technical_factual_knowledge_query']
|
||||
);
|
||||
}
|
||||
|
||||
$score = 0;
|
||||
$signals = [];
|
||||
|
||||
@@ -87,6 +95,49 @@ final class CommerceIntentLite
|
||||
return $this->matchesAnyPattern($prompt, $this->config->getExplicitCommerceIntentPatterns());
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects factual technical knowledge questions that must stay in RAG retrieval.
|
||||
*
|
||||
* Product names such as Testomat can look like commerce queries, but questions
|
||||
* about limits, measuring ranges, thresholds, resolution or monitoring values
|
||||
* must not trigger shop search. Shop search may still run when the user uses
|
||||
* explicit commerce wording such as price, buy, order, shop, article or SKU.
|
||||
*/
|
||||
private function isTechnicalFactualKnowledgeQuery(string $prompt): bool
{
    // Purpose: returns true only when the prompt contains BOTH a question
    // marker (interrogative or superlative wording) AND a technical
    // measurement term. Either one alone yields false.
    //
    // NOTE(review): none of the patterns carries the `i` modifier, so they
    // only match lowercase text — presumably $prompt is lowercased by the
    // caller; confirm before relying on this with raw user input.

    // Stage 1: interrogative / superlative wording.
    $questionMarkerPatterns = [
        '/\bwas\s+ist\b/u',
        '/\bwelche?r?s?\b/u',
        // Inflected forms not covered by the pattern above
        // ("welchem Messbereich", "welchen Grenzwert").
        '/\bwelche[mn]\b/u',
        '/\bwie\s+(hoch|niedrig|klein|gross|groß)\b/u',
        '/\bniedrigste[rsn]?\b/u',
        '/\bkleinste[rsn]?\b/u',
        '/\bhöchste[rsn]?\b/u',
        '/\bhoechste[rsn]?\b/u',
    ];

    if (!$this->matchesAnyPattern($prompt, $questionMarkerPatterns)) {
        return false;
    }

    // Stage 2: technical measurement vocabulary. Umlaut words are listed
    // twice, once with the umlaut and once in ae/oe/ue transliteration.
    $technicalTermPatterns = [
        '/\bgrenzwert(?:e|en|es)?\b/u',
        '/\bmessbereich(?:e|en|s)?\b/u',
        '/\bwasserhärte\b/u',
        '/\bwasserhaerte\b/u',
        '/\bresthärte\b/u',
        '/\bresthaerte\b/u',
        '/\bgesamthärte\b/u',
        '/\bgesamthaerte\b/u',
        '/\bauflösung\b/u',
        '/\baufloesung\b/u',
        '/\bindikator(?:en|s)?\b/u',
        '/\btestomat(?:en|s)?\b/u',
        '/\büberwach(?:t|en|ung)\b/u',
        '/\bueberwach(?:t|en|ung)\b/u',
        '/\bmess(?:en|ung|bar|wert)\b/u',
    ];

    return $this->matchesAnyPattern($prompt, $technicalTermPatterns);
}
|
||||
|
||||
/**
|
||||
* @param string[] $patterns
|
||||
*/
|
||||
|
||||
@@ -10,8 +10,7 @@ final readonly class NdjsonChunkLookup
|
||||
{
|
||||
public function __construct(
|
||||
private ChunkManager $chunkManager
|
||||
)
|
||||
{
|
||||
) {
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -64,12 +63,12 @@ final readonly class NdjsonChunkLookup
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolves the best exact document title match from the user prompt.
|
||||
* Resolves the best document-title match from the user prompt.
|
||||
*
|
||||
* Matching rules:
|
||||
* - the normalized prompt must contain the full normalized document title
|
||||
* - titles containing digits are preferred, e.g. "Testomat 808"
|
||||
* - longer exact titles win over shorter generic titles
|
||||
* - exact contiguous title matches still win
|
||||
* - if no exact match exists, all significant title tokens may match in any order
|
||||
* - token fallback is intentionally conservative and handles light German suffixes
|
||||
*
|
||||
* @return array{
|
||||
* document_id:string,
|
||||
@@ -135,6 +134,27 @@ final readonly class NdjsonChunkLookup
|
||||
}
|
||||
}
|
||||
|
||||
if ($best === null) {
|
||||
foreach ($documents as $document) {
|
||||
$normalizedTitle = $document['normalized_title'];
|
||||
|
||||
if (!$this->isConfidentTitleTokenMatch($normalizedPrompt, $normalizedTitle)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$score = 500 + mb_strlen($normalizedTitle, 'UTF-8');
|
||||
|
||||
if (preg_match('/\d/u', $normalizedTitle) === 1) {
|
||||
$score += 1000;
|
||||
}
|
||||
|
||||
if ($best === null || $score > $bestScore) {
|
||||
$best = $document;
|
||||
$bestScore = $score;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($best === null) {
|
||||
return null;
|
||||
}
|
||||
@@ -192,6 +212,115 @@ final readonly class NdjsonChunkLookup
|
||||
return count($significantTokens) >= 2 || preg_match('/\d/u', $normalizedTitle) === 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fallback for factual prompts that mention a document title by its terms,
|
||||
* but not as one contiguous phrase.
|
||||
*/
|
||||
private function isConfidentTitleTokenMatch(string $normalizedPrompt, string $normalizedTitle): bool
{
    // Nothing to compare when either side is empty.
    if ($normalizedPrompt === '' || $normalizedTitle === '') {
        return false;
    }

    $requiredTokens = $this->significantTitleTokens($normalizedTitle);

    // A title qualifies for the token fallback only if it has at least
    // three significant tokens or contains a digit somewhere.
    $titleIsSpecific = count($requiredTokens) >= 3
        || preg_match('/\d/u', $normalizedTitle) === 1;

    if (!$titleIsSpecific) {
        return false;
    }

    $promptForms = $this->tokenVariantLookup($normalizedPrompt);

    // Every significant title token must appear in the prompt in at least
    // one of its variants; order of appearance does not matter.
    foreach ($requiredTokens as $requiredToken) {
        foreach ($this->tokenVariants($requiredToken) as $form) {
            if (isset($promptForms[$form])) {
                // This token is covered; move on to the next title token.
                continue 2;
            }
        }

        // No variant of this title token occurs in the prompt.
        return false;
    }

    return true;
}
|
||||
|
||||
/**
|
||||
* @return string[]
|
||||
*/
|
||||
private function significantTitleTokens(string $normalizedTitle): array
{
    // Split on whitespace runs; a failed or empty split yields no tokens.
    $words = preg_split('/\s+/u', $normalizedTitle, -1, PREG_SPLIT_NO_EMPTY) ?: [];

    // Keep a token when it is non-empty after trimming and either has at
    // least three characters or contains a digit (e.g. a model number).
    $significant = array_filter(
        array_map(trim(...), $words),
        static fn (string $word): bool => $word !== ''
            && (mb_strlen($word, 'UTF-8') >= 3 || preg_match('/\d/u', $word) === 1),
    );

    // De-duplicate, keeping first occurrences in order, and reindex as a list.
    return array_values(array_unique($significant));
}
|
||||
|
||||
/**
|
||||
* @return array<string,bool>
|
||||
*/
|
||||
private function tokenVariantLookup(string $normalizedText): array
{
    // Builds a set-like map (variant => true) over every whitespace-separated
    // word of the text, so callers can test membership with isset().
    $words = preg_split('/\s+/u', $normalizedText, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    $variantSet = [];

    foreach ($words as $word) {
        // Array union only adds keys that are not present yet; every value
        // is true, so the result equals assigning each variant one by one.
        $variantSet += array_fill_keys($this->tokenVariants($word), true);
    }

    return $variantSet;
}
|
||||
|
||||
/**
|
||||
* @return string[]
|
||||
*/
|
||||
private function tokenVariants(string $token): array
{
    // Returns the trimmed token followed by every stem obtained by removing
    // one matching suffix from the list below. Blank input yields no variants.
    $word = trim($token);

    if ($word === '') {
        return [];
    }

    $wordLength = mb_strlen($word, 'UTF-8');

    // Words shorter than five characters are never stemmed.
    if ($wordLength < 5) {
        return [$word];
    }

    $forms = [$word];

    foreach (['innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'] as $ending) {
        $stemLength = $wordLength - mb_strlen($ending, 'UTF-8');

        // A stem must keep at least three characters, and the word must
        // actually end with this suffix.
        if ($stemLength < 3 || !str_ends_with($word, $ending)) {
            continue;
        }

        $forms[] = mb_substr($word, 0, $stemLength, 'UTF-8');
    }

    // Several suffixes can match the same word; drop duplicate stems and
    // return a plain list.
    return array_values(array_unique($forms));
}
|
||||
|
||||
private function normalizeText(string $value): string
|
||||
{
|
||||
$value = mb_strtolower(trim($value), 'UTF-8');
|
||||
|
||||
Reference in New Issue
Block a user