diff --git a/src/Config/PromptBuilderConfig.php b/src/Config/PromptBuilderConfig.php index 72b93bd..baca8cb 100644 --- a/src/Config/PromptBuilderConfig.php +++ b/src/Config/PromptBuilderConfig.php @@ -306,6 +306,9 @@ final class PromptBuilderConfig '- If the source states signal logic such as green/red, output that signal logic only and do not expand it into operational recommendations or alarm semantics unless explicitly stated.', '- If the source lists application areas, repeat only those areas and do not broaden them.', '- If the source names an indicator and threshold, reproduce that exactly without extrapolation.', + '- For lowest, highest, smallest, largest, minimum, maximum, Grenzwert, Messbereich or Aufloesung questions, first identify the exact numeric extreme from the retrieved knowledge and answer that value directly.', + '- For numeric extreme questions, do not combine a value, device name, indicator name, range or product variant from different chunks unless the same retrieved entry explicitly connects them.', + '- If several devices or indicators are present, keep each device-indicator-range assignment separate and do not transfer an indicator from one product to another.', '- If the source states only a threshold function, do not expand it into broader control logic.', '- If a detail is not explicitly stated in the provided sources, say so plainly.', '- Prefer short, source-close sentences over explanatory expansion.', diff --git a/src/Intent/CommerceIntentLite.php b/src/Intent/CommerceIntentLite.php index ce232c2..87820bc 100644 --- a/src/Intent/CommerceIntentLite.php +++ b/src/Intent/CommerceIntentLite.php @@ -40,6 +40,14 @@ final class CommerceIntentLite ); } + if ($this->isTechnicalFactualKnowledgeQuery($prompt) && !$this->hasExplicitCommerceIntent($prompt)) { + return $this->buildDetectionResult( + intent: self::NONE, + score: 0, + signals: ['technical_factual_knowledge_query'] + ); + } + $score = 0; $signals = []; @@ -87,6 +95,49 @@ final class 
/**
 * Detects factual technical knowledge questions that must stay in RAG retrieval.
 *
 * Product names such as Testomat can look like commerce queries, but questions
 * about limits, measuring ranges, thresholds, resolution or monitoring values
 * must not trigger shop search. The caller is expected to combine this check
 * with the explicit-commerce check so that wording such as price, buy, order,
 * shop, article or SKU can still reach shop search.
 *
 * A prompt qualifies only when BOTH conditions hold:
 *  1. it contains a question / superlative marker ("was ist", "welche…",
 *     "wie hoch", "niedrigste", "größte", …), and
 *  2. it contains at least one technical term (Grenzwert, Messbereich,
 *     …härte, Auflösung, Indikator, Testomat, überwach…, mess…).
 *
 * @param string $prompt User prompt. NOTE(review): whether the caller lower-cases
 *                       the prompt is not visible here, so every pattern is
 *                       case-insensitive to work in either case.
 *
 * @return bool True when the prompt looks like a factual technical question.
 */
private function isTechnicalFactualKnowledgeQuery(string $prompt): bool
{
    // Question / superlative markers. Umlauts are accepted in native and
    // transliterated spelling (ö|oe, ß|ss).
    $questionMarkerPatterns = [
        '/\bwas\s+ist\b/iu',
        // welch, welche, welchem, welchen, welcher, welches
        '/\bwelch(?:e[mnrs]?)?\b/iu',
        '/\bwie\s+(?:hoch|niedrig|klein|gross|groß)\b/iu',
        '/\bniedrigste[mnrs]?\b/iu',
        '/\bkleinste[mnrs]?\b/iu',
        '/\bh(?:ö|oe)chste[mnrs]?\b/iu',
        // "largest" counterpart to kleinste/niedrigste/höchste
        '/\bgr(?:ö|oe)(?:ß|ss)te[mnrs]?\b/iu',
    ];

    if (!$this->matchesAnyPattern($prompt, $questionMarkerPatterns)) {
        return false;
    }

    // Technical vocabulary, including the common inflected forms.
    $technicalTermPatterns = [
        '/\bgrenzwert(?:e|en|es|s)?\b/iu',
        '/\bmessbereich(?:e|en|es|s)?\b/iu',
        '/\b(?:wasser|rest|gesamt)h(?:ä|ae)rte\b/iu',
        '/\baufl(?:ö|oe)sung\b/iu',
        '/\bindikator(?:en|s)?\b/iu',
        '/\btestomat(?:en|s)?\b/iu',
        '/\b(?:ü|ue)berwach(?:t|en|ung)\b/iu',
        '/\bmess(?:en|ung|ungen|bar|wert|werte|werten)\b/iu',
    ];

    return $this->matchesAnyPattern($prompt, $technicalTermPatterns);
}
+ * Resolves the best document-title match from the user prompt. * * Matching rules: - * - the normalized prompt must contain the full normalized document title - * - titles containing digits are preferred, e.g. "Testomat 808" - * - longer exact titles win over shorter generic titles + * - exact contiguous title matches still win + * - if no exact match exists, all significant title tokens may match in any order + * - token fallback is intentionally conservative and handles light German suffixes * * @return array{ * document_id:string, @@ -135,6 +134,27 @@ final readonly class NdjsonChunkLookup } } + if ($best === null) { + foreach ($documents as $document) { + $normalizedTitle = $document['normalized_title']; + + if (!$this->isConfidentTitleTokenMatch($normalizedPrompt, $normalizedTitle)) { + continue; + } + + $score = 500 + mb_strlen($normalizedTitle, 'UTF-8'); + + if (preg_match('/\d/u', $normalizedTitle) === 1) { + $score += 1000; + } + + if ($best === null || $score > $bestScore) { + $best = $document; + $bestScore = $score; + } + } + } + if ($best === null) { return null; } @@ -192,6 +212,115 @@ final readonly class NdjsonChunkLookup return count($significantTokens) >= 2 || preg_match('/\d/u', $normalizedTitle) === 1; } + /** + * Fallback for factual prompts that mention a document title by its terms, + * but not as one contiguous phrase. 
/**
 * Token-based fallback for prompts that mention a document title by its
 * terms rather than as one contiguous phrase.
 *
 * The title qualifies only when it contains a digit or has at least three
 * significant tokens. Every significant title token must then share at least
 * one variant (the token itself or a suffix-stripped stem) with some prompt
 * token; token order is irrelevant.
 *
 * @param string $normalizedPrompt Prompt text, already normalized by the caller.
 * @param string $normalizedTitle  Document title, already normalized by the caller.
 *
 * @return bool True when all significant title tokens are found in the prompt.
 */
private function isConfidentTitleTokenMatch(string $normalizedPrompt, string $normalizedTitle): bool
{
    if ($normalizedTitle === '' || $normalizedPrompt === '') {
        return false;
    }

    $requiredTokens = $this->significantTitleTokens($normalizedTitle);
    $titleHasDigit = preg_match('/\d/u', $normalizedTitle) === 1;

    // Short, digit-free titles are too generic for an order-free match.
    if (!$titleHasDigit && count($requiredTokens) < 3) {
        return false;
    }

    $promptVariants = $this->tokenVariantLookup($normalizedPrompt);

    foreach ($requiredTokens as $requiredToken) {
        foreach ($this->tokenVariants($requiredToken) as $candidate) {
            if (isset($promptVariants[$candidate])) {
                // This title token is covered; move on to the next one.
                continue 2;
            }
        }

        // No variant of this title token occurs in the prompt.
        return false;
    }

    return true;
}

/**
 * Splits a normalized title on whitespace and keeps the tokens that are
 * at least three characters long or contain a digit, without duplicates
 * and in order of first appearance.
 *
 * @return string[]
 */
private function significantTitleTokens(string $normalizedTitle): array
{
    $words = preg_split('/\s+/u', $normalizedTitle, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    $significant = [];

    foreach ($words as $word) {
        $word = trim($word);

        if ($word === '') {
            continue;
        }

        $isLongEnough = mb_strlen($word, 'UTF-8') >= 3;

        if ($isLongEnough || preg_match('/\d/u', $word) === 1) {
            $significant[] = $word;
        }
    }

    return array_values(array_unique($significant));
}

/**
 * Builds a membership map of every variant of every whitespace-separated
 * token in the given text, for isset() lookups.
 *
 * Purely numeric tokens end up as integer keys because PHP casts numeric
 * string keys; isset() with the string form still finds them.
 *
 * @return array<array-key, true>
 */
private function tokenVariantLookup(string $normalizedText): array
{
    $words = preg_split('/\s+/u', $normalizedText, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    $membership = [];

    foreach ($words as $word) {
        // Union keeps keys that are already present and appends the new ones.
        $membership += array_fill_keys($this->tokenVariants($word), true);
    }

    return $membership;
}

/**
 * Returns the token itself plus suffix-stripped stems.
 *
 * Stems are produced only for tokens of five or more characters, one per
 * matching suffix from a fixed list, and kept only when at least three
 * characters remain.
 *
 * @return string[]
 */
private function tokenVariants(string $token): array
{
    $token = trim($token);

    if ($token === '') {
        return [];
    }

    $forms = [$token];
    $tokenLength = mb_strlen($token, 'UTF-8');

    if ($tokenLength < 5) {
        return $forms;
    }

    foreach (['innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'] as $ending) {
        if (str_ends_with($token, $ending)) {
            $stem = mb_substr($token, 0, $tokenLength - mb_strlen($ending, 'UTF-8'), 'UTF-8');

            if (mb_strlen($stem, 'UTF-8') >= 3) {
                $forms[] = $stem;
            }
        }
    }

    return array_values(array_unique($forms));
}