From 387506b23937b367e2f2c3da2a172e5f01ec579d Mon Sep 17 00:00:00 2001 From: team 1 Date: Mon, 4 May 2026 16:33:36 +0200 Subject: [PATCH] p35 --- config/retriex/agent.yaml | 15 ++++++ config/retriex/language.yaml | 9 ++++ config/retriex/retrieval.yaml | 11 ++++ src/Agent/AgentRunner.php | 39 ++++++-------- src/Config/AgentRunnerConfig.php | 53 +++++++++++++++++++ src/Config/LanguageCleanupConfig.php | 50 +++++++++++++++++ src/Config/NdjsonHybridRetrieverConfig.php | 7 +++ src/Config/RetriexEffectiveConfigProvider.php | 12 +++++ src/Intent/IntentLite.php | 21 +++----- src/Intent/SalesIntentLite.php | 13 ++--- src/Knowledge/Retrieval/NdjsonChunkLookup.php | 6 ++- .../Retrieval/NdjsonHybridRetriever.php | 2 +- src/Service/FormatText.php | 17 +++--- 13 files changed, 198 insertions(+), 57 deletions(-) diff --git a/config/retriex/agent.yaml b/config/retriex/agent.yaml index f118929..cdf99f1 100644 --- a/config/retriex/agent.yaml +++ b/config/retriex/agent.yaml @@ -16,6 +16,12 @@ parameters: max_length_ratio_percent: 150 heartbeat_message: 'Ich optimiere die Anfrage…' output_prefix_pattern: '/^(?:normalisiert|korrigiert|corrected|normalized)\s*:\s*/iu' + placeholder_outputs: + - normalized user input + - corrected user input + - user input + - normalisierte nutzereingabe + - korrigierte nutzereingabe skip_patterns: - '/https?:\/\//iu' - '/\bwww\./iu' @@ -192,6 +198,15 @@ parameters: testomat_model_pattern: '/\bTestomat(?:®)?\s+(?:\d{3,4}(?:\s+[A-Z]{2,8})?|EVO(?:\s+[A-Z]{2,6})?|ECO(?:[-\s]?(?:PLUS|C))?|DUO(?:\s+\d{3,4})?|LAB(?:\s+[A-Z]{2,6})?)\b/iu' hardness_value_pattern: '/\b\d+(?:[,.]\d+)?\s*°\s*dH\b/iu' + followup_actions: + commerce: + Im Shop suchen: 'Suche die aktuelle Produktauswahl im Shop.' + Nur Zubehör anzeigen: 'Zeige aus der aktuellen Produktauswahl nur Zubehör.' + Nur Geräte anzeigen: 'Zeige aus der aktuellen Produktauswahl nur Geräte.' + Preis anzeigen: 'Zeige mir die Preise der aktuell relevanten Produkte.' + knowledge: + Technische Details anzeigen: 'Zeige technische Details zur aktuellen Antwort.' + messages: empty_prompt: '❌ Empty prompt.' analyze_request: 'Ich analysiere deine Anfrage...' diff --git a/config/retriex/language.yaml b/config/retriex/language.yaml index ebf50ad..274d983 100644 --- a/config/retriex/language.yaml +++ b/config/retriex/language.yaml @@ -69,6 +69,15 @@ parameters: - tc - '0,02' + normalization: + # Generic language normalization tables. Keep these in YAML so PHP code + # executes normalization logic without owning language-specific lists. + ascii_transliteration: + ä: ae + ö: oe + ü: ue + ß: ss + stopword_groups: de_core: - der diff --git a/config/retriex/retrieval.yaml b/config/retriex/retrieval.yaml index aec8a6f..fae9339 100644 --- a/config/retriex/retrieval.yaml +++ b/config/retriex/retrieval.yaml @@ -46,6 +46,17 @@ parameters: - messbereich testomat: - testomat + exact_selection_token_variant_suffixes: + - typen + - innen + - enen + - ern + - en + - er + - es + - e + - s + - n exact_selection_indicator_question_tokens: - indikator - indikatortyp diff --git a/src/Agent/AgentRunner.php b/src/Agent/AgentRunner.php index 2ade359..93f5640 100644 --- a/src/Agent/AgentRunner.php +++ b/src/Agent/AgentRunner.php @@ -985,12 +985,7 @@ final readonly class AgentRunner private function normalizeFuzzyRoutingToken(string $token): string { $token = mb_strtolower(trim($token), 'UTF-8'); - $token = strtr($token, [ - 'ä' => 'ae', - 'ö' => 'oe', - 'ü' => 'ue', - 'ß' => 'ss', - ]); + $token = $this->languageCleanupConfig->transliterateToAscii($token); $token = preg_replace('/[^a-z0-9]+/u', '', $token) ?? $token; return trim($token); @@ -1028,13 +1023,13 @@ final readonly class AgentRunner { $normalized = $this->normalizeRoutingComparisonText($candidate); - return in_array($normalized, [ - 'normalized user input', - 'corrected user input', - 'user input', - 'normalisierte nutzereingabe', - 'korrigierte nutzereingabe', - ], true); + foreach ($this->agentRunnerConfig->getInputNormalizationPlaceholderOutputs() as $placeholderOutput) { + if ($normalized === $this->normalizeRoutingComparisonText($placeholderOutput)) { + return true; + } + } + + return false; } private function normalizeRoutingComparisonText(string $value): string @@ -2857,12 +2852,7 @@ final readonly class AgentRunner $value = html_entity_decode(strip_tags($value), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); $value = mb_strtolower($value, 'UTF-8'); $value = str_replace(['‐', '‑', '‒', '–', '—'], '-', $value); - $value = strtr($value, [ - 'ä' => 'ae', - 'ö' => 'oe', - 'ü' => 'ue', - 'ß' => 'ss', - ]); + $value = $this->languageCleanupConfig->transliterateToAscii($value); $value = preg_replace('/\s+/u', ' ', $value) ?? $value; return trim($value); @@ -3241,14 +3231,15 @@ final readonly class AgentRunner $actions = []; if ($isCommerceIntent || $hasShopResults) { - $actions[] = ['Im Shop suchen', 'Suche die aktuelle Produktauswahl im Shop.']; - $actions[] = ['Nur Zubehör anzeigen', 'Zeige aus der aktuellen Produktauswahl nur Zubehör.']; - $actions[] = ['Nur Geräte anzeigen', 'Zeige aus der aktuellen Produktauswahl nur Geräte.']; - $actions[] = ['Preis anzeigen', 'Zeige mir die Preise der aktuell relevanten Produkte.']; + foreach ($this->agentRunnerConfig->getCommerceFollowUpActions() as $label => $actionPrompt) { + $actions[] = [$label, $actionPrompt]; + } } if ($hasKnowledge || $hasShopResults) { - $actions[] = ['Technische Details anzeigen', 'Zeige technische Details zur aktuellen Antwort.']; + foreach ($this->agentRunnerConfig->getKnowledgeFollowUpActions() as $label => $actionPrompt) { + $actions[] = [$label, $actionPrompt]; + } } if ($actions === []) { diff --git a/src/Config/AgentRunnerConfig.php b/src/Config/AgentRunnerConfig.php index 2a1f6b1..b0dcce9 100644 --- a/src/Config/AgentRunnerConfig.php +++ b/src/Config/AgentRunnerConfig.php @@ -260,6 +260,26 @@ final class AgentRunnerConfig return $this->getRequiredStringList('input_normalization.fuzzy_routing.terms'); } + /** + * @return string[] + */ + public function getInputNormalizationPlaceholderOutputs(): array + { + return $this->getRequiredStringList('input_normalization.placeholder_outputs'); + } + + /** @return array */ + public function getCommerceFollowUpActions(): array + { + return $this->getRequiredStringMap('followup_actions.commerce'); + } + + /** @return array */ + public function getKnowledgeFollowUpActions(): array + { + return $this->getRequiredStringMap('followup_actions.knowledge'); + } + private function getRequiredInt(string $key): int { $value = $this->requiredValue($key); @@ -384,6 +404,39 @@ final class AgentRunnerConfig return $out; } + /** + * @return array + */ + private function getRequiredStringMap(string $key): array + { + $value = $this->requiredValue($key); + + if (!is_array($value)) { + throw new \InvalidArgumentException(sprintf('RetrieX agent config key "%s" must be a string map.', $key)); + } + + $out = []; + + foreach ($value as $mapKey => $mapValue) { + if (!is_scalar($mapKey) || !is_scalar($mapValue)) { + continue; + } + + $mapKey = trim((string) $mapKey); + $mapValue = trim((string) $mapValue); + + if ($mapKey !== '' && $mapValue !== '') { + $out[$mapKey] = $mapValue; + } + } + + if ($out === []) { + throw new \InvalidArgumentException(sprintf('RetrieX agent config key "%s" must contain at least one valid entry.', $key)); + } + + return $out; + } + /** * @return array */ diff --git a/src/Config/LanguageCleanupConfig.php b/src/Config/LanguageCleanupConfig.php index f8570d8..a2667c1 100644 --- a/src/Config/LanguageCleanupConfig.php +++ b/src/Config/LanguageCleanupConfig.php @@ -44,6 +44,27 @@ final class LanguageCleanupConfig return in_array($term, $this->getProtectedTerms(), true); } + /** @return array */ + public function getAsciiTransliterationMap(): array + { + $normalization = $this->requiredMap('normalization'); + if (!array_key_exists('ascii_transliteration', $normalization)) { + throw $this->invalid('normalization.ascii_transliteration', 'is missing'); + } + + return $this->stringMapFromValue($normalization['ascii_transliteration'], 'normalization.ascii_transliteration', true); + } + + public function transliterateToAscii(string $value): string + { + $map = $this->getAsciiTransliterationMap(); + if ($map === []) { + return $value; + } + + return strtr($value, $map); + } + /** @return string[] */ public function getCleanupProfileNames(): array { @@ -235,6 +256,35 @@ final class LanguageCleanupConfig return $out; } + /** @return array */ + private function stringMapFromValue(mixed $value, string $path, bool $required): array + { + if (!is_array($value)) { + throw $this->invalid($path, 'must be a map of non-empty strings'); + } + + $out = []; + foreach ($value as $key => $item) { + if (!is_scalar($key) || !is_scalar($item)) { + continue; + } + + $key = trim((string) $key); + $item = trim((string) $item); + if ($key === '' || $item === '') { + continue; + } + + $out[$key] = $item; + } + + if ($required && $out === []) { + throw $this->invalid($path, 'must contain at least one non-empty map entry'); + } + + return $out; + } + /** @param string[] $terms */ private function removeProtectedTerms(array $terms): array { diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index 8d7823d..c2511dc 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -148,6 +148,12 @@ final class NdjsonHybridRetrieverConfig return $this->requiredStringListMap('exact_selection_token_variant_prefixes'); } + /** @return string[] */ + public function exactSelectionTokenVariantSuffixes(): array + { + return $this->requiredStringList('exact_selection_token_variant_suffixes'); + } + /** @return string[] */ public function exactSelectionIndicatorQuestionTokens(): array { @@ -313,6 +319,7 @@ final class NdjsonHybridRetrieverConfig 'focused_product_max_chunks' => $this->focusedProductMaxChunks(), 'catalog_list_shortcut_patterns' => $this->catalogListShortcutPatterns(), 'exact_selection_token_variant_prefixes' => $this->exactSelectionTokenVariantPrefixes(), + 'exact_selection_token_variant_suffixes' => $this->exactSelectionTokenVariantSuffixes(), 'exact_selection_indicator_question_tokens' => $this->exactSelectionIndicatorQuestionTokens(), 'exact_selection_indicator_question_phrases' => $this->exactSelectionIndicatorQuestionPhrases(), 'exact_selection_indicator_table_heading_patterns' => $this->exactSelectionIndicatorTableHeadingPatterns(), diff --git a/src/Config/RetriexEffectiveConfigProvider.php b/src/Config/RetriexEffectiveConfigProvider.php index 1cf3e9a..d7192a6 100644 --- a/src/Config/RetriexEffectiveConfigProvider.php +++ b/src/Config/RetriexEffectiveConfigProvider.php @@ -583,6 +583,7 @@ final readonly class RetriexEffectiveConfigProvider 'max_length_ratio_percent' => $this->agentRunnerConfig->getInputNormalizationMaxLengthRatioPercent(), 'heartbeat_message' => $this->agentRunnerConfig->getInputNormalizationHeartbeatMessage(), 'output_prefix_pattern' => $this->agentRunnerConfig->getInputNormalizationOutputPrefixPattern(), + 'placeholder_outputs' => $this->agentRunnerConfig->getInputNormalizationPlaceholderOutputs(), 'skip_patterns' => $this->agentRunnerConfig->getInputNormalizationSkipPatterns(), 'prompt' => [ 'intro' => $this->agentRunnerConfig->getInputNormalizationIntro(), @@ -602,6 +603,10 @@ final readonly class RetriexEffectiveConfigProvider 'terms' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingTerms(), ], ], + 'followup_actions' => [ + 'commerce' => $this->agentRunnerConfig->getCommerceFollowUpActions(), + 'knowledge' => $this->agentRunnerConfig->getKnowledgeFollowUpActions(), + ], 'messages' => [ 'empty_prompt' => $this->agentRunnerConfig->getEmptyPromptMessage(), 'analyze_request' => $this->agentRunnerConfig->getAnalyzeRequestMessage(), @@ -929,6 +934,9 @@ final readonly class RetriexEffectiveConfigProvider return [ 'stopwords' => $this->stopWordsConfig->getStopWords(), 'protected_terms' => $this->languageCleanupConfig->getProtectedTerms(), + 'normalization' => [ + 'ascii_transliteration' => $this->languageCleanupConfig->getAsciiTransliterationMap(), + ], 'cleanup_profile_names' => $this->languageCleanupConfig->getCleanupProfileNames(), 'cleanup_profiles' => $profiles, ]; @@ -1200,6 +1208,7 @@ final readonly class RetriexEffectiveConfigProvider private function validateAgent(array $agent, array &$errors, array &$warnings): void { $this->validateStringListMap($agent['messages'] ?? [], 'agent.messages', $errors, $warnings); + $this->validateStringListMap($agent['followup_actions'] ?? [], 'agent.followup_actions', $errors, $warnings); $this->validateStringListMap($agent['source_labels'] ?? [], 'agent.source_labels', $errors, $warnings); $this->validateStringListMap($agent['html_templates'] ?? [], 'agent.html_templates', $errors, $warnings); @@ -1217,6 +1226,9 @@ final readonly class RetriexEffectiveConfigProvider $errors[] = 'agent.follow_up_context.commercial_table_follow_up.query_template_without_model must not be empty.'; } + $inputNormalization = is_array($agent['input_normalization'] ?? null) ? $agent['input_normalization'] : []; + $this->validateStringList($this->toList($inputNormalization['placeholder_outputs'] ?? []), 'agent.input_normalization.placeholder_outputs', $errors, $warnings); + $ragEvidence = is_array($agent['rag_evidence_guard'] ?? null) ? $agent['rag_evidence_guard'] : []; $ragEvidenceCleanupProfile = $ragEvidence['cleanup_profile'] ?? null; if (!is_string($ragEvidenceCleanupProfile) || trim($ragEvidenceCleanupProfile) === '') { diff --git a/src/Intent/IntentLite.php b/src/Intent/IntentLite.php index 8bd21da..1e1868f 100644 --- a/src/Intent/IntentLite.php +++ b/src/Intent/IntentLite.php @@ -5,6 +5,7 @@ declare(strict_types=1); namespace App\Intent; use App\Config\IntentLightConfig; +use App\Config\LanguageCleanupConfig; /** * IntentLite @@ -20,10 +21,9 @@ final readonly class IntentLite { public function __construct( - private IntentLightConfig $config - ) - { - + private IntentLightConfig $config, + private LanguageCleanupConfig $languageCleanupConfig + ) { } public function detectList(string $originalPrompt): array @@ -99,16 +99,9 @@ final readonly class IntentLite { $s = mb_strtolower($s); - // Umlaute zusätzlich absichern (falls QueryCleaner das tut) - $replacements = [ - 'ä' => 'ae', - 'ö' => 'oe', - 'ü' => 'ue', - 'ß' => 'ss', - ]; - - // Nur als Zusatzform speichern (nicht ersetzen!) - foreach ($replacements as $umlaut => $alt) { + // Keep the language-specific transliteration table in YAML. + // Only append an ASCII variant; do not replace the original form. + foreach ($this->languageCleanupConfig->getAsciiTransliterationMap() as $umlaut => $alt) { if (str_contains($s, $umlaut)) { $s .= ' ' . str_replace($umlaut, $alt, $s); break; diff --git a/src/Intent/SalesIntentLite.php b/src/Intent/SalesIntentLite.php index 3566886..aefb9df 100644 --- a/src/Intent/SalesIntentLite.php +++ b/src/Intent/SalesIntentLite.php @@ -4,6 +4,7 @@ declare(strict_types=1); namespace App\Intent; +use App\Config\LanguageCleanupConfig; use App\Config\SalesIntentConfig; final class SalesIntentLite @@ -16,7 +17,8 @@ final class SalesIntentLite public const ROI = 'roi'; public function __construct( - private readonly SalesIntentConfig $config + private readonly SalesIntentConfig $config, + private readonly LanguageCleanupConfig $languageCleanupConfig ) { } @@ -123,11 +125,6 @@ final class SalesIntentLite { $s = mb_strtolower($s); - return strtr($s, [ - 'ä' => 'ae', - 'ö' => 'oe', - 'ü' => 'ue', - 'ß' => 'ss', - ]); + return $this->languageCleanupConfig->transliterateToAscii($s); } -} \ No newline at end of file +} diff --git a/src/Knowledge/Retrieval/NdjsonChunkLookup.php b/src/Knowledge/Retrieval/NdjsonChunkLookup.php index 9e9078a..e718745 100644 --- a/src/Knowledge/Retrieval/NdjsonChunkLookup.php +++ b/src/Knowledge/Retrieval/NdjsonChunkLookup.php @@ -4,12 +4,14 @@ declare(strict_types=1); namespace App\Knowledge\Retrieval; +use App\Config\NdjsonHybridRetrieverConfig; use App\Knowledge\ChunkManager; final readonly class NdjsonChunkLookup { public function __construct( - private ChunkManager $chunkManager + private ChunkManager $chunkManager, + private NdjsonHybridRetrieverConfig $retrieverConfig ) { } @@ -395,7 +397,7 @@ final readonly class NdjsonChunkLookup $length = mb_strlen($token, 'UTF-8'); if ($length >= 5) { - foreach (['innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'] as $suffix) { + foreach ($this->retrieverConfig->exactSelectionTokenVariantSuffixes() as $suffix) { if (!str_ends_with($token, $suffix)) { continue; } diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 2850a7f..6f609a9 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -843,7 +843,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $length = mb_strlen($token, 'UTF-8'); if ($length >= 5) { - foreach (['typen', 'innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'] as $suffix) { + foreach ($this->retrieverConfig->exactSelectionTokenVariantSuffixes() as $suffix) { if (!str_ends_with($token, $suffix)) { continue; } diff --git a/src/Service/FormatText.php b/src/Service/FormatText.php index 5f26a93..782ca90 100644 --- a/src/Service/FormatText.php +++ b/src/Service/FormatText.php @@ -2,19 +2,20 @@ namespace App\Service; +use App\Config\LanguageCleanupConfig; + class FormatText { + public function __construct(private readonly LanguageCleanupConfig $languageCleanupConfig) + { + } + function slugify(string $text): string { $text = mb_strtolower($text, 'UTF-8'); - // Umlaute ersetzen - $replacements = [ - 'ä' => 'ae', - 'ö' => 'oe', - 'ü' => 'ue', - 'ß' => 'ss' - ]; + // Use YAML-backed language normalization instead of a PHP-owned list. + $replacements = $this->languageCleanupConfig->getAsciiTransliterationMap(); $text = str_replace(array_keys($replacements), $replacements, $text); // Nicht erlaubte Zeichen entfernen @@ -27,4 +28,4 @@ class FormatText return trim($text, '-'); } -} \ No newline at end of file +}