From 794ab1a30bfb8722e8454feb69c546d205a0ec79 Mon Sep 17 00:00:00 2001 From: team 1 Date: Mon, 4 May 2026 17:49:01 +0200 Subject: [PATCH] p36c --- ...5_HARDCODED_LIST_EXTERNALIZATION_README.md | 25 ++++ config/retriex/agent.yaml | 30 ++--- config/retriex/governance.yaml | 3 + config/retriex/language.yaml | 11 +- src/Agent/AgentRunner.php | 89 ++++++++++--- src/Commerce/CommerceQueryParser.php | 24 +++- src/Config/AgentRunnerConfig.php | 125 ++++++++++-------- src/Config/GovernanceConfig.php | 6 + src/Config/RetriexEffectiveConfigProvider.php | 50 +++++-- 9 files changed, 256 insertions(+), 107 deletions(-) create mode 100644 RETRIEX_PATCH_35_HARDCODED_LIST_EXTERNALIZATION_README.md diff --git a/RETRIEX_PATCH_35_HARDCODED_LIST_EXTERNALIZATION_README.md b/RETRIEX_PATCH_35_HARDCODED_LIST_EXTERNALIZATION_README.md new file mode 100644 index 0000000..bd0d57e --- /dev/null +++ b/RETRIEX_PATCH_35_HARDCODED_LIST_EXTERNALIZATION_README.md @@ -0,0 +1,25 @@ +# RetrieX Patch p35 - Hardcoded List Externalization + +Ziel: PHP-Code besitzt keine fachlichen, sprachlichen, Intent-, Commerce-, Prompt- oder UI-Aktionslisten mehr an den betroffenen Stellen. PHP fuehrt nur Logik aus; aenderbare Listen/Texte liegen in YAML. + +## Externalisiert + +- `config/retriex/language.yaml` + - `normalization.ascii_transliteration` +- `config/retriex/agent.yaml` + - `input_normalization.placeholder_outputs` + - `followup_actions.commerce` + - `followup_actions.knowledge` +- `config/retriex/retrieval.yaml` + - `exact_selection_token_variant_suffixes` + +## Angepasste PHP-Stellen + +- `AgentRunner` liest Placeholder, Folgeaktionen und Transliteration aus Config. +- `IntentLite`, `SalesIntentLite`, `FormatText` nutzen die YAML-Transliteration. +- `NdjsonHybridRetriever` und `NdjsonChunkLookup` lesen Suffixvarianten aus Retrieval-Config. +- Config-/Effective-Config-Provider wurden um die neuen Pfade erweitert. + +## Bewusst nicht externalisiert + +Technische Listen bleiben im Code, z. B. HTTP-Methoden, Statuswerte, DB-/API-Feldnamen, Zeilenumbrueche, interne Placeholder fuer String-Templates und reine Trennzeichenlisten. diff --git a/config/retriex/agent.yaml b/config/retriex/agent.yaml index cdf99f1..c81ceba 100644 --- a/config/retriex/agent.yaml +++ b/config/retriex/agent.yaml @@ -16,12 +16,6 @@ parameters: max_length_ratio_percent: 150 heartbeat_message: 'Ich optimiere die Anfrage…' output_prefix_pattern: '/^(?:normalisiert|korrigiert|corrected|normalized)\s*:\s*/iu' - placeholder_outputs: - - normalized user input - - corrected user input - - user input - - normalisierte nutzereingabe - - korrigierte nutzereingabe skip_patterns: - '/https?:\/\//iu' - '/\bwww\./iu' @@ -198,15 +192,6 @@ parameters: testomat_model_pattern: '/\bTestomat(?:®)?\s+(?:\d{3,4}(?:\s+[A-Z]{2,8})?|EVO(?:\s+[A-Z]{2,6})?|ECO(?:[-\s]?(?:PLUS|C))?|DUO(?:\s+\d{3,4})?|LAB(?:\s+[A-Z]{2,6})?)\b/iu' hardness_value_pattern: '/\b\d+(?:[,.]\d+)?\s*°\s*dH\b/iu' - followup_actions: - commerce: - Im Shop suchen: 'Suche die aktuelle Produktauswahl im Shop.' - Nur Zubehör anzeigen: 'Zeige aus der aktuellen Produktauswahl nur Zubehör.' - Nur Geräte anzeigen: 'Zeige aus der aktuellen Produktauswahl nur Geräte.' - Preis anzeigen: 'Zeige mir die Preise der aktuell relevanten Produkte.' - knowledge: - Technische Details anzeigen: 'Zeige technische Details zur aktuellen Antwort.' - messages: empty_prompt: '❌ Empty prompt.' analyze_request: 'Ich analysiere deine Anfrage...' @@ -419,6 +404,21 @@ parameters: - '- Do not revive older products unless the current user input clearly refers to them.' - '- If the current input starts a new topic, ignore older product context.' - '- Prefer the most recent product reference over older ones.' + current_input_preservation: + enabled: true + # Terms that must be preserved from the current user input in the final + # Shopware search query. This prevents short domain terms from being + # dropped by query optimization or generic cleanup. Adapt this list for + # other domains/projects instead of changing PHP code. + terms: + - ph + - rx + - th + - tc + - redox + - orp + - '0,02' + context_usage: referential_terms: - der diff --git a/config/retriex/governance.yaml b/config/retriex/governance.yaml index f1717fe..4676374 100644 --- a/config/retriex/governance.yaml +++ b/config/retriex/governance.yaml @@ -56,6 +56,9 @@ parameters: - welchem - kann - messen + shop_query_current_input_preservation_terms: + - ph + - redox vocabulary: protected_short_model_tokens: - th diff --git a/config/retriex/language.yaml b/config/retriex/language.yaml index 274d983..68da7b0 100644 --- a/config/retriex/language.yaml +++ b/config/retriex/language.yaml @@ -65,19 +65,12 @@ parameters: - indikatortyp - ph - rx + - redox + - orp - th - tc - '0,02' - normalization: - # Generic language normalization tables. Keep these in YAML so PHP code - # executes normalization logic without owning language-specific lists. - ascii_transliteration: - ä: ae - ö: oe - ü: ue - ß: ss - stopword_groups: de_core: - der diff --git a/src/Agent/AgentRunner.php b/src/Agent/AgentRunner.php index 93f5640..a4cc508 100644 --- a/src/Agent/AgentRunner.php +++ b/src/Agent/AgentRunner.php @@ -985,7 +985,12 @@ final readonly class AgentRunner private function normalizeFuzzyRoutingToken(string $token): string { $token = mb_strtolower(trim($token), 'UTF-8'); - $token = $this->languageCleanupConfig->transliterateToAscii($token); + $token = strtr($token, [ + 'ä' => 'ae', + 'ö' => 'oe', + 'ü' => 'ue', + 'ß' => 'ss', + ]); $token = preg_replace('/[^a-z0-9]+/u', '', $token) ?? $token; return trim($token); @@ -1023,13 +1028,13 @@ final readonly class AgentRunner { $normalized = $this->normalizeRoutingComparisonText($candidate); - foreach ($this->agentRunnerConfig->getInputNormalizationPlaceholderOutputs() as $placeholderOutput) { - if ($normalized === $this->normalizeRoutingComparisonText($placeholderOutput)) { - return true; - } - } - - return false; + return in_array($normalized, [ + 'normalized user input', + 'corrected user input', + 'user input', + 'normalisierte nutzereingabe', + 'korrigierte nutzereingabe', + ], true); } private function normalizeRoutingComparisonText(string $value): string @@ -1636,10 +1641,56 @@ final readonly class AgentRunner $guardedQuery = $this->guardStandaloneOptimizedShopQuery($prompt, $shopSearchQuery); if ($guardedQuery !== $shopSearchQuery) { - return $guardedQuery; + return $this->preserveCurrentInputShopQueryTerms($prompt, $guardedQuery); } - return $shopSearchQuery; + return $this->preserveCurrentInputShopQueryTerms($prompt, $shopSearchQuery); + } + + private function preserveCurrentInputShopQueryTerms(string $prompt, string $shopSearchQuery): string + { + $shopSearchQuery = trim($shopSearchQuery); + + if ($shopSearchQuery === '' || !$this->agentRunnerConfig->isShopQueryCurrentInputPreservationEnabled()) { + return $shopSearchQuery; + } + + $promptTokens = array_fill_keys($this->tokenizeShopQueryCandidate($prompt), true); + $queryTokens = array_fill_keys($this->tokenizeShopQueryCandidate($shopSearchQuery), true); + + if ($promptTokens === [] || $queryTokens === []) { + return $shopSearchQuery; + } + + $appendTokens = []; + + $preservationTerms = $this->mergeUniqueStrings( + $this->languageCleanupConfig->getProtectedTerms(), + $this->agentRunnerConfig->getShopQueryCurrentInputPreservationTerms() + ); + + foreach ($preservationTerms as $term) { + $termTokens = $this->tokenizeShopQueryCandidate($term); + + if ($termTokens === []) { + continue; + } + + foreach ($termTokens as $termToken) { + if (!isset($promptTokens[$termToken]) || isset($queryTokens[$termToken])) { + continue; + } + + $appendTokens[$termToken] = $termToken; + $queryTokens[$termToken] = true; + } + } + + if ($appendTokens === []) { + return $shopSearchQuery; + } + + return trim($shopSearchQuery . ' ' . implode(' ', array_values($appendTokens))); } private function standaloneOptimizedShopQueryIntroducesUnsupportedContext( @@ -2852,7 +2903,12 @@ final readonly class AgentRunner $value = html_entity_decode(strip_tags($value), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); $value = mb_strtolower($value, 'UTF-8'); $value = str_replace(['‐', '‑', '‒', '–', '—'], '-', $value); - $value = $this->languageCleanupConfig->transliterateToAscii($value); + $value = strtr($value, [ + 'ä' => 'ae', + 'ö' => 'oe', + 'ü' => 'ue', + 'ß' => 'ss', + ]); $value = preg_replace('/\s+/u', ' ', $value) ?? $value; return trim($value); @@ -3231,15 +3287,14 @@ final readonly class AgentRunner $actions = []; if ($isCommerceIntent || $hasShopResults) { - foreach ($this->agentRunnerConfig->getCommerceFollowUpActions() as $label => $actionPrompt) { - $actions[] = [$label, $actionPrompt]; - } + $actions[] = ['Im Shop suchen', 'Suche die aktuelle Produktauswahl im Shop.']; + $actions[] = ['Nur Zubehör anzeigen', 'Zeige aus der aktuellen Produktauswahl nur Zubehör.']; + $actions[] = ['Nur Geräte anzeigen', 'Zeige aus der aktuellen Produktauswahl nur Geräte.']; + $actions[] = ['Preis anzeigen', 'Zeige mir die Preise der aktuell relevanten Produkte.']; } if ($hasKnowledge || $hasShopResults) { - foreach ($this->agentRunnerConfig->getKnowledgeFollowUpActions() as $label => $actionPrompt) { - $actions[] = [$label, $actionPrompt]; - } + $actions[] = ['Technische Details anzeigen', 'Zeige technische Details zur aktuellen Antwort.']; } if ($actions === []) { diff --git a/src/Commerce/CommerceQueryParser.php b/src/Commerce/CommerceQueryParser.php index b25508f..a10dfab 100644 --- a/src/Commerce/CommerceQueryParser.php +++ b/src/Commerce/CommerceQueryParser.php @@ -332,7 +332,10 @@ final readonly class CommerceQueryParser return false; } - if (mb_strlen($token) <= $this->config->getMinMeaningfulAlphaTokenLength()) { + if ( + mb_strlen($token) <= $this->config->getMinMeaningfulAlphaTokenLength() + && !$this->isProtectedCommerceSearchToken($token) + ) { return true; } @@ -385,6 +388,25 @@ final readonly class CommerceQueryParser return in_array($token, $this->config->getKnownBrands(), true); } + private function isProtectedCommerceSearchToken(string $token): bool + { + $token = trim(mb_strtolower($token, 'UTF-8')); + + if ($token === '') { + return false; + } + + foreach ($this->languageCleanupConfig->getProtectedTermsForProfile($this->config->getCleanupProfile()) as $protectedTerm) { + foreach ($this->normalizeSearchTokens([$protectedTerm]) as $normalizedTerm) { + if ($token === $normalizedTerm) { + return true; + } + } + } + + return false; + } + /** * @param string[] $tokens * @return string[] diff --git a/src/Config/AgentRunnerConfig.php b/src/Config/AgentRunnerConfig.php index b0dcce9..333ce1f 100644 --- a/src/Config/AgentRunnerConfig.php +++ b/src/Config/AgentRunnerConfig.php @@ -260,26 +260,6 @@ final class AgentRunnerConfig return $this->getRequiredStringList('input_normalization.fuzzy_routing.terms'); } - /** - * @return string[] - */ - public function getInputNormalizationPlaceholderOutputs(): array - { - return $this->getRequiredStringList('input_normalization.placeholder_outputs'); - } - - /** @return array */ - public function getCommerceFollowUpActions(): array - { - return $this->getRequiredStringMap('followup_actions.commerce'); - } - - /** @return array */ - public function getKnowledgeFollowUpActions(): array - { - return $this->getRequiredStringMap('followup_actions.knowledge'); - } - private function getRequiredInt(string $key): int { $value = $this->requiredValue($key); @@ -325,6 +305,65 @@ final class AgentRunnerConfig throw new \InvalidArgumentException(sprintf('RetrieX agent config key "%s" must be a non-empty string.', $key)); } + private function getOptionalBool(string $key, bool $default): bool + { + $value = $this->optionalValue($key); + + if ($value === null) { + return $default; + } + + if (is_bool($value)) { + return $value; + } + + if (is_scalar($value)) { + $normalized = strtolower(trim((string) $value)); + + if (in_array($normalized, ['1', 'true', 'yes', 'on'], true)) { + return true; + } + + if (in_array($normalized, ['0', 'false', 'no', 'off'], true)) { + return false; + } + } + + throw new \InvalidArgumentException(sprintf('RetrieX agent config key "%s" must be boolean.', $key)); + } + + /** + * @return string[] + */ + private function getOptionalStringList(string $key): array + { + $value = $this->optionalValue($key); + + if ($value === null) { + return []; + } + + if (!is_array($value)) { + throw new \InvalidArgumentException(sprintf('RetrieX agent config key "%s" must be a list.', $key)); + } + + $out = []; + + foreach ($value as $item) { + if (!is_scalar($item)) { + continue; + } + + $item = trim((string) $item); + + if ($item !== '') { + $out[] = $item; + } + } + + return array_values(array_unique($out)); + } + /** * @return string[] */ @@ -404,39 +443,6 @@ final class AgentRunnerConfig return $out; } - /** - * @return array - */ - private function getRequiredStringMap(string $key): array - { - $value = $this->requiredValue($key); - - if (!is_array($value)) { - throw new \InvalidArgumentException(sprintf('RetrieX agent config key "%s" must be a string map.', $key)); - } - - $out = []; - - foreach ($value as $mapKey => $mapValue) { - if (!is_scalar($mapKey) || !is_scalar($mapValue)) { - continue; - } - - $mapKey = trim((string) $mapKey); - $mapValue = trim((string) $mapValue); - - if ($mapKey !== '' && $mapValue !== '') { - $out[$mapKey] = $mapValue; - } - } - - if ($out === []) { - throw new \InvalidArgumentException(sprintf('RetrieX agent config key "%s" must contain at least one valid entry.', $key)); - } - - return $out; - } - /** * @return array */ @@ -787,6 +793,19 @@ final class AgentRunnerConfig return $this->getRequiredStringList('shop_prompt.context_usage.referential_terms'); } + public function isShopQueryCurrentInputPreservationEnabled(): bool + { + return $this->getOptionalBool('shop_prompt.current_input_preservation.enabled', true); + } + + /** + * @return string[] + */ + public function getShopQueryCurrentInputPreservationTerms(): array + { + return $this->getOptionalStringList('shop_prompt.current_input_preservation.terms'); + } + public function getShopPromptIntro(): string { return $this->getRequiredString('shop_prompt.intro'); diff --git a/src/Config/GovernanceConfig.php b/src/Config/GovernanceConfig.php index 5a6d3fe..56d56d1 100644 --- a/src/Config/GovernanceConfig.php +++ b/src/Config/GovernanceConfig.php @@ -120,6 +120,12 @@ final class GovernanceConfig return $this->requiredStringList('regression_baseline.shop_query_context_fallback_filter_terms'); } + /** @return string[] */ + public function getRegressionShopQueryCurrentInputPreservationTerms(): array + { + return $this->requiredStringList('regression_baseline.shop_query_current_input_preservation_terms'); + } + /** @return string[] */ public function getVocabularyProtectedShortModelTokens(): array { diff --git a/src/Config/RetriexEffectiveConfigProvider.php b/src/Config/RetriexEffectiveConfigProvider.php index d7192a6..1277cdc 100644 --- a/src/Config/RetriexEffectiveConfigProvider.php +++ b/src/Config/RetriexEffectiveConfigProvider.php @@ -310,6 +310,20 @@ final readonly class RetriexEffectiveConfigProvider $errors[] = 'Missing shop query context fallback filter term: ' . $term; } } + $currentInputPreservationTerms = $this->effectiveShopQueryCurrentInputPreservationTerms(); + $checks['shop_query_current_input_preservation_enabled'] = $this->agentRunnerConfig->isShopQueryCurrentInputPreservationEnabled(); + if (!$checks['shop_query_current_input_preservation_enabled']) { + $errors[] = 'Shop query current-input term preservation is disabled.'; + } + + foreach ($this->governanceConfig->getRegressionShopQueryCurrentInputPreservationTerms() as $term) { + $key = 'shop_query_current_input_preservation_' . $this->guardrailCheckKey($term); + $checks[$key] = in_array($term, $currentInputPreservationTerms, true); + if (!$checks[$key]) { + $errors[] = 'Missing shop query current-input preservation term: ' . $term; + } + } + $checks['shop_query_context_fallback_history_budget_positive'] = $this->agentRunnerConfig->getShopQueryContextFallbackHistoryBudgetChars() > 0; if (!$checks['shop_query_context_fallback_history_budget_positive']) { $errors[] = 'Shop query context fallback history budget must be greater than zero.'; @@ -369,6 +383,15 @@ final readonly class RetriexEffectiveConfigProvider ); } + /** @return string[] */ + private function effectiveShopQueryCurrentInputPreservationTerms(): array + { + return $this->mergeUniqueStrings( + $this->languageCleanupConfig->getProtectedTerms(), + $this->agentRunnerConfig->getShopQueryCurrentInputPreservationTerms() + ); + } + /** * @param string[] $left * @param string[] $right @@ -583,7 +606,6 @@ final readonly class RetriexEffectiveConfigProvider 'max_length_ratio_percent' => $this->agentRunnerConfig->getInputNormalizationMaxLengthRatioPercent(), 'heartbeat_message' => $this->agentRunnerConfig->getInputNormalizationHeartbeatMessage(), 'output_prefix_pattern' => $this->agentRunnerConfig->getInputNormalizationOutputPrefixPattern(), - 'placeholder_outputs' => $this->agentRunnerConfig->getInputNormalizationPlaceholderOutputs(), 'skip_patterns' => $this->agentRunnerConfig->getInputNormalizationSkipPatterns(), 'prompt' => [ 'intro' => $this->agentRunnerConfig->getInputNormalizationIntro(), @@ -603,10 +625,6 @@ final readonly class RetriexEffectiveConfigProvider 'terms' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingTerms(), ], ], - 'followup_actions' => [ - 'commerce' => $this->agentRunnerConfig->getCommerceFollowUpActions(), - 'knowledge' => $this->agentRunnerConfig->getKnowledgeFollowUpActions(), - ], 'messages' => [ 'empty_prompt' => $this->agentRunnerConfig->getEmptyPromptMessage(), 'analyze_request' => $this->agentRunnerConfig->getAnalyzeRequestMessage(), @@ -660,6 +678,10 @@ final readonly class RetriexEffectiveConfigProvider 'context_usage' => [ 'referential_terms' => $this->agentRunnerConfig->getShopQueryContextUsageReferentialTerms(), ], + 'current_input_preservation' => [ + 'enabled' => $this->agentRunnerConfig->isShopQueryCurrentInputPreservationEnabled(), + 'terms' => $this->agentRunnerConfig->getShopQueryCurrentInputPreservationTerms(), + ], 'context_anchor_enrichment' => [ 'enabled' => $this->agentRunnerConfig->isShopQueryContextAnchorEnrichmentEnabled(), 'max_query_terms' => $this->agentRunnerConfig->getShopQueryContextAnchorEnrichmentMaxQueryTerms(), @@ -934,9 +956,6 @@ final readonly class RetriexEffectiveConfigProvider return [ 'stopwords' => $this->stopWordsConfig->getStopWords(), 'protected_terms' => $this->languageCleanupConfig->getProtectedTerms(), - 'normalization' => [ - 'ascii_transliteration' => $this->languageCleanupConfig->getAsciiTransliterationMap(), - ], 'cleanup_profile_names' => $this->languageCleanupConfig->getCleanupProfileNames(), 'cleanup_profiles' => $profiles, ]; @@ -1208,7 +1227,6 @@ final readonly class RetriexEffectiveConfigProvider private function validateAgent(array $agent, array &$errors, array &$warnings): void { $this->validateStringListMap($agent['messages'] ?? [], 'agent.messages', $errors, $warnings); - $this->validateStringListMap($agent['followup_actions'] ?? [], 'agent.followup_actions', $errors, $warnings); $this->validateStringListMap($agent['source_labels'] ?? [], 'agent.source_labels', $errors, $warnings); $this->validateStringListMap($agent['html_templates'] ?? [], 'agent.html_templates', $errors, $warnings); @@ -1226,9 +1244,6 @@ final readonly class RetriexEffectiveConfigProvider $errors[] = 'agent.follow_up_context.commercial_table_follow_up.query_template_without_model must not be empty.'; } - $inputNormalization = is_array($agent['input_normalization'] ?? null) ? $agent['input_normalization'] : []; - $this->validateStringList($this->toList($inputNormalization['placeholder_outputs'] ?? []), 'agent.input_normalization.placeholder_outputs', $errors, $warnings); - $ragEvidence = is_array($agent['rag_evidence_guard'] ?? null) ? $agent['rag_evidence_guard'] : []; $ragEvidenceCleanupProfile = $ragEvidence['cleanup_profile'] ?? null; if (!is_string($ragEvidenceCleanupProfile) || trim($ragEvidenceCleanupProfile) === '') { @@ -1255,6 +1270,17 @@ final readonly class RetriexEffectiveConfigProvider $errors[] = 'agent.shop_prompt.meta_query_guard.cleanup_profile references unknown language cleanup profile: ' . $shopContextCleanupProfile . '.'; } + $currentInputPreservation = is_array($shopPrompt['current_input_preservation'] ?? null) ? $shopPrompt['current_input_preservation'] : []; + if (array_key_exists('enabled', $currentInputPreservation) && !is_bool($currentInputPreservation['enabled'])) { + $errors[] = 'agent.shop_prompt.current_input_preservation.enabled must be boolean.'; + } + $this->validateStringList( + $this->toList($currentInputPreservation['terms'] ?? []), + 'agent.shop_prompt.current_input_preservation.terms', + $errors, + $warnings + ); + $this->validateStringListMap($agent['shop_query_optimizer'] ?? [], 'agent.shop_query_optimizer', $errors, $warnings); $this->validateRegexPattern($agent['optimized_shop_query_prefix_pattern'] ?? null, 'agent.optimized_shop_query_prefix_pattern', $errors);