diff --git a/config/retriex/language.yaml b/config/retriex/language.yaml index dc38fcf..1ff347c 100644 --- a/config/retriex/language.yaml +++ b/config/retriex/language.yaml @@ -230,6 +230,17 @@ parameters: - gibt es - suche im shop + # Reusable cleanup group sets keep common profile compositions in one place. + # Profiles may still add local group references after these shared sets. + stopword_group_sets: + de_conversation: + - de_core + - conversation + + phrase_group_sets: + user_instruction: + - user_instruction + meta_term_groups: presentation: - tabelle @@ -256,40 +267,40 @@ parameters: cleanup_profiles: commerce_query: + stopword_group_sets: + - de_conversation stopword_groups: - - de_core - - conversation - pronouns - user_instruction_terms - response_style - phrase_groups: + phrase_group_sets: - user_instruction rag_evidence: + stopword_group_sets: + - de_conversation stopword_groups: - - de_core - - conversation - user_instruction_terms retrieval_reference_cleanup: + stopword_group_sets: + - de_conversation stopword_groups: - - de_core - - conversation - question_terms meta_term_groups: - retrieval_reference shop_context_fallback: + stopword_group_sets: + - de_conversation stopword_groups: - - de_core - - conversation - pronouns - user_instruction_terms - question_terms - usage_terms - reference_fillers - response_style - phrase_groups: + phrase_group_sets: - user_instruction meta_term_groups: - presentation diff --git a/patch_history/RETRIEX_PATCH_43N_LANGUAGE_CLEANUP_GROUP_SETS_README.md b/patch_history/RETRIEX_PATCH_43N_LANGUAGE_CLEANUP_GROUP_SETS_README.md new file mode 100644 index 0000000..23d6714 --- /dev/null +++ b/patch_history/RETRIEX_PATCH_43N_LANGUAGE_CLEANUP_GROUP_SETS_README.md @@ -0,0 +1,46 @@ +# RetrieX Patch 43N - Language Cleanup Group Sets + +## Goal + +Reduce repeated cleanup-profile group compositions in `language.yaml` without changing effective cleanup behavior. 
+ +## Changes + +- Added optional `stopword_group_sets` and `phrase_group_sets` to `config/retriex/language.yaml`. +- Moved repeated `de_core` + `conversation` profile composition into `stopword_group_sets.de_conversation`. +- Moved repeated `user_instruction` phrase composition into `phrase_group_sets.user_instruction`. +- Updated `LanguageCleanupConfig` to resolve group sets before local profile groups. +- Existing `stopword_groups`, `phrase_groups`, and `meta_term_groups` remain supported. + +## Non-goals + +- No new domain/runtime logic. +- No scoring changes. +- No prompt-rule changes. +- No retrieval changes. +- No admin UI changes. +- No new hard-coded domain-specific lists in PHP core. + +## Expected effective behavior + +The effective cleanup profile outputs remain identical to p43M: + +- `commerce_query`: same stopwords, phrases, meta terms, protected terms. +- `rag_evidence`: same stopwords, phrases, meta terms, protected terms. +- `retrieval_reference_cleanup`: same stopwords, phrases, meta terms, protected terms. +- `shop_context_fallback`: same stopwords, phrases, meta terms, protected terms. + +## Local checks performed + +```bash +php -l src/Config/LanguageCleanupConfig.php +php -l src/Config/GovernanceConfig.php +php -l src/Config/AgentRunnerConfig.php +php -l src/Config/PromptBuilderConfig.php +php -l src/Config/SearchRepairConfig.php +python3 YAML parse check for config/retriex/*.yaml +python3 effective p43M-vs-p43N cleanup profile comparison +php LanguageCleanupConfig effective resolver comparison +``` + +The local `bin/console` checks could not be executed in this artifact environment because the ZIP does not include `vendor/` and Composer dependencies are unavailable here. 
diff --git a/src/Config/LanguageCleanupConfig.php b/src/Config/LanguageCleanupConfig.php index 820f442..2bca0d0 100644 --- a/src/Config/LanguageCleanupConfig.php +++ b/src/Config/LanguageCleanupConfig.php @@ -163,10 +163,14 @@ final class LanguageCleanupConfig private function resolveGroupedTerms(string $profileName, string $profileKey, string $rootKey): array { $profile = $this->requiredCleanupProfile($profileName); - $groupNames = $this->stringListFromValue( - $this->profileValue($profile, $profileKey), - sprintf('cleanup_profiles.%s.%s', $profileName, $profileKey), - false + $groupNames = $this->resolveProfileGroupSetTerms($profileName, $profile, $profileKey); + $groupNames = $this->mergeUnique( + $groupNames, + $this->stringListFromValue( + $this->profileValue($profile, $profileKey), + sprintf('cleanup_profiles.%s.%s', $profileName, $profileKey), + false + ) ); if ($groupNames === []) { @@ -193,6 +197,66 @@ final class LanguageCleanupConfig return $out; } + /** + * @param array $profile cleanup profile map; its entry under profileGroupSetKey($profileKey) lists the group-set names to expand + * @return string[] group names read from the referenced root-level group sets, merged via mergeUnique() in reference order; [] when the profile references no sets + */ + private function resolveProfileGroupSetTerms(string $profileName, array $profile, string $profileKey): array + { + $profileSetKey = $this->profileGroupSetKey($profileKey); + $setNames = $this->stringListFromValue( + $this->profileValue($profile, $profileSetKey), + sprintf('cleanup_profiles.%s.%s', $profileName, $profileSetKey), + false + ); + + if ($setNames === []) { + return []; + } + + $rootSetKey = $this->rootGroupSetKey($profileKey); + $sets = $this->requiredMap($rootSetKey); + $out = []; + + foreach ($setNames as $setName) { + if (!array_key_exists($setName, $sets)) { + throw $this->invalid( + sprintf('cleanup_profiles.%s.%s', $profileName, $profileSetKey), + sprintf('references unknown group set "%s"', $setName) + ); + } + + $out = $this->mergeUnique( + $out, + $this->stringListFromValue($sets[$setName], sprintf('%s.%s', $rootSetKey, $setName), true) + ); + } + + return $out; + } + + private function profileGroupSetKey(string $profileKey): 
string + { + if ($profileKey === 'stopword_groups') { + return 'stopword_group_sets'; + } + + if ($profileKey === 'phrase_groups') { + return 'phrase_group_sets'; + } + + if ($profileKey === 'meta_term_groups') { + return 'meta_term_group_sets'; + } + + return sprintf('%s_sets', $profileKey); + } + + private function rootGroupSetKey(string $profileKey): string + { + return $this->profileGroupSetKey($profileKey); + } + /** @return array */ private function requiredCleanupProfile(string $profileName): array {