<?php

declare(strict_types=1);

// NOTE(review): this class was recovered from a flattened patch of
// src/Config/LanguageCleanupConfig.php. The original opening lines (namespace,
// `use` imports, exact class modifiers) were not recoverable and must be restored
// from the repository. The exception class is referenced fully qualified below so
// the code resolves correctly with or without a namespace.

/**
 * Read-only view over the language cleanup configuration array.
 *
 * Top-level keys read by this class: `words`, `protected_terms`,
 * `cleanup_profiles`, `stopword_groups`, `phrase_groups` and `meta_term_groups`.
 * Nothing is validated in the constructor; every accessor validates the part of
 * the configuration it touches and throws \InvalidArgumentException on bad input.
 *
 * All returned terms are trimmed, lowercased and de-duplicated (first occurrence
 * wins, original order kept).
 */
class LanguageCleanupConfig
{
    /**
     * @param array<string, mixed> $config Raw configuration map.
     */
    public function __construct(private readonly array $config)
    {
    }

    /**
     * Returns the normalized top-level `words` list.
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException If `words` is missing, not an array, or has no usable entry.
     */
    public function getLegacyStopWords(): array
    {
        return $this->requiredTopLevelStringList('words');
    }

    /**
     * Returns the normalized top-level `protected_terms` list.
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException If `protected_terms` is missing, not an array, or has no usable entry.
     */
    public function getProtectedTerms(): array
    {
        return $this->requiredTopLevelStringList('protected_terms');
    }

    /**
     * Tells whether the given term, after normalization, is one of the protected terms.
     *
     * A term that is empty after trimming is never protected, and in that case the
     * configuration is not consulted at all.
     *
     * @throws \InvalidArgumentException If the `protected_terms` configuration is invalid.
     */
    public function isProtectedTerm(string $term): bool
    {
        $term = $this->normalizeToken($term);
        if ($term === '') {
            return false;
        }

        return in_array($term, $this->getProtectedTerms(), true);
    }

    /**
     * Returns the keys of the `cleanup_profiles` map.
     *
     * PHP stores integer-like array keys as ints, so each key is cast back to
     * string to honour the documented return type.
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException If `cleanup_profiles` is missing, not an array, or empty.
     */
    public function getCleanupProfileNames(): array
    {
        return array_map(
            static fn (int|string $name): string => (string) $name,
            array_keys($this->requiredMap('cleanup_profiles')),
        );
    }

    /**
     * Returns the stop words of all `stopword_groups` referenced by the profile,
     * with protected terms removed.
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException If the profile, a referenced group, or `protected_terms` is invalid.
     */
    public function getStopWordsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'stopword_groups', 'stopword_groups')
        );
    }

    /**
     * Returns the phrases of all `phrase_groups` referenced by the profile,
     * with protected terms removed.
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException If the profile, a referenced group, or `protected_terms` is invalid.
     */
    public function getPhrasesForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'phrase_groups', 'phrase_groups')
        );
    }

    /**
     * Returns the meta terms of all `meta_term_groups` referenced by the profile,
     * with protected terms removed.
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException If the profile, a referenced group, or `protected_terms` is invalid.
     */
    public function getMetaTermsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'meta_term_groups', 'meta_term_groups')
        );
    }

    /**
     * Bundles the four per-profile lists into one array.
     *
     * @return array{stopwords:string[], phrases:string[], meta_terms:string[], protected_terms:string[]}
     *
     * @throws \InvalidArgumentException If any part of the profile configuration is invalid.
     */
    public function getCleanupProfile(string $profileName): array
    {
        return [
            'stopwords' => $this->getStopWordsForProfile($profileName),
            'phrases' => $this->getPhrasesForProfile($profileName),
            'meta_terms' => $this->getMetaTermsForProfile($profileName),
            'protected_terms' => $this->getProtectedTermsForProfile($profileName),
        ];
    }

    /**
     * Returns the protected terms that apply to a profile.
     *
     * A profile without `protected_term_groups` (or with an empty list) gets the
     * top-level protected terms. The only group name accepted is `protected_terms`;
     * any other name is rejected.
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException If the profile is invalid or references an unknown group.
     */
    public function getProtectedTermsForProfile(string $profileName): array
    {
        $profile = $this->requiredCleanupProfile($profileName);
        $groupNames = $this->stringListFromValue(
            $this->profileValue($profile, 'protected_term_groups'),
            sprintf('cleanup_profiles.%s.protected_term_groups', $profileName),
            false
        );

        if ($groupNames === []) {
            return $this->getProtectedTerms();
        }

        $out = [];
        foreach ($groupNames as $groupName) {
            if ($groupName === 'protected_terms') {
                $out = $this->mergeUnique($out, $this->getProtectedTerms());
                continue;
            }

            throw $this->invalid(
                sprintf('cleanup_profiles.%s.protected_term_groups', $profileName),
                sprintf('references unknown protected term group "%s"', $groupName)
            );
        }

        return $out;
    }

    /**
     * Collects the terms of every group a profile references.
     *
     * Group names are read through stringListFromValue() and are therefore trimmed
     * and lowercased before the lookup, so group keys under $rootKey have to be
     * lowercase to be found. The root map is only required when the profile
     * actually references at least one group.
     *
     * @param string $profileName Profile to read.
     * @param string $profileKey  Key inside the profile that lists the group names.
     * @param string $rootKey     Top-level config key holding the group definitions.
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException If the profile, the root map, or a referenced group is invalid.
     */
    private function resolveGroupedTerms(string $profileName, string $profileKey, string $rootKey): array
    {
        $profile = $this->requiredCleanupProfile($profileName);
        $groupNames = $this->stringListFromValue(
            $this->profileValue($profile, $profileKey),
            sprintf('cleanup_profiles.%s.%s', $profileName, $profileKey),
            false
        );

        if ($groupNames === []) {
            return [];
        }

        $groups = $this->requiredMap($rootKey);
        $out = [];

        foreach ($groupNames as $groupName) {
            if (!array_key_exists($groupName, $groups)) {
                throw $this->invalid(
                    sprintf('cleanup_profiles.%s.%s', $profileName, $profileKey),
                    sprintf('references unknown group "%s"', $groupName)
                );
            }

            $out = $this->mergeUnique(
                $out,
                $this->stringListFromValue($groups[$groupName], sprintf('%s.%s', $rootKey, $groupName), true)
            );
        }

        return $out;
    }

    /**
     * Looks up one profile by its trimmed name.
     *
     * @return array<string, mixed>
     *
     * @throws \InvalidArgumentException If the name is empty, or the profile is absent or not an array.
     */
    private function requiredCleanupProfile(string $profileName): array
    {
        $profileName = trim($profileName);
        if ($profileName === '') {
            throw $this->invalid('cleanup_profiles', 'profile name must not be empty');
        }

        $profiles = $this->requiredMap('cleanup_profiles');
        if (!array_key_exists($profileName, $profiles) || !is_array($profiles[$profileName])) {
            throw $this->invalid('cleanup_profiles', sprintf('missing profile "%s"', $profileName));
        }

        /** @var array<string, mixed> $profile */
        $profile = $profiles[$profileName];

        return $profile;
    }

    /**
     * Reads a key from a profile, defaulting to an empty list when the key is absent.
     *
     * @param array<string, mixed> $profile
     */
    private function profileValue(array $profile, string $key): mixed
    {
        if (!array_key_exists($key, $profile)) {
            return [];
        }

        return $profile[$key];
    }

    /**
     * Returns a top-level config entry that must be a non-empty array.
     *
     * @return array<string, mixed>
     *
     * @throws \InvalidArgumentException If the key is missing, not an array, or empty.
     */
    private function requiredMap(string $key): array
    {
        if (!array_key_exists($key, $this->config) || !is_array($this->config[$key])) {
            throw $this->invalid($key, 'must be a map');
        }

        /** @var array<string, mixed> $map */
        $map = $this->config[$key];
        if ($map === []) {
            throw $this->invalid($key, 'must not be empty');
        }

        return $map;
    }

    /**
     * Returns a mandatory top-level string list in normalized form.
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException If the key is missing or the list is invalid.
     */
    private function requiredTopLevelStringList(string $key): array
    {
        if (!array_key_exists($key, $this->config)) {
            throw $this->invalid($key, 'is missing');
        }

        return $this->stringListFromValue($this->config[$key], $key, true);
    }

    /**
     * Converts a raw config value into a normalized, de-duplicated string list.
     *
     * Non-scalar items and items that are empty after normalization are skipped
     * silently. Scalars are cast to string first, so `true` becomes "1" and
     * `false` becomes "" (and is dropped).
     *
     * @param mixed  $value    Raw value taken from the configuration.
     * @param string $path     Dotted config path, used in error messages only.
     * @param bool   $required When true the resulting list must not be empty.
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException If the value is not an array (an explicit null is
     *                                   accepted for optional lists), or a required list
     *                                   ends up empty.
     */
    private function stringListFromValue(mixed $value, string $path, bool $required): array
    {
        // An optional list that is present but explicitly null counts as "not set".
        // The previous guard compared a non-array value against [] and could never match.
        if ($value === null && !$required) {
            return [];
        }

        if (!is_array($value)) {
            throw $this->invalid($path, 'must be a list of non-empty strings');
        }

        $seen = [];
        $out = [];
        foreach ($value as $item) {
            if (!is_scalar($item)) {
                continue;
            }

            $token = $this->normalizeToken((string) $item);
            if ($token === '' || isset($seen[$token])) {
                continue;
            }

            // Keyed lookup keeps de-duplication linear; $out preserves first-seen order.
            $seen[$token] = true;
            $out[] = $token;
        }

        if ($required && $out === []) {
            throw $this->invalid($path, 'must contain at least one non-empty string');
        }

        return $out;
    }

    /**
     * Drops every term that appears in the top-level protected terms and reindexes the result.
     *
     * @param string[] $terms
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException If the `protected_terms` configuration is invalid.
     */
    private function removeProtectedTerms(array $terms): array
    {
        $protected = array_flip($this->getProtectedTerms());

        return array_values(array_filter(
            $terms,
            static fn (string $term): bool => !isset($protected[$term]),
        ));
    }

    /**
     * Appends the items of $right that are not yet in $left, keeping order.
     *
     * @param string[] $left
     * @param string[] $right
     *
     * @return string[]
     */
    private function mergeUnique(array $left, array $right): array
    {
        $seen = array_flip($left);
        foreach ($right as $item) {
            if (isset($seen[$item])) {
                continue;
            }

            $seen[$item] = true;
            $left[] = $item;
        }

        return $left;
    }

    /**
     * Trims and lowercases a token, using the multibyte variant when ext-mbstring is loaded.
     */
    private function normalizeToken(string $token): string
    {
        $token = trim($token);

        return function_exists('mb_strtolower')
            ? mb_strtolower($token, 'UTF-8')
            : strtolower($token);
    }

    /**
     * Builds (does not throw) the exception used for every configuration error.
     *
     * @param string $path    Dotted config path the problem was found at.
     * @param string $message Problem description, without trailing period.
     */
    private function invalid(string $path, string $message): \InvalidArgumentException
    {
        return new \InvalidArgumentException(
            sprintf('Invalid RetrieX language cleanup config at "%s": %s.', $path, $message)
        );
    }
}