$config */
    public function __construct(
        private readonly array $config,
        private readonly ?GenreConfig $genreConfig = null,
    ) {
    }

    /**
     * Returns the flat stop-word list stored under the top-level "words" key.
     *
     * @return string[] Trimmed, lower-cased, de-duplicated words.
     * @throws InvalidArgumentException When "words" is missing, is not an array, or yields no usable entry.
     */
    public function getLegacyStopWords(): array
    {
        return $this->requiredTopLevelStringList('words');
    }

    /**
     * Returns the terms that must never be removed by cleanup.
     *
     * Terms supplied by the optional genre config win when they yield at least one usable entry;
     * otherwise the top-level "protected_terms" list of this config is used.
     *
     * Both sources are passed through the same normalization (trim + lower-case + de-duplication).
     * isProtectedTerm() and the profile getters compare against this list with strict equality on
     * normalized tokens, so an un-normalized genre term would silently never match.
     *
     * @return string[]
     * @throws InvalidArgumentException When the fallback "protected_terms" list is missing or unusable.
     */
    public function getProtectedTerms(): array
    {
        $genrePath = 'retrieval_and_language.protected_terms.terms';
        $genreTerms = $this->genreConfig?->getValueStringList($genrePath);

        if (is_array($genreTerms)) {
            // Non-required mode: never throws for an array, just drops unusable items.
            $normalized = $this->stringListFromValue($genreTerms, $genrePath, false);
            if ($normalized !== []) {
                return $normalized;
            }
        }

        return $this->requiredTopLevelStringList('protected_terms');
    }

    /**
     * Tells whether the given term, after trimming and lower-casing, is a protected term.
     *
     * A blank term is never protected.
     */
    public function isProtectedTerm(string $term): bool
    {
        $term = $this->normalizeToken($term);
        if ($term === '') {
            return false;
        }

        return in_array($term, $this->getProtectedTerms(), true);
    }

    /**
     * Returns the character replacement map stored under normalization.ascii_transliteration.
     *
     * @return array<array-key, string> Source sequence => replacement (both trimmed, blank entries dropped).
     * @throws InvalidArgumentException When the key is missing, is not an array, or yields no usable entry.
     */
    public function getAsciiTransliterationMap(): array
    {
        $normalization = $this->requiredMap('normalization');
        if (!array_key_exists('ascii_transliteration', $normalization)) {
            throw $this->invalid('normalization.ascii_transliteration', 'is missing');
        }

        return $this->stringMapFromValue(
            $normalization['ascii_transliteration'],
            'normalization.ascii_transliteration',
            true
        );
    }

    /**
     * Applies the configured transliteration map to the value via strtr().
     */
    public function transliterateToAscii(string $value): string
    {
        $map = $this->getAsciiTransliterationMap();
        if ($map === []) {
            // Defensive only: the getter throws instead of returning an empty map.
            return $value;
        }

        return strtr($value, $map);
    }

    /**
     * Returns the list stored under normalization.word_separator_chars.
     *
     * @return string[]
     * @throws InvalidArgumentException When the list is missing or unusable.
     */
    public function getWordSeparatorCharacters(): array
    {
        return $this->getNormalizationStringList('word_separator_chars');
    }

    /**
     * Returns the list stored under normalization.dash_equivalents.
     *
     * @return string[]
     * @throws InvalidArgumentException When the list is missing or unusable.
     */
    public function getDashEquivalents(): array
    {
        return $this->getNormalizationStringList('dash_equivalents');
    }

    /**
     * Replaces every configured word-separator character in the value with a single space.
     */
    public function replaceWordSeparatorsWithSpace(string $value): string
    {
        return str_replace($this->getWordSeparatorCharacters(), ' ', $value);
    }

    /**
     * Replaces every configured dash equivalent in the value with "-".
     */
    public function normalizeDashEquivalents(string $value): string
    {
        return str_replace($this->getDashEquivalents(), '-', $value);
    }

    /**
     * Returns the names of all profiles declared under "cleanup_profiles".
     *
     * PHP turns integer-like array keys into ints, so each key is cast back to string to honour
     * the declared element type (the per-profile getters take a string name).
     *
     * @return string[]
     * @throws InvalidArgumentException When "cleanup_profiles" is missing, not an array, or empty.
     */
    public function getCleanupProfileNames(): array
    {
        return array_map(
            static fn (int|string $name): string => (string) $name,
            array_keys($this->requiredMap('cleanup_profiles'))
        );
    }

    /**
     * Returns the stop words of a profile, with protected terms removed.
     *
     * @return string[]
     * @throws InvalidArgumentException When the profile or a group it references is invalid.
     */
    public function getStopWordsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'stopword_groups', 'stopword_groups')
        );
    }

    /**
     * Returns the phrases of a profile, with protected terms removed.
     *
     * @return string[]
     * @throws InvalidArgumentException When the profile or a group it references is invalid.
     */
    public function getPhrasesForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'phrase_groups', 'phrase_groups')
        );
    }

    /**
     * Returns the meta terms of a profile, with protected terms removed.
     *
     * @return string[]
     * @throws InvalidArgumentException When the profile or a group it references is invalid.
     */
    public function getMetaTermsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'meta_term_groups', 'meta_term_groups')
        );
    }

    /**
     * Bundles all four term lists of a profile into one array.
     *
     * @return array{stopwords:string[], phrases:string[], meta_terms:string[], protected_terms:string[]}
     * @throws InvalidArgumentException When the profile or anything it references is invalid.
     */
    public function getCleanupProfile(string $profileName): array
    {
        return [
            'stopwords' => $this->getStopWordsForProfile($profileName),
            'phrases' => $this->getPhrasesForProfile($profileName),
            'meta_terms' => $this->getMetaTermsForProfile($profileName),
            'protected_terms' => $this->getProtectedTermsForProfile($profileName),
        ];
    }

    /**
     * Returns the protected terms that apply to a profile.
     *
     * A profile without "protected_term_groups" gets the global protected terms. The only group
     * name currently understood is "protected_terms" (the global list); any other name is rejected.
     *
     * @return string[]
     * @throws InvalidArgumentException When the profile is invalid or names an unknown group.
     */
    public function getProtectedTermsForProfile(string $profileName): array
    {
        $profile = $this->requiredCleanupProfile($profileName);
        $path = sprintf('cleanup_profiles.%s.protected_term_groups', $profileName);
        $groupNames = $this->stringListFromValue(
            $this->profileValue($profile, 'protected_term_groups'),
            $path,
            false
        );

        if ($groupNames === []) {
            return $this->getProtectedTerms();
        }

        $out = [];
        foreach ($groupNames as $groupName) {
            if ($groupName !== 'protected_terms') {
                throw $this->invalid(
                    $path,
                    sprintf('references unknown protected term group "%s"', $groupName)
                );
            }

            $out = $this->mergeUnique($out, $this->getProtectedTerms());
        }

        return $out;
    }

    /**
     * Collects the terms of every group a profile references for one term kind.
     *
     * Group names come from two places, merged in this order without duplicates: the groups listed
     * by the group sets the profile references, then the groups the profile lists directly under
     * $profileKey. Each group name is then looked up in the root map $rootKey.
     *
     * @param string $profileKey Key inside the profile holding the group names (e.g. "stopword_groups").
     * @param string $rootKey    Top-level config key holding the group definitions.
     * @return string[] Terms of all referenced groups, first occurrence wins.
     * @throws InvalidArgumentException When the profile, the root map, or a referenced group is invalid.
     */
    private function resolveGroupedTerms(string $profileName, string $profileKey, string $rootKey): array
    {
        $profile = $this->requiredCleanupProfile($profileName);
        $profilePath = sprintf('cleanup_profiles.%s.%s', $profileName, $profileKey);

        $groupNames = $this->mergeUnique(
            $this->resolveProfileGroupSetTerms($profileName, $profile, $profileKey),
            $this->stringListFromValue($this->profileValue($profile, $profileKey), $profilePath, false)
        );

        if ($groupNames === []) {
            return [];
        }

        $groups = $this->requiredMap($rootKey);
        $out = [];
        foreach ($groupNames as $groupName) {
            if (!array_key_exists($groupName, $groups)) {
                throw $this->invalid($profilePath, sprintf('references unknown group "%s"', $groupName));
            }

            $out = $this->mergeUnique(
                $out,
                $this->stringListFromValue($groups[$groupName], sprintf('%s.%s', $rootKey, $groupName), true)
            );
        }

        return $out;
    }

    /**
     * Expands the group sets a profile references into the entries those sets list.
     *
     * @param array $profile Raw profile map.
     * @return string[] Entries of all referenced sets, first occurrence wins; empty when no set is referenced.
     * @throws InvalidArgumentException When a referenced set is unknown or the root set map is invalid.
     */
    private function resolveProfileGroupSetTerms(string $profileName, array $profile, string $profileKey): array
    {
        $profileSetKey = $this->profileGroupSetKey($profileKey);
        $profileSetPath = sprintf('cleanup_profiles.%s.%s', $profileName, $profileSetKey);

        $setNames = $this->stringListFromValue(
            $this->profileValue($profile, $profileSetKey),
            $profileSetPath,
            false
        );
        if ($setNames === []) {
            return [];
        }

        $rootSetKey = $this->rootGroupSetKey($profileKey);
        $sets = $this->requiredMap($rootSetKey);
        $out = [];
        foreach ($setNames as $setName) {
            if (!array_key_exists($setName, $sets)) {
                throw $this->invalid($profileSetPath, sprintf('references unknown group set "%s"', $setName));
            }

            $out = $this->mergeUnique(
                $out,
                $this->stringListFromValue($sets[$setName], sprintf('%s.%s', $rootSetKey, $setName), true)
            );
        }

        return $out;
    }

    /**
     * Maps a profile group key to the key that holds its group sets.
     *
     * The three known keys use the singular form ("stopword_group_sets"); anything else simply
     * gets "_sets" appended.
     */
    private function profileGroupSetKey(string $profileKey): string
    {
        return match ($profileKey) {
            'stopword_groups' => 'stopword_group_sets',
            'phrase_groups' => 'phrase_group_sets',
            'meta_term_groups' => 'meta_term_group_sets',
            default => sprintf('%s_sets', $profileKey),
        };
    }

    /**
     * Returns the top-level key holding the group-set definitions; same name as the profile-level key.
     */
    private function rootGroupSetKey(string $profileKey): string
    {
        return $this->profileGroupSetKey($profileKey);
    }

    /**
     * Looks up one cleanup profile by its (trimmed) name.
     *
     * @return array The raw profile map stored under cleanup_profiles.
     * @throws InvalidArgumentException When the name is blank or no array-valued profile exists under it.
     */
private function requiredCleanupProfile(string $profileName): array
    {
        $profileName = trim($profileName);
        if ($profileName === '') {
            throw $this->invalid('cleanup_profiles', 'profile name must not be empty');
        }

        $profiles = $this->requiredMap('cleanup_profiles');
        if (!array_key_exists($profileName, $profiles) || !is_array($profiles[$profileName])) {
            throw $this->invalid('cleanup_profiles', sprintf('missing profile "%s"', $profileName));
        }

        return $profiles[$profileName];
    }

    /**
     * Reads one key of a profile, yielding an empty array when the key is absent.
     *
     * A key that is present with a null value is returned as null, not as an empty array.
     */
    private function profileValue(array $profile, string $key): mixed
    {
        return array_key_exists($key, $profile) ? $profile[$key] : [];
    }

    /**
     * Returns the non-empty array stored under a top-level config key.
     *
     * @return array
     * @throws InvalidArgumentException When the key is missing, not an array, or an empty array.
     */
    private function requiredMap(string $key): array
    {
        if (!array_key_exists($key, $this->config) || !is_array($this->config[$key])) {
            throw $this->invalid($key, 'must be a map');
        }

        $map = $this->config[$key];
        if ($map === []) {
            throw $this->invalid($key, 'must not be empty');
        }

        return $map;
    }

    /**
     * Returns a required string list stored under normalization.<key>.
     *
     * @return string[]
     * @throws InvalidArgumentException When the key is missing or does not hold a usable list.
     */
    private function getNormalizationStringList(string $key): array
    {
        $path = sprintf('normalization.%s', $key);
        $normalization = $this->requiredMap('normalization');
        if (!array_key_exists($key, $normalization)) {
            throw $this->invalid($path, 'is missing');
        }

        return $this->stringListFromValue($normalization[$key], $path, true);
    }

    /**
     * Returns a required string list stored under a top-level config key.
     *
     * @return string[]
     * @throws InvalidArgumentException When the key is missing or does not hold a usable list.
     */
    private function requiredTopLevelStringList(string $key): array
    {
        if (!array_key_exists($key, $this->config)) {
            throw $this->invalid($key, 'is missing');
        }

        return $this->stringListFromValue($this->config[$key], $key, true);
    }

    /**
     * Converts a raw config value into a list of normalized, unique, non-empty strings.
     *
     * Non-scalar items and items that are blank after trimming are skipped silently; the first
     * occurrence of a duplicate wins.
     *
     * @param mixed  $value    Raw value read from the config.
     * @param string $path     Config path used in error messages.
     * @param bool   $required When true, an empty result is an error. When false, a null value is
     *                         accepted and treated as an empty list.
     * @return string[]
     * @throws InvalidArgumentException When the value is not an array (and not an optional null),
     *                                  or when a required list ends up empty.
     */
    private function stringListFromValue(mixed $value, string $path, bool $required): array
    {
        // The previous guard compared a non-array value with [] and therefore could never match;
        // an optional list explicitly set to null is what it has to let through.
        if ($value === null && !$required) {
            return [];
        }

        if (!is_array($value)) {
            throw $this->invalid($path, 'must be a list of non-empty strings');
        }

        $out = [];
        $seen = [];
        foreach ($value as $item) {
            if (!is_scalar($item)) {
                continue;
            }

            $token = $this->normalizeToken((string) $item);
            if ($token === '' || isset($seen[$token])) {
                continue;
            }

            $seen[$token] = true;
            $out[] = $token;
        }

        if ($required && $out === []) {
            throw $this->invalid($path, 'must contain at least one non-empty string');
        }

        return $out;
    }

    /**
     * Converts a raw config value into a map of trimmed key => trimmed value strings.
     *
     * Entries with a non-scalar value, or whose key or value is blank after trimming, are skipped.
     * Keys and values are trimmed only, not lower-cased.
     *
     * @param mixed  $value    Raw value read from the config.
     * @param string $path     Config path used in error messages.
     * @param bool   $required When true, an empty result is an error.
     * @return array<array-key, string>
     * @throws InvalidArgumentException When the value is not an array, or a required map ends up empty.
     */
    private function stringMapFromValue(mixed $value, string $path, bool $required): array
    {
        if (!is_array($value)) {
            throw $this->invalid($path, 'must be a map of non-empty strings');
        }

        $out = [];
        foreach ($value as $key => $item) {
            // Array keys are always int|string, so only the value needs a scalar check.
            if (!is_scalar($item)) {
                continue;
            }

            $key = trim((string) $key);
            $item = trim((string) $item);
            if ($key === '' || $item === '') {
                continue;
            }

            $out[$key] = $item;
        }

        if ($required && $out === []) {
            throw $this->invalid($path, 'must contain at least one non-empty map entry');
        }

        return $out;
    }

    /**
     * Drops every term that is a protected term, keeping order and re-indexing the result.
     *
     * @param string[] $terms
     * @return string[]
     */
    private function removeProtectedTerms(array $terms): array
    {
        $protected = $this->getProtectedTerms();

        return array_values(array_filter(
            $terms,
            static fn (string $term): bool => !in_array($term, $protected, true)
        ));
    }

    /**
     * Appends to $left every item of $right that $left does not already contain (strict comparison).
     *
     * @param string[] $left
     * @param string[] $right
     * @return string[]
     */
    private function mergeUnique(array $left, array $right): array
    {
        foreach ($right as $item) {
            if (!in_array($item, $left, true)) {
                $left[] = $item;
            }
        }

        return $left;
    }

    /**
     * Trims a token and lower-cases it, multibyte-aware when the mbstring extension is available.
     */
    private function normalizeToken(string $token): string
    {
        $token = trim($token);

        return function_exists('mb_strtolower')
            ? mb_strtolower($token, 'UTF-8')
            : strtolower($token);
    }

    /**
     * Builds (does not throw) the exception used for every config validation failure.
     *
     * @param string $path    Config path the problem was found at.
     * @param string $message Problem description, without trailing period.
     */
    private function invalid(string $path, string $message): InvalidArgumentException
    {
        return new InvalidArgumentException(
            sprintf('Invalid RetrieX language cleanup config at "%s": %s.', $path, $message)
        );
    }
}