$config */
    public function __construct(private readonly array $config)
    {
    }

    /**
     * Returns the normalized top-level "words" list.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When "words" is missing, is not an array, or has no usable entry.
     */
    public function getLegacyStopWords(): array
    {
        return $this->requiredTopLevelStringList('words');
    }

    /**
     * Returns the normalized top-level "protected_terms" list.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When "protected_terms" is missing, is not an array, or has no usable entry.
     */
    public function getProtectedTerms(): array
    {
        return $this->requiredTopLevelStringList('protected_terms');
    }

    /**
     * Tells whether the given term, after trimming and lower-casing, is one of the protected terms.
     *
     * A term that is empty after normalization is never protected.
     */
    public function isProtectedTerm(string $term): bool
    {
        $term = $this->normalizeToken($term);
        if ($term === '') {
            return false;
        }

        return in_array($term, $this->getProtectedTerms(), true);
    }

    /**
     * Returns the "normalization.ascii_transliteration" map with trimmed keys and values.
     *
     * @return array<array-key, string> Source substring => replacement (PHP casts numeric-looking keys to int).
     *
     * @throws InvalidArgumentException When the map is missing, malformed, or has no usable entry.
     */
    public function getAsciiTransliterationMap(): array
    {
        $normalization = $this->requiredMap('normalization');
        if (!array_key_exists('ascii_transliteration', $normalization)) {
            throw $this->invalid('normalization.ascii_transliteration', 'is missing');
        }

        return $this->stringMapFromValue(
            $normalization['ascii_transliteration'],
            'normalization.ascii_transliteration',
            true
        );
    }

    /**
     * Applies the configured transliteration map to the given value via strtr().
     */
    public function transliterateToAscii(string $value): string
    {
        $map = $this->getAsciiTransliterationMap();
        if ($map === []) {
            return $value;
        }

        return strtr($value, $map);
    }

    /**
     * Returns the names (keys) of all entries under "cleanup_profiles".
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When "cleanup_profiles" is missing, not an array, or empty.
     */
    public function getCleanupProfileNames(): array
    {
        // PHP turns numeric-looking array keys into ints; cast back so the
        // result really is a list of strings as documented.
        return array_map(
            static fn (int|string $name): string => (string) $name,
            array_keys($this->requiredMap('cleanup_profiles'))
        );
    }

    /**
     * Returns the stop words of all "stopword_groups" referenced by the profile, minus protected terms.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile or a referenced group is missing or malformed.
     */
    public function getStopWordsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'stopword_groups', 'stopword_groups')
        );
    }

    /**
     * Returns the phrases of all "phrase_groups" referenced by the profile, minus protected terms.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile or a referenced group is missing or malformed.
     */
    public function getPhrasesForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'phrase_groups', 'phrase_groups')
        );
    }

    /**
     * Returns the meta terms of all "meta_term_groups" referenced by the profile, minus protected terms.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile or a referenced group is missing or malformed.
     */
    public function getMetaTermsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'meta_term_groups', 'meta_term_groups')
        );
    }

    /**
     * Bundles every term list of a profile into a single array.
     *
     * @return array{stopwords:string[], phrases:string[], meta_terms:string[], protected_terms:string[]}
     *
     * @throws InvalidArgumentException When the profile or anything it references is missing or malformed.
     */
    public function getCleanupProfile(string $profileName): array
    {
        return [
            'stopwords' => $this->getStopWordsForProfile($profileName),
            'phrases' => $this->getPhrasesForProfile($profileName),
            'meta_terms' => $this->getMetaTermsForProfile($profileName),
            'protected_terms' => $this->getProtectedTermsForProfile($profileName),
        ];
    }

    /**
     * Returns the protected terms that apply to a profile.
     *
     * A profile without "protected_term_groups" gets the global protected terms.
     * The only group name accepted is "protected_terms"; any other name is rejected.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile is missing or references an unknown group.
     */
    public function getProtectedTermsForProfile(string $profileName): array
    {
        $profile = $this->requiredCleanupProfile($profileName);
        $groupNames = $this->stringListFromValue(
            $this->profileValue($profile, 'protected_term_groups'),
            sprintf('cleanup_profiles.%s.protected_term_groups', $profileName),
            false
        );

        if ($groupNames === []) {
            return $this->getProtectedTerms();
        }

        $out = [];
        foreach ($groupNames as $groupName) {
            if ($groupName === 'protected_terms') {
                $out = $this->mergeUnique($out, $this->getProtectedTerms());
                continue;
            }

            throw $this->invalid(
                sprintf('cleanup_profiles.%s.protected_term_groups', $profileName),
                sprintf('references unknown protected term group "%s"', $groupName)
            );
        }

        return $out;
    }

    /**
     * Collects the de-duplicated terms of every group a profile references.
     *
     * @param string $profileName Name of the profile under "cleanup_profiles".
     * @param string $profileKey  Key inside the profile that lists group names.
     * @param string $rootKey     Top-level config key that holds the group definitions.
     *
     * @return string[] Empty when the profile references no groups.
     *
     * @throws InvalidArgumentException When the profile, the group map, or a referenced group is missing or malformed.
     */
    private function resolveGroupedTerms(string $profileName, string $profileKey, string $rootKey): array
    {
        $profile = $this->requiredCleanupProfile($profileName);
        $groupNames = $this->stringListFromValue(
            $this->profileValue($profile, $profileKey),
            sprintf('cleanup_profiles.%s.%s', $profileName, $profileKey),
            false
        );

        if ($groupNames === []) {
            return [];
        }

        $groups = $this->requiredMap($rootKey);
        $out = [];
        foreach ($groupNames as $groupName) {
            if (!array_key_exists($groupName, $groups)) {
                throw $this->invalid(
                    sprintf('cleanup_profiles.%s.%s', $profileName, $profileKey),
                    sprintf('references unknown group "%s"', $groupName)
                );
            }

            $out = $this->mergeUnique(
                $out,
                $this->stringListFromValue($groups[$groupName], sprintf('%s.%s', $rootKey, $groupName), true)
            );
        }

        return $out;
    }

    /**
     * Looks up a profile by its trimmed name.
     *
     * @return array<array-key, mixed>
     *
     * @throws InvalidArgumentException When the name is blank or no array-valued profile with that name exists.
     */
    private function requiredCleanupProfile(string $profileName): array
    {
        $profileName = trim($profileName);
        if ($profileName === '') {
            throw $this->invalid('cleanup_profiles', 'profile name must not be empty');
        }

        $profiles = $this->requiredMap('cleanup_profiles');
        if (!array_key_exists($profileName, $profiles) || !is_array($profiles[$profileName])) {
            throw $this->invalid('cleanup_profiles', sprintf('missing profile "%s"', $profileName));
        }

        /** @var array $profile */
        $profile = $profiles[$profileName];

        return $profile;
    }

    /**
     * Reads a key from a profile, yielding an empty array when the key is absent.
     *
     * @param array<array-key, mixed> $profile
     */
    private function profileValue(array $profile, string $key): mixed
    {
        if (!array_key_exists($key, $profile)) {
            return [];
        }

        return $profile[$key];
    }

    /**
     * Returns a non-empty array stored under a top-level config key.
     *
     * @return array<array-key, mixed>
     *
     * @throws InvalidArgumentException When the key is missing, not an array, or an empty array.
     */
    private function requiredMap(string $key): array
    {
        if (!array_key_exists($key, $this->config) || !is_array($this->config[$key])) {
            throw $this->invalid($key, 'must be a map');
        }

        /** @var array $map */
        $map = $this->config[$key];
        if ($map === []) {
            throw $this->invalid($key, 'must not be empty');
        }

        return $map;
    }

    /**
     * Returns a required, normalized string list stored under a top-level config key.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the key is missing or its value is not a usable list.
     */
    private function requiredTopLevelStringList(string $key): array
    {
        if (!array_key_exists($key, $this->config)) {
            throw $this->invalid($key, 'is missing');
        }

        return $this->stringListFromValue($this->config[$key], $key, true);
    }

    /**
     * Normalizes a raw config value into a list of unique, trimmed, lower-cased strings.
     *
     * Non-scalar items, items that are empty after normalization, and duplicates
     * are skipped silently.
     *
     * @param mixed  $value    Raw value taken from the config.
     * @param string $path     Config path used in error messages.
     * @param bool   $required Whether at least one usable entry must remain.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the value is not an array (null is tolerated for
     *                                  optional lists), or a required list ends up empty.
     */
    private function stringListFromValue(mixed $value, string $path, bool $required): array
    {
        if (!is_array($value)) {
            // An optional key that is present but null counts as "no entries".
            // Comparing against [] here could never match inside this branch.
            if (!$required && $value === null) {
                return [];
            }

            throw $this->invalid($path, 'must be a list of non-empty strings');
        }

        $out = [];
        foreach ($value as $item) {
            if (!is_scalar($item)) {
                continue;
            }

            $item = $this->normalizeToken((string) $item);
            if ($item === '' || in_array($item, $out, true)) {
                continue;
            }

            $out[] = $item;
        }

        if ($required && $out === []) {
            throw $this->invalid($path, 'must contain at least one non-empty string');
        }

        return $out;
    }

    /**
     * Normalizes a raw config value into a map of trimmed keys to trimmed string values.
     *
     * Entries with a non-scalar value, or whose key or value is empty after
     * trimming, are skipped silently. Unlike list entries, keys and values are
     * not lower-cased.
     *
     * @param mixed  $value    Raw value taken from the config.
     * @param string $path     Config path used in error messages.
     * @param bool   $required Whether at least one usable entry must remain.
     *
     * @return array<array-key, string>
     *
     * @throws InvalidArgumentException When the value is not an array, or a required map ends up empty.
     */
    private function stringMapFromValue(mixed $value, string $path, bool $required): array
    {
        if (!is_array($value)) {
            throw $this->invalid($path, 'must be a map of non-empty strings');
        }

        $out = [];
        foreach ($value as $key => $item) {
            if (!is_scalar($key) || !is_scalar($item)) {
                continue;
            }

            $key = trim((string) $key);
            $item = trim((string) $item);
            if ($key === '' || $item === '') {
                continue;
            }

            $out[$key] = $item;
        }

        if ($required && $out === []) {
            throw $this->invalid($path, 'must contain at least one non-empty map entry');
        }

        return $out;
    }

    /**
     * Drops every term that is also a global protected term and reindexes the result.
     *
     * @param string[] $terms
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the global protected terms are missing or malformed.
     */
    private function removeProtectedTerms(array $terms): array
    {
        // Both sides are lists of strings, so array_diff()'s string comparison
        // matches a strict in_array() check.
        return array_values(array_diff($terms, $this->getProtectedTerms()));
    }

    /**
     * Appends the items of $right that are not yet in $left, preserving order.
     *
     * @param string[] $left
     * @param string[] $right
     * @return string[]
     */
    private function mergeUnique(array $left, array $right): array
    {
        foreach ($right as $item) {
            if (!in_array($item, $left, true)) {
                $left[] = $item;
            }
        }

        return $left;
    }

    /**
     * Trims the token and lower-cases it, using mb_strtolower() when available.
     */
    private function normalizeToken(string $token): string
    {
        $token = trim($token);

        // Fall back to the byte-wise strtolower() when mbstring is not loaded.
        return function_exists('mb_strtolower') ? mb_strtolower($token, 'UTF-8') : strtolower($token);
    }

    /**
     * Builds (but does not throw) the exception used for every config validation failure.
     */
    private function invalid(string $path, string $message): InvalidArgumentException
    {
        return new InvalidArgumentException(
            sprintf('Invalid RetrieX language cleanup config at "%s": %s.', $path, $message)
        );
    }
}