430 lines
13 KiB
PHP
430 lines
13 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Config;
|
|
|
|
use InvalidArgumentException;
|
|
|
|
/**
 * YAML-backed resolver for language cleanup profiles.
 *
 * The resolver is intentionally additive in p22: existing runtime callers keep
 * using their legacy lists until later patches wire profiles into Commerce and
 * Agent logic.
 *
 * List values read from the config array are trimmed, lower-cased and
 * de-duplicated. Structural problems are reported as InvalidArgumentException
 * whose message names the offending config path.
 */
final class LanguageCleanupConfig
{
    /**
     * @param array<string, mixed> $config      Parsed configuration tree.
     * @param GenreConfig|null     $genreConfig Optional source that, when it yields a non-empty
     *                                          list, overrides the top-level protected terms.
     */
    public function __construct(
        private readonly array $config,
        private readonly ?GenreConfig $genreConfig = null,
    ) {
    }

    /**
     * Returns the top-level `words` list.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When `words` is missing, not a list, or has no usable entry.
     */
    public function getLegacyStopWords(): array
    {
        return $this->requiredTopLevelStringList('words');
    }

    /**
     * Returns the protected terms.
     *
     * The list from the optional GenreConfig wins when it is non-empty; otherwise
     * (no GenreConfig, or an empty/falsy result) the top-level `protected_terms`
     * list of this config is used.
     *
     * NOTE(review): terms coming from GenreConfig are returned as-is, whereas the
     * fallback list is trimmed and lower-cased. Presumably GenreConfig already
     * normalizes its values — confirm, because isProtectedTerm() and
     * removeProtectedTerms() compare strictly against normalized tokens.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the fallback list is needed but invalid.
     */
    public function getProtectedTerms(): array
    {
        // `?:` (not `??`) is deliberate: an empty genre list must also trigger the fallback.
        return $this->genreConfig?->getValueStringList('retrieval_and_language.protected_terms.terms')
            ?: $this->requiredTopLevelStringList('protected_terms');
    }

    /**
     * Tells whether the given term, after trimming and lower-casing, is a protected term.
     *
     * A term that is empty after normalization is never protected.
     */
    public function isProtectedTerm(string $term): bool
    {
        $term = $this->normalizeToken($term);
        if ($term === '') {
            return false;
        }

        return in_array($term, $this->getProtectedTerms(), true);
    }

    /**
     * Returns the `normalization.ascii_transliteration` map (source string => replacement).
     *
     * Keys and values are trimmed; entries that are non-scalar or empty after
     * trimming are dropped.
     *
     * @return array<string, string>
     *
     * @throws InvalidArgumentException When the map is missing, not a map, or has no usable entry.
     */
    public function getAsciiTransliterationMap(): array
    {
        $normalization = $this->requiredMap('normalization');
        if (!array_key_exists('ascii_transliteration', $normalization)) {
            throw $this->invalid('normalization.ascii_transliteration', 'is missing');
        }

        return $this->stringMapFromValue(
            $normalization['ascii_transliteration'],
            'normalization.ascii_transliteration',
            true,
        );
    }

    /**
     * Applies the configured transliteration map to the value via strtr().
     *
     * @throws InvalidArgumentException When the transliteration map is invalid.
     */
    public function transliterateToAscii(string $value): string
    {
        $map = $this->getAsciiTransliterationMap();
        if ($map === []) {
            return $value;
        }

        return strtr($value, $map);
    }

    /**
     * Returns the `normalization.word_separator_chars` list.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the list is missing or invalid.
     */
    public function getWordSeparatorCharacters(): array
    {
        return $this->getNormalizationStringList('word_separator_chars');
    }

    /**
     * Returns the `normalization.dash_equivalents` list.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the list is missing or invalid.
     */
    public function getDashEquivalents(): array
    {
        return $this->getNormalizationStringList('dash_equivalents');
    }

    /**
     * Replaces every configured word separator in the value with a single space.
     *
     * @throws InvalidArgumentException When the separator list is missing or invalid.
     */
    public function replaceWordSeparatorsWithSpace(string $value): string
    {
        return str_replace($this->getWordSeparatorCharacters(), ' ', $value);
    }

    /**
     * Replaces every configured dash equivalent in the value with `-`.
     *
     * @throws InvalidArgumentException When the dash list is missing or invalid.
     */
    public function normalizeDashEquivalents(string $value): string
    {
        return str_replace($this->getDashEquivalents(), '-', $value);
    }

    /**
     * Returns the keys of the `cleanup_profiles` map.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When `cleanup_profiles` is missing, not a map, or empty.
     */
    public function getCleanupProfileNames(): array
    {
        return array_keys($this->requiredMap('cleanup_profiles'));
    }

    /**
     * Returns the stop words of a profile with protected terms removed.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile or a referenced group/group set is invalid.
     */
    public function getStopWordsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'stopword_groups', 'stopword_groups'),
        );
    }

    /**
     * Returns the phrases of a profile with protected terms removed.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile or a referenced group/group set is invalid.
     */
    public function getPhrasesForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'phrase_groups', 'phrase_groups'),
        );
    }

    /**
     * Returns the meta terms of a profile with protected terms removed.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile or a referenced group/group set is invalid.
     */
    public function getMetaTermsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'meta_term_groups', 'meta_term_groups'),
        );
    }

    /**
     * Bundles all resolved lists of one profile.
     *
     * @return array{stopwords:string[], phrases:string[], meta_terms:string[], protected_terms:string[]}
     *
     * @throws InvalidArgumentException When the profile or anything it references is invalid.
     */
    public function getCleanupProfile(string $profileName): array
    {
        return [
            'stopwords' => $this->getStopWordsForProfile($profileName),
            'phrases' => $this->getPhrasesForProfile($profileName),
            'meta_terms' => $this->getMetaTermsForProfile($profileName),
            'protected_terms' => $this->getProtectedTermsForProfile($profileName),
        ];
    }

    /**
     * Returns the protected terms that apply to a profile.
     *
     * Without a `protected_term_groups` entry (or with an empty one) the global
     * protected terms are returned. The only group name accepted is
     * `protected_terms`; any other name is rejected.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile is invalid or references an unknown group.
     */
    public function getProtectedTermsForProfile(string $profileName): array
    {
        $profile = $this->requiredCleanupProfile($profileName);
        $groupNames = $this->stringListFromValue(
            $this->profileValue($profile, 'protected_term_groups'),
            sprintf('cleanup_profiles.%s.protected_term_groups', $profileName),
            false,
        );

        if ($groupNames === []) {
            return $this->getProtectedTerms();
        }

        $out = [];
        foreach ($groupNames as $groupName) {
            if ($groupName === 'protected_terms') {
                $out = $this->mergeUnique($out, $this->getProtectedTerms());
                continue;
            }

            throw $this->invalid(
                sprintf('cleanup_profiles.%s.protected_term_groups', $profileName),
                sprintf('references unknown protected term group "%s"', $groupName),
            );
        }

        return $out;
    }

    /**
     * Resolves the terms of a profile for one group kind.
     *
     * Group names are collected first from the profile's group sets and then from
     * the profile's direct group list; each group is looked up in the top-level
     * `$rootKey` map and its terms are merged in order, without duplicates.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile, a group set, or a group is missing or invalid.
     */
    private function resolveGroupedTerms(string $profileName, string $profileKey, string $rootKey): array
    {
        $profile = $this->requiredCleanupProfile($profileName);
        $groupNames = $this->resolveProfileGroupSetTerms($profileName, $profile, $profileKey);
        $groupNames = $this->mergeUnique(
            $groupNames,
            $this->stringListFromValue(
                $this->profileValue($profile, $profileKey),
                sprintf('cleanup_profiles.%s.%s', $profileName, $profileKey),
                false,
            ),
        );

        if ($groupNames === []) {
            // Nothing referenced: the root map is not required to exist in this case.
            return [];
        }

        $groups = $this->requiredMap($rootKey);
        $out = [];

        foreach ($groupNames as $groupName) {
            if (!array_key_exists($groupName, $groups)) {
                throw $this->invalid(
                    sprintf('cleanup_profiles.%s.%s', $profileName, $profileKey),
                    sprintf('references unknown group "%s"', $groupName),
                );
            }

            $out = $this->mergeUnique(
                $out,
                $this->stringListFromValue($groups[$groupName], sprintf('%s.%s', $rootKey, $groupName), true),
            );
        }

        return $out;
    }

    /**
     * Expands the group sets referenced by a profile into a list of group names.
     *
     * @param array<string, mixed> $profile
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When a referenced group set is unknown or invalid.
     */
    private function resolveProfileGroupSetTerms(string $profileName, array $profile, string $profileKey): array
    {
        $profileSetKey = $this->profileGroupSetKey($profileKey);
        $setNames = $this->stringListFromValue(
            $this->profileValue($profile, $profileSetKey),
            sprintf('cleanup_profiles.%s.%s', $profileName, $profileSetKey),
            false,
        );

        if ($setNames === []) {
            return [];
        }

        $rootSetKey = $this->rootGroupSetKey($profileKey);
        $sets = $this->requiredMap($rootSetKey);
        $out = [];

        foreach ($setNames as $setName) {
            if (!array_key_exists($setName, $sets)) {
                throw $this->invalid(
                    sprintf('cleanup_profiles.%s.%s', $profileName, $profileSetKey),
                    sprintf('references unknown group set "%s"', $setName),
                );
            }

            $out = $this->mergeUnique(
                $out,
                $this->stringListFromValue($sets[$setName], sprintf('%s.%s', $rootSetKey, $setName), true),
            );
        }

        return $out;
    }

    /**
     * Maps a profile group key to the key under which the profile lists its group sets.
     *
     * The three known keys use the singular form (`*_group_sets`); any other key
     * simply gets `_sets` appended.
     */
    private function profileGroupSetKey(string $profileKey): string
    {
        return match ($profileKey) {
            'stopword_groups' => 'stopword_group_sets',
            'phrase_groups' => 'phrase_group_sets',
            'meta_term_groups' => 'meta_term_group_sets',
            default => sprintf('%s_sets', $profileKey),
        };
    }

    /**
     * Returns the top-level key that holds the group set definitions.
     *
     * Currently identical to the key used inside a profile.
     */
    private function rootGroupSetKey(string $profileKey): string
    {
        return $this->profileGroupSetKey($profileKey);
    }

    /**
     * Returns the definition of the named cleanup profile.
     *
     * The name is trimmed before the lookup.
     *
     * @return array<string, mixed>
     *
     * @throws InvalidArgumentException When the name is empty or the profile is absent or not a map.
     */
    private function requiredCleanupProfile(string $profileName): array
    {
        $profileName = trim($profileName);
        if ($profileName === '') {
            throw $this->invalid('cleanup_profiles', 'profile name must not be empty');
        }

        $profiles = $this->requiredMap('cleanup_profiles');
        if (!array_key_exists($profileName, $profiles) || !is_array($profiles[$profileName])) {
            throw $this->invalid('cleanup_profiles', sprintf('missing profile "%s"', $profileName));
        }

        /** @var array<string, mixed> $profile */
        $profile = $profiles[$profileName];

        return $profile;
    }

    /**
     * Returns the raw value stored under the key, or an empty array when the key is absent.
     *
     * A key that is present with a null value yields null.
     *
     * @param array<string, mixed> $profile
     */
    private function profileValue(array $profile, string $key): mixed
    {
        if (!array_key_exists($key, $profile)) {
            return [];
        }

        return $profile[$key];
    }

    /**
     * Returns a non-empty top-level map from the config.
     *
     * @return array<string, mixed>
     *
     * @throws InvalidArgumentException When the key is missing, not an array, or empty.
     */
    private function requiredMap(string $key): array
    {
        if (!array_key_exists($key, $this->config) || !is_array($this->config[$key])) {
            throw $this->invalid($key, 'must be a map');
        }

        /** @var array<string, mixed> $map */
        $map = $this->config[$key];
        if ($map === []) {
            throw $this->invalid($key, 'must not be empty');
        }

        return $map;
    }

    /**
     * Returns a required string list stored under `normalization.<key>`.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the list is missing or invalid.
     */
    private function getNormalizationStringList(string $key): array
    {
        $normalization = $this->requiredMap('normalization');
        if (!array_key_exists($key, $normalization)) {
            throw $this->invalid(sprintf('normalization.%s', $key), 'is missing');
        }

        return $this->stringListFromValue($normalization[$key], sprintf('normalization.%s', $key), true);
    }

    /**
     * Returns a required string list stored under a top-level key.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the list is missing or invalid.
     */
    private function requiredTopLevelStringList(string $key): array
    {
        if (!array_key_exists($key, $this->config)) {
            throw $this->invalid($key, 'is missing');
        }

        return $this->stringListFromValue($this->config[$key], $key, true);
    }

    /**
     * Converts a raw config value into a normalized, de-duplicated string list.
     *
     * Scalar items are cast to string, trimmed and lower-cased; non-scalar items,
     * items that end up empty, and duplicates are skipped.
     *
     * @param mixed  $value    Raw config value.
     * @param string $path     Config path used in error messages.
     * @param bool   $required Whether at least one usable entry must remain.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the value is not a list, or is required and ends up empty.
     */
    private function stringListFromValue(mixed $value, string $path, bool $required): array
    {
        // An optional key that is present but null (e.g. an empty YAML entry) counts as "no entries".
        // The previous check compared a non-array value against [] and could therefore never match.
        if ($value === null && !$required) {
            return [];
        }

        if (!is_array($value)) {
            throw $this->invalid($path, 'must be a list of non-empty strings');
        }

        $out = [];
        foreach ($value as $item) {
            if (!is_scalar($item)) {
                continue;
            }

            $item = $this->normalizeToken((string) $item);
            if ($item === '' || in_array($item, $out, true)) {
                continue;
            }

            $out[] = $item;
        }

        if ($required && $out === []) {
            throw $this->invalid($path, 'must contain at least one non-empty string');
        }

        return $out;
    }

    /**
     * Converts a raw config value into a string-to-string map.
     *
     * Keys and values are cast to string and trimmed (case is preserved); entries
     * with a non-scalar value or an empty key/value are skipped.
     *
     * @param mixed  $value    Raw config value.
     * @param string $path     Config path used in error messages.
     * @param bool   $required Whether at least one usable entry must remain.
     *
     * @return array<string, string>
     *
     * @throws InvalidArgumentException When the value is not a map, or is required and ends up empty.
     */
    private function stringMapFromValue(mixed $value, string $path, bool $required): array
    {
        if (!is_array($value)) {
            throw $this->invalid($path, 'must be a map of non-empty strings');
        }

        $out = [];
        foreach ($value as $key => $item) {
            if (!is_scalar($key) || !is_scalar($item)) {
                continue;
            }

            $key = trim((string) $key);
            $item = trim((string) $item);
            if ($key === '' || $item === '') {
                continue;
            }

            $out[$key] = $item;
        }

        if ($required && $out === []) {
            throw $this->invalid($path, 'must contain at least one non-empty map entry');
        }

        return $out;
    }

    /**
     * Drops every term that is strictly equal to a protected term and re-indexes the result.
     *
     * @param string[] $terms
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the protected term list cannot be resolved.
     */
    private function removeProtectedTerms(array $terms): array
    {
        $protected = $this->getProtectedTerms();

        return array_values(array_filter(
            $terms,
            static fn (string $term): bool => !in_array($term, $protected, true),
        ));
    }

    /**
     * Appends to `$left` every item of `$right` that it does not already contain, keeping order.
     *
     * @param string[] $left
     * @param string[] $right
     *
     * @return string[]
     */
    private function mergeUnique(array $left, array $right): array
    {
        foreach ($right as $item) {
            if (!in_array($item, $left, true)) {
                $left[] = $item;
            }
        }

        return $left;
    }

    /**
     * Trims the token and lower-cases it.
     *
     * Uses mb_strtolower() with UTF-8 when the mbstring extension is available,
     * otherwise falls back to the byte-wise strtolower().
     */
    private function normalizeToken(string $token): string
    {
        $token = trim($token);

        return function_exists('mb_strtolower')
            ? mb_strtolower($token, 'UTF-8')
            : strtolower($token);
    }

    /**
     * Builds (does not throw) the exception used for every config validation failure.
     */
    private function invalid(string $path, string $message): InvalidArgumentException
    {
        return new InvalidArgumentException(
            sprintf('Invalid RetrieX language cleanup config at "%s": %s.', $path, $message),
        );
    }
}
|