Files
MtoRagSystem/src/Config/LanguageCleanupConfig.php
2026-05-07 07:52:52 +02:00

430 lines
13 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Config;
use InvalidArgumentException;
/**
 * YAML-backed resolver for language cleanup profiles.
 *
 * The resolver is intentionally additive in p22: existing runtime callers keep
 * using their legacy lists until later patches wire profiles into Commerce and
 * Agent logic.
 *
 * Every term list handed out by this class has been trimmed, lowercased and
 * de-duplicated, so callers can compare tokens with strict equality as long as
 * they normalize their own input the same way.
 *
 * Structural problems in the configuration (missing keys, wrong types, unknown
 * group references) are reported as InvalidArgumentException.
 */
final class LanguageCleanupConfig
{
    /** Path of the protected-term list inside the optional genre configuration. */
    private const GENRE_PROTECTED_TERMS_PATH = 'retrieval_and_language.protected_terms.terms';

    /**
     * @param array<string, mixed> $config      Decoded language cleanup configuration.
     * @param GenreConfig|null     $genreConfig Optional source that, when it yields a non-empty
     *                                          list, overrides the top-level protected terms.
     */
    public function __construct(
        private readonly array $config,
        private readonly ?GenreConfig $genreConfig = null,
    ) {
    }

    /**
     * Returns the legacy flat stop-word list stored under the top-level "words" key.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When "words" is missing, not a list, or has no usable entry.
     */
    public function getLegacyStopWords(): array
    {
        return $this->requiredTopLevelStringList('words');
    }

    /**
     * Returns the terms that must never be stripped by any cleanup profile.
     *
     * The genre configuration wins when it provides at least one usable term;
     * otherwise the top-level "protected_terms" list is used. Both sources go
     * through the same normalization so that isProtectedTerm() and
     * removeProtectedTerms(), which compare lowercased tokens, match reliably.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the fallback list is needed but missing or empty.
     */
    public function getProtectedTerms(): array
    {
        $genreTerms = $this->genreConfig?->getValueStringList(self::GENRE_PROTECTED_TERMS_PATH);

        if (is_array($genreTerms)) {
            $normalized = $this->stringListFromValue($genreTerms, self::GENRE_PROTECTED_TERMS_PATH, false);
            if ($normalized !== []) {
                return $normalized;
            }
        }

        return $this->requiredTopLevelStringList('protected_terms');
    }

    /**
     * Tells whether the given term (after trimming and lowercasing) is protected.
     */
    public function isProtectedTerm(string $term): bool
    {
        $term = $this->normalizeToken($term);
        if ($term === '') {
            return false;
        }

        return in_array($term, $this->getProtectedTerms(), true);
    }

    /**
     * Returns the replacement map stored under "normalization.ascii_transliteration".
     *
     * @return array<string, string>
     *
     * @throws InvalidArgumentException When the map is missing, not a map, or has no usable entry.
     */
    public function getAsciiTransliterationMap(): array
    {
        $normalization = $this->requiredMap('normalization');
        if (!array_key_exists('ascii_transliteration', $normalization)) {
            throw $this->invalid('normalization.ascii_transliteration', 'is missing');
        }

        return $this->stringMapFromValue(
            $normalization['ascii_transliteration'],
            'normalization.ascii_transliteration',
            true
        );
    }

    /**
     * Applies the configured transliteration map to the value via strtr().
     */
    public function transliterateToAscii(string $value): string
    {
        $map = $this->getAsciiTransliterationMap();
        if ($map === []) {
            return $value;
        }

        return strtr($value, $map);
    }

    /**
     * Returns the entries of "normalization.word_separator_chars".
     *
     * NOTE(review): entries pass through token normalization (trim + lowercase),
     * so a separator consisting only of ASCII whitespace is dropped from the list.
     *
     * @return string[]
     */
    public function getWordSeparatorCharacters(): array
    {
        return $this->getNormalizationStringList('word_separator_chars');
    }

    /**
     * Returns the entries of "normalization.dash_equivalents".
     *
     * @return string[]
     */
    public function getDashEquivalents(): array
    {
        return $this->getNormalizationStringList('dash_equivalents');
    }

    /**
     * Replaces every configured word separator in the value with a single space.
     */
    public function replaceWordSeparatorsWithSpace(string $value): string
    {
        return str_replace($this->getWordSeparatorCharacters(), ' ', $value);
    }

    /**
     * Replaces every configured dash equivalent in the value with an ASCII hyphen.
     */
    public function normalizeDashEquivalents(string $value): string
    {
        return str_replace($this->getDashEquivalents(), '-', $value);
    }

    /**
     * Returns the keys of the "cleanup_profiles" map.
     *
     * @return string[]
     */
    public function getCleanupProfileNames(): array
    {
        return array_keys($this->requiredMap('cleanup_profiles'));
    }

    /**
     * Returns the profile's stop words with protected terms filtered out.
     *
     * @return string[]
     */
    public function getStopWordsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'stopword_groups', 'stopword_groups')
        );
    }

    /**
     * Returns the profile's phrases with protected terms filtered out.
     *
     * @return string[]
     */
    public function getPhrasesForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'phrase_groups', 'phrase_groups')
        );
    }

    /**
     * Returns the profile's meta terms with protected terms filtered out.
     *
     * @return string[]
     */
    public function getMetaTermsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'meta_term_groups', 'meta_term_groups')
        );
    }

    /**
     * Resolves all four term lists of a profile in one call.
     *
     * @return array{stopwords:string[], phrases:string[], meta_terms:string[], protected_terms:string[]}
     */
    public function getCleanupProfile(string $profileName): array
    {
        return [
            'stopwords' => $this->getStopWordsForProfile($profileName),
            'phrases' => $this->getPhrasesForProfile($profileName),
            'meta_terms' => $this->getMetaTermsForProfile($profileName),
            'protected_terms' => $this->getProtectedTermsForProfile($profileName),
        ];
    }

    /**
     * Returns the protected terms that apply to a profile.
     *
     * A profile without "protected_term_groups" (or with an empty/null value)
     * gets the global protected terms. The only group name currently
     * understood is "protected_terms"; any other name is rejected.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile is unknown or references an unknown group.
     */
    public function getProtectedTermsForProfile(string $profileName): array
    {
        $profile = $this->requiredCleanupProfile($profileName);
        $path = sprintf('cleanup_profiles.%s.protected_term_groups', $profileName);
        $groupNames = $this->stringListFromValue(
            $this->profileValue($profile, 'protected_term_groups'),
            $path,
            false
        );

        if ($groupNames === []) {
            return $this->getProtectedTerms();
        }

        $out = [];
        foreach ($groupNames as $groupName) {
            if ($groupName !== 'protected_terms') {
                throw $this->invalid(
                    $path,
                    sprintf('references unknown protected term group "%s"', $groupName)
                );
            }

            $out = $this->mergeUnique($out, $this->getProtectedTerms());
        }

        return $out;
    }

    /**
     * Collects the terms of every group a profile references.
     *
     * Group names come from two places, in this order: the profile's group
     * sets (expanded to group names) and the profile's direct group list.
     * Each group name is then looked up in the root-level map $rootKey.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile or a referenced group does not exist.
     */
    private function resolveGroupedTerms(string $profileName, string $profileKey, string $rootKey): array
    {
        $profile = $this->requiredCleanupProfile($profileName);
        $profilePath = sprintf('cleanup_profiles.%s.%s', $profileName, $profileKey);

        $groupNames = $this->mergeUnique(
            $this->resolveProfileGroupSetTerms($profileName, $profile, $profileKey),
            $this->stringListFromValue($this->profileValue($profile, $profileKey), $profilePath, false)
        );

        if ($groupNames === []) {
            return [];
        }

        // Only demand the root map once the profile actually references a group.
        $groups = $this->requiredMap($rootKey);
        $out = [];
        foreach ($groupNames as $groupName) {
            if (!array_key_exists($groupName, $groups)) {
                throw $this->invalid(
                    $profilePath,
                    sprintf('references unknown group "%s"', $groupName)
                );
            }

            $out = $this->mergeUnique(
                $out,
                $this->stringListFromValue($groups[$groupName], sprintf('%s.%s', $rootKey, $groupName), true)
            );
        }

        return $out;
    }

    /**
     * Expands the group sets referenced by a profile into a flat list of group names.
     *
     * @param array<string, mixed> $profile
     * @return string[]
     *
     * @throws InvalidArgumentException When a referenced set does not exist or is empty.
     */
    private function resolveProfileGroupSetTerms(string $profileName, array $profile, string $profileKey): array
    {
        $profileSetKey = $this->profileGroupSetKey($profileKey);
        $profileSetPath = sprintf('cleanup_profiles.%s.%s', $profileName, $profileSetKey);

        $setNames = $this->stringListFromValue(
            $this->profileValue($profile, $profileSetKey),
            $profileSetPath,
            false
        );

        if ($setNames === []) {
            return [];
        }

        $rootSetKey = $this->rootGroupSetKey($profileKey);
        $sets = $this->requiredMap($rootSetKey);
        $out = [];
        foreach ($setNames as $setName) {
            if (!array_key_exists($setName, $sets)) {
                throw $this->invalid(
                    $profileSetPath,
                    sprintf('references unknown group set "%s"', $setName)
                );
            }

            $out = $this->mergeUnique(
                $out,
                $this->stringListFromValue($sets[$setName], sprintf('%s.%s', $rootSetKey, $setName), true)
            );
        }

        return $out;
    }

    /**
     * Maps a profile group key to the key holding the corresponding group sets.
     */
    private function profileGroupSetKey(string $profileKey): string
    {
        return match ($profileKey) {
            'stopword_groups' => 'stopword_group_sets',
            'phrase_groups' => 'phrase_group_sets',
            'meta_term_groups' => 'meta_term_group_sets',
            default => sprintf('%s_sets', $profileKey),
        };
    }

    /**
     * Root-level group sets live under the same key name as the profile-level ones.
     */
    private function rootGroupSetKey(string $profileKey): string
    {
        return $this->profileGroupSetKey($profileKey);
    }

    /**
     * Fetches a profile definition by (trimmed) name.
     *
     * @return array<string, mixed>
     *
     * @throws InvalidArgumentException When the name is blank, or the profile is absent or not a map.
     */
    private function requiredCleanupProfile(string $profileName): array
    {
        $profileName = trim($profileName);
        if ($profileName === '') {
            throw $this->invalid('cleanup_profiles', 'profile name must not be empty');
        }

        $profiles = $this->requiredMap('cleanup_profiles');
        if (!array_key_exists($profileName, $profiles) || !is_array($profiles[$profileName])) {
            throw $this->invalid('cleanup_profiles', sprintf('missing profile "%s"', $profileName));
        }

        /** @var array<string, mixed> $profile */
        $profile = $profiles[$profileName];

        return $profile;
    }

    /**
     * Reads an optional profile entry; an absent key is reported as an empty list.
     *
     * @param array<string, mixed> $profile
     */
    private function profileValue(array $profile, string $key): mixed
    {
        if (!array_key_exists($key, $profile)) {
            return [];
        }

        return $profile[$key];
    }

    /**
     * Returns a non-empty top-level map from the configuration.
     *
     * @return array<string, mixed>
     *
     * @throws InvalidArgumentException When the key is missing, not an array, or empty.
     */
    private function requiredMap(string $key): array
    {
        if (!array_key_exists($key, $this->config) || !is_array($this->config[$key])) {
            throw $this->invalid($key, 'must be a map');
        }

        /** @var array<string, mixed> $map */
        $map = $this->config[$key];
        if ($map === []) {
            throw $this->invalid($key, 'must not be empty');
        }

        return $map;
    }

    /**
     * Returns a required string list stored under "normalization.<key>".
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the key is missing or the list has no usable entry.
     */
    private function getNormalizationStringList(string $key): array
    {
        $normalization = $this->requiredMap('normalization');
        $path = sprintf('normalization.%s', $key);
        if (!array_key_exists($key, $normalization)) {
            throw $this->invalid($path, 'is missing');
        }

        return $this->stringListFromValue($normalization[$key], $path, true);
    }

    /**
     * Returns a required string list stored directly under a top-level key.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the key is missing or the list has no usable entry.
     */
    private function requiredTopLevelStringList(string $key): array
    {
        if (!array_key_exists($key, $this->config)) {
            throw $this->invalid($key, 'is missing');
        }

        return $this->stringListFromValue($this->config[$key], $key, true);
    }

    /**
     * Converts a raw config value into a normalized, de-duplicated string list.
     *
     * Scalars are cast to string, trimmed and lowercased; non-scalar items and
     * items that end up empty are skipped. For optional lists a null value
     * (e.g. a YAML key left blank) is treated like an absent key.
     *
     * @param string $path     Config path used in error messages.
     * @param bool   $required Whether at least one usable entry must remain.
     * @return string[]
     *
     * @throws InvalidArgumentException When the value is not a list, or is required but empty.
     */
    private function stringListFromValue(mixed $value, string $path, bool $required): array
    {
        if ($value === null && !$required) {
            return [];
        }

        if (!is_array($value)) {
            throw $this->invalid($path, 'must be a list of non-empty strings');
        }

        $out = [];
        foreach ($value as $item) {
            if (!is_scalar($item)) {
                continue;
            }

            $item = $this->normalizeToken((string) $item);
            if ($item === '' || in_array($item, $out, true)) {
                continue;
            }

            $out[] = $item;
        }

        if ($required && $out === []) {
            throw $this->invalid($path, 'must contain at least one non-empty string');
        }

        return $out;
    }

    /**
     * Converts a raw config value into a map of trimmed string keys to trimmed string values.
     *
     * Entries whose key or value is non-scalar or empty after trimming are skipped.
     * Unlike list entries, map entries are not lowercased.
     *
     * @param string $path     Config path used in error messages.
     * @param bool   $required Whether at least one usable entry must remain.
     * @return array<string, string>
     *
     * @throws InvalidArgumentException When the value is not a map, or is required but empty.
     */
    private function stringMapFromValue(mixed $value, string $path, bool $required): array
    {
        if (!is_array($value)) {
            throw $this->invalid($path, 'must be a map of non-empty strings');
        }

        $out = [];
        foreach ($value as $key => $item) {
            if (!is_scalar($key) || !is_scalar($item)) {
                continue;
            }

            $key = trim((string) $key);
            $item = trim((string) $item);
            if ($key === '' || $item === '') {
                continue;
            }

            $out[$key] = $item;
        }

        if ($required && $out === []) {
            throw $this->invalid($path, 'must contain at least one non-empty map entry');
        }

        return $out;
    }

    /**
     * Drops every protected term from the list and reindexes the result.
     *
     * @param string[] $terms
     * @return string[]
     */
    private function removeProtectedTerms(array $terms): array
    {
        // Flip once so each membership test is a hash lookup instead of a scan.
        $protected = array_flip($this->getProtectedTerms());

        return array_values(array_filter(
            $terms,
            static fn (string $term): bool => !isset($protected[$term])
        ));
    }

    /**
     * Appends the items of $right that are not yet in $left, preserving order.
     *
     * @param string[] $left
     * @param string[] $right
     * @return string[]
     */
    private function mergeUnique(array $left, array $right): array
    {
        foreach ($right as $item) {
            if (!in_array($item, $left, true)) {
                $left[] = $item;
            }
        }

        return $left;
    }

    /**
     * Trims and lowercases a token, using mbstring when it is available.
     */
    private function normalizeToken(string $token): string
    {
        $token = trim($token);

        return function_exists('mb_strtolower')
            ? mb_strtolower($token, 'UTF-8')
            : strtolower($token);
    }

    /**
     * Builds (does not throw) the exception used for every configuration error.
     */
    private function invalid(string $path, string $message): InvalidArgumentException
    {
        return new InvalidArgumentException(
            sprintf('Invalid RetrieX language cleanup config at "%s": %s.', $path, $message)
        );
    }
}