This commit is contained in:
team 1
2026-05-03 20:50:26 +02:00
parent b5cc12e8d5
commit 3d0b6b1cf8

View File

@@ -0,0 +1,278 @@
<?php
declare(strict_types=1);
namespace App\Config;
use InvalidArgumentException;
/**
 * YAML-backed resolver for language cleanup profiles.
 *
 * Wraps an already-parsed configuration array and exposes the legacy stop
 * word list, the protected terms, and the per-profile stop word, phrase and
 * meta term lists. Every token read from the configuration is trimmed and
 * lower-cased before it is returned or compared.
 *
 * The resolver is intentionally additive in p22: existing runtime callers keep
 * using their legacy lists until later patches wire profiles into Commerce and
 * Agent logic.
 */
final class LanguageCleanupConfig
{
    /**
     * Nothing is validated at construction time; each accessor validates the
     * keys it reads and throws InvalidArgumentException on malformed data.
     *
     * @param array<string, mixed> $config Parsed configuration tree.
     */
    public function __construct(private readonly array $config)
    {
    }

    /**
     * Returns the normalised top-level "words" list.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When "words" is missing, not a list, or has no usable entry.
     */
    public function getLegacyStopWords(): array
    {
        return $this->requiredTopLevelStringList('words');
    }

    /**
     * Returns the normalised top-level "protected_terms" list.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When "protected_terms" is missing, not a list, or has no usable entry.
     */
    public function getProtectedTerms(): array
    {
        return $this->requiredTopLevelStringList('protected_terms');
    }

    /**
     * Tells whether the given term, after normalisation, is a protected term.
     *
     * An empty (or whitespace-only) term is never protected and is answered
     * without reading the configuration.
     *
     * @throws InvalidArgumentException When the "protected_terms" list is invalid.
     */
    public function isProtectedTerm(string $term): bool
    {
        $term = $this->normalizeToken($term);
        if ($term === '') {
            return false;
        }

        return in_array($term, $this->getProtectedTerms(), true);
    }

    /**
     * Returns the names of all configured cleanup profiles.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When "cleanup_profiles" is not a non-empty map.
     */
    public function getCleanupProfileNames(): array
    {
        // PHP stores numeric-looking keys ("42") as integers; cast them back so
        // the result honours the string[] contract and can be passed straight
        // to the string-typed profile accessors under strict_types.
        return array_map(
            static fn (int|string $name): string => (string) $name,
            array_keys($this->requiredMap('cleanup_profiles')),
        );
    }

    /**
     * Returns the profile's stop words with protected terms removed.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile or a referenced group is invalid.
     */
    public function getStopWordsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'stopword_groups', 'stopword_groups')
        );
    }

    /**
     * Returns the profile's phrases with protected terms removed.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile or a referenced group is invalid.
     */
    public function getPhrasesForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'phrase_groups', 'phrase_groups')
        );
    }

    /**
     * Returns the profile's meta terms with protected terms removed.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile or a referenced group is invalid.
     */
    public function getMetaTermsForProfile(string $profileName): array
    {
        return $this->removeProtectedTerms(
            $this->resolveGroupedTerms($profileName, 'meta_term_groups', 'meta_term_groups')
        );
    }

    /**
     * Resolves every list of a profile in one call.
     *
     * @return array{stopwords:string[], phrases:string[], meta_terms:string[], protected_terms:string[]}
     *
     * @throws InvalidArgumentException When the profile or a referenced group is invalid.
     */
    public function getCleanupProfile(string $profileName): array
    {
        return [
            'stopwords' => $this->getStopWordsForProfile($profileName),
            'phrases' => $this->getPhrasesForProfile($profileName),
            'meta_terms' => $this->getMetaTermsForProfile($profileName),
            'protected_terms' => $this->getProtectedTermsForProfile($profileName),
        ];
    }

    /**
     * Returns the protected terms that apply to a profile.
     *
     * A profile without "protected_term_groups" (or with an empty one) falls
     * back to the global protected terms. The only group name accepted is
     * "protected_terms"; any other name is rejected.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the profile is missing or references an unknown group.
     */
    public function getProtectedTermsForProfile(string $profileName): array
    {
        // Trim up front so error paths show the same name the lookup uses.
        $profileName = trim($profileName);
        $profile = $this->requiredCleanupProfile($profileName);
        $path = sprintf('cleanup_profiles.%s.protected_term_groups', $profileName);

        $groupNames = $this->stringListFromValue(
            $this->profileValue($profile, 'protected_term_groups'),
            $path,
            false
        );
        if ($groupNames === []) {
            return $this->getProtectedTerms();
        }

        $out = [];
        foreach ($groupNames as $groupName) {
            if ($groupName === 'protected_terms') {
                $out = $this->mergeUnique($out, $this->getProtectedTerms());
                continue;
            }

            throw $this->invalid(
                $path,
                sprintf('references unknown protected term group "%s"', $groupName)
            );
        }

        return $out;
    }

    /**
     * Collects the terms of every group a profile references under $profileKey.
     *
     * Group names listed in the profile are normalised (trimmed, lower-cased)
     * before they are looked up in the root map, so a root group key is only
     * found when it is already written in that form.
     *
     * @param string $profileName Profile to resolve.
     * @param string $profileKey  Key inside the profile holding the group names.
     * @param string $rootKey     Top-level key holding the group definitions.
     *
     * @return string[] De-duplicated terms in first-seen order; empty when the profile lists no groups.
     *
     * @throws InvalidArgumentException When the profile, the root map, or a referenced group is invalid.
     */
    private function resolveGroupedTerms(string $profileName, string $profileKey, string $rootKey): array
    {
        // Trim up front so error paths show the same name the lookup uses.
        $profileName = trim($profileName);
        $profile = $this->requiredCleanupProfile($profileName);
        $path = sprintf('cleanup_profiles.%s.%s', $profileName, $profileKey);

        $groupNames = $this->stringListFromValue(
            $this->profileValue($profile, $profileKey),
            $path,
            false
        );
        if ($groupNames === []) {
            return [];
        }

        $groups = $this->requiredMap($rootKey);
        $out = [];
        foreach ($groupNames as $groupName) {
            if (!array_key_exists($groupName, $groups)) {
                throw $this->invalid(
                    $path,
                    sprintf('references unknown group "%s"', $groupName)
                );
            }

            $out = $this->mergeUnique(
                $out,
                $this->stringListFromValue($groups[$groupName], sprintf('%s.%s', $rootKey, $groupName), true)
            );
        }

        return $out;
    }

    /**
     * Fetches one profile definition by (trimmed) name.
     *
     * @return array<string, mixed>
     *
     * @throws InvalidArgumentException When the name is empty, or the profile is absent or not a map.
     */
    private function requiredCleanupProfile(string $profileName): array
    {
        $profileName = trim($profileName);
        if ($profileName === '') {
            throw $this->invalid('cleanup_profiles', 'profile name must not be empty');
        }

        $profiles = $this->requiredMap('cleanup_profiles');
        if (!array_key_exists($profileName, $profiles) || !is_array($profiles[$profileName])) {
            throw $this->invalid('cleanup_profiles', sprintf('missing profile "%s"', $profileName));
        }

        /** @var array<string, mixed> $profile */
        $profile = $profiles[$profileName];

        return $profile;
    }

    /**
     * Reads a key from a profile, yielding an empty list when the key is absent.
     *
     * @param array<string, mixed> $profile
     */
    private function profileValue(array $profile, string $key): mixed
    {
        if (!array_key_exists($key, $profile)) {
            return [];
        }

        return $profile[$key];
    }

    /**
     * Fetches a top-level key that must hold a non-empty array.
     *
     * @return array<string, mixed>
     *
     * @throws InvalidArgumentException When the key is absent, not an array, or empty.
     */
    private function requiredMap(string $key): array
    {
        if (!array_key_exists($key, $this->config) || !is_array($this->config[$key])) {
            throw $this->invalid($key, 'must be a map');
        }

        /** @var array<string, mixed> $map */
        $map = $this->config[$key];
        if ($map === []) {
            throw $this->invalid($key, 'must not be empty');
        }

        return $map;
    }

    /**
     * Fetches a top-level key that must hold a list with at least one usable string.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the key is absent or its value is not a valid list.
     */
    private function requiredTopLevelStringList(string $key): array
    {
        if (!array_key_exists($key, $this->config)) {
            throw $this->invalid($key, 'is missing');
        }

        return $this->stringListFromValue($this->config[$key], $key, true);
    }

    /**
     * Converts a raw configuration value into a normalised, de-duplicated list.
     *
     * Non-scalar entries and entries that normalise to an empty string are
     * skipped; the remaining entries keep their first-seen order.
     *
     * @param mixed  $value    Raw value taken from the configuration.
     * @param string $path     Dotted configuration path used in error messages.
     * @param bool   $required When true, the list must yield at least one entry.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the value is not an array, or a required list ends up empty.
     */
    private function stringListFromValue(mixed $value, string $path, bool $required): array
    {
        // An optional key written without a value (e.g. a bare `phrase_groups:`
        // line) reaches us as null rather than as an empty array; treat it as
        // "no entries" instead of rejecting the whole configuration.
        if ($value === null && !$required) {
            return [];
        }

        if (!is_array($value)) {
            throw $this->invalid($path, 'must be a list of non-empty strings');
        }

        $out = [];
        $seen = [];
        foreach ($value as $item) {
            if (!is_scalar($item)) {
                continue;
            }

            $item = $this->normalizeToken((string) $item);
            if ($item === '' || isset($seen[$item])) {
                continue;
            }

            // Membership is tracked by key; $out keeps the string values and
            // their order untouched.
            $seen[$item] = true;
            $out[] = $item;
        }

        if ($required && $out === []) {
            throw $this->invalid($path, 'must contain at least one non-empty string');
        }

        return $out;
    }

    /**
     * Drops every term that appears in the global protected terms list.
     *
     * The protected list is read even when $terms is empty, so an invalid
     * "protected_terms" configuration is always reported.
     *
     * @param string[] $terms
     *
     * @return string[] Remaining terms, re-indexed from zero.
     *
     * @throws InvalidArgumentException When the "protected_terms" list is invalid.
     */
    private function removeProtectedTerms(array $terms): array
    {
        $protected = array_fill_keys($this->getProtectedTerms(), true);

        return array_values(array_filter(
            $terms,
            static fn (string $term): bool => !isset($protected[$term]),
        ));
    }

    /**
     * Appends to $left every item of $right that $left does not contain yet.
     *
     * @param string[] $left
     * @param string[] $right
     *
     * @return string[]
     */
    private function mergeUnique(array $left, array $right): array
    {
        $seen = array_fill_keys($left, true);
        foreach ($right as $item) {
            if (!isset($seen[$item])) {
                $seen[$item] = true;
                $left[] = $item;
            }
        }

        return $left;
    }

    /**
     * Trims a token and lower-cases it, using the multibyte-aware function
     * when the mbstring extension is available.
     */
    private function normalizeToken(string $token): string
    {
        $token = trim($token);

        return function_exists('mb_strtolower')
            ? mb_strtolower($token, 'UTF-8')
            : strtolower($token);
    }

    /**
     * Builds (does not throw) the exception used for every configuration error.
     *
     * @param string $path    Dotted configuration path the problem was found at.
     * @param string $message Description of the problem, without trailing period.
     */
    private function invalid(string $path, string $message): InvalidArgumentException
    {
        return new InvalidArgumentException(sprintf('Invalid RetrieX language cleanup config at "%s": %s.', $path, $message));
    }
}