last step

This commit is contained in:
team2
2026-04-29 22:22:57 +02:00
parent 8ece67b461
commit d618265044
12 changed files with 918 additions and 656 deletions

View File

@@ -4,243 +4,60 @@ declare(strict_types=1);
namespace App\Config;
use InvalidArgumentException;
final class CommerceQueryParserConfig
{
/**
 * Built-in brand names.
 *
 * Passed as the fallback list by the vocabulary-backed variant of getKnownBrands();
 * the config-only variant `stringList('known_brands')` does not reference it.
 *
 * @var string[]
 */
private const KNOWN_BRANDS = [
'heyl',
'horiba',
'neomeris',
];
/**
 * Built-in German phrases and words used as the fallback list for `phrases_to_remove`.
 *
 * Passed as the fallback by the vocabulary-backed variant of getPhrasesToRemove().
 * Entries come in umlaut and transliterated spellings ('verfügbarkeit' / 'verfuegbarkeit').
 * NOTE(review): 'geeigent' and 'welche gerät' look like deliberately listed misspellings
 * of 'geeignet' / 'welches gerät' — confirm before "fixing" them.
 *
 * @var string[]
 */
private const PHRASES_TO_REMOVE = [
'ich suche',
'suche',
'habt ihr',
'gibt es',
'gebe mir',
'gib mir',
'zeige mir',
'welches gerät',
'welche gerät',
'welches modell',
'welches ist besser',
'welches ist am besten',
'alternative',
'alternativen',
'unter anderem',
'u a',
'welche',
'welcher',
'welches',
'welchen',
'sind',
'ist',
'geeignet',
'geeigent',
'verfügbarkeit',
'verfuegbarkeit',
];
/**
 * Built-in single-word tokens used as the fallback list for `filter_search_tokens`.
 *
 * Passed as the fallback by the vocabulary-backed variant of getFilterSearchTokens().
 * The list contains the misspelled forms 'siene', 'sienen', ... that
 * SEARCH_TOKEN_CORRECTIONS maps to 'seine', 'seinen', ...
 *
 * @var string[]
 */
private const FILTER_SEARCH_TOKENS = [
'auch',
'noch',
'nochmal',
'zusätzlich',
'dazu',
'davon',
'stattdessen',
'bitte',
'gern',
'gerne',
'zeige',
'zeig',
'such',
'suche',
'finde',
'find',
'mir',
'mal',
'von',
'im',
'in',
'für',
'fuer',
'welche',
'welcher',
'welches',
'welchen',
'sind',
'ist',
'geeignet',
'geeigent',
'verfügbarkeit',
'verfuegbarkeit',
'prüfe',
'pruefe',
'den',
'die',
'das',
'der',
'dem',
'des',
'und',
'oder',
'sowie',
'seine',
'seinen',
'seiner',
'seinem',
'seines',
'siene',
'sienen',
'siener',
'sienem',
'sienes',
'gebe',
'gib',
'nenne',
'nenn',
'preis',
'preise',
'preisen',
'kostet',
'kosten',
'ua',
'also',
'gut',
'gute',
'guten',
'guter',
'gutes',
'passen',
'passend',
];
/**
 * Built-in map of misspelled search tokens (key) to their corrected form (value).
 *
 * Passed as the fallback by the vocabulary-backed variant of getSearchTokenCorrections().
 *
 * @var array<string, string>
 */
private const SEARCH_TOKEN_CORRECTIONS = [
'siene' => 'seine',
'sienen' => 'seinen',
'siener' => 'seiner',
'sienem' => 'seinem',
'sienes' => 'seines',
'indicatoren' => 'indikatoren',
];
/**
 * Built-in map of plural or English token variants (key) to one canonical singular token (value).
 *
 * Passed as the fallback by the vocabulary-backed variant of getSearchTokenCanonicalMap().
 * That variant reads the vocabulary path `commerce_query.search_token_canonical`,
 * while the config key is `search_token_canonical_map`.
 *
 * @var array<string, string>
 */
private const SEARCH_TOKEN_CANONICAL_MAP = [
'indikatoren' => 'indikator',
'indicators' => 'indikator',
'indicator' => 'indikator',
'reagenzien' => 'reagenz',
'reagents' => 'reagenz',
'reagent' => 'reagenz',
'produkte' => 'produkt',
];
/**
 * Built-in product/category tokens used as the fallback list for `semantic_shop_search_tokens`.
 *
 * Passed as the fallback by the vocabulary-backed variant of getSemanticShopSearchTokens(),
 * whose docblock describes the list as a semantic allowlist rather than a
 * spelling-error blocklist. Several entries come in umlaut and transliterated
 * spellings ('gerät' / 'geraet').
 *
 * @var string[]
 */
private const SEMANTIC_SHOP_SEARCH_TOKENS = [
'indikator',
'indicator',
'reagenz',
'reagent',
'zubehör',
'zubehor',
'ersatzteil',
'verbrauchsmaterial',
'chemie',
'indikatorchemie',
'reagenzchemie',
'kit',
'set',
'filter',
'pumpe',
'pumpenkopf',
'motorblock',
'lösung',
'loesung',
'solution',
'teststreifen',
'gerät',
'geraet',
'messgerät',
'messgeraet',
'analysegerät',
'analysegeraet',
'analysator',
'monitor',
'controller',
'system',
];
/**
 * Stores the raw configuration that the accessors of this class read from.
 *
 * @param array<string, mixed> $config Nested configuration array; value() walks it
 *                                     segment by segment using dot-separated paths
 *                                     such as `patterns.price_max`.
 * @param DomainVocabularyConfig|null $vocabulary Optional vocabulary source. In the visible
 *                                     code it is only dereferenced by the
 *                                     vocabularyView()/vocabularyStringMap() helpers.
 */
public function __construct(
private readonly array $config = [],
private readonly ?DomainVocabularyConfig $vocabulary = null,
) {
}
/**
 * Returns the brand names configured under `known_brands`.
 *
 * The list is read through stringList(), which trims each entry and drops
 * duplicates. No vocabulary lookup or built-in fallback list is applied.
 *
 * @return string[]
 *
 * @throws InvalidArgumentException if `known_brands` is missing, is not an array,
 *                                  or contains no non-empty string
 */
public function getKnownBrands(): array
{
    return $this->stringList('known_brands');
}
/**
 * Returns the phrases configured under `phrases_to_remove`.
 *
 * The list is read through stringList(), which trims each entry and drops
 * duplicates. No vocabulary lookup or built-in fallback list is applied.
 *
 * @return string[]
 *
 * @throws InvalidArgumentException if `phrases_to_remove` is missing, is not an array,
 *                                  or contains no non-empty string
 */
public function getPhrasesToRemove(): array
{
    return $this->stringList('phrases_to_remove');
}
/**
 * Returns the history-context fragment configured under `patterns.history_context`.
 *
 * getHistoryContextValuePattern() inserts this value into a template as the
 * `{fragment}` placeholder, so it is expected to be a regex fragment without
 * delimiters. The superseded hard-coded value was a `|`-separated alternation
 * starting with `chat|auch|noch|nochmal|...`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getHistoryContextPattern(): string
{
    return $this->string('patterns.history_context');
}
/**
 * Builds the history-context pattern from the template configured under
 * `patterns.history_context_value_template`.
 *
 * The template's `{fragment}` placeholder is replaced with the value of
 * getHistoryContextPattern(). The superseded hard-coded equivalent was
 * `/\b({fragment})\b/u`.
 *
 * @throws InvalidArgumentException if either config key is missing or empty, or if the
 *                                  template contains placeholders that were not resolved
 */
public function getHistoryContextValuePattern(): string
{
    return $this->renderTemplate(
        $this->string('patterns.history_context_value_template'),
        ['fragment' => $this->getHistoryContextPattern()],
        'patterns.history_context_value_template'
    );
}
/**
 * Returns the tokens configured under `filter_search_tokens`.
 *
 * The list is read through stringList(), which trims each entry and drops
 * duplicates. No vocabulary lookup or built-in fallback list is applied.
 *
 * @return string[]
 *
 * @throws InvalidArgumentException if `filter_search_tokens` is missing, is not an array,
 *                                  or contains no non-empty string
 */
public function getFilterSearchTokens(): array
{
    return $this->stringList('filter_search_tokens');
}
/**
 * Returns the token-correction map configured under `search_token_corrections`.
 *
 * The built-in defaults this replaces mapped misspelled tokens to their
 * corrected form (for example 'siene' => 'seine'). No vocabulary lookup or
 * built-in fallback map is applied.
 *
 * @return array<string, string>
 *
 * @throws InvalidArgumentException if `search_token_corrections` is missing, is not an array,
 *                                  or contains no non-empty mapping
 */
public function getSearchTokenCorrections(): array
{
    return $this->stringMap('search_token_corrections');
}
/**
 * Returns the canonical-token map configured under `search_token_canonical_map`.
 *
 * The built-in defaults this replaces mapped plural or English variants to one
 * canonical token (for example 'indicators' => 'indikator'). No vocabulary
 * lookup or built-in fallback map is applied.
 *
 * @return array<string, string>
 *
 * @throws InvalidArgumentException if `search_token_canonical_map` is missing, is not an array,
 *                                  or contains no non-empty mapping
 */
public function getSearchTokenCanonicalMap(): array
{
    return $this->stringMap('search_token_canonical_map');
}
/**
@@ -253,181 +70,190 @@ final class CommerceQueryParserConfig
return $this->getFilterSearchTokens();
}
/**
 * Returns the search strings configured under `normalization.search`.
 *
 * Entries are read with whitespace preserved (second stringList() argument is
 * true). The superseded hard-coded value was `['€']`.
 *
 * NOTE(review): presumably paired index-by-index with getNormalizationReplace() —
 * confirm against the caller.
 *
 * @return string[]
 *
 * @throws InvalidArgumentException if the key is missing, is not an array,
 *                                  or contains no non-blank string
 */
public function getNormalizationSearch(): array
{
    return $this->stringList('normalization.search', true);
}
/**
 * Returns the replacement strings configured under `normalization.replace`.
 *
 * Entries are read with whitespace preserved (second stringList() argument is
 * true), which keeps padded replacements intact. The superseded hard-coded
 * value was `[' euro ']`.
 *
 * NOTE(review): stringList() drops duplicate entries. If this list is paired
 * index-by-index with getNormalizationSearch(), two search terms that share one
 * replacement would make this list shorter than the search list — verify
 * against the caller.
 *
 * @return string[]
 *
 * @throws InvalidArgumentException if the key is missing, is not an array,
 *                                  or contains no non-blank string
 */
public function getNormalizationReplace(): array
{
    return $this->stringList('normalization.replace', true);
}
/**
 * Returns the pattern configured under `patterns.prompt_sanitize`.
 *
 * The superseded hard-coded value was `/[^\p{L}\p{N}\s.,\-]/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getPromptSanitizePattern(): string
{
    return $this->string('patterns.prompt_sanitize');
}
/**
 * Returns the pattern configured under `patterns.whitespace_collapse`.
 *
 * The superseded hard-coded value was `/\s+/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getWhitespaceCollapsePattern(): string
{
    return $this->string('patterns.whitespace_collapse');
}
/**
 * Returns the pattern configured under `patterns.whitespace_split`.
 *
 * The superseded hard-coded value was `/\s+/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getWhitespaceSplitPattern(): string
{
    return $this->string('patterns.whitespace_split');
}
/**
 * Builds a character string from the list configured under `text.trim_characters`.
 *
 * Each list item is either a symbolic name for a whitespace/control character
 * (`space`, `tab`, `lf`, `cr`, `nul`, `vertical_tab`) or literal text that is
 * appended unchanged. The symbolic names are required because stringList()
 * trims items and skips blank ones here, so a literal " " entry would be lost.
 *
 * The superseded hard-coded value was " \t\n\r\0\x0B-.,".
 *
 * @throws InvalidArgumentException if the key is missing, is not an array,
 *                                  or contains no non-empty string
 */
public function getSearchTextTrimCharacters(): string
{
    $characters = '';

    foreach ($this->stringList('text.trim_characters') as $item) {
        // Translate symbolic names; anything else is taken literally.
        $characters .= match ($item) {
            'space' => ' ',
            'tab' => "\t",
            'lf' => "\n",
            'cr' => "\r",
            'nul' => "\0",
            'vertical_tab' => "\x0B",
            default => $item,
        };
    }

    return $characters;
}
/**
 * Returns the integer configured under `limits.min_search_token_length`.
 *
 * The superseded hard-coded value was 1.
 *
 * @throws InvalidArgumentException if the key is missing or its value is not an integer
 */
public function getMinSearchTokenLength(): int
{
    return $this->int('limits.min_search_token_length');
}
/**
 * Returns the integer configured under `limits.min_direct_product_token_length`.
 *
 * The superseded hard-coded value was 1.
 *
 * @throws InvalidArgumentException if the key is missing or its value is not an integer
 */
public function getMinDirectProductTokenLength(): int
{
    return $this->int('limits.min_direct_product_token_length');
}
/**
 * Returns the pattern configured under `patterns.history_question`.
 *
 * The superseded hard-coded value was `/^Question:\s*(.+)$/m`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getHistoryQuestionPattern(): string
{
    return $this->string('patterns.history_question');
}
/**
 * Returns the pattern configured under `patterns.price_between`.
 *
 * The superseded hard-coded value was
 * `/\bzwischen\s+(\d+(?:[.,]\d+)?)\s+und\s+(\d+(?:[.,]\d+)?)\s+euro\b/u`,
 * which captures the two amounts.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getPriceBetweenPattern(): string
{
    return $this->string('patterns.price_between');
}
/**
 * Returns the pattern configured under `patterns.price_max`.
 *
 * The superseded hard-coded value was
 * `/\b(?:unter|bis|max(?:imal)?)\s+(\d+(?:[.,]\d+)?)\s+euro\b/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getPriceMaxPattern(): string
{
    return $this->string('patterns.price_max');
}
/**
 * Returns the pattern configured under `patterns.price_min`.
 *
 * The superseded hard-coded value was
 * `/\b(?:ab|mindestens|min)\s+(\d+(?:[.,]\d+)?)\s+euro\b/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getPriceMinPattern(): string
{
    return $this->string('patterns.price_min');
}
/**
 * Returns the three price-removal patterns, in a fixed order.
 *
 * 1. `patterns.price_removal_between`
 * 2. `patterns.price_removal_minmax`
 * 3. `patterns.price_removal_intent_template`, with its `{price_pattern}`
 *    placeholder replaced by `$intentConfig->getPricePattern()`
 *
 * The superseded hard-coded third entry was `/\b(?:{price_pattern})\b/u`.
 *
 * @param CommerceIntentConfig $intentConfig Supplies the price fragment for the third pattern.
 *
 * @return string[]
 *
 * @throws InvalidArgumentException if any of the three config keys is missing or empty,
 *                                  or if the template contains unresolved placeholders
 */
public function getPriceRemovalPatterns(CommerceIntentConfig $intentConfig): array
{
    return [
        $this->string('patterns.price_removal_between'),
        $this->string('patterns.price_removal_minmax'),
        $this->renderTemplate(
            $this->string('patterns.price_removal_intent_template'),
            ['price_pattern' => $intentConfig->getPricePattern()],
            'patterns.price_removal_intent_template'
        ),
    ];
}
/**
 * Returns the pattern configured under `patterns.direct_product_digit`.
 *
 * The superseded hard-coded value was `/\d/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getDirectProductDigitPattern(): string
{
    return $this->string('patterns.direct_product_digit');
}
/**
 * Returns the integer configured under `limits.direct_product_max_tokens`.
 *
 * The superseded hard-coded value was 4.
 *
 * @throws InvalidArgumentException if the key is missing or its value is not an integer
 */
public function getDirectProductMaxTokens(): int
{
    return $this->int('limits.direct_product_max_tokens');
}
/**
 * Returns the pattern configured under `patterns.model_like`.
 *
 * The superseded hard-coded value was
 * `/\b[a-zäöüß][a-zäöüß®\-]*(?:\s+[a-zäöüß][a-zäöüß®\-]*){0,2}\s+\d{2,5}[a-z0-9\-]*\b/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getModelLikePattern(): string
{
    return $this->string('patterns.model_like');
}
/**
 * Returns the pattern configured under `patterns.accessory_like`.
 *
 * The superseded hard-coded value was
 * `/\b(?:indikator|indicator|reagenz|reagent|kit|set)\s+\d{1,5}[a-z0-9\-]*\b/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getAccessoryLikePattern(): string
{
    return $this->string('patterns.accessory_like');
}
/**
 * Returns the pattern configured under `patterns.contains_digit`.
 *
 * The superseded hard-coded value was `/\d/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getContainsDigitPattern(): string
{
    return $this->string('patterns.contains_digit');
}
/**
 * Returns the pattern configured under `patterns.model_number_token`.
 *
 * The superseded hard-coded value was
 * `/^(?:\d{2,5}[a-z0-9\-]*|[a-z]{1,6}\d{1,5}[a-z0-9\-]*)$/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getModelNumberTokenPattern(): string
{
    return $this->string('patterns.model_number_token');
}
/**
 * Returns the pattern configured under `patterns.model_context_token`.
 *
 * The superseded hard-coded value was `/^[\p{L}][\p{L}0-9®\-]{2,}$/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getModelContextTokenPattern(): string
{
    return $this->string('patterns.model_context_token');
}
/**
 * Returns the pattern configured under `patterns.model_suffix_token`.
 *
 * The superseded hard-coded value was `/^[a-z]{1,4}\d{0,3}$/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getModelSuffixTokenPattern(): string
{
    return $this->string('patterns.model_suffix_token');
}
/**
 * Returns the integer configured under `limits.model_context_token_window`.
 *
 * The superseded hard-coded value was 4.
 *
 * @throws InvalidArgumentException if the key is missing or its value is not an integer
 */
public function getModelContextTokenWindow(): int
{
    return $this->int('limits.model_context_token_window');
}
/**
 * Returns the integer configured under `limits.min_meaningful_alpha_token_length`.
 *
 * The superseded hard-coded value was 2.
 *
 * @throws InvalidArgumentException if the key is missing or its value is not an integer
 */
public function getMinMeaningfulAlphaTokenLength(): int
{
    return $this->int('limits.min_meaningful_alpha_token_length');
}
/**
 * Returns the integer configured under `limits.max_shop_search_tokens`.
 *
 * The superseded hard-coded value was 6.
 *
 * @throws InvalidArgumentException if the key is missing or its value is not an integer
 */
public function getMaxShopSearchTokens(): int
{
    return $this->int('limits.max_shop_search_tokens');
}
/**
 * Returns the pattern configured under `patterns.instruction_or_presentation_token`.
 *
 * The superseded hard-coded value was an anchored (`/^(?:...)$/u`) alternation of
 * instruction and presentation words such as `zeig(?:e)?`, `bitte`, `liste`,
 * `tabelle` and `passend`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getInstructionOrPresentationTokenPattern(): string
{
    return $this->string('patterns.instruction_or_presentation_token');
}
/**
 * Returns the pattern configured under `patterns.measurement_value_token`.
 *
 * The superseded hard-coded value was `/^\d+[.,]\d+$/u`.
 *
 * @throws InvalidArgumentException if the key is missing or is not a non-empty scalar
 */
public function getMeasurementValueTokenPattern(): string
{
    return $this->string('patterns.measurement_value_token');
}

/**
 * Returns the product/category tokens configured under `semantic_shop_search_tokens`.
 *
 * These tokens are useful for Store API search even when they are not next to a
 * model number. This is intentionally a semantic allowlist, not a
 * spelling-error blocklist.
 *
 * @return string[]
 *
 * @throws InvalidArgumentException if the key is missing, is not an array,
 *                                  or contains no non-empty string
 */
public function getSemanticShopSearchTokens(): array
{
    return $this->stringList('semantic_shop_search_tokens');
}
/**
 * Builds a pattern for one token from the template configured under
 * `patterns.exact_token_removal_template`.
 *
 * The token is escaped with preg_quote() using `/` as the delimiter and then
 * substituted for the template's `{token}` placeholder.
 * NOTE(review): this assumes the configured template is delimited with `/` — confirm.
 *
 * @param string $token Raw token text; regex metacharacters are escaped here.
 *
 * @throws InvalidArgumentException if the template is missing or empty, or still
 *                                  contains placeholders after substitution
 */
public function buildExactTokenRemovalPattern(string $token): string
{
    $templatePath = 'patterns.exact_token_removal_template';
    $template = $this->string($templatePath);
    $escapedToken = preg_quote($token, '/');

    return $this->renderTemplate($template, ['token' => $escapedToken], $templatePath);
}
/**
 * Builds a pattern for one brand from the template configured under
 * `patterns.brand_part_of_model_template`.
 *
 * The brand is escaped with preg_quote() using `/` as the delimiter and then
 * substituted for the template's `{brand}` placeholder. The superseded
 * hard-coded equivalent was `/\b{brand}\s+\d{2,5}[a-z0-9\-]*\b/u`.
 * NOTE(review): this assumes the configured template is delimited with `/` — confirm.
 *
 * @param string $brand Raw brand text; regex metacharacters are escaped here.
 *
 * @throws InvalidArgumentException if the template is missing or empty, or still
 *                                  contains unresolved placeholders
 */
public function buildBrandPartOfModelPattern(string $brand): string
{
    return $this->renderTemplate(
        $this->string('patterns.brand_part_of_model_template'),
        ['brand' => preg_quote($brand, '/')],
        'patterns.brand_part_of_model_template'
    );
}
/** @return string[] */
private function stringList(string $path, array $default): array
private function stringList(string $path, bool $preserveWhitespace = false): array
{
$value = $this->value($path, $default);
$value = $this->value($path);
if (!is_array($value)) {
return $default;
throw $this->invalid($path, 'must be a list of non-empty strings');
}
$out = [];
@@ -436,23 +262,31 @@ final class CommerceQueryParserConfig
continue;
}
$item = trim((string) $item);
if ($item === '' || in_array($item, $out, true)) {
$item = (string) $item;
if (!$preserveWhitespace) {
$item = trim($item);
}
if (trim($item) === '' || in_array($item, $out, true)) {
continue;
}
$out[] = $item;
}
return $out !== [] ? $out : $default;
if ($out === []) {
throw $this->invalid($path, 'must contain at least one non-empty string');
}
return $out;
}
/** @return array<string, string> */
private function stringMap(string $path, array $default): array
private function stringMap(string $path): array
{
$value = $this->value($path, $default);
$value = $this->value($path);
if (!is_array($value)) {
return $default;
throw $this->invalid($path, 'must be a map of non-empty strings');
}
$out = [];
@@ -468,15 +302,44 @@ final class CommerceQueryParserConfig
}
}
return $out !== [] ? $out : $default;
if ($out === []) {
throw $this->invalid($path, 'must contain at least one non-empty mapping');
}
return $out;
}
private function value(string $path, mixed $default): mixed
/**
 * Reads a required, non-empty string from the configuration.
 *
 * Any scalar is accepted and cast to string; non-scalars and values that cast
 * to the empty string (including `false`) are rejected.
 *
 * @param string $path Dot-separated config path.
 *
 * @throws InvalidArgumentException if the path is missing or the value is not a
 *                                  non-empty scalar
 */
private function string(string $path): string
{
    $raw = $this->value($path);

    // Non-scalars collapse to '' so both rejection cases share one throw.
    $text = is_scalar($raw) ? (string) $raw : '';
    if ($text === '') {
        throw $this->invalid($path, 'must be a non-empty string');
    }

    return $text;
}
/**
 * Reads a required integer from the configuration.
 *
 * Only native integers pass; numeric strings and floats are rejected.
 *
 * @param string $path Dot-separated config path.
 *
 * @throws InvalidArgumentException if the path is missing or the value is not an int
 */
private function int(string $path): int
{
    $candidate = $this->value($path);

    return is_int($candidate)
        ? $candidate
        : throw $this->invalid($path, 'must be an integer');
}
private function value(string $path): mixed
{
$current = $this->config;
foreach (explode('.', $path) as $segment) {
if (!is_array($current) || !array_key_exists($segment, $current)) {
return $default;
throw $this->missing($path);
}
$current = $current[$segment];
@@ -485,13 +348,29 @@ final class CommerceQueryParserConfig
return $current;
}
/**
 * Substitutes `{name}` placeholders in a configured pattern template.
 *
 * Placeholders are validated against the template itself, not the rendered
 * result, so braces inside a substituted value are never mistaken for an
 * unresolved placeholder. Regex escapes that legitimately use braces
 * (`\p{L}`, `\P{L}`, `\x{FF}`, `\g{name}`, `\k{name}`) are not treated as
 * placeholders either.
 *
 * @param string                $template     Template text containing `{name}` placeholders.
 * @param array<string, string> $replacements Placeholder name (without braces) => replacement text.
 * @param string                $path         Config path of the template; used only in the error message.
 *
 * @throws InvalidArgumentException if the template contains a placeholder that has no replacement
 */
private function renderTemplate(string $template, array $replacements, string $path): string
{
    $pairs = [];
    foreach ($replacements as $placeholder => $value) {
        $pairs['{' . $placeholder . '}'] = $value;
    }

    // Mask the known placeholders with a space (which cannot form an identifier),
    // then look for leftover `{name}` tokens not preceded by a brace-taking escape.
    $masked = strtr($template, array_fill_keys(array_keys($pairs), ' '));
    if (preg_match('/(?<!\\\\[pPxgk])\{[A-Za-z_][A-Za-z0-9_]*\}/', $masked) === 1) {
        throw $this->invalid($path, 'contains unresolved placeholders');
    }

    // strtr() substitutes in a single pass, so replacement text is never re-scanned.
    return strtr($template, $pairs);
}
/**
 * Creates the exception reported when a config path does not exist.
 *
 * The exception is returned, not thrown; callers write `throw $this->missing(...)`.
 *
 * @param string $path Dot-separated config path that could not be resolved.
 */
private function missing(string $path): InvalidArgumentException
{
    return new InvalidArgumentException(sprintf('RetrieX commerce query config "%s" is missing.', $path));
}
/**
 * Creates the exception reported when a config value fails validation.
 *
 * The exception is returned, not thrown; callers write `throw $this->invalid(...)`.
 *
 * @param string $path   Dot-separated config path of the offending value.
 * @param string $reason Sentence fragment appended after the quoted path,
 *                       e.g. "must be an integer".
 */
private function invalid(string $path, string $reason): InvalidArgumentException
{
    $message = sprintf('RetrieX commerce query config "%s" %s.', $path, $reason);

    return new InvalidArgumentException($message);
}
}