last step
This commit is contained in:
@@ -4,243 +4,60 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Config;
|
||||
|
||||
use InvalidArgumentException;
|
||||
|
||||
final class CommerceQueryParserConfig
|
||||
{
|
||||
private const KNOWN_BRANDS = [
|
||||
'heyl',
|
||||
'horiba',
|
||||
'neomeris',
|
||||
];
|
||||
|
||||
private const PHRASES_TO_REMOVE = [
|
||||
'ich suche',
|
||||
'suche',
|
||||
'habt ihr',
|
||||
'gibt es',
|
||||
'gebe mir',
|
||||
'gib mir',
|
||||
'zeige mir',
|
||||
'welches gerät',
|
||||
'welche gerät',
|
||||
'welches modell',
|
||||
'welches ist besser',
|
||||
'welches ist am besten',
|
||||
'alternative',
|
||||
'alternativen',
|
||||
'unter anderem',
|
||||
'u a',
|
||||
'welche',
|
||||
'welcher',
|
||||
'welches',
|
||||
'welchen',
|
||||
'sind',
|
||||
'ist',
|
||||
'geeignet',
|
||||
'geeigent',
|
||||
'verfügbarkeit',
|
||||
'verfuegbarkeit',
|
||||
];
|
||||
|
||||
private const FILTER_SEARCH_TOKENS = [
|
||||
'auch',
|
||||
'noch',
|
||||
'nochmal',
|
||||
'zusätzlich',
|
||||
'dazu',
|
||||
'davon',
|
||||
'stattdessen',
|
||||
'bitte',
|
||||
'gern',
|
||||
'gerne',
|
||||
'zeige',
|
||||
'zeig',
|
||||
'such',
|
||||
'suche',
|
||||
'finde',
|
||||
'find',
|
||||
'mir',
|
||||
'mal',
|
||||
'von',
|
||||
'im',
|
||||
'in',
|
||||
'für',
|
||||
'fuer',
|
||||
'welche',
|
||||
'welcher',
|
||||
'welches',
|
||||
'welchen',
|
||||
'sind',
|
||||
'ist',
|
||||
'geeignet',
|
||||
'geeigent',
|
||||
'verfügbarkeit',
|
||||
'verfuegbarkeit',
|
||||
'prüfe',
|
||||
'pruefe',
|
||||
'den',
|
||||
'die',
|
||||
'das',
|
||||
'der',
|
||||
'dem',
|
||||
'des',
|
||||
'und',
|
||||
'oder',
|
||||
'sowie',
|
||||
'seine',
|
||||
'seinen',
|
||||
'seiner',
|
||||
'seinem',
|
||||
'seines',
|
||||
'siene',
|
||||
'sienen',
|
||||
'siener',
|
||||
'sienem',
|
||||
'sienes',
|
||||
'gebe',
|
||||
'gib',
|
||||
'nenne',
|
||||
'nenn',
|
||||
'preis',
|
||||
'preise',
|
||||
'preisen',
|
||||
'kostet',
|
||||
'kosten',
|
||||
'ua',
|
||||
'also',
|
||||
'gut',
|
||||
'gute',
|
||||
'guten',
|
||||
'guter',
|
||||
'gutes',
|
||||
'passen',
|
||||
'passend',
|
||||
];
|
||||
|
||||
private const SEARCH_TOKEN_CORRECTIONS = [
|
||||
'siene' => 'seine',
|
||||
'sienen' => 'seinen',
|
||||
'siener' => 'seiner',
|
||||
'sienem' => 'seinem',
|
||||
'sienes' => 'seines',
|
||||
'indicatoren' => 'indikatoren',
|
||||
];
|
||||
|
||||
private const SEARCH_TOKEN_CANONICAL_MAP = [
|
||||
'indikatoren' => 'indikator',
|
||||
'indicators' => 'indikator',
|
||||
'indicator' => 'indikator',
|
||||
'reagenzien' => 'reagenz',
|
||||
'reagents' => 'reagenz',
|
||||
'reagent' => 'reagenz',
|
||||
'produkte' => 'produkt',
|
||||
];
|
||||
|
||||
private const SEMANTIC_SHOP_SEARCH_TOKENS = [
|
||||
'indikator',
|
||||
'indicator',
|
||||
'reagenz',
|
||||
'reagent',
|
||||
'zubehör',
|
||||
'zubehor',
|
||||
'ersatzteil',
|
||||
'verbrauchsmaterial',
|
||||
'chemie',
|
||||
'indikatorchemie',
|
||||
'reagenzchemie',
|
||||
'kit',
|
||||
'set',
|
||||
'filter',
|
||||
'pumpe',
|
||||
'pumpenkopf',
|
||||
'motorblock',
|
||||
'lösung',
|
||||
'loesung',
|
||||
'solution',
|
||||
'teststreifen',
|
||||
'gerät',
|
||||
'geraet',
|
||||
'messgerät',
|
||||
'messgeraet',
|
||||
'analysegerät',
|
||||
'analysegeraet',
|
||||
'analysator',
|
||||
'monitor',
|
||||
'controller',
|
||||
'system',
|
||||
];
|
||||
|
||||
/**
|
||||
* @param array<string, mixed> $config
|
||||
*/
|
||||
public function __construct(
|
||||
private readonly array $config = [],
|
||||
private readonly ?DomainVocabularyConfig $vocabulary = null,
|
||||
) {
|
||||
}
|
||||
|
||||
/**
|
||||
* @return string[]
|
||||
*/
|
||||
/** @return string[] */
|
||||
public function getKnownBrands(): array
|
||||
{
|
||||
return $this->stringList(
|
||||
'known_brands',
|
||||
$this->vocabularyView('commerce_query.known_brands', self::KNOWN_BRANDS)
|
||||
);
|
||||
return $this->stringList('known_brands');
|
||||
}
|
||||
|
||||
/**
|
||||
* @return string[]
|
||||
*/
|
||||
/** @return string[] */
|
||||
public function getPhrasesToRemove(): array
|
||||
{
|
||||
return $this->stringList(
|
||||
'phrases_to_remove',
|
||||
$this->vocabularyView('commerce_query.phrases_to_remove', self::PHRASES_TO_REMOVE)
|
||||
);
|
||||
return $this->stringList('phrases_to_remove');
|
||||
}
|
||||
|
||||
public function getHistoryContextPattern(): string
|
||||
{
|
||||
return 'chat|auch|noch|nochmal|zusätzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt';
|
||||
return $this->string('patterns.history_context');
|
||||
}
|
||||
|
||||
public function getHistoryContextValuePattern(): string
|
||||
{
|
||||
return '/\b(' . $this->getHistoryContextPattern() . ')\b/u';
|
||||
return $this->renderTemplate(
|
||||
$this->string('patterns.history_context_value_template'),
|
||||
['fragment' => $this->getHistoryContextPattern()],
|
||||
'patterns.history_context_value_template'
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return string[]
|
||||
*/
|
||||
/** @return string[] */
|
||||
public function getFilterSearchTokens(): array
|
||||
{
|
||||
return $this->stringList(
|
||||
'filter_search_tokens',
|
||||
$this->vocabularyView('commerce_query.filter_search_tokens', self::FILTER_SEARCH_TOKENS)
|
||||
);
|
||||
return $this->stringList('filter_search_tokens');
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array<string, string>
|
||||
*/
|
||||
/** @return array<string, string> */
|
||||
public function getSearchTokenCorrections(): array
|
||||
{
|
||||
return $this->stringMap(
|
||||
'search_token_corrections',
|
||||
$this->vocabularyStringMap('commerce_query.search_token_corrections', self::SEARCH_TOKEN_CORRECTIONS)
|
||||
);
|
||||
return $this->stringMap('search_token_corrections');
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array<string, string>
|
||||
*/
|
||||
/** @return array<string, string> */
|
||||
public function getSearchTokenCanonicalMap(): array
|
||||
{
|
||||
return $this->stringMap(
|
||||
'search_token_canonical_map',
|
||||
$this->vocabularyStringMap('commerce_query.search_token_canonical', self::SEARCH_TOKEN_CANONICAL_MAP)
|
||||
);
|
||||
return $this->stringMap('search_token_canonical_map');
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -253,181 +70,190 @@ final class CommerceQueryParserConfig
|
||||
return $this->getFilterSearchTokens();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return string[]
|
||||
*/
|
||||
/** @return string[] */
|
||||
public function getNormalizationSearch(): array
|
||||
{
|
||||
return ['€'];
|
||||
return $this->stringList('normalization.search', true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return string[]
|
||||
*/
|
||||
/** @return string[] */
|
||||
public function getNormalizationReplace(): array
|
||||
{
|
||||
return [' euro '];
|
||||
return $this->stringList('normalization.replace', true);
|
||||
}
|
||||
|
||||
public function getPromptSanitizePattern(): string
|
||||
{
|
||||
return '/[^\p{L}\p{N}\s.,\-]/u';
|
||||
return $this->string('patterns.prompt_sanitize');
|
||||
}
|
||||
|
||||
public function getWhitespaceCollapsePattern(): string
|
||||
{
|
||||
return '/\s+/u';
|
||||
return $this->string('patterns.whitespace_collapse');
|
||||
}
|
||||
|
||||
public function getWhitespaceSplitPattern(): string
|
||||
{
|
||||
return '/\s+/u';
|
||||
return $this->string('patterns.whitespace_split');
|
||||
}
|
||||
|
||||
public function getSearchTextTrimCharacters(): string
|
||||
{
|
||||
return " \t\n\r\0\x0B-.,";
|
||||
$characters = '';
|
||||
foreach ($this->stringList('text.trim_characters') as $item) {
|
||||
$characters .= match ($item) {
|
||||
'space' => ' ',
|
||||
'tab' => "\t",
|
||||
'lf' => "\n",
|
||||
'cr' => "\r",
|
||||
'nul' => "\0",
|
||||
'vertical_tab' => "\x0B",
|
||||
default => $item,
|
||||
};
|
||||
}
|
||||
|
||||
return $characters;
|
||||
}
|
||||
|
||||
public function getMinSearchTokenLength(): int
|
||||
{
|
||||
return 1;
|
||||
return $this->int('limits.min_search_token_length');
|
||||
}
|
||||
|
||||
public function getMinDirectProductTokenLength(): int
|
||||
{
|
||||
return 1;
|
||||
return $this->int('limits.min_direct_product_token_length');
|
||||
}
|
||||
|
||||
public function getHistoryQuestionPattern(): string
|
||||
{
|
||||
return '/^Question:\s*(.+)$/m';
|
||||
return $this->string('patterns.history_question');
|
||||
}
|
||||
|
||||
public function getPriceBetweenPattern(): string
|
||||
{
|
||||
return '/\bzwischen\s+(\d+(?:[.,]\d+)?)\s+und\s+(\d+(?:[.,]\d+)?)\s+euro\b/u';
|
||||
return $this->string('patterns.price_between');
|
||||
}
|
||||
|
||||
public function getPriceMaxPattern(): string
|
||||
{
|
||||
return '/\b(?:unter|bis|max(?:imal)?)\s+(\d+(?:[.,]\d+)?)\s+euro\b/u';
|
||||
return $this->string('patterns.price_max');
|
||||
}
|
||||
|
||||
public function getPriceMinPattern(): string
|
||||
{
|
||||
return '/\b(?:ab|mindestens|min)\s+(\d+(?:[.,]\d+)?)\s+euro\b/u';
|
||||
return $this->string('patterns.price_min');
|
||||
}
|
||||
|
||||
/**
|
||||
* @return string[]
|
||||
*/
|
||||
/** @return string[] */
|
||||
public function getPriceRemovalPatterns(CommerceIntentConfig $intentConfig): array
|
||||
{
|
||||
return [
|
||||
'/\bzwischen\s+\d+(?:[.,]\d+)?\s+und\s+\d+(?:[.,]\d+)?\s*euro\b/u',
|
||||
'/\b(?:unter|bis|max(?:imal)?|ab|mindestens|min)\s+\d+(?:[.,]\d+)?\s*euro\b/u',
|
||||
'/\b(?:' . $intentConfig->getPricePattern() . ')\b/u',
|
||||
$this->string('patterns.price_removal_between'),
|
||||
$this->string('patterns.price_removal_minmax'),
|
||||
$this->renderTemplate(
|
||||
$this->string('patterns.price_removal_intent_template'),
|
||||
['price_pattern' => $intentConfig->getPricePattern()],
|
||||
'patterns.price_removal_intent_template'
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
public function getDirectProductDigitPattern(): string
|
||||
{
|
||||
return '/\d/u';
|
||||
return $this->string('patterns.direct_product_digit');
|
||||
}
|
||||
|
||||
public function getDirectProductMaxTokens(): int
|
||||
{
|
||||
return 4;
|
||||
return $this->int('limits.direct_product_max_tokens');
|
||||
}
|
||||
|
||||
public function getModelLikePattern(): string
|
||||
{
|
||||
return '/\b[a-zäöüß][a-zäöüß®\-]*(?:\s+[a-zäöüß][a-zäöüß®\-]*){0,2}\s+\d{2,5}[a-z0-9\-]*\b/u';
|
||||
return $this->string('patterns.model_like');
|
||||
}
|
||||
|
||||
public function getAccessoryLikePattern(): string
|
||||
{
|
||||
return '/\b(?:indikator|indicator|reagenz|reagent|kit|set)\s+\d{1,5}[a-z0-9\-]*\b/u';
|
||||
return $this->string('patterns.accessory_like');
|
||||
}
|
||||
|
||||
public function getContainsDigitPattern(): string
|
||||
{
|
||||
return '/\d/u';
|
||||
return $this->string('patterns.contains_digit');
|
||||
}
|
||||
|
||||
public function getModelNumberTokenPattern(): string
|
||||
{
|
||||
return '/^(?:\d{2,5}[a-z0-9\-]*|[a-z]{1,6}\d{1,5}[a-z0-9\-]*)$/u';
|
||||
return $this->string('patterns.model_number_token');
|
||||
}
|
||||
|
||||
public function getModelContextTokenPattern(): string
|
||||
{
|
||||
return '/^[\p{L}][\p{L}0-9®\-]{2,}$/u';
|
||||
return $this->string('patterns.model_context_token');
|
||||
}
|
||||
|
||||
public function getModelSuffixTokenPattern(): string
|
||||
{
|
||||
return '/^[a-z]{1,4}\d{0,3}$/u';
|
||||
return $this->string('patterns.model_suffix_token');
|
||||
}
|
||||
|
||||
public function getModelContextTokenWindow(): int
|
||||
{
|
||||
return 4;
|
||||
return $this->int('limits.model_context_token_window');
|
||||
}
|
||||
|
||||
public function getMinMeaningfulAlphaTokenLength(): int
|
||||
{
|
||||
return 2;
|
||||
return $this->int('limits.min_meaningful_alpha_token_length');
|
||||
}
|
||||
|
||||
public function getMaxShopSearchTokens(): int
|
||||
{
|
||||
return 6;
|
||||
return $this->int('limits.max_shop_search_tokens');
|
||||
}
|
||||
|
||||
public function getInstructionOrPresentationTokenPattern(): string
|
||||
{
|
||||
return '/^(?:zeig(?:e)?|such(?:e)?|find(?:e)?|gib|gebe|nenn(?:e)?|liefer(?:e)?|erstelle?|mach(?:e)?|brauch(?:e)?|will|möchte|moechte|hätte|haette|kannst|bitte|mal|alle|alles|komplett|vollständig|vollstaendig|gesamt|ganze|ganzen|liste|listung|auflistung|tabelle|tabellarisch|übersicht|uebersicht|anzeigen?|ausgeben?|darstellen?|antwort(?:e)?|erklär(?:e)?|erklaer(?:e)?|info|infos|informationen|dazu|hierzu|damit|davon|an|als|mit|ohne|inkl|inklusive|also|gut|gute|guten|guter|gutes|passend|passen)$/u';
|
||||
}
|
||||
public function getMeasurementValueTokenPattern(): string
|
||||
{
|
||||
return '/^\d+[.,]\d+$/u';
|
||||
return $this->string('patterns.instruction_or_presentation_token');
|
||||
}
|
||||
|
||||
/**
|
||||
* Product/category tokens that are useful for Store API search even when they are not next to a model number.
|
||||
* This is intentionally a semantic allowlist, not a spelling-error blocklist.
|
||||
*
|
||||
* @return string[]
|
||||
*/
|
||||
public function getMeasurementValueTokenPattern(): string
|
||||
{
|
||||
return $this->string('patterns.measurement_value_token');
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function getSemanticShopSearchTokens(): array
|
||||
{
|
||||
return $this->stringList(
|
||||
'semantic_shop_search_tokens',
|
||||
$this->vocabularyView('commerce_query.semantic_shop_search_tokens', self::SEMANTIC_SHOP_SEARCH_TOKENS)
|
||||
return $this->stringList('semantic_shop_search_tokens');
|
||||
}
|
||||
|
||||
/**
 * Renders the configured exact-token removal regex for a single token.
 *
 * The template comes from the "patterns.exact_token_removal_template" config
 * entry; its {token} placeholder receives $token escaped with preg_quote()
 * for the "/" delimiter.
 */
public function buildExactTokenRemovalPattern(string $token): string
{
    $templatePath = 'patterns.exact_token_removal_template';
    $template = $this->string($templatePath);
    $escapedToken = preg_quote($token, '/');

    return $this->renderTemplate($template, ['token' => $escapedToken], $templatePath);
}
|
||||
|
||||
|
||||
/** @return string[] */
|
||||
private function vocabularyView(string $path, array $fallback): array
|
||||
public function buildBrandPartOfModelPattern(string $brand): string
|
||||
{
|
||||
return $this->vocabulary?->view($path, $fallback) ?? $fallback;
|
||||
}
|
||||
|
||||
/** @return array<string, string> */
|
||||
private function vocabularyStringMap(string $path, array $fallback): array
|
||||
{
|
||||
return $this->vocabulary?->stringMap($path, $fallback) ?? $fallback;
|
||||
return $this->renderTemplate(
|
||||
$this->string('patterns.brand_part_of_model_template'),
|
||||
['brand' => preg_quote($brand, '/')],
|
||||
'patterns.brand_part_of_model_template'
|
||||
);
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
private function stringList(string $path, array $default): array
|
||||
private function stringList(string $path, bool $preserveWhitespace = false): array
|
||||
{
|
||||
$value = $this->value($path, $default);
|
||||
$value = $this->value($path);
|
||||
if (!is_array($value)) {
|
||||
return $default;
|
||||
throw $this->invalid($path, 'must be a list of non-empty strings');
|
||||
}
|
||||
|
||||
$out = [];
|
||||
@@ -436,23 +262,31 @@ final class CommerceQueryParserConfig
|
||||
continue;
|
||||
}
|
||||
|
||||
$item = trim((string) $item);
|
||||
if ($item === '' || in_array($item, $out, true)) {
|
||||
$item = (string) $item;
|
||||
if (!$preserveWhitespace) {
|
||||
$item = trim($item);
|
||||
}
|
||||
|
||||
if (trim($item) === '' || in_array($item, $out, true)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$out[] = $item;
|
||||
}
|
||||
|
||||
return $out !== [] ? $out : $default;
|
||||
if ($out === []) {
|
||||
throw $this->invalid($path, 'must contain at least one non-empty string');
|
||||
}
|
||||
|
||||
return $out;
|
||||
}
|
||||
|
||||
/** @return array<string, string> */
|
||||
private function stringMap(string $path, array $default): array
|
||||
private function stringMap(string $path): array
|
||||
{
|
||||
$value = $this->value($path, $default);
|
||||
$value = $this->value($path);
|
||||
if (!is_array($value)) {
|
||||
return $default;
|
||||
throw $this->invalid($path, 'must be a map of non-empty strings');
|
||||
}
|
||||
|
||||
$out = [];
|
||||
@@ -468,15 +302,44 @@ final class CommerceQueryParserConfig
|
||||
}
|
||||
}
|
||||
|
||||
return $out !== [] ? $out : $default;
|
||||
if ($out === []) {
|
||||
throw $this->invalid($path, 'must contain at least one non-empty mapping');
|
||||
}
|
||||
|
||||
return $out;
|
||||
}
|
||||
|
||||
private function value(string $path, mixed $default): mixed
|
||||
/**
 * Reads the config entry at $path and returns it as a non-empty string.
 *
 * Any scalar is accepted and cast to string; non-scalar values and values
 * whose string form is empty are rejected with the same error.
 *
 * @throws InvalidArgumentException when the entry is not a scalar or casts to ''
 */
private function string(string $path): string
{
    $raw = $this->value($path);

    // Non-scalars collapse to '' so both rejection cases share one exit.
    $text = is_scalar($raw) ? (string) $raw : '';

    if ($text !== '') {
        return $text;
    }

    throw $this->invalid($path, 'must be a non-empty string');
}
|
||||
|
||||
/**
 * Reads the config entry at $path and returns it as an integer.
 *
 * Only native integers pass; numeric strings and floats are rejected.
 *
 * @throws InvalidArgumentException when the entry is not an int
 */
private function int(string $path): int
{
    $raw = $this->value($path);

    return is_int($raw)
        ? $raw
        : throw $this->invalid($path, 'must be an integer');
}
|
||||
|
||||
private function value(string $path): mixed
|
||||
{
|
||||
$current = $this->config;
|
||||
foreach (explode('.', $path) as $segment) {
|
||||
if (!is_array($current) || !array_key_exists($segment, $current)) {
|
||||
return $default;
|
||||
throw $this->missing($path);
|
||||
}
|
||||
|
||||
$current = $current[$segment];
|
||||
@@ -485,13 +348,29 @@ final class CommerceQueryParserConfig
|
||||
return $current;
|
||||
}
|
||||
|
||||
public function buildExactTokenRemovalPattern(string $token): string
|
||||
/**
 * Fills the {name} placeholders of a config-supplied template.
 *
 * Placeholders are resolved in a single pass over the template, so text that
 * arrives through a replacement value is never re-read as a placeholder.
 * Brace groups that belong to a regex escape (\p{L}, \P{..}, \x{..}, \g{..},
 * \k{..}, \N{..}, \o{..}) are regex syntax and are left untouched.
 *
 * @param string                $template     Template text containing {name} placeholders.
 * @param array<string, string> $replacements Placeholder name => value inserted verbatim.
 * @param string                $path         Config path of the template, used in error messages.
 *
 * @throws InvalidArgumentException when the template names a placeholder that has no
 *                                  replacement, or when rendering fails
 */
private function renderTemplate(string $template, array $replacements, string $path): string
{
    $rendered = preg_replace_callback(
        // The lookbehind skips brace groups introduced by a brace-taking regex escape.
        '/(?<!\\\\[pPxgkNo])\{([A-Za-z_][A-Za-z0-9_]*)\}/',
        function (array $match) use ($replacements, $path): string {
            $placeholder = $match[1];

            if (!array_key_exists($placeholder, $replacements)) {
                throw $this->invalid($path, 'contains unresolved placeholders');
            }

            return (string) $replacements[$placeholder];
        },
        $template
    );

    // preg_replace_callback() reports PCRE failures by returning null.
    if ($rendered === null) {
        throw $this->invalid($path, 'could not be rendered');
    }

    return $rendered;
}
|
||||
|
||||
public function buildBrandPartOfModelPattern(string $brand): string
|
||||
/**
 * Builds the exception used when a config path cannot be resolved.
 *
 * @param string $path Config path that was looked up.
 */
private function missing(string $path): InvalidArgumentException
{
    return new InvalidArgumentException(sprintf('RetrieX commerce query config "%s" is missing.', $path));
}
|
||||
}
|
||||
|
||||
/**
 * Builds the exception used when a config entry exists but has an unusable value.
 *
 * @param string $path   Config path of the offending entry.
 * @param string $reason Sentence fragment appended after the quoted path, without the final period.
 */
private function invalid(string $path, string $reason): InvalidArgumentException
{
    $message = sprintf('RetrieX commerce query config "%s" %s.', $path, $reason);

    return new InvalidArgumentException($message);
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user