<?php

declare(strict_types=1);

namespace App\Config;

/**
 * Static configuration for parsing free-text commerce queries: word lists,
 * regular-expression patterns and numeric limits.
 *
 * Every method returns a constant value or a pattern built from its arguments;
 * the class holds no state. The word lists and letter classes are written in
 * lower case (mostly German).
 * NOTE(review): this suggests callers lower-case the query before applying them —
 * confirm against the parser that consumes this class.
 */
final class CommerceQueryParserConfig
{
    /**
     * Regex fragment for a price amount: digits with an optional "." or "," decimal part.
     *
     * Shared by the price extraction and price removal patterns so that both
     * accept exactly the same number format.
     */
    private const PRICE_AMOUNT = '\d+(?:[.,]\d+)?';

    /**
     * Brand names, written in lower case.
     *
     * @return string[]
     */
    public function getKnownBrands(): array
    {
        return [
            'heyl',
            'horiba',
            'neomeris',
        ];
    }

    /**
     * Filler phrases and words to strip from a query.
     *
     * Multi-word phrases are listed before the shorter words they contain
     * (e.g. 'ich suche' before 'suche'). The list deliberately contains the
     * misspelling 'geeigent' next to 'geeignet'.
     * NOTE(review): whether the consumer depends on this order is not visible here.
     *
     * @return string[]
     */
    public function getPhrasesToRemove(): array
    {
        return [
            'ich suche',
            'suche',
            'habt ihr',
            'gibt es',
            'gebe mir',
            'gib mir',
            'zeige mir',
            'welches gerät',
            'welche gerät',
            'welches modell',
            'welches ist besser',
            'welches ist am besten',
            'alternative',
            'alternativen',
            'unter anderem',
            'u a',
            'welche',
            'welcher',
            'welches',
            'welchen',
            'sind',
            'ist',
            'geeignet',
            'geeigent',
            'verfügbarkeit',
            'verfuegbarkeit',
        ];
    }

    /**
     * Regex alternation (without delimiters or grouping) of history-context words.
     *
     * The fragment is embedded by getHistoryContextValuePattern(). It contains
     * 'zusaetzlich' alongside 'zusätzlich', matching how the other umlaut words
     * in this class also carry an ASCII spelling (e.g. 'größer' / 'groesser').
     * NOTE(review): judging by the method name, a match marks a query as referring
     * to earlier conversation — confirm against the caller.
     */
    public function getHistoryContextPattern(): string
    {
        return 'chat|auch|noch|nochmal|zusätzlich|zusaetzlich|dazu|davon|stattdessen|alternative|alternativen|größer|groesser|kleiner|gleich(?:e|en|er|es)?|derselbe|dieselbe|dasselbe|wie oben|wie zuvor|wie gehabt';
    }

    /**
     * Complete regex that captures one history-context word as a whole word.
     *
     * Wraps getHistoryContextPattern() in a capture group between word
     * boundaries and applies the "u" modifier.
     */
    public function getHistoryContextValuePattern(): string
    {
        return '/\b(' . $this->getHistoryContextPattern() . ')\b/u';
    }

    /**
     * Tokens to filter out of the search text: filler words, articles,
     * conjunctions, request verbs, price words and a few common misspellings.
     *
     * @return string[]
     */
    public function getFilterSearchTokens(): array
    {
        return [
            'auch',
            'noch',
            'nochmal',
            'zusätzlich',
            'zusaetzlich',
            'dazu',
            'davon',
            'stattdessen',
            'bitte',
            'gern',
            'gerne',
            'zeige',
            'zeig',
            'such',
            'suche',
            'finde',
            'find',
            'mir',
            'mal',
            'von',
            'im',
            'in',
            'für',
            'fuer',
            'welche',
            'welcher',
            'welches',
            'welchen',
            'sind',
            'ist',
            'geeignet',
            'geeigent',
            'verfügbarkeit',
            'verfuegbarkeit',
            'prüfe',
            'pruefe',
            'den',
            'die',
            'das',
            'der',
            'dem',
            'des',
            'und',
            'oder',
            'sowie',
            'seine',
            'seinen',
            'seiner',
            'seinem',
            'seines',
            'siene',
            'sienen',
            'siener',
            'sienem',
            'sienes',
            'gebe',
            'gib',
            'nenne',
            'nenn',
            'preis',
            'preise',
            'preisen',
            'kostet',
            'kosten',
            'ua',
        ];
    }

    /**
     * Spelling corrections, keyed by the misspelled token.
     *
     * @return array<string, string> misspelled token => corrected token
     */
    public function getSearchTokenCorrections(): array
    {
        return [
            'siene' => 'seine',
            'sienen' => 'seinen',
            'siener' => 'seiner',
            'sienem' => 'seinem',
            'sienes' => 'seines',
            'indicatoren' => 'indikatoren',
        ];
    }

    /**
     * Maps plural and English token variants to one canonical singular form.
     *
     * @return array<string, string> variant token => canonical token
     */
    public function getSearchTokenCanonicalMap(): array
    {
        return [
            'indikatoren' => 'indikator',
            'indicators' => 'indikator',
            'indicator' => 'indikator',
            'reagenzien' => 'reagenz',
            'reagents' => 'reagenz',
            'reagent' => 'reagenz',
            'produkte' => 'produkt',
        ];
    }

    /**
     * Backward-compatible alias for older callers.
     *
     * Despite the name it returns the token list, not a regex pattern.
     *
     * @see CommerceQueryParserConfig::getFilterSearchTokens()
     *
     * @return string[]
     */
    public function getFilterSearchTokensPattern(): array
    {
        return $this->getFilterSearchTokens();
    }

    /**
     * Strings to look for during normalization.
     *
     * NOTE(review): presumably paired by index with getNormalizationReplace() — confirm in the caller.
     *
     * @return string[]
     */
    public function getNormalizationSearch(): array
    {
        return ['€'];
    }

    /**
     * Replacement strings for normalization.
     *
     * The replacement is padded with spaces on both sides.
     * NOTE(review): presumably paired by index with getNormalizationSearch() — confirm in the caller.
     *
     * @return string[]
     */
    public function getNormalizationReplace(): array
    {
        return [' euro '];
    }

    /**
     * Regex matching any single character that is not a Unicode letter, a Unicode
     * number, whitespace, ".", "," or "-".
     */
    public function getPromptSanitizePattern(): string
    {
        return '/[^\p{L}\p{N}\s.,\-]/u';
    }

    /**
     * Regex matching a run of one or more whitespace characters.
     *
     * Returns the same pattern as getWhitespaceSplitPattern().
     */
    public function getWhitespaceCollapsePattern(): string
    {
        return '/\s+/u';
    }

    /**
     * Regex matching a run of one or more whitespace characters.
     *
     * Returns the same pattern as getWhitespaceCollapsePattern().
     */
    public function getWhitespaceSplitPattern(): string
    {
        return '/\s+/u';
    }

    /**
     * Character list for trimming search text: space, tab, LF, CR, NUL and
     * vertical tab, plus "-", "." and ",".
     */
    public function getSearchTextTrimCharacters(): string
    {
        return " \t\n\r\0\x0B-.,";
    }

    /**
     * Minimum length of a search token.
     */
    public function getMinSearchTokenLength(): int
    {
        return 1;
    }

    /**
     * Minimum length of a direct-product token.
     */
    public function getMinDirectProductTokenLength(): int
    {
        return 1;
    }

    /**
     * Multi-line regex capturing the text that follows "Question:" at the start of a line.
     *
     * Capture group 1 holds the rest of that line.
     */
    public function getHistoryQuestionPattern(): string
    {
        return '/^Question:\s*(.+)$/m';
    }

    /**
     * Regex for a price range of the form "zwischen <amount> und <amount> euro".
     *
     * Capture group 1 is the lower amount and group 2 the upper amount, each as
     * typed (the decimal separator may be "." or ","). Whitespace before "euro"
     * is optional, so every range that getPriceRemovalPatterns() removes can
     * also be extracted here.
     */
    public function getPriceBetweenPattern(): string
    {
        return '/\bzwischen\s+(' . self::PRICE_AMOUNT . ')\s+und\s+(' . self::PRICE_AMOUNT . ')\s*euro\b/u';
    }

    /**
     * Regex for an upper price limit: "unter", "bis", "max" or "maximal",
     * followed by an amount and "euro".
     *
     * Capture group 1 is the amount as typed. Whitespace before "euro" is
     * optional, consistent with getPriceRemovalPatterns().
     */
    public function getPriceMaxPattern(): string
    {
        return '/\b(?:unter|bis|max(?:imal)?)\s+(' . self::PRICE_AMOUNT . ')\s*euro\b/u';
    }

    /**
     * Regex for a lower price limit: "ab", "mindestens" or "min", followed by
     * an amount and "euro".
     *
     * Capture group 1 is the amount as typed. Whitespace before "euro" is
     * optional, consistent with getPriceRemovalPatterns().
     */
    public function getPriceMinPattern(): string
    {
        return '/\b(?:ab|mindestens|min)\s+(' . self::PRICE_AMOUNT . ')\s*euro\b/u';
    }

    /**
     * Regexes matching price expressions: the range form, the min/max limit
     * form, and the whole-word price pattern supplied by the intent config.
     *
     * @param CommerceIntentConfig $intentConfig provides the alternation embedded in the third pattern
     *
     * @return string[]
     */
    public function getPriceRemovalPatterns(CommerceIntentConfig $intentConfig): array
    {
        return [
            '/\bzwischen\s+' . self::PRICE_AMOUNT . '\s+und\s+' . self::PRICE_AMOUNT . '\s*euro\b/u',
            '/\b(?:unter|bis|max(?:imal)?|ab|mindestens|min)\s+' . self::PRICE_AMOUNT . '\s*euro\b/u',
            '/\b(?:' . $intentConfig->getPricePattern() . ')\b/u',
        ];
    }

    /**
     * Regex matching a single digit.
     *
     * Returns the same pattern as getContainsDigitPattern().
     */
    public function getDirectProductDigitPattern(): string
    {
        return '/\d/u';
    }

    /**
     * Maximum number of tokens for a direct-product query.
     */
    public function getDirectProductMaxTokens(): int
    {
        return 4;
    }

    /**
     * Regex for a model-like phrase: one to three lower-case words, then
     * whitespace and a number of 2-5 digits with an optional letter/digit/hyphen suffix.
     *
     * Each word starts with a letter from a-z or ä, ö, ü, ß and may continue
     * with those letters, "®" or "-".
     */
    public function getModelLikePattern(): string
    {
        return '/\b[a-zäöüß][a-zäöüß®\-]*(?:\s+[a-zäöüß][a-zäöüß®\-]*){0,2}\s+\d{2,5}[a-z0-9\-]*\b/u';
    }

    /**
     * Regex for an accessory-like phrase: one of the listed accessory words
     * followed by whitespace and a number of 1-5 digits with an optional
     * letter/digit/hyphen suffix.
     */
    public function getAccessoryLikePattern(): string
    {
        return '/\b(?:indikator|indicator|reagenz|reagent|kit|set)\s+\d{1,5}[a-z0-9\-]*\b/u';
    }

    /**
     * Regex matching a single digit.
     *
     * Returns the same pattern as getDirectProductDigitPattern().
     */
    public function getContainsDigitPattern(): string
    {
        return '/\d/u';
    }

    /**
     * Anchored regex for a whole token that looks like a model number.
     *
     * Accepts either 2-5 digits, or 1-6 letters (a-z) followed by 1-5 digits;
     * both forms allow a trailing letter/digit/hyphen suffix.
     */
    public function getModelNumberTokenPattern(): string
    {
        return '/^(?:\d{2,5}[a-z0-9\-]*|[a-z]{1,6}\d{1,5}[a-z0-9\-]*)$/u';
    }

    /**
     * Anchored regex for a whole context token: a Unicode letter followed by at
     * least two more characters drawn from letters, digits 0-9, "®" and "-".
     */
    public function getModelContextTokenPattern(): string
    {
        return '/^[\p{L}][\p{L}0-9®\-]{2,}$/u';
    }

    /**
     * Anchored regex for a whole suffix token: 1-4 letters (a-z) followed by up to three digits.
     */
    public function getModelSuffixTokenPattern(): string
    {
        return '/^[a-z]{1,4}\d{0,3}$/u';
    }

    /**
     * Size of the model-context token window.
     *
     * NOTE(review): presumably the number of neighbouring tokens inspected around
     * a model number — confirm in the caller.
     */
    public function getModelContextTokenWindow(): int
    {
        return 2;
    }

    /**
     * Minimum length of a meaningful alphabetic token.
     */
    public function getMinMeaningfulAlphaTokenLength(): int
    {
        return 2;
    }

    /**
     * Maximum number of tokens for a shop search.
     */
    public function getMaxShopSearchTokens(): int
    {
        return 6;
    }

    /**
     * Anchored regex matching a whole token that is an instruction or
     * presentation word (request verbs, "bitte", "liste", "tabelle", "info", ...).
     *
     * Umlaut words are listed together with their ASCII transliteration.
     */
    public function getInstructionOrPresentationTokenPattern(): string
    {
        return '/^(?:zeig(?:e)?|such(?:e)?|find(?:e)?|gib|gebe|nenn(?:e)?|liefer(?:e)?|erstelle?|mach(?:e)?|brauch(?:e)?|will|möchte|moechte|hätte|haette|kannst|bitte|mal|alle|alles|komplett|vollständig|vollstaendig|gesamt|ganze|ganzen|liste|listung|auflistung|tabelle|tabellarisch|übersicht|uebersicht|anzeigen?|ausgeben?|darstellen?|antwort(?:e)?|erklär(?:e)?|erklaer(?:e)?|info|infos|informationen|dazu|hierzu|damit|davon|an|als|mit|ohne|inkl|inklusive)$/u';
    }

    /**
     * Product/category tokens that are useful for Store API search even when they are not next to a model number.
     * This is intentionally a semantic allowlist, not a spelling-error blocklist.
     *
     * 'zubehoer' is listed next to 'zubehör' and 'zubehor' so the ae/oe/ue
     * transliteration is covered here as it is for the other umlaut entries.
     *
     * @return string[]
     */
    public function getSemanticShopSearchTokens(): array
    {
        return [
            'indikator',
            'indicator',
            'reagenz',
            'reagent',
            'zubehör',
            'zubehor',
            'zubehoer',
            'ersatzteil',
            'verbrauchsmaterial',
            'kit',
            'set',
            'filter',
            'pumpe',
            'pumpenkopf',
            'motorblock',
            'lösung',
            'loesung',
            'solution',
            'teststreifen',
            'gerät',
            'geraet',
            'messgerät',
            'messgeraet',
            'analysegerät',
            'analysegeraet',
            'analysator',
            'monitor',
            'controller',
            'system',
        ];
    }

    /**
     * Builds a regex that matches the given token as a whole word.
     *
     * The token is escaped with preg_quote(), so regex metacharacters and the
     * "/" delimiter inside it are matched literally.
     *
     * @param string $token literal token text
     */
    public function buildExactTokenRemovalPattern(string $token): string
    {
        return '/\b' . preg_quote($token, '/') . '\b/u';
    }

    /**
     * Builds a regex that matches the given brand directly followed by a model
     * number: whitespace, 2-5 digits and an optional letter/digit/hyphen suffix.
     *
     * The brand is escaped with preg_quote() and therefore matched literally.
     *
     * @param string $brand literal brand text
     */
    public function buildBrandPartOfModelPattern(string $brand): string
    {
        return '/\b' . preg_quote($brand, '/') . '\s+\d{2,5}[a-z0-9\-]*\b/u';
    }
}