676 lines
20 KiB
PHP
676 lines
20 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Commerce;
|
|
|
|
use App\Commerce\Dto\CommerceSearchQuery;
|
|
use App\Config\CommerceIntentConfig;
|
|
use App\Config\CommerceQueryParserConfig;
|
|
use App\Config\LanguageCleanupConfig;
|
|
use App\Knowledge\Retrieval\QueryCleaner;
|
|
use App\Knowledge\Text\TextNormalizer;
|
|
|
|
final readonly class CommerceQueryParser
|
|
{
|
|
/**
 * Receives the collaborators used while parsing a prompt.
 *
 * Every argument is promoted to a private property; the enclosing class is declared
 * readonly, so none of them can be reassigned afterwards.
 */
public function __construct(
    private TextNormalizer $textNormalizer,
    private QueryCleaner $queryCleaner,
    private CommerceQueryParserConfig $config,
    private CommerceIntentConfig $intentConfig,
    private LanguageCleanupConfig $languageCleanupConfig,
) {}
|
|
|
|
/**
 * Turns a raw user prompt into a structured commerce search query.
 *
 * The prompt is normalized, then price range, sizes and brand are extracted from it and a
 * search text is built. The previous conversation is only consulted when the prompt is not
 * a direct product query, a history string was passed and the prompt matches the configured
 * history-context pattern. In that case the search text of the latest history question is
 * merged in front of the current one, and the history brand is used when the prompt itself
 * carried none.
 *
 * The returned query always has empty `properties` and `needsLlmFallback` set to false.
 *
 * @param string $originalPrompt Prompt exactly as entered by the user.
 * @param string $intent         Intent label, passed through unchanged.
 * @param string $historyContext Earlier conversation text; empty string disables history use.
 */
public function parse(
    string $originalPrompt,
    string $intent,
    string $historyContext = ''
): CommerceSearchQuery {
    $normalizedPrompt = $this->normalize($originalPrompt);
    $directProductQuery = $this->isDirectProductQuery($normalizedPrompt);

    [$priceMin, $priceMax] = $this->extractPriceRange($normalizedPrompt);
    $sizes = $this->extractSizes($normalizedPrompt);
    $brand = $this->extractBrand($normalizedPrompt);

    $searchText = $this->buildSearchText(
        prompt: $normalizedPrompt,
        sizes: $sizes,
        brand: $brand,
        priceMin: $priceMin,
        priceMax: $priceMax,
        preserveDirectProductQuery: $directProductQuery
    );

    // Resolve the optional history parse first, then apply it in one place.
    $historyParse = null;

    if (!$directProductQuery && $historyContext !== '' && $this->shouldUseHistoryContext($normalizedPrompt)) {
        $historyParse = $this->parseHistoryContext($historyContext);
    }

    if ($historyParse !== null) {
        $searchText = $this->mergeSearchTexts($historyParse['searchText'], $searchText);

        $promptHasBrand = $brand !== null && $brand !== '';
        $historyBrand = $historyParse['brand'];

        if (!$promptHasBrand && $historyBrand !== null && $historyBrand !== '') {
            $brand = $historyBrand;
        }
    }

    // Never hand an empty search text to the shop; fall back to the normalized prompt.
    if ($searchText === '') {
        $searchText = $normalizedPrompt;
    }

    return new CommerceSearchQuery(
        originalPrompt: $originalPrompt,
        normalizedPrompt: $normalizedPrompt,
        searchText: $searchText,
        brand: $brand,
        sizes: $sizes,
        properties: [],
        priceMin: $priceMin,
        priceMax: $priceMax,
        intent: $intent,
        needsLlmFallback: false,
    );
}
|
|
|
|
/**
 * Brings free text into the canonical form used by all extraction steps.
 *
 * Steps, in order: injected text normalizer, trim + lower-casing, the configured
 * search/replace pairs, blanking of everything the sanitize pattern matches, whitespace
 * collapsing, token corrections, final trim. A failing regex leaves the value untouched.
 */
private function normalize(string $prompt): string
{
    $lowered = mb_strtolower(trim($this->textNormalizer->normalize($prompt)));

    $replaced = str_replace(
        $this->config->getNormalizationSearch(),
        $this->config->getNormalizationReplace(),
        $lowered
    );

    $sanitized = preg_replace($this->config->getPromptSanitizePattern(), ' ', $replaced) ?? $replaced;
    $collapsed = preg_replace($this->config->getWhitespaceCollapsePattern(), ' ', $sanitized) ?? $sanitized;

    return trim($this->applySearchTokenCorrections($collapsed));
}
|
|
|
|
/**
 * Reads a lower and/or upper price bound from the prompt.
 *
 * A "between" expression wins when both of its captures are numeric; the two values are
 * then returned in ascending order. Otherwise the maximum and minimum patterns are
 * evaluated independently and a bound that was not found stays null.
 *
 * @return array{0:?float,1:?float} [minimum, maximum]
 */
private function extractPriceRange(string $prompt): array
{
    $between = [];

    if (preg_match($this->config->getPriceBetweenPattern(), $prompt, $between) === 1) {
        $first = $this->toFloat($between[1]);
        $second = $this->toFloat($between[2]);

        if ($first !== null && $second !== null) {
            return [min($first, $second), max($first, $second)];
        }
    }

    $upper = [];
    $priceMax = preg_match($this->config->getPriceMaxPattern(), $prompt, $upper) === 1
        ? $this->toFloat($upper[1])
        : null;

    $lower = [];
    $priceMin = preg_match($this->config->getPriceMinPattern(), $prompt, $lower) === 1
        ? $this->toFloat($lower[1])
        : null;

    return [$priceMin, $priceMax];
}
|
|
|
|
/**
 * Collects the size values mentioned in the prompt.
 *
 * Candidates are the first capture group of the size-extraction pattern followed by the
 * full matches of the size-token pattern. Values are trimmed, empty ones are dropped and
 * duplicates are removed while keeping first-seen order. If the extraction pattern itself
 * fails, an empty list is returned; a failing token pattern is simply ignored.
 *
 * @return string[]
 */
private function extractSizes(string $prompt): array
{
    $explicit = [];

    if (preg_match_all($this->intentConfig->getSizeExtractionPattern(), $prompt, $explicit) === false) {
        return [];
    }

    $candidates = $explicit[1] ?? [];

    $tokenHits = [];

    if (preg_match_all($this->intentConfig->getSizeTokenValuePattern(), $prompt, $tokenHits) !== false) {
        $candidates = [...$candidates, ...($tokenHits[0] ?? [])];
    }

    $sizes = [];

    foreach ($candidates as $candidate) {
        $trimmed = trim((string) $candidate);

        if ($trimmed !== '') {
            $sizes[] = $trimmed;
        }
    }

    return array_values(array_unique($sizes));
}
|
|
|
|
/**
 * Returns the first configured brand that occurs in the prompt, or null.
 *
 * Each brand is normalized the same way as the prompt and then looked up as a plain
 * substring. The normalized spelling is what gets returned.
 */
private function extractBrand(string $prompt): ?string
{
    foreach ($this->config->getKnownBrands() as $knownBrand) {
        $candidate = $this->normalize((string) $knownBrand);

        if ($candidate === '') {
            continue;
        }

        if (str_contains($prompt, $candidate)) {
            return $candidate;
        }
    }

    return null;
}
|
|
|
|
/**
 * Builds the free-text part of the shop search from an already normalized prompt.
 *
 * Direct product queries are delegated to buildDirectProductSearchText(). For everything
 * else the filler phrases, the extracted size tokens and (only when a price bound was
 * found) the price expressions are blanked out. Whitespace is then collapsed, tokens that
 * are not longer than the configured minimum length are dropped and the rest is passed
 * through normalizeSearchTokens().
 *
 * @param string   $prompt                     Normalized prompt.
 * @param string[] $sizes                      Size values extracted from the prompt.
 * @param ?string  $brand                      Not used for removal: the brand stays in the text (see below).
 * @param ?float   $priceMin                   Extracted lower price bound, if any.
 * @param ?float   $priceMax                   Extracted upper price bound, if any.
 * @param bool     $preserveDirectProductQuery Whether the prompt was classified as a direct product query.
 */
private function buildSearchText(
    string $prompt,
    array $sizes,
    ?string $brand,
    ?float $priceMin,
    ?float $priceMax,
    bool $preserveDirectProductQuery = false
): string {
    if ($preserveDirectProductQuery) {
        return $this->buildDirectProductSearchText($prompt);
    }

    // Surrounding spaces let phrases be matched as whole words via plain str_replace().
    $text = $this->wrapForPhraseReplacement($prompt);

    foreach ($this->getCommercePhrasesToRemove() as $phrase) {
        $normalizedPhrase = $this->normalize((string) $phrase);

        if ($normalizedPhrase === '') {
            continue;
        }

        $needle = $this->wrapForPhraseReplacement($normalizedPhrase);

        // str_replace() does not re-read the separator it just consumed, so a phrase that is
        // repeated back to back would survive a single pass. Repeat until nothing changes;
        // every effective pass shortens $text, so this always terminates.
        do {
            $before = $text;
            $text = str_replace($needle, ' ', $text);
        } while ($text !== $before);
    }

    foreach ($sizes as $size) {
        $normalizedSize = $this->normalize((string) $size);

        if ($normalizedSize === '') {
            continue;
        }

        $text = preg_replace(
            $this->config->buildExactTokenRemovalPattern($normalizedSize),
            ' ',
            $text
        ) ?? $text;
    }

    // Keep known brand terms in the shop search text because the Store API
    // request does not add a separate manufacturer filter.

    if ($priceMin !== null || $priceMax !== null) {
        foreach ($this->config->getPriceRemovalPatterns($this->intentConfig) as $pattern) {
            $text = preg_replace($pattern, ' ', $text) ?? $text;
        }
    }

    $text = preg_replace($this->config->getWhitespaceCollapsePattern(), ' ', $text) ?? $text;
    $text = trim($text, $this->config->getSearchTextTrimCharacters());

    // The configured minimum is exclusive here: a token must be strictly longer to survive.
    $tokens = array_filter(
        explode(' ', $text),
        fn(string $token): bool => mb_strlen($token) > $this->config->getMinSearchTokenLength()
    );

    $tokens = $this->normalizeSearchTokens($tokens);

    return trim(implode(' ', $tokens));
}
|
|
|
|
/**
 * Builds the search text for a prompt classified as a direct product query.
 *
 * No phrase, size or price removal happens here. The prompt is whitespace-collapsed and
 * trimmed, tokens shorter than the configured direct-product minimum are dropped, and the
 * remainder is normalized and compacted to the product-defining tokens.
 */
private function buildDirectProductSearchText(string $prompt): string
{
    $collapsed = preg_replace($this->config->getWhitespaceCollapsePattern(), ' ', $prompt) ?? $prompt;
    $trimmed = trim($collapsed, $this->config->getSearchTextTrimCharacters());

    $longEnough = [];

    foreach (explode(' ', $trimmed) as $word) {
        if (mb_strlen($word) >= $this->config->getMinDirectProductTokenLength()) {
            $longEnough[] = $word;
        }
    }

    $compacted = $this->compactShopSearchTokens(
        $this->normalizeSearchTokens($longEnough)
    );

    return trim(implode(' ', $compacted));
}
|
|
|
|
/**
 * Narrows a direct product query down to its product-defining tokens.
 *
 * Direct product queries often carry user instructions such as "show all as a list".
 * Shopware search performs best when the query only contains model numbers, the model name
 * right next to them, brands and semantic commerce terms, so everything else is dropped
 * without maintaining an endless list of spelling-specific stop words.
 *
 * Selection rules, applied after noise tokens are removed:
 *  - a model-number token is kept; walking backwards up to the configured window, semantic
 *    tokens and likely model-context tokens are kept until the first token that is neither;
 *    the token directly after it is kept when it looks like a model suffix;
 *  - semantic shop tokens, known brands and measurement values are always kept.
 * When no token qualifies, the noise-free list is returned as is. Either way the result is
 * capped by limitShopSearchTokens().
 *
 * @param string[] $tokens
 * @return string[]
 */
private function compactShopSearchTokens(array $tokens): array
{
    $tokens = array_values(array_filter(
        $tokens,
        fn(string $token): bool => !$this->isQueryNoiseToken($token)
    ));

    if ($tokens === []) {
        return [];
    }

    $contextWindow = $this->config->getModelContextTokenWindow();
    $selected = [];

    foreach ($tokens as $position => $token) {
        if ($this->isModelNumberToken($token)) {
            $selected[$position] = true;

            // Walk left from the model number and collect its naming context.
            for ($distance = 1; $distance <= $contextWindow; $distance++) {
                $before = $position - $distance;

                if (!isset($tokens[$before])) {
                    break;
                }

                $neighbour = $tokens[$before];

                if (!$this->isSemanticShopToken($neighbour) && !$this->isLikelyModelContextToken($neighbour)) {
                    break;
                }

                $selected[$before] = true;
            }

            $after = $position + 1;

            if (isset($tokens[$after]) && $this->isModelSuffixToken($tokens[$after])) {
                $selected[$after] = true;
            }
        }

        $alwaysKept = $this->isSemanticShopToken($token)
            || $this->isKnownBrandToken($token)
            || $this->isMeasurementValueToken($token);

        if ($alwaysKept) {
            $selected[$position] = true;
        }
    }

    if ($selected === []) {
        return $this->limitShopSearchTokens($tokens);
    }

    // $tokens is a list, so intersecting by key yields the selected tokens in prompt order.
    $compacted = array_values(array_intersect_key($tokens, $selected));

    return $this->limitShopSearchTokens(array_values(array_unique($compacted)));
}
|
|
|
|
/**
 * Decides whether a token carries no product information and can be dropped.
 *
 * Empty tokens are noise. Measurement values and anything containing a digit are never
 * noise. Short alphabetic tokens are noise unless they are protected commerce terms.
 * Everything else is noise only if it is a search control token or matches the
 * instruction/presentation pattern.
 */
private function isQueryNoiseToken(string $token): bool
{
    $candidate = trim(mb_strtolower($token, 'UTF-8'));

    if ($candidate === '') {
        return true;
    }

    $looksLikeProductData = $this->isMeasurementValueToken($candidate)
        || preg_match($this->config->getContainsDigitPattern(), $candidate) === 1;

    if ($looksLikeProductData) {
        return false;
    }

    $isShort = mb_strlen($candidate) <= $this->config->getMinMeaningfulAlphaTokenLength();

    if ($isShort && !$this->isProtectedCommerceSearchToken($candidate)) {
        return true;
    }

    return $this->isSearchControlToken($candidate)
        || preg_match($this->config->getInstructionOrPresentationTokenPattern(), $candidate) === 1;
}
|
|
|
|
/**
 * True when the token matches the configured model-number pattern.
 */
private function isModelNumberToken(string $token): bool
{
    $pattern = $this->config->getModelNumberTokenPattern();

    return preg_match($pattern, $token) === 1;
}
|
|
|
|
/**
 * True when the token matches the configured measurement-value pattern.
 */
private function isMeasurementValueToken(string $token): bool
{
    $pattern = $this->config->getMeasurementValueTokenPattern();

    return preg_match($pattern, $token) === 1;
}
|
|
|
|
/**
 * True when the token plausibly belongs to the name of an adjacent model number.
 *
 * Noise tokens and semantic shop tokens are excluded up front; what remains has to match
 * the configured model-context pattern.
 */
private function isLikelyModelContextToken(string $token): bool
{
    if ($this->isQueryNoiseToken($token) || $this->isSemanticShopToken($token)) {
        return false;
    }

    $pattern = $this->config->getModelContextTokenPattern();

    return preg_match($pattern, $token) === 1;
}
|
|
|
|
/**
 * True when the token is not noise and matches the configured model-suffix pattern.
 */
private function isModelSuffixToken(string $token): bool
{
    return !$this->isQueryNoiseToken($token)
        && preg_match($this->config->getModelSuffixTokenPattern(), $token) === 1;
}
|
|
|
|
/**
 * True when the token is listed (strict comparison) among the configured semantic
 * shop search tokens.
 */
private function isSemanticShopToken(string $token): bool
{
    $semanticTokens = $this->config->getSemanticShopSearchTokens();

    return in_array($token, $semanticTokens, true);
}
|
|
|
|
/**
 * Tells whether a single search token is one of the configured brands.
 *
 * Tokens reaching this check have already been lower-cased and normalized, whereas the
 * configured brand list is raw. extractBrand() normalizes each brand before comparing;
 * this check now does the same, so a brand entry that differs from its normalized form
 * (for example by letter case) is still recognised. The raw comparison runs first, so
 * every token that matched before still matches.
 */
private function isKnownBrandToken(string $token): bool
{
    $knownBrands = $this->config->getKnownBrands();

    if (in_array($token, $knownBrands, true)) {
        return true;
    }

    foreach ($knownBrands as $brand) {
        $normalizedBrand = $this->normalize((string) $brand);

        // Skip brands that normalize to nothing, mirroring extractBrand().
        if ($normalizedBrand !== '' && $normalizedBrand === $token) {
            return true;
        }
    }

    return false;
}
|
|
|
|
/**
 * True when the token equals one of the protected terms of the active cleanup profile.
 *
 * Protected terms are passed through normalizeSearchTokens() before comparison, so they
 * are compared in the same form as search tokens. An empty token is never protected.
 */
private function isProtectedCommerceSearchToken(string $token): bool
{
    $needle = trim(mb_strtolower($token, 'UTF-8'));

    if ($needle === '') {
        return false;
    }

    $protectedTerms = $this->languageCleanupConfig->getProtectedTermsForProfile(
        $this->config->getCleanupProfile()
    );

    foreach ($protectedTerms as $protectedTerm) {
        $variants = $this->normalizeSearchTokens([$protectedTerm]);

        foreach ($variants as $variant) {
            if ($variant === $needle) {
                return true;
            }
        }
    }

    return false;
}
|
|
|
|
/**
 * Caps the token list at the configured maximum.
 *
 * A maximum of zero or less disables the cap. Otherwise only the leading tokens are kept.
 *
 * @param string[] $tokens
 * @return string[]
 */
private function limitShopSearchTokens(array $tokens): array
{
    $maxTokens = $this->config->getMaxShopSearchTokens();
    $capApplies = $maxTokens > 0 && count($tokens) > $maxTokens;

    return $capApplies ? array_slice($tokens, 0, $maxTokens) : $tokens;
}
|
|
|
|
/**
 * True when the prompt matches the configured history-context pattern.
 */
private function shouldUseHistoryContext(string $prompt): bool
{
    $pattern = $this->config->getHistoryContextValuePattern();

    return preg_match($pattern, $prompt) === 1;
}
|
|
|
|
/**
 * Returns the last question found in the history text, trimmed.
 *
 * Questions are the first capture group of every match of the configured history-question
 * pattern. An empty string is returned when the pattern fails or nothing matches.
 */
private function extractLatestQuestionFromHistory(string $historyContext): string
{
    $captures = [];
    $matched = preg_match_all($this->config->getHistoryQuestionPattern(), $historyContext, $captures);

    $questions = $matched === false ? [] : ($captures[1] ?? []);

    if ($questions === []) {
        return '';
    }

    $latest = end($questions);

    if (!is_string($latest)) {
        return '';
    }

    return trim($latest);
}
|
|
|
|
/**
 * Joins history and current search text into one de-duplicated token string.
 *
 * History tokens come first. Tokens that are empty or not longer than the configured
 * minimum length are skipped; the rest go through normalizeSearchTokens() and each
 * resulting token is kept once, at the position of its first occurrence.
 */
private function mergeSearchTexts(string $historySearchText, string $currentSearchText): string
{
    $merged = [];

    foreach ([$historySearchText, $currentSearchText] as $searchText) {
        if ($searchText === '') {
            continue;
        }

        foreach (explode(' ', $searchText) as $rawToken) {
            $word = trim($rawToken);

            if ($word === '' || mb_strlen($word) <= $this->config->getMinSearchTokenLength()) {
                continue;
            }

            foreach ($this->normalizeSearchTokens([$word]) as $normalizedWord) {
                // Keyed by the token itself so a repeated token keeps its first position.
                $merged[$normalizedWord] = $normalizedWord;
            }
        }
    }

    return implode(' ', $merged);
}
|
|
|
|
/**
 * Filters raw tokens for search use.
 *
 * This is a plain alias of normalizeSearchTokens() and returns exactly what that method
 * returns.
 *
 * @param string[] $tokens
 * @return string[]
 */
private function filterSearchTokens(array $tokens): array
{
    $filtered = $this->normalizeSearchTokens($tokens);

    return $filtered;
}
|
|
|
|
/**
 * Normalizes a list of tokens for use in the shop search.
 *
 * Each token is trimmed and lower-cased, replaced by its configured correction and then by
 * its canonical form when such entries exist, and dropped when it is empty or a search
 * control token. Duplicates collapse onto their first occurrence.
 *
 * The two lookup maps are fetched once instead of once per token. Mapped values are cast
 * to string, as applySearchTokenCorrections() already does, so a non-string map value
 * cannot raise a TypeError under strict types.
 *
 * @param string[] $tokens
 * @return string[]
 */
private function normalizeSearchTokens(array $tokens): array
{
    $corrections = $this->config->getSearchTokenCorrections();
    $canonicalMap = $this->config->getSearchTokenCanonicalMap();

    $normalizedTokens = [];

    foreach ($tokens as $token) {
        $token = trim(mb_strtolower((string) $token, 'UTF-8'));

        if ($token === '') {
            continue;
        }

        $token = (string) ($corrections[$token] ?? $token);
        $token = (string) ($canonicalMap[$token] ?? $token);

        if ($this->isSearchControlToken($token)) {
            continue;
        }

        // Keyed by the token so duplicates are stored only once, in first-seen order.
        $normalizedTokens[$token] = $token;
    }

    return array_values($normalizedTokens);
}
|
|
|
|
/**
 * Applies the configured token corrections to running text.
 *
 * Every correction key is replaced where it occurs between word boundaries, then
 * whitespace is collapsed. A failing regex leaves the text as it was.
 *
 * Two safeguards compared with a plain preg_replace():
 *  - an empty key is skipped, because it would compile to /\b\b/u and insert the
 *    replacement at every word boundary;
 *  - the replacement is inserted literally, so a "$1" or "\1" inside a configured
 *    correction is not interpreted as a backreference. This matches
 *    normalizeSearchTokens(), which uses the same map as a literal lookup table.
 */
private function applySearchTokenCorrections(string $text): string
{
    if ($text === '') {
        return '';
    }

    foreach ($this->config->getSearchTokenCorrections() as $from => $to) {
        $search = (string) $from;

        if ($search === '') {
            continue;
        }

        $replacement = (string) $to;

        $text = preg_replace_callback(
            '/\b' . preg_quote($search, '/') . '\b/u',
            static fn(array $match): string => $replacement,
            $text
        ) ?? $text;
    }

    return preg_replace($this->config->getWhitespaceCollapsePattern(), ' ', $text) ?? $text;
}
|
|
|
|
/**
 * Phrases to strip from prompts: those of the active cleanup profile followed by the
 * parser's own list, lower-cased, trimmed and without duplicates.
 *
 * @return string[]
 */
private function getCommercePhrasesToRemove(): array
{
    $profilePhrases = $this->languageCleanupConfig->getPhrasesForProfile($this->config->getCleanupProfile());
    $parserPhrases = $this->config->getPhrasesToRemove();

    return $this->mergeUniqueTokens($profilePhrases, $parserPhrases);
}
|
|
|
|
/**
 * Tokens to filter out of searches: the stop words of the active cleanup profile followed
 * by the parser's own filter tokens, lower-cased, trimmed and without duplicates.
 *
 * @return string[]
 */
private function getCommerceFilterSearchTokens(): array
{
    $profileStopWords = $this->languageCleanupConfig->getStopWordsForProfile($this->config->getCleanupProfile());
    $parserFilterTokens = $this->config->getFilterSearchTokens();

    return $this->mergeUniqueTokens($profileStopWords, $parserFilterTokens);
}
|
|
|
|
/**
 * Concatenates two token lists into one lower-cased, trimmed, duplicate-free list.
 *
 * Order follows first occurrence, with all of $left before $right. Empty entries are
 * dropped. Membership is tracked in a keyed map instead of scanning the output with
 * in_array() for every entry, which keeps the merge linear in the number of tokens.
 *
 * @param string[] $left
 * @param string[] $right
 * @return string[]
 */
private function mergeUniqueTokens(array $left, array $right): array
{
    $seen = [];
    $out = [];

    foreach ([$left, $right] as $list) {
        foreach ($list as $token) {
            $token = trim(mb_strtolower((string) $token, 'UTF-8'));

            if ($token === '' || isset($seen[$token])) {
                continue;
            }

            // Values are collected separately so numeric-looking tokens stay strings.
            $seen[$token] = true;
            $out[] = $token;
        }
    }

    return $out;
}
|
|
|
|
/**
 * True when the token must not reach the shop search.
 *
 * That is the case for an empty token, for any commerce filter token (profile stop words
 * plus parser filter tokens) and for any configured search control token.
 */
private function isSearchControlToken(string $token): bool
{
    $candidate = trim(mb_strtolower($token));

    if ($candidate === '') {
        return true;
    }

    return in_array($candidate, $this->getCommerceFilterSearchTokens(), true)
        || in_array($candidate, $this->config->getSearchControlTokens(), true);
}
|
|
|
|
/**
 * Classifies a normalized prompt as a direct product query.
 *
 * An empty prompt never qualifies. A prompt qualifies when it contains a model-like or
 * accessory-like phrase, or when it has at most the configured number of search tokens
 * and those tokens match the direct-product digit pattern.
 */
private function isDirectProductQuery(string $prompt): bool
{
    if ($prompt === '') {
        return false;
    }

    if ($this->containsModelLikePhrase($prompt) || $this->containsAccessoryLikePhrase($prompt)) {
        return true;
    }

    $words = preg_split(
        $this->config->getWhitespaceSplitPattern(),
        $prompt,
        -1,
        PREG_SPLIT_NO_EMPTY
    ) ?: [];

    $searchTokens = $this->filterSearchTokens($words);

    if (count($searchTokens) > $this->config->getDirectProductMaxTokens()) {
        return false;
    }

    return preg_match($this->config->getDirectProductDigitPattern(), implode(' ', $searchTokens)) === 1;
}
|
|
|
|
/**
 * True when the text matches the configured model-like pattern.
 */
private function containsModelLikePhrase(string $text): bool
{
    $pattern = $this->config->getModelLikePattern();

    return preg_match($pattern, $text) === 1;
}
|
|
|
|
/**
 * True when the text matches the configured accessory-like pattern.
 */
private function containsAccessoryLikePhrase(string $text): bool
{
    $pattern = $this->config->getAccessoryLikePattern();

    return preg_match($pattern, $text) === 1;
}
|
|
|
|
/**
 * True when the prompt matches the brand-part-of-model pattern built for the given brand.
 *
 * An empty brand never matches.
 *
 * NOTE(review): no caller of this method is visible in this class — confirm it is still
 * needed.
 */
private function isBrandPartOfModelPhrase(string $prompt, string $brand): bool
{
    if ($brand === '') {
        return false;
    }

    $pattern = $this->config->buildBrandPartOfModelPattern($brand);

    return preg_match($pattern, $prompt) === 1;
}
|
|
|
|
/**
 * Converts a captured amount into a float, or null when it is not numeric.
 *
 * A lone comma is read as the decimal mark ("12,5" -> 12.5). When both a comma and a dot
 * are present, the right-most one is taken as the decimal mark and the other one as a
 * thousands separator, so "1.299,00" and "1,299.50" are understood instead of rejected.
 * Inputs containing only one kind of separator are interpreted exactly as before.
 */
private function toFloat(string $value): ?float
{
    $value = trim($value);

    $lastComma = strrpos($value, ',');
    $lastDot = strrpos($value, '.');

    if ($lastComma !== false && $lastDot !== false) {
        [$decimalMark, $groupingMark] = $lastComma > $lastDot ? [',', '.'] : ['.', ','];

        $value = str_replace($groupingMark, '', $value);
        $value = str_replace($decimalMark, '.', $value);
    } else {
        $value = str_replace(',', '.', $value);
    }

    return is_numeric($value) ? (float) $value : null;
}
|
|
|
|
/**
 * Parses the latest question of the conversation history like a prompt of its own.
 *
 * The question is normalized and run through the same extraction and search-text steps as
 * a fresh prompt. Only the resulting search text and brand are reported. Null is returned
 * when the history contains no question.
 *
 * @return array{searchText:string, brand:?string}|null
 */
private function parseHistoryContext(string $historyContext): ?array
{
    $question = $this->extractLatestQuestionFromHistory($historyContext);

    if ($question === '') {
        return null;
    }

    $normalizedQuestion = $this->normalize($question);
    $isDirect = $this->isDirectProductQuery($normalizedQuestion);

    [$priceMin, $priceMax] = $this->extractPriceRange($normalizedQuestion);
    $sizes = $this->extractSizes($normalizedQuestion);
    $brand = $this->extractBrand($normalizedQuestion);

    return [
        'searchText' => $this->buildSearchText(
            prompt: $normalizedQuestion,
            sizes: $sizes,
            brand: $brand,
            priceMin: $priceMin,
            priceMax: $priceMax,
            preserveDirectProductQuery: $isDirect
        ),
        'brand' => $brand,
    ];
}
|
|
|
|
/**
 * Pads the text with one leading and one trailing space, so that whole-word phrases can
 * be located with a plain substring search.
 */
private function wrapForPhraseReplacement(string $text): string
{
    return " {$text} ";
}
|
|
} |