Files
MtoRagSystem/src/Commerce/CommerceQueryParser.php

622 lines
18 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Commerce;
use App\Commerce\Dto\CommerceSearchQuery;
use App\Config\CommerceIntentConfig;
use App\Config\CommerceQueryParserConfig;
use App\Knowledge\Retrieval\QueryCleaner;
use App\Knowledge\Text\TextNormalizer;
final readonly class CommerceQueryParser
{
/**
 * Wires the collaborators the parser relies on.
 *
 * @param TextNormalizer            $textNormalizer First normalization pass applied to every prompt.
 * @param QueryCleaner              $queryCleaner   Injected dependency; not referenced by the methods visible in this class.
 * @param CommerceQueryParserConfig $config         Patterns, token lists and thresholds used while parsing.
 * @param CommerceIntentConfig      $intentConfig   Size patterns; also handed to the config when price removal patterns are built.
 */
public function __construct(
    private TextNormalizer $textNormalizer,
    private QueryCleaner $queryCleaner,
    private CommerceQueryParserConfig $config,
    private CommerceIntentConfig $intentConfig,
) {}
/**
 * Turns a raw user prompt into a structured commerce search query.
 *
 * The prompt is normalized, then price range, sizes and brand are extracted and a
 * search text is built from what remains. History is consulted only when the prompt
 * is not a direct product query, a history context was passed, and the prompt matches
 * the configured history-context pattern. In that case the history search text is
 * merged in front of the current one and the history brand is adopted if the current
 * prompt produced none.
 *
 * The returned query always carries an empty property list and never requests the
 * LLM fallback.
 *
 * @param string $originalPrompt The prompt exactly as entered by the user.
 * @param string $intent         Intent label that is passed through unchanged.
 * @param string $historyContext Optional conversation history; empty string disables the history lookup.
 */
public function parse(
    string $originalPrompt,
    string $intent,
    string $historyContext = ''
): CommerceSearchQuery {
    $cleanPrompt = $this->normalize($originalPrompt);
    $direct = $this->isDirectProductQuery($cleanPrompt);
    [$lowerPrice, $upperPrice] = $this->extractPriceRange($cleanPrompt);
    $sizes = $this->extractSizes($cleanPrompt);
    $brand = $this->extractBrand($cleanPrompt);

    $searchText = $this->buildSearchText(
        prompt: $cleanPrompt,
        sizes: $sizes,
        brand: $brand,
        priceMin: $lowerPrice,
        priceMax: $upperPrice,
        preserveDirectProductQuery: $direct
    );

    // Follow-up prompts may borrow the product terms (and brand) of the previous question.
    $mayUseHistory = !$direct
        && $historyContext !== ''
        && $this->shouldUseHistoryContext($cleanPrompt);
    $history = $mayUseHistory ? $this->parseHistoryContext($historyContext) : null;

    if ($history !== null) {
        $searchText = $this->mergeSearchTexts($history['searchText'], $searchText);

        $hasOwnBrand = $brand !== null && $brand !== '';
        $historyBrand = $history['brand'];
        if (!$hasOwnBrand && $historyBrand !== null && $historyBrand !== '') {
            $brand = $historyBrand;
        }
    }

    return new CommerceSearchQuery(
        originalPrompt: $originalPrompt,
        normalizedPrompt: $cleanPrompt,
        // Never send an empty search text: fall back to the normalized prompt.
        searchText: $searchText === '' ? $cleanPrompt : $searchText,
        brand: $brand,
        sizes: $sizes,
        properties: [],
        priceMin: $lowerPrice,
        priceMax: $upperPrice,
        intent: $intent,
        needsLlmFallback: false,
    );
}
/**
 * Brings a prompt (or any configured phrase) into the canonical form used for matching.
 *
 * Steps, in order: external text normalizer, trim + lower-casing, configured string
 * replacements, sanitize pattern (matches become a space), whitespace collapsing,
 * token corrections, final trim.
 */
private function normalize(string $prompt): string
{
    $text = mb_strtolower(trim($this->textNormalizer->normalize($prompt)));

    $text = str_replace(
        $this->config->getNormalizationSearch(),
        $this->config->getNormalizationReplace(),
        $text
    );

    $spacePatterns = [
        $this->config->getPromptSanitizePattern(),
        $this->config->getWhitespaceCollapsePattern(),
    ];
    foreach ($spacePatterns as $pattern) {
        // A null result signals a PCRE error; keep the previous text in that case.
        $text = preg_replace($pattern, ' ', $text) ?? $text;
    }

    return trim($this->applySearchTokenCorrections($text));
}
/**
 * Reads a price range from the prompt.
 *
 * A "between" match with two parseable numbers wins and is returned ordered as
 * [lower, upper]. Otherwise the maximum and minimum patterns are evaluated
 * independently; each bound stays null when its pattern does not match or its
 * capture is not numeric.
 *
 * @return array{0:?float,1:?float} Tuple of [priceMin, priceMax].
 */
private function extractPriceRange(string $prompt): array
{
    $between = [];
    if (preg_match($this->config->getPriceBetweenPattern(), $prompt, $between) === 1) {
        $first = $this->toFloat($between[1]);
        $second = $this->toFloat($between[2]);
        if ($first !== null && $second !== null) {
            return [min($first, $second), max($first, $second)];
        }
    }

    $hit = [];
    $upper = preg_match($this->config->getPriceMaxPattern(), $prompt, $hit) === 1
        ? $this->toFloat($hit[1])
        : null;
    $lower = preg_match($this->config->getPriceMinPattern(), $prompt, $hit) === 1
        ? $this->toFloat($hit[1])
        : null;

    return [$lower, $upper];
}
/**
 * Collects size values mentioned in the prompt.
 *
 * Two intent-config patterns contribute: the first capture group of the size
 * extraction pattern and the full matches of the size token value pattern.
 * Values are trimmed, empty ones are dropped and duplicates are removed while
 * keeping first-seen order. A PCRE failure of the first pattern yields an empty
 * list; a failure of the second pattern only skips its contribution.
 *
 * @return string[]
 */
private function extractSizes(string $prompt): array
{
    $collected = [];

    $explicit = [];
    if (preg_match_all($this->intentConfig->getSizeExtractionPattern(), $prompt, $explicit) === false) {
        return [];
    }
    foreach ($explicit[1] ?? [] as $captured) {
        $captured = trim((string) $captured);
        if ($captured !== '') {
            $collected[] = $captured;
        }
    }

    $standalone = [];
    $standaloneResult = preg_match_all($this->intentConfig->getSizeTokenValuePattern(), $prompt, $standalone);
    if ($standaloneResult !== false) {
        foreach ($standalone[0] ?? [] as $wholeMatch) {
            $wholeMatch = trim((string) $wholeMatch);
            if ($wholeMatch !== '') {
                $collected[] = $wholeMatch;
            }
        }
    }

    return array_values(array_unique($collected));
}
/**
 * Finds the first configured brand that occurs in the prompt as a whole term.
 *
 * Every configured brand is normalized with the same pipeline as the prompt before
 * it is compared. A brand only counts when it is not embedded in a longer run of
 * letters or digits, so a short brand name does not match inside an unrelated word.
 * Brands adjacent to punctuation such as a hyphen still match.
 *
 * @param string $prompt Already normalized prompt.
 * @return string|null The normalized brand, or null when no configured brand is present.
 */
private function extractBrand(string $prompt): ?string
{
    if ($prompt === '') {
        return null;
    }

    foreach ($this->config->getKnownBrands() as $brand) {
        $normalizedBrand = $this->normalize((string) $brand);
        if ($normalizedBrand === '') {
            continue;
        }

        // Lookarounds instead of \b so brands that start or end with a non-word
        // character are still delimited correctly.
        $pattern = '/(?<![\p{L}\p{N}])' . preg_quote($normalizedBrand, '/') . '(?![\p{L}\p{N}])/u';
        $matched = preg_match($pattern, $prompt);

        if ($matched === false) {
            // PCRE could not evaluate the subject (for example malformed UTF-8);
            // fall back to the plain substring check rather than losing the brand.
            $matched = str_contains($prompt, $normalizedBrand) ? 1 : 0;
        }

        if ($matched === 1) {
            return $normalizedBrand;
        }
    }

    return null;
}
/**
 * Reduces a normalized prompt to the terms that should be sent to the shop search.
 *
 * Direct product queries are delegated to buildDirectProductSearchText(). For all
 * other prompts the configured filler phrases, the extracted sizes and (when a price
 * bound was found) the price expressions are removed, short tokens are dropped and
 * the remaining tokens are normalized.
 *
 * @param string   $prompt Normalized prompt.
 * @param string[] $sizes  Sizes already extracted from the prompt; they are removed from the text.
 * @param ?string  $brand  Extracted brand. Not used for removal, see the note in the body.
 */
private function buildSearchText(
    string $prompt,
    array $sizes,
    ?string $brand,
    ?float $priceMin,
    ?float $priceMax,
    bool $preserveDirectProductQuery = false
): string {
    if ($preserveDirectProductQuery) {
        return $this->buildDirectProductSearchText($prompt);
    }

    // Surrounding spaces let whole phrases be matched at the start and end of the prompt too.
    $remaining = $this->wrapForPhraseReplacement($prompt);

    foreach ($this->config->getPhrasesToRemove() as $phrase) {
        $needle = $this->normalize((string) $phrase);
        if ($needle !== '') {
            $remaining = str_replace($this->wrapForPhraseReplacement($needle), ' ', $remaining);
        }
    }

    foreach ($sizes as $size) {
        $sizeToken = $this->normalize((string) $size);
        if ($sizeToken !== '') {
            $remaining = preg_replace(
                $this->config->buildExactTokenRemovalPattern($sizeToken),
                ' ',
                $remaining
            ) ?? $remaining;
        }
    }

    // Keep known brand terms in the shop search text because the Store API
    // request does not add a separate manufacturer filter.

    $hasPriceBound = $priceMin !== null || $priceMax !== null;
    $pricePatterns = $hasPriceBound ? $this->config->getPriceRemovalPatterns($this->intentConfig) : [];
    foreach ($pricePatterns as $pricePattern) {
        $remaining = preg_replace($pricePattern, ' ', $remaining) ?? $remaining;
    }

    $remaining = preg_replace($this->config->getWhitespaceCollapsePattern(), ' ', $remaining) ?? $remaining;
    $remaining = trim($remaining, $this->config->getSearchTextTrimCharacters());

    // Only tokens strictly longer than the configured minimum survive.
    $longEnough = [];
    foreach (explode(' ', $remaining) as $word) {
        if (mb_strlen($word) > $this->config->getMinSearchTokenLength()) {
            $longEnough[] = $word;
        }
    }

    return trim(implode(' ', $this->normalizeSearchTokens($longEnough)));
}
/**
 * Builds the search text for a prompt that was classified as a direct product query.
 *
 * No phrase, size or price removal happens here: whitespace is collapsed, the
 * configured trim characters are stripped, tokens shorter than the direct-product
 * minimum length are dropped, and the rest is normalized and compacted.
 */
private function buildDirectProductSearchText(string $prompt): string
{
    $collapsed = preg_replace($this->config->getWhitespaceCollapsePattern(), ' ', $prompt) ?? $prompt;
    $collapsed = trim($collapsed, $this->config->getSearchTextTrimCharacters());

    $kept = [];
    foreach (explode(' ', $collapsed) as $word) {
        // Inclusive bound: a token of exactly the minimum length is kept.
        if (mb_strlen($word) >= $this->config->getMinDirectProductTokenLength()) {
            $kept[] = $word;
        }
    }

    $compacted = $this->compactShopSearchTokens($this->normalizeSearchTokens($kept));

    return trim(implode(' ', $compacted));
}
/**
 * Narrows a direct product query down to its product-defining tokens.
 *
 * Direct product queries often carry user instructions such as "show all as a list".
 * Instead of maintaining ever-growing stop word lists, only these tokens are kept:
 * model numbers, the tokens immediately in front of a model number that look like
 * part of the model name (or are semantic shop terms), a model suffix directly after
 * it, semantic shop terms, known brands and measurement values.
 *
 * When nothing qualifies, the noise-filtered tokens are returned as they are. The
 * result is always capped by limitShopSearchTokens().
 *
 * @param string[] $tokens
 * @return string[]
 */
private function compactShopSearchTokens(array $tokens): array
{
    $candidates = [];
    foreach ($tokens as $token) {
        if (!$this->isQueryNoiseToken($token)) {
            $candidates[] = $token;
        }
    }

    if ($candidates === []) {
        return [];
    }

    // Positions (into $candidates) of the tokens that will be kept.
    $selected = [];

    foreach ($candidates as $position => $candidate) {
        if ($this->isModelNumberToken($candidate)) {
            $selected[$position] = true;

            // Walk backwards from the model number and stop at the first token
            // that neither is a semantic shop term nor looks like model context.
            $offset = 1;
            while ($offset <= $this->config->getModelContextTokenWindow()) {
                $before = $position - $offset;
                $offset++;

                if (!isset($candidates[$before])) {
                    break;
                }

                $belongsToModel = $this->isSemanticShopToken($candidates[$before])
                    || $this->isLikelyModelContextToken($candidates[$before]);
                if (!$belongsToModel) {
                    break;
                }

                $selected[$before] = true;
            }

            // Only the single token right after the model number can be a suffix.
            $after = $position + 1;
            if (isset($candidates[$after]) && $this->isModelSuffixToken($candidates[$after])) {
                $selected[$after] = true;
            }
        }

        $isProductDefining = $this->isSemanticShopToken($candidate)
            || $this->isKnownBrandToken($candidate)
            || $this->isMeasurementValueToken($candidate);
        if ($isProductDefining) {
            $selected[$position] = true;
        }
    }

    if ($selected === []) {
        return $this->limitShopSearchTokens($candidates);
    }

    // Restore the original token order before collecting the kept tokens.
    ksort($selected);

    $compacted = [];
    foreach ($selected as $position => $flag) {
        $compacted[] = $candidates[$position];
    }

    return $this->limitShopSearchTokens(array_values(array_unique($compacted)));
}
/**
 * Decides whether a token carries no product information.
 *
 * Checks run in this order and the first hit decides: empty tokens are noise;
 * measurement values and anything containing a digit are never noise; tokens not
 * longer than the minimum meaningful alpha length are noise; search control tokens
 * are noise; otherwise the instruction/presentation pattern decides.
 */
private function isQueryNoiseToken(string $token): bool
{
    $candidate = trim(mb_strtolower($token, 'UTF-8'));

    return match (true) {
        $candidate === '' => true,
        $this->isMeasurementValueToken($candidate) => false,
        preg_match($this->config->getContainsDigitPattern(), $candidate) === 1 => false,
        mb_strlen($candidate) <= $this->config->getMinMeaningfulAlphaTokenLength() => true,
        $this->isSearchControlToken($candidate) => true,
        default => preg_match($this->config->getInstructionOrPresentationTokenPattern(), $candidate) === 1,
    };
}
/**
 * Tells whether the token matches the configured model number pattern.
 */
private function isModelNumberToken(string $token): bool
{
    $matchCount = preg_match($this->config->getModelNumberTokenPattern(), $token);

    return $matchCount === 1;
}
/**
 * Tells whether the token matches the configured measurement value pattern.
 */
private function isMeasurementValueToken(string $token): bool
{
    $matchCount = preg_match($this->config->getMeasurementValueTokenPattern(), $token);

    return $matchCount === 1;
}
/**
 * Tells whether a token in front of a model number looks like part of the model name.
 *
 * Noise tokens and semantic shop terms are excluded up front; everything else has
 * to match the configured model context pattern.
 */
private function isLikelyModelContextToken(string $token): bool
{
    if ($this->isQueryNoiseToken($token) || $this->isSemanticShopToken($token)) {
        return false;
    }

    $matchCount = preg_match($this->config->getModelContextTokenPattern(), $token);

    return $matchCount === 1;
}
/**
 * Tells whether a token directly after a model number is a model suffix.
 *
 * Noise tokens never qualify; otherwise the configured suffix pattern decides.
 */
private function isModelSuffixToken(string $token): bool
{
    return !$this->isQueryNoiseToken($token)
        && preg_match($this->config->getModelSuffixTokenPattern(), $token) === 1;
}
/**
 * Tells whether the token is one of the configured semantic shop search terms.
 * The comparison is strict and expects the token exactly as configured.
 */
private function isSemanticShopToken(string $token): bool
{
    $semanticTokens = $this->config->getSemanticShopSearchTokens();

    return in_array($token, $semanticTokens, true);
}
/**
 * Tells whether a single search token is one of the configured brands.
 *
 * Tokens reach this method lower-cased and normalized, while the configured brands
 * may be stored in their display spelling. The token is therefore compared with the
 * raw brand list first and then with each brand run through normalize(), which is
 * the same treatment extractBrand() applies.
 */
private function isKnownBrandToken(string $token): bool
{
    if ($token === '') {
        return false;
    }

    $brands = $this->config->getKnownBrands();

    // Fast path: the brand is configured in exactly the token's spelling.
    if (in_array($token, $brands, true)) {
        return true;
    }

    foreach ($brands as $brand) {
        if ($this->normalize((string) $brand) === $token) {
            return true;
        }
    }

    return false;
}
/**
 * Caps the token list at the configured maximum number of shop search tokens.
 * A limit of zero or less disables the cap.
 *
 * @param string[] $tokens
 * @return string[] The leading tokens up to the limit, or the input unchanged.
 */
private function limitShopSearchTokens(array $tokens): array
{
    $limit = $this->config->getMaxShopSearchTokens();
    $exceedsLimit = $limit > 0 && count($tokens) > $limit;

    return $exceedsLimit ? array_slice($tokens, 0, $limit) : $tokens;
}
/**
 * Tells whether the prompt matches the configured pattern that marks it as
 * depending on the previous question.
 */
private function shouldUseHistoryContext(string $prompt): bool
{
    $matchCount = preg_match($this->config->getHistoryContextValuePattern(), $prompt);

    return $matchCount === 1;
}
/**
 * Returns the last question found in the history context.
 *
 * Questions are the first capture group of the configured history question pattern.
 *
 * @return string The trimmed last question, or an empty string when none is found
 *                or the pattern fails.
 */
private function extractLatestQuestionFromHistory(string $historyContext): string
{
    $found = [];
    if (preg_match_all($this->config->getHistoryQuestionPattern(), $historyContext, $found) === false) {
        return '';
    }

    $questions = $found[1] ?? [];
    if ($questions === []) {
        return '';
    }

    $latest = end($questions);
    if (!is_string($latest)) {
        return '';
    }

    return trim($latest);
}
/**
 * Combines the history search text with the current one into a single token list.
 *
 * History tokens come first. Every token is trimmed, dropped when it is not longer
 * than the minimum search token length, normalized, and added only once.
 */
private function mergeSearchTexts(string $historySearchText, string $currentSearchText): string
{
    $merged = [];

    foreach ([$historySearchText, $currentSearchText] as $source) {
        $words = $source === '' ? [] : explode(' ', $source);

        foreach ($words as $word) {
            $word = trim($word);
            $tooShort = $word === '' || mb_strlen($word) <= $this->config->getMinSearchTokenLength();
            if ($tooShort) {
                continue;
            }

            // Keyed by the token itself so a repeated token keeps its first position.
            foreach ($this->normalizeSearchTokens([$word]) as $normalized) {
                $merged[$normalized] = $normalized;
            }
        }
    }

    return implode(' ', array_values($merged));
}
/**
 * Alias for normalizeSearchTokens(); performs no filtering of its own.
 *
 * @param string[] $tokens
 * @return string[]
 */
private function filterSearchTokens(array $tokens): array
{
    $normalized = $this->normalizeSearchTokens($tokens);

    return $normalized;
}
/**
 * Normalizes search tokens one by one.
 *
 * Each token is lower-cased and trimmed, passed through the configured correction
 * map and then the canonical map. Empty tokens and search control tokens are
 * dropped; duplicates are collapsed while keeping first-seen order.
 *
 * @param string[] $tokens
 * @return string[]
 */
private function normalizeSearchTokens(array $tokens): array
{
    $unique = [];

    foreach ($tokens as $rawToken) {
        $candidate = trim(mb_strtolower((string) $rawToken, 'UTF-8'));
        if ($candidate === '') {
            continue;
        }

        $corrected = $this->config->getSearchTokenCorrections()[$candidate] ?? $candidate;
        $canonical = $this->config->getSearchTokenCanonicalMap()[$corrected] ?? $corrected;

        if (!$this->isSearchControlToken($canonical)) {
            // Keyed by the token so a repeat does not create a second entry.
            $unique[$canonical] ??= $canonical;
        }
    }

    return array_values($unique);
}
/**
 * Applies the configured token corrections to whole words inside a text.
 *
 * Corrections run in configuration order, so a later correction can act on the
 * output of an earlier one. The replacement is inserted literally: characters such
 * as "$" or "\" in a configured replacement are not interpreted as backreferences.
 * Entries with an empty search term are skipped because their pattern would match
 * at every word boundary.
 *
 * @return string The corrected text with whitespace collapsed.
 */
private function applySearchTokenCorrections(string $text): string
{
    if ($text === '') {
        return '';
    }

    foreach ($this->config->getSearchTokenCorrections() as $from => $to) {
        $search = (string) $from;
        if ($search === '') {
            continue;
        }

        $replacement = (string) $to;

        // A callback keeps the replacement literal; a null result signals a PCRE
        // error and leaves the text as it was.
        $text = preg_replace_callback(
            '/\b' . preg_quote($search, '/') . '\b/u',
            static fn (array $match): string => $replacement,
            $text
        ) ?? $text;
    }

    return preg_replace($this->config->getWhitespaceCollapsePattern(), ' ', $text) ?? $text;
}
/**
 * Tells whether a token only steers the search instead of describing a product.
 *
 * Empty tokens count as control tokens. Otherwise the token has to be in the
 * configured filter token list or in the built-in list of generic shop words below.
 */
private function isSearchControlToken(string $token): bool
{
    $candidate = trim(mb_strtolower($token));
    if ($candidate === '') {
        return true;
    }

    // Generic shopping vocabulary that is always treated as a control token.
    $builtInControlTokens = [
        'shop',
        'store',
        'produkt',
        'produkte',
        'artikel',
        'kaufen',
        'kaufe',
        'bestellen',
        'bestelle',
        'online',
    ];

    return in_array($candidate, $this->config->getFilterSearchTokens(), true)
        || in_array($candidate, $builtInControlTokens, true);
}
/**
 * Decides whether the prompt names a concrete product rather than describing a need.
 *
 * A prompt qualifies when it contains a model-like or accessory-like phrase, or
 * when its normalized tokens number no more than the configured maximum and,
 * joined by spaces, match the direct product digit pattern. An empty prompt never
 * qualifies.
 */
private function isDirectProductQuery(string $prompt): bool
{
    if ($prompt === '') {
        return false;
    }

    if ($this->containsModelLikePhrase($prompt) || $this->containsAccessoryLikePhrase($prompt)) {
        return true;
    }

    $parts = preg_split(
        $this->config->getWhitespaceSplitPattern(),
        $prompt,
        -1,
        PREG_SPLIT_NO_EMPTY
    );
    // preg_split() yields false on failure; treat that like "no tokens".
    $tokens = $this->filterSearchTokens($parts === false ? [] : $parts);

    if (count($tokens) > $this->config->getDirectProductMaxTokens()) {
        return false;
    }

    return preg_match($this->config->getDirectProductDigitPattern(), implode(' ', $tokens)) === 1;
}
/**
 * Tells whether the text matches the configured model-like pattern.
 */
private function containsModelLikePhrase(string $text): bool
{
    $matchCount = preg_match($this->config->getModelLikePattern(), $text);

    return $matchCount === 1;
}
/**
 * Tells whether the text matches the configured accessory-like pattern.
 */
private function containsAccessoryLikePhrase(string $text): bool
{
    $matchCount = preg_match($this->config->getAccessoryLikePattern(), $text);

    return $matchCount === 1;
}
/**
 * Tells whether the brand appears in the prompt as part of a model phrase, using
 * the pattern the config builds for that brand. An empty brand never matches.
 *
 * NOTE(review): no caller of this method is visible in this class.
 */
private function isBrandPartOfModelPhrase(string $prompt, string $brand): bool
{
    return $brand !== ''
        && preg_match($this->config->buildBrandPartOfModelPattern($brand), $prompt) === 1;
}
/**
 * Converts a captured number into a float.
 *
 * Surrounding whitespace is ignored and every comma is treated as a decimal point.
 *
 * @return float|null The parsed value, or null when the text is not numeric.
 */
private function toFloat(string $value): ?float
{
    $candidate = str_replace(',', '.', trim($value));

    if (!is_numeric($candidate)) {
        return null;
    }

    return (float) $candidate;
}
/**
 * Parses the most recent question of the history context the same way a fresh
 * prompt is parsed, and returns the parts a follow-up prompt can inherit.
 *
 * @return array{searchText:string, brand:?string}|null Null when the history
 *         contains no question.
 */
private function parseHistoryContext(string $historyContext): ?array
{
    $question = $this->extractLatestQuestionFromHistory($historyContext);
    if ($question === '') {
        return null;
    }

    $prompt = $this->normalize($question);
    $direct = $this->isDirectProductQuery($prompt);
    [$lowerPrice, $upperPrice] = $this->extractPriceRange($prompt);
    $sizes = $this->extractSizes($prompt);
    $brand = $this->extractBrand($prompt);

    return [
        'searchText' => $this->buildSearchText(
            prompt: $prompt,
            sizes: $sizes,
            brand: $brand,
            priceMin: $lowerPrice,
            priceMax: $upperPrice,
            preserveDirectProductQuery: $direct
        ),
        'brand' => $brand,
    ];
}
/**
 * Pads the text with one space on each side so that whole-phrase replacement can
 * also match at the very start and end of the text.
 */
private function wrapForPhraseReplacement(string $text): string
{
    return " {$text} ";
}
}