362 lines
11 KiB
PHP
362 lines
11 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Commerce;
|
|
|
|
use App\Commerce\Dto\CommerceSearchQuery;
|
|
use App\Config\CommerceIntentConfig;
|
|
use App\Config\CommerceQueryParserConfig;
|
|
use App\Knowledge\Retrieval\QueryCleaner;
|
|
use App\Knowledge\Text\TextNormalizer;
|
|
|
|
/**
 * Turns a free-text shopping prompt into a structured {@see CommerceSearchQuery}.
 *
 * The parser normalizes the prompt, extracts an optional price range, sizes and
 * a known brand, and derives a reduced search text from what remains. For
 * follow-up prompts it can additionally merge in the latest "Question:" line
 * found in the supplied history context.
 *
 * The keyword handling visible in this class is German ("zwischen … und … euro",
 * "unter", "bis", "ab", "mindestens", …); everything else comes from the two
 * injected config objects.
 */
final readonly class CommerceQueryParser
{
    public function __construct(
        private TextNormalizer $textNormalizer,
        private QueryCleaner $queryCleaner,
        private CommerceQueryParserConfig $config,
        private CommerceIntentConfig $intentConfig,
    ) {
    }

    /**
     * Parses a prompt into a search query DTO.
     *
     * @param string $originalPrompt The prompt exactly as entered; stored unchanged on the result.
     * @param string $intent         Passed through to the result unchanged.
     * @param string $historyContext Optional conversation history; only lines starting with
     *                               "Question:" are read from it.
     *
     * @return CommerceSearchQuery Always built with `properties: []` and `needsLlmFallback: false`.
     */
    public function parse(
        string $originalPrompt,
        string $intent,
        string $historyContext = ''
    ): CommerceSearchQuery {
        $normalizedPrompt = $this->normalize($originalPrompt);
        $isDirectProductQuery = $this->isDirectProductQuery($normalizedPrompt);

        [$priceMin, $priceMax] = $this->extractPriceRange($normalizedPrompt);
        $sizes = $this->extractSizes($normalizedPrompt);
        $brand = $this->extractBrand($normalizedPrompt);

        $searchText = $this->buildSearchText(
            prompt: $normalizedPrompt,
            sizes: $sizes,
            brand: $brand,
            priceMin: $priceMin,
            priceMax: $priceMax,
            preserveDirectProductQuery: $isDirectProductQuery
        );

        // History is only consulted for prompts that are not self-contained product
        // queries and that match the configured "refers to earlier context" pattern.
        if (
            !$isDirectProductQuery
            && $historyContext !== ''
            && $this->shouldUseHistoryContext($normalizedPrompt)
        ) {
            $latestHistoryQuestion = $this->extractLatestQuestionFromHistory($historyContext);

            if ($latestHistoryQuestion !== '') {
                $normalizedHistoryPrompt = $this->normalize($latestHistoryQuestion);
                $isDirectHistoryProductQuery = $this->isDirectProductQuery($normalizedHistoryPrompt);

                [$historyPriceMin, $historyPriceMax] = $this->extractPriceRange($normalizedHistoryPrompt);
                $historySizes = $this->extractSizes($normalizedHistoryPrompt);
                $historyBrand = $this->extractBrand($normalizedHistoryPrompt);

                $historySearchText = $this->buildSearchText(
                    prompt: $normalizedHistoryPrompt,
                    sizes: $historySizes,
                    brand: $historyBrand,
                    priceMin: $historyPriceMin,
                    priceMax: $historyPriceMax,
                    preserveDirectProductQuery: $isDirectHistoryProductQuery
                );

                // History tokens come first, then the tokens of the current prompt.
                $searchText = $this->mergeSearchTexts($historySearchText, $searchText);

                // Only the brand is inherited from history. Sizes and price limits of the
                // history question are used to clean its search text but are not copied
                // onto the result. NOTE(review): confirm this asymmetry is intended.
                if (($brand === null || $brand === '') && $historyBrand !== null && $historyBrand !== '') {
                    $brand = $historyBrand;
                }
            }
        }

        // Never hand out an empty search text: fall back to the full normalized prompt.
        $finalSearchText = $searchText !== '' ? $searchText : $normalizedPrompt;

        return new CommerceSearchQuery(
            originalPrompt: $originalPrompt,
            normalizedPrompt: $normalizedPrompt,
            searchText: $finalSearchText,
            brand: $brand,
            sizes: $sizes,
            properties: [],
            priceMin: $priceMin,
            priceMax: $priceMax,
            intent: $intent,
            needsLlmFallback: false,
        );
    }

    /**
     * Brings a prompt (or a config value) into the canonical form all matching works on.
     *
     * After the injected TextNormalizer and QueryCleaner have run, the value is trimmed
     * and lowercased, "€" becomes the word "euro", every character that is not a letter,
     * digit, whitespace, ".", "," or "-" is replaced by a space, and whitespace runs are
     * collapsed to a single space.
     */
    private function normalize(string $prompt): string
    {
        $value = $this->textNormalizer->normalize($prompt);
        $value = $this->queryCleaner->clean($value);
        $value = mb_strtolower(trim($value));
        $value = str_replace(['€'], ' euro ', $value);
        // preg_replace() returns null on PCRE errors; keep the previous value in that case.
        $value = preg_replace('/[^\p{L}\p{N}\s.,\-]/u', ' ', $value) ?? $value;
        $value = preg_replace('/\s+/u', ' ', $value) ?? $value;

        return trim($value);
    }

    /**
     * Extracts a price range from a normalized prompt.
     *
     * Recognized forms:
     *  - "zwischen A und B euro" (between): returns [min(A, B), max(A, B)] immediately;
     *  - "unter|bis|max|maximal N euro" (upper bound);
     *  - "ab|mindestens|min N euro" (lower bound).
     *
     * The whitespace before "euro" is optional ("50euro"), matching the patterns that
     * buildSearchText() uses to strip price expressions.
     *
     * @return array{0:?float,1:?float} [priceMin, priceMax]; null where no bound was found.
     */
    private function extractPriceRange(string $prompt): array
    {
        $priceMin = null;
        $priceMax = null;

        if (preg_match('/\bzwischen\s+(\d+(?:[.,]\d+)?)\s+und\s+(\d+(?:[.,]\d+)?)\s*euro\b/u', $prompt, $m) === 1) {
            $a = $this->toFloat($m[1]);
            $b = $this->toFloat($m[2]);

            if ($a !== null && $b !== null) {
                // Tolerate reversed bounds ("zwischen 80 und 20 euro").
                return [min($a, $b), max($a, $b)];
            }
        }

        if (preg_match('/\b(?:unter|bis|max(?:imal)?)\s+(\d+(?:[.,]\d+)?)\s*euro\b/u', $prompt, $m) === 1) {
            $priceMax = $this->toFloat($m[1]);
        }

        if (preg_match('/\b(?:ab|mindestens|min)\s+(\d+(?:[.,]\d+)?)\s*euro\b/u', $prompt, $m) === 1) {
            $priceMin = $this->toFloat($m[1]);
        }

        return [$priceMin, $priceMax];
    }

    /**
     * Collects size values mentioned in a normalized prompt.
     *
     * Two regex fragments from the intent config are used: the size pattern acts as a
     * keyword prefix whose following [a-z0-9.-]+ token is taken as the size, and the
     * size-token pattern matches standalone size tokens. The contents of both fragments
     * are defined in config and not visible here.
     *
     * @return string[] Unique, non-empty sizes in order of first appearance; an empty
     *                  list if the keyword regex fails to execute.
     */
    private function extractSizes(string $prompt): array
    {
        $sizes = [];

        $sizePattern = $this->intentConfig->getSizePattern();
        if (preg_match_all('/\b(?:' . $sizePattern . ')\s*([a-z0-9.-]+)\b/u', $prompt, $matches) === false) {
            return [];
        }

        foreach ($matches[1] as $size) {
            $sizes[] = trim($size);
        }

        $sizeTokenPattern = $this->intentConfig->getSizeTokenPattern();
        if (preg_match_all('/\b(' . $sizeTokenPattern . ')\b/u', $prompt, $tokenMatches) !== false) {
            foreach ($tokenMatches[1] as $sizeToken) {
                $sizes[] = trim($sizeToken);
            }
        }

        return array_values(array_unique(array_filter($sizes, static fn($v) => $v !== '')));
    }

    /**
     * Returns the first configured brand that occurs in the prompt as a whole token.
     *
     * Brands are normalized the same way as the prompt before comparison. A brand only
     * counts when it is not directly surrounded by further letters or digits, so a short
     * brand such as "hp" is not detected inside "shop". This is in line with
     * buildSearchText(), which strips the brand on word boundaries only.
     *
     * @return string|null The normalized brand, or null if none of the known brands occurs.
     */
    private function extractBrand(string $prompt): ?string
    {
        foreach ($this->config->getKnownBrands() as $brand) {
            $normalizedBrand = $this->normalize((string) $brand);

            if ($normalizedBrand === '') {
                continue;
            }

            // Lookarounds instead of \b so brands that start or end with ".", "," or "-"
            // (all of which survive normalize()) are still matched correctly.
            $pattern = '/(?<![\p{L}\p{N}])' . preg_quote($normalizedBrand, '/') . '(?![\p{L}\p{N}])/u';

            if (preg_match($pattern, $prompt) === 1) {
                return $normalizedBrand;
            }
        }

        return null;
    }

    /**
     * Reduces a normalized prompt to the tokens that should be sent to the search.
     *
     * For direct product queries the prompt is kept as is (only de-duplicated, see
     * buildDirectProductSearchText()). Otherwise configured filler phrases, the
     * extracted sizes, the brand (unless it is followed by a model number) and, if
     * a price bound was extracted, price expressions are removed; finally tokens of
     * length 1 and configured stop tokens are dropped.
     *
     * @param string   $prompt Normalized prompt.
     * @param string[] $sizes  Sizes previously extracted from the prompt.
     *
     * @return string Space-separated search tokens; may be empty.
     */
    private function buildSearchText(
        string $prompt,
        array $sizes,
        ?string $brand,
        ?float $priceMin,
        ?float $priceMax,
        bool $preserveDirectProductQuery = false
    ): string {
        if ($preserveDirectProductQuery) {
            return $this->buildDirectProductSearchText($prompt);
        }

        // Pad with spaces so phrases at the very start or end can be matched as " phrase ".
        $text = ' ' . $prompt . ' ';

        foreach ($this->config->getPhrasesToRemove() as $phrase) {
            $normalizedPhrase = $this->normalize((string) $phrase);
            if ($normalizedPhrase === '') {
                continue;
            }

            $text = str_replace(' ' . $normalizedPhrase . ' ', ' ', $text);
        }

        foreach ($sizes as $size) {
            $normalizedSize = $this->normalize((string) $size);
            if ($normalizedSize === '') {
                continue;
            }

            $text = preg_replace('/\b' . preg_quote($normalizedSize, '/') . '\b/u', ' ', $text) ?? $text;
        }

        // Keep the brand in the text when it is part of a "brand 1234" model phrase.
        if ($brand !== null && $brand !== '' && !$this->isBrandPartOfModelPhrase($prompt, $brand)) {
            $text = preg_replace('/\b' . preg_quote($brand, '/') . '\b/u', ' ', $text) ?? $text;
        }

        if ($priceMin !== null || $priceMax !== null) {
            $text = preg_replace('/\bzwischen\s+\d+(?:[.,]\d+)?\s+und\s+\d+(?:[.,]\d+)?\s*euro\b/u', ' ', $text) ?? $text;
            $text = preg_replace('/\b(?:unter|bis|max(?:imal)?|ab|mindestens|min)\s+\d+(?:[.,]\d+)?\s*euro\b/u', ' ', $text) ?? $text;
            // The config fragment is wrapped in a group so that a top-level alternation
            // ("a|b") keeps both word boundaries on every alternative.
            $text = preg_replace('/\b(?:' . $this->intentConfig->getPricePattern() . ')\b/u', ' ', $text) ?? $text;
        }

        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;
        $text = trim($text, " \t\n\r\0\x0B-.,");

        // Single-character tokens carry no search value here and are dropped.
        $tokens = array_filter(
            explode(' ', $text),
            static fn(string $token): bool => mb_strlen($token) > 1
        );

        $tokens = $this->filterSearchTokens($tokens);

        return trim(implode(' ', $tokens));
    }

    /**
     * Builds the search text for a direct product query.
     *
     * Nothing is stripped from the prompt: whitespace is collapsed, surrounding
     * whitespace and "-", ".", "," are trimmed, and duplicate tokens are removed
     * while keeping the first occurrence.
     */
    private function buildDirectProductSearchText(string $prompt): string
    {
        $text = $prompt;
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;
        $text = trim($text, " \t\n\r\0\x0B-.,");

        $tokens = array_filter(
            explode(' ', $text),
            static fn(string $token): bool => mb_strlen($token) > 0
        );

        $tokens = array_values(array_unique($tokens));

        return trim(implode(' ', $tokens));
    }

    /**
     * Tells whether the prompt matches the configured history-context pattern,
     * i.e. whether the latest history question should be merged into the search.
     */
    private function shouldUseHistoryContext(string $prompt): bool
    {
        return preg_match(
            '/\b(' . $this->config->getHistoryContextPattern() . ')\b/u',
            $prompt
        ) === 1;
    }

    /**
     * Returns the text of the last "Question:" line in the history context.
     *
     * Only spaces and tabs are skipped after the "Question:" label, so a blank
     * question line can never swallow the following line (e.g. the answer) as its
     * text. A blank latest question yields an empty string.
     *
     * @return string The trimmed question text, or '' if there is none.
     */
    private function extractLatestQuestionFromHistory(string $historyContext): string
    {
        $result = preg_match_all('/^Question:[ \t]*(.*)$/m', $historyContext, $matches);

        if ($result === false) {
            return '';
        }

        $questions = $matches[1] ?? [];
        if ($questions === []) {
            return '';
        }

        $lastQuestion = end($questions);

        return is_string($lastQuestion) ? trim($lastQuestion) : '';
    }

    /**
     * Merges two search texts into one de-duplicated token list.
     *
     * History tokens come first, followed by tokens of the current search text that
     * are not already present. Tokens of length 0 or 1 are skipped.
     */
    private function mergeSearchTexts(string $historySearchText, string $currentSearchText): string
    {
        $tokens = [];

        foreach ([$historySearchText, $currentSearchText] as $text) {
            if ($text === '') {
                continue;
            }

            foreach (explode(' ', $text) as $token) {
                $token = trim($token);

                if ($token === '' || mb_strlen($token) <= 1) {
                    continue;
                }

                // Keyed by token: de-duplicates while preserving first-seen order.
                $tokens[$token] = $token;
            }
        }

        return implode(' ', array_values($tokens));
    }

    /**
     * Removes configured stop tokens from a token list.
     *
     * NOTE(review): the config getter is named "...Pattern" but its result is used as
     * a list of exact tokens for in_array(); confirm it returns string[].
     *
     * @param string[] $tokens
     * @return string[] Remaining tokens, re-indexed.
     */
    private function filterSearchTokens(array $tokens): array
    {
        $stopWords = $this->config->getFilterSearchTokensPattern();

        return array_values(array_filter(
            $tokens,
            static fn(string $token): bool => !in_array($token, $stopWords, true)
        ));
    }

    /**
     * Heuristic: does the prompt name a concrete product rather than describe a need?
     *
     * True when the prompt contains a model-like or accessory-like phrase, or when it
     * has at most four tokens and contains a digit. An empty prompt is never direct.
     */
    private function isDirectProductQuery(string $prompt): bool
    {
        if ($prompt === '') {
            return false;
        }

        if ($this->containsModelLikePhrase($prompt)) {
            return true;
        }

        if ($this->containsAccessoryLikePhrase($prompt)) {
            return true;
        }

        $tokens = preg_split('/\s+/u', $prompt, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        if (count($tokens) <= 4 && preg_match('/\d/u', $prompt) === 1) {
            return true;
        }

        return false;
    }

    /**
     * Matches one to three lowercase words followed by a token that starts with
     * two to five digits (e.g. "<word> 1234x").
     *
     * NOTE(review): price and size phrases such as "unter 50" or "größe 42" also
     * satisfy this pattern, so prompts containing them are treated as direct product
     * queries and skip the stripping in buildSearchText(). Confirm this is intended.
     */
    private function containsModelLikePhrase(string $text): bool
    {
        return preg_match(
            '/\b[a-zäöüß][a-zäöüß®\-]*(?:\s+[a-zäöüß][a-zäöüß®\-]*){0,2}\s+\d{2,5}[a-z0-9\-]*\b/u',
            $text
        ) === 1;
    }

    /**
     * Matches an accessory keyword (indikator, indicator, reagenz, reagent, kit, set)
     * followed by a token that starts with one to five digits.
     */
    private function containsAccessoryLikePhrase(string $text): bool
    {
        return preg_match(
            '/\b(?:indikator|indicator|reagenz|reagent|kit|set)\s+\d{1,5}[a-z0-9\-]*\b/u',
            $text
        ) === 1;
    }

    /**
     * Tells whether the brand is directly followed by a model number
     * (a token starting with two to five digits), e.g. "<brand> 500".
     */
    private function isBrandPartOfModelPhrase(string $prompt, string $brand): bool
    {
        if ($brand === '') {
            return false;
        }

        return preg_match(
            '/\b' . preg_quote($brand, '/') . '\s+\d{2,5}[a-z0-9\-]*\b/u',
            $prompt
        ) === 1;
    }

    /**
     * Converts a numeric string with "." or "," as decimal separator to float.
     *
     * @return float|null Null when the value is not numeric after the replacement.
     */
    private function toFloat(string $value): ?float
    {
        $value = str_replace(',', '.', trim($value));

        return is_numeric($value) ? (float) $value : null;
    }
}