optimize retrieval

This commit is contained in:
team 1
2026-04-23 15:47:53 +02:00
parent 8a31e99669
commit 87417febf4
13 changed files with 2093 additions and 287 deletions

View File

@@ -4,6 +4,7 @@ declare(strict_types=1);
namespace App\Commerce;
use App\Commerce\Dto\CommerceReferenceContext;
use App\Commerce\Dto\CommerceSearchQuery;
use App\Config\CommerceIntentConfig;
use App\Config\CommerceQueryParserConfig;
@@ -23,10 +24,12 @@ final readonly class CommerceQueryParser
public function parse(
string $originalPrompt,
string $intent,
string $historyContext = ''
string $historyContext = '',
?CommerceReferenceContext $referenceContext = null
): CommerceSearchQuery {
$normalizedPrompt = $this->normalize($originalPrompt);
$isDirectProductQuery = $this->isDirectProductQuery($normalizedPrompt);
$isReferenceOnlyFollowUp = $this->isReferenceOnlyFollowUp($normalizedPrompt);
[$priceMin, $priceMax] = $this->extractPriceRange($normalizedPrompt);
$sizes = $this->extractSizes($normalizedPrompt);
@@ -44,7 +47,7 @@ final readonly class CommerceQueryParser
if (
!$isDirectProductQuery
&& $historyContext !== ''
&& $this->shouldUseHistoryContext($normalizedPrompt)
&& $this->shouldUseHistoryContext($normalizedPrompt, $searchText)
) {
$latestHistoryQuestion = $this->extractLatestQuestionFromHistory($historyContext);
@@ -73,7 +76,29 @@ final readonly class CommerceQueryParser
}
}
$finalSearchText = $searchText !== '' ? $searchText : $normalizedPrompt;
if (
!$isDirectProductQuery
&& $referenceContext !== null
&& $this->shouldUseReferenceContext($normalizedPrompt, $searchText)
) {
$referenceSearchText = $this->buildReferenceSearchText($referenceContext);
if ($isReferenceOnlyFollowUp || $this->isTooGenericSearchText($searchText)) {
$searchText = $referenceSearchText !== '' ? $referenceSearchText : $searchText;
} else {
$searchText = $this->mergeSearchTexts($referenceSearchText, $searchText);
}
if (($brand === null || $brand === '') && $referenceContext->manufacturer !== null) {
$normalizedManufacturer = $this->normalize($referenceContext->manufacturer);
if ($normalizedManufacturer !== '') {
$brand = $normalizedManufacturer;
}
}
}
$finalSearchText = trim($searchText !== '' ? $searchText : $normalizedPrompt);
return new CommerceSearchQuery(
originalPrompt: $originalPrompt,
@@ -93,7 +118,7 @@ final readonly class CommerceQueryParser
{
$value = $this->textNormalizer->normalize($prompt);
$value = $this->queryCleaner->clean($value);
$value = mb_strtolower(trim($value));
$value = mb_strtolower(trim($value), 'UTF-8');
$value = str_replace(['€'], ' euro ', $value);
$value = preg_replace('/[^\p{L}\p{N}\s.,\-]/u', ' ', $value) ?? $value;
$value = preg_replace('/\s+/u', ' ', $value) ?? $value;
@@ -126,6 +151,17 @@ final readonly class CommerceQueryParser
$priceMin = $this->toFloat($m[1]);
}
// NEW:
// Recognize comparative lower-bound phrasing such as:
// - mehr als 3000 euro
// - über 3000 euro
// - ueber 3000 euro
// - größer als 3000 euro
// - groesser als 3000 euro
if (preg_match('/\b(?:mehr\s+als|über|ueber|größer\s+als|groesser\s+als)\s+(\d+(?:[.,]\d+)?)\s+euro\b/u', $prompt, $m) === 1) {
$priceMin = $this->toFloat($m[1]);
}
return [$priceMin, $priceMax];
}
@@ -152,7 +188,10 @@ final readonly class CommerceQueryParser
}
}
return array_values(array_unique(array_filter($sizes, static fn($v) => $v !== '')));
return array_values(array_unique(array_filter(
$sizes,
static fn(string $value): bool => $value !== ''
)));
}
private function extractBrand(string $prompt): ?string
@@ -184,6 +223,7 @@ final readonly class CommerceQueryParser
foreach ($this->config->getPhrasesToRemove() as $phrase) {
$normalizedPhrase = $this->normalize((string) $phrase);
if ($normalizedPhrase === '') {
continue;
}
@@ -193,6 +233,7 @@ final readonly class CommerceQueryParser
foreach ($sizes as $size) {
$normalizedSize = $this->normalize((string) $size);
if ($normalizedSize === '') {
continue;
}
@@ -207,6 +248,7 @@ final readonly class CommerceQueryParser
if ($priceMin !== null || $priceMax !== null) {
$text = preg_replace('/\bzwischen\s+\d+(?:[.,]\d+)?\s+und\s+\d+(?:[.,]\d+)?\s*euro\b/u', ' ', $text) ?? $text;
$text = preg_replace('/\b(?:unter|bis|max(?:imal)?|ab|mindestens|min)\s+\d+(?:[.,]\d+)?\s*euro\b/u', ' ', $text) ?? $text;
$text = preg_replace('/\b(?:mehr\s+als|über|ueber|größer\s+als|groesser\s+als)\s+\d+(?:[.,]\d+)?\s*euro\b/u', ' ', $text) ?? $text;
$text = preg_replace('/\b' . $this->intentConfig->getPricePattern() . '\b/u', ' ', $text) ?? $text;
}
@@ -219,14 +261,14 @@ final readonly class CommerceQueryParser
);
$tokens = $this->filterSearchTokens($tokens);
$tokens = $this->stripReferenceOnlyTokens($tokens);
return trim(implode(' ', $tokens));
}
private function buildDirectProductSearchText(string $prompt): string
{
$text = $prompt;
$text = preg_replace('/\s+/u', ' ', $text) ?? $text;
$text = preg_replace('/\s+/u', ' ', $prompt) ?? $prompt;
$text = trim($text, " \t\n\r\0\x0B-.,");
$tokens = array_filter(
@@ -234,17 +276,61 @@ final readonly class CommerceQueryParser
static fn(string $token): bool => mb_strlen($token) > 0
);
$tokens = array_values(array_unique($tokens));
return trim(implode(' ', $tokens));
return trim(implode(' ', array_values(array_unique($tokens))));
}
private function shouldUseHistoryContext(string $prompt): bool
private function shouldUseHistoryContext(string $prompt, string $searchText): bool
{
return preg_match(
'/\b(' . $this->config->getHistoryContextPattern() . ')\b/u',
$prompt
) === 1;
if ($this->isReferenceOnlyFollowUp($prompt)) {
return true;
}
if ($this->isTooGenericSearchText($searchText)) {
return true;
}
return preg_match('/\b(' . $this->config->getHistoryContextPattern() . ')\b/u', $prompt) === 1;
}
/**
 * Decides whether the reference context should contribute to the search text.
 *
 * True when the prompt matches the reference follow-up pattern, or, failing
 * that, when the search text extracted so far is too generic to search with.
 */
private function shouldUseReferenceContext(string $prompt, string $searchText): bool
{
    // Short-circuits: the generic-text check only runs when the prompt is not a follow-up.
    return $this->isReferenceOnlyFollowUp($prompt)
        || $this->isTooGenericSearchText($searchText);
}
/**
 * Checks whether the prompt contains a configured reference follow-up phrase.
 *
 * The configured pattern is embedded as `\b(<pattern>)\b` and matched in
 * UTF-8 mode against the given prompt.
 *
 * @return bool True when the pattern matches somewhere in the prompt; false
 *              when it does not match or when no pattern is configured.
 */
private function isReferenceOnlyFollowUp(string $prompt): bool
{
    $pattern = $this->config->getReferenceFollowUpPattern();

    // An empty pattern would compile to `\b()\b`, which matches at the first
    // word boundary of any prompt and would flag every query as a follow-up.
    if ($pattern === '') {
        return false;
    }

    return preg_match('/\b(' . $pattern . ')\b/u', $prompt) === 1;
}
/**
 * Tells whether a search text carries no specific search term.
 *
 * The text counts as too generic when it has no whitespace-separated tokens
 * at all, or when every token is one of the configured reference-only tokens.
 */
private function isTooGenericSearchText(string $searchText): bool
{
    // PREG_SPLIT_NO_EMPTY already guarantees that no empty token is produced.
    $words = preg_split('/\s+/u', $searchText, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false || $words === []) {
        return true;
    }

    $referenceOnly = array_fill_keys($this->config->getReferenceOnlyTokens(), true);

    // Any token outside the reference-only set makes the text specific.
    $specificWords = array_filter(
        $words,
        static fn(string $word): bool => !isset($referenceOnly[$word])
    );

    return $specificWords === [];
}
/**
 * Produces the search text for a reference context by passing the text the
 * context builds for itself through this parser's normalize() step.
 */
private function buildReferenceSearchText(CommerceReferenceContext $referenceContext): string
{
    $rawReferenceText = $referenceContext->buildReferenceSearchText();

    return $this->normalize($rawReferenceText);
}
private function extractLatestQuestionFromHistory(string $historyContext): string
@@ -256,6 +342,7 @@ final readonly class CommerceQueryParser
}
$questions = $matches[1] ?? [];
if ($questions === []) {
return '';
}
@@ -265,11 +352,11 @@ final readonly class CommerceQueryParser
return is_string($lastQuestion) ? trim($lastQuestion) : '';
}
private function mergeSearchTexts(string $historySearchText, string $currentSearchText): string
private function mergeSearchTexts(string $left, string $right): string
{
$tokens = [];
foreach ([$historySearchText, $currentSearchText] as $text) {
foreach ([$left, $right] as $text) {
if ($text === '') {
continue;
}
@@ -294,11 +381,25 @@ final readonly class CommerceQueryParser
*/
private function filterSearchTokens(array $tokens): array
{
$stopWords = $this->config->getFilterSearchTokensPattern();
$stopWords = array_fill_keys($this->config->getFilterSearchTokensPattern(), true);
return array_values(array_filter(
$tokens,
static fn(string $token): bool => !in_array($token, $stopWords, true)
static fn(string $token): bool => !isset($stopWords[$token])
));
}
/**
 * Removes every token that is listed in the configured reference-only tokens
 * and returns the remaining tokens re-indexed from zero, in their original order.
 *
 * @param string[] $tokens
 * @return string[]
 */
private function stripReferenceOnlyTokens(array $tokens): array
{
    // Flip the configured list into a lookup map so membership is a key check.
    $blocked = array_fill_keys($this->config->getReferenceOnlyTokens(), true);

    $isKept = static fn(string $candidate): bool => !isset($blocked[$candidate]);

    $remaining = array_filter($tokens, $isKept);

    // array_filter preserves keys; callers get a plain list back.
    return array_values($remaining);
}
@@ -318,11 +419,7 @@ final readonly class CommerceQueryParser
$tokens = preg_split('/\s+/u', $prompt, -1, PREG_SPLIT_NO_EMPTY) ?: [];
if (count($tokens) <= 4 && preg_match('/\d/u', $prompt) === 1) {
return true;
}
return false;
return count($tokens) <= 4 && preg_match('/\d/u', $prompt) === 1;
}
private function containsModelLikePhrase(string $text): bool