From 4823752b3e580b541522522156a279220815eb51 Mon Sep 17 00:00:00 2001
From: team2
Date: Sat, 25 Apr 2026 21:41:39 +0200
Subject: [PATCH] optimize cleanup search query shop api

---
Review notes (text in this area is ignored when the patch is applied):
 * The one-record-per-line layout of the patch has been restored; every hunk
   body matches the line counts in its @@ header and the diffstat below.
 * Indentation is assumed to be 4-space PSR-12 -- TODO confirm against the
   target files, or apply with whitespace differences ignored.
 * NOTE(review): the From: header carries no e-mail address.
 * normalizeSearchTokens() lower-cases and trims each token, applies the
   correction map and then the canonical map, drops control tokens and
   de-duplicates while keeping first-seen order.
 * filterSearchTokens() becomes a plain delegate to normalizeSearchTokens().
 * applySearchTokenCorrections() replaces whole-word matches of each
   correction key in the free text, then re-applies the whitespace-collapse
   pattern.

 src/Commerce/CommerceQueryParser.php     | 62 +++++++++++++++++++----
 src/Config/CommerceQueryParserConfig.php | 64 ++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 11 deletions(-)

diff --git a/src/Commerce/CommerceQueryParser.php b/src/Commerce/CommerceQueryParser.php
index 87fc6f3..1e5f1d7 100644
--- a/src/Commerce/CommerceQueryParser.php
+++ b/src/Commerce/CommerceQueryParser.php
@@ -88,6 +88,7 @@ final readonly class CommerceQueryParser
         );
         $value = preg_replace($this->config->getPromptSanitizePattern(), ' ', $value) ?? $value;
         $value = preg_replace($this->config->getWhitespaceCollapsePattern(), ' ', $value) ?? $value;
+        $value = $this->applySearchTokenCorrections($value);
 
         return trim($value);
     }
@@ -222,7 +223,7 @@ final readonly class CommerceQueryParser
             fn(string $token): bool => mb_strlen($token) > $this->config->getMinSearchTokenLength()
         );
 
-        $tokens = $this->filterSearchTokens($tokens);
+        $tokens = $this->normalizeSearchTokens($tokens);
 
         return trim(implode(' ', $tokens));
     }
@@ -237,8 +238,7 @@ final readonly class CommerceQueryParser
             fn(string $token): bool => mb_strlen($token) >= $this->config->getMinDirectProductTokenLength()
         );
 
-        $tokens = $this->filterSearchTokens($tokens);
-        $tokens = array_values(array_unique($tokens));
+        $tokens = $this->normalizeSearchTokens($tokens);
 
         return trim(implode(' ', $tokens));
     }
@@ -282,11 +282,9 @@ final readonly class CommerceQueryParser
                     continue;
                 }
 
-                if ($this->isSearchControlToken($token)) {
-                    continue;
+                foreach ($this->normalizeSearchTokens([$token]) as $normalizedToken) {
+                    $tokens[$normalizedToken] = $normalizedToken;
                 }
-
-                $tokens[$token] = $token;
             }
         }
 
@@ -299,10 +297,52 @@ final readonly class CommerceQueryParser
      */
     private function filterSearchTokens(array $tokens): array
     {
-        return array_values(array_filter(
-            $tokens,
-            fn(string $token): bool => !$this->isSearchControlToken($token)
-        ));
+        return $this->normalizeSearchTokens($tokens);
+    }
+
+    /**
+     * @param string[] $tokens
+     * @return string[]
+     */
+    private function normalizeSearchTokens(array $tokens): array
+    {
+        $normalizedTokens = [];
+
+        foreach ($tokens as $token) {
+            $token = trim(mb_strtolower((string) $token, 'UTF-8'));
+
+            if ($token === '') {
+                continue;
+            }
+
+            $token = $this->config->getSearchTokenCorrections()[$token] ?? $token;
+            $token = $this->config->getSearchTokenCanonicalMap()[$token] ?? $token;
+
+            if ($this->isSearchControlToken($token)) {
+                continue;
+            }
+
+            $normalizedTokens[$token] = $token;
+        }
+
+        return array_values($normalizedTokens);
+    }
+
+    private function applySearchTokenCorrections(string $text): string
+    {
+        if ($text === '') {
+            return '';
+        }
+
+        foreach ($this->config->getSearchTokenCorrections() as $from => $to) {
+            $text = preg_replace(
+                '/\b' . preg_quote((string) $from, '/') . '\b/u',
+                (string) $to,
+                $text
+            ) ?? $text;
+        }
+
+        return preg_replace($this->config->getWhitespaceCollapsePattern(), ' ', $text) ?? $text;
     }
 
     private function isSearchControlToken(string $token): bool
diff --git a/src/Config/CommerceQueryParserConfig.php b/src/Config/CommerceQueryParserConfig.php
index ab26683..4dbe405 100644
--- a/src/Config/CommerceQueryParserConfig.php
+++ b/src/Config/CommerceQueryParserConfig.php
@@ -28,6 +28,8 @@ final class CommerceQueryParserConfig
             'suche',
             'habt ihr',
             'gibt es',
+            'gebe mir',
+            'gib mir',
             'zeige mir',
             'welches gerät',
             'welche gerät',
@@ -36,6 +38,8 @@ final class CommerceQueryParserConfig
             'welches ist am besten',
             'alternative',
             'alternativen',
+            'unter anderem',
+            'u a',
             'welche',
             'welcher',
             'welches',
@@ -100,6 +104,66 @@ final class CommerceQueryParserConfig
             'verfuegbarkeit',
             'prüfe',
             'pruefe',
+            'den',
+            'die',
+            'das',
+            'der',
+            'dem',
+            'des',
+            'und',
+            'oder',
+            'sowie',
+            'seine',
+            'seinen',
+            'seiner',
+            'seinem',
+            'seines',
+            'siene',
+            'sienen',
+            'siener',
+            'sienem',
+            'sienes',
+            'gebe',
+            'gib',
+            'nenne',
+            'nenn',
+            'preis',
+            'preise',
+            'preisen',
+            'kostet',
+            'kosten',
+            'ua',
+        ];
+    }
+
+    /**
+     * @return array
+     */
+    public function getSearchTokenCorrections(): array
+    {
+        return [
+            'siene' => 'seine',
+            'sienen' => 'seinen',
+            'siener' => 'seiner',
+            'sienem' => 'seinem',
+            'sienes' => 'seines',
+            'indicatoren' => 'indikatoren',
+        ];
+    }
+
+    /**
+     * @return array
+     */
+    public function getSearchTokenCanonicalMap(): array
+    {
+        return [
+            'indikatoren' => 'indikator',
+            'indicators' => 'indikator',
+            'indicator' => 'indikator',
+            'reagenzien' => 'reagenz',
+            'reagents' => 'reagenz',
+            'reagent' => 'reagenz',
+            'produkte' => 'produkt',
         ];
     }
 