harden retrieval logic
bugfixes
This commit is contained in:
@@ -10,8 +10,7 @@ final readonly class QueryEnricher
|
||||
{
|
||||
/**
 * @param QueryEnricherConfig $config Configuration object; the enrichment
 *                                    mapping is read from it via getEnrichQueryList().
 */
public function __construct(
    private QueryEnricherConfig $config,
) {
}
|
||||
|
||||
/**
|
||||
@@ -19,58 +18,46 @@ final readonly class QueryEnricher
|
||||
*
|
||||
* Example:
|
||||
* - input: "water hardness device"
|
||||
* - output: "water hardness device | Synonyms: residual hardness, model"
|
||||
* - output: "water hardness device residual hardness model"
|
||||
*/
|
||||
public function enrichPrompt(string $query): string
{
    $originalQuery = trim($query);

    // Nothing to enrich for an empty or whitespace-only query.
    if ($originalQuery === '') {
        return '';
    }

    // The lookup maps each normalized phrase to its counterpart term in both
    // directions (key -> value and value -> key).
    $lookup = $this->buildBidirectionalLookup($this->config->getEnrichQueryList());
    $normalizedQuery = $this->normalizeForMatching($originalQuery);

    $matches = [];

    foreach ($lookup as $needle => $mappedValue) {
        // PHP stores numeric-string array keys such as "42" as integers, so
        // cast the key back to a string before treating it as a phrase.
        $needle = (string) $needle;

        if ($needle === '') {
            continue;
        }

        if ($this->containsWholePhrase($normalizedQuery, $needle)) {
            $matches[] = $mappedValue;
        }
    }

    // Drop blank entries and duplicates while preserving first-seen order.
    $matches = array_values(array_unique(array_filter(
        $matches,
        static fn(string $value): bool => trim($value) !== ''
    )));

    // Without any match the (trimmed) query is returned unchanged.
    if ($matches === []) {
        return $originalQuery;
    }

    // Append the counterpart terms as plain, space-separated words.
    return trim($originalQuery . ' ' . implode(' ', $matches));
}
|
||||
|
||||
/**
|
||||
* Normalizes a string for case-insensitive comparison.
|
||||
* Normalizes a string for case-insensitive matching.
|
||||
*/
|
||||
private function normalize(string $value): string
|
||||
{
|
||||
@@ -78,13 +65,29 @@ final readonly class QueryEnricher
|
||||
}
|
||||
|
||||
/**
 * Normalizes a string for phrase-aware matching.
 *
 * Every run of characters that is not a letter, combining mark or number is
 * collapsed into a single space. This keeps words searchable across spaces,
 * punctuation and hyphens.
 */
private function normalizeForMatching(string $value): string
{
    $value = $this->normalize($value);

    // \p{M} keeps combining marks (decomposed accents, vowel signs) attached
    // to their base letter instead of splitting the word at the mark.
    // preg_replace() returns null on a PCRE error (e.g. malformed UTF-8); in
    // that case fall back to the value as it was before the replacement.
    // Because whole runs are replaced by one space, no separate whitespace
    // collapsing step is needed afterwards.
    $value = preg_replace('/[^\p{L}\p{M}\p{N}]+/u', ' ', $value) ?? $value;

    return trim($value);
}
|
||||
|
||||
/**
 * Tells whether a normalized phrase occurs in a normalized query as a whole phrase.
 *
 * Both strings are padded with one space on each side before the substring
 * search, so the phrase only matches when it is delimited by spaces or by the
 * start/end of the query.
 */
private function containsWholePhrase(string $normalizedQuery, string $normalizedPhrase): bool
{
    $paddedQuery = " {$normalizedQuery} ";
    $paddedPhrase = " {$normalizedPhrase} ";

    // An empty query or phrase never counts as a match.
    return $normalizedQuery !== ''
        && $normalizedPhrase !== ''
        && str_contains($paddedQuery, $paddedPhrase);
}
|
||||
|
||||
/**
|
||||
@@ -112,19 +115,20 @@ final readonly class QueryEnricher
|
||||
$key = trim((string) $key);
|
||||
$value = trim((string) $value);
|
||||
|
||||
// Skip incomplete pairs.
|
||||
if ($key === '' || $value === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$normalizedKey = $this->normalize($key);
|
||||
$normalizedValue = $this->normalize($value);
|
||||
$normalizedKey = $this->normalizeForMatching($key);
|
||||
$normalizedValue = $this->normalizeForMatching($value);
|
||||
|
||||
// If the key is found in the query, return the value.
|
||||
$lookup[$normalizedKey] = $value;
|
||||
if ($normalizedKey !== '') {
|
||||
$lookup[$normalizedKey] = $value;
|
||||
}
|
||||
|
||||
// If the value is found in the query, return the key.
|
||||
$lookup[$normalizedValue] = $key;
|
||||
if ($normalizedValue !== '') {
|
||||
$lookup[$normalizedValue] = $key;
|
||||
}
|
||||
}
|
||||
|
||||
return $lookup;
|
||||
|
||||
Reference in New Issue
Block a user