lexical logic
This commit is contained in:
@@ -8,6 +8,14 @@ use App\Config\QueryEnricherConfig;
|
||||
|
||||
final readonly class QueryEnricher
|
||||
{
|
||||
/**
|
||||
* Keep enrichment conservative.
|
||||
*
|
||||
* The enriched semantic query should help vector retrieval,
|
||||
* but must not become bloated enough to dilute the original user intent.
|
||||
*/
|
||||
private const MAX_EXPANSIONS = 4;
|
||||
|
||||
public function __construct(
|
||||
private QueryEnricherConfig $config
|
||||
) {
|
||||
@@ -16,6 +24,12 @@ final readonly class QueryEnricher
|
||||
/**
|
||||
* Enriches the query with mapped counterpart terms.
|
||||
*
|
||||
* Design goals:
|
||||
* - preserve the original query unchanged at the front
|
||||
* - only append counterpart terms that are not already present
|
||||
* - prefer longer / more specific phrase matches over short generic matches
|
||||
* - keep the number of appended terms intentionally small
|
||||
*
|
||||
* Example:
|
||||
* - input: "water hardness device"
|
||||
* - output: "water hardness device residual hardness model"
|
||||
@@ -29,26 +43,63 @@ final readonly class QueryEnricher
|
||||
}
|
||||
|
||||
$mapping = $this->config->getEnrichQueryList();
|
||||
|
||||
if ($mapping === []) {
|
||||
return $originalQuery;
|
||||
}
|
||||
|
||||
$lookup = $this->buildBidirectionalLookup($mapping);
|
||||
|
||||
if ($lookup === []) {
|
||||
return $originalQuery;
|
||||
}
|
||||
|
||||
$lookup = $this->sortLookupBySpecificity($lookup);
|
||||
$normalizedQuery = $this->normalizeForMatching($originalQuery);
|
||||
|
||||
$matches = [];
|
||||
if ($normalizedQuery === '') {
|
||||
return $originalQuery;
|
||||
}
|
||||
|
||||
foreach ($lookup as $needle => $mappedValue) {
|
||||
if ($needle === '') {
|
||||
$matches = [];
|
||||
$seenNormalizedExpansions = [];
|
||||
|
||||
foreach ($lookup as $normalizedNeedle => $mappedValue) {
|
||||
if ($normalizedNeedle === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($this->containsWholePhrase($normalizedQuery, $needle)) {
|
||||
$matches[] = $mappedValue;
|
||||
if (!$this->containsWholePhrase($normalizedQuery, $normalizedNeedle)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$mappedValue = trim($mappedValue);
|
||||
if ($mappedValue === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$normalizedMappedValue = $this->normalizeForMatching($mappedValue);
|
||||
if ($normalizedMappedValue === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Do not re-add information that is already present in the query.
|
||||
if ($this->containsWholePhrase($normalizedQuery, $normalizedMappedValue)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isset($seenNormalizedExpansions[$normalizedMappedValue])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$matches[] = $mappedValue;
|
||||
$seenNormalizedExpansions[$normalizedMappedValue] = true;
|
||||
|
||||
if (count($matches) >= self::MAX_EXPANSIONS) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
$matches = array_values(array_unique(array_filter(
|
||||
$matches,
|
||||
static fn(string $value): bool => trim($value) !== ''
|
||||
)));
|
||||
|
||||
if ($matches === []) {
|
||||
return $originalQuery;
|
||||
}
|
||||
@@ -106,6 +157,11 @@ final readonly class QueryEnricher
|
||||
* 'jacket' => 'coat',
|
||||
* 'coat' => 'jacket',
|
||||
* ]
|
||||
*
|
||||
* Returned format:
|
||||
* [
|
||||
* '<normalized needle>' => '<original mapped value>',
|
||||
* ]
|
||||
*/
|
||||
private function buildBidirectionalLookup(array $mapping): array
|
||||
{
|
||||
@@ -122,15 +178,49 @@ final readonly class QueryEnricher
|
||||
$normalizedKey = $this->normalizeForMatching($key);
|
||||
$normalizedValue = $this->normalizeForMatching($value);
|
||||
|
||||
if ($normalizedKey !== '') {
|
||||
if ($normalizedKey !== '' && !isset($lookup[$normalizedKey])) {
|
||||
$lookup[$normalizedKey] = $value;
|
||||
}
|
||||
|
||||
if ($normalizedValue !== '') {
|
||||
if ($normalizedValue !== '' && !isset($lookup[$normalizedValue])) {
|
||||
$lookup[$normalizedValue] = $key;
|
||||
}
|
||||
}
|
||||
|
||||
return $lookup;
|
||||
}
|
||||
|
||||
/**
 * Reorders the lookup so the most specific needles are iterated first.
 *
 * Keys are compared with the following rules, each one only consulted
 * when the previous rule resulted in a tie:
 *  1. a key with more words (counted as number of space characters + 1) comes first
 *  2. a key with more characters (UTF-8 aware) comes first
 *  3. byte-wise string comparison, ascending, to keep the output deterministic
 *
 * Only the iteration order changes; every value stays attached to its key.
 *
 * @param array<string, string> $lookup
 * @return array<string, string>
 */
private function sortLookupBySpecificity(array $lookup): array
{
    $countWords = static fn(string $phrase): int => substr_count($phrase, ' ') + 1;
    $countChars = static fn(string $phrase): int => mb_strlen($phrase, 'UTF-8');

    uksort(
        $lookup,
        static function (string $left, string $right) use ($countWords, $countChars): int {
            // Operands are swapped on purpose for the first two criteria:
            // the larger value has to sort in front (descending order).
            // A result of 0 is falsy, so the chain falls through to the next criterion.
            return ($countWords($right) <=> $countWords($left))
                ?: ($countChars($right) <=> $countChars($left))
                ?: strcmp($left, $right);
        }
    );

    return $lookup;
}
|
||||
}
|
||||
Reference in New Issue
Block a user