224 lines
6.1 KiB
PHP
224 lines
6.1 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Knowledge\Retrieval;
|
|
|
|
use App\Config\QueryEnricherConfig;
|
|
|
|
/**
 * Appends mapped counterpart terms to a search query.
 *
 * Keep enrichment conservative: the enriched semantic query should help
 * vector retrieval, but must not become bloated enough to dilute the
 * original user intent.
 */
final readonly class QueryEnricher
{
    /**
     * @param QueryEnricherConfig $config Source of the term mapping (getEnrichQueryList())
     *                                    and of the expansion cap (getMaxExpansions()).
     */
    public function __construct(
        private QueryEnricherConfig $config,
    ) {
    }

    /**
     * Enriches the query with mapped counterpart terms.
     *
     * Design goals:
     * - preserve the original query unchanged at the front
     * - only append counterpart terms that are not already present
     * - prefer longer / more specific phrase matches over short generic matches
     * - keep the number of appended terms intentionally small
     *
     * Example:
     * - input: "water hardness device"
     * - output: "water hardness device residual hardness model"
     *
     * @param string $query Raw user query; surrounding whitespace is trimmed.
     *
     * @return string The trimmed query, followed by a space-separated list of
     *                counterpart terms when any apply. An empty/blank query yields ''.
     */
    public function enrichPrompt(string $query): string
    {
        $originalQuery = trim($query);
        $normalizedQuery = $this->normalizeForMatching($originalQuery);

        // Empty or punctuation-only input has nothing to match against.
        if ($normalizedQuery === '') {
            return $originalQuery;
        }

        // A cap of zero (or less) means "no enrichment"; previously one term
        // was still appended because the cap was only checked after appending.
        $maxExpansions = (int) $this->config->getMaxExpansions();

        if ($maxExpansions <= 0) {
            return $originalQuery;
        }

        $lookup = $this->buildBidirectionalLookup($this->config->getEnrichQueryList());

        if ($lookup === []) {
            return $originalQuery;
        }

        $expansions = $this->collectExpansions(
            $normalizedQuery,
            $this->sortLookupBySpecificity($lookup),
            $maxExpansions,
        );

        if ($expansions === []) {
            return $originalQuery;
        }

        return $originalQuery . ' ' . implode(' ', $expansions);
    }

    /**
     * Walks the lookup in its given order and gathers the terms to append.
     *
     * A term is appended when its needle occurs in the query as a whole phrase,
     * the term itself is not already part of the query, and an equivalent term
     * (after normalization) has not been collected before.
     *
     * @param string                    $normalizedQuery Query as returned by normalizeForMatching().
     * @param array<int|string, string> $lookup          Normalized needle => original mapped value.
     * @param int                       $maxExpansions   Upper bound on the number of returned terms.
     *
     * @return list<string> Terms to append, in lookup order.
     */
    private function collectExpansions(string $normalizedQuery, array $lookup, int $maxExpansions): array
    {
        $expansions = [];
        $seenNormalizedExpansions = [];

        foreach ($lookup as $needle => $mappedValue) {
            if (count($expansions) >= $maxExpansions) {
                break;
            }

            // PHP stores purely numeric string keys ("12") as integers. Cast back,
            // otherwise the string-typed helpers throw a TypeError under strict_types.
            $needle = (string) $needle;

            if (!$this->containsWholePhrase($normalizedQuery, $needle)) {
                continue;
            }

            $mappedValue = trim($mappedValue);
            $normalizedMappedValue = $this->normalizeForMatching($mappedValue);

            // Covers both an empty value and a value made of punctuation only.
            if ($normalizedMappedValue === '') {
                continue;
            }

            // Do not re-add information that is already present in the query.
            if ($this->containsWholePhrase($normalizedQuery, $normalizedMappedValue)) {
                continue;
            }

            if (isset($seenNormalizedExpansions[$normalizedMappedValue])) {
                continue;
            }

            $expansions[] = $mappedValue;
            $seenNormalizedExpansions[$normalizedMappedValue] = true;
        }

        return $expansions;
    }

    /**
     * Normalizes a string for case-insensitive matching (trim + UTF-8 lowercase).
     */
    private function normalize(string $value): string
    {
        return mb_strtolower(trim($value), 'UTF-8');
    }

    /**
     * Normalizes a string for phrase-aware matching.
     *
     * Lowercases the value and replaces every run of characters that are neither
     * Unicode letters nor digits with a single space, so words stay searchable
     * across spaces, punctuation and hyphens.
     */
    private function normalizeForMatching(string $value): string
    {
        $value = $this->normalize($value);

        // preg_replace() returns null on failure; keep the previous value then.
        $value = preg_replace('/[^\p{L}\p{N}]+/u', ' ', $value) ?? $value;
        $value = preg_replace('/\s+/u', ' ', $value) ?? $value;

        return trim($value);
    }

    /**
     * Checks whether a normalized phrase exists as a full phrase in a normalized query.
     *
     * Both arguments are padded with spaces so that "hard" does not match
     * inside "hardness". Empty arguments never match.
     */
    private function containsWholePhrase(string $normalizedQuery, string $normalizedPhrase): bool
    {
        if ($normalizedQuery === '' || $normalizedPhrase === '') {
            return false;
        }

        return str_contains(' ' . $normalizedQuery . ' ', ' ' . $normalizedPhrase . ' ');
    }

    /**
     * Builds a lookup table that works in both directions.
     *
     * Example:
     * [
     *     'trousers' => 'jeans',
     *     'jacket' => 'coat',
     * ]
     *
     * becomes:
     * [
     *     'trousers' => 'jeans',
     *     'jeans' => 'trousers',
     *     'jacket' => 'coat',
     *     'coat' => 'jacket',
     * ]
     *
     * The first rule registered for a needle wins; later duplicates are ignored.
     * Entries whose key or value is empty, or whose value cannot be converted
     * to a string (arrays, non-stringable objects), are skipped.
     *
     * @param array<array-key, mixed> $mapping Term => counterpart term.
     *
     * @return array<int|string, string> '<normalized needle>' => '<original mapped value>'.
     *                                   Purely numeric needles end up as integer keys.
     */
    private function buildBidirectionalLookup(array $mapping): array
    {
        $lookup = [];

        foreach ($mapping as $key => $value) {
            // Casting an array would emit a warning and produce the bogus term
            // "Array"; casting a plain object would throw.
            if (!is_scalar($value) && !$value instanceof \Stringable) {
                continue;
            }

            $key = trim((string) $key);
            $value = trim((string) $value);

            if ($key === '' || $value === '') {
                continue;
            }

            $normalizedKey = $this->normalizeForMatching($key);
            $normalizedValue = $this->normalizeForMatching($value);

            if ($normalizedKey !== '' && !isset($lookup[$normalizedKey])) {
                $lookup[$normalizedKey] = $value;
            }

            if ($normalizedValue !== '' && !isset($lookup[$normalizedValue])) {
                $lookup[$normalizedValue] = $key;
            }
        }

        return $lookup;
    }

    /**
     * Sorts phrase rules by specificity so longer / more precise phrases win first.
     *
     * Priority:
     * 1. more words
     * 2. longer character length
     * 3. lexical order for deterministic output
     *
     * @param array<int|string, string> $lookup
     *
     * @return array<int|string, string>
     */
    private function sortLookupBySpecificity(array $lookup): array
    {
        uksort($lookup, static function (int|string $a, int|string $b): int {
            // Numeric needles arrive as integers; compare everything as strings.
            $a = (string) $a;
            $b = (string) $b;

            // Needles are normalized, so words are separated by exactly one space.
            return [substr_count($b, ' '), mb_strlen($b, 'UTF-8')]
                    <=> [substr_count($a, ' '), mb_strlen($a, 'UTF-8')]
                ?: strcmp($a, $b);
        });

        return $lookup;
    }
}