p98
This commit is contained in:
@@ -118,6 +118,11 @@ final class NdjsonHybridRetrieverConfig
|
||||
return $this->requiredInt('exact_document_max_chunks', 1);
|
||||
}
|
||||
|
||||
/**
 * Returns the configured value stored under the `query_cleanup_profile` key.
 *
 * The lookup is delegated to requiredString(); how a missing or invalid
 * value is handled is decided there.
 */
public function queryCleanupProfile(): string
{
    $profileName = $this->requiredString('query_cleanup_profile');

    return $profileName;
}
|
||||
|
||||
public function focusedProductWindow(): int
|
||||
{
|
||||
return $this->requiredInt('focused_product_window', 1);
|
||||
@@ -350,6 +355,7 @@ final class NdjsonHybridRetrieverConfig
|
||||
'dominant_doc_min_hits' => $this->dominantDocMinHits(),
|
||||
'dominant_doc_max_chunks' => $this->dominantDocMaxChunks(),
|
||||
'exact_document_max_chunks' => $this->exactDocumentMaxChunks(),
|
||||
'query_cleanup_profile' => $this->queryCleanupProfile(),
|
||||
'focused_product_window' => $this->focusedProductWindow(),
|
||||
'focused_product_min_score' => $this->focusedProductMinScore(),
|
||||
'focused_product_min_gap' => $this->focusedProductMinGap(),
|
||||
|
||||
@@ -49,7 +49,6 @@ final readonly class RetriexEffectiveConfigProvider
|
||||
'llm' => [
|
||||
'timeout_seconds' => $this->param('retriex.llm.timeout_seconds'),
|
||||
'num_predict' => $this->param('retriex.llm.num_predict'),
|
||||
'call_models' => $this->param('retriex.llm.call_models'),
|
||||
],
|
||||
'retrieval' => $this->retrievalConfig(),
|
||||
'prompt' => $this->promptConfig(),
|
||||
@@ -86,7 +85,6 @@ final readonly class RetriexEffectiveConfigProvider
|
||||
$this->validateRuntime($config['runtime'], $errors, $warnings);
|
||||
$this->validateIndex($config['index'], $errors, $warnings);
|
||||
$this->validateModel($config['model_generation'], $errors, $warnings);
|
||||
$this->validateLlm($config['llm'], $errors, $warnings);
|
||||
$this->validateRetrieval($config['retrieval'], $errors, $warnings);
|
||||
$this->validatePrompt($config['prompt'], $errors, $warnings);
|
||||
$this->validateAgent($config['agent'], $errors, $warnings);
|
||||
@@ -1716,46 +1714,6 @@ final readonly class RetriexEffectiveConfigProvider
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Validates the `llm.call_models` section of the effective configuration.
 *
 * Findings are appended to the two by-reference lists; nothing is returned
 * and nothing is thrown. A missing or null `call_models` entry is treated
 * as an empty map.
 *
 * @param array<string, mixed> $llm      LLM config section; only `call_models` is inspected.
 * @param list<string>         $errors   Receives hard validation failures.
 * @param list<string>         $warnings Receives non-fatal findings.
 */
private function validateLlm(array $llm, array &$errors, array &$warnings): void
{
    $modelsByCall = $llm['call_models'] ?? [];

    if (!is_array($modelsByCall)) {
        $errors[] = 'llm.call_models must be a map.';

        return;
    }

    // Lookup table of the call names this validator recognises.
    $recognisedCalls = [
        'input_normalization' => true,
        'shop_query_optimization' => true,
        'final_answer' => true,
    ];

    foreach ($modelsByCall as $call => $model) {
        // Integer keys (plain lists, numeric strings) and blank names are rejected outright.
        $hasUsableName = is_string($call) && trim($call) !== '';
        if (!$hasUsableName) {
            $errors[] = 'llm.call_models contains an invalid call name.';
            continue;
        }

        // Unknown names only warn; the model value is still checked below.
        if (!isset($recognisedCalls[$call])) {
            $warnings[] = 'llm.call_models contains an unknown call name: ' . $call . '.';
        }

        // null is an accepted value and needs no further checks.
        if ($model === null) {
            continue;
        }

        if (!is_string($model)) {
            $errors[] = 'llm.call_models.' . $call . ' must be null or a string model name.';
            continue;
        }

        if (trim($model) === '') {
            $warnings[] = 'llm.call_models.' . $call . ' is empty and will use the default model.';
        }
    }
}
|
||||
|
||||
/**
|
||||
* @param array<string, mixed> $retrieval
|
||||
* @param list<string> $errors
|
||||
@@ -1782,6 +1740,13 @@ final readonly class RetriexEffectiveConfigProvider
|
||||
$errors[] = 'retrieval.generic_exact_selection_cleanup_profile references unknown language cleanup profile: ' . trim($cleanupProfile) . '.';
|
||||
}
|
||||
|
||||
$queryCleanupProfile = $retrieval['query_cleanup_profile'] ?? null;
|
||||
if (!is_string($queryCleanupProfile) || trim($queryCleanupProfile) === '') {
|
||||
$errors[] = 'retrieval.query_cleanup_profile must be a non-empty string.';
|
||||
} elseif (!in_array(trim($queryCleanupProfile), $this->languageCleanupConfig->getCleanupProfileNames(), true)) {
|
||||
$errors[] = 'retrieval.query_cleanup_profile references unknown language cleanup profile: ' . trim($queryCleanupProfile) . '.';
|
||||
}
|
||||
|
||||
$this->validateStringListMap($retrieval['vocabulary'] ?? [], 'retrieval.vocabulary', $errors, $warnings);
|
||||
|
||||
$inventory = $retrieval['inventory_parameter'] ?? [];
|
||||
|
||||
@@ -357,7 +357,11 @@ final readonly class NdjsonChunkLookup
|
||||
continue;
|
||||
}
|
||||
|
||||
if (mb_strlen($token, 'UTF-8') < 3 && preg_match('/\d/u', $token) !== 1) {
|
||||
if (
|
||||
mb_strlen($token, 'UTF-8') < 3
|
||||
&& preg_match('/\d/u', $token) !== 1
|
||||
&& !$this->isImportantShortTitleToken($token)
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -367,6 +371,15 @@ final readonly class NdjsonChunkLookup
|
||||
return array_values(array_unique($out));
|
||||
}
|
||||
|
||||
/**
 * Tells whether a short token is listed among the retriever's important short model tokens.
 *
 * Only non-empty tokens of fewer than three UTF-8 characters can qualify;
 * the membership check against importantShortModelTokens() is strict.
 */
private function isImportantShortTitleToken(string $token): bool
{
    $length = mb_strlen($token, 'UTF-8');
    $isShortButNotEmpty = $length > 0 && $length < 3;

    return $isShortButNotEmpty
        && in_array($token, $this->retrieverConfig->importantShortModelTokens(), true);
}
|
||||
|
||||
/**
|
||||
* @return array<string,bool>
|
||||
*/
|
||||
|
||||
@@ -5,13 +5,15 @@ declare(strict_types=1);
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Config\LanguageCleanupConfig;
|
||||
use App\Config\NdjsonHybridRetrieverConfig;
|
||||
use App\Knowledge\StopWords;
|
||||
|
||||
final readonly class QueryCleaner
|
||||
{
|
||||
public function __construct(
|
||||
private StopWords $stopWords,
|
||||
private LanguageCleanupConfig $languageCleanupConfig
|
||||
private LanguageCleanupConfig $languageCleanupConfig,
|
||||
private NdjsonHybridRetrieverConfig $retrieverConfig
|
||||
) {
|
||||
}
|
||||
|
||||
@@ -21,9 +23,8 @@ final readonly class QueryCleaner
|
||||
* Important:
|
||||
* - Unicode-safe
|
||||
* - Numbers are preserved
|
||||
* - Negations are preserved
|
||||
* - No aggressive token-length filtering
|
||||
* - Stop words are removed
|
||||
* - Negations are preserved by protected-term aware cleanup profiles
|
||||
* - Stop words are resolved from the generic legacy list plus YAML cleanup profile terms
|
||||
*/
|
||||
public function clean(string $query): string
|
||||
{
|
||||
@@ -31,49 +32,49 @@ final readonly class QueryCleaner
|
||||
return '';
|
||||
}
|
||||
|
||||
// 1. Convert to lowercase in a Unicode-safe way
|
||||
$profile = $this->loadCleanupProfile();
|
||||
|
||||
// 1. Convert to lowercase in a Unicode-safe way.
|
||||
$query = mb_strtolower($query, 'UTF-8');
|
||||
|
||||
// 2. Treat hyphens and slashes as word separators
|
||||
// 2. Treat hyphens and slashes as word separators.
|
||||
$query = $this->languageCleanupConfig->replaceWordSeparatorsWithSpace($query);
|
||||
|
||||
// 3. Remove special characters, but keep:
|
||||
// - letters
|
||||
// - numbers
|
||||
// - other Unicode letters
|
||||
// 3. Remove configured cleanup phrases before punctuation stripping.
|
||||
$query = $this->removePhrases($query, $profile['phrases']);
|
||||
|
||||
// 4. Remove special characters, but keep letters, numbers and other Unicode letters.
|
||||
$query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query);
|
||||
|
||||
if ($query === null) {
|
||||
return '';
|
||||
}
|
||||
|
||||
// 4. Normalize multiple whitespace characters
|
||||
// 5. Normalize multiple whitespace characters.
|
||||
$query = preg_replace('/\s+/u', ' ', $query);
|
||||
$query = trim($query);
|
||||
$query = trim((string) $query);
|
||||
|
||||
if ($query === '') {
|
||||
return '';
|
||||
}
|
||||
|
||||
// 5. Tokenize the query
|
||||
$tokens = preg_split('/\s+/u', $query);
|
||||
|
||||
if ($tokens === false) {
|
||||
return '';
|
||||
}
|
||||
|
||||
$profileTerms = array_fill_keys(array_merge($profile['stopwords'], $profile['meta_terms']), true);
|
||||
$cleanTokens = [];
|
||||
|
||||
foreach ($tokens as $token) {
|
||||
|
||||
$token = trim($token);
|
||||
|
||||
if ($token === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Remove stop words
|
||||
if ($this->stopWords->isStopWord($token)) {
|
||||
if ($this->stopWords->isStopWord($token) || isset($profileTerms[$token])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -86,4 +87,42 @@ final readonly class QueryCleaner
|
||||
|
||||
return implode(' ', $cleanTokens);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Fetches the cleanup profile whose name is configured on the retriever config.
 *
 * The profile name comes from queryCleanupProfile() and is handed to
 * LanguageCleanupConfig::getCleanupProfile() unchanged.
 *
 * @return array{stopwords:string[], phrases:string[], meta_terms:string[], protected_terms:string[]}
 */
private function loadCleanupProfile(): array
{
    $profileName = $this->retrieverConfig->queryCleanupProfile();

    return $this->languageCleanupConfig->getCleanupProfile($profileName);
}
|
||||
|
||||
/**
 * Replaces every occurrence of the given phrases in the query with a single space.
 *
 * Each phrase is lowercased, trimmed, passed through
 * replaceWordSeparatorsWithSpace() and split on whitespace. The resulting
 * words are matched with any run of whitespace between them. A match must
 * not be directly preceded or followed by a Unicode letter. Phrases that
 * are empty after normalization are skipped, and a failed replacement
 * leaves the query as it was.
 *
 * @param string[] $phrases
 */
private function removePhrases(string $query, array $phrases): string
{
    $result = $query;

    foreach ($phrases as $rawPhrase) {
        $needle = trim(mb_strtolower($rawPhrase, 'UTF-8'));

        if ($needle === '') {
            continue;
        }

        $words = preg_split(
            '/\s+/u',
            $this->languageCleanupConfig->replaceWordSeparatorsWithSpace($needle),
            -1,
            PREG_SPLIT_NO_EMPTY
        );

        // A split failure (false) and an empty word list are both skipped.
        if ($words === false || $words === []) {
            continue;
        }

        // Escape each word so it is matched literally inside the pattern.
        $quotedWords = [];
        foreach ($words as $word) {
            $quotedWords[] = preg_quote($word, '/');
        }

        $pattern = implode('\\s+', $quotedWords);
        $stripped = preg_replace('/(?<!\p{L})(?:' . $pattern . ')(?!\p{L})/u', ' ', $result);

        // preg_replace() yields null on error; keep the previous text in that case.
        if ($stripped !== null) {
            $result = $stripped;
        }
    }

    return $result;
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user