This commit is contained in:
team 1
2026-05-12 07:53:49 +02:00
parent aa80acb10f
commit e072a8e15e
8 changed files with 175 additions and 60 deletions

View File

@@ -118,6 +118,11 @@ final class NdjsonHybridRetrieverConfig
return $this->requiredInt('exact_document_max_chunks', 1);
}
/**
 * Name of the cleanup profile configured under the `query_cleanup_profile` key.
 *
 * The value is read through requiredString(); whatever that helper does for a
 * missing or non-string entry applies here as well.
 */
public function queryCleanupProfile(): string
{
    $profileName = $this->requiredString('query_cleanup_profile');

    return $profileName;
}
public function focusedProductWindow(): int
{
return $this->requiredInt('focused_product_window', 1);
@@ -350,6 +355,7 @@ final class NdjsonHybridRetrieverConfig
'dominant_doc_min_hits' => $this->dominantDocMinHits(),
'dominant_doc_max_chunks' => $this->dominantDocMaxChunks(),
'exact_document_max_chunks' => $this->exactDocumentMaxChunks(),
'query_cleanup_profile' => $this->queryCleanupProfile(),
'focused_product_window' => $this->focusedProductWindow(),
'focused_product_min_score' => $this->focusedProductMinScore(),
'focused_product_min_gap' => $this->focusedProductMinGap(),

View File

@@ -49,7 +49,6 @@ final readonly class RetriexEffectiveConfigProvider
'llm' => [
'timeout_seconds' => $this->param('retriex.llm.timeout_seconds'),
'num_predict' => $this->param('retriex.llm.num_predict'),
'call_models' => $this->param('retriex.llm.call_models'),
],
'retrieval' => $this->retrievalConfig(),
'prompt' => $this->promptConfig(),
@@ -86,7 +85,6 @@ final readonly class RetriexEffectiveConfigProvider
$this->validateRuntime($config['runtime'], $errors, $warnings);
$this->validateIndex($config['index'], $errors, $warnings);
$this->validateModel($config['model_generation'], $errors, $warnings);
$this->validateLlm($config['llm'], $errors, $warnings);
$this->validateRetrieval($config['retrieval'], $errors, $warnings);
$this->validatePrompt($config['prompt'], $errors, $warnings);
$this->validateAgent($config['agent'], $errors, $warnings);
@@ -1716,46 +1714,6 @@ final readonly class RetriexEffectiveConfigProvider
}
}
/**
 * Checks the `llm.call_models` map and records findings in the given collectors.
 *
 * A missing `call_models` entry is treated as an empty map. A non-array value is
 * reported as an error and stops further checks. For every entry:
 * - a key that is not a non-blank string is an error and the entry is skipped;
 * - a key outside the known call names only produces a warning;
 * - a value that is neither null nor a string is an error;
 * - a blank string value produces a warning.
 *
 * @param array<string, mixed> $llm
 * @param list<string> $errors   Receives appended error messages.
 * @param list<string> $warnings Receives appended warning messages.
 */
private function validateLlm(array $llm, array &$errors, array &$warnings): void
{
    $overrides = $llm['call_models'] ?? [];

    if (!is_array($overrides)) {
        $errors[] = 'llm.call_models must be a map.';

        return;
    }

    // Key lookup set; by the time it is consulted the call name is known to be a string.
    $recognisedCalls = array_fill_keys(
        ['input_normalization', 'shop_query_optimization', 'final_answer'],
        true
    );

    foreach ($overrides as $call => $model) {
        $hasUsableName = is_string($call) && trim($call) !== '';

        if (!$hasUsableName) {
            $errors[] = 'llm.call_models contains an invalid call name.';
            continue;
        }

        if (!isset($recognisedCalls[$call])) {
            $warnings[] = 'llm.call_models contains an unknown call name: ' . $call . '.';
        }

        // null is accepted silently.
        if ($model === null) {
            continue;
        }

        if (!is_string($model)) {
            $errors[] = 'llm.call_models.' . $call . ' must be null or a string model name.';
            continue;
        }

        if (trim($model) === '') {
            $warnings[] = 'llm.call_models.' . $call . ' is empty and will use the default model.';
        }
    }
}
/**
* @param array<string, mixed> $retrieval
* @param list<string> $errors
@@ -1782,6 +1740,13 @@ final readonly class RetriexEffectiveConfigProvider
$errors[] = 'retrieval.generic_exact_selection_cleanup_profile references unknown language cleanup profile: ' . trim($cleanupProfile) . '.';
}
$queryCleanupProfile = $retrieval['query_cleanup_profile'] ?? null;
if (!is_string($queryCleanupProfile) || trim($queryCleanupProfile) === '') {
$errors[] = 'retrieval.query_cleanup_profile must be a non-empty string.';
} elseif (!in_array(trim($queryCleanupProfile), $this->languageCleanupConfig->getCleanupProfileNames(), true)) {
$errors[] = 'retrieval.query_cleanup_profile references unknown language cleanup profile: ' . trim($queryCleanupProfile) . '.';
}
$this->validateStringListMap($retrieval['vocabulary'] ?? [], 'retrieval.vocabulary', $errors, $warnings);
$inventory = $retrieval['inventory_parameter'] ?? [];

View File

@@ -357,7 +357,11 @@ final readonly class NdjsonChunkLookup
continue;
}
if (mb_strlen($token, 'UTF-8') < 3 && preg_match('/\d/u', $token) !== 1) {
if (
mb_strlen($token, 'UTF-8') < 3
&& preg_match('/\d/u', $token) !== 1
&& !$this->isImportantShortTitleToken($token)
) {
continue;
}
@@ -367,6 +371,15 @@ final readonly class NdjsonChunkLookup
return array_values(array_unique($out));
}
/**
 * Reports whether a token of one or two characters appears in the retriever
 * configuration's list of important short model tokens.
 *
 * Empty tokens and tokens of three or more characters are always rejected
 * without consulting the configuration. The list comparison is strict.
 */
private function isImportantShortTitleToken(string $token): bool
{
    if ($token === '') {
        return false;
    }

    if (mb_strlen($token, 'UTF-8') > 2) {
        return false;
    }

    $importantTokens = $this->retrieverConfig->importantShortModelTokens();

    return in_array($token, $importantTokens, true);
}
/**
* @return array<string,bool>
*/

View File

@@ -5,13 +5,15 @@ declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Config\LanguageCleanupConfig;
use App\Config\NdjsonHybridRetrieverConfig;
use App\Knowledge\StopWords;
final readonly class QueryCleaner
{
public function __construct(
private StopWords $stopWords,
private LanguageCleanupConfig $languageCleanupConfig
private LanguageCleanupConfig $languageCleanupConfig,
private NdjsonHybridRetrieverConfig $retrieverConfig
) {
}
@@ -21,9 +23,8 @@ final readonly class QueryCleaner
* Important:
* - Unicode-safe
* - Numbers are preserved
* - Negations are preserved
* - No aggressive token-length filtering
* - Stop words are removed
* - Negations are preserved by protected-term aware cleanup profiles
* - Stop words are resolved from the generic legacy list plus YAML cleanup profile terms
*/
public function clean(string $query): string
{
@@ -31,49 +32,49 @@ final readonly class QueryCleaner
return '';
}
// 1. Convert to lowercase in a Unicode-safe way
$profile = $this->loadCleanupProfile();
// 1. Convert to lowercase in a Unicode-safe way.
$query = mb_strtolower($query, 'UTF-8');
// 2. Treat hyphens and slashes as word separators
// 2. Treat hyphens and slashes as word separators.
$query = $this->languageCleanupConfig->replaceWordSeparatorsWithSpace($query);
// 3. Remove special characters, but keep:
// - letters
// - numbers
// - other Unicode letters
// 3. Remove configured cleanup phrases before punctuation stripping.
$query = $this->removePhrases($query, $profile['phrases']);
// 4. Remove special characters, but keep letters, numbers and other Unicode letters.
$query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query);
if ($query === null) {
return '';
}
// 4. Normalize multiple whitespace characters
// 5. Normalize multiple whitespace characters.
$query = preg_replace('/\s+/u', ' ', $query);
$query = trim($query);
$query = trim((string) $query);
if ($query === '') {
return '';
}
// 5. Tokenize the query
$tokens = preg_split('/\s+/u', $query);
if ($tokens === false) {
return '';
}
$profileTerms = array_fill_keys(array_merge($profile['stopwords'], $profile['meta_terms']), true);
$cleanTokens = [];
foreach ($tokens as $token) {
$token = trim($token);
if ($token === '') {
continue;
}
// Remove stop words
if ($this->stopWords->isStopWord($token)) {
if ($this->stopWords->isStopWord($token) || isset($profileTerms[$token])) {
continue;
}
@@ -86,4 +87,42 @@ final readonly class QueryCleaner
return implode(' ', $cleanTokens);
}
}
/**
 * Fetches the cleanup profile whose name the retriever configuration returns
 * from queryCleanupProfile().
 *
 * @return array{stopwords:string[], phrases:string[], meta_terms:string[], protected_terms:string[]}
 */
private function loadCleanupProfile(): array
{
    $profileName = $this->retrieverConfig->queryCleanupProfile();

    return $this->languageCleanupConfig->getCleanupProfile($profileName);
}
/**
 * Removes every given cleanup phrase from the query, replacing each match with a single space.
 *
 * Each phrase is lower-cased, trimmed and passed through the same word-separator
 * replacement as the query, then split on whitespace so its words match across any
 * run of whitespace in $query. Whitespace in the result is not collapsed here.
 *
 * Matches are anchored on token boundaries. The cleaner keeps both letters and
 * numbers as token characters, so a match may neither start nor end next to a
 * letter or a digit; otherwise a phrase such as "x" would cut "4x4" apart, or
 * "typ 2" would strip the front of "typ 25".
 *
 * @param string[] $phrases Phrases to remove; entries that are blank after trimming are ignored.
 *
 * @return string The query with all phrase matches blanked out.
 */
private function removePhrases(string $query, array $phrases): string
{
    foreach ($phrases as $phrase) {
        $phrase = trim(mb_strtolower($phrase, 'UTF-8'));
        if ($phrase === '') {
            continue;
        }

        $normalizedPhrase = $this->languageCleanupConfig->replaceWordSeparatorsWithSpace($phrase);
        $parts = preg_split('/\s+/u', $normalizedPhrase, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        if ($parts === []) {
            continue;
        }

        // Quote each word so phrase text is matched literally, never as regex syntax.
        $pattern = implode('\\s+', array_map(
            static fn (string $part): string => preg_quote($part, '/'),
            $parts
        ));

        // Letters and digits both count as token characters for the boundary check.
        // On a PCRE failure preg_replace() returns null; keep the query unchanged then.
        $query = preg_replace(
            '/(?<![\p{L}\p{N}])(?:' . $pattern . ')(?![\p{L}\p{N}])/u',
            ' ',
            $query
        ) ?? $query;
    }

    return $query;
}
}