Optimize data retrieval via custom fields and query enricher

This commit is contained in:
team 1
2026-04-13 16:11:19 +02:00
parent 0a05ccaee3
commit f7685c6fb5
10 changed files with 234 additions and 32 deletions

View File

@@ -41,8 +41,11 @@ final class NdjsonHybridRetriever implements RetrieverInterface
private readonly SalesIntentLite $salesIntentLite,
private readonly CatalogIntentLite $catalogIntent,
private readonly IntentRouteResolver $routeResolver,
private readonly EntityCatalogService $entityCatalogService
) {}
private readonly EntityCatalogService $entityCatalogService,
private readonly QueryEnricher $queryEnricher,
)
{
}
// =========================================================
// PUBLIC API
@@ -126,10 +129,11 @@ final class NdjsonHybridRetriever implements RetrieverInterface
// =========================================================
private function execute(
string $prompt,
string $prompt,
ModelGenerationConfig $config,
bool $withScores
): array {
bool $withScores
): array
{
$entityLabel = $this->catalogIntent->detect($prompt);
$salesIntent = $this->detectSalesIntent($prompt);
@@ -195,11 +199,12 @@ final class NdjsonHybridRetriever implements RetrieverInterface
// =========================================================
private function runCore(
string $prompt,
string $prompt,
ModelGenerationConfig $config,
bool $withScores,
string $salesIntent
): array {
bool $withScores,
string $salesIntent
): array
{
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
@@ -207,6 +212,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$isListQuery = $this->intentLite->isListQuery($prompt);
$cleanQuery = $this->queryCleaner->clean($prompt);
$cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery);
if ($cleanQuery === '') {
return [
@@ -316,9 +322,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface
private function computeThresholdAndTopK(
string $salesIntent,
bool $isListQuery,
int $vectorTopKBase
): array {
bool $isListQuery,
int $vectorTopKBase
): array
{
$threshold = self::VECTOR_SCORE_THRESHOLD;
$topK = $vectorTopKBase;
@@ -344,9 +351,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface
array $globalHits,
array $scopedHits,
float $threshold,
bool $boostScoped,
bool $captureRaw
): array {
bool $boostScoped,
bool $captureRaw
): array
{
$rrfScores = [];
$rawScores = [];

View File

@@ -0,0 +1,128 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
/**
 * Appends known counterpart terms to a search query.
 *
 * The term pairs come from {@see QueryEnricher::enrichQueryList()} and are applied in
 * both directions (key -> value and value -> key). Matching is case-insensitive and
 * works on whole words or whole multi-word phrases; any run of characters that is
 * neither a letter nor a digit (spaces, hyphens, punctuation) counts as a separator.
 */
final class QueryEnricher
{
    /**
     * Return the query with the counterparts of every matched term appended.
     *
     * The original query text is kept untouched at the front of the result. If the
     * query is empty/whitespace-only an empty string is returned; if no configured
     * term occurs in it, the query is returned unchanged.
     *
     * @param string $query Raw or pre-cleaned user query.
     *
     * @return string The query, optionally followed by " | Pseudonyme: a, b, ...".
     */
    public function enrichPrompt(string $query): string
    {
        if (trim($query) === '') {
            return '';
        }

        $tokens = $this->tokenize($this->normalize($query));
        $lookup = $this->buildBidirectionalLookup($this->enrichQueryList());

        if ($tokens === [] || $lookup === []) {
            return $query;
        }

        $maxPhraseLength = $this->longestPhraseLength($lookup);
        $tokenCount = count($tokens);
        $matches = [];

        // Walk the token stream and, at every position, try the longest configured
        // phrase first. This lets a compound such as "Wasserhärte-Grenzwert" match
        // its own entry instead of being reduced to its first word.
        $position = 0;
        while ($position < $tokenCount) {
            $consumed = 1;

            for ($length = min($maxPhraseLength, $tokenCount - $position); $length >= 1; $length--) {
                $phrase = implode(' ', array_slice($tokens, $position, $length));

                if (isset($lookup[$phrase])) {
                    foreach ($lookup[$phrase] as $counterpart) {
                        $matches[] = $counterpart;
                    }

                    // Tokens covered by a phrase match are not matched again on their own.
                    $consumed = $length;
                    break;
                }
            }

            $position += $consumed;
        }

        // Remove duplicates while preserving first-seen order.
        $matches = array_values(array_unique($matches));

        if ($matches === []) {
            return $query;
        }

        return $query . " | Pseudonyme: " . implode(', ', $matches);
    }

    /**
     * Trim and lower-case a string (UTF-8 aware) for case-insensitive comparison.
     */
    private function normalize(string $value): string
    {
        return mb_strtolower(trim($value), 'UTF-8');
    }

    /**
     * Split a string into word tokens on everything that is not a letter or a digit.
     *
     * @return list<string> Non-empty tokens; an empty list if the split fails
     *                      (preg_split returns false, e.g. on malformed UTF-8).
     */
    private function tokenize(string $value): array
    {
        return preg_split('/[^\p{L}\p{N}]+/u', $value, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    }

    /**
     * Reduce a term to the form used as lookup key: normalized tokens joined by one space.
     *
     * Using the same tokenizer for lookup keys and for the query is what makes
     * hyphenated or multi-word terms matchable at all.
     */
    private function phraseKey(string $value): string
    {
        return implode(' ', $this->tokenize($this->normalize($value)));
    }

    /**
     * Number of tokens in the longest phrase key of the lookup table.
     *
     * @param array<array-key, list<string>> $lookup Non-empty lookup table.
     */
    private function longestPhraseLength(array $lookup): int
    {
        $longest = 1;

        foreach (array_keys($lookup) as $phrase) {
            // Purely numeric keys come back as int from array_keys(), hence the cast.
            $longest = max($longest, substr_count((string) $phrase, ' ') + 1);
        }

        return $longest;
    }

    /**
     * Build a lookup table that works in both directions.
     *
     * Example:
     * [
     *     'hose'  => 'jeans',
     *     'jacke' => 'mantel',
     * ]
     *
     * becomes:
     * [
     *     'hose'   => ['jeans'],
     *     'jeans'  => ['hose'],
     *     'jacke'  => ['mantel'],
     *     'mantel' => ['jacke'],
     * ]
     *
     * Every phrase maps to a list so that several pairs sharing a term accumulate
     * their counterparts instead of overwriting one another.
     *
     * @param array<array-key, mixed> $mapping Term pairs; keys and values are cast to string.
     *
     * @return array<array-key, list<string>> Phrase key => counterpart terms in their original spelling.
     */
    private function buildBidirectionalLookup(array $mapping): array
    {
        $lookup = [];

        foreach ($mapping as $key => $value) {
            $key = trim((string) $key);
            $value = trim((string) $value);

            // Skip incomplete pairs.
            if ($key === '' || $value === '') {
                continue;
            }

            $keyPhrase = $this->phraseKey($key);
            $valuePhrase = $this->phraseKey($value);

            // A side consisting only of separator characters can never match a query.
            if ($keyPhrase === '' || $valuePhrase === '') {
                continue;
            }

            // If the key is found in the query, offer the value ...
            $lookup[$keyPhrase][] = $value;
            // ... and if the value is found in the query, offer the key.
            $lookup[$valuePhrase][] = $key;
        }

        return $lookup;
    }

    /**
     * The configured term pairs used for enrichment.
     *
     * @return array<string, string> Term => counterpart; applied in both directions.
     */
    public function enrichQueryList(): array
    {
        return [
            'Wasserhärte' => "Resthärte",
            'Gerät' => 'Modell',
            'Indikator' => 'Chemie',
            'Wasserhärte-Grenzwert' => 'Resthärte',
            'Resthärte-Grenzwert' => 'Wasserhärte',
        ];
    }
}