add comments
This commit is contained in:
@@ -6,24 +6,27 @@ class AgentRunnerConfig
|
||||
{
|
||||
public function getShopPrompt($prompt): string
|
||||
{
|
||||
/**
|
||||
* Erzeuge aus dem folgenden Nutzereingabetext einen kurzen Suchtext für die Shopware-6-Suche. Regeln: - Gib nur den finalen Suchtext aus. - erstelle immer die singular form von den relevanten Suchbegriffen - Keine Einleitung, keine Erklärung, keine Anführungszeichen. - Verwende nur die shop relevanten Suchbegriffe für eine Shopsuche aus dem Nutzereingabetext. - Maximal 6 Suchbegriffe, besser weniger. - Entferne Füllwörter, Höflichkeitsformen und irrelevante Wörter. - Erhalte Produktnamen, Marken, Modellnummern und zusammengesetzte Begriffe exakt, wenn sie relevant sind. - Zahlen, die zu einem Produktnamen oder Modell gehören (zb Indikator 300 oder Testomat 808 oder Testomat 2000), müssen erhalten bleiben. - Trenne die Begriffe nur durch Leerzeichen. Ausgabeformat: Keyword1 Keyword2 Keyword3
|
||||
*/
|
||||
return '
|
||||
Erzeuge aus dem folgenden Nutzereingabetext einen kurzen Suchtext für die Shopware-6-Suche.
|
||||
Generate a short search query for Shopware 6 from the following user input text.
|
||||
|
||||
Rules:
|
||||
- Output only the final search query.
|
||||
- Always convert relevant search terms to their singular form.
|
||||
- No introduction, no explanation, no quotation marks.
|
||||
- Use only shop-relevant search terms from the user input for a shop search.
|
||||
- Maximum 6 search terms, preferably fewer.
|
||||
- Remove filler words, polite phrases, and irrelevant words.
|
||||
- Preserve product names, brands, model numbers, and compound terms exactly if they are relevant.
|
||||
- Numbers that belong to a product name or model must be preserved (e.g. Indikator 300, Testomat 808, Testomat 2000).
|
||||
- Separate terms using spaces only.
|
||||
|
||||
Output format:
|
||||
Keyword1 Keyword2 Keyword3
|
||||
|
||||
Regeln:
|
||||
- Gib nur den finalen Suchtext aus.
|
||||
- erstelle immer die singular form von den relevanten Suchbegriffen
|
||||
- Keine Einleitung, keine Erklärung, keine Anführungszeichen.
|
||||
- Verwende nur die shop relevanten Suchbegriffe für eine Shopsuche aus dem Nutzereingabetext.
|
||||
- Maximal 6 Suchbegriffe, besser weniger.
|
||||
- Entferne Füllwörter, Höflichkeitsformen und irrelevante Wörter.
|
||||
- Erhalte Produktnamen, Marken, Modellnummern und zusammengesetzte Begriffe exakt, wenn sie relevant sind.
|
||||
- Zahlen, die zu einem Produktnamen oder Modell gehören (zb Indikator 300 oder Testomat 808), müssen erhalten bleiben.
|
||||
- Trenne die Begriffe nur durch Leerzeichen.
|
||||
|
||||
Ausgabeformat:
|
||||
Keyword1 Keyword2 Keyword3
|
||||
|
||||
Nutzereingabetext: ' . $prompt . '
|
||||
';
|
||||
input text: ' . $prompt . '
|
||||
';
|
||||
}
|
||||
}
|
||||
20
src/Config/QueryEnricherConfig.php
Normal file
20
src/Config/QueryEnricherConfig.php
Normal file
@@ -0,0 +1,20 @@
|
||||
<?php
|
||||
|
||||
namespace App\Config;
|
||||
|
||||
/**
 * Holds the static term mapping that is handed to the query enricher.
 */
class QueryEnricherConfig
{
    /**
     * Enrichment pairs in the form "term" => "counterpart term".
     *
     * NOTE(review): some terms occur on both sides of the map (e.g. "Seminar",
     * "Indikator", "Wasserhärte"). How such overlaps are resolved depends on the
     * consumer that inverts the map — verify against the caller.
     */
    private const ENRICH_QUERY_LIST = [
        'Wasserhärte' => 'Resthärte',
        'Gerät' => 'Modell',
        'Indikator' => 'Chemie',
        'Seminar' => 'Webinar',
        'Schulung' => 'Seminar',
        'Indikatoren' => 'Indikator',
        'Wasserhärte-Grenzwert' => 'Resthärte',
        'Resthärte-Grenzwert' => 'Wasserhärte',
    ];

    /**
     * Returns the configured enrichment pairs.
     *
     * @return array<string, string> map of term to counterpart term
     */
    public function getEnrichQueryList(): array
    {
        return self::ENRICH_QUERY_LIST;
    }
}
|
||||
@@ -24,11 +24,11 @@ use Symfony\Component\Routing\Annotation\Route;
|
||||
* - Client identity is resolved exclusively via ClientIdResolver
|
||||
* - No user identifiers are accepted from the request
|
||||
*/
|
||||
final class HistoryController
|
||||
final readonly class HistoryController
|
||||
{
|
||||
public function __construct(
|
||||
private readonly ContextService $contextService,
|
||||
private readonly ClientIdResolver $clientIdResolver,
|
||||
private ContextService $contextService,
|
||||
private ClientIdResolver $clientIdResolver,
|
||||
) {}
|
||||
|
||||
/**
|
||||
|
||||
@@ -6,12 +6,11 @@ declare(strict_types=1);
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Knowledge\ChunkManager;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
final class NdjsonChunkLookup
|
||||
final readonly class NdjsonChunkLookup
|
||||
{
|
||||
public function __construct(
|
||||
private readonly ChunkManager $chunkManager
|
||||
private ChunkManager $chunkManager
|
||||
)
|
||||
{
|
||||
}
|
||||
@@ -33,7 +32,6 @@ final class NdjsonChunkLookup
|
||||
|
||||
$found[$id] = $row;
|
||||
|
||||
// Early exit as soon as every wanted chunk id has been found
|
||||
if (\count($found) === \count($wanted)) {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -14,7 +14,20 @@ use App\Repository\ModelGenerationConfigRepository;
|
||||
use App\Routing\IntentRouteResolver;
|
||||
use App\Tag\TagRoutingService;
|
||||
use App\Vector\VectorSearchClient;
|
||||
use Doctrine\DBAL\Exception;
|
||||
use RuntimeException;
|
||||
|
||||
/**
|
||||
* Hybrid retriever for NDJSON-based knowledge chunks.
|
||||
*
|
||||
* Main responsibilities:
|
||||
* - detect high-level request intent
|
||||
* - optionally short-circuit to catalog list output
|
||||
* - run vector retrieval globally and optionally document-scoped
|
||||
* - fuse both result sets with RRF-style scoring
|
||||
* - apply selection rules for list queries vs. sales-style queries
|
||||
* - return either plain chunk texts or debug metadata
|
||||
*/
|
||||
final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
public function __construct(
|
||||
@@ -37,15 +50,27 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
// PUBLIC API
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* Returns the final retrieval payload as plain text chunks.
|
||||
*
|
||||
* Behaviour:
|
||||
* - loads active retrieval config
|
||||
* - executes the full orchestration pipeline
|
||||
* - if the route resolves to a catalog list, returns the catalog block only
|
||||
* - otherwise returns the selected chunk texts
|
||||
* @throws Exception
|
||||
*/
|
||||
public function retrieve(string $prompt): array
|
||||
{
|
||||
$config = $this->requireConfig();
|
||||
$result = $this->execute($prompt, $config, false);
|
||||
|
||||
// Catalog list responses bypass normal chunk retrieval completely.
|
||||
if ($result['catalogBlock'] !== null) {
|
||||
return [$result['catalogBlock']];
|
||||
}
|
||||
|
||||
// No selected chunks means no usable retrieval result.
|
||||
if ($result['selectedChunkIds'] === []) {
|
||||
return [];
|
||||
}
|
||||
@@ -56,11 +81,23 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a debug-friendly retrieval result with scoring/meta information.
|
||||
*
|
||||
* This method is used for inspection and tuning:
|
||||
* - selected chunk ids
|
||||
* - raw vector scores
|
||||
* - fused RRF scores
|
||||
* - intent / route information
|
||||
* - threshold and list-query flags
|
||||
* @throws Exception
|
||||
*/
|
||||
public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
|
||||
{
|
||||
$config = $config ?? $this->requireConfig();
|
||||
$result = $this->execute($prompt, $config, true);
|
||||
|
||||
// For catalog list routes we expose a synthetic debug row.
|
||||
if ($result['catalogBlock'] !== null) {
|
||||
return [[
|
||||
'rank' => 1,
|
||||
@@ -86,6 +123,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
foreach ($result['selectedChunkIds'] as $chunkId) {
|
||||
|
||||
// Skip ids that could not be resolved to real chunk rows.
|
||||
if (!isset($result['rows'][$chunkId])) {
|
||||
continue;
|
||||
}
|
||||
@@ -114,6 +152,17 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
// CENTRAL ORCHESTRATION
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* Central orchestration entrypoint.
|
||||
*
|
||||
* Pipeline:
|
||||
* 1. Detect catalog entity and sales intent
|
||||
* 2. Resolve route
|
||||
* 3. If route is a catalog list route, try direct catalog output
|
||||
* 4. Otherwise, run the normal hybrid retrieval core
|
||||
* 5. Select final chunk ids depending on query type
|
||||
* @throws Exception
|
||||
*/
|
||||
private function execute(
|
||||
string $prompt,
|
||||
ModelGenerationConfig $config,
|
||||
@@ -125,6 +174,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$salesIntent = $this->detectSalesIntent($prompt);
|
||||
$route = $this->routeResolver->resolve($salesIntent, $entityLabel);
|
||||
|
||||
// Fast path:
|
||||
// If the route explicitly asks for a catalog list and we have an entity label,
|
||||
// we return a prebuilt catalog block instead of semantic chunk retrieval.
|
||||
if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
|
||||
|
||||
$catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);
|
||||
@@ -147,6 +199,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$core = $this->runCore($prompt, $config, $withScores, $salesIntent);
|
||||
|
||||
// No ranked chunks or no resolved rows means retrieval produced nothing usable.
|
||||
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
|
||||
return [
|
||||
'route' => $route,
|
||||
@@ -162,6 +215,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
];
|
||||
}
|
||||
|
||||
// Selection strategy depends on query type:
|
||||
// - list queries prefer deduplicated chunks
|
||||
// - sales queries prefer spread across docs / chunk distance
|
||||
$selectedChunkIds = $core['is_list_query']
|
||||
? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
|
||||
: $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
|
||||
@@ -184,6 +240,20 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
// CORE PIPELINE
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* Executes the actual hybrid retrieval logic.
|
||||
*
|
||||
* Steps:
|
||||
* - derive limits from config within hard safety caps
|
||||
* - detect whether the prompt is a "list query"
|
||||
* - clean and enrich the prompt
|
||||
* - compute threshold + vector topK based on intent/query type
|
||||
* - route query into candidate document ids via tag routing
|
||||
* - run global and optional scoped vector search
|
||||
* - fuse hits
|
||||
* - resolve chunk ids to chunk rows
|
||||
* @throws Exception
|
||||
*/
|
||||
private function runCore(
|
||||
string $prompt,
|
||||
ModelGenerationConfig $config,
|
||||
@@ -197,9 +267,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$isListQuery = $this->intentLite->isListQuery($prompt);
|
||||
|
||||
// The prompt is normalized first, then enriched before retrieval.
|
||||
$cleanQuery = $this->queryCleaner->clean($prompt);
|
||||
$cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery);
|
||||
|
||||
// Empty cleaned query means retrieval would be meaningless.
|
||||
if ($cleanQuery === '') {
|
||||
return [
|
||||
'limit' => $limit,
|
||||
@@ -218,18 +290,22 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$vectorTopKBase
|
||||
);
|
||||
|
||||
// Tag routing tries to narrow retrieval to relevant document ids.
|
||||
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
||||
$candidateDocIds = is_array($candidateDocIds)
|
||||
? array_values(array_unique(array_filter($candidateDocIds, 'is_string')))
|
||||
: [];
|
||||
|
||||
// Always run a global search.
|
||||
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
|
||||
|
||||
// Optionally run a scoped search if tag routing yielded document candidates.
|
||||
$scopedHits = [];
|
||||
if ($candidateDocIds !== []) {
|
||||
$scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
|
||||
}
|
||||
|
||||
// Nothing found at all.
|
||||
if ($globalHits === [] && $scopedHits === []) {
|
||||
return [
|
||||
'limit' => $limit,
|
||||
@@ -242,6 +318,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
];
|
||||
}
|
||||
|
||||
// Fuse global and scoped hits with optional scoped boost.
|
||||
$fused = $this->fuseHits(
|
||||
$globalHits,
|
||||
$scopedHits,
|
||||
@@ -253,10 +330,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$rrfScores = $fused['rrf_scores'];
|
||||
$rawScores = $fused['raw_scores'];
|
||||
|
||||
// Fallback:
|
||||
// If all hits were filtered by threshold but global hits exist,
|
||||
// derive a weak RRF ranking from the raw hit order.
|
||||
if ($rrfScores === [] && $globalHits !== []) {
|
||||
$rrfScores = $this->fallbackRrfFromHits(
|
||||
$globalHits,
|
||||
NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN
|
||||
$globalHits
|
||||
);
|
||||
}
|
||||
|
||||
@@ -272,8 +351,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
];
|
||||
}
|
||||
|
||||
// Highest fused score first.
|
||||
arsort($rrfScores);
|
||||
$rankedChunkIds = array_keys($rrfScores);
|
||||
|
||||
// Resolve the ranking to actual NDJSON chunk rows.
|
||||
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
|
||||
|
||||
return [
|
||||
@@ -291,21 +373,39 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
// SUPPORT
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* Loads the active model generation config.
|
||||
*
|
||||
* Retrieval is not allowed to proceed without an active config.
|
||||
*/
|
||||
private function requireConfig(): ModelGenerationConfig
|
||||
{
|
||||
$config = $this->configRepository->findActiveForModel();
|
||||
if ($config === null) {
|
||||
throw new \RuntimeException('No active ModelGenerationConfig found.');
|
||||
throw new RuntimeException('No active ModelGenerationConfig found.');
|
||||
}
|
||||
return $config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts the normalized sales intent string from the intent detector.
|
||||
*
|
||||
* Falls back to DISCOVERY when the detector payload is incomplete.
|
||||
*/
|
||||
private function detectSalesIntent(string $prompt): string
{
    // Ask the intent detector for its payload for this prompt.
    $detection = $this->salesIntentLite->detect($prompt);

    // A missing (or null) 'intent' entry falls back to the DISCOVERY default.
    $intent = $detection['intent'] ?? SalesIntentLite::DISCOVERY;

    return (string)$intent;
}
|
||||
|
||||
/**
|
||||
* Computes retrieval threshold and vector topK.
|
||||
*
|
||||
* Rules:
|
||||
* - objection/pricing intents are slightly stricter
|
||||
* - list queries are allowed to retrieve a wider candidate set
|
||||
* - all values are clamped to global hard limits
|
||||
*/
|
||||
private function computeThresholdAndTopK(
|
||||
string $salesIntent,
|
||||
bool $isListQuery,
|
||||
@@ -333,6 +433,15 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return [$threshold, $topK];
|
||||
}
|
||||
|
||||
/**
|
||||
* Fuses multiple hit lists into one RRF-style score map.
|
||||
*
|
||||
* Notes:
|
||||
* - only hits above threshold are considered
|
||||
* - rank position within each hit list contributes to the final score
|
||||
* - scoped hits can be boosted
|
||||
* - raw scores are optionally captured for debug output
|
||||
*/
|
||||
private function fuseHits(
|
||||
array $globalHits,
|
||||
array $scopedHits,
|
||||
@@ -351,18 +460,21 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
|
||||
// Every hit must provide a chunk id and a numeric score.
|
||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$raw = (float)$hit['score'];
|
||||
|
||||
// Threshold is applied before rank fusion.
|
||||
if ($raw < $threshold) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$chunkId = (string)$hit['chunk_id'];
|
||||
|
||||
// Store the best raw score per chunk for debug inspection.
|
||||
if ($captureRaw) {
|
||||
$rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw);
|
||||
}
|
||||
@@ -370,10 +482,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$rank++;
|
||||
$rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
|
||||
|
||||
// Scoped result lists can get a slight relevance bonus.
|
||||
if ($boost) {
|
||||
$rrf *= 1.2;
|
||||
}
|
||||
|
||||
// Scores from multiple hit lists accumulate.
|
||||
$rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
|
||||
}
|
||||
};
|
||||
@@ -387,7 +501,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
];
|
||||
}
|
||||
|
||||
private function fallbackRrfFromHits(array $hits, int $topN): array
|
||||
/**
|
||||
* Builds a fallback RRF ranking purely from hit order.
|
||||
*
|
||||
* Used when thresholding removed all fused candidates but
|
||||
* the global hit list itself still exists.
|
||||
*/
|
||||
private function fallbackRrfFromHits(array $hits): array
|
||||
{
|
||||
$rrf = [];
|
||||
$rank = 0;
|
||||
@@ -401,7 +521,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$rank++;
|
||||
$rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
|
||||
|
||||
if ($rank >= $topN) {
|
||||
if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -409,6 +529,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $rrf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Selection strategy for list-style queries.
|
||||
*
|
||||
* Goal:
|
||||
* - avoid near-identical chunks
|
||||
* - prefer diverse list entries
|
||||
* - stop once the configured limit is reached
|
||||
*/
|
||||
private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
|
||||
{
|
||||
$seen = [];
|
||||
@@ -425,6 +553,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
continue;
|
||||
}
|
||||
|
||||
// Deduplicate by normalized chunk text.
|
||||
$key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)));
|
||||
|
||||
if (isset($seen[$key])) {
|
||||
@@ -442,6 +571,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Selection strategy for sales-oriented queries.
|
||||
*
|
||||
* Goal:
|
||||
* - avoid overloading the result with chunks from the same document
|
||||
* - avoid chunks that are too close to each other in the same document
|
||||
* - preserve top-ranked relevance while improving contextual spread
|
||||
*/
|
||||
private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
|
||||
{
|
||||
$out = [];
|
||||
@@ -457,14 +594,17 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||
$chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
|
||||
|
||||
// Sales selection requires a valid document context.
|
||||
if (!is_string($docId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Limit how many chunks may come from the same document.
|
||||
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Enforce a minimum distance between chunk positions of the same document.
|
||||
if (is_int($chunkIndex)) {
|
||||
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
|
||||
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
|
||||
@@ -490,6 +630,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts selected chunk ids into the final plain text result list.
|
||||
*/
|
||||
private function collectTextsFromIds(array $chunkIds, array $rows): array
|
||||
{
|
||||
$out = [];
|
||||
|
||||
@@ -9,14 +9,14 @@ use App\Knowledge\StopWords;
|
||||
final class QueryCleaner
|
||||
{
|
||||
/**
|
||||
* Bereinigt eine Query ausschließlich für Retrieval-Zwecke.
|
||||
* Cleans a query strictly for retrieval purposes.
|
||||
*
|
||||
* Wichtig:
|
||||
* - Unicode-sicher
|
||||
* - Zahlen bleiben erhalten
|
||||
* - Negationen bleiben erhalten
|
||||
* - Keine aggressive Token-Längen-Filterung
|
||||
* - StopWords werden entfernt
|
||||
* Important:
|
||||
* - Unicode-safe
|
||||
* - Numbers are preserved
|
||||
* - Negations are preserved
|
||||
* - No aggressive token-length filtering
|
||||
* - Stop words are removed
|
||||
*/
|
||||
public function clean(string $query): string
|
||||
{
|
||||
@@ -24,23 +24,23 @@ final class QueryCleaner
|
||||
return '';
|
||||
}
|
||||
|
||||
// 1. Unicode-sicher lowercase
|
||||
// 1. Convert to lowercase in a Unicode-safe way
|
||||
$query = mb_strtolower($query, 'UTF-8');
|
||||
|
||||
// 2. Bindestriche & Slashes als Worttrenner behandeln
|
||||
// 2. Treat hyphens and slashes as word separators
|
||||
$query = str_replace(['-', '/'], ' ', $query);
|
||||
|
||||
// 3. Sonderzeichen entfernen, aber:
|
||||
// - Buchstaben behalten
|
||||
// - Zahlen behalten
|
||||
// - Umlaute behalten
|
||||
// 3. Remove special characters, but keep:
|
||||
// - letters
|
||||
// - numbers
|
||||
// - other Unicode letters
|
||||
$query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query);
|
||||
|
||||
if ($query === null) {
|
||||
return '';
|
||||
}
|
||||
|
||||
// 4. Mehrfache Whitespaces normalisieren
|
||||
// 4. Normalize multiple whitespace characters
|
||||
$query = preg_replace('/\s+/u', ' ', $query);
|
||||
$query = trim($query);
|
||||
|
||||
@@ -48,7 +48,7 @@ final class QueryCleaner
|
||||
return '';
|
||||
}
|
||||
|
||||
// 5. Tokenisierung
|
||||
// 5. Tokenize the query
|
||||
$tokens = preg_split('/\s+/u', $query);
|
||||
|
||||
if ($tokens === false) {
|
||||
@@ -65,7 +65,7 @@ final class QueryCleaner
|
||||
continue;
|
||||
}
|
||||
|
||||
// StopWords entfernen
|
||||
// Remove stop words
|
||||
if (StopWords::isStopWord($token)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -4,11 +4,25 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
final class QueryEnricher
|
||||
use App\Config\QueryEnricherConfig;
|
||||
|
||||
final readonly class QueryEnricher
|
||||
{
|
||||
public function __construct(
    // Configuration object that supplies the enrichment term mapping.
    private QueryEnricherConfig $config,
) {}
|
||||
|
||||
/**
|
||||
* Enriches the query with mapped counterpart terms.
|
||||
*
|
||||
* Example:
|
||||
* - input: "water hardness device"
|
||||
* - output: "water hardness device | Synonyms: residual hardness, model"
|
||||
*/
|
||||
public function enrichPrompt(string $query): string
|
||||
{
|
||||
// Return early if the input is empty or contains only whitespace.
|
||||
if (trim($query) === '') {
|
||||
return '';
|
||||
}
|
||||
@@ -19,19 +33,19 @@ final class QueryEnricher
|
||||
// Normalize the query for case-insensitive matching.
|
||||
$normalizedQuery = $this->normalize($query);
|
||||
|
||||
// Expect an associative array like:
|
||||
// Expected format:
|
||||
// [
|
||||
// 'hose' => 'jeans',
|
||||
// 'jacke' => 'mantel',
|
||||
// 'trousers' => 'jeans',
|
||||
// 'jacket' => 'coat',
|
||||
// ]
|
||||
$mapping = $this->enrichQueryList();
|
||||
$mapping = $this->config->getEnrichQueryList();
|
||||
|
||||
// Build a bidirectional lookup table:
|
||||
// key -> value
|
||||
// value -> key
|
||||
$lookup = $this->buildBidirectionalLookup($mapping);
|
||||
|
||||
// Split the query into searchable words/tokens.
|
||||
// Split the query into searchable tokens.
|
||||
$tokens = $this->tokenize($normalizedQuery);
|
||||
|
||||
$matches = [];
|
||||
@@ -46,17 +60,17 @@ final class QueryEnricher
|
||||
// Remove duplicates while preserving order.
|
||||
$matches = array_values(array_unique($matches));
|
||||
|
||||
// If nothing was found, return the original query unchanged.
|
||||
// If no matches were found, return the original query unchanged.
|
||||
if ($matches === []) {
|
||||
return $originalQuery;
|
||||
}
|
||||
|
||||
// Append the matched counterpart terms to the original prompt.
|
||||
return $originalQuery . " | Pseudonyme: " . implode(', ', $matches);
|
||||
// Append the matched counterpart terms to the original query.
|
||||
return $originalQuery . ' | Synonyms: ' . implode(', ', $matches);
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize a string for case-insensitive comparison.
|
||||
* Normalizes a string for case-insensitive comparison.
|
||||
*/
|
||||
private function normalize(string $value): string
|
||||
{
|
||||
@@ -64,8 +78,9 @@ final class QueryEnricher
|
||||
}
|
||||
|
||||
/**
|
||||
* Tokenize the query into words.
|
||||
* Splits on everything that is not a letter or number.
|
||||
* Tokenizes the query into words.
|
||||
*
|
||||
* Splits on every character that is not a letter or number.
|
||||
*/
|
||||
private function tokenize(string $value): array
|
||||
{
|
||||
@@ -73,20 +88,20 @@ final class QueryEnricher
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a lookup table that works in both directions.
|
||||
* Builds a lookup table that works in both directions.
|
||||
*
|
||||
* Example:
|
||||
* [
|
||||
* 'hose' => 'jeans',
|
||||
* 'jacke' => 'mantel',
|
||||
* 'trousers' => 'jeans',
|
||||
* 'jacket' => 'coat',
|
||||
* ]
|
||||
*
|
||||
* becomes:
|
||||
* [
|
||||
* 'hose' => 'jeans',
|
||||
* 'jeans' => 'hose',
|
||||
* 'jacke' => 'mantel',
|
||||
* 'mantel' => 'jacke',
|
||||
* 'trousers' => 'jeans',
|
||||
* 'jeans' => 'trousers',
|
||||
* 'jacket' => 'coat',
|
||||
* 'coat' => 'jacket',
|
||||
* ]
|
||||
*/
|
||||
private function buildBidirectionalLookup(array $mapping): array
|
||||
@@ -94,8 +109,8 @@ final class QueryEnricher
|
||||
$lookup = [];
|
||||
|
||||
foreach ($mapping as $key => $value) {
|
||||
$key = trim((string)$key);
|
||||
$value = trim((string)$value);
|
||||
$key = trim((string) $key);
|
||||
$value = trim((string) $value);
|
||||
|
||||
// Skip incomplete pairs.
|
||||
if ($key === '' || $value === '') {
|
||||
@@ -114,18 +129,4 @@ final class QueryEnricher
|
||||
|
||||
return $lookup;
|
||||
}
|
||||
|
||||
public function enrichQueryList(): array
{
    // Term pairs ("term" => "counterpart term") used as the enrichment mapping.
    $pairs = [
        'Wasserhärte' => 'Resthärte',
        'Gerät' => 'Modell',
        'Indikator' => 'Chemie',
        'Seminar' => 'Webinar',
        'Schulung' => 'Seminar',
        'Indikatoren' => 'Indikator',
        'Wasserhärte-Grenzwert' => 'Resthärte',
        'Resthärte-Grenzwert' => 'Wasserhärte',
    ];

    return $pairs;
}
|
||||
}
|
||||
Reference in New Issue
Block a user