add comments

This commit is contained in:
team 1
2026-04-16 20:26:25 +02:00
parent e5f035a961
commit eeebdfa21a
7 changed files with 246 additions and 81 deletions

View File

@@ -6,24 +6,27 @@ class AgentRunnerConfig
{
public function getShopPrompt($prompt): string
{
/**
* Erzeuge aus dem folgenden Nutzereingabetext einen kurzen Suchtext für die Shopware-6-Suche. Regeln: - Gib nur den finalen Suchtext aus. - erstelle immer die singular form von den relevanten Suchbegriffen - Keine Einleitung, keine Erklärung, keine Anführungszeichen. - Verwende nur die shop relevanten Suchbegriffe für eine Shopsuche aus dem Nutzereingabetext. - Maximal 6 Suchbegriffe, besser weniger. - Entferne Füllwörter, Höflichkeitsformen und irrelevante Wörter. - Erhalte Produktnamen, Marken, Modellnummern und zusammengesetzte Begriffe exakt, wenn sie relevant sind. - Zahlen, die zu einem Produktnamen oder Modell gehören (zb Indikator 300 oder Testomat 808 oder Testomat 2000), müssen erhalten bleiben. - Trenne die Begriffe nur durch Leerzeichen. Ausgabeformat: Keyword1 Keyword2 Keyword3
*/
return '
Erzeuge aus dem folgenden Nutzereingabetext einen kurzen Suchtext für die Shopware-6-Suche.
Generate a short search query for Shopware 6 from the following user input text.
Regeln:
- Gib nur den finalen Suchtext aus.
- erstelle immer die singular form von den relevanten Suchbegriffen
- Keine Einleitung, keine Erklärung, keine Anführungszeichen.
- Verwende nur die shop relevanten Suchbegriffe für eine Shopsuche aus dem Nutzereingabetext.
- Maximal 6 Suchbegriffe, besser weniger.
- Entferne Füllwörter, Höflichkeitsformen und irrelevante Wörter.
- Erhalte Produktnamen, Marken, Modellnummern und zusammengesetzte Begriffe exakt, wenn sie relevant sind.
- Zahlen, die zu einem Produktnamen oder Modell gehören (zb Indikator 300 oder Testomat 808), müssen erhalten bleiben.
- Trenne die Begriffe nur durch Leerzeichen.
Rules:
- Output only the final search query.
- Always convert relevant search terms to their singular form.
- No introduction, no explanation, no quotation marks.
- Use only shop-relevant search terms from the user input for a shop search.
- Maximum 6 search terms, preferably fewer.
- Remove filler words, polite phrases, and irrelevant words.
- Preserve product names, brands, model numbers, and compound terms exactly if they are relevant.
- Numbers that belong to a product name or model must be preserved (e.g. Indikator 300, Testomat 808, Testomat 2000).
- Separate terms using spaces only.
Ausgabeformat:
Output format:
Keyword1 Keyword2 Keyword3
Nutzereingabetext: ' . $prompt . '
input text: ' . $prompt . '
';
}
}

View File

@@ -0,0 +1,20 @@
<?php

declare(strict_types=1);

namespace App\Config;

/**
 * Static configuration for query enrichment.
 *
 * Provides the term pairs that QueryEnricher reads via getEnrichQueryList()
 * to append counterpart terms to a retrieval query.
 */
class QueryEnricherConfig
{
    /**
     * Returns the configured term => counterpart pairs.
     *
     * NOTE(review): 'Seminar' and 'Indikator' each occur both as a key and as a
     * value. If the consumer builds a two-way lookup from this list, only one
     * counterpart per term can presumably survive — verify which one wins.
     *
     * NOTE(review): the hyphenated keys ('Wasserhärte-Grenzwert',
     * 'Resthärte-Grenzwert') presumably never match a single token if the
     * consumer splits the query on non-letter/non-number characters — confirm.
     *
     * @return array<string, string> map of search term to its counterpart term
     */
    public function getEnrichQueryList(): array
    {
        return [
            'Wasserhärte' => 'Resthärte',
            'Gerät' => 'Modell',
            'Indikator' => 'Chemie',
            'Seminar' => 'Webinar',
            'Schulung' => 'Seminar',
            'Indikatoren' => 'Indikator',
            'Wasserhärte-Grenzwert' => 'Resthärte',
            'Resthärte-Grenzwert' => 'Wasserhärte',
        ];
    }
}

View File

@@ -24,11 +24,11 @@ use Symfony\Component\Routing\Annotation\Route;
* - Client identity is resolved exclusively via ClientIdResolver
* - No user identifiers are accepted from the request
*/
final class HistoryController
final readonly class HistoryController
{
public function __construct(
private readonly ContextService $contextService,
private readonly ClientIdResolver $clientIdResolver,
private ContextService $contextService,
private ClientIdResolver $clientIdResolver,
) {}
/**

View File

@@ -6,12 +6,11 @@ declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Knowledge\ChunkManager;
use Symfony\Component\Uid\Uuid;
final class NdjsonChunkLookup
final readonly class NdjsonChunkLookup
{
public function __construct(
private readonly ChunkManager $chunkManager
private ChunkManager $chunkManager
)
{
}
@@ -33,7 +32,6 @@ final class NdjsonChunkLookup
$found[$id] = $row;
// Early exit as soon as every wanted chunk id has been found
if (\count($found) === \count($wanted)) {
break;
}

View File

@@ -14,7 +14,20 @@ use App\Repository\ModelGenerationConfigRepository;
use App\Routing\IntentRouteResolver;
use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient;
use Doctrine\DBAL\Exception;
use RuntimeException;
/**
* Hybrid retriever for NDJSON-based knowledge chunks.
*
* Main responsibilities:
* - detect high-level request intent
* - optionally short-circuit to catalog list output
* - run vector retrieval globally and optionally document-scoped
* - fuse both result sets with RRF-style scoring
* - apply selection rules for list queries vs. sales-style queries
* - return either plain chunk texts or debug metadata
*/
final readonly class NdjsonHybridRetriever implements RetrieverInterface
{
public function __construct(
@@ -37,15 +50,27 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
// PUBLIC API
// =========================================================
/**
* Returns the final retrieval payload as plain text chunks.
*
* Behaviour:
* - loads active retrieval config
* - executes the full orchestration pipeline
* - if the route resolves to a catalog list, returns the catalog block only
* - otherwise returns the selected chunk texts
* @throws Exception
*/
public function retrieve(string $prompt): array
{
$config = $this->requireConfig();
$result = $this->execute($prompt, $config, false);
// Catalog list responses bypass normal chunk retrieval completely.
if ($result['catalogBlock'] !== null) {
return [$result['catalogBlock']];
}
// No selected chunks means no usable retrieval result.
if ($result['selectedChunkIds'] === []) {
return [];
}
@@ -56,11 +81,23 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
);
}
/**
* Returns a debug-friendly retrieval result with scoring/meta information.
*
* This method is used for inspection and tuning:
* - selected chunk ids
* - raw vector scores
* - fused RRF scores
* - intent / route information
* - threshold and list-query flags
* @throws Exception
*/
public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
{
$config = $config ?? $this->requireConfig();
$result = $this->execute($prompt, $config, true);
// For catalog list routes we expose a synthetic debug row.
if ($result['catalogBlock'] !== null) {
return [[
'rank' => 1,
@@ -86,6 +123,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
foreach ($result['selectedChunkIds'] as $chunkId) {
// Skip ids that could not be resolved to real chunk rows.
if (!isset($result['rows'][$chunkId])) {
continue;
}
@@ -114,6 +152,17 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
// CENTRAL ORCHESTRATION
// =========================================================
/**
* Central orchestration entrypoint.
*
* Pipeline:
* 1. Detect catalog entity and sales intent
* 2. Resolve route
* 3. If route is a catalog list route, try direct catalog output
* 4. Otherwise, run the normal hybrid retrieval core
* 5. Select final chunk ids depending on query type
* @throws Exception
*/
private function execute(
string $prompt,
ModelGenerationConfig $config,
@@ -125,6 +174,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$salesIntent = $this->detectSalesIntent($prompt);
$route = $this->routeResolver->resolve($salesIntent, $entityLabel);
// Fast path:
// If the route explicitly asks for a catalog list and we have an entity label,
// we return a prebuilt catalog block instead of semantic chunk retrieval.
if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
$catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);
@@ -147,6 +199,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$core = $this->runCore($prompt, $config, $withScores, $salesIntent);
// No ranked chunks or no resolved rows means retrieval produced nothing usable.
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
return [
'route' => $route,
@@ -162,6 +215,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
];
}
// Selection strategy depends on query type:
// - list queries prefer deduplicated chunks
// - sales queries prefer spread across docs / chunk distance
$selectedChunkIds = $core['is_list_query']
? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
: $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
@@ -184,6 +240,20 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
// CORE PIPELINE
// =========================================================
/**
* Executes the actual hybrid retrieval logic.
*
* Steps:
* - derive limits from config within hard safety caps
* - detect whether the prompt is a "list query"
* - clean and enrich the prompt
* - compute threshold + vector topK based on intent/query type
* - route query into candidate document ids via tag routing
* - run global and optional scoped vector search
* - fuse hits
* - resolve chunk ids to chunk rows
* @throws Exception
*/
private function runCore(
string $prompt,
ModelGenerationConfig $config,
@@ -197,9 +267,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$isListQuery = $this->intentLite->isListQuery($prompt);
// The prompt is normalized first, then enriched before retrieval.
$cleanQuery = $this->queryCleaner->clean($prompt);
$cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery);
// Empty cleaned query means retrieval would be meaningless.
if ($cleanQuery === '') {
return [
'limit' => $limit,
@@ -218,18 +290,22 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$vectorTopKBase
);
// Tag routing tries to narrow retrieval to relevant document ids.
$candidateDocIds = $this->tagRouting->route($cleanQuery);
$candidateDocIds = is_array($candidateDocIds)
? array_values(array_unique(array_filter($candidateDocIds, 'is_string')))
: [];
// Always run a global search.
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
// Optionally run a scoped search if tag routing yielded document candidates.
$scopedHits = [];
if ($candidateDocIds !== []) {
$scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
}
// Nothing found at all.
if ($globalHits === [] && $scopedHits === []) {
return [
'limit' => $limit,
@@ -242,6 +318,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
];
}
// Fuse global and scoped hits with optional scoped boost.
$fused = $this->fuseHits(
$globalHits,
$scopedHits,
@@ -253,10 +330,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$rrfScores = $fused['rrf_scores'];
$rawScores = $fused['raw_scores'];
// Fallback:
// If all hits were filtered by threshold but global hits exist,
// derive a weak RRF ranking from the raw hit order.
if ($rrfScores === [] && $globalHits !== []) {
$rrfScores = $this->fallbackRrfFromHits(
$globalHits,
NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN
$globalHits
);
}
@@ -272,8 +351,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
];
}
// Highest fused score first.
arsort($rrfScores);
$rankedChunkIds = array_keys($rrfScores);
// Resolve the ranking to actual NDJSON chunk rows.
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
return [
@@ -291,21 +373,39 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
// SUPPORT
// =========================================================
/**
* Loads the active model generation config.
*
* Retrieval is not allowed to proceed without an active config.
*/
private function requireConfig(): ModelGenerationConfig
{
$config = $this->configRepository->findActiveForModel();
if ($config === null) {
throw new \RuntimeException('No active ModelGenerationConfig found.');
throw new RuntimeException('No active ModelGenerationConfig found.');
}
return $config;
}
/**
 * Asks the sales intent detector for the intent of the given prompt.
 *
 * The detector result is read like an array. When it carries no non-null
 * 'intent' entry, SalesIntentLite::DISCOVERY is used instead. The chosen
 * value is always cast to string before it is returned.
 */
private function detectSalesIntent(string $prompt): string
{
    $detection = $this->salesIntentLite->detect($prompt);
    $intent = $detection['intent'] ?? SalesIntentLite::DISCOVERY;

    return (string) $intent;
}
/**
* Computes retrieval threshold and vector topK.
*
* Rules:
* - objection/pricing intents are slightly stricter
* - list queries are allowed to retrieve a wider candidate set
* - all values are clamped to global hard limits
*/
private function computeThresholdAndTopK(
string $salesIntent,
bool $isListQuery,
@@ -333,6 +433,15 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return [$threshold, $topK];
}
/**
* Fuses multiple hit lists into one RRF-style score map.
*
* Notes:
* - only hits above threshold are considered
* - rank position within each hit list contributes to the final score
* - scoped hits can be boosted
* - raw scores are optionally captured for debug output
*/
private function fuseHits(
array $globalHits,
array $scopedHits,
@@ -351,18 +460,21 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
foreach ($hits as $hit) {
// Every hit must provide a chunk id and a numeric score.
if (!isset($hit['chunk_id'], $hit['score'])) {
continue;
}
$raw = (float)$hit['score'];
// Threshold is applied before rank fusion.
if ($raw < $threshold) {
continue;
}
$chunkId = (string)$hit['chunk_id'];
// Store the best raw score per chunk for debug inspection.
if ($captureRaw) {
$rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw);
}
@@ -370,10 +482,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$rank++;
$rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
// Scoped result lists can get a slight relevance bonus.
if ($boost) {
$rrf *= 1.2;
}
// Scores from multiple hit lists accumulate.
$rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
}
};
@@ -387,7 +501,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
];
}
private function fallbackRrfFromHits(array $hits, int $topN): array
/**
* Builds a fallback RRF ranking purely from hit order.
*
* Used when thresholding removed all fused candidates but
* the global hit list itself still exists.
*/
private function fallbackRrfFromHits(array $hits): array
{
$rrf = [];
$rank = 0;
@@ -401,7 +521,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$rank++;
$rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
if ($rank >= $topN) {
if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) {
break;
}
}
@@ -409,6 +529,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return $rrf;
}
/**
* Selection strategy for list-style queries.
*
* Goal:
* - avoid near-identical chunks
* - prefer diverse list entries
* - stop once the configured limit is reached
*/
private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
{
$seen = [];
@@ -425,6 +553,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
continue;
}
// Deduplicate by normalized chunk text.
$key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)));
if (isset($seen[$key])) {
@@ -442,6 +571,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return $out;
}
/**
* Selection strategy for sales-oriented queries.
*
* Goal:
* - avoid overloading the result with chunks from the same document
* - avoid chunks that are too close to each other in the same document
* - preserve top-ranked relevance while improving contextual spread
*/
private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
{
$out = [];
@@ -457,14 +594,17 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$docId = $rows[$chunkId]['document_id'] ?? null;
$chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
// Sales selection requires a valid document context.
if (!is_string($docId)) {
continue;
}
// Limit how many chunks may come from the same document.
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
continue;
}
// Enforce a minimum distance between chunk positions of the same document.
if (is_int($chunkIndex)) {
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
@@ -490,6 +630,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return $out;
}
/**
* Converts selected chunk ids into the final plain text result list.
*/
private function collectTextsFromIds(array $chunkIds, array $rows): array
{
$out = [];

View File

@@ -9,14 +9,14 @@ use App\Knowledge\StopWords;
final class QueryCleaner
{
/**
* Bereinigt eine Query ausschließlich für Retrieval-Zwecke.
* Cleans a query strictly for retrieval purposes.
*
* Wichtig:
* - Unicode-sicher
* - Zahlen bleiben erhalten
* - Negationen bleiben erhalten
* - Keine aggressive Token-Längen-Filterung
* - StopWords werden entfernt
* Important:
* - Unicode-safe
* - Numbers are preserved
* - Negations are preserved
* - No aggressive token-length filtering
* - Stop words are removed
*/
public function clean(string $query): string
{
@@ -24,23 +24,23 @@ final class QueryCleaner
return '';
}
// 1. Unicode-sicher lowercase
// 1. Convert to lowercase in a Unicode-safe way
$query = mb_strtolower($query, 'UTF-8');
// 2. Bindestriche & Slashes als Worttrenner behandeln
// 2. Treat hyphens and slashes as word separators
$query = str_replace(['-', '/'], ' ', $query);
// 3. Sonderzeichen entfernen, aber:
// - Buchstaben behalten
// - Zahlen behalten
// - Umlaute behalten
// 3. Remove special characters, but keep:
// - letters
// - numbers
// - other Unicode letters
$query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query);
if ($query === null) {
return '';
}
// 4. Mehrfache Whitespaces normalisieren
// 4. Normalize multiple whitespace characters
$query = preg_replace('/\s+/u', ' ', $query);
$query = trim($query);
@@ -48,7 +48,7 @@ final class QueryCleaner
return '';
}
// 5. Tokenisierung
// 5. Tokenize the query
$tokens = preg_split('/\s+/u', $query);
if ($tokens === false) {
@@ -65,7 +65,7 @@ final class QueryCleaner
continue;
}
// StopWords entfernen
// Remove stop words
if (StopWords::isStopWord($token)) {
continue;
}

View File

@@ -4,11 +4,25 @@ declare(strict_types=1);
namespace App\Knowledge\Retrieval;
final class QueryEnricher
use App\Config\QueryEnricherConfig;
final readonly class QueryEnricher
{
/**
 * @param QueryEnricherConfig $config Source of the term mapping, read via getEnrichQueryList().
 */
public function __construct(private QueryEnricherConfig $config) {}
/**
* Enriches the query with mapped counterpart terms.
*
* Example:
* - input: "water hardness device"
* - output: "water hardness device | Synonyms: residual hardness, model"
*/
public function enrichPrompt(string $query): string
{
// Return early if the input is empty or contains only whitespace.
if (trim($query) === '') {
return '';
}
@@ -19,19 +33,19 @@ final class QueryEnricher
// Normalize the query for case-insensitive matching.
$normalizedQuery = $this->normalize($query);
// Expect an associative array like:
// Expected format:
// [
// 'hose' => 'jeans',
// 'jacke' => 'mantel',
// 'trousers' => 'jeans',
// 'jacket' => 'coat',
// ]
$mapping = $this->enrichQueryList();
$mapping = $this->config->getEnrichQueryList();
// Build a bidirectional lookup table:
// key -> value
// value -> key
$lookup = $this->buildBidirectionalLookup($mapping);
// Split the query into searchable words/tokens.
// Split the query into searchable tokens.
$tokens = $this->tokenize($normalizedQuery);
$matches = [];
@@ -46,17 +60,17 @@ final class QueryEnricher
// Remove duplicates while preserving order.
$matches = array_values(array_unique($matches));
// If nothing was found, return the original query unchanged.
// If no matches were found, return the original query unchanged.
if ($matches === []) {
return $originalQuery;
}
// Append the matched counterpart terms to the original prompt.
return $originalQuery . " | Pseudonyme: " . implode(', ', $matches);
// Append the matched counterpart terms to the original query.
return $originalQuery . ' | Synonyms: ' . implode(', ', $matches);
}
/**
* Normalize a string for case-insensitive comparison.
* Normalizes a string for case-insensitive comparison.
*/
private function normalize(string $value): string
{
@@ -64,8 +78,9 @@ final class QueryEnricher
}
/**
* Tokenize the query into words.
* Splits on everything that is not a letter or number.
* Tokenizes the query into words.
*
* Splits on every character that is not a letter or number.
*/
private function tokenize(string $value): array
{
@@ -73,20 +88,20 @@ final class QueryEnricher
}
/**
* Build a lookup table that works in both directions.
* Builds a lookup table that works in both directions.
*
* Example:
* [
* 'hose' => 'jeans',
* 'jacke' => 'mantel',
* 'trousers' => 'jeans',
* 'jacket' => 'coat',
* ]
*
* becomes:
* [
* 'hose' => 'jeans',
* 'jeans' => 'hose',
* 'jacke' => 'mantel',
* 'mantel' => 'jacke',
* 'trousers' => 'jeans',
* 'jeans' => 'trousers',
* 'jacket' => 'coat',
* 'coat' => 'jacket',
* ]
*/
private function buildBidirectionalLookup(array $mapping): array
@@ -94,8 +109,8 @@ final class QueryEnricher
$lookup = [];
foreach ($mapping as $key => $value) {
$key = trim((string)$key);
$value = trim((string)$value);
$key = trim((string) $key);
$value = trim((string) $value);
// Skip incomplete pairs.
if ($key === '' || $value === '') {
@@ -114,18 +129,4 @@ final class QueryEnricher
return $lookup;
}
public function enrichQueryList(): array
{
return [
'Wasserhärte' => "Resthärte",
'Gerät' => 'Modell',
'Indikator' => 'Chemie',
'Seminar' => 'Webinar',
'Schulung' => 'Seminar',
'Indikatoren' => 'Indikator',
'Wasserhärte-Grenzwert'=>'Resthärte',
'Resthärte-Grenzwert'=>'Wasserhärte'
];
}
}