diff --git a/src/Config/AgentRunnerConfig.php b/src/Config/AgentRunnerConfig.php index 5ca4789..62bb278 100644 --- a/src/Config/AgentRunnerConfig.php +++ b/src/Config/AgentRunnerConfig.php @@ -6,24 +6,27 @@ class AgentRunnerConfig { public function getShopPrompt($prompt): string { + /** + * Erzeuge aus dem folgenden Nutzereingabetext einen kurzen Suchtext für die Shopware-6-Suche. Regeln: - Gib nur den finalen Suchtext aus. - erstelle immer die singular form von den relevanten Suchbegriffen - Keine Einleitung, keine Erklärung, keine Anführungszeichen. - Verwende nur die shop relevanten Suchbegriffe für eine Shopsuche aus dem Nutzereingabetext. - Maximal 6 Suchbegriffe, besser weniger. - Entferne Füllwörter, Höflichkeitsformen und irrelevante Wörter. - Erhalte Produktnamen, Marken, Modellnummern und zusammengesetzte Begriffe exakt, wenn sie relevant sind. - Zahlen, die zu einem Produktnamen oder Modell gehören (zb Indikator 300 oder Testomat 808 oder Testomat 2000), müssen erhalten bleiben. - Trenne die Begriffe nur durch Leerzeichen. Ausgabeformat: Keyword1 Keyword2 Keyword3 + */ return ' - Erzeuge aus dem folgenden Nutzereingabetext einen kurzen Suchtext für die Shopware-6-Suche. + Generate a short search query for Shopware 6 from the following user input text. + + Rules: + - Output only the final search query. + - Always convert relevant search terms to their singular form. + - No introduction, no explanation, no quotation marks. + - Use only shop-relevant search terms from the user input for a shop search. + - Maximum 6 search terms, preferably fewer. + - Remove filler words, polite phrases, and irrelevant words. + - Preserve product names, brands, model numbers, and compound terms exactly if they are relevant. + - Numbers that belong to a product name or model must be preserved (e.g. Indikator 300, Testomat 808, Testomat 2000). + - Separate terms using spaces only. + + Output format: + Keyword1 Keyword2 Keyword3 - Regeln: - - Gib nur den finalen Suchtext aus. - - erstelle immer die singular form von den relevanten Suchbegriffen - - Keine Einleitung, keine Erklärung, keine Anführungszeichen. - - Verwende nur die shop relevanten Suchbegriffe für eine Shopsuche aus dem Nutzereingabetext. - - Maximal 6 Suchbegriffe, besser weniger. - - Entferne Füllwörter, Höflichkeitsformen und irrelevante Wörter. - - Erhalte Produktnamen, Marken, Modellnummern und zusammengesetzte Begriffe exakt, wenn sie relevant sind. - - Zahlen, die zu einem Produktnamen oder Modell gehören (zb Indikator 300 oder Testomat 808), müssen erhalten bleiben. - - Trenne die Begriffe nur durch Leerzeichen. - - Ausgabeformat: - Keyword1 Keyword2 Keyword3 - - Nutzereingabetext: ' . $prompt . ' - '; + input text: ' . $prompt . ' + '; } } \ No newline at end of file diff --git a/src/Config/QueryEnricherConfig.php b/src/Config/QueryEnricherConfig.php new file mode 100644 index 0000000..3710c65 --- /dev/null +++ b/src/Config/QueryEnricherConfig.php @@ -0,0 +1,20 @@ + 'Resthärte', + 'Gerät' => 'Modell', + 'Indikator' => 'Chemie', + 'Seminar' => 'Webinar', + 'Schulung' => 'Seminar', + 'Indikatoren' => 'Indikator', + 'Wasserhärte-Grenzwert' => 'Resthärte', + 'Resthärte-Grenzwert' => 'Wasserhärte', + ]; + } +} \ No newline at end of file diff --git a/src/Controller/HistoryController.php b/src/Controller/HistoryController.php index 15e0022..a3d6fa3 100644 --- a/src/Controller/HistoryController.php +++ b/src/Controller/HistoryController.php @@ -24,11 +24,11 @@ use Symfony\Component\Routing\Annotation\Route; * - Client identity is resolved exclusively via ClientIdResolver * - No user identifiers are accepted from the request */ -final class HistoryController +final readonly class HistoryController { public function __construct( - private readonly ContextService $contextService, - private readonly ClientIdResolver $clientIdResolver, + private ContextService $contextService, + private ClientIdResolver $clientIdResolver, ) {} /** diff --git a/src/Knowledge/Retrieval/NdjsonChunkLookup.php b/src/Knowledge/Retrieval/NdjsonChunkLookup.php index 7fd8ae4..136a271 100644 --- a/src/Knowledge/Retrieval/NdjsonChunkLookup.php +++ b/src/Knowledge/Retrieval/NdjsonChunkLookup.php @@ -6,12 +6,11 @@ declare(strict_types=1); namespace App\Knowledge\Retrieval; use App\Knowledge\ChunkManager; -use Symfony\Component\Uid\Uuid; -final class NdjsonChunkLookup +final readonly class NdjsonChunkLookup { public function __construct( - private readonly ChunkManager $chunkManager + private ChunkManager $chunkManager ) { } @@ -33,7 +32,6 @@ final class NdjsonChunkLookup $found[$id] = $row; - // Early exit sobald alle gefunden if (\count($found) === \count($wanted)) { break; } diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index ce215e0..523ff30 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -14,7 +14,20 @@ use App\Repository\ModelGenerationConfigRepository; use App\Routing\IntentRouteResolver; use App\Tag\TagRoutingService; use App\Vector\VectorSearchClient; +use Doctrine\DBAL\Exception; +use RuntimeException; +/** + * Hybrid retriever for NDJSON-based knowledge chunks. + * + * Main responsibilities: + * - detect high-level request intent + * - optionally short-circuit to catalog list output + * - run vector retrieval globally and optionally document-scoped + * - fuse both result sets with RRF-style scoring + * - apply selection rules for list queries vs. sales-style queries + * - return either plain chunk texts or debug metadata + */ final readonly class NdjsonHybridRetriever implements RetrieverInterface { public function __construct( @@ -37,15 +50,27 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface // PUBLIC API // ========================================================= + /** + * Returns the final retrieval payload as plain text chunks. + * + * Behaviour: + * - loads active retrieval config + * - executes the full orchestration pipeline + * - if the route resolves to a catalog list, returns the catalog block only + * - otherwise returns the selected chunk texts + * @throws Exception + */ public function retrieve(string $prompt): array { $config = $this->requireConfig(); $result = $this->execute($prompt, $config, false); + // Catalog list responses bypass normal chunk retrieval completely. if ($result['catalogBlock'] !== null) { return [$result['catalogBlock']]; } + // No selected chunks means no usable retrieval result. if ($result['selectedChunkIds'] === []) { return []; } @@ -56,11 +81,23 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ); } + /** + * Returns a debug-friendly retrieval result with scoring/meta information. + * + * This method is used for inspection and tuning: + * - selected chunk ids + * - raw vector scores + * - fused RRF scores + * - intent / route information + * - threshold and list-query flags + * @throws Exception + */ public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array { $config = $config ?? $this->requireConfig(); $result = $this->execute($prompt, $config, true); + // For catalog list routes we expose a synthetic debug row. if ($result['catalogBlock'] !== null) { return [[ 'rank' => 1, @@ -86,6 +123,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface foreach ($result['selectedChunkIds'] as $chunkId) { + // Skip ids that could not be resolved to real chunk rows. if (!isset($result['rows'][$chunkId])) { continue; } @@ -114,6 +152,17 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface // CENTRAL ORCHESTRATION // ========================================================= + /** + * Central orchestration entrypoint. + * + * Pipeline: + * 1. Detect catalog entity and sales intent + * 2. Resolve route + * 3. If route is a catalog list route, try direct catalog output + * 4. Otherwise, run the normal hybrid retrieval core + * 5. Select final chunk ids depending on query type + * @throws Exception + */ private function execute( string $prompt, ModelGenerationConfig $config, @@ -125,6 +174,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $salesIntent = $this->detectSalesIntent($prompt); $route = $this->routeResolver->resolve($salesIntent, $entityLabel); + // Fast path: + // If the route explicitly asks for a catalog list and we have an entity label, + // we return a prebuilt catalog block instead of semantic chunk retrieval. if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) { $catalogBlock = $this->entityCatalogService->listByTerm($entityLabel); @@ -147,6 +199,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $core = $this->runCore($prompt, $config, $withScores, $salesIntent); + // No ranked chunks or no resolved rows means retrieval produced nothing usable. if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) { return [ 'route' => $route, @@ -162,6 +215,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ]; } + // Selection strategy depends on query type: + // - list queries prefer deduplicated chunks + // - sales queries prefer spread across docs / chunk distance $selectedChunkIds = $core['is_list_query'] ? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']) : $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']); @@ -184,6 +240,20 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface // CORE PIPELINE // ========================================================= + /** + * Executes the actual hybrid retrieval logic. + * + * Steps: + * - derive limits from config within hard safety caps + * - detect whether the prompt is a "list query" + * - clean and enrich the prompt + * - compute threshold + vector topK based on intent/query type + * - route query into candidate document ids via tag routing + * - run global and optional scoped vector search + * - fuse hits + * - resolve chunk ids to chunk rows + * @throws Exception + */ private function runCore( string $prompt, ModelGenerationConfig $config, @@ -197,9 +267,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $isListQuery = $this->intentLite->isListQuery($prompt); + // The prompt is normalized first, then enriched before retrieval. $cleanQuery = $this->queryCleaner->clean($prompt); $cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery); + // Empty cleaned query means retrieval would be meaningless. if ($cleanQuery === '') { return [ 'limit' => $limit, @@ -218,18 +290,22 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $vectorTopKBase ); + // Tag routing tries to narrow retrieval to relevant document ids. $candidateDocIds = $this->tagRouting->route($cleanQuery); $candidateDocIds = is_array($candidateDocIds) ? array_values(array_unique(array_filter($candidateDocIds, 'is_string'))) : []; + // Always run a global search. $globalHits = $this->vectorClient->search($cleanQuery, $topK); + // Optionally run a scoped search if tag routing yielded document candidates. $scopedHits = []; if ($candidateDocIds !== []) { $scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds); } + // Nothing found at all. if ($globalHits === [] && $scopedHits === []) { return [ 'limit' => $limit, @@ -242,6 +318,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ]; } + // Fuse global and scoped hits with optional scoped boost. $fused = $this->fuseHits( $globalHits, $scopedHits, @@ -253,10 +330,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $rrfScores = $fused['rrf_scores']; $rawScores = $fused['raw_scores']; + // Fallback: + // If all hits were filtered by threshold but global hits exist, + // derive a weak RRF ranking from the raw hit order. if ($rrfScores === [] && $globalHits !== []) { $rrfScores = $this->fallbackRrfFromHits( - $globalHits, - NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN + $globalHits ); } @@ -272,8 +351,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ]; } + // Highest fused score first. arsort($rrfScores); $rankedChunkIds = array_keys($rrfScores); + + // Resolve the ranking to actual NDJSON chunk rows. $rows = $this->lookup->findByChunkIds($rankedChunkIds); return [ @@ -291,21 +373,39 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface // SUPPORT // ========================================================= + /** + * Loads the active model generation config. + * + * Retrieval is not allowed to proceed without an active config. + */ private function requireConfig(): ModelGenerationConfig { $config = $this->configRepository->findActiveForModel(); if ($config === null) { - throw new \RuntimeException('No active ModelGenerationConfig found.'); + throw new RuntimeException('No active ModelGenerationConfig found.'); } return $config; } + /** + * Extracts the normalized sales intent string from the intent detector. + * + * Falls back to DISCOVERY when the detector payload is incomplete. + */ private function detectSalesIntent(string $prompt): string { $data = $this->salesIntentLite->detect($prompt); return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY); } + /** + * Computes retrieval threshold and vector topK. + * + * Rules: + * - objection/pricing intents are slightly stricter + * - list queries are allowed to retrieve a wider candidate set + * - all values are clamped to global hard limits + */ private function computeThresholdAndTopK( string $salesIntent, bool $isListQuery, @@ -333,6 +433,15 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return [$threshold, $topK]; } + /** + * Fuses multiple hit lists into one RRF-style score map. + * + * Notes: + * - only hits above threshold are considered + * - rank position within each hit list contributes to the final score + * - scoped hits can be boosted + * - raw scores are optionally captured for debug output + */ private function fuseHits( array $globalHits, array $scopedHits, @@ -351,18 +460,21 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface foreach ($hits as $hit) { + // Every hit must provide a chunk id and a numeric score. if (!isset($hit['chunk_id'], $hit['score'])) { continue; } $raw = (float)$hit['score']; + // Threshold is applied before rank fusion. if ($raw < $threshold) { continue; } $chunkId = (string)$hit['chunk_id']; + // Store the best raw score per chunk for debug inspection. if ($captureRaw) { $rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw); } @@ -370,10 +482,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $rank++; $rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank); + // Scoped result lists can get a slight relevance bonus. if ($boost) { $rrf *= 1.2; } + // Scores from multiple hit lists accumulate. $rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf; } }; @@ -387,7 +501,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface ]; } - private function fallbackRrfFromHits(array $hits, int $topN): array + /** + * Builds a fallback RRF ranking purely from hit order. + * + * Used when thresholding removed all fused candidates but + * the global hit list itself still exists. + */ + private function fallbackRrfFromHits(array $hits): array { $rrf = []; $rank = 0; @@ -401,7 +521,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $rank++; $rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank); - if ($rank >= $topN) { + if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) { break; } } @@ -409,6 +529,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $rrf; } + /** + * Selection strategy for list-style queries. + * + * Goal: + * - avoid near-identical chunks + * - prefer diverse list entries + * - stop once the configured limit is reached + */ private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array { $seen = []; @@ -425,6 +553,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface continue; } + // Deduplicate by normalized chunk text. $key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk))); if (isset($seen[$key])) { @@ -442,6 +571,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } + /** + * Selection strategy for sales-oriented queries. + * + * Goal: + * - avoid overloading the result with chunks from the same document + * - avoid chunks that are too close to each other in the same document + * - preserve top-ranked relevance while improving contextual spread + */ private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array { $out = []; @@ -457,14 +594,17 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $docId = $rows[$chunkId]['document_id'] ?? null; $chunkIndex = $rows[$chunkId]['chunk_index'] ?? null; + // Sales selection requires a valid document context. if (!is_string($docId)) { continue; } + // Limit how many chunks may come from the same document. if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) { continue; } + // Enforce a minimum distance between chunk positions of the same document. if (is_int($chunkIndex)) { foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) { if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) { @@ -490,6 +630,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return $out; } + /** + * Converts selected chunk ids into the final plain text result list. + */ private function collectTextsFromIds(array $chunkIds, array $rows): array { $out = []; diff --git a/src/Knowledge/Retrieval/QueryCleaner.php b/src/Knowledge/Retrieval/QueryCleaner.php index ef2942c..dbb465c 100644 --- a/src/Knowledge/Retrieval/QueryCleaner.php +++ b/src/Knowledge/Retrieval/QueryCleaner.php @@ -9,14 +9,14 @@ use App\Knowledge\StopWords; final class QueryCleaner { /** - * Bereinigt eine Query ausschließlich für Retrieval-Zwecke. + * Cleans a query strictly for retrieval purposes. * - * Wichtig: - * - Unicode-sicher - * - Zahlen bleiben erhalten - * - Negationen bleiben erhalten - * - Keine aggressive Token-Längen-Filterung - * - StopWords werden entfernt + * Important: + * - Unicode-safe + * - Numbers are preserved + * - Negations are preserved + * - No aggressive token-length filtering + * - Stop words are removed */ public function clean(string $query): string { @@ -24,23 +24,23 @@ final class QueryCleaner return ''; } - // 1. Unicode-sicher lowercase + // 1. Convert to lowercase in a Unicode-safe way $query = mb_strtolower($query, 'UTF-8'); - // 2. Bindestriche & Slashes als Worttrenner behandeln + // 2. Treat hyphens and slashes as word separators $query = str_replace(['-', '/'], ' ', $query); - // 3. Sonderzeichen entfernen, aber: - // - Buchstaben behalten - // - Zahlen behalten - // - Umlaute behalten + // 3. Remove special characters, but keep: + // - letters + // - numbers + // - other Unicode letters $query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query); if ($query === null) { return ''; } - // 4. Mehrfache Whitespaces normalisieren + // 4. Normalize multiple whitespace characters $query = preg_replace('/\s+/u', ' ', $query); $query = trim($query); @@ -48,7 +48,7 @@ final class QueryCleaner return ''; } - // 5. Tokenisierung + // 5. Tokenize the query $tokens = preg_split('/\s+/u', $query); if ($tokens === false) { @@ -65,7 +65,7 @@ final class QueryCleaner continue; } - // StopWords entfernen + // Remove stop words if (StopWords::isStopWord($token)) { continue; } diff --git a/src/Knowledge/Retrieval/QueryEnricher.php b/src/Knowledge/Retrieval/QueryEnricher.php index 1bac354..bf00664 100644 --- a/src/Knowledge/Retrieval/QueryEnricher.php +++ b/src/Knowledge/Retrieval/QueryEnricher.php @@ -4,11 +4,25 @@ declare(strict_types=1); namespace App\Knowledge\Retrieval; -final class QueryEnricher +use App\Config\QueryEnricherConfig; + +final readonly class QueryEnricher { + public function __construct( + private QueryEnricherConfig $config + ) + { + } + + /** + * Enriches the query with mapped counterpart terms. + * + * Example: + * - input: "water hardness device" + * - output: "water hardness device | Synonyms: residual hardness, model" + */ public function enrichPrompt(string $query): string { - // Return early if the input is empty or contains only whitespace. if (trim($query) === '') { return ''; } @@ -19,19 +33,19 @@ final class QueryEnricher // Normalize the query for case-insensitive matching. $normalizedQuery = $this->normalize($query); - // Expect an associative array like: + // Expected format: // [ - // 'hose' => 'jeans', - // 'jacke' => 'mantel', + // 'trousers' => 'jeans', + // 'jacket' => 'coat', // ] - $mapping = $this->enrichQueryList(); + $mapping = $this->config->getEnrichQueryList(); // Build a bidirectional lookup table: // key -> value // value -> key $lookup = $this->buildBidirectionalLookup($mapping); - // Split the query into searchable words/tokens. + // Split the query into searchable tokens. $tokens = $this->tokenize($normalizedQuery); $matches = []; @@ -46,17 +60,17 @@ final class QueryEnricher // Remove duplicates while preserving order. $matches = array_values(array_unique($matches)); - // If nothing was found, return the original query unchanged. + // If no matches were found, return the original query unchanged. if ($matches === []) { return $originalQuery; } - // Append the matched counterpart terms to the original prompt. - return $originalQuery . " | Pseudonyme: " . implode(', ', $matches); + // Append the matched counterpart terms to the original query. + return $originalQuery . ' | Synonyms: ' . implode(', ', $matches); } /** - * Normalize a string for case-insensitive comparison. + * Normalizes a string for case-insensitive comparison. */ private function normalize(string $value): string { @@ -64,8 +78,9 @@ final class QueryEnricher } /** - * Tokenize the query into words. - * Splits on everything that is not a letter or number. + * Tokenizes the query into words. + * + * Splits on every character that is not a letter or number. */ private function tokenize(string $value): array { @@ -73,20 +88,20 @@ final class QueryEnricher } /** - * Build a lookup table that works in both directions. + * Builds a lookup table that works in both directions. * * Example: * [ - * 'hose' => 'jeans', - * 'jacke' => 'mantel', + * 'trousers' => 'jeans', + * 'jacket' => 'coat', * ] * * becomes: * [ - * 'hose' => 'jeans', - * 'jeans' => 'hose', - * 'jacke' => 'mantel', - * 'mantel' => 'jacke', + * 'trousers' => 'jeans', + * 'jeans' => 'trousers', + * 'jacket' => 'coat', + * 'coat' => 'jacket', * ] */ private function buildBidirectionalLookup(array $mapping): array @@ -94,8 +109,8 @@ final class QueryEnricher $lookup = []; foreach ($mapping as $key => $value) { - $key = trim((string)$key); - $value = trim((string)$value); + $key = trim((string) $key); + $value = trim((string) $value); // Skip incomplete pairs. if ($key === '' || $value === '') { @@ -114,18 +129,4 @@ final class QueryEnricher return $lookup; } - - public function enrichQueryList(): array - { - return [ - 'Wasserhärte' => "Resthärte", - 'Gerät' => 'Modell', - 'Indikator' => 'Chemie', - 'Seminar' => 'Webinar', - 'Schulung' => 'Seminar', - 'Indikatoren' => 'Indikator', - 'Wasserhärte-Grenzwert'=>'Resthärte', - 'Resthärte-Grenzwert'=>'Wasserhärte' - ]; - } } \ No newline at end of file