add comments
This commit is contained in:
@@ -6,24 +6,27 @@ class AgentRunnerConfig
|
|||||||
{
|
{
|
||||||
public function getShopPrompt($prompt): string
|
public function getShopPrompt($prompt): string
|
||||||
{
|
{
|
||||||
|
/**
|
||||||
|
* Erzeuge aus dem folgenden Nutzereingabetext einen kurzen Suchtext für die Shopware-6-Suche. Regeln: - Gib nur den finalen Suchtext aus. - erstelle immer die singular form von den relevanten Suchbegriffen - Keine Einleitung, keine Erklärung, keine Anführungszeichen. - Verwende nur die shop relevanten Suchbegriffe für eine Shopsuche aus dem Nutzereingabetext. - Maximal 6 Suchbegriffe, besser weniger. - Entferne Füllwörter, Höflichkeitsformen und irrelevante Wörter. - Erhalte Produktnamen, Marken, Modellnummern und zusammengesetzte Begriffe exakt, wenn sie relevant sind. - Zahlen, die zu einem Produktnamen oder Modell gehören (zb Indikator 300 oder Testomat 808 oder Testomat 2000), müssen erhalten bleiben. - Trenne die Begriffe nur durch Leerzeichen. Ausgabeformat: Keyword1 Keyword2 Keyword3
|
||||||
|
*/
|
||||||
return '
|
return '
|
||||||
Erzeuge aus dem folgenden Nutzereingabetext einen kurzen Suchtext für die Shopware-6-Suche.
|
Generate a short search query for Shopware 6 from the following user input text.
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- Output only the final search query.
|
||||||
|
- Always convert relevant search terms to their singular form.
|
||||||
|
- No introduction, no explanation, no quotation marks.
|
||||||
|
- Use only shop-relevant search terms from the user input for a shop search.
|
||||||
|
- Maximum 6 search terms, preferably fewer.
|
||||||
|
- Remove filler words, polite phrases, and irrelevant words.
|
||||||
|
- Preserve product names, brands, model numbers, and compound terms exactly if they are relevant.
|
||||||
|
- Numbers that belong to a product name or model must be preserved (e.g. Indikator 300, Testomat 808, Testomat 2000).
|
||||||
|
- Separate terms using spaces only.
|
||||||
|
|
||||||
|
Output format:
|
||||||
|
Keyword1 Keyword2 Keyword3
|
||||||
|
|
||||||
Regeln:
|
input text: ' . $prompt . '
|
||||||
- Gib nur den finalen Suchtext aus.
|
';
|
||||||
- erstelle immer die singular form von den relevanten Suchbegriffen
|
|
||||||
- Keine Einleitung, keine Erklärung, keine Anführungszeichen.
|
|
||||||
- Verwende nur die shop relevanten Suchbegriffe für eine Shopsuche aus dem Nutzereingabetext.
|
|
||||||
- Maximal 6 Suchbegriffe, besser weniger.
|
|
||||||
- Entferne Füllwörter, Höflichkeitsformen und irrelevante Wörter.
|
|
||||||
- Erhalte Produktnamen, Marken, Modellnummern und zusammengesetzte Begriffe exakt, wenn sie relevant sind.
|
|
||||||
- Zahlen, die zu einem Produktnamen oder Modell gehören (zb Indikator 300 oder Testomat 808), müssen erhalten bleiben.
|
|
||||||
- Trenne die Begriffe nur durch Leerzeichen.
|
|
||||||
|
|
||||||
Ausgabeformat:
|
|
||||||
Keyword1 Keyword2 Keyword3
|
|
||||||
|
|
||||||
Nutzereingabetext: ' . $prompt . '
|
|
||||||
';
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
20
src/Config/QueryEnricherConfig.php
Normal file
20
src/Config/QueryEnricherConfig.php
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Config;
|
||||||
|
|
||||||
|
class QueryEnricherConfig
|
||||||
|
{
|
||||||
|
public function getEnrichQueryList(): array
|
||||||
|
{
|
||||||
|
return [
|
||||||
|
'Wasserhärte' => 'Resthärte',
|
||||||
|
'Gerät' => 'Modell',
|
||||||
|
'Indikator' => 'Chemie',
|
||||||
|
'Seminar' => 'Webinar',
|
||||||
|
'Schulung' => 'Seminar',
|
||||||
|
'Indikatoren' => 'Indikator',
|
||||||
|
'Wasserhärte-Grenzwert' => 'Resthärte',
|
||||||
|
'Resthärte-Grenzwert' => 'Wasserhärte',
|
||||||
|
];
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -24,11 +24,11 @@ use Symfony\Component\Routing\Annotation\Route;
|
|||||||
* - Client identity is resolved exclusively via ClientIdResolver
|
* - Client identity is resolved exclusively via ClientIdResolver
|
||||||
* - No user identifiers are accepted from the request
|
* - No user identifiers are accepted from the request
|
||||||
*/
|
*/
|
||||||
final class HistoryController
|
final readonly class HistoryController
|
||||||
{
|
{
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly ContextService $contextService,
|
private ContextService $contextService,
|
||||||
private readonly ClientIdResolver $clientIdResolver,
|
private ClientIdResolver $clientIdResolver,
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -6,12 +6,11 @@ declare(strict_types=1);
|
|||||||
namespace App\Knowledge\Retrieval;
|
namespace App\Knowledge\Retrieval;
|
||||||
|
|
||||||
use App\Knowledge\ChunkManager;
|
use App\Knowledge\ChunkManager;
|
||||||
use Symfony\Component\Uid\Uuid;
|
|
||||||
|
|
||||||
final class NdjsonChunkLookup
|
final readonly class NdjsonChunkLookup
|
||||||
{
|
{
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly ChunkManager $chunkManager
|
private ChunkManager $chunkManager
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@@ -33,7 +32,6 @@ final class NdjsonChunkLookup
|
|||||||
|
|
||||||
$found[$id] = $row;
|
$found[$id] = $row;
|
||||||
|
|
||||||
// Early exit sobald alle gefunden
|
|
||||||
if (\count($found) === \count($wanted)) {
|
if (\count($found) === \count($wanted)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,7 +14,20 @@ use App\Repository\ModelGenerationConfigRepository;
|
|||||||
use App\Routing\IntentRouteResolver;
|
use App\Routing\IntentRouteResolver;
|
||||||
use App\Tag\TagRoutingService;
|
use App\Tag\TagRoutingService;
|
||||||
use App\Vector\VectorSearchClient;
|
use App\Vector\VectorSearchClient;
|
||||||
|
use Doctrine\DBAL\Exception;
|
||||||
|
use RuntimeException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Hybrid retriever for NDJSON-based knowledge chunks.
|
||||||
|
*
|
||||||
|
* Main responsibilities:
|
||||||
|
* - detect high-level request intent
|
||||||
|
* - optionally short-circuit to catalog list output
|
||||||
|
* - run vector retrieval globally and optionally document-scoped
|
||||||
|
* - fuse both result sets with RRF-style scoring
|
||||||
|
* - apply selection rules for list queries vs. sales-style queries
|
||||||
|
* - return either plain chunk texts or debug metadata
|
||||||
|
*/
|
||||||
final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||||
{
|
{
|
||||||
public function __construct(
|
public function __construct(
|
||||||
@@ -37,15 +50,27 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
// PUBLIC API
|
// PUBLIC API
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the final retrieval payload as plain text chunks.
|
||||||
|
*
|
||||||
|
* Behaviour:
|
||||||
|
* - loads active retrieval config
|
||||||
|
* - executes the full orchestration pipeline
|
||||||
|
* - if the route resolves to a catalog list, returns the catalog block only
|
||||||
|
* - otherwise returns the selected chunk texts
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
public function retrieve(string $prompt): array
|
public function retrieve(string $prompt): array
|
||||||
{
|
{
|
||||||
$config = $this->requireConfig();
|
$config = $this->requireConfig();
|
||||||
$result = $this->execute($prompt, $config, false);
|
$result = $this->execute($prompt, $config, false);
|
||||||
|
|
||||||
|
// Catalog list responses bypass normal chunk retrieval completely.
|
||||||
if ($result['catalogBlock'] !== null) {
|
if ($result['catalogBlock'] !== null) {
|
||||||
return [$result['catalogBlock']];
|
return [$result['catalogBlock']];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// No selected chunks means no usable retrieval result.
|
||||||
if ($result['selectedChunkIds'] === []) {
|
if ($result['selectedChunkIds'] === []) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
@@ -56,11 +81,23 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a debug-friendly retrieval result with scoring/meta information.
|
||||||
|
*
|
||||||
|
* This method is used for inspection and tuning:
|
||||||
|
* - selected chunk ids
|
||||||
|
* - raw vector scores
|
||||||
|
* - fused RRF scores
|
||||||
|
* - intent / route information
|
||||||
|
* - threshold and list-query flags
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
|
public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
|
||||||
{
|
{
|
||||||
$config = $config ?? $this->requireConfig();
|
$config = $config ?? $this->requireConfig();
|
||||||
$result = $this->execute($prompt, $config, true);
|
$result = $this->execute($prompt, $config, true);
|
||||||
|
|
||||||
|
// For catalog list routes we expose a synthetic debug row.
|
||||||
if ($result['catalogBlock'] !== null) {
|
if ($result['catalogBlock'] !== null) {
|
||||||
return [[
|
return [[
|
||||||
'rank' => 1,
|
'rank' => 1,
|
||||||
@@ -86,6 +123,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
|
|
||||||
foreach ($result['selectedChunkIds'] as $chunkId) {
|
foreach ($result['selectedChunkIds'] as $chunkId) {
|
||||||
|
|
||||||
|
// Skip ids that could not be resolved to real chunk rows.
|
||||||
if (!isset($result['rows'][$chunkId])) {
|
if (!isset($result['rows'][$chunkId])) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -114,6 +152,17 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
// CENTRAL ORCHESTRATION
|
// CENTRAL ORCHESTRATION
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Central orchestration entrypoint.
|
||||||
|
*
|
||||||
|
* Pipeline:
|
||||||
|
* 1. Detect catalog entity and sales intent
|
||||||
|
* 2. Resolve route
|
||||||
|
* 3. If route is a catalog list route, try direct catalog output
|
||||||
|
* 4. Otherwise, run the normal hybrid retrieval core
|
||||||
|
* 5. Select final chunk ids depending on query type
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
private function execute(
|
private function execute(
|
||||||
string $prompt,
|
string $prompt,
|
||||||
ModelGenerationConfig $config,
|
ModelGenerationConfig $config,
|
||||||
@@ -125,6 +174,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
$salesIntent = $this->detectSalesIntent($prompt);
|
$salesIntent = $this->detectSalesIntent($prompt);
|
||||||
$route = $this->routeResolver->resolve($salesIntent, $entityLabel);
|
$route = $this->routeResolver->resolve($salesIntent, $entityLabel);
|
||||||
|
|
||||||
|
// Fast path:
|
||||||
|
// If the route explicitly asks for a catalog list and we have an entity label,
|
||||||
|
// we return a prebuilt catalog block instead of semantic chunk retrieval.
|
||||||
if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
|
if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
|
||||||
|
|
||||||
$catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);
|
$catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);
|
||||||
@@ -147,6 +199,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
|
|
||||||
$core = $this->runCore($prompt, $config, $withScores, $salesIntent);
|
$core = $this->runCore($prompt, $config, $withScores, $salesIntent);
|
||||||
|
|
||||||
|
// No ranked chunks or no resolved rows means retrieval produced nothing usable.
|
||||||
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
|
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
|
||||||
return [
|
return [
|
||||||
'route' => $route,
|
'route' => $route,
|
||||||
@@ -162,6 +215,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Selection strategy depends on query type:
|
||||||
|
// - list queries prefer deduplicated chunks
|
||||||
|
// - sales queries prefer spread across docs / chunk distance
|
||||||
$selectedChunkIds = $core['is_list_query']
|
$selectedChunkIds = $core['is_list_query']
|
||||||
? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
|
? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
|
||||||
: $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
|
: $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
|
||||||
@@ -184,6 +240,20 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
// CORE PIPELINE
|
// CORE PIPELINE
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Executes the actual hybrid retrieval logic.
|
||||||
|
*
|
||||||
|
* Steps:
|
||||||
|
* - derive limits from config within hard safety caps
|
||||||
|
* - detect whether the prompt is a "list query"
|
||||||
|
* - clean and enrich the prompt
|
||||||
|
* - compute threshold + vector topK based on intent/query type
|
||||||
|
* - route query into candidate document ids via tag routing
|
||||||
|
* - run global and optional scoped vector search
|
||||||
|
* - fuse hits
|
||||||
|
* - resolve chunk ids to chunk rows
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
private function runCore(
|
private function runCore(
|
||||||
string $prompt,
|
string $prompt,
|
||||||
ModelGenerationConfig $config,
|
ModelGenerationConfig $config,
|
||||||
@@ -197,9 +267,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
|
|
||||||
$isListQuery = $this->intentLite->isListQuery($prompt);
|
$isListQuery = $this->intentLite->isListQuery($prompt);
|
||||||
|
|
||||||
|
// The prompt is normalized first, then enriched before retrieval.
|
||||||
$cleanQuery = $this->queryCleaner->clean($prompt);
|
$cleanQuery = $this->queryCleaner->clean($prompt);
|
||||||
$cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery);
|
$cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery);
|
||||||
|
|
||||||
|
// Empty cleaned query means retrieval would be meaningless.
|
||||||
if ($cleanQuery === '') {
|
if ($cleanQuery === '') {
|
||||||
return [
|
return [
|
||||||
'limit' => $limit,
|
'limit' => $limit,
|
||||||
@@ -218,18 +290,22 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
$vectorTopKBase
|
$vectorTopKBase
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Tag routing tries to narrow retrieval to relevant document ids.
|
||||||
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
||||||
$candidateDocIds = is_array($candidateDocIds)
|
$candidateDocIds = is_array($candidateDocIds)
|
||||||
? array_values(array_unique(array_filter($candidateDocIds, 'is_string')))
|
? array_values(array_unique(array_filter($candidateDocIds, 'is_string')))
|
||||||
: [];
|
: [];
|
||||||
|
|
||||||
|
// Always run a global search.
|
||||||
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
|
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
|
||||||
|
|
||||||
|
// Optionally run a scoped search if tag routing yielded document candidates.
|
||||||
$scopedHits = [];
|
$scopedHits = [];
|
||||||
if ($candidateDocIds !== []) {
|
if ($candidateDocIds !== []) {
|
||||||
$scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
|
$scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Nothing found at all.
|
||||||
if ($globalHits === [] && $scopedHits === []) {
|
if ($globalHits === [] && $scopedHits === []) {
|
||||||
return [
|
return [
|
||||||
'limit' => $limit,
|
'limit' => $limit,
|
||||||
@@ -242,6 +318,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fuse global and scoped hits with optional scoped boost.
|
||||||
$fused = $this->fuseHits(
|
$fused = $this->fuseHits(
|
||||||
$globalHits,
|
$globalHits,
|
||||||
$scopedHits,
|
$scopedHits,
|
||||||
@@ -253,10 +330,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
$rrfScores = $fused['rrf_scores'];
|
$rrfScores = $fused['rrf_scores'];
|
||||||
$rawScores = $fused['raw_scores'];
|
$rawScores = $fused['raw_scores'];
|
||||||
|
|
||||||
|
// Fallback:
|
||||||
|
// If all hits were filtered by threshold but global hits exist,
|
||||||
|
// derive a weak RRF ranking from the raw hit order.
|
||||||
if ($rrfScores === [] && $globalHits !== []) {
|
if ($rrfScores === [] && $globalHits !== []) {
|
||||||
$rrfScores = $this->fallbackRrfFromHits(
|
$rrfScores = $this->fallbackRrfFromHits(
|
||||||
$globalHits,
|
$globalHits
|
||||||
NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -272,8 +351,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Highest fused score first.
|
||||||
arsort($rrfScores);
|
arsort($rrfScores);
|
||||||
$rankedChunkIds = array_keys($rrfScores);
|
$rankedChunkIds = array_keys($rrfScores);
|
||||||
|
|
||||||
|
// Resolve the ranking to actual NDJSON chunk rows.
|
||||||
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
|
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
|
||||||
|
|
||||||
return [
|
return [
|
||||||
@@ -291,21 +373,39 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
// SUPPORT
|
// SUPPORT
|
||||||
// =========================================================
|
// =========================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads the active model generation config.
|
||||||
|
*
|
||||||
|
* Retrieval is not allowed to proceed without an active config.
|
||||||
|
*/
|
||||||
private function requireConfig(): ModelGenerationConfig
|
private function requireConfig(): ModelGenerationConfig
|
||||||
{
|
{
|
||||||
$config = $this->configRepository->findActiveForModel();
|
$config = $this->configRepository->findActiveForModel();
|
||||||
if ($config === null) {
|
if ($config === null) {
|
||||||
throw new \RuntimeException('No active ModelGenerationConfig found.');
|
throw new RuntimeException('No active ModelGenerationConfig found.');
|
||||||
}
|
}
|
||||||
return $config;
|
return $config;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts the normalized sales intent string from the intent detector.
|
||||||
|
*
|
||||||
|
* Falls back to DISCOVERY when the detector payload is incomplete.
|
||||||
|
*/
|
||||||
private function detectSalesIntent(string $prompt): string
|
private function detectSalesIntent(string $prompt): string
|
||||||
{
|
{
|
||||||
$data = $this->salesIntentLite->detect($prompt);
|
$data = $this->salesIntentLite->detect($prompt);
|
||||||
return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY);
|
return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes retrieval threshold and vector topK.
|
||||||
|
*
|
||||||
|
* Rules:
|
||||||
|
* - objection/pricing intents are slightly stricter
|
||||||
|
* - list queries are allowed to retrieve a wider candidate set
|
||||||
|
* - all values are clamped to global hard limits
|
||||||
|
*/
|
||||||
private function computeThresholdAndTopK(
|
private function computeThresholdAndTopK(
|
||||||
string $salesIntent,
|
string $salesIntent,
|
||||||
bool $isListQuery,
|
bool $isListQuery,
|
||||||
@@ -333,6 +433,15 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return [$threshold, $topK];
|
return [$threshold, $topK];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fuses multiple hit lists into one RRF-style score map.
|
||||||
|
*
|
||||||
|
* Notes:
|
||||||
|
* - only hits above threshold are considered
|
||||||
|
* - rank position within each hit list contributes to the final score
|
||||||
|
* - scoped hits can be boosted
|
||||||
|
* - raw scores are optionally captured for debug output
|
||||||
|
*/
|
||||||
private function fuseHits(
|
private function fuseHits(
|
||||||
array $globalHits,
|
array $globalHits,
|
||||||
array $scopedHits,
|
array $scopedHits,
|
||||||
@@ -351,18 +460,21 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
|
|
||||||
foreach ($hits as $hit) {
|
foreach ($hits as $hit) {
|
||||||
|
|
||||||
|
// Every hit must provide a chunk id and a numeric score.
|
||||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
if (!isset($hit['chunk_id'], $hit['score'])) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$raw = (float)$hit['score'];
|
$raw = (float)$hit['score'];
|
||||||
|
|
||||||
|
// Threshold is applied before rank fusion.
|
||||||
if ($raw < $threshold) {
|
if ($raw < $threshold) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$chunkId = (string)$hit['chunk_id'];
|
$chunkId = (string)$hit['chunk_id'];
|
||||||
|
|
||||||
|
// Store the best raw score per chunk for debug inspection.
|
||||||
if ($captureRaw) {
|
if ($captureRaw) {
|
||||||
$rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw);
|
$rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw);
|
||||||
}
|
}
|
||||||
@@ -370,10 +482,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
$rank++;
|
$rank++;
|
||||||
$rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
|
$rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
|
||||||
|
|
||||||
|
// Scoped result lists can get a slight relevance bonus.
|
||||||
if ($boost) {
|
if ($boost) {
|
||||||
$rrf *= 1.2;
|
$rrf *= 1.2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Scores from multiple hit lists accumulate.
|
||||||
$rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
|
$rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -387,7 +501,13 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
private function fallbackRrfFromHits(array $hits, int $topN): array
|
/**
|
||||||
|
* Builds a fallback RRF ranking purely from hit order.
|
||||||
|
*
|
||||||
|
* Used when thresholding removed all fused candidates but
|
||||||
|
* the global hit list itself still exists.
|
||||||
|
*/
|
||||||
|
private function fallbackRrfFromHits(array $hits): array
|
||||||
{
|
{
|
||||||
$rrf = [];
|
$rrf = [];
|
||||||
$rank = 0;
|
$rank = 0;
|
||||||
@@ -401,7 +521,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
$rank++;
|
$rank++;
|
||||||
$rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
|
$rrf[(string)$hit['chunk_id']] = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
|
||||||
|
|
||||||
if ($rank >= $topN) {
|
if ($rank >= NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -409,6 +529,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return $rrf;
|
return $rrf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Selection strategy for list-style queries.
|
||||||
|
*
|
||||||
|
* Goal:
|
||||||
|
* - avoid near-identical chunks
|
||||||
|
* - prefer diverse list entries
|
||||||
|
* - stop once the configured limit is reached
|
||||||
|
*/
|
||||||
private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
|
private function selectListChunkIds(array $chunkIds, array $rows, int $limit): array
|
||||||
{
|
{
|
||||||
$seen = [];
|
$seen = [];
|
||||||
@@ -425,6 +553,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Deduplicate by normalized chunk text.
|
||||||
$key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)));
|
$key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)));
|
||||||
|
|
||||||
if (isset($seen[$key])) {
|
if (isset($seen[$key])) {
|
||||||
@@ -442,6 +571,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return $out;
|
return $out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Selection strategy for sales-oriented queries.
|
||||||
|
*
|
||||||
|
* Goal:
|
||||||
|
* - avoid overloading the result with chunks from the same document
|
||||||
|
* - avoid chunks that are too close to each other in the same document
|
||||||
|
* - preserve top-ranked relevance while improving contextual spread
|
||||||
|
*/
|
||||||
private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
|
private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
|
||||||
{
|
{
|
||||||
$out = [];
|
$out = [];
|
||||||
@@ -457,14 +594,17 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||||
$chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
|
$chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
|
||||||
|
|
||||||
|
// Sales selection requires a valid document context.
|
||||||
if (!is_string($docId)) {
|
if (!is_string($docId)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Limit how many chunks may come from the same document.
|
||||||
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
|
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Enforce a minimum distance between chunk positions of the same document.
|
||||||
if (is_int($chunkIndex)) {
|
if (is_int($chunkIndex)) {
|
||||||
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
|
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
|
||||||
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
|
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
|
||||||
@@ -490,6 +630,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return $out;
|
return $out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts selected chunk ids into the final plain text result list.
|
||||||
|
*/
|
||||||
private function collectTextsFromIds(array $chunkIds, array $rows): array
|
private function collectTextsFromIds(array $chunkIds, array $rows): array
|
||||||
{
|
{
|
||||||
$out = [];
|
$out = [];
|
||||||
|
|||||||
@@ -9,14 +9,14 @@ use App\Knowledge\StopWords;
|
|||||||
final class QueryCleaner
|
final class QueryCleaner
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* Bereinigt eine Query ausschließlich für Retrieval-Zwecke.
|
* Cleans a query strictly for retrieval purposes.
|
||||||
*
|
*
|
||||||
* Wichtig:
|
* Important:
|
||||||
* - Unicode-sicher
|
* - Unicode-safe
|
||||||
* - Zahlen bleiben erhalten
|
* - Numbers are preserved
|
||||||
* - Negationen bleiben erhalten
|
* - Negations are preserved
|
||||||
* - Keine aggressive Token-Längen-Filterung
|
* - No aggressive token-length filtering
|
||||||
* - StopWords werden entfernt
|
* - Stop words are removed
|
||||||
*/
|
*/
|
||||||
public function clean(string $query): string
|
public function clean(string $query): string
|
||||||
{
|
{
|
||||||
@@ -24,23 +24,23 @@ final class QueryCleaner
|
|||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
// 1. Unicode-sicher lowercase
|
// 1. Convert to lowercase in a Unicode-safe way
|
||||||
$query = mb_strtolower($query, 'UTF-8');
|
$query = mb_strtolower($query, 'UTF-8');
|
||||||
|
|
||||||
// 2. Bindestriche & Slashes als Worttrenner behandeln
|
// 2. Treat hyphens and slashes as word separators
|
||||||
$query = str_replace(['-', '/'], ' ', $query);
|
$query = str_replace(['-', '/'], ' ', $query);
|
||||||
|
|
||||||
// 3. Sonderzeichen entfernen, aber:
|
// 3. Remove special characters, but keep:
|
||||||
// - Buchstaben behalten
|
// - letters
|
||||||
// - Zahlen behalten
|
// - numbers
|
||||||
// - Umlaute behalten
|
// - other Unicode letters
|
||||||
$query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query);
|
$query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query);
|
||||||
|
|
||||||
if ($query === null) {
|
if ($query === null) {
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
// 4. Mehrfache Whitespaces normalisieren
|
// 4. Normalize multiple whitespace characters
|
||||||
$query = preg_replace('/\s+/u', ' ', $query);
|
$query = preg_replace('/\s+/u', ' ', $query);
|
||||||
$query = trim($query);
|
$query = trim($query);
|
||||||
|
|
||||||
@@ -48,7 +48,7 @@ final class QueryCleaner
|
|||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
// 5. Tokenisierung
|
// 5. Tokenize the query
|
||||||
$tokens = preg_split('/\s+/u', $query);
|
$tokens = preg_split('/\s+/u', $query);
|
||||||
|
|
||||||
if ($tokens === false) {
|
if ($tokens === false) {
|
||||||
@@ -65,7 +65,7 @@ final class QueryCleaner
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// StopWords entfernen
|
// Remove stop words
|
||||||
if (StopWords::isStopWord($token)) {
|
if (StopWords::isStopWord($token)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,11 +4,25 @@ declare(strict_types=1);
|
|||||||
|
|
||||||
namespace App\Knowledge\Retrieval;
|
namespace App\Knowledge\Retrieval;
|
||||||
|
|
||||||
final class QueryEnricher
|
use App\Config\QueryEnricherConfig;
|
||||||
|
|
||||||
|
final readonly class QueryEnricher
|
||||||
{
|
{
|
||||||
|
public function __construct(
|
||||||
|
private QueryEnricherConfig $config
|
||||||
|
)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enriches the query with mapped counterpart terms.
|
||||||
|
*
|
||||||
|
* Example:
|
||||||
|
* - input: "water hardness device"
|
||||||
|
* - output: "water hardness device | Synonyms: residual hardness, model"
|
||||||
|
*/
|
||||||
public function enrichPrompt(string $query): string
|
public function enrichPrompt(string $query): string
|
||||||
{
|
{
|
||||||
// Return early if the input is empty or contains only whitespace.
|
|
||||||
if (trim($query) === '') {
|
if (trim($query) === '') {
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
@@ -19,19 +33,19 @@ final class QueryEnricher
|
|||||||
// Normalize the query for case-insensitive matching.
|
// Normalize the query for case-insensitive matching.
|
||||||
$normalizedQuery = $this->normalize($query);
|
$normalizedQuery = $this->normalize($query);
|
||||||
|
|
||||||
// Expect an associative array like:
|
// Expected format:
|
||||||
// [
|
// [
|
||||||
// 'hose' => 'jeans',
|
// 'trousers' => 'jeans',
|
||||||
// 'jacke' => 'mantel',
|
// 'jacket' => 'coat',
|
||||||
// ]
|
// ]
|
||||||
$mapping = $this->enrichQueryList();
|
$mapping = $this->config->getEnrichQueryList();
|
||||||
|
|
||||||
// Build a bidirectional lookup table:
|
// Build a bidirectional lookup table:
|
||||||
// key -> value
|
// key -> value
|
||||||
// value -> key
|
// value -> key
|
||||||
$lookup = $this->buildBidirectionalLookup($mapping);
|
$lookup = $this->buildBidirectionalLookup($mapping);
|
||||||
|
|
||||||
// Split the query into searchable words/tokens.
|
// Split the query into searchable tokens.
|
||||||
$tokens = $this->tokenize($normalizedQuery);
|
$tokens = $this->tokenize($normalizedQuery);
|
||||||
|
|
||||||
$matches = [];
|
$matches = [];
|
||||||
@@ -46,17 +60,17 @@ final class QueryEnricher
|
|||||||
// Remove duplicates while preserving order.
|
// Remove duplicates while preserving order.
|
||||||
$matches = array_values(array_unique($matches));
|
$matches = array_values(array_unique($matches));
|
||||||
|
|
||||||
// If nothing was found, return the original query unchanged.
|
// If no matches were found, return the original query unchanged.
|
||||||
if ($matches === []) {
|
if ($matches === []) {
|
||||||
return $originalQuery;
|
return $originalQuery;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Append the matched counterpart terms to the original prompt.
|
// Append the matched counterpart terms to the original query.
|
||||||
return $originalQuery . " | Pseudonyme: " . implode(', ', $matches);
|
return $originalQuery . ' | Synonyms: ' . implode(', ', $matches);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Normalize a string for case-insensitive comparison.
|
* Normalizes a string for case-insensitive comparison.
|
||||||
*/
|
*/
|
||||||
private function normalize(string $value): string
|
private function normalize(string $value): string
|
||||||
{
|
{
|
||||||
@@ -64,8 +78,9 @@ final class QueryEnricher
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tokenize the query into words.
|
* Tokenizes the query into words.
|
||||||
* Splits on everything that is not a letter or number.
|
*
|
||||||
|
* Splits on every character that is not a letter or number.
|
||||||
*/
|
*/
|
||||||
private function tokenize(string $value): array
|
private function tokenize(string $value): array
|
||||||
{
|
{
|
||||||
@@ -73,20 +88,20 @@ final class QueryEnricher
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Build a lookup table that works in both directions.
|
* Builds a lookup table that works in both directions.
|
||||||
*
|
*
|
||||||
* Example:
|
* Example:
|
||||||
* [
|
* [
|
||||||
* 'hose' => 'jeans',
|
* 'trousers' => 'jeans',
|
||||||
* 'jacke' => 'mantel',
|
* 'jacket' => 'coat',
|
||||||
* ]
|
* ]
|
||||||
*
|
*
|
||||||
* becomes:
|
* becomes:
|
||||||
* [
|
* [
|
||||||
* 'hose' => 'jeans',
|
* 'trousers' => 'jeans',
|
||||||
* 'jeans' => 'hose',
|
* 'jeans' => 'trousers',
|
||||||
* 'jacke' => 'mantel',
|
* 'jacket' => 'coat',
|
||||||
* 'mantel' => 'jacke',
|
* 'coat' => 'jacket',
|
||||||
* ]
|
* ]
|
||||||
*/
|
*/
|
||||||
private function buildBidirectionalLookup(array $mapping): array
|
private function buildBidirectionalLookup(array $mapping): array
|
||||||
@@ -94,8 +109,8 @@ final class QueryEnricher
|
|||||||
$lookup = [];
|
$lookup = [];
|
||||||
|
|
||||||
foreach ($mapping as $key => $value) {
|
foreach ($mapping as $key => $value) {
|
||||||
$key = trim((string)$key);
|
$key = trim((string) $key);
|
||||||
$value = trim((string)$value);
|
$value = trim((string) $value);
|
||||||
|
|
||||||
// Skip incomplete pairs.
|
// Skip incomplete pairs.
|
||||||
if ($key === '' || $value === '') {
|
if ($key === '' || $value === '') {
|
||||||
@@ -114,18 +129,4 @@ final class QueryEnricher
|
|||||||
|
|
||||||
return $lookup;
|
return $lookup;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function enrichQueryList(): array
|
|
||||||
{
|
|
||||||
return [
|
|
||||||
'Wasserhärte' => "Resthärte",
|
|
||||||
'Gerät' => 'Modell',
|
|
||||||
'Indikator' => 'Chemie',
|
|
||||||
'Seminar' => 'Webinar',
|
|
||||||
'Schulung' => 'Seminar',
|
|
||||||
'Indikatoren' => 'Indikator',
|
|
||||||
'Wasserhärte-Grenzwert'=>'Resthärte',
|
|
||||||
'Resthärte-Grenzwert'=>'Wasserhärte'
|
|
||||||
];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user