optimize as sales rag

This commit is contained in:
team2
2026-02-27 21:03:59 +01:00
parent efa9b17c2f
commit 3a5804e44c
6 changed files with 541 additions and 213 deletions

View File

@@ -0,0 +1,160 @@
<?php
declare(strict_types=1);
namespace App\Intent;
/**
 * SalesIntentLite
 *
 * Deterministic sales-intent detection.
 * No LLM, no ML — purely rule-based keyword classification.
 *
 * IMPORTANT:
 * - Always call with the ORIGINAL prompt.
 * - Do not call it with the cleaned query.
 */
final class SalesIntentLite
{
    public const DISCOVERY = 'discovery';
    public const PRICING = 'pricing';
    public const COMPARISON = 'comparison';
    public const OBJECTION = 'objection';
    public const IMPLEMENTATION = 'implementation';
    public const ROI = 'roi';

    /** Points added to an intent for every distinct keyword found in the prompt. */
    private const KEYWORD_WEIGHT = 2;

    /**
     * Keywords per intent.
     *
     * Every keyword must be written in normalized form (lower case, umlauts
     * transliterated: ae/oe/ue/ss), because it is matched against the output of
     * normalize(). The declaration order of the intents is the tie-break order:
     * on equal scores the intent listed first wins.
     */
    private const KEYWORDS = [
        self::PRICING => [
            'preis', 'preise', 'kosten', 'lizenz', 'lizenzmodell',
            'paket', 'pakete', 'tarif', 'tarife',
            'gebuehr', 'monatlich', 'jaehrlich',
            'abo', 'subscription',
        ],
        self::COMPARISON => [
            'vergleich', 'vs', 'oder',
            'alternative', 'alternativen',
            'unterschied', 'unterschiede',
            'besser',
        ],
        self::OBJECTION => [
            'problem', 'risiko', 'nachteil', 'datenschutz',
            'dsgvo', 'sicherheit', 'compliance',
            'kritik', 'zweifel', 'unsicher',
        ],
        self::IMPLEMENTATION => [
            'implementierung', 'einfuehrung',
            'integration', 'aufwand', 'setup',
            'rollout', 'migration', 'installation',
            'technisch', 'api', 'schnittstelle',
        ],
        self::ROI => [
            'roi', 'rentabilitaet',
            'business case', 'nutzen',
            'effizienz', 'einsparung', 'umsatz',
            'wert', 'vorteil',
        ],
    ];

    /**
     * Classifies a prompt into exactly one sales intent.
     *
     * Each intent scores KEYWORD_WEIGHT points per distinct keyword that occurs
     * as a whole word in the normalized prompt. The highest-scoring intent wins;
     * when no keyword matches at all, DISCOVERY with score 0 is returned.
     *
     * @param string $originalPrompt The raw, uncleaned user prompt.
     *
     * @return array{intent: string, score: int}
     */
    public function detect(string $originalPrompt): array
    {
        $normalized = $this->normalize($originalPrompt);

        $topIntent = self::DISCOVERY;
        $topScore = 0;

        foreach (self::KEYWORDS as $intent => $keywords) {
            $score = 0;

            foreach ($keywords as $keyword) {
                $pattern = '/\b' . preg_quote($keyword, '/') . '\b/u';

                // preg_match() yields false on malformed UTF-8; treat that as "no match".
                if (preg_match($pattern, $normalized) === 1) {
                    $score += self::KEYWORD_WEIGHT;
                }
            }

            // Strictly greater: on a tie the earlier-declared intent is kept.
            if ($score > $topScore) {
                $topIntent = $intent;
                $topScore = $score;
            }
        }

        return [
            'intent' => $topIntent,
            'score' => $topScore,
        ];
    }

    /**
     * Lower-cases the prompt and transliterates ALL German umlauts and sharp s.
     *
     * The text is replaced in place rather than appended as a second copy, so a
     * keyword scores the same whether the user typed "gebühr" or "gebuehr".
     */
    private function normalize(string $s): string
    {
        return strtr(
            mb_strtolower($s, 'UTF-8'),
            [
                'ä' => 'ae',
                'ö' => 'oe',
                'ü' => 'ue',
                'ß' => 'ss',
            ]
        );
    }
}

View File

@@ -41,6 +41,7 @@ final readonly class KnowledgeIngestService
foreach ($chunks as $chunkText) {
if ($title !== '' && !str_starts_with($chunkText, $title)) {
//title with backticks
$chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
}

View File

@@ -6,7 +6,7 @@ namespace App\Knowledge\Retrieval;
use App\Entity\ModelGenerationConfig;
use App\Intent\IntentLite;
use App\Knowledge\ChunkManager;
use App\Intent\SalesIntentLite;
use App\Knowledge\QueryCleaner;
use App\Repository\ModelGenerationConfigRepository;
use App\Tag\TagRoutingService;
@@ -21,11 +21,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
private const LIST_BONUS = 1.5;
/**
* Tags must only provide a small bonus (never act as a gate/filter).
* Enterprise default: keep it low, otherwise tags will dominate ranking again.
*/
private const TAG_SCORE_BONUS = 0.1 * (1 - self::VECTOR_SCORE_THRESHOLD);
private const MAX_CHUNKS_PER_DOC = 2;
private const MIN_CHUNK_DISTANCE = 2;
private const RRF_K = 60;
public function __construct(
private readonly NdjsonChunkLookup $lookup,
@@ -33,7 +31,8 @@ final class NdjsonHybridRetriever implements RetrieverInterface
private readonly TagRoutingService $tagRouting,
private readonly ModelGenerationConfigRepository $configRepository,
private readonly QueryCleaner $queryCleaner,
private readonly IntentLite $intentLite
private readonly IntentLite $intentLite,
private readonly SalesIntentLite $salesIntentLite
)
{
}
@@ -49,27 +48,63 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return $this->retrieveInternal($prompt, $config);
}
public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
{
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
// Important: list-intent detection must run on the original prompt
// (cleaning might remove "show/list" etc.).
$isListQuery = $this->intentLite->isListQuery($prompt);
$salesIntent = $this->salesIntentLite->detect($prompt)['intent'];
// -------------------------------------------------
// CLEAN QUERY (retrieval-only: tag routing + vector search)
// -------------------------------------------------
$cleanQuery = $this->queryCleaner->clean($prompt);
if ($cleanQuery === '') {
$cleanQuery = $prompt;
}
// -------------------------------------------------
// 1) Tag routing (cleaned query) -> bonus only
// Intent-based adjustments
// -------------------------------------------------
$threshold = self::VECTOR_SCORE_THRESHOLD;
$topK = $vectorTopKBase;
switch ($salesIntent) {
case SalesIntentLite::PRICING:
$threshold += 0.02; // more precision
break;
case SalesIntentLite::COMPARISON:
$topK = (int)round($vectorTopKBase * 1.4);
break;
case SalesIntentLite::OBJECTION:
$threshold -= 0.02;
break;
case SalesIntentLite::IMPLEMENTATION:
$topK = (int)round($vectorTopKBase * 1.3);
break;
case SalesIntentLite::ROI:
$topK = (int)round($vectorTopKBase * 1.2);
break;
case SalesIntentLite::DISCOVERY:
default:
$threshold -= 0.03;
break;
}
if ($isListQuery) {
$topK = (int)round($topK * self::LIST_BONUS);
}
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
// -------------------------------------------------
// Tag routing
// -------------------------------------------------
$candidateDocIds = $this->tagRouting->route($cleanQuery);
$candidateSet = null;
@@ -78,215 +113,144 @@ final class NdjsonHybridRetriever implements RetrieverInterface
}
// -------------------------------------------------
// 2) Determine TopK
// Dual search
// -------------------------------------------------
$topK = $vectorTopKBase;
// List mode: increase coverage to rank more documents
if ($isListQuery) {
$topK = (int)round($vectorTopKBase * self::LIST_BONUS);
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
$scopedHits = [];
if ($candidateSet !== null) {
$scopedHits = $this->vectorClient->searchScoped(
$cleanQuery,
$topK,
array_keys($candidateSet)
);
}
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
// -------------------------------------------------
// 3) Vector search (always GLOBAL; tags are NOT a filter)
// -------------------------------------------------
$hits = $this->vectorClient->search($cleanQuery, $topK);
if ($hits === []) {
// Tags must NOT act as a fallback (otherwise they become too powerful again).
if ($globalHits === [] && $scopedHits === []) {
return [];
}
// -------------------------------------------------
// 4) Collect chunkIds + scores (raw)
// RRF Fusion
// -------------------------------------------------
/** @var array<string,float> $rawScoreByChunkId */
$rawScoreByChunkId = [];
$rrfScores = [];
$this->applyRrf($globalHits, $rrfScores, $threshold);
$this->applyRrf($scopedHits, $rrfScores, $threshold, $salesIntent === SalesIntentLite::OBJECTION);
if ($rrfScores === []) {
return [];
}
arsort($rrfScores);
$rankedChunkIds = array_keys($rrfScores);
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
if ($rows === []) {
return [];
}
if (!$isListQuery) {
return $this->collectSalesOptimized(
$rankedChunkIds,
$rows,
$limit
);
}
return $this->collectTexts($rankedChunkIds, $rows, $limit);
}
private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void
{
$rank = 0;
foreach ($hits as $hit) {
if (!isset($hit['chunk_id'], $hit['score'])) {
continue;
}
$raw = (float)$hit['score'];
// Apply the threshold to the RAW score (quality gate)
if ($raw < self::VECTOR_SCORE_THRESHOLD) {
if ($raw < $threshold) {
continue;
}
$chunkId = (string)$hit['chunk_id'];
// If a chunk appears multiple times, keep the best raw score
if (!isset($rawScoreByChunkId[$chunkId]) || $raw > $rawScoreByChunkId[$chunkId]) {
$rawScoreByChunkId[$chunkId] = $raw;
}
}
$rank++;
$rrf = 1 / (self::RRF_K + $rank);
if ($rawScoreByChunkId === []) {
return [];
}
// Lookup returns document_id + text etc.
$rows = $this->lookup->findByChunkIds(array_keys($rawScoreByChunkId));
// -------------------------------------------------
// 5) Adjusted score (tag bonus) + ranking
// -------------------------------------------------
/** @var array<string,float> $adjScoreByChunkId */
$adjScoreByChunkId = [];
foreach ($rawScoreByChunkId as $chunkId => $rawScore) {
if (!isset($rows[$chunkId])) {
continue;
if ($boost) {
$rrf *= 1.2; // scoped boost for objections
}
$adj = $rawScore;
if ($candidateSet !== null) {
$docId = $rows[$chunkId]['document_id'] ?? null;
if (is_string($docId) && isset($candidateSet[$docId])) {
$adj += self::TAG_SCORE_BONUS;
}
if (!isset($rrfScores[$chunkId])) {
$rrfScores[$chunkId] = 0.0;
}
$adjScoreByChunkId[$chunkId] = $adj;
$rrfScores[$chunkId] += $rrf;
}
if ($adjScoreByChunkId === []) {
return [];
}
// Sort: adjusted desc, deterministic tie-break by chunkId
uksort($adjScoreByChunkId, static function (string $a, string $b) use ($adjScoreByChunkId): int {
$sa = $adjScoreByChunkId[$a];
$sb = $adjScoreByChunkId[$b];
if ($sa === $sb) {
return $a <=> $b;
}
return ($sb <=> $sa);
});
$rankedChunkIds = array_keys($adjScoreByChunkId);
// -------------------------------------------------
// 6) List mode -> document ranking (with tag bonus in scores)
// -------------------------------------------------
if ($isListQuery) {
$rankedDocIds = $this->rankDocumentsFromAdjustedScores($adjScoreByChunkId, $rows);
if ($rankedDocIds === []) {
return [];
}
$topDocIds = array_slice($rankedDocIds, 0, $limit);
return $this->collectBestChunkPerDocumentAdjusted($topDocIds, $adjScoreByChunkId, $rows);
}
// -------------------------------------------------
// 7) Normal chunk mode (by adjusted ranking)
// -------------------------------------------------
return $this->collectTexts($rankedChunkIds, $rows, $limit);
}
// =========================================================
// LIST QUERY DETECTION
// =========================================================
// =========================================================
// DOCUMENT RANKING (Adjusted scores incl. tag bonus)
// =========================================================
/**
* @param array<string,float> $adjScoreByChunkId
* @param array<string,array<string,mixed>> $rows
* @return string[]
*/
private function rankDocumentsFromAdjustedScores(array $adjScoreByChunkId, array $rows): array
private function collectSalesOptimized(array $chunkIds, array $rows, int $limit): array
{
$documentScores = [];
$out = [];
$docCounter = [];
$docChunkPositions = [];
foreach ($adjScoreByChunkId as $chunkId => $score) {
if (!isset($rows[$chunkId])) {
foreach ($chunkIds as $chunkId) {
if (!isset($rows[$chunkId]['text'])) {
continue;
}
$docId = $rows[$chunkId]['document_id'] ?? null;
if (!is_string($docId) || $docId === '') {
$chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
if (!is_string($docId)) {
continue;
}
$documentScores[$docId][] = (float)$score;
}
if ($documentScores === []) {
return [];
}
$ranked = [];
foreach ($documentScores as $docId => $scores) {
rsort($scores);
$topScores = array_slice($scores, 0, 3);
$ranked[$docId] = array_sum($topScores) / count($topScores);
}
arsort($ranked);
return array_keys($ranked);
}
/**
* @param string[] $docIds
* @param array<string,float> $adjScoreByChunkId
* @param array<string,array<string,mixed>> $rows
* @return string[]
*/
private function collectBestChunkPerDocumentAdjusted(array $docIds, array $adjScoreByChunkId, array $rows): array
{
$result = [];
foreach ($docIds as $docId) {
$bestScore = -INF;
$bestText = null;
foreach ($adjScoreByChunkId as $chunkId => $score) {
if (!isset($rows[$chunkId])) {
continue;
}
if (($rows[$chunkId]['document_id'] ?? null) !== $docId) {
continue;
}
if ((float)$score > $bestScore) {
$bestScore = (float)$score;
$bestText = $rows[$chunkId]['text'] ?? null;
}
if (($docCounter[$docId] ?? 0) >= self::MAX_CHUNKS_PER_DOC) {
continue;
}
if (is_string($bestText) && $bestText !== '') {
$result[] = trim($bestText);
if (is_int($chunkIndex)) {
$prev = $docChunkPositions[$docId] ?? [];
foreach ($prev as $prevIdx) {
if (abs($prevIdx - $chunkIndex) < self::MIN_CHUNK_DISTANCE) {
continue 2;
}
}
$docChunkPositions[$docId][] = $chunkIndex;
}
$text = trim((string)$rows[$chunkId]['text']);
if ($text === '') {
continue;
}
$out[] = $text;
$docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
if (count($out) >= $limit) {
break;
}
}
return $result;
return $out;
}
// =========================================================
// NORMAL MODE
// =========================================================
private function collectTexts(array $chunkIds, array $rows, int $limit): array
{
$seen = [];
$out = [];
foreach ($chunkIds as $id) {
if (!isset($rows[$id]['text'])) {
continue;
}

View File

@@ -9,7 +9,16 @@ use Symfony\Contracts\HttpClient\HttpClientInterface;
final readonly class TagVectorSearchClient
{
private const MIN_SCORE = 0.4; // 🔥 Tag Confidence Gate
/**
* Minimum similarity score required for a tag to be considered.
* Acts as a confidence gate to avoid noisy routing.
*/
private const MIN_SCORE = 0.4;
/**
* Hard limit to prevent excessive requests.
*/
private const MAX_LIMIT = 50;
public function __construct(
private HttpClientInterface $http,
@@ -18,11 +27,18 @@ final readonly class TagVectorSearchClient
) {}
/**
* Executes a vector search against the Python tag index.
*
* @return array<int, array{tag_id:string, score:float}>
*/
public function search(string $query, int $limit = 8): array
{
$limit = max(1, min($limit, 50));
$query = trim($query);
if ($query === '') {
return [];
}
$limit = max(1, min($limit, self::MAX_LIMIT));
try {
$response = $this->http->request(
@@ -38,7 +54,10 @@ final readonly class TagVectorSearchClient
);
if ($response->getStatusCode() !== 200) {
$this->agentLogger->warning('Tag vector service returned non-200');
$this->agentLogger->warning(
'Tag vector service returned non-200',
['status' => $response->getStatusCode()]
);
return [];
}
@@ -46,12 +65,14 @@ final readonly class TagVectorSearchClient
} catch (\Throwable $e) {
$this->agentLogger->warning(
'Tag vector service unreachable: ' . $e->getMessage()
'Tag vector service unreachable',
['error' => $e->getMessage()]
);
return [];
}
if (!is_array($data)) {
$this->agentLogger->warning('Tag vector service returned invalid payload');
return [];
}

View File

@@ -9,7 +9,16 @@ use Symfony\Contracts\HttpClient\HttpClientInterface;
final class VectorSearchClient
{
private const MIN_SCORE = 0.30; // 🔥 weicher als Tag-Gate
/**
* Soft minimum similarity threshold.
* Lower than tag gate to allow broader recall.
*/
private const MIN_SCORE = 0.30;
/**
* Hard limit clamp to avoid abusive queries.
*/
private const MAX_LIMIT = 200;
private HttpClientInterface $http;
private string $serviceUrl;
@@ -26,18 +35,34 @@ final class VectorSearchClient
}
/**
* Standard global search
* Standard global search.
*
* @return array<int, array{
* chunk_id:string,
* score:float,
* document_id:?string,
* chunk_index:?int
* }>
*/
public function search(string $query, int $limit = 5): array
{
return $this->executeSearch([
'query' => $query,
'limit' => $limit,
'query' => trim($query),
'limit' => $this->clampLimit($limit),
]);
}
/**
* Scoped search: nur innerhalb bestimmter Dokumente
* Scoped search: only inside specific documents.
*
* @param array<int,string> $docIds
*
* @return array<int, array{
* chunk_id:string,
* score:float,
* document_id:?string,
* chunk_index:?int
* }>
*/
public function searchScoped(
string $query,
@@ -49,14 +74,23 @@ final class VectorSearchClient
}
return $this->executeSearch([
'query' => $query,
'limit' => $limit,
'query' => trim($query),
'limit' => $this->clampLimit($limit),
'doc_ids' => array_values($docIds),
]);
}
/**
* Gemeinsame HTTP-Logik (keine Duplikation)
* Shared HTTP logic.
*
* @param array<string,mixed> $payload
*
* @return array<int, array{
* chunk_id:string,
* score:float,
* document_id:?string,
* chunk_index:?int
* }>
*/
private function executeSearch(array $payload): array
{
@@ -71,7 +105,10 @@ final class VectorSearchClient
);
if ($response->getStatusCode() !== 200) {
$this->agentLogger->error('Vector service returned non-200 (chunks)');
$this->agentLogger->error(
'Vector service returned non-200 (chunks)',
['status' => $response->getStatusCode()]
);
return [];
}
@@ -79,12 +116,14 @@ final class VectorSearchClient
} catch (\Throwable $e) {
$this->agentLogger->error(
'Vector service unreachable (chunks): ' . $e->getMessage()
'Vector service unreachable (chunks)',
['error' => $e->getMessage()]
);
return [];
}
if (!is_array($data)) {
$this->agentLogger->warning('Vector service returned invalid payload (chunks)');
return [];
}
@@ -109,12 +148,41 @@ final class VectorSearchClient
continue;
}
$documentId = null;
if (isset($row['document_id']) && is_string($row['document_id']) && $row['document_id'] !== '') {
$documentId = $row['document_id'];
}
$chunkIndex = null;
if (isset($row['chunk_index'])) {
if (is_int($row['chunk_index'])) {
$chunkIndex = $row['chunk_index'];
} elseif (is_string($row['chunk_index']) && ctype_digit($row['chunk_index'])) {
$chunkIndex = (int)$row['chunk_index'];
}
}
$filtered[] = [
'chunk_id' => $chunkId,
'score' => $score,
'chunk_id' => $chunkId,
'score' => $score,
'document_id' => $documentId,
'chunk_index' => $chunkIndex,
];
}
return $filtered;
}
/**
 * Restricts a requested result limit to the inclusive range 1..MAX_LIMIT.
 *
 * @param int $limit Requested number of hits.
 *
 * @return int The limit, raised to 1 or lowered to MAX_LIMIT when out of range.
 */
private function clampLimit(int $limit): int
{
    $upperBounded = min($limit, self::MAX_LIMIT);

    return max(1, $upperBounded);
}
}