add catalog mode

This commit is contained in:
team2
2026-02-28 13:51:54 +01:00
parent 47a3c9cca2
commit d3294464ea
7 changed files with 484 additions and 201 deletions

View File

@@ -0,0 +1,118 @@
<?php
declare(strict_types=1);
namespace App\Catalog;
use App\Tag\TagVectorSearchClient;
use Doctrine\DBAL\Connection;
use Symfony\Component\Uid\Uuid;
/**
 * EntityCatalogService
 *
 * Deterministic catalog listings based on an entity term:
 * - tag vector search (score gate + ambiguity check)
 * - DB query on document_tag + document (status ACTIVE)
 * - returns ONE text block (string), or null so the caller can fall back to normal retrieval
 */
final class EntityCatalogService
{
    /** Minimum score the best tag hit must reach before a catalog is produced. */
    private const MIN_SCORE = 0.55;

    /** If best and runner-up scores differ by less than this, the match counts as ambiguous. */
    private const AMBIGUITY_DELTA = 0.05;

    /** Number of hits requested from the tag search; the runner-up is needed for the ambiguity check. */
    private const SEARCH_LIMIT = 3;

    public function __construct(
        private readonly TagVectorSearchClient $tagVectorClient,
        private readonly Connection $connection,
    ) {}

    /**
     * Builds the catalog text block for the given entity term.
     *
     * @param string $entityTerm Term used for the tag vector search and for choosing the headline.
     *
     * @return string|null Text block, or null when no safe catalog can be produced
     *                     (empty term, no/weak/ambiguous tag hit, unusable tag id, no titles).
     */
    public function listByTerm(string $entityTerm): ?string
    {
        $entityTerm = trim($entityTerm);
        if ($entityTerm === '') {
            return null;
        }

        // 1) + 2) Tag vector search, score gate and ambiguity check.
        $tagId = $this->selectUnambiguousTagId(
            $this->tagVectorClient->search($entityTerm, self::SEARCH_LIMIT)
        );
        if ($tagId === null) {
            return null;
        }

        // 3) All ACTIVE documents attached to that tag.
        $titles = $this->fetchActiveTitles($tagId);
        if ($titles === []) {
            return null;
        }

        return $this->buildTextBlock($entityTerm, $titles);
    }

    /**
     * Picks the tag id of the best hit, or null if the hit is too weak, ambiguous or has no id.
     *
     * Reads the keys `score` and `tag_id` from each hit; a missing score counts as 0.0.
     * NOTE(review): assumes hits are ordered best-first — confirm against TagVectorSearchClient.
     *
     * @param array<int, mixed> $hits
     */
    private function selectUnambiguousTagId(array $hits): ?string
    {
        $best = $hits[0] ?? null;
        if ($best === null) {
            return null;
        }

        $bestScore = (float)($best['score'] ?? 0.0);
        if ($bestScore < self::MIN_SCORE) {
            return null;
        }

        // Be conservative: a runner-up that is nearly as good means we cannot be sure which tag is meant.
        if (isset($hits[1])) {
            $secondScore = (float)($hits[1]['score'] ?? 0.0);
            if (abs($bestScore - $secondScore) < self::AMBIGUITY_DELTA) {
                return null;
            }
        }

        $tagId = (string)($best['tag_id'] ?? '');

        return $tagId === '' ? null : $tagId;
    }

    /**
     * Loads the trimmed, non-empty, de-duplicated titles of all ACTIVE documents for a tag.
     *
     * @return list<string> Titles in query order (title ascending); empty if the tag id cannot be
     *                      parsed as a UUID or nothing matches.
     */
    private function fetchActiveTitles(string $tagId): array
    {
        try {
            $binaryTagId = Uuid::fromString($tagId)->toBinary();
        } catch (\InvalidArgumentException) {
            // A malformed id from the vector service must not break retrieval;
            // an empty result makes listByTerm() return null (fallback path).
            return [];
        }

        $rows = $this->connection->fetchAllAssociative(
            '
                SELECT d.title
                FROM document d
                INNER JOIN document_tag dt ON dt.document_id = d.id
                WHERE dt.tag_id = :tagId
                AND d.status = :status
                ORDER BY d.title ASC
            ',
            [
                'tagId' => $binaryTagId,
                'status' => 'ACTIVE',
            ]
        );

        $titles = [];
        foreach ($rows as $row) {
            $title = trim((string)($row['title'] ?? ''));
            if ($title !== '') {
                // Keyed by title so repeated titles appear only once in the listing.
                $titles[$title] = $title;
            }
        }

        return array_values($titles);
    }

    /**
     * Renders the headline for the entity term followed by one "- title" line per entry.
     *
     * @param string[] $titles
     */
    private function buildTextBlock(string $entityTerm, array $titles): string
    {
        // Lower-cased so that e.g. "Geräte" selects the same headline as "geräte".
        $headline = match (mb_strtolower($entityTerm)) {
            'geräte' => 'Folgende Geräte sind verfügbar:',
            'indikatoren' => 'Folgende Indikatoren sind verfügbar:',
            'funktionen' => 'Folgende Funktionen sind verfügbar:',
            'zubehör' => 'Folgendes Zubehör ist verfügbar:',
            default => 'Folgende Einträge sind verfügbar:',
        };

        $lines = array_map(
            static fn ($title): string => '- ' . $title,
            $titles
        );

        return $headline . "\n\n" . implode("\n", $lines);
    }
}

View File

@@ -0,0 +1,138 @@
<?php
declare(strict_types=1);
namespace App\Intent;
/**
 * CatalogIntentLite
 *
 * Minimal, deterministic detection of catalog / entity-list requests.
 *
 * Target prompts:
 * - "Liste aller Geräte" / "Welche Indikatoren gibt es?" / "Zeig mir alle Funktionen"
 *
 * Guardrails:
 * - No catalog mode when SalesIntentLite reports anything other than DISCOVERY.
 * - No catalog mode without an explicit entity term.
 *
 * IMPORTANT:
 * - Always call with the ORIGINAL prompt.
 * - No LLM, no ML.
 */
final class CatalogIntentLite
{
    /**
     * Lightweight list signals, matched as plain substrings of the normalized prompt.
     *
     * NOTE(review): 'alle' is a substring match and therefore also fires on words such as
     * "parallel" or "allein" — confirm whether that breadth is intended.
     */
    private const LIST_SIGNALS = [
        'liste',
        'auflisten',
        'aufzaehl',
        'aufzähl',
        'übersicht',
        'uebersicht',
        'welche gibt es',
        'welche sind',
        'zeig mir alle',
        'zeige mir alle',
        'alle',
    ];

    /**
     * Regex list signals for phrasings where the entity sits between the signal words,
     * e.g. "welche indikatoren gibt es" (cannot be expressed as one contiguous substring).
     */
    private const LIST_PATTERNS = [
        '/\bwelche\b[^.?!]*\b(?:gibt es|sind)\b/u',
    ];

    /**
     * Entity terms supported as catalog types.
     *
     * Key: canonical term (returned by detect()).
     * Value: synonyms that may appear in the prompt.
     */
    private const ENTITY_TERMS = [
        'geräte' => ['gerät', 'geräte', 'geraet', 'geraete', 'device', 'devices'],
        'indikatoren' => ['indikator', 'indikatoren', 'indicator', 'indicators'],
        'funktionen' => ['funktion', 'funktionen', 'feature', 'features', 'funktionalität', 'funktionalitaet'],
        'zubehör' => ['zubehör', 'zubehoer', 'accessory', 'accessories', 'zubehor'],
    ];

    public function __construct(
        private readonly SalesIntentLite $salesIntentLite,
    ) {}

    /**
     * Detects whether the prompt asks for a catalog listing of a supported entity type.
     *
     * @param string $originalPrompt The unmodified user prompt.
     *
     * @return string|null Canonical entity term (e.g. "geräte"), or null if this is not a catalog intent.
     */
    public function detect(string $originalPrompt): ?string
    {
        $normalized = $this->normalize($originalPrompt);

        // 1) The prompt must contain a list signal.
        if (!$this->hasListSignal($normalized)) {
            return null;
        }

        // 2) Guardrail: only the DISCOVERY sales intent may enter catalog mode.
        $sales = $this->salesIntentLite->detect($originalPrompt);
        $intent = (string)($sales['intent'] ?? SalesIntentLite::DISCOVERY);
        if ($intent !== SalesIntentLite::DISCOVERY) {
            return null;
        }

        // 3) An explicit entity term is required; the first matching canonical term wins.
        foreach (self::ENTITY_TERMS as $canonical => $synonyms) {
            foreach ($synonyms as $synonym) {
                if ($this->containsWord($normalized, $synonym)) {
                    return $canonical;
                }
            }
        }

        return null;
    }

    // ------------------------------------------------------------
    // Helpers
    // ------------------------------------------------------------

    /**
     * True if the normalized prompt contains a substring signal or matches a signal pattern.
     */
    private function hasListSignal(string $normalized): bool
    {
        if ($this->containsAny($normalized, self::LIST_SIGNALS)) {
            return true;
        }

        foreach (self::LIST_PATTERNS as $pattern) {
            if (preg_match($pattern, $normalized) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * True if any non-empty needle occurs as a substring of the haystack.
     *
     * @param string[] $needles
     */
    private function containsAny(string $haystack, array $needles): bool
    {
        foreach ($needles as $needle) {
            if ($needle !== '' && str_contains($haystack, $needle)) {
                return true;
            }
        }

        return false;
    }

    /**
     * True if $word occurs in $haystack delimited by word boundaries (UTF-8 aware).
     */
    private function containsWord(string $haystack, string $word): bool
    {
        $word = trim($word);
        if ($word === '') {
            return false;
        }

        return preg_match('/\b' . preg_quote($word, '/') . '\b/u', $haystack) === 1;
    }

    /**
     * Lower-cases the prompt and, if it contains German umlauts or ß, appends a copy in which
     * ALL of them are transliterated (ä→ae, ö→oe, ü→ue, ß→ss), separated by a single space.
     * Both spellings are then searchable in one string.
     */
    private function normalize(string $s): string
    {
        $lower = mb_strtolower($s);

        // One strtr pass replaces every umlaut kind; replacing only the first kind found
        // would leave the appended copy half-transliterated.
        $ascii = strtr($lower, [
            'ä' => 'ae',
            'ö' => 'oe',
            'ü' => 'ue',
            'ß' => 'ss',
        ]);

        return $ascii === $lower ? $lower : $lower . ' ' . $ascii;
    }
}

View File

@@ -4,7 +4,9 @@ declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Catalog\EntityCatalogService;
use App\Entity\ModelGenerationConfig;
use App\Intent\CatalogIntentLite;
use App\Intent\IntentLite;
use App\Intent\SalesIntentLite;
use App\Knowledge\QueryCleaner;
@@ -32,7 +34,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
private readonly ModelGenerationConfigRepository $configRepository,
private readonly QueryCleaner $queryCleaner,
private readonly IntentLite $intentLite,
private readonly SalesIntentLite $salesIntentLite
private readonly SalesIntentLite $salesIntentLite,
private readonly CatalogIntentLite $catalogIntent,
private readonly EntityCatalogService $entityCatalogService
)
{
}
@@ -54,6 +58,17 @@ final class NdjsonHybridRetriever implements RetrieverInterface
public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
{
// 🔵 ENTITY CATALOG EARLY EXIT (jetzt auch im Admin-Test aktiv)
$entityTerm = $this->catalogIntent->detect($prompt);
if ($entityTerm !== null) {
$catalogBlock = $this->entityCatalogService->listByTerm($entityTerm);
if ($catalogBlock !== null) {
return [$catalogBlock];
}
}
$core = $this->runCore($prompt, $config, false);
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
@@ -111,8 +126,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return [];
}
// 1) Production-like selection: wir selektieren Texte,
// aber in Debug brauchen wir die ChunkIds dazu.
$selectedChunkIds = $core['is_list_query']
? $this->selectChunkIdsListMode($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
: $this->selectChunkIdsSalesMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
@@ -121,7 +134,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return [];
}
// 2) Ausgabe inklusive Scores
$out = [];
$rank = 0;
@@ -179,7 +191,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$cleanQuery = $prompt;
}
// Intent-based adjustments (identisch zur Produktionslogik)
$threshold = self::VECTOR_SCORE_THRESHOLD;
$topK = $vectorTopKBase;
@@ -216,7 +227,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
// Tag routing (identisch)
$candidateDocIds = $this->tagRouting->route($cleanQuery);
$candidateSet = null;
@@ -224,7 +234,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$candidateSet = array_fill_keys($candidateDocIds, true);
}
// Dual search (identisch)
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
$scopedHits = [];
@@ -249,7 +258,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$rrfScores = [];
$rawScores = [];
// RRF (identisch) + optional raw capture
$this->applyRrfWithOptionalRaw($globalHits, $rrfScores, $rawScores, $threshold, false, $withScores);
$this->applyRrfWithOptionalRaw(
$scopedHits,
@@ -292,13 +300,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
];
}
/**
* Gleiche Logik wie applyRrf(), aber optional mit raw-score capture.
*
* @param array<int, array{chunk_id:string, score:float}> $hits
* @param array<string,float> $rrfScores
* @param array<string,float> $rawScores
*/
private function applyRrfWithOptionalRaw(
array $hits,
array &$rrfScores,
@@ -322,7 +323,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$chunkId = (string)$hit['chunk_id'];
if ($captureRaw) {
// wenn global+scoped vorkommt: bestes raw behalten
if (!isset($rawScores[$chunkId]) || $raw > $rawScores[$chunkId]) {
$rawScores[$chunkId] = $raw;
}
@@ -343,15 +343,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
}
}
// =========================================================
// DEBUG SELECTION HELPERS (identisch zu Produktionsregeln)
// =========================================================
/**
* List-Mode nutzt exakt collectTexts() Regeln, aber gibt ChunkIds zurück.
*
* @return string[]
*/
private function selectChunkIdsListMode(array $chunkIds, array $rows, int $limit): array
{
$seen = [];
@@ -384,11 +375,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return $out;
}
/**
* Normal-Mode nutzt exakt collectSalesOptimized() Regeln, aber gibt ChunkIds zurück.
*
* @return string[]
*/
private function selectChunkIdsSalesMode(array $chunkIds, array $rows, int $limit): array
{
$out = [];
@@ -437,10 +423,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
return $out;
}
// =========================================================
// ORIGINAL METHODS (UNVERÄNDERT)
// =========================================================
private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void
{
$rank = 0;

View File

@@ -58,6 +58,7 @@ final readonly class TagVectorSearchClient
'Tag vector service returned non-200',
['status' => $response->getStatusCode()]
);
return [];
}
@@ -68,6 +69,7 @@ final readonly class TagVectorSearchClient
'Tag vector service unreachable',
['error' => $e->getMessage()]
);
return [];
}