add catalog mode
This commit is contained in:
118
src/Catalog/EntityCatalogService.php
Normal file
118
src/Catalog/EntityCatalogService.php
Normal file
@@ -0,0 +1,118 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Catalog;
|
||||
|
||||
use App\Tag\TagVectorSearchClient;
|
||||
use Doctrine\DBAL\Connection;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
/**
 * EntityCatalogService
 *
 * Builds deterministic catalog lists for an entity term:
 * - tag vector search (score gate + ambiguity check)
 * - DB query on document_tag + document (status ACTIVE)
 * - returns ONE text block (string), or null so the caller can fall back
 *   to the normal retrieval
 */
final class EntityCatalogService
{
    /** Minimum score the best tag hit must reach before it is trusted. */
    private const MIN_SCORE = 0.55;

    /** Minimum score gap between the two best hits; anything closer counts as ambiguous. */
    private const AMBIGUITY_DELTA = 0.05;

    public function __construct(
        private readonly TagVectorSearchClient $tagVectorClient,
        private readonly Connection $connection,
    ) {}

    /**
     * Resolves an entity term to a tag and lists the titles of all ACTIVE documents carrying it.
     *
     * Every "not sure" situation (empty term, no hit, weak or ambiguous hit, missing or
     * unparsable tag id, no documents) yields null instead of an exception, so the caller
     * can fall back to its regular retrieval path.
     *
     * @return string|null text block, or null when no reliable catalog can be built
     */
    public function listByTerm(string $entityTerm): ?string
    {
        $entityTerm = trim($entityTerm);
        if ($entityTerm === '') {
            return null;
        }

        // 1) Tag vector search (top 3, so the ambiguity check has a runner-up to look at)
        $hits = $this->tagVectorClient->search($entityTerm, 3);
        if ($hits === []) {
            return null;
        }

        $best = $hits[0];
        $bestScore = (float) ($best['score'] ?? 0.0);
        if ($bestScore < self::MIN_SCORE) {
            return null;
        }

        // 2) Ambiguity: if the second hit scores too close to the best one, bail out conservatively
        if (isset($hits[1])) {
            $secondScore = (float) ($hits[1]['score'] ?? 0.0);
            if (abs($bestScore - $secondScore) < self::AMBIGUITY_DELTA) {
                return null;
            }
        }

        $tagId = (string) ($best['tag_id'] ?? '');
        if ($tagId === '') {
            return null;
        }

        // An id that cannot be parsed must not abort the request; treat it like "no catalog".
        $binaryTagId = $this->tagIdToBinary($tagId);
        if ($binaryTagId === null) {
            return null;
        }

        // 3) DB query: all ACTIVE documents linked to this tag
        $rows = $this->connection->fetchAllAssociative(
            '
                SELECT d.title
                FROM document d
                INNER JOIN document_tag dt ON dt.document_id = d.id
                WHERE dt.tag_id = :tagId
                  AND d.status = :status
                ORDER BY d.title ASC
            ',
            [
                'tagId' => $binaryTagId,
                'status' => 'ACTIVE',
            ]
        );

        // Keep only non-blank titles.
        $titles = [];
        foreach ($rows as $row) {
            $title = trim((string) ($row['title'] ?? ''));
            if ($title !== '') {
                $titles[] = $title;
            }
        }

        if ($titles === []) {
            return null;
        }

        return $this->buildTextBlock($entityTerm, $titles);
    }

    /**
     * Converts the tag id delivered by the vector search into the 16-byte form used in the query.
     *
     * A plain 32-digit hex string is decoded directly. Everything else is handed to
     * Uuid::fromString(); if that rejects the value, null is returned.
     *
     * NOTE(review): the exact id format returned by the tag vector service is not visible
     * here - confirm which of the two paths is the regular one.
     *
     * @return string|null 16-byte binary id, or null when the value cannot be parsed
     */
    private function tagIdToBinary(string $tagId): ?string
    {
        if (strlen($tagId) === 32 && ctype_xdigit($tagId)) {
            $binary = hex2bin($tagId);

            return $binary === false ? null : $binary;
        }

        try {
            return Uuid::fromString($tagId)->toBinary();
        } catch (\InvalidArgumentException) {
            return null;
        }
    }

    /**
     * Renders the headline for the entity term followed by one "- title" line per entry.
     *
     * The term is lower-cased before the lookup so that "Geräte" and "geräte" get the same
     * headline; unknown terms get the generic headline.
     *
     * @param list<string> $titles
     */
    private function buildTextBlock(string $entityTerm, array $titles): string
    {
        $headline = match (mb_strtolower($entityTerm)) {
            'geräte' => 'Folgende Geräte sind verfügbar:',
            'indikatoren' => 'Folgende Indikatoren sind verfügbar:',
            'funktionen' => 'Folgende Funktionen sind verfügbar:',
            'zubehör' => 'Folgendes Zubehör ist verfügbar:',
            default => 'Folgende Einträge sind verfügbar:',
        };

        $lines = array_map(
            static fn (string $title): string => '- ' . $title,
            $titles,
        );

        return $headline . "\n\n" . implode("\n", $lines);
    }
}
|
||||
138
src/Intent/CatalogIntentLite.php
Normal file
138
src/Intent/CatalogIntentLite.php
Normal file
@@ -0,0 +1,138 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Intent;
|
||||
|
||||
/**
 * CatalogIntentLite
 *
 * Minimal, deterministic detection of catalog / entity-list requests.
 *
 * Target prompts:
 * - "Liste aller Geräte" / "Welche Indikatoren gibt es?" / "Zeig mir alle Funktionen"
 *
 * Guardrails:
 * - No catalog mode for sales / pricing / comparison / ROI / implementation / objection intents.
 * - No catalog mode without an explicit entity term.
 *
 * IMPORTANT:
 * - Always call with the ORIGINAL prompt.
 * - No LLM, no ML.
 */
final class CatalogIntentLite
{
    /**
     * List signals (lightweight) - IntentLite stays responsible for "general" list detection.
     *
     * These are substring matches, so stems such as "aufzaehl" also cover inflected forms.
     * NOTE(review): for the same reason "alle" also hits e.g. "parallel" and "liste" hits
     * "spezialisten"; the entity-term check in detect() is what keeps this from firing alone.
     */
    private const LIST_SIGNALS = [
        'liste',
        'auflisten',
        'aufzaehl',
        'aufzähl',
        'übersicht',
        'uebersicht',
        'welche gibt es',
        'welche sind',
        'zeig mir alle',
        'zeige mir alle',
        'alle',
    ];

    /**
     * Regex list signals for phrasings where the entity term sits inside the signal,
     * e.g. "welche indikatoren gibt es" (both parts within one sentence).
     */
    private const LIST_PATTERNS = [
        '/\bwelche\b[^.?!]*\bgibt es\b/u',
    ];

    /**
     * Entity terms supported as catalog types.
     *
     * Key: canonical term (used for the tag search)
     * Value: synonyms that may appear in the prompt.
     */
    private const ENTITY_TERMS = [
        'geräte' => ['gerät', 'geräte', 'geraet', 'geraete', 'device', 'devices'],
        'indikatoren' => ['indikator', 'indikatoren', 'indicator', 'indicators'],
        'funktionen' => ['funktion', 'funktionen', 'feature', 'features', 'funktionalität', 'funktionalitaet'],
        'zubehör' => ['zubehör', 'zubehoer', 'accessory', 'accessories', 'zubehor'],
    ];

    /** ASCII spellings appended by normalize() for German umlauts and sharp s. */
    private const TRANSLITERATIONS = [
        'ä' => 'ae',
        'ö' => 'oe',
        'ü' => 'ue',
        'ß' => 'ss',
    ];

    public function __construct(
        private readonly SalesIntentLite $salesIntentLite,
    ) {}

    /**
     * Decides whether the prompt asks for a catalog list and, if so, for which entity type.
     *
     * @return string|null canonical entity term (e.g. "geräte"), or null when there is no catalog intent
     */
    public function detect(string $originalPrompt): ?string
    {
        $p = $this->normalize($originalPrompt);

        // 1) A list signal is mandatory
        if (!$this->hasListSignal($p)) {
            return null;
        }

        // 2) Guardrail: no catalog mode for sales intents
        $sales = $this->salesIntentLite->detect($originalPrompt);
        $intent = (string) ($sales['intent'] ?? SalesIntentLite::DISCOVERY);
        if ($intent !== SalesIntentLite::DISCOVERY) {
            return null;
        }

        // 3) Extract an explicit entity term (without one there is no catalog)
        foreach (self::ENTITY_TERMS as $canonical => $synonyms) {
            foreach ($synonyms as $synonym) {
                if ($this->containsWord($p, $synonym)) {
                    return $canonical;
                }
            }
        }

        return null;
    }

    // ------------------------------------------------------------
    // Helpers
    // ------------------------------------------------------------

    /**
     * True when the normalized prompt contains a LIST_SIGNALS substring or matches a LIST_PATTERNS regex.
     */
    private function hasListSignal(string $normalizedPrompt): bool
    {
        if ($this->containsAny($normalizedPrompt, self::LIST_SIGNALS)) {
            return true;
        }

        foreach (self::LIST_PATTERNS as $pattern) {
            if (preg_match($pattern, $normalizedPrompt) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * True when at least one non-empty needle occurs as a substring of the haystack.
     *
     * @param array<int, string> $needles
     */
    private function containsAny(string $haystack, array $needles): bool
    {
        foreach ($needles as $needle) {
            if ($needle !== '' && str_contains($haystack, $needle)) {
                return true;
            }
        }

        return false;
    }

    /**
     * True when $word occurs in $haystack delimited by word boundaries on both sides.
     */
    private function containsWord(string $haystack, string $word): bool
    {
        $word = trim($word);
        if ($word === '') {
            return false;
        }

        return preg_match('/\b' . preg_quote($word, '/') . '\b/u', $haystack) === 1;
    }

    /**
     * Lower-cases the prompt and, when it contains umlauts or sharp s, appends a fully
     * transliterated copy separated by a space.
     *
     * Both spellings stay searchable. All characters of TRANSLITERATIONS are replaced in the
     * appended copy, not just the first kind that happens to occur.
     */
    private function normalize(string $s): string
    {
        $lower = mb_strtolower($s);
        $ascii = strtr($lower, self::TRANSLITERATIONS);

        return $ascii === $lower ? $lower : $lower . ' ' . $ascii;
    }
}
|
||||
@@ -4,7 +4,9 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Catalog\EntityCatalogService;
|
||||
use App\Entity\ModelGenerationConfig;
|
||||
use App\Intent\CatalogIntentLite;
|
||||
use App\Intent\IntentLite;
|
||||
use App\Intent\SalesIntentLite;
|
||||
use App\Knowledge\QueryCleaner;
|
||||
@@ -32,7 +34,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
private readonly ModelGenerationConfigRepository $configRepository,
|
||||
private readonly QueryCleaner $queryCleaner,
|
||||
private readonly IntentLite $intentLite,
|
||||
private readonly SalesIntentLite $salesIntentLite
|
||||
private readonly SalesIntentLite $salesIntentLite,
|
||||
private readonly CatalogIntentLite $catalogIntent,
|
||||
private readonly EntityCatalogService $entityCatalogService
|
||||
)
|
||||
{
|
||||
}
|
||||
@@ -54,6 +58,17 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
|
||||
{
|
||||
// 🔵 ENTITY CATALOG EARLY EXIT (jetzt auch im Admin-Test aktiv)
|
||||
$entityTerm = $this->catalogIntent->detect($prompt);
|
||||
|
||||
if ($entityTerm !== null) {
|
||||
$catalogBlock = $this->entityCatalogService->listByTerm($entityTerm);
|
||||
|
||||
if ($catalogBlock !== null) {
|
||||
return [$catalogBlock];
|
||||
}
|
||||
}
|
||||
|
||||
$core = $this->runCore($prompt, $config, false);
|
||||
|
||||
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
|
||||
@@ -111,8 +126,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return [];
|
||||
}
|
||||
|
||||
// 1) Production-like selection: wir selektieren Texte,
|
||||
// aber in Debug brauchen wir die ChunkIds dazu.
|
||||
$selectedChunkIds = $core['is_list_query']
|
||||
? $this->selectChunkIdsListMode($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
|
||||
: $this->selectChunkIdsSalesMode($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
|
||||
@@ -121,7 +134,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return [];
|
||||
}
|
||||
|
||||
// 2) Ausgabe inklusive Scores
|
||||
$out = [];
|
||||
$rank = 0;
|
||||
|
||||
@@ -179,7 +191,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$cleanQuery = $prompt;
|
||||
}
|
||||
|
||||
// Intent-based adjustments (identisch zur Produktionslogik)
|
||||
$threshold = self::VECTOR_SCORE_THRESHOLD;
|
||||
$topK = $vectorTopKBase;
|
||||
|
||||
@@ -216,7 +227,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$topK = max(1, min($topK, self::HARD_MAX_VECTORK));
|
||||
|
||||
// Tag routing (identisch)
|
||||
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
||||
$candidateSet = null;
|
||||
|
||||
@@ -224,7 +234,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$candidateSet = array_fill_keys($candidateDocIds, true);
|
||||
}
|
||||
|
||||
// Dual search (identisch)
|
||||
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
|
||||
|
||||
$scopedHits = [];
|
||||
@@ -249,7 +258,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$rrfScores = [];
|
||||
$rawScores = [];
|
||||
|
||||
// RRF (identisch) + optional raw capture
|
||||
$this->applyRrfWithOptionalRaw($globalHits, $rrfScores, $rawScores, $threshold, false, $withScores);
|
||||
$this->applyRrfWithOptionalRaw(
|
||||
$scopedHits,
|
||||
@@ -292,13 +300,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* Gleiche Logik wie applyRrf(), aber optional mit raw-score capture.
|
||||
*
|
||||
* @param array<int, array{chunk_id:string, score:float}> $hits
|
||||
* @param array<string,float> $rrfScores
|
||||
* @param array<string,float> $rawScores
|
||||
*/
|
||||
private function applyRrfWithOptionalRaw(
|
||||
array $hits,
|
||||
array &$rrfScores,
|
||||
@@ -322,7 +323,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$chunkId = (string)$hit['chunk_id'];
|
||||
|
||||
if ($captureRaw) {
|
||||
// wenn global+scoped vorkommt: bestes raw behalten
|
||||
if (!isset($rawScores[$chunkId]) || $raw > $rawScores[$chunkId]) {
|
||||
$rawScores[$chunkId] = $raw;
|
||||
}
|
||||
@@ -343,15 +343,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// DEBUG SELECTION HELPERS (identisch zu Produktionsregeln)
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* List-Mode nutzt exakt collectTexts() Regeln, aber gibt ChunkIds zurück.
|
||||
*
|
||||
* @return string[]
|
||||
*/
|
||||
private function selectChunkIdsListMode(array $chunkIds, array $rows, int $limit): array
|
||||
{
|
||||
$seen = [];
|
||||
@@ -384,11 +375,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normal-Mode nutzt exakt collectSalesOptimized() Regeln, aber gibt ChunkIds zurück.
|
||||
*
|
||||
* @return string[]
|
||||
*/
|
||||
private function selectChunkIdsSalesMode(array $chunkIds, array $rows, int $limit): array
|
||||
{
|
||||
$out = [];
|
||||
@@ -437,10 +423,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $out;
|
||||
}
|
||||
|
||||
// =========================================================
|
||||
// ORIGINAL METHODS (UNVERÄNDERT)
|
||||
// =========================================================
|
||||
|
||||
private function applyRrf(array $hits, array &$rrfScores, float $threshold, bool $boost = false): void
|
||||
{
|
||||
$rank = 0;
|
||||
|
||||
@@ -58,6 +58,7 @@ final readonly class TagVectorSearchClient
|
||||
'Tag vector service returned non-200',
|
||||
['status' => $response->getStatusCode()]
|
||||
);
|
||||
|
||||
return [];
|
||||
}
|
||||
|
||||
@@ -68,6 +69,7 @@ final readonly class TagVectorSearchClient
|
||||
'Tag vector service unreachable',
|
||||
['error' => $e->getMessage()]
|
||||
);
|
||||
|
||||
return [];
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user