diff --git a/src/Controller/Admin/ModelGenerationConfigController.php b/src/Controller/Admin/ModelGenerationConfigController.php index a07ff13..0a67a5a 100644 --- a/src/Controller/Admin/ModelGenerationConfigController.php +++ b/src/Controller/Admin/ModelGenerationConfigController.php @@ -98,7 +98,7 @@ class ModelGenerationConfigController extends AbstractController $prompt = trim((string) $request->request->get('prompt')); if ($prompt !== '') { - $results = $retriever->retrieveForConfig($prompt, $config); + $results = $retriever->retrieveInternal($prompt, $config); } } diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index c9ca74d..d2ccbb6 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -14,16 +14,18 @@ use App\Vector\VectorSearchClient; final class NdjsonHybridRetriever implements RetrieverInterface { - private const VECTOR_SCORE_THRESHOLD = 0.75; + private const VECTOR_SCORE_THRESHOLD = 0.72; - private const HARD_MAX_CHUNKS = 200; - private const HARD_MAX_VECTORK = 200; + private const HARD_MAX_CHUNKS = 120; + private const HARD_MAX_VECTORK = 250; + + private const LIST_BONUS = 1.5; /** - * Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter). - * Enterprise Default: klein halten, sonst dominieren Tags wieder. + * Tags must only provide a small bonus (never act as a gate/filter). + * Enterprise default: keep it low, otherwise tags will dominate ranking again. 
*/ - private const TAG_SCORE_BONUS = 0.5; + private const TAG_SCORE_BONUS = 0.1 * (1 - self::VECTOR_SCORE_THRESHOLD); public function __construct( private readonly NdjsonChunkLookup $lookup, @@ -47,21 +49,18 @@ final class NdjsonHybridRetriever implements RetrieverInterface return $this->retrieveInternal($prompt, $config); } - public function retrieveForConfig(string $prompt, ModelGenerationConfig $config): array - { - return $this->retrieveInternal($prompt, $config); - } - private function retrieveInternal(string $prompt, ModelGenerationConfig $config): array + public function retrieveInternal(string $prompt, ModelGenerationConfig $config): array { $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK)); - // Wichtig: List-Detection bleibt auf Originalprompt (sonst entfernst du "zeige/liste" etc.) + // Important: list-intent detection must run on the original prompt + // (cleaning might remove "show/list" etc.). 
$isListQuery = $this->intentLite->isListQuery($prompt); // ------------------------------------------------- - // CLEAN QUERY (nur für Retrieval: Tags + Vector) + // CLEAN QUERY (retrieval-only: tag routing + vector search) // ------------------------------------------------- $cleanQuery = $this->queryCleaner->clean($prompt); if ($cleanQuery === '') { @@ -69,7 +68,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface } // ------------------------------------------------- - // 1) Tag Routing (bereinigte Query) -> NUR Bonus + // 1) Tag routing (cleaned query) -> bonus only // ------------------------------------------------- $candidateDocIds = $this->tagRouting->route($cleanQuery); $candidateSet = null; @@ -79,29 +78,29 @@ final class NdjsonHybridRetriever implements RetrieverInterface } // ------------------------------------------------- - // 2) TopK bestimmen + // 2) Determine TopK // ------------------------------------------------- $topK = $vectorTopKBase; - // List mode: höhere Abdeckung, um mehr Dokumente zu ranken + // List mode: increase coverage to rank more documents if ($isListQuery) { - $topK = (int)round($vectorTopKBase * 2.5); + $topK = (int)round($vectorTopKBase * self::LIST_BONUS); } $topK = max(1, min($topK, self::HARD_MAX_VECTORK)); // ------------------------------------------------- - // 3) Vector Search (immer GLOBAL; Tags sind KEIN Filter) + // 3) Vector search (always GLOBAL; tags are NOT a filter) // ------------------------------------------------- $hits = $this->vectorClient->search($cleanQuery, $topK); if ($hits === []) { - // Tags dürfen NICHT als Fallback wirken (sonst wieder zu mächtig) + // Tags must NOT act as a fallback (otherwise they become too powerful again). 
return []; } // ------------------------------------------------- - // 4) ChunkIds + Scores sammeln (raw) + // 4) Collect chunkIds + scores (raw) // ------------------------------------------------- /** @var array $rawScoreByChunkId */ $rawScoreByChunkId = []; @@ -113,14 +112,14 @@ final class NdjsonHybridRetriever implements RetrieverInterface $raw = (float)$hit['score']; - // Threshold wird auf RAW Score angewendet (Qualitätsgate) + // Apply the threshold to the RAW score (quality gate) if ($raw < self::VECTOR_SCORE_THRESHOLD) { continue; } $chunkId = (string)$hit['chunk_id']; - // Falls mehrfach: den besten raw score behalten + // If a chunk appears multiple times, keep the best raw score if (!isset($rawScoreByChunkId[$chunkId]) || $raw > $rawScoreByChunkId[$chunkId]) { $rawScoreByChunkId[$chunkId] = $raw; } @@ -130,11 +129,11 @@ final class NdjsonHybridRetriever implements RetrieverInterface return []; } - // Lookup liefert docId + Text etc. + // Lookup returns document_id + text etc. 
$rows = $this->lookup->findByChunkIds(array_keys($rawScoreByChunkId)); // ------------------------------------------------- - // 5) Adjusted Score (Tag Bonus) + Ranking + // 5) Adjusted score (tag bonus) + ranking // ------------------------------------------------- /** @var array $adjScoreByChunkId */ $adjScoreByChunkId = []; @@ -174,7 +173,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface $rankedChunkIds = array_keys($adjScoreByChunkId); // ------------------------------------------------- - // 6) Listenmodus → Dokument-Ranking (mit Tag-Bonus in Scores) + // 6) List mode -> document ranking (with tag bonus in scores) // ------------------------------------------------- if ($isListQuery) { $rankedDocIds = $this->rankDocumentsFromAdjustedScores($adjScoreByChunkId, $rows); @@ -189,7 +188,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface } // ------------------------------------------------- - // 7) Normaler Chunk-Modus (nach adjusted Ranking) + // 7) Normal chunk mode (by adjusted ranking) // ------------------------------------------------- return $this->collectTexts($rankedChunkIds, $rows, $limit); } @@ -199,7 +198,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface // ========================================================= // ========================================================= - // DOCUMENT RANKING (Adjusted Scores incl. Tag Bonus) + // DOCUMENT RANKING (Adjusted scores incl. 
tag bonus) // ========================================================= /** diff --git a/src/Service/ModelGenerationConfigManager.php b/src/Service/ModelGenerationConfigManager.php index fed0a0d..0434b94 100644 --- a/src/Service/ModelGenerationConfigManager.php +++ b/src/Service/ModelGenerationConfigManager.php @@ -64,7 +64,7 @@ final class ModelGenerationConfigManager // Enterprise RAG Warnbereich (optional, nur Logging) if ($config->getTemperature() > 0.7) { - // hier könntest du optional Logging einbauen + // hier könnte man optional Logging einbauen } } } diff --git a/src/Tag/TagNdjsonExporter.php b/src/Tag/TagNdjsonExporter.php index eeb5c93..976f3bb 100644 --- a/src/Tag/TagNdjsonExporter.php +++ b/src/Tag/TagNdjsonExporter.php @@ -8,11 +8,11 @@ use App\Entity\DocumentTag; use App\Entity\Tag; use Doctrine\ORM\EntityManagerInterface; -final class TagNdjsonExporter +final readonly class TagNdjsonExporter { public function __construct( private EntityManagerInterface $em, - private string $tagsNdjsonPath, + private string $tagsNdjsonPath, ) {} /** diff --git a/src/Tag/TagService.php b/src/Tag/TagService.php index 4a83985..0081c40 100644 --- a/src/Tag/TagService.php +++ b/src/Tag/TagService.php @@ -10,11 +10,11 @@ use App\Entity\DocumentTag; use App\Service\TagRebuildJobService; use Doctrine\ORM\EntityManagerInterface; -final class TagService +final readonly class TagService { public function __construct( private EntityManagerInterface $em, - private TagRebuildJobService $jobs, + private TagRebuildJobService $jobs, ) {} // ========================================================= diff --git a/src/Tag/TagVectorIndexBuilder.php b/src/Tag/TagVectorIndexBuilder.php index 8ef38d4..b0f6d64 100644 --- a/src/Tag/TagVectorIndexBuilder.php +++ b/src/Tag/TagVectorIndexBuilder.php @@ -7,17 +7,17 @@ namespace App\Tag; use App\Index\IndexMetaManager; use Psr\Log\LoggerInterface; -final class TagVectorIndexBuilder +final readonly class TagVectorIndexBuilder { public function
__construct( - private readonly string $pythonBin, - private readonly string $scriptPath, - private readonly string $tagsNdjsonPath, - private readonly string $vectorTagsIndexPath, - private readonly string $embeddingModel, - private readonly int $timeoutSeconds, - private readonly LoggerInterface $agentLogger, - private readonly IndexMetaManager $metaManager, // ✅ NEU + private string $pythonBin, + private string $scriptPath, + private string $tagsNdjsonPath, + private string $vectorTagsIndexPath, + private string $embeddingModel, + private int $timeoutSeconds, + private LoggerInterface $agentLogger, + private IndexMetaManager $metaManager, // ✅ NEU ) {} public function build(): void