configRepository->findActiveForModel();

        if ($config === null) {
            throw new \RuntimeException('No active ModelGenerationConfig found.');
        }

        return $this->retrieveInternal($prompt, $config);
    }

    /**
     * Retrieves context chunks for a prompt using an explicitly supplied configuration.
     *
     * @param string                $prompt Original user prompt.
     * @param ModelGenerationConfig $config Supplies the chunk limit and the vector top-k.
     *
     * @return list<string> Trimmed chunk texts; empty when nothing usable was found.
     */
    public function retrieveForConfig(string $prompt, ModelGenerationConfig $config): array
    {
        return $this->retrieveInternal($prompt, $config);
    }

    /**
     * Shared retrieval pipeline.
     *
     * Steps: clean the query, route it to candidate documents via tags, run a
     * (scoped, if routed) vector search, drop hits below the score threshold,
     * look the remaining chunks up and return their texts. Routed list queries
     * return one best chunk per top-ranked document instead of raw chunks.
     *
     * @param string                $prompt Original user prompt.
     * @param ModelGenerationConfig $config Supplies the chunk limit and the vector top-k.
     *
     * @return list<string> Trimmed chunk texts, at most the configured chunk limit.
     */
    private function retrieveInternal(string $prompt, ModelGenerationConfig $config): array
    {
        // Clamp the configured values into [1, hard maximum].
        $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
        $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));

        // Important: list detection runs on the ORIGINAL prompt, because cleaning
        // may remove trigger words such as "zeige"/"liste".
        $isListQuery = $this->isListQuery($prompt);

        // -------------------------------------------------
        // CLEAN QUERY (used for retrieval only: tags + vector)
        // -------------------------------------------------
        $cleanQuery = $this->queryCleaner->clean($prompt);
        if ($cleanQuery === '') {
            // Cleaning removed everything; fall back to the raw prompt.
            $cleanQuery = $prompt;
        }

        // -------------------------------------------------
        // 1) Tag routing (cleaned query)
        // -------------------------------------------------
        $candidateDocIds = $this->tagRouting->route($cleanQuery);

        // null means "not routed"; otherwise a set keyed by document id.
        $candidateSet = null;
        if (is_array($candidateDocIds) && $candidateDocIds !== []) {
            $candidateSet = array_fill_keys($candidateDocIds, true);
        }

        // -------------------------------------------------
        // 2) Determine top-k
        // -------------------------------------------------
        $topK = $vectorTopKBase;
        if ($isListQuery) {
            // List queries need a wider net: at least 80 hits, or 3x the base.
            $topK = max($vectorTopKBase * 3, 80);
        }
        if ($candidateSet !== null) {
            // Routed searches are widened by the multiplier, never narrowed.
            $topK = max($topK * self::VECTOR_TOPK_MULTIPLIER_WHEN_ROUTED, $topK);
        }
        // Enforce the hard cap on EVERY path. Previously only routed queries were
        // capped, so an unrouted list query could exceed HARD_MAX_VECTORK.
        // The int cast keeps $topK an integer in case the multiplier constant
        // (not visible here) is fractional.
        $topK = max(1, (int) min($topK, self::HARD_MAX_VECTORK));

        // -------------------------------------------------
        // 3) Vector search (cleaned query; scoped when possible)
        // -------------------------------------------------
        if ($candidateSet !== null) {
            $hits = $this->vectorClient->searchScoped(
                $cleanQuery,
                $topK,
                array_keys($candidateSet)
            );

            // Scoped search returned nothing -> global fallback with the base top-k.
            if ($hits === []) {
                $hits = $this->vectorClient->search($cleanQuery, $vectorTopKBase);
            }
        } else {
            $hits = $this->vectorClient->search($cleanQuery, $topK);
        }

        if ($hits === []) {
            return $candidateSet !== null
                ? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit)
                : [];
        }

        // -------------------------------------------------
        // 4) Chunk ids + lookup
        // -------------------------------------------------
        $chunkIds = [];
        foreach ($hits as $hit) {
            // Skip malformed hits and hits below the relevance threshold.
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }
            if ((float) $hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
                continue;
            }

            $chunkIds[] = (string) $hit['chunk_id'];
        }

        if ($chunkIds === []) {
            return $candidateSet !== null
                ? $this->fallbackChunksFromCandidateDocs($candidateSet, $limit)
                : [];
        }

        $rows = $this->lookup->findByChunkIds($chunkIds);

        // -------------------------------------------------
        // 5) List mode -> document ranking (routed queries only)
        // -------------------------------------------------
        if ($isListQuery && $candidateSet !== null) {
            $rankedDocIds = $this->rankDocumentsFromHits($hits, $rows, $candidateSet);
            if ($rankedDocIds === []) {
                return $this->fallbackChunksFromCandidateDocs($candidateSet, $limit);
            }

            $topDocIds = array_slice($rankedDocIds, 0, $limit);

            return $this->collectBestChunkPerDocument($topDocIds, $hits, $rows);
        }

        // -------------------------------------------------
        // 6) Normal chunk mode
        // -------------------------------------------------
        return $this->collectTexts($chunkIds, $rows, $limit);
    }

    // =========================================================
    // LIST QUERY DETECTION
    // =========================================================

    /**
     * Heuristically decides whether the prompt asks for a list of items.
     *
     * The lower-cased prompt counts as a list query when it contains one of the
     * German trigger substrings ("liste", "zeige", "nenn", "welche") or a
     * standalone number.
     *
     * NOTE(review): triggers are matched as plain substrings, so they also fire
     * inside longer words (e.g. "Spezialisten" contains "liste").
     */
    private function isListQuery(string $prompt): bool
    {
        $prompt = mb_strtolower($prompt);

        return str_contains($prompt, 'liste')
            || str_contains($prompt, 'zeige')
            || str_contains($prompt, 'nenn')
            || str_contains($prompt, 'welche')
            || preg_match('/\b\d+\b/', $prompt) === 1;
    }

    // =========================================================
    // DOCUMENT RANKING
    //
// =========================================================

    /**
     * Ranks candidate documents by the vector scores of their chunks.
     *
     * A document's rank value is the mean of its (up to) three highest hit
     * scores. Hits whose chunk is missing from $rows, or whose document is not
     * in $candidateSet, are ignored.
     *
     * @param array $hits         Vector hits; each is read via 'chunk_id' and 'score'.
     * @param array $rows         Chunk rows keyed by chunk id; read via 'document_id'.
     * @param array $candidateSet Set of allowed document ids (ids are the keys).
     *
     * @return list<string> Document ids, best first; empty when no hit qualified.
     */
    private function rankDocumentsFromHits(
        array $hits,
        array $rows,
        array $candidateSet
    ): array {
        $documentScores = [];

        foreach ($hits as $hit) {
            $chunkId = (string) ($hit['chunk_id'] ?? '');
            if (!isset($rows[$chunkId])) {
                continue;
            }

            $docId = $rows[$chunkId]['document_id'] ?? null;
            if (!is_string($docId) || !isset($candidateSet[$docId])) {
                continue;
            }

            // A hit without a score counts as 0.0 instead of raising a warning.
            $documentScores[$docId][] = (float) ($hit['score'] ?? 0.0);
        }

        if ($documentScores === []) {
            return [];
        }

        $ranked = [];
        foreach ($documentScores as $docId => $scores) {
            rsort($scores);
            $topScores = array_slice($scores, 0, 3);
            $ranked[$docId] = array_sum($topScores) / count($topScores);
        }

        arsort($ranked);

        // PHP stores integer-like string keys (e.g. "42") as ints, so array_keys()
        // may hand back ints. Cast back to strings so callers that compare ids
        // strictly against the string 'document_id' values still match.
        return array_map(
            static fn (int|string $docId): string => (string) $docId,
            array_keys($ranked)
        );
    }

    /**
     * Returns, for each given document, the text of its highest-scoring chunk.
     *
     * Output order follows $docIds. A document is skipped when it has no hit in
     * $rows or when its best chunk has no usable (non-blank string) text.
     *
     * @param array $docIds Document ids in the desired output order.
     * @param array $hits   Vector hits; each is read via 'chunk_id' and 'score'.
     * @param array $rows   Chunk rows keyed by chunk id; read via 'document_id' and 'text'.
     *
     * @return list<string> Trimmed chunk texts, at most one per document.
     */
    private function collectBestChunkPerDocument(
        array $docIds,
        array $hits,
        array $rows
    ): array {
        // One pass over the hits instead of one pass per document: remember the
        // best-scoring chunk for every document seen. On ties the earliest hit wins.
        $bestByDoc = [];
        foreach ($hits as $hit) {
            $chunkId = (string) ($hit['chunk_id'] ?? '');
            if (!isset($rows[$chunkId])) {
                continue;
            }

            $docId = $rows[$chunkId]['document_id'] ?? null;
            if (!is_string($docId)) {
                continue;
            }

            $score = (float) ($hit['score'] ?? 0.0);
            if (!isset($bestByDoc[$docId]) || $score > $bestByDoc[$docId]['score']) {
                $bestByDoc[$docId] = [
                    'score' => $score,
                    'text' => $rows[$chunkId]['text'] ?? null,
                ];
            }
        }

        $result = [];
        foreach ($docIds as $docId) {
            // Array-key lookup treats "42" and 42 alike, so ids that were cast to
            // int as array keys elsewhere still find their document here.
            $bestText = $bestByDoc[$docId]['text'] ?? null;
            if (!is_string($bestText)) {
                continue;
            }

            $chunk = trim($bestText);
            if ($chunk !== '') {
                $result[] = $chunk;
            }
        }

        return $result;
    }

    // =========================================================
    // FALLBACK + NORMAL MODE
    // =========================================================

    /**
     * Fallback when vector search yields nothing usable: streams all chunks and
     * returns the first $limit distinct texts that belong to candidate documents.
     *
     * Texts are de-duplicated case-insensitively after collapsing whitespace.
     *
     * @param array $candidateSet Set of allowed document ids (ids are the keys).
     * @param int   $limit        Maximum number of texts to return.
     *
     * @return list<string> Trimmed, distinct chunk texts.
     */
    private function fallbackChunksFromCandidateDocs(array $candidateSet, int $limit): array
    {
        $seen = [];
        $out = [];

        foreach ($this->chunkManager->streamAll() as $row) {
            $docId = $row['document_id'] ?? null;
            if (!is_string($docId) || !isset($candidateSet[$docId])) {
                continue;
            }

            $text = $row['text'] ?? null;
            if (!is_string($text)) {
                continue;
            }

            $chunk = trim($text);
            if ($chunk === '') {
                // Blank chunks carry no context and would only use up the limit.
                continue;
            }

            // preg_replace() returns null on failure (e.g. invalid UTF-8 with /u);
            // fall back to the raw chunk so such chunks do not all share the key ''.
            $collapsed = preg_replace('/\s+/u', ' ', $chunk);
            $key = mb_strtolower($collapsed ?? $chunk);
            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $chunk;

            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Normal mode: returns up to $limit distinct chunk texts in hit order.
     *
     * Applies the same text validation and de-duplication as the fallback path.
     *
     * @param array $chunkIds Chunk ids in ranking order.
     * @param array $rows     Chunk rows keyed by chunk id; read via 'text'.
     * @param int   $limit    Maximum number of texts to return.
     *
     * @return list<string> Trimmed, distinct chunk texts.
     */
    private function collectTexts(array $chunkIds, array $rows, int $limit): array
    {
        $seen = [];
        $out = [];

        foreach ($chunkIds as $id) {
            $text = $rows[$id]['text'] ?? null;
            if (!is_string($text)) {
                continue;
            }

            $chunk = trim($text);
            if ($chunk === '') {
                continue;
            }

            // See fallbackChunksFromCandidateDocs(): a null from preg_replace()
            // must not collapse every failing chunk onto the key ''.
            $collapsed = preg_replace('/\s+/u', ' ', $chunk);
            $key = mb_strtolower($collapsed ?? $chunk);
            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $chunk;

            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }
}