diff --git a/patch_history/RETRIEX_PATCH_100_ADMIN_EVAL_UX_README.md b/patch_history/RETRIEX_PATCH_100_ADMIN_EVAL_UX_README.md new file mode 100644 index 0000000..87580ec --- /dev/null +++ b/patch_history/RETRIEX_PATCH_100_ADMIN_EVAL_UX_README.md @@ -0,0 +1,75 @@ +# RetrieX Patch p100 - Admin Eval UX + +Status: patch-only candidate +Basis: confirmed v1.6.2 + p99/p99b/p99c green eval suite + +## Ziel + +p100 macht die mit p99 eingeführte Eval-Suite im Admin sichtbar und bedienbar, ohne die produktive RAG-, Shop-, Prompt-, Scoring- oder Antwortlogik fachlich zu ändern. + +## Enthalten + +- Neuer Admin-Bereich `/admin/evals/` +- Übersicht über die Eval-Typen: + - `retrieval` + - `shop_query` + - `followup` + - `answer_guard` +- Anzeige der Case-Anzahl pro Typ +- Anzeige typspezifischer letzter Reports aus `tests/evals/reports/<type>-last-run.json` +- Run-Buttons pro Eval-Typ +- Formular zum Ausführen eines kompletten Typs oder einer einzelnen Case-ID +- Detailansicht für PASS/FAIL, Fehler und Result-Details +- CLI-Referenz im Admin +- Sidebar-Link unter KI-Endpunkte +- Link von der KI-/LLM-Setup-Seite zur Eval Suite + +## Report-Verhalten + +Admin-Runs schreiben zwei Reports: + +- `tests/evals/reports/<type>-last-run.json` +- `tests/evals/reports/last-run.json` + +Die CLI bleibt unverändert und schreibt weiterhin den bekannten `last-run.json`. + +## Rollen + +Der neue Bereich ist auf Controller-Ebene durch `ROLE_KNOWLEDGE_ADMIN` geschützt. 
+ +## Nicht geändert + +- keine Retrieval-Gewichte +- keine Shopquery-Erzeugungslogik +- keine Follow-up-Logik +- keine Answer-Guard-Logik +- keine Prompt-Änderung +- keine YAML-Vokabularänderung +- keine Modellparameteränderung +- keine Datenbankmigration + +## Geänderte Dateien + +- `src/Controller/Admin/AdminEvalController.php` +- `src/Service/Admin/EvalAdminService.php` +- `templates/admin/evals/index.html.twig` +- `templates/admin/base.html.twig` +- `templates/admin/model_config/list.html.twig` +- `patch_history/RETRIEX_PATCH_100_ADMIN_EVAL_UX_README.md` + +## Prüfung nach Einspielen + +```bash +php bin/console mto:agent:config:validate +php bin/console mto:agent:eval:run retrieval +php bin/console mto:agent:eval:run shop_query +php bin/console mto:agent:eval:run followup +php bin/console mto:agent:eval:run answer_guard +``` + +Zusätzlich im Browser prüfen: + +- `/admin/evals/` +- Eval-Typ ausführen +- Detailreport öffnen +- Sidebar-Link sichtbar für Knowledge Admins diff --git a/src/Controller/Admin/AdminEvalController.php b/src/Controller/Admin/AdminEvalController.php new file mode 100644 index 0000000..611b335 --- /dev/null +++ b/src/Controller/Admin/AdminEvalController.php @@ -0,0 +1,68 @@ +denyAccessUnlessGranted(ApplicationRoles::ROLE_KNOWLEDGE_ADMIN); + /* Fall back to 'retrieval' when the 'type' query parameter is empty or not a supported eval type. */ + $selectedType = trim((string) $request->query->get('type', '')); + if ($selectedType === '' || !in_array($selectedType, $evals->supportedTypeNames(), true)) { + $selectedType = 'retrieval'; + } + /* Render the page with the type map, the overview rows, the cases per type, the selected type with its report and the generic last report. */ + return $this->render('admin/evals/index.html.twig', [ + 'types' => $evals->supportedTypes(), + 'overview' => $evals->overview(), + 'cases_by_type' => $evals->casesByType(), + 'selected_type' => $selectedType, + 'selected_report' => $evals->readTypeReport($selectedType), + 'last_report' => $evals->readLastReport(), + ]); + } + /** Handles the POST of the run form: runs the requested eval type (optionally a single case), flashes the outcome and redirects back to the index route. */ + #[Route('/run', name: 'admin_evals_run', methods: ['POST'])] + public function run(Request $request, EvalAdminService $evals): Response + { + 
$this->denyAccessUnlessGranted(ApplicationRoles::ROLE_KNOWLEDGE_ADMIN); + /* Reject requests that do not carry a valid 'admin_eval_run' CSRF token. */ + if (!$this->isCsrfTokenValid('admin_eval_run', (string) $request->request->get('_token'))) { + throw $this->createAccessDeniedException(); + } + /* 'type' defaults to 'retrieval'; an empty 'case_id' means no case filter (null is passed to the service). */ + $type = trim((string) $request->request->get('type', 'retrieval')); + $caseId = trim((string) $request->request->get('case_id', '')); + /* Anything thrown during the run is reported as a 'danger' flash message instead of propagating. */ + try { + $report = $evals->run($type, $caseId !== '' ? $caseId : null); + $this->addFlash( + ((int) ($report['failed'] ?? 0)) === 0 ? 'success' : 'danger', + sprintf( + 'Eval %s abgeschlossen: %d/%d bestanden.', + $type, + (int) ($report['passed'] ?? 0), + (int) ($report['total'] ?? 0) + ) + ); + } catch (\Throwable $e) { + $this->addFlash('danger', $e->getMessage()); + } + /* Redirect back to the index with the submitted type as the 'type' query parameter. */ + return $this->redirectToRoute('admin_evals_index', [ + 'type' => $type, + ]); + } +} diff --git a/src/Service/Admin/EvalAdminService.php b/src/Service/Admin/EvalAdminService.php new file mode 100644 index 0000000..4c67c8b --- /dev/null +++ b/src/Service/Admin/EvalAdminService.php @@ -0,0 +1,227 @@ + + */ + private const TYPES = [ + 'retrieval' => 'Retrieval', + 'shop_query' => 'Shopquery', + 'followup' => 'Follow-up', + 'answer_guard' => 'Answer-Guard', + ]; + /** Receives the case loader, the eval runner, the report writer and the project directory that report file paths are built from. */ + public function __construct( + private EvalCaseLoader $caseLoader, + private AgentEvalRunner $runner, + private EvalReportWriter $reportWriter, + private string $projectDir, + ) { + } + + /** Returns the supported eval types as a map of type key to display label. + * @return array + */ + public function supportedTypes(): array + { + return self::TYPES; + } + + /** Returns only the type keys of the supported eval types. + * @return array + */ + public function supportedTypeNames(): array + { + return array_keys(self::TYPES); + } + /** Trims the given type and returns it; throws \InvalidArgumentException when it is not a supported eval type. */ + public function assertSupportedType(string $type): string + { + $type = trim($type); + + if (!array_key_exists($type, self::TYPES)) { + throw new \InvalidArgumentException(sprintf('Unsupported eval type: %s', $type)); + } + + return $type; + } + + /** Returns, for every supported type, its loaded cases reduced to the keys 'id', 'type' and 'prompt'. + * @return array> + */ + public function casesByType(): array + { + $casesByType = []; + + foreach (array_keys(self::TYPES) as 
$type) { + $casesByType[$type] = array_map( + static fn (EvalCase $case): array => [ + 'id' => $case->id, + 'type' => $case->type, + 'prompt' => $case->prompt, + ], + $this->loadCases($type) + ); + } + + return $casesByType; + } + + /** Builds one overview row per supported type: type key, label, number of loaded cases, the type-specific report (or null) and a status derived from that report. + * @return array> + */ + public function overview(): array + { + $overview = []; + + foreach (self::TYPES as $type => $label) { + $cases = $this->loadCases($type); + $report = $this->readTypeReport($type); + + $overview[] = [ + 'type' => $type, + 'label' => $label, + 'case_count' => count($cases), + 'report' => $report, + 'status' => $this->statusFromReport($report), + ]; + } + + return $overview; + } + + /** Runs all cases of a type, or only the case whose id equals $caseId, and writes the report under '{type}-last-run.json' and once more without a file name (presumably the generic last-run.json; confirm in EvalReportWriter). Throws \InvalidArgumentException for an unsupported type and \RuntimeException when no case is selected. + * @return array + */ + public function run(string $type, ?string $caseId = null): array + { + $type = $this->assertSupportedType($type); + $caseId = trim((string) $caseId); + $cases = $this->loadCases($type); + /* Narrow the run to the matching case when a non-empty case id was given. */ + if ($caseId !== '') { + $cases = array_values(array_filter( + $cases, + static fn (EvalCase $case): bool => $case->id === $caseId + )); + } + /* Also reached when the given case id matches no case of this type. */ + if ($cases === []) { + throw new \RuntimeException('No eval cases selected.'); + } + + $results = $this->runner->runAll($cases); + $report = $this->buildReport($type, $caseId !== '' ? 
$caseId : null, $results); + /* The same report is written twice: under the type-specific file name and once without a file name. */ + $typeReportPath = $this->reportWriter->write($report, sprintf('%s-last-run.json', $type)); + $lastReportPath = $this->reportWriter->write($report); + /* These two keys are added after both writes, so only the returned array carries them. */ + $report['written_to'] = $typeReportPath; + $report['last_run_written_to'] = $lastReportPath; + + return $report; + } + + /** Reads '{projectDir}/tests/evals/reports/{type}-last-run.json' for a supported type; returns null when no usable report exists and throws \InvalidArgumentException for an unsupported type. + * @return array|null + */ + public function readTypeReport(string $type): ?array + { + $type = $this->assertSupportedType($type); + + return $this->readReportFile(sprintf('%s/tests/evals/reports/%s-last-run.json', $this->projectDir, $type)); + } + + /** Reads '{projectDir}/tests/evals/reports/last-run.json'; returns null when no usable report exists. + * @return array|null + */ + public function readLastReport(): ?array + { + return $this->readReportFile(sprintf('%s/tests/evals/reports/last-run.json', $this->projectDir)); + } + + /** Loads the cases of a type through the case loader after validating the type. + * @return array + */ + private function loadCases(string $type): array + { + return $this->caseLoader->load($this->assertSupportedType($type)); + } + + /** Assembles the report array: type, case filter, total/passed/failed counts, an ATOM-formatted generation timestamp and every result converted via toArray(). + * @param array $results + * @return array + */ + private function buildReport(string $type, ?string $caseId, array $results): array + { + $passed = count(array_filter( + $results, + static fn (EvalResult $result): bool => $result->passed + )); + $failed = count($results) - $passed; + + return [ + 'type' => $type, + 'case_filter' => $caseId, + 'total' => count($results), + 'passed' => $passed, + 'failed' => $failed, + 'generated_at' => (new \DateTimeImmutable())->format(\DateTimeInterface::ATOM), + 'results' => array_map( + static fn (EvalResult $result): array => $result->toArray(), + $results + ), + ]; + } + + /** Returns the decoded JSON of the file at $path, or null when the file is missing, cannot be read, is blank or does not decode to an array. + * @return array|null + */ + private function readReportFile(string $path): ?array + { + if (!is_file($path)) { + return null; + } + + $raw = file_get_contents($path); + + if (!is_string($raw) || trim($raw) === '') { + return null; + } + + $decoded = json_decode($raw, true); + + if (!is_array($decoded)) { + return null; + } + + return $decoded; + } + /** Maps a report to 'not_run' (no report), 'empty' (total of 0 or less), 'green' (no failures) or 'red' (at least one failure). */ + private function statusFromReport(?array $report): string + { + if ($report === null) { + return 'not_run'; + 
} + /* Missing 'failed' or 'total' keys are treated as 0. */ + $failed = (int) ($report['failed'] ?? 0); + $total = (int) ($report['total'] ?? 0); + + if ($total <= 0) { + return 'empty'; + } + + return $failed === 0 ? 'green' : 'red'; + } +} diff --git a/templates/admin/base.html.twig b/templates/admin/base.html.twig index aaae130..3bd46ee 100644 --- a/templates/admin/base.html.twig +++ b/templates/admin/base.html.twig @@ -134,6 +134,10 @@ href="{{ path('admin_model_config_list') }}#agentLiveTest"> KI-Agent Live-Test + + Eval Suite + {% endif %}
diff --git a/templates/admin/evals/index.html.twig b/templates/admin/evals/index.html.twig new file mode 100644 index 0000000..2fb5046 --- /dev/null +++ b/templates/admin/evals/index.html.twig @@ -0,0 +1,379 @@ +{% extends 'admin/base.html.twig' %} + +{% block title %}RetrieX Eval Suite{% endblock %} + +{% block body %} + +
+
+

+ RetrieX Eval Suite +

+
+ Regressionen für Retrieval, Shopquery, Follow-up und Answer-Guard direkt im Admin prüfen. +
+
+ + + Zurück zum KI-/LLM-Setup + +
+ + {% for label in ['success', 'danger', 'warning', 'info'] %} + {% for message in app.flashes(label) %} +
+ {{ message }} +
+ {% endfor %} + {% endfor %} + + + +
+
+
+
+ +
Eval läuft ...
+
+ Die Regressionstests werden ausgeführt. Bitte die Seite nicht neu laden. +
+
+
+
+
+ +
+ {% for item in overview %} + {% set report = item.report %} + {% set status = item.status %} + {% set badgeClass = status == 'green' + ? 'bg-success' + : (status == 'red' ? 'bg-danger' : 'bg-secondary') + %} +
+
+
+
+
{{ item.label }}
+ + {% if status == 'green' %} + grün + {% elseif status == 'red' %} + rot + {% elseif status == 'empty' %} + leer + {% else %} + nicht gelaufen + {% endif %} + +
+ +
+ {{ item.case_count }} Cases +
+ + {% if report %} +
+
Total: {{ report.total|default(0) }}
+
Passed: {{ report.passed|default(0) }}
+
Failed: {{ report.failed|default(0) }}
+
+ {{ report.generated_at|default('') }} +
+
+ {% else %} +
+ Für diesen Typ liegt noch kein Admin-Report vor. +
+ {% endif %} + +
+
+ + + +
+ + + Details + +
+
+
+
+ {% endfor %} +
+ +
+
+
+
+
+ Eval ausführen +
+ +
+ + +
+ + +
+ Ohne Case-ID wird der komplette Typ ausgeführt. +
+
+ +
+ + + + {% for type, cases in cases_by_type %} + {% for case in cases %} + + {% endfor %} + {% endfor %} + +
+ + +
+
+
+
+ +
+
+
+
+ CLI-Referenz +
+ +

+ Die Admin-Runs schreiben typspezifische Reports nach + tests/evals/reports/<type>-last-run.json + und zusätzlich den bekannten last-run.json. +

+ +
+ {% for type, label in types %} +
+ {{ label }}
+ php bin/console mto:agent:eval:run {{ type }} +
+ {% endfor %} +
+ + {% if last_report %} +
+
+ Letzter generischer Report: + {{ last_report.type|default('unknown') }}, + {{ last_report.passed|default(0) }}/{{ last_report.total|default(0) }} bestanden, + {{ last_report.generated_at|default('') }} +
+ {% endif %} +
+
+
+
+ +
+
+
+
+ + Report-Details: {{ types[selected_type]|default(selected_type) }} +
+ +
+ {% for type, label in types %} + + {{ label }} + + {% endfor %} +
+
+ + {% if selected_report %} + {% set selectedFailed = selected_report.failed|default(0) %} +
+
+
+
Total
+
{{ selected_report.total|default(0) }}
+
+
+
+
+
Passed
+
{{ selected_report.passed|default(0) }}
+
+
+
+
+
Failed
+
+ {{ selectedFailed }} +
+
+
+
+
+
Generated
+
{{ selected_report.generated_at|default('') }}
+
+
+
+ +
+ + + + + + + + + + + {% for result in selected_report.results|default([]) %} + + + + + + + {% else %} + + + + {% endfor %} + +
StatusCaseDauerFailures / Details
+ {% if result.passed|default(false) %} + PASS + {% else %} + FAIL + {% endif %} + + {{ result.case_id|default('') }} +
{{ result.type|default('') }}
+
+ {{ result.duration_ms|default(0) }} ms + + {% if result.failures|default([]) is not empty %} +
    + {% for failure in result.failures %} +
  • {{ failure }}
  • + {% endfor %} +
+ {% else %} +
Keine Fehler.
+ {% endif %} + +
+ + Details anzeigen + +
{{ result.details|default({})|json_encode(constant('JSON_PRETTY_PRINT')) }}
+
+
+ Dieser Report enthält keine Resultate. +
+
+ {% else %} +
+ Für {{ types[selected_type]|default(selected_type) }} liegt noch kein typspezifischer Admin-Report vor. + Starte den Eval oben oder per CLI. +
+ {% endif %} +
+
+ + + + +{% endblock %} diff --git a/templates/admin/model_config/list.html.twig b/templates/admin/model_config/list.html.twig index b8efa3a..6413e7d 100644 --- a/templates/admin/model_config/list.html.twig +++ b/templates/admin/model_config/list.html.twig @@ -4,15 +4,24 @@ {% block body %} -
+

KI Modell-Generierung

- {% if is_granted('ROLE_SUPER_ADMIN') %} - - Neue Konfiguration - - {% endif %} +
+ {% if is_granted('ROLE_KNOWLEDGE_ADMIN') %} + + Eval Suite + + {% endif %} + + {% if is_granted('ROLE_SUPER_ADMIN') %} + + Neue Konfiguration + + {% endif %} +
{# ========================================================= #}