From 6e2ca15e97492a76b05832944cd400fe0d011932 Mon Sep 17 00:00:00 2001 From: team 1 Date: Tue, 12 May 2026 11:08:34 +0200 Subject: [PATCH] p101a --- ...01A_ADMIN_EVAL_CASE_CREATOR_PAGE_README.md | 54 ++++ src/Controller/Admin/AdminEvalController.php | 53 +++- src/Service/Admin/EvalAdminService.php | 235 +++++++++++++- templates/admin/evals/case_new.html.twig | 194 ++++++++++++ templates/admin/evals/index.html.twig | 293 +----------------- 5 files changed, 542 insertions(+), 287 deletions(-) create mode 100644 patch_history/RETRIEX_PATCH_101A_ADMIN_EVAL_CASE_CREATOR_PAGE_README.md create mode 100644 templates/admin/evals/case_new.html.twig diff --git a/patch_history/RETRIEX_PATCH_101A_ADMIN_EVAL_CASE_CREATOR_PAGE_README.md b/patch_history/RETRIEX_PATCH_101A_ADMIN_EVAL_CASE_CREATOR_PAGE_README.md new file mode 100644 index 0000000..d32d4b5 --- /dev/null +++ b/patch_history/RETRIEX_PATCH_101A_ADMIN_EVAL_CASE_CREATOR_PAGE_README.md @@ -0,0 +1,54 @@ +# RetrieX Patch p101a - Admin Eval Case Creator Separate Page + +## Ziel + +Der Eval-Case-Creator wird als eigene Admin-Seite geführt, damit die Eval-Suite-Übersicht schlank bleibt und nicht durch das komplette Case-Erstellformular aufgeblasen wirkt. + +## Neue / geänderte Admin-Routen + +- `GET /admin/evals/` bleibt die fokussierte Eval-Suite-Übersicht für Runs und Reports. +- `GET /admin/evals/cases/new` zeigt das separate Formular zum Anlegen neuer Eval-Cases. +- `POST /admin/evals/cases` speichert neue Eval-Cases in `tests/evals/cases/<type>.ndjson`. + +## UX-Änderungen + +- Die Eval-Suite-Übersicht erhält nur einen kompakten Button `Eval-Case erstellen`. +- Report-Ergebnisse erhalten den Button `Als neuen Case vorbereiten`. +- Die neue Seite übernimmt bei vorbereiteten Cases: + - Eval-Typ + - Prompt + - History/Kontext, sofern im Report vorhanden + - vorgeschlagene Assertions aus Query, Einzelqueries oder Dokument-IDs +- Die eigentliche Case-Erstellung liegt außerhalb der Report-/Run-Übersicht. 
+ +## Validierung + +Beim Speichern werden geprüft: + +- CSRF-Token +- `ROLE_KNOWLEDGE_ADMIN` +- unterstützter Eval-Typ +- eindeutige Case-ID über alle Eval-Typen +- erlaubtes Case-ID-Format +- nicht leerer Prompt +- gültiges Assert-JSON-Objekt +- gültige History-JSON-Liste +- DTO-Validierung über `EvalCase::fromArray()` + +## Nicht geändert + +- Keine Retrieval-Logik +- Keine Shopquery-Logik +- Keine Follow-up-Logik +- Keine Answer-Guard-Logik +- Keine Eval-Cases +- Keine YAML-/Parameteränderung +- Keine Migration + +## Betroffene Dateien + +- `src/Controller/Admin/AdminEvalController.php` +- `src/Service/Admin/EvalAdminService.php` +- `templates/admin/evals/index.html.twig` +- `templates/admin/evals/case_new.html.twig` +- `patch_history/RETRIEX_PATCH_101A_ADMIN_EVAL_CASE_CREATOR_PAGE_README.md` diff --git a/src/Controller/Admin/AdminEvalController.php b/src/Controller/Admin/AdminEvalController.php index 097d0ea..71d9d4d 100644 --- a/src/Controller/Admin/AdminEvalController.php +++ b/src/Controller/Admin/AdminEvalController.php @@ -68,7 +68,35 @@ final class AdminEvalController extends AbstractController ]); } - #[Route('/case/create', name: 'admin_evals_case_create', methods: ['POST'])] + #[Route('/cases/new', name: 'admin_evals_case_new', methods: ['GET'])] + public function newCase(Request $request, EvalAdminService $evals): Response + { + $this->denyAccessUnlessGranted(ApplicationRoles::ROLE_KNOWLEDGE_ADMIN); + + $type = trim((string) $request->query->get('type', 'retrieval')); + if (!in_array($type, $evals->supportedTypeNames(), true)) { + $type = 'retrieval'; + } + + $sourceType = trim((string) $request->query->get('source_type', '')); + $sourceCaseId = trim((string) $request->query->get('source_case_id', '')); + + try { + $draft = $sourceType !== '' && $sourceCaseId !== '' + ? 
$evals->caseDraftFromReportResult($sourceType, $sourceCaseId) + : $evals->emptyCaseDraft($type); + } catch (\Throwable $e) { + $this->addFlash('warning', $e->getMessage()); + $draft = $evals->emptyCaseDraft($type); + } + + return $this->render('admin/evals/case_new.html.twig', [ + 'types' => $evals->supportedTypes(), + 'case_draft' => $draft, + ]); + } + + #[Route('/cases', name: 'admin_evals_case_create', methods: ['POST'])] public function createCase(Request $request, EvalAdminService $evals): Response { $this->denyAccessUnlessGranted(ApplicationRoles::ROLE_KNOWLEDGE_ADMIN); @@ -78,6 +106,15 @@ final class AdminEvalController extends AbstractController } $type = trim((string) $request->request->get('type', 'retrieval')); + $draft = [ + 'type' => $type, + 'id' => (string) $request->request->get('id', ''), + 'prompt' => (string) $request->request->get('prompt', ''), + 'assert_json' => (string) $request->request->get('assert_json', ''), + 'history_json' => (string) $request->request->get('history_json', ''), + 'request_context_hint' => (string) $request->request->get('request_context_hint', ''), + 'source_label' => '', + ]; try { $created = $evals->createCase( @@ -95,17 +132,21 @@ final class AdminEvalController extends AbstractController 'success', sprintf('Eval-Case "%s" wurde in %s.ndjson gespeichert.', (string) ($created['id'] ?? 
''), $type) ); + + return $this->redirectToRoute('admin_evals_index', [ + 'type' => $type, + ]); } catch (\Throwable $e) { $this->addFlash('danger', $e->getMessage()); } if (!in_array($type, $evals->supportedTypeNames(), true)) { - $type = 'retrieval'; + $draft['type'] = 'retrieval'; } - return $this->redirectToRoute('admin_evals_index', [ - 'type' => $type, - ]); + return $this->render('admin/evals/case_new.html.twig', [ + 'types' => $evals->supportedTypes(), + 'case_draft' => $draft, + ], new Response('', Response::HTTP_UNPROCESSABLE_ENTITY)); } - } diff --git a/src/Service/Admin/EvalAdminService.php b/src/Service/Admin/EvalAdminService.php index 57d4731..c91c2e2 100644 --- a/src/Service/Admin/EvalAdminService.php +++ b/src/Service/Admin/EvalAdminService.php @@ -144,6 +144,75 @@ final readonly class EvalAdminService return $report; } + /** + * @return array{type:string,id:string,prompt:string,assert_json:string,history_json:string,request_context_hint:string,source_label:string} + */ + public function emptyCaseDraft(string $type = 'retrieval'): array + { + $type = $this->assertSupportedType($type); + + return [ + 'type' => $type, + 'id' => '', + 'prompt' => '', + 'assert_json' => $this->encodePrettyJson($this->defaultAssertForType($type)), + 'history_json' => '', + 'request_context_hint' => '', + 'source_label' => '', + ]; + } + + /** + * @return array{type:string,id:string,prompt:string,assert_json:string,history_json:string,request_context_hint:string,source_label:string} + */ + public function caseDraftFromReportResult(string $type, string $caseId): array + { + $type = $this->assertSupportedType($type); + $caseId = trim($caseId); + + if ($caseId === '') { + throw new \InvalidArgumentException('Es wurde keine Quell-Case-ID übergeben.'); + } + + $report = $this->readTypeReport($type); + if ($report === null) { + throw new \RuntimeException(sprintf( + 'Für den Eval-Typ "%s" liegt kein Report vor. 
Bitte den Eval zuerst ausführen.', + $type + )); + } + + $result = null; + foreach (($report['results'] ?? []) as $candidate) { + if (is_array($candidate) && (string) ($candidate['case_id'] ?? '') === $caseId) { + $result = $candidate; + break; + } + } + + if (!is_array($result)) { + throw new \RuntimeException(sprintf( + 'Der Report enthält keinen Case "%s" für Eval-Typ "%s".', + $caseId, + $type + )); + } + + $details = is_array($result['details'] ?? null) ? $result['details'] : []; + $prompt = trim((string) ($result['prompt'] ?? $details['prompt'] ?? '')); + $history = $this->historyDraftFromDetails($details); + $assert = $this->suggestAssertFromReportResult($type, $result, $details); + + return [ + 'type' => $type, + 'id' => $this->suggestUniqueCaseId($type . '_' . $caseId . '_new'), + 'prompt' => $prompt, + 'assert_json' => $this->encodePrettyJson($assert), + 'history_json' => $history === [] ? '' : $this->encodePrettyJson($history), + 'request_context_hint' => '', + 'source_label' => sprintf('Vorlage aus Report-Case %s (%s)', $caseId, self::TYPES[$type]), + ]; + } /** * @return array{type:string,id:string,path:string,row:array,case_count:int} @@ -190,7 +259,7 @@ final readonly class EvalAdminService $row['request_context_hint'] = $requestContextHint; } - // Reuse the regular DTO validation before writing the case file. + // Validate with the same DTO that the eval runner uses. 
EvalCase::fromArray($row); $path = $this->caseFilePath($type); @@ -221,7 +290,6 @@ final readonly class EvalAdminService ]; } - /** * @param array $cases * @return array @@ -326,7 +394,6 @@ final readonly class EvalAdminService return $decoded; } - private function normalizeNewCaseId(string $id): string { $id = trim($id); @@ -374,7 +441,7 @@ final readonly class EvalAdminService throw new \InvalidArgumentException(sprintf('%s ist ungültig: %s', $label, $e->getMessage())); } - if (!is_array($decoded)) { + if (!is_array($decoded) || !str_starts_with($json, '{') || ($decoded !== [] && array_is_list($decoded))) { throw new \InvalidArgumentException(sprintf('%s muss ein JSON-Objekt sein.', $label)); } @@ -398,7 +465,7 @@ final readonly class EvalAdminService throw new \InvalidArgumentException(sprintf('History-JSON ist ungültig: %s', $e->getMessage())); } - if (!is_array($decoded)) { + if (!is_array($decoded) || !str_starts_with($json, '[') || !array_is_list($decoded)) { throw new \InvalidArgumentException('History-JSON muss eine JSON-Liste sein.'); } @@ -458,4 +525,162 @@ final readonly class EvalAdminService return $failed === 0 ? 'green' : 'red'; } + + /** + * @return array + */ + private function defaultAssertForType(string $type): array + { + return match ($type) { + 'retrieval', 'answer_guard' => [ + 'min_results' => 1, + ], + 'shop_query', 'followup' => [ + 'expected_query' => '', + ], + default => [], + }; + } + + /** + * @param array $result + * @param array $details + * @return array + */ + private function suggestAssertFromReportResult(string $type, array $result, array $details): array + { + if (($type === 'shop_query' || $type === 'followup') && is_string($details['query'] ?? null)) { + $query = trim($details['query']); + if ($query !== '') { + return [ + 'expected_query' => $query, + ]; + } + } + + if (($type === 'shop_query' || $type === 'followup') && is_array($details['individual_queries'] ?? 
null)) { + $queries = array_values(array_filter(array_map( + static fn (mixed $value): string => trim((string) $value), + $details['individual_queries'] + ))); + + if ($queries !== []) { + return [ + 'expected_individual_queries' => $queries, + 'expected_individual_queries_exact' => true, + ]; + } + } + + if (is_array($details['document_refs'] ?? null)) { + $documentIds = []; + foreach ($details['document_refs'] as $documentRef) { + if (!is_array($documentRef)) { + continue; + } + + $documentId = trim((string) ($documentRef['id'] ?? '')); + if ($documentId !== '') { + $documentIds[] = $documentId; + } + } + + if ($documentIds !== []) { + return [ + 'min_results' => 1, + 'must_include_one_of_document_ids' => array_values(array_unique($documentIds)), + ]; + } + } + + if (is_array($details['document_ids'] ?? null)) { + $documentIds = array_values(array_filter(array_map( + static fn (mixed $value): string => trim((string) $value), + $details['document_ids'] + ))); + + if ($documentIds !== []) { + return [ + 'min_results' => 1, + 'must_include_one_of_document_ids' => array_values(array_unique($documentIds)), + ]; + } + } + + $resultCount = (int) ($details['result_count'] ?? -1); + if ($resultCount === 0) { + return [ + 'max_results' => 0, + ]; + } + + return $this->defaultAssertForType($type); + } + + /** + * @param array $details + * @return array + */ + private function historyDraftFromDetails(array $details): array + { + if (!is_array($details['history'] ?? null)) { + return []; + } + + $history = []; + foreach ($details['history'] as $entry) { + if (!is_array($entry)) { + continue; + } + + $prompt = trim((string) ($entry['prompt'] ?? '')); + $answer = trim((string) ($entry['answer'] ?? $entry['answer_preview'] ?? '')); + + if ($prompt === '' && $answer === '') { + continue; + } + + $history[] = [ + 'prompt' => $prompt !== '' ? 
$prompt : 'Eval-Kontext', + 'answer' => $answer, + ]; + } + + return $history; + } + + private function suggestUniqueCaseId(string $base): string + { + $base = strtolower(trim($base)); + $base = preg_replace('/[^a-z0-9_-]+/', '_', $base) ?? 'eval_case'; + $base = trim($base, '_-'); + + if ($base === '') { + $base = 'eval_case'; + } + + if (!$this->caseIdExists($base)) { + return $base; + } + + for ($i = 2; $i <= 999; ++$i) { + $candidate = sprintf('%s_%d', $base, $i); + if (!$this->caseIdExists($candidate)) { + return $candidate; + } + } + + return sprintf('%s_%s', $base, (new \DateTimeImmutable())->format('YmdHis')); + } + + /** + * @param array $value + */ + private function encodePrettyJson(array $value): string + { + return json_encode( + $value, + JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_THROW_ON_ERROR + ); + } } diff --git a/templates/admin/evals/case_new.html.twig b/templates/admin/evals/case_new.html.twig new file mode 100644 index 0000000..f38084d --- /dev/null +++ b/templates/admin/evals/case_new.html.twig @@ -0,0 +1,194 @@ +{% extends 'admin/base.html.twig' %} + +{% block title %}Eval-Case erstellen{% endblock %} + +{% block body %} + +
+
+

+ Eval-Case erstellen +

+
+ Neue Regression-Cases separat anlegen, ohne die Eval-Suite-Übersicht aufzublähen. +
+
+ + + Zurück zur Eval Suite + +
+ + {% for label in ['success', 'danger', 'warning', 'info'] %} + {% for message in app.flashes(label) %} +
+ {{ message }} +
+ {% endfor %} + {% endfor %} + + {% if case_draft.source_label|default('') %} +
+ Vorlage geladen: {{ case_draft.source_label }}
+ + Bitte Case-ID, Prompt und Assertions prüfen, bevor du den Case speicherst. + +
+ {% endif %} + +
+
+
+
+
+ Neuer Eval-Case +
+ +
+ + +
+ + +
+ Der Typ entscheidet, in welche Datei geschrieben wird: tests/evals/cases/<type>.ndjson. +
+
+ +
+ + +
+ Eindeutig über alle Eval-Typen. Erlaubt: Buchstaben, Zahlen, _ und -. +
+
+ +
+ + +
+ Exakt der Nutzerprompt, der abgesichert werden soll. Tippfehler bewusst so eintragen, wenn sie Teil des Tests sind. +
+
+ +
+ + +
+ Muss ein gültiges JSON-Objekt sein. Beispiel: {"expected_query":"testomat 808"}. +
+
+ +
+ + +
+ Für Follow-up-Cases empfohlen. Muss eine JSON-Liste sein. Leer lassen für direkte Prompts. +
+
+ +
+ + +
+ Normalerweise leer lassen. Für reguläre Regressionen lieber History-JSON verwenden. +
+
+ +
+ + + Abbrechen + +
+
+
+
+
+ +
+
+
+
+ Feld-Checkliste +
+
    +
  • retrieval: richtiges Dokument / richtige Chunks prüfen.
  • +
  • shop_query: direkte Shopquery prüfen.
  • +
  • followup: Prompt plus History prüfen.
  • +
  • answer_guard: No-Answer- oder Evidenzfälle prüfen.
  • +
+
+
+ +
+
+
+ Häufige Assertions +
+
Exakte Query:
+
{
+  "expected_query": "testomat 808"
+}
+ +
Begriffe müssen enthalten sein:
+
{
+  "must_include_terms": [
+    "testomat",
+    "808"
+  ]
+}
+ +
Dokument muss enthalten sein:
+
{
+  "min_results": 1,
+  "must_include_one_of_document_ids": [
+    "DOKUMENT-ID"
+  ]
+}
+
+
+ +
+
+
+ Empfehlung +
+

+ Ein guter Eval-Case prüft genau einen Zweck. Lieber mehrere kleine Cases anlegen als einen großen, empfindlichen Case. +

+
+
+
+
+ +{% endblock %} diff --git a/templates/admin/evals/index.html.twig b/templates/admin/evals/index.html.twig index d7c328d..9c6d033 100644 --- a/templates/admin/evals/index.html.twig +++ b/templates/admin/evals/index.html.twig @@ -14,10 +14,16 @@ - - Zurück zum KI-/LLM-Setup - + {% for label in ['success', 'danger', 'warning', 'info'] %} @@ -212,100 +218,6 @@ - -
-
-
-
-
- Eval-Case erstellen -
-
- Speichert neue Regression-Cases direkt in tests/evals/cases/<type>.ndjson. - Aus Report-Ergebnissen kannst du Prompt, History, Query oder Dokument-IDs als Vorlage übernehmen. -
-
-
- -
- - -
- - -
- -
- - -
- Erlaubt: Buchstaben, Zahlen, Unterstrich, Bindestrich. IDs müssen eindeutig sein. -
-
- -
- - -
- -
- - -
- Beispiel: expected_query, must_include_one_of_document_ids, must_not_include_terms. -
-
- -
- - -
- Für Follow-up-Cases: Liste vorheriger Chat-Turns mit prompt und answer. -
-
- -
- - -
- -
- - -
-
-
-
-
@@ -387,6 +299,13 @@
{% endif %} + + {% set historyRows = result.details.history|default([]) %} {% if historyRows is not empty %}
@@ -407,17 +326,6 @@
{% endif %} - - {{ result.duration_ms|default(0) }} ms @@ -595,173 +503,6 @@ }); } - const creator = document.getElementById('adminEvalCaseCreator'); - - function parseJsonData(value, fallback) { - if (!value) { - return fallback; - } - - try { - return JSON.parse(value); - } catch (error) { - return fallback; - } - } - - function slugifyPrompt(prompt) { - const normalized = (prompt || '') - .toLowerCase() - .normalize('NFD') - .replace(/[\u0300-\u036f]/g, '') - .replace(/ä/g, 'ae') - .replace(/ö/g, 'oe') - .replace(/ü/g, 'ue') - .replace(/ß/g, 'ss') - .replace(/[^a-z0-9]+/g, '_') - .replace(/^_+|_+$/g, '') - .slice(0, 44); - - return normalized || 'case'; - } - - function buildAssertTemplate(type, query, individualQueries, documentIds) { - if ((type === 'shop_query' || type === 'followup') && individualQueries.length > 0) { - return { - expected_individual_queries: individualQueries, - expected_individual_queries_exact: true - }; - } - - if ((type === 'shop_query' || type === 'followup') && query) { - return { - expected_query: query - }; - } - - if ((type === 'retrieval' || type === 'answer_guard') && documentIds.length > 0) { - return { - min_results: 1, - must_include_one_of_document_ids: [documentIds[0]] - }; - } - - if (type === 'answer_guard') { - return { - max_results: 0 - }; - } - - return { - min_results: 1 - }; - } - - function normalizeHistoryForForm(historyRows) { - return historyRows - .map(function (turn) { - return { - prompt: (turn.prompt || 'Eval-Kontext').trim(), - answer: (turn.answer || turn.response || turn.answer_preview || '').trim() - }; - }) - .filter(function (turn) { - return turn.prompt !== '' || turn.answer !== ''; - }); - } - - function fillCreatorFormFromResult(button) { - if (!creator) { - return; - } - - const type = button.dataset.resultType || 'retrieval'; - const prompt = button.dataset.resultPrompt || ''; - const history = normalizeHistoryForForm(parseJsonData(button.dataset.resultHistory, [])); - const query = 
button.dataset.resultQuery || ''; - const individualQueries = parseJsonData(button.dataset.resultIndividualQueries, []); - const documentIds = parseJsonData(button.dataset.resultDocumentIds, []); - const now = new Date(); - const suffix = String(now.getFullYear()).slice(2) - + String(now.getMonth() + 1).padStart(2, '0') - + String(now.getDate()).padStart(2, '0') - + '_' - + String(now.getHours()).padStart(2, '0') - + String(now.getMinutes()).padStart(2, '0') - + String(now.getSeconds()).padStart(2, '0'); - - const typeField = creator.querySelector('.js-admin-eval-create-type'); - const idField = creator.querySelector('.js-admin-eval-create-id'); - const promptField = creator.querySelector('.js-admin-eval-create-prompt'); - const assertField = creator.querySelector('.js-admin-eval-create-assert'); - const historyField = creator.querySelector('.js-admin-eval-create-history'); - const contextField = creator.querySelector('.js-admin-eval-create-context'); - - if (typeField) { - typeField.value = type; - } - - if (idField) { - idField.value = type + '_' + slugifyPrompt(prompt) + '_' + suffix; - } - - if (promptField) { - promptField.value = prompt; - } - - if (assertField) { - assertField.value = JSON.stringify( - buildAssertTemplate(type, query, individualQueries, documentIds), - null, - 2 - ); - } - - if (historyField) { - historyField.value = history.length > 0 ? 
JSON.stringify(history, null, 2) : ''; - } - - if (contextField) { - contextField.value = ''; - } - - creator.scrollIntoView({behavior: 'smooth', block: 'start'}); - } - - if (creator) { - creator.querySelectorAll('.js-admin-eval-create-clear').forEach(function (button) { - button.addEventListener('click', function () { - const idField = creator.querySelector('.js-admin-eval-create-id'); - const promptField = creator.querySelector('.js-admin-eval-create-prompt'); - const assertField = creator.querySelector('.js-admin-eval-create-assert'); - const historyField = creator.querySelector('.js-admin-eval-create-history'); - const contextField = creator.querySelector('.js-admin-eval-create-context'); - - if (idField) { - idField.value = ''; - } - if (promptField) { - promptField.value = ''; - } - if (assertField) { - assertField.value = '{\n "min_results": 1\n}'; - } - if (historyField) { - historyField.value = ''; - } - if (contextField) { - contextField.value = ''; - } - }); - }); - } - - document.querySelectorAll('.js-admin-eval-prefill-case').forEach(function (button) { - button.addEventListener('click', function () { - fillCreatorFormFromResult(button); - }); - }); - forms.forEach(function (form) { syncCaseSelect(form);