diff --git a/src/Command/AgentEvalRunCommand.php b/src/Command/AgentEvalRunCommand.php index d14f2fe..d34a67b 100644 --- a/src/Command/AgentEvalRunCommand.php +++ b/src/Command/AgentEvalRunCommand.php @@ -8,6 +8,7 @@ use App\Eval\AgentEvalRunner; use App\Eval\Dto\EvalCase; use App\Eval\Dto\EvalResult; use App\Eval\EvalCaseLoader; +use App\Eval\EvalReportWriter; use Symfony\Component\Console\Attribute\AsCommand; use Symfony\Component\Console\Command\Command; use Symfony\Component\Console\Input\InputArgument; @@ -25,6 +26,7 @@ final class AgentEvalRunCommand extends Command public function __construct( private readonly EvalCaseLoader $loader, private readonly AgentEvalRunner $runner, + private readonly EvalReportWriter $reportWriter, ) { parent::__construct(); } @@ -49,6 +51,12 @@ final class AgentEvalRunCommand extends Command null, InputOption::VALUE_NONE, 'Print the full report as JSON' + ) + ->addOption( + 'no-write', + null, + InputOption::VALUE_NONE, + 'Do not write the report file' ); } @@ -59,6 +67,7 @@ final class AgentEvalRunCommand extends Command $type = trim((string) $input->getArgument('type')); $caseId = trim((string) $input->getOption('case')); $asJson = (bool) $input->getOption('json'); + $noWrite = (bool) $input->getOption('no-write'); try { $cases = $this->loader->load($type); @@ -97,18 +106,38 @@ final class AgentEvalRunCommand extends Command $report = [ 'type' => $type, + 'case_filter' => $caseId !== '' ? $caseId : null, 'total' => count($results), 'passed' => $passed, 'failed' => $failed, + 'generated_at' => (new \DateTimeImmutable())->format(\DateTimeInterface::ATOM), 'results' => array_map( static fn (EvalResult $result): array => $result->toArray(), $results ), ]; + $writtenPath = null; + + if (!$noWrite) { + try { + $writtenPath = $this->reportWriter->write($report); + } catch (\Throwable $e) { + $io->error($e->getMessage()); + + return Command::FAILURE; + } + } + if ($asJson) { + $jsonReport = $report; + + if ($writtenPath !== null) { + $jsonReport['written_to'] = $writtenPath; + } + $json = json_encode( - $report, + $jsonReport, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE ); @@ -128,7 +157,8 @@ final class AgentEvalRunCommand extends Command ['type' => $type], ['total' => (string) count($results)], ['passed' => (string) $passed], - ['failed' => (string) $failed] + ['failed' => (string) $failed], + ['report_file' => $writtenPath ?? 'disabled (--no-write)'] ); foreach ($results as $result) { diff --git a/src/Eval/EvalReportWriter.php b/src/Eval/EvalReportWriter.php new file mode 100644 index 0000000..f889847 --- /dev/null +++ b/src/Eval/EvalReportWriter.php @@ -0,0 +1,59 @@ + $report + */ + public function write(array $report, string $filename = 'last-run.json'): string + { + $directory = sprintf('%s/tests/evals/reports', $this->projectDir); + + if (!is_dir($directory) && !mkdir($directory, 0775, true) && !is_dir($directory)) { + throw new \RuntimeException(sprintf( + 'Failed to create eval report directory: %s', + $directory + )); + } + + $path = sprintf('%s/%s', $directory, ltrim($filename, '/')); + + $json = json_encode( + $report, + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE + ); + + if (!is_string($json)) { + throw new \RuntimeException('json_encode failed for eval report.'); + } + + $tmpPath = $path . '.tmp'; + + if (file_put_contents($tmpPath, $json) === false) { + throw new \RuntimeException(sprintf( + 'Failed to write temporary eval report file: %s', + $tmpPath + )); + } + + if (!rename($tmpPath, $path)) { + @unlink($tmpPath); + + throw new \RuntimeException(sprintf( + 'Failed to move temporary eval report into place: %s', + $path + )); + } + + return $path; + } +} \ No newline at end of file diff --git a/src/Eval/RetrievalDebugRunner.php b/src/Eval/RetrievalDebugRunner.php index d5e606c..c07c28a 100644 --- a/src/Eval/RetrievalDebugRunner.php +++ b/src/Eval/RetrievalDebugRunner.php @@ -33,9 +33,13 @@ final readonly class RetrievalDebugRunner $documentIds = $this->extractUniqueStringValues($rows, 'document_id'); $chunkIds = $this->extractUniqueStringValues($rows, 'chunk_id'); + $joinedText = $this->extractJoinedText($rows); $assert = $case->assert; + // --------------------------------------------------------- + // Strict single-value assertions + // --------------------------------------------------------- if (isset($assert['selection_mode']) && (string) $assert['selection_mode'] !== $selectionMode) { $failures[] = sprintf( 'selection_mode mismatch: expected "%s", got "%s".', @@ -60,6 +64,33 @@ final readonly class RetrievalDebugRunner ); } + // --------------------------------------------------------- + // Flexible multi-value assertions + // --------------------------------------------------------- + $this->assertValueInList( + failures: $failures, + actual: $selectionMode, + expectedList: $assert['selection_mode_in'] ?? [], + label: 'selection_mode' + ); + + $this->assertValueInList( + failures: $failures, + actual: $route, + expectedList: $assert['route_in'] ?? [], + label: 'route' + ); + + $this->assertValueInList( + failures: $failures, + actual: $intent, + expectedList: $assert['intent_in'] ?? [], + label: 'intent' + ); + + // --------------------------------------------------------- + // Result count assertions + // --------------------------------------------------------- if (isset($assert['min_results']) && $resultCount < (int) $assert['min_results']) { $failures[] = sprintf( 'result_count too low: expected >= %d, got %d.', @@ -76,6 +107,9 @@ final readonly class RetrievalDebugRunner ); } + // --------------------------------------------------------- + // ID assertions + // --------------------------------------------------------- foreach ($this->normalizeStringList($assert['must_include_document_ids'] ?? []) as $expectedDocumentId) { if (!in_array($expectedDocumentId, $documentIds, true)) { $failures[] = sprintf( @@ -94,6 +128,65 @@ final readonly class RetrievalDebugRunner } } + $this->assertContainsAtLeastOne( + failures: $failures, + actualValues: $documentIds, + expectedList: $assert['must_include_one_of_document_ids'] ?? [], + label: 'document_id' + ); + + $this->assertContainsAtLeastOne( + failures: $failures, + actualValues: $chunkIds, + expectedList: $assert['must_include_one_of_chunk_ids'] ?? [], + label: 'chunk_id' + ); + + $this->assertContainsNone( + failures: $failures, + actualValues: $documentIds, + forbiddenList: $assert['must_not_include_document_ids'] ?? [], + label: 'document_id' + ); + + $this->assertContainsNone( + failures: $failures, + actualValues: $chunkIds, + forbiddenList: $assert['must_not_include_chunk_ids'] ?? [], + label: 'chunk_id' + ); + + // --------------------------------------------------------- + // Text / term assertions + // --------------------------------------------------------- + $matchedAnyTerms = $this->findMatchingTerms( + haystack: $joinedText, + terms: $this->normalizeStringList($assert['must_include_any_terms'] ?? []) + ); + + $matchedAllTerms = $this->findMatchingTerms( + haystack: $joinedText, + terms: $this->normalizeStringList($assert['must_include_all_terms'] ?? []) + ); + + $requiredAnyTerms = $this->normalizeStringList($assert['must_include_any_terms'] ?? []); + if ($requiredAnyTerms !== [] && $matchedAnyTerms === []) { + $failures[] = sprintf( + 'none of the required any-terms were found in the retrieval text: [%s].', + implode(', ', $requiredAnyTerms) + ); + } + + $requiredAllTerms = $this->normalizeStringList($assert['must_include_all_terms'] ?? []); + foreach ($requiredAllTerms as $requiredTerm) { + if (!$this->containsTerm($joinedText, $requiredTerm)) { + $failures[] = sprintf( + 'required all-term "%s" was not found in the retrieval text.', + $requiredTerm + ); + } + } + return new EvalResult( caseId: $case->id, type: $case->type, @@ -108,6 +201,8 @@ final readonly class RetrievalDebugRunner 'intent' => $intent, 'document_ids' => $documentIds, 'chunk_ids' => $chunkIds, + 'matched_any_terms' => $matchedAnyTerms, + 'matched_all_terms' => $matchedAllTerms, ], ); } @@ -153,6 +248,161 @@ final readonly class RetrievalDebugRunner return array_keys($values); } + /** + * @param array> $rows + */ + private function extractJoinedText(array $rows): string + { + $parts = []; + + foreach ($rows as $row) { + $text = $row['text'] ?? null; + + if (!is_string($text)) { + continue; + } + + $text = trim($text); + + if ($text === '') { + continue; + } + + $parts[] = $text; + } + + return implode("\n\n", $parts); + } + + /** + * @param array $failures + * @param mixed $expectedList + */ + private function assertValueInList( + array &$failures, + string $actual, + mixed $expectedList, + string $label + ): void { + $expected = $this->normalizeStringList($expectedList); + + if ($expected === []) { + return; + } + + if (!in_array($actual, $expected, true)) { + $failures[] = sprintf( + '%s mismatch: expected one of [%s], got "%s".', + $label, + implode(', ', $expected), + $actual + ); + } + } + + /** + * @param array $failures + * @param array $actualValues + * @param mixed $expectedList + */ + private function assertContainsAtLeastOne( + array &$failures, + array $actualValues, + mixed $expectedList, + string $label + ): void { + $expected = $this->normalizeStringList($expectedList); + + if ($expected === []) { + return; + } + + foreach ($expected as $candidate) { + if (in_array($candidate, $actualValues, true)) { + return; + } + } + + $failures[] = sprintf( + 'none of the expected %s values were found. Expected one of [%s], got [%s].', + $label, + implode(', ', $expected), + implode(', ', $actualValues) + ); + } + + /** + * @param array $failures + * @param array $actualValues + * @param mixed $forbiddenList + */ + private function assertContainsNone( + array &$failures, + array $actualValues, + mixed $forbiddenList, + string $label + ): void { + $forbidden = $this->normalizeStringList($forbiddenList); + + if ($forbidden === []) { + return; + } + + foreach ($forbidden as $forbiddenValue) { + if (in_array($forbiddenValue, $actualValues, true)) { + $failures[] = sprintf( + 'forbidden %s "%s" was present in the retrieval results.', + $label, + $forbiddenValue + ); + } + } + } + + /** + * @param array $terms + * @return array + */ + private function findMatchingTerms(string $haystack, array $terms): array + { + $matches = []; + + foreach ($terms as $term) { + if ($this->containsTerm($haystack, $term)) { + $matches[] = $term; + } + } + + return array_values(array_unique($matches)); + } + + private function containsTerm(string $haystack, string $term): bool + { + $haystack = $this->normalizeText($haystack); + $term = $this->normalizeText($term); + + if ($term === '') { + return false; + } + + return str_contains($haystack, $term); + } + + private function normalizeText(string $value): string + { + $value = trim($value); + + if ($value === '') { + return ''; + } + + if (function_exists('mb_strtolower')) { + return mb_strtolower($value); + } + + return strtolower($value); + } + /** * @param mixed $value * @return array diff --git a/tests/evals/cases/retrieval.ndjson b/tests/evals/cases/retrieval.ndjson index 0f9efa3..629e1aa 100644 --- a/tests/evals/cases/retrieval.ndjson +++ b/tests/evals/cases/retrieval.ndjson @@ -1,4 +1,117 @@ -{"id":"retrieval_exact_doc_001","type":"retrieval","prompt":"Testomat 808","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["26129c01-c09f-4c71-9c80-7ddffb6c77fb"]}} -{"id":"retrieval_exact_doc_002","type":"retrieval","prompt":"Testomat EVO CALC","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["74fdad85-5e4e-4f08-8d95-402f3180ed55"]}} -{"id":"retrieval_exact_doc_003","type":"retrieval","prompt":"Wasserhärte Grenzwert Testomat","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["60706498-867b-41b8-8e76-63248178d265"]}} -{"id":"retrieval_noise_001","type":"retrieval","prompt":"dsgfsdgfsdgf","assert":{"max_results":0}} \ No newline at end of file +{ + "id": "retrieval_exact_doc_001", + "type": "retrieval", + "prompt": "phaseaaudit-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "5914508a-5930-4f04-892b-323881d0daa7" + ], + "must_include_any_terms": [ + "enterprise", + "governance", + "vector-service" + ] + } +} +{ + "id": "retrieval_exact_doc_002", + "type": "retrieval", + "prompt": "ragsystemoverview-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "7513fd82-eec6-4bfa-a730-41820b38b6b4" + ], + "must_include_all_terms": [ + "rag-system", + "dokumente" + ] + } +} +{ + "id": "retrieval_exact_doc_003", + "type": "retrieval", + "prompt": "matrixparams-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "25276f4c-32bb-47a5-98b3-9d81aa722d2b" + ], + "must_include_any_terms": [ + "retrievalmaxchunks", + "retrievalvectortopk", + "hard_max_chunks" + ] + } +} +{ + "id": "retrieval_exact_doc_004", + "type": "retrieval", + "prompt": "readme-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "8abe1f0d-54e6-41ad-967a-9ce8a0efc6da" + ], + "must_include_any_terms": [ + "deterministisches", + "faiss", + "vector-service" + ] + } +} +{ + "id": "retrieval_semantic_001", + "type": "retrieval", + "prompt": "wie funktioniert das system", + "assert": { + "min_results": 1, + "must_include_one_of_document_ids": [ + "7513fd82-eec6-4bfa-a730-41820b38b6b4", + "8abe1f0d-54e6-41ad-967a-9ce8a0efc6da" + ], + "must_include_any_terms": [ + "rag-system", + "dokumente", + "indexierung" + ] + } +} +{ + "id": "retrieval_semantic_002", + "type": "retrieval", + "prompt": "welche parameter beeinflussen retrieval", + "assert": { + "min_results": 1, + "must_include_one_of_document_ids": [ + "25276f4c-32bb-47a5-98b3-9d81aa722d2b", + "7513fd82-eec6-4bfa-a730-41820b38b6b4" + ], + "must_include_any_terms": [ + "retrievalmaxchunks", + "vectortopk", + "chunk" + ] + } +} +{ + "id": "retrieval_noise_001", + "type": "retrieval", + "prompt": "dsgfsdgfsdgf", + "assert": { + "max_results": 0 + } +} \ No newline at end of file