From 8127d335717d7c5da6d736c2c5843f4cb729e9b9 Mon Sep 17 00:00:00 2001 From: team2 Date: Wed, 22 Apr 2026 22:03:23 +0200 Subject: [PATCH] first test suite retrieval --- src/Command/AgentEvalRunCommand.php | 149 ++++++++++++++++++++++ src/Eval/AgentEvalRunner.php | 43 +++++++ src/Eval/Dto/EvalCase.php | 60 +++++++++ src/Eval/Dto/EvalResult.php | 37 ++++++ src/Eval/EvalCaseLoader.php | 67 ++++++++++ src/Eval/RetrievalDebugRunner.php | 184 ++++++++++++++++++++++++++++ tests/evals/cases/retrieval.ndjson | 4 + tests/evals/reports/.gitignore | 2 + 8 files changed, 546 insertions(+) create mode 100644 src/Command/AgentEvalRunCommand.php create mode 100644 src/Eval/AgentEvalRunner.php create mode 100644 src/Eval/Dto/EvalCase.php create mode 100644 src/Eval/Dto/EvalResult.php create mode 100644 src/Eval/EvalCaseLoader.php create mode 100644 src/Eval/RetrievalDebugRunner.php create mode 100644 tests/evals/cases/retrieval.ndjson create mode 100644 tests/evals/reports/.gitignore diff --git a/src/Command/AgentEvalRunCommand.php b/src/Command/AgentEvalRunCommand.php new file mode 100644 index 0000000..d14f2fe --- /dev/null +++ b/src/Command/AgentEvalRunCommand.php @@ -0,0 +1,149 @@ +addArgument( + 'type', + InputArgument::OPTIONAL, + 'Eval type to run', + 'retrieval' + ) + ->addOption( + 'case', + null, + InputOption::VALUE_OPTIONAL, + 'Run only a single case by id' + ) + ->addOption( + 'json', + null, + InputOption::VALUE_NONE, + 'Print the full report as JSON' + ); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $io = new SymfonyStyle($input, $output); + + $type = trim((string) $input->getArgument('type')); + $caseId = trim((string) $input->getOption('case')); + $asJson = (bool) $input->getOption('json'); + + try { + $cases = $this->loader->load($type); + } catch (\Throwable $e) { + $io->error($e->getMessage()); + + return Command::FAILURE; + } + + if ($caseId !== '') { + $cases = array_values(array_filter( + $cases, + static fn (EvalCase $case): bool => $case->id === $caseId + )); + } + + if ($cases === []) { + $io->warning('No eval cases selected.'); + + return Command::SUCCESS; + } + + try { + $results = $this->runner->runAll($cases); + } catch (\Throwable $e) { + $io->error($e->getMessage()); + + return Command::FAILURE; + } + + $passed = count(array_filter( + $results, + static fn (EvalResult $result): bool => $result->passed + )); + $failed = count($results) - $passed; + + $report = [ + 'type' => $type, + 'total' => count($results), + 'passed' => $passed, + 'failed' => $failed, + 'results' => array_map( + static fn (EvalResult $result): array => $result->toArray(), + $results + ), + ]; + + if ($asJson) { + $json = json_encode( + $report, + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE + ); + + if (!is_string($json)) { + $io->error('json_encode failed.'); + + return Command::FAILURE; + } + + $output->writeln($json); + + return $failed > 0 ? Command::FAILURE : Command::SUCCESS; + } + + $io->title('RetrieX Eval Run'); + $io->definitionList( + ['type' => $type], + ['total' => (string) count($results)], + ['passed' => (string) $passed], + ['failed' => (string) $failed] + ); + + foreach ($results as $result) { + if ($result->passed) { + $io->writeln(sprintf('PASS %s', $result->caseId)); + continue; + } + + $io->writeln(sprintf('FAIL %s', $result->caseId)); + + foreach ($result->failures as $failure) { + $io->writeln(' - ' . $failure); + } + } + + return $failed > 0 ? Command::FAILURE : Command::SUCCESS; + } +} \ No newline at end of file diff --git a/src/Eval/AgentEvalRunner.php b/src/Eval/AgentEvalRunner.php new file mode 100644 index 0000000..1358357 --- /dev/null +++ b/src/Eval/AgentEvalRunner.php @@ -0,0 +1,43 @@ +isRetrievalCase()) { + return $this->retrievalDebugRunner->run($case); + } + + throw new \InvalidArgumentException(sprintf( + 'Unsupported eval case type: %s', + $case->type + )); + } + + /** + * @param array $cases + * @return array + */ + public function runAll(array $cases): array + { + $results = []; + + foreach ($cases as $case) { + $results[] = $this->run($case); + } + + return $results; + } +} \ No newline at end of file diff --git a/src/Eval/Dto/EvalCase.php b/src/Eval/Dto/EvalCase.php new file mode 100644 index 0000000..e5ce645 --- /dev/null +++ b/src/Eval/Dto/EvalCase.php @@ -0,0 +1,60 @@ + $assert + */ + public function __construct( + public string $id, + public string $type, + public string $prompt, + public array $assert = [], + ) { + } + + /** + * @param array $row + */ + public static function fromArray(array $row): self + { + $id = trim((string) ($row['id'] ?? '')); + $type = trim((string) ($row['type'] ?? '')); + $prompt = trim((string) ($row['prompt'] ?? '')); + $assert = is_array($row['assert'] ?? null) ? $row['assert'] : []; + + if ($id === '') { + throw new \InvalidArgumentException('Eval case id must not be empty.'); + } + + if ($type === '') { + throw new \InvalidArgumentException(sprintf( + 'Eval case "%s" has an empty type.', + $id + )); + } + + if ($prompt === '') { + throw new \InvalidArgumentException(sprintf( + 'Eval case "%s" has an empty prompt.', + $id + )); + } + + return new self( + id: $id, + type: $type, + prompt: $prompt, + assert: $assert, + ); + } + + public function isRetrievalCase(): bool + { + return $this->type === 'retrieval'; + } +} \ No newline at end of file diff --git a/src/Eval/Dto/EvalResult.php b/src/Eval/Dto/EvalResult.php new file mode 100644 index 0000000..7215941 --- /dev/null +++ b/src/Eval/Dto/EvalResult.php @@ -0,0 +1,37 @@ + $failures + * @param array $details + */ + public function __construct( + public string $caseId, + public string $type, + public bool $passed, + public float $durationMs, + public array $failures = [], + public array $details = [], + ) { + } + + /** + * @return array + */ + public function toArray(): array + { + return [ + 'case_id' => $this->caseId, + 'type' => $this->type, + 'passed' => $this->passed, + 'duration_ms' => $this->durationMs, + 'failures' => $this->failures, + 'details' => $this->details, + ]; + } +} \ No newline at end of file diff --git a/src/Eval/EvalCaseLoader.php b/src/Eval/EvalCaseLoader.php new file mode 100644 index 0000000..e71259b --- /dev/null +++ b/src/Eval/EvalCaseLoader.php @@ -0,0 +1,67 @@ + + */ + public function load(string $type = 'retrieval'): array + { + $path = sprintf( + '%s/tests/evals/cases/%s.ndjson', + $this->projectDir, + $type + ); + + if (!is_file($path)) { + throw new \RuntimeException(sprintf( + 'Eval case file not found: %s', + $path + )); + } + + $lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + + if ($lines === false) { + throw new \RuntimeException(sprintf( + 'Failed to read eval case file: %s', + $path + )); + } + + $cases = []; + + foreach ($lines as $lineNumber => $line) { + $line = trim($line); + + if ($line === '') { + continue; + } + + $decoded = json_decode($line, true); + + if (!is_array($decoded)) { + throw new \RuntimeException(sprintf( + 'Invalid JSON in %s on line %d.', + $path, + $lineNumber + 1 + )); + } + + $cases[] = EvalCase::fromArray($decoded); + } + + return $cases; + } +} \ No newline at end of file diff --git a/src/Eval/RetrievalDebugRunner.php b/src/Eval/RetrievalDebugRunner.php new file mode 100644 index 0000000..d5e606c --- /dev/null +++ b/src/Eval/RetrievalDebugRunner.php @@ -0,0 +1,184 @@ +retriever->retrieveDebug($case->prompt); + + $durationMs = round((microtime(true) - $start) * 1000, 2); + + $resultCount = count($rows); + $first = $rows[0] ?? []; + + $selectionMode = $this->extractString($first, 'selection_mode'); + $route = $this->extractString($first, 'route'); + $intent = $this->extractString($first, 'intent'); + + $documentIds = $this->extractUniqueStringValues($rows, 'document_id'); + $chunkIds = $this->extractUniqueStringValues($rows, 'chunk_id'); + + $assert = $case->assert; + + if (isset($assert['selection_mode']) && (string) $assert['selection_mode'] !== $selectionMode) { + $failures[] = sprintf( + 'selection_mode mismatch: expected "%s", got "%s".', + (string) $assert['selection_mode'], + $selectionMode + ); + } + + if (isset($assert['route']) && (string) $assert['route'] !== $route) { + $failures[] = sprintf( + 'route mismatch: expected "%s", got "%s".', + (string) $assert['route'], + $route + ); + } + + if (isset($assert['intent']) && (string) $assert['intent'] !== $intent) { + $failures[] = sprintf( + 'intent mismatch: expected "%s", got "%s".', + (string) $assert['intent'], + $intent + ); + } + + if (isset($assert['min_results']) && $resultCount < (int) $assert['min_results']) { + $failures[] = sprintf( + 'result_count too low: expected >= %d, got %d.', + (int) $assert['min_results'], + $resultCount + ); + } + + if (isset($assert['max_results']) && $resultCount > (int) $assert['max_results']) { + $failures[] = sprintf( + 'result_count too high: expected <= %d, got %d.', + (int) $assert['max_results'], + $resultCount + ); + } + + foreach ($this->normalizeStringList($assert['must_include_document_ids'] ?? []) as $expectedDocumentId) { + if (!in_array($expectedDocumentId, $documentIds, true)) { + $failures[] = sprintf( + 'missing expected document_id "%s".', + $expectedDocumentId + ); + } + } + + foreach ($this->normalizeStringList($assert['must_include_chunk_ids'] ?? []) as $expectedChunkId) { + if (!in_array($expectedChunkId, $chunkIds, true)) { + $failures[] = sprintf( + 'missing expected chunk_id "%s".', + $expectedChunkId + ); + } + } + + return new EvalResult( + caseId: $case->id, + type: $case->type, + passed: $failures === [], + durationMs: $durationMs, + failures: $failures, + details: [ + 'prompt' => $case->prompt, + 'result_count' => $resultCount, + 'selection_mode' => $selectionMode, + 'route' => $route, + 'intent' => $intent, + 'document_ids' => $documentIds, + 'chunk_ids' => $chunkIds, + ], + ); + } + + /** + * @param array $row + */ + private function extractString(array $row, string $key): string + { + $value = $row[$key] ?? null; + + if (!is_string($value)) { + return ''; + } + + return trim($value); + } + + /** + * @param array> $rows + * @return array + */ + private function extractUniqueStringValues(array $rows, string $key): array + { + $values = []; + + foreach ($rows as $row) { + $value = $row[$key] ?? null; + + if (!is_string($value)) { + continue; + } + + $value = trim($value); + + if ($value === '') { + continue; + } + + $values[$value] = true; + } + + return array_keys($values); + } + + /** + * @param mixed $value + * @return array + */ + private function normalizeStringList(mixed $value): array + { + if (!is_array($value)) { + return []; + } + + $out = []; + + foreach ($value as $item) { + if (!is_string($item)) { + continue; + } + + $item = trim($item); + + if ($item === '') { + continue; + } + + $out[] = $item; + } + + return array_values(array_unique($out)); + } +} \ No newline at end of file diff --git a/tests/evals/cases/retrieval.ndjson b/tests/evals/cases/retrieval.ndjson new file mode 100644 index 0000000..0f9efa3 --- /dev/null +++ b/tests/evals/cases/retrieval.ndjson @@ -0,0 +1,4 @@ +{"id":"retrieval_exact_doc_001","type":"retrieval","prompt":"Testomat 808","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["26129c01-c09f-4c71-9c80-7ddffb6c77fb"]}} +{"id":"retrieval_exact_doc_002","type":"retrieval","prompt":"Testomat EVO CALC","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["74fdad85-5e4e-4f08-8d95-402f3180ed55"]}} +{"id":"retrieval_exact_doc_003","type":"retrieval","prompt":"Wasserhärte Grenzwert Testomat","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["60706498-867b-41b8-8e76-63248178d265"]}} +{"id":"retrieval_noise_001","type":"retrieval","prompt":"dsgfsdgfsdgf","assert":{"max_results":0}} \ No newline at end of file diff --git a/tests/evals/reports/.gitignore b/tests/evals/reports/.gitignore new file mode 100644 index 0000000..c96a04f --- /dev/null +++ b/tests/evals/reports/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file