diff --git a/src/Command/AgentEvalRunCommand.php b/src/Command/AgentEvalRunCommand.php
new file mode 100644
index 0000000..d14f2fe
--- /dev/null
+++ b/src/Command/AgentEvalRunCommand.php
@@ -0,0 +1,149 @@
+addArgument(
+ 'type',
+ InputArgument::OPTIONAL,
+ 'Eval type to run',
+ 'retrieval'
+ )
+ ->addOption(
+ 'case',
+ null,
+ InputOption::VALUE_OPTIONAL,
+ 'Run only a single case by id'
+ )
+ ->addOption(
+ 'json',
+ null,
+ InputOption::VALUE_NONE,
+ 'Print the full report as JSON'
+ );
+ }
+
+    /**
+     * Loads the eval cases for the requested type, runs them and reports the outcome.
+     *
+     * Reads the `type` argument and the `case` / `json` options. With `--json` the report
+     * is written as pretty-printed JSON; otherwise a summary is printed, followed by one
+     * PASS/FAIL line per case and the failure messages of every failed case.
+     *
+     * @return int Command::SUCCESS when every executed case passed, or when the loaded
+     *             case list is empty and no `--case` filter was given; Command::FAILURE
+     *             when loading or running throws, when the id passed via `--case` matches
+     *             no case, when JSON encoding fails, or when at least one case failed.
+     */
+    protected function execute(InputInterface $input, OutputInterface $output): int
+    {
+        $io = new SymfonyStyle($input, $output);
+
+        $type = trim((string) $input->getArgument('type'));
+        $caseId = trim((string) $input->getOption('case'));
+        $asJson = (bool) $input->getOption('json');
+
+        try {
+            $cases = $this->loader->load($type);
+        } catch (\Throwable $e) {
+            $io->error($e->getMessage());
+
+            return Command::FAILURE;
+        }
+
+        if ($caseId !== '') {
+            $cases = array_values(array_filter(
+                $cases,
+                static fn (EvalCase $case): bool => $case->id === $caseId
+            ));
+
+            // An explicitly requested id that matches nothing is most likely a typo.
+            // Exiting with success here would let an automated run go green without
+            // having executed a single eval, so this is reported as a failure.
+            if ($cases === []) {
+                $io->error(sprintf(
+                    'No eval case with id "%s" found for type "%s".',
+                    $caseId,
+                    $type
+                ));
+
+                return Command::FAILURE;
+            }
+        }
+
+        // Only reachable without a --case filter: the case file itself holds no cases.
+        if ($cases === []) {
+            $io->warning('No eval cases selected.');
+
+            return Command::SUCCESS;
+        }
+
+        try {
+            $results = $this->runner->runAll($cases);
+        } catch (\Throwable $e) {
+            $io->error($e->getMessage());
+
+            return Command::FAILURE;
+        }
+
+        $passed = count(array_filter(
+            $results,
+            static fn (EvalResult $result): bool => $result->passed
+        ));
+        $failed = count($results) - $passed;
+
+        // The exit code mirrors the overall verdict in both output modes.
+        $exitCode = $failed > 0 ? Command::FAILURE : Command::SUCCESS;
+
+        $report = [
+            'type' => $type,
+            'total' => count($results),
+            'passed' => $passed,
+            'failed' => $failed,
+            'results' => array_map(
+                static fn (EvalResult $result): array => $result->toArray(),
+                $results
+            ),
+        ];
+
+        if ($asJson) {
+            $json = json_encode(
+                $report,
+                JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
+            );
+
+            if (!is_string($json)) {
+                $io->error('json_encode failed.');
+
+                return Command::FAILURE;
+            }
+
+            // Written through the raw output so the JSON is not decorated by SymfonyStyle.
+            $output->writeln($json);
+
+            return $exitCode;
+        }
+
+        $io->title('RetrieX Eval Run');
+        $io->definitionList(
+            ['type' => $type],
+            ['total' => (string) count($results)],
+            ['passed' => (string) $passed],
+            ['failed' => (string) $failed]
+        );
+
+        foreach ($results as $result) {
+            if ($result->passed) {
+                $io->writeln(sprintf('PASS %s', $result->caseId));
+                continue;
+            }
+
+            $io->writeln(sprintf('FAIL %s', $result->caseId));
+
+            foreach ($result->failures as $failure) {
+                $io->writeln(' - ' . $failure);
+            }
+        }
+
+        return $exitCode;
+    }
+}
\ No newline at end of file
diff --git a/src/Eval/AgentEvalRunner.php b/src/Eval/AgentEvalRunner.php
new file mode 100644
index 0000000..1358357
--- /dev/null
+++ b/src/Eval/AgentEvalRunner.php
@@ -0,0 +1,43 @@
+isRetrievalCase()) {
+ return $this->retrievalDebugRunner->run($case);
+ }
+
+ throw new \InvalidArgumentException(sprintf(
+ 'Unsupported eval case type: %s',
+ $case->type
+ ));
+ }
+
+    /**
+     * Runs every given case through run() and collects the results in input order.
+     *
+     * Any exception thrown by run() aborts the batch and propagates to the caller.
+     *
+     * @param array<array-key, EvalCase> $cases
+     * @return list<EvalResult>
+     */
+    public function runAll(array $cases): array
+    {
+        // array_map keeps the input keys, so re-index to return a plain list.
+        return array_values(array_map(
+            fn ($case) => $this->run($case),
+            $cases
+        ));
+    }
+}
\ No newline at end of file
diff --git a/src/Eval/Dto/EvalCase.php b/src/Eval/Dto/EvalCase.php
new file mode 100644
index 0000000..e5ce645
--- /dev/null
+++ b/src/Eval/Dto/EvalCase.php
@@ -0,0 +1,60 @@
+ $assert
+ */
+ public function __construct(
+ public string $id, // unique case identifier; the command's --case option filters on it
+ public string $type, // eval type; isRetrievalCase() compares it against 'retrieval'
+ public string $prompt, // query text that RetrievalDebugRunner hands to the retriever
+ public array $assert = [], // expectations checked against the run result; empty means none
+ ) {
+ }
+
+    /**
+     * Builds a case from one decoded row of an eval case file.
+     *
+     * `id`, `type` and `prompt` are cast to string and trimmed; `assert` is optional and
+     * defaults to an empty array when absent or null.
+     *
+     * @param array<string, mixed> $row
+     *
+     * @throws \InvalidArgumentException when id, type or prompt is missing, empty or not a
+     *                                   scalar value, or when `assert` is present but is
+     *                                   not an array
+     */
+    public static function fromArray(array $row): self
+    {
+        // Casting an array to string yields the literal "Array" (plus a warning), which
+        // would pass the emptiness checks below, so non-scalar values are rejected first.
+        foreach (['id', 'type', 'prompt'] as $field) {
+            if (isset($row[$field]) && !is_scalar($row[$field])) {
+                throw new \InvalidArgumentException(sprintf(
+                    'Eval case field "%s" must be a scalar value.',
+                    $field
+                ));
+            }
+        }
+
+        $id = trim((string) ($row['id'] ?? ''));
+        $type = trim((string) ($row['type'] ?? ''));
+        $prompt = trim((string) ($row['prompt'] ?? ''));
+
+        if ($id === '') {
+            throw new \InvalidArgumentException('Eval case id must not be empty.');
+        }
+
+        if ($type === '') {
+            throw new \InvalidArgumentException(sprintf(
+                'Eval case "%s" has an empty type.',
+                $id
+            ));
+        }
+
+        if ($prompt === '') {
+            throw new \InvalidArgumentException(sprintf(
+                'Eval case "%s" has an empty prompt.',
+                $id
+            ));
+        }
+
+        $assert = $row['assert'] ?? [];
+
+        // Silently treating a malformed `assert` as "no expectations" would make the
+        // case pass without checking anything, so it is reported instead.
+        if (!is_array($assert)) {
+            throw new \InvalidArgumentException(sprintf(
+                'Eval case "%s" has an invalid assert block; expected an object.',
+                $id
+            ));
+        }
+
+        return new self(
+            id: $id,
+            type: $type,
+            prompt: $prompt,
+            assert: $assert,
+        );
+    }
+
+    /**
+     * Tells whether this case has the eval type 'retrieval'.
+     */
+    public function isRetrievalCase(): bool
+    {
+        return 'retrieval' === $this->type;
+    }
+}
\ No newline at end of file
diff --git a/src/Eval/Dto/EvalResult.php b/src/Eval/Dto/EvalResult.php
new file mode 100644
index 0000000..7215941
--- /dev/null
+++ b/src/Eval/Dto/EvalResult.php
@@ -0,0 +1,37 @@
+ $failures
+ * @param array $details
+ */
+ public function __construct(
+ public string $caseId, // id of the eval case this result belongs to
+ public string $type, // eval type of that case
+ public bool $passed, // RetrievalDebugRunner sets this to true when $failures is empty
+ public float $durationMs, // elapsed time in milliseconds, as measured by the runner
+ public array $failures = [], // human-readable messages, one per failed expectation
+ public array $details = [], // extra diagnostic data; exported unchanged by toArray()
+ ) {
+ }
+
+    /**
+     * Exports the result as a plain array with snake_case keys.
+     *
+     * @return array{case_id: string, type: string, passed: bool, duration_ms: float, failures: array, details: array}
+     */
+    public function toArray(): array
+    {
+        $export = [];
+
+        // Keys are assigned in the order they appear in the serialized report.
+        $export['case_id'] = $this->caseId;
+        $export['type'] = $this->type;
+        $export['passed'] = $this->passed;
+        $export['duration_ms'] = $this->durationMs;
+        $export['failures'] = $this->failures;
+        $export['details'] = $this->details;
+
+        return $export;
+    }
+}
\ No newline at end of file
diff --git a/src/Eval/EvalCaseLoader.php b/src/Eval/EvalCaseLoader.php
new file mode 100644
index 0000000..e71259b
--- /dev/null
+++ b/src/Eval/EvalCaseLoader.php
@@ -0,0 +1,67 @@
+
+ */
+    public function load(string $type = 'retrieval'): array
+    {
+        // Reads <projectDir>/tests/evals/cases/<type>.ndjson and turns every non-blank
+        // line (one JSON object per line) into an EvalCase, in file order.
+        // Throws \RuntimeException when the file is missing, unreadable or contains a
+        // line that does not decode to an array; EvalCase::fromArray() may throw as well.
+        $path = sprintf(
+            '%s/tests/evals/cases/%s.ndjson',
+            $this->projectDir,
+            $type
+        );
+
+        if (!is_file($path)) {
+            throw new \RuntimeException(sprintf(
+                'Eval case file not found: %s',
+                $path
+            ));
+        }
+
+        // Deliberately without FILE_SKIP_EMPTY_LINES: dropping blank lines up front
+        // shifts the array indexes, so the line number reported for invalid JSON would
+        // no longer match the file. Blank lines are skipped inside the loop instead.
+        $lines = file($path, FILE_IGNORE_NEW_LINES);
+
+        if ($lines === false) {
+            throw new \RuntimeException(sprintf(
+                'Failed to read eval case file: %s',
+                $path
+            ));
+        }
+
+        $cases = [];
+
+        foreach ($lines as $lineNumber => $line) {
+            $line = trim($line);
+
+            if ($line === '') {
+                continue;
+            }
+
+            $decoded = json_decode($line, true);
+
+            if (!is_array($decoded)) {
+                throw new \RuntimeException(sprintf(
+                    'Invalid JSON in %s on line %d.',
+                    $path,
+                    $lineNumber + 1 // file() indexes from 0, editors count from 1
+                ));
+            }
+
+            $cases[] = EvalCase::fromArray($decoded);
+        }
+
+        return $cases;
+    }
+}
\ No newline at end of file
diff --git a/src/Eval/RetrievalDebugRunner.php b/src/Eval/RetrievalDebugRunner.php
new file mode 100644
index 0000000..d5e606c
--- /dev/null
+++ b/src/Eval/RetrievalDebugRunner.php
@@ -0,0 +1,184 @@
+retriever->retrieveDebug($case->prompt);
+
+ $durationMs = round((microtime(true) - $start) * 1000, 2);
+
+ $resultCount = count($rows);
+ $first = $rows[0] ?? [];
+
+ $selectionMode = $this->extractString($first, 'selection_mode');
+ $route = $this->extractString($first, 'route');
+ $intent = $this->extractString($first, 'intent');
+
+ $documentIds = $this->extractUniqueStringValues($rows, 'document_id');
+ $chunkIds = $this->extractUniqueStringValues($rows, 'chunk_id');
+
+ $assert = $case->assert;
+
+ if (isset($assert['selection_mode']) && (string) $assert['selection_mode'] !== $selectionMode) {
+ $failures[] = sprintf(
+ 'selection_mode mismatch: expected "%s", got "%s".',
+ (string) $assert['selection_mode'],
+ $selectionMode
+ );
+ }
+
+ if (isset($assert['route']) && (string) $assert['route'] !== $route) {
+ $failures[] = sprintf(
+ 'route mismatch: expected "%s", got "%s".',
+ (string) $assert['route'],
+ $route
+ );
+ }
+
+ if (isset($assert['intent']) && (string) $assert['intent'] !== $intent) {
+ $failures[] = sprintf(
+ 'intent mismatch: expected "%s", got "%s".',
+ (string) $assert['intent'],
+ $intent
+ );
+ }
+
+ if (isset($assert['min_results']) && $resultCount < (int) $assert['min_results']) {
+ $failures[] = sprintf(
+ 'result_count too low: expected >= %d, got %d.',
+ (int) $assert['min_results'],
+ $resultCount
+ );
+ }
+
+ if (isset($assert['max_results']) && $resultCount > (int) $assert['max_results']) {
+ $failures[] = sprintf(
+ 'result_count too high: expected <= %d, got %d.',
+ (int) $assert['max_results'],
+ $resultCount
+ );
+ }
+
+ foreach ($this->normalizeStringList($assert['must_include_document_ids'] ?? []) as $expectedDocumentId) {
+ if (!in_array($expectedDocumentId, $documentIds, true)) {
+ $failures[] = sprintf(
+ 'missing expected document_id "%s".',
+ $expectedDocumentId
+ );
+ }
+ }
+
+ foreach ($this->normalizeStringList($assert['must_include_chunk_ids'] ?? []) as $expectedChunkId) {
+ if (!in_array($expectedChunkId, $chunkIds, true)) {
+ $failures[] = sprintf(
+ 'missing expected chunk_id "%s".',
+ $expectedChunkId
+ );
+ }
+ }
+
+ return new EvalResult(
+ caseId: $case->id,
+ type: $case->type,
+ passed: $failures === [],
+ durationMs: $durationMs,
+ failures: $failures,
+ details: [
+ 'prompt' => $case->prompt,
+ 'result_count' => $resultCount,
+ 'selection_mode' => $selectionMode,
+ 'route' => $route,
+ 'intent' => $intent,
+ 'document_ids' => $documentIds,
+ 'chunk_ids' => $chunkIds,
+ ],
+ );
+ }
+
+    /**
+     * Reads a string field from a row and trims it.
+     *
+     * @param array<string, mixed> $row
+     * @return string the trimmed value, or '' when the key is missing or not a string
+     */
+    private function extractString(array $row, string $key): string
+    {
+        $candidate = $row[$key] ?? null;
+
+        return is_string($candidate) ? trim($candidate) : '';
+    }
+
+    /**
+     * Collects the distinct, non-empty string values stored under $key across all rows.
+     *
+     * Values are trimmed; rows where the key is missing or holds a non-string are
+     * ignored. The order of first occurrence is preserved.
+     *
+     * @param array<int, array<string, mixed>> $rows
+     * @return list<string>
+     */
+    private function extractUniqueStringValues(array $rows, string $key): array
+    {
+        $values = [];
+
+        foreach ($rows as $row) {
+            $value = $row[$key] ?? null;
+
+            if (!is_string($value)) {
+                continue;
+            }
+
+            $value = trim($value);
+
+            if ($value === '') {
+                continue;
+            }
+
+            // Collected as list items rather than as array keys: PHP casts numeric-string
+            // keys such as "42" to int, which would turn those ids into integers and make
+            // strict in_array() lookups against the result fail.
+            $values[] = $value;
+        }
+
+        return array_values(array_unique($values));
+    }
+
+    /**
+     * Turns an arbitrary value into a list of distinct, trimmed, non-empty strings.
+     *
+     * Anything that is not an array yields an empty list; non-string items are dropped.
+     *
+     * @param mixed $value
+     * @return list<string>
+     */
+    private function normalizeStringList(mixed $value): array
+    {
+        if (!is_array($value)) {
+            return [];
+        }
+
+        $strings = array_filter($value, static fn ($item): bool => is_string($item));
+        $trimmed = array_map(static fn (string $item): string => trim($item), $strings);
+        $nonEmpty = array_filter($trimmed, static fn (string $item): bool => $item !== '');
+
+        // array_unique keeps the first occurrence; array_values restores 0..n-1 keys.
+        return array_values(array_unique($nonEmpty));
+    }
+}
\ No newline at end of file
diff --git a/tests/evals/cases/retrieval.ndjson b/tests/evals/cases/retrieval.ndjson
new file mode 100644
index 0000000..0f9efa3
--- /dev/null
+++ b/tests/evals/cases/retrieval.ndjson
@@ -0,0 +1,4 @@
+{"id":"retrieval_exact_doc_001","type":"retrieval","prompt":"Testomat 808","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["26129c01-c09f-4c71-9c80-7ddffb6c77fb"]}}
+{"id":"retrieval_exact_doc_002","type":"retrieval","prompt":"Testomat EVO CALC","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["74fdad85-5e4e-4f08-8d95-402f3180ed55"]}}
+{"id":"retrieval_exact_doc_003","type":"retrieval","prompt":"Wasserhärte Grenzwert Testomat","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["60706498-867b-41b8-8e76-63248178d265"]}}
+{"id":"retrieval_noise_001","type":"retrieval","prompt":"dsgfsdgfsdgf","assert":{"max_results":0}}
\ No newline at end of file
diff --git a/tests/evals/reports/.gitignore b/tests/evals/reports/.gitignore
new file mode 100644
index 0000000..c96a04f
--- /dev/null
+++ b/tests/evals/reports/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file