From a468ddd843fa52ca088e32524580440aa417be54 Mon Sep 17 00:00:00 2001 From: team2 Date: Wed, 22 Apr 2026 20:43:41 +0200 Subject: [PATCH 1/4] stop fallback from hits --- src/Config/NdjsonHybridRetrieverConfig.php | 2 +- src/Knowledge/Retrieval/NdjsonHybridRetriever.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index 55d64c2..8f472f3 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -79,7 +79,7 @@ final class NdjsonHybridRetrieverConfig * - slightly larger safety net for the richer hybrid stack * - helps no-tag and low-signal cases without exploding context */ - public const EMPTY_RRF_FALLBACK_TOPN = 5; + public const EMPTY_RRF_FALLBACK_TOPN = 1; /** * Maximum number of chunks allowed from one document in spread mode. diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index bc4ef2c..175c68d 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -366,7 +366,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $rawScores = $fused['raw_scores']; if ($rrfScores === [] && $globalHits !== []) { - $rrfScores = $this->fallbackRrfFromHits($globalHits); + // $rrfScores = $this->fallbackRrfFromHits($globalHits); } if ($rrfScores === []) { From 65e2b1917c803f425af614f1128d8781360c835b Mon Sep 17 00:00:00 2001 From: team2 Date: Wed, 22 Apr 2026 21:02:08 +0200 Subject: [PATCH 2/4] VECTOR_SCORE_THRESHOLD .82 --- src/Config/NdjsonHybridRetrieverConfig.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index 8f472f3..80129c5 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -34,7 +34,7 @@ final class NdjsonHybridRetrieverConfig * - the system now has more safeguards: * lexical cross-signals, scoped retrieval, title/meta boost, selection rules */ - public const VECTOR_SCORE_THRESHOLD = 0.81; + public const VECTOR_SCORE_THRESHOLD = 0.82; /** * Lower safety boundary for dynamic threshold adjustments. From 8127d335717d7c5da6d736c2c5843f4cb729e9b9 Mon Sep 17 00:00:00 2001 From: team2 Date: Wed, 22 Apr 2026 22:03:23 +0200 Subject: [PATCH 3/4] first test suite retrieval --- src/Command/AgentEvalRunCommand.php | 149 ++++++++++++++++++++++ src/Eval/AgentEvalRunner.php | 43 +++++++ src/Eval/Dto/EvalCase.php | 60 +++++++++ src/Eval/Dto/EvalResult.php | 37 ++++++ src/Eval/EvalCaseLoader.php | 67 ++++++++++ src/Eval/RetrievalDebugRunner.php | 184 ++++++++++++++++++++++++++++ tests/evals/cases/retrieval.ndjson | 4 + tests/evals/reports/.gitignore | 2 + 8 files changed, 546 insertions(+) create mode 100644 src/Command/AgentEvalRunCommand.php create mode 100644 src/Eval/AgentEvalRunner.php create mode 100644 src/Eval/Dto/EvalCase.php create mode 100644 src/Eval/Dto/EvalResult.php create mode 100644 src/Eval/EvalCaseLoader.php create mode 100644 src/Eval/RetrievalDebugRunner.php create mode 100644 tests/evals/cases/retrieval.ndjson create mode 100644 tests/evals/reports/.gitignore diff --git a/src/Command/AgentEvalRunCommand.php b/src/Command/AgentEvalRunCommand.php new file mode 100644 index 0000000..d14f2fe --- /dev/null +++ b/src/Command/AgentEvalRunCommand.php @@ -0,0 +1,149 @@ +addArgument( + 'type', + InputArgument::OPTIONAL, + 'Eval type to run', + 'retrieval' + ) + ->addOption( + 'case', + null, + InputOption::VALUE_OPTIONAL, + 'Run only a single case by id' + ) + ->addOption( + 'json', + null, + InputOption::VALUE_NONE, + 'Print the full report as JSON' + ); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $io = new SymfonyStyle($input, $output); + + $type = trim((string) $input->getArgument('type')); + $caseId = trim((string) $input->getOption('case')); + $asJson = (bool) $input->getOption('json'); + + try { + $cases = $this->loader->load($type); + } catch (\Throwable $e) { + $io->error($e->getMessage()); + + return Command::FAILURE; + } + + if ($caseId !== '') { + $cases = array_values(array_filter( + $cases, + static fn (EvalCase $case): bool => $case->id === $caseId + )); + } + + if ($cases === []) { + $io->warning('No eval cases selected.'); + + return Command::SUCCESS; + } + + try { + $results = $this->runner->runAll($cases); + } catch (\Throwable $e) { + $io->error($e->getMessage()); + + return Command::FAILURE; + } + + $passed = count(array_filter( + $results, + static fn (EvalResult $result): bool => $result->passed + )); + $failed = count($results) - $passed; + + $report = [ + 'type' => $type, + 'total' => count($results), + 'passed' => $passed, + 'failed' => $failed, + 'results' => array_map( + static fn (EvalResult $result): array => $result->toArray(), + $results + ), + ]; + + if ($asJson) { + $json = json_encode( + $report, + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE + ); + + if (!is_string($json)) { + $io->error('json_encode failed.'); + + return Command::FAILURE; + } + + $output->writeln($json); + + return $failed > 0 ? Command::FAILURE : Command::SUCCESS; + } + + $io->title('RetrieX Eval Run'); + $io->definitionList( + ['type' => $type], + ['total' => (string) count($results)], + ['passed' => (string) $passed], + ['failed' => (string) $failed] + ); + + foreach ($results as $result) { + if ($result->passed) { + $io->writeln(sprintf('PASS %s', $result->caseId)); + continue; + } + + $io->writeln(sprintf('FAIL %s', $result->caseId)); + + foreach ($result->failures as $failure) { + $io->writeln(' - ' . $failure); + } + } + + return $failed > 0 ? Command::FAILURE : Command::SUCCESS; + } +} \ No newline at end of file diff --git a/src/Eval/AgentEvalRunner.php b/src/Eval/AgentEvalRunner.php new file mode 100644 index 0000000..1358357 --- /dev/null +++ b/src/Eval/AgentEvalRunner.php @@ -0,0 +1,43 @@ +isRetrievalCase()) { + return $this->retrievalDebugRunner->run($case); + } + + throw new \InvalidArgumentException(sprintf( + 'Unsupported eval case type: %s', + $case->type + )); + } + + /** + * @param array $cases + * @return array + */ + public function runAll(array $cases): array + { + $results = []; + + foreach ($cases as $case) { + $results[] = $this->run($case); + } + + return $results; + } +} \ No newline at end of file diff --git a/src/Eval/Dto/EvalCase.php b/src/Eval/Dto/EvalCase.php new file mode 100644 index 0000000..e5ce645 --- /dev/null +++ b/src/Eval/Dto/EvalCase.php @@ -0,0 +1,60 @@ + $assert + */ + public function __construct( + public string $id, + public string $type, + public string $prompt, + public array $assert = [], + ) { + } + + /** + * @param array $row + */ + public static function fromArray(array $row): self + { + $id = trim((string) ($row['id'] ?? '')); + $type = trim((string) ($row['type'] ?? '')); + $prompt = trim((string) ($row['prompt'] ?? '')); + $assert = is_array($row['assert'] ?? null) ? $row['assert'] : []; + + if ($id === '') { + throw new \InvalidArgumentException('Eval case id must not be empty.'); + } + + if ($type === '') { + throw new \InvalidArgumentException(sprintf( + 'Eval case "%s" has an empty type.', + $id + )); + } + + if ($prompt === '') { + throw new \InvalidArgumentException(sprintf( + 'Eval case "%s" has an empty prompt.', + $id + )); + } + + return new self( + id: $id, + type: $type, + prompt: $prompt, + assert: $assert, + ); + } + + public function isRetrievalCase(): bool + { + return $this->type === 'retrieval'; + } +} \ No newline at end of file diff --git a/src/Eval/Dto/EvalResult.php b/src/Eval/Dto/EvalResult.php new file mode 100644 index 0000000..7215941 --- /dev/null +++ b/src/Eval/Dto/EvalResult.php @@ -0,0 +1,37 @@ + $failures + * @param array $details + */ + public function __construct( + public string $caseId, + public string $type, + public bool $passed, + public float $durationMs, + public array $failures = [], + public array $details = [], + ) { + } + + /** + * @return array + */ + public function toArray(): array + { + return [ + 'case_id' => $this->caseId, + 'type' => $this->type, + 'passed' => $this->passed, + 'duration_ms' => $this->durationMs, + 'failures' => $this->failures, + 'details' => $this->details, + ]; + } +} \ No newline at end of file diff --git a/src/Eval/EvalCaseLoader.php b/src/Eval/EvalCaseLoader.php new file mode 100644 index 0000000..e71259b --- /dev/null +++ b/src/Eval/EvalCaseLoader.php @@ -0,0 +1,67 @@ + + */ + public function load(string $type = 'retrieval'): array + { + $path = sprintf( + '%s/tests/evals/cases/%s.ndjson', + $this->projectDir, + $type + ); + + if (!is_file($path)) { + throw new \RuntimeException(sprintf( + 'Eval case file not found: %s', + $path + )); + } + + $lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + + if ($lines === false) { + throw new \RuntimeException(sprintf( + 'Failed to read eval case file: %s', + $path + )); + } + + $cases = []; + + foreach ($lines as $lineNumber => $line) { + $line = trim($line); + + if ($line === '') { + continue; + } + + $decoded = json_decode($line, true); + + if (!is_array($decoded)) { + throw new \RuntimeException(sprintf( + 'Invalid JSON in %s on line %d.', + $path, + $lineNumber + 1 + )); + } + + $cases[] = EvalCase::fromArray($decoded); + } + + return $cases; + } +} \ No newline at end of file diff --git a/src/Eval/RetrievalDebugRunner.php b/src/Eval/RetrievalDebugRunner.php new file mode 100644 index 0000000..d5e606c --- /dev/null +++ b/src/Eval/RetrievalDebugRunner.php @@ -0,0 +1,184 @@ +retriever->retrieveDebug($case->prompt); + + $durationMs = round((microtime(true) - $start) * 1000, 2); + + $resultCount = count($rows); + $first = $rows[0] ?? []; + + $selectionMode = $this->extractString($first, 'selection_mode'); + $route = $this->extractString($first, 'route'); + $intent = $this->extractString($first, 'intent'); + + $documentIds = $this->extractUniqueStringValues($rows, 'document_id'); + $chunkIds = $this->extractUniqueStringValues($rows, 'chunk_id'); + + $assert = $case->assert; + + if (isset($assert['selection_mode']) && (string) $assert['selection_mode'] !== $selectionMode) { + $failures[] = sprintf( + 'selection_mode mismatch: expected "%s", got "%s".', + (string) $assert['selection_mode'], + $selectionMode + ); + } + + if (isset($assert['route']) && (string) $assert['route'] !== $route) { + $failures[] = sprintf( + 'route mismatch: expected "%s", got "%s".', + (string) $assert['route'], + $route + ); + } + + if (isset($assert['intent']) && (string) $assert['intent'] !== $intent) { + $failures[] = sprintf( + 'intent mismatch: expected "%s", got "%s".', + (string) $assert['intent'], + $intent + ); + } + + if (isset($assert['min_results']) && $resultCount < (int) $assert['min_results']) { + $failures[] = sprintf( + 'result_count too low: expected >= %d, got %d.', + (int) $assert['min_results'], + $resultCount + ); + } + + if (isset($assert['max_results']) && $resultCount > (int) $assert['max_results']) { + $failures[] = sprintf( + 'result_count too high: expected <= %d, got %d.', + (int) $assert['max_results'], + $resultCount + ); + } + + foreach ($this->normalizeStringList($assert['must_include_document_ids'] ?? []) as $expectedDocumentId) { + if (!in_array($expectedDocumentId, $documentIds, true)) { + $failures[] = sprintf( + 'missing expected document_id "%s".', + $expectedDocumentId + ); + } + } + + foreach ($this->normalizeStringList($assert['must_include_chunk_ids'] ?? []) as $expectedChunkId) { + if (!in_array($expectedChunkId, $chunkIds, true)) { + $failures[] = sprintf( + 'missing expected chunk_id "%s".', + $expectedChunkId + ); + } + } + + return new EvalResult( + caseId: $case->id, + type: $case->type, + passed: $failures === [], + durationMs: $durationMs, + failures: $failures, + details: [ + 'prompt' => $case->prompt, + 'result_count' => $resultCount, + 'selection_mode' => $selectionMode, + 'route' => $route, + 'intent' => $intent, + 'document_ids' => $documentIds, + 'chunk_ids' => $chunkIds, + ], + ); + } + + /** + * @param array $row + */ + private function extractString(array $row, string $key): string + { + $value = $row[$key] ?? null; + + if (!is_string($value)) { + return ''; + } + + return trim($value); + } + + /** + * @param array> $rows + * @return array + */ + private function extractUniqueStringValues(array $rows, string $key): array + { + $values = []; + + foreach ($rows as $row) { + $value = $row[$key] ?? null; + + if (!is_string($value)) { + continue; + } + + $value = trim($value); + + if ($value === '') { + continue; + } + + $values[$value] = true; + } + + return array_keys($values); + } + + /** + * @param mixed $value + * @return array + */ + private function normalizeStringList(mixed $value): array + { + if (!is_array($value)) { + return []; + } + + $out = []; + + foreach ($value as $item) { + if (!is_string($item)) { + continue; + } + + $item = trim($item); + + if ($item === '') { + continue; + } + + $out[] = $item; + } + + return array_values(array_unique($out)); + } +} \ No newline at end of file diff --git a/tests/evals/cases/retrieval.ndjson b/tests/evals/cases/retrieval.ndjson new file mode 100644 index 0000000..0f9efa3 --- /dev/null +++ b/tests/evals/cases/retrieval.ndjson @@ -0,0 +1,4 @@ +{"id":"retrieval_exact_doc_001","type":"retrieval","prompt":"Testomat 808","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["26129c01-c09f-4c71-9c80-7ddffb6c77fb"]}} +{"id":"retrieval_exact_doc_002","type":"retrieval","prompt":"Testomat EVO CALC","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["74fdad85-5e4e-4f08-8d95-402f3180ed55"]}} +{"id":"retrieval_exact_doc_003","type":"retrieval","prompt":"Wasserhärte Grenzwert Testomat","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["60706498-867b-41b8-8e76-63248178d265"]}} +{"id":"retrieval_noise_001","type":"retrieval","prompt":"dsgfsdgfsdgf","assert":{"max_results":0}} \ No newline at end of file diff --git a/tests/evals/reports/.gitignore b/tests/evals/reports/.gitignore new file mode 100644 index 0000000..c96a04f --- /dev/null +++ b/tests/evals/reports/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file From 47099a44076ee42ceea0acaf5c0fb4dd76438195 Mon Sep 17 00:00:00 2001 From: team2 Date: Wed, 22 Apr 2026 22:22:27 +0200 Subject: [PATCH 4/4] first test suite retrieval --- src/Command/AgentEvalRunCommand.php | 34 +++- src/Eval/EvalReportWriter.php | 59 +++++++ src/Eval/RetrievalDebugRunner.php | 250 ++++++++++++++++++++++++++++ tests/evals/cases/retrieval.ndjson | 121 +++++++++++++- 4 files changed, 458 insertions(+), 6 deletions(-) create mode 100644 src/Eval/EvalReportWriter.php diff --git a/src/Command/AgentEvalRunCommand.php b/src/Command/AgentEvalRunCommand.php index d14f2fe..d34a67b 100644 --- a/src/Command/AgentEvalRunCommand.php +++ b/src/Command/AgentEvalRunCommand.php @@ -8,6 +8,7 @@ use App\Eval\AgentEvalRunner; use App\Eval\Dto\EvalCase; use App\Eval\Dto\EvalResult; use App\Eval\EvalCaseLoader; +use App\Eval\EvalReportWriter; use Symfony\Component\Console\Attribute\AsCommand; use Symfony\Component\Console\Command\Command; use Symfony\Component\Console\Input\InputArgument; @@ -25,6 +26,7 @@ final class AgentEvalRunCommand extends Command public function __construct( private readonly EvalCaseLoader $loader, private readonly AgentEvalRunner $runner, + private readonly EvalReportWriter $reportWriter, ) { parent::__construct(); } @@ -49,6 +51,12 @@ final class AgentEvalRunCommand extends Command null, InputOption::VALUE_NONE, 'Print the full report as JSON' + ) + ->addOption( + 'no-write', + null, + InputOption::VALUE_NONE, + 'Do not write the report file' ); } @@ -59,6 +67,7 @@ final class AgentEvalRunCommand extends Command $type = trim((string) $input->getArgument('type')); $caseId = trim((string) $input->getOption('case')); $asJson = (bool) $input->getOption('json'); + $noWrite = (bool) $input->getOption('no-write'); try { $cases = $this->loader->load($type); @@ -97,18 +106,38 @@ final class AgentEvalRunCommand extends Command $report = [ 'type' => $type, + 'case_filter' => $caseId !== '' ? $caseId : null, 'total' => count($results), 'passed' => $passed, 'failed' => $failed, + 'generated_at' => (new \DateTimeImmutable())->format(\DateTimeInterface::ATOM), 'results' => array_map( static fn (EvalResult $result): array => $result->toArray(), $results ), ]; + $writtenPath = null; + + if (!$noWrite) { + try { + $writtenPath = $this->reportWriter->write($report); + } catch (\Throwable $e) { + $io->error($e->getMessage()); + + return Command::FAILURE; + } + } + if ($asJson) { + $jsonReport = $report; + + if ($writtenPath !== null) { + $jsonReport['written_to'] = $writtenPath; + } + $json = json_encode( - $report, + $jsonReport, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE ); @@ -128,7 +157,8 @@ final class AgentEvalRunCommand extends Command ['type' => $type], ['total' => (string) count($results)], ['passed' => (string) $passed], - ['failed' => (string) $failed] + ['failed' => (string) $failed], + ['report_file' => $writtenPath ?? 'disabled (--no-write)'] ); foreach ($results as $result) { diff --git a/src/Eval/EvalReportWriter.php b/src/Eval/EvalReportWriter.php new file mode 100644 index 0000000..f889847 --- /dev/null +++ b/src/Eval/EvalReportWriter.php @@ -0,0 +1,59 @@ + $report + */ + public function write(array $report, string $filename = 'last-run.json'): string + { + $directory = sprintf('%s/tests/evals/reports', $this->projectDir); + + if (!is_dir($directory) && !mkdir($directory, 0775, true) && !is_dir($directory)) { + throw new \RuntimeException(sprintf( + 'Failed to create eval report directory: %s', + $directory + )); + } + + $path = sprintf('%s/%s', $directory, ltrim($filename, '/')); + + $json = json_encode( + $report, + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE + ); + + if (!is_string($json)) { + throw new \RuntimeException('json_encode failed for eval report.'); + } + + $tmpPath = $path . '.tmp'; + + if (file_put_contents($tmpPath, $json) === false) { + throw new \RuntimeException(sprintf( + 'Failed to write temporary eval report file: %s', + $tmpPath + )); + } + + if (!rename($tmpPath, $path)) { + @unlink($tmpPath); + + throw new \RuntimeException(sprintf( + 'Failed to move temporary eval report into place: %s', + $path + )); + } + + return $path; + } +} \ No newline at end of file diff --git a/src/Eval/RetrievalDebugRunner.php b/src/Eval/RetrievalDebugRunner.php index d5e606c..c07c28a 100644 --- a/src/Eval/RetrievalDebugRunner.php +++ b/src/Eval/RetrievalDebugRunner.php @@ -33,9 +33,13 @@ final readonly class RetrievalDebugRunner $documentIds = $this->extractUniqueStringValues($rows, 'document_id'); $chunkIds = $this->extractUniqueStringValues($rows, 'chunk_id'); + $joinedText = $this->extractJoinedText($rows); $assert = $case->assert; + // --------------------------------------------------------- + // Strict single-value assertions + // --------------------------------------------------------- if (isset($assert['selection_mode']) && (string) $assert['selection_mode'] !== $selectionMode) { $failures[] = sprintf( 'selection_mode mismatch: expected "%s", got "%s".', @@ -60,6 +64,33 @@ final readonly class RetrievalDebugRunner ); } + // --------------------------------------------------------- + // Flexible multi-value assertions + // --------------------------------------------------------- + $this->assertValueInList( + failures: $failures, + actual: $selectionMode, + expectedList: $assert['selection_mode_in'] ?? [], + label: 'selection_mode' + ); + + $this->assertValueInList( + failures: $failures, + actual: $route, + expectedList: $assert['route_in'] ?? [], + label: 'route' + ); + + $this->assertValueInList( + failures: $failures, + actual: $intent, + expectedList: $assert['intent_in'] ?? [], + label: 'intent' + ); + + // --------------------------------------------------------- + // Result count assertions + // --------------------------------------------------------- if (isset($assert['min_results']) && $resultCount < (int) $assert['min_results']) { $failures[] = sprintf( 'result_count too low: expected >= %d, got %d.', @@ -76,6 +107,9 @@ final readonly class RetrievalDebugRunner ); } + // --------------------------------------------------------- + // ID assertions + // --------------------------------------------------------- foreach ($this->normalizeStringList($assert['must_include_document_ids'] ?? []) as $expectedDocumentId) { if (!in_array($expectedDocumentId, $documentIds, true)) { $failures[] = sprintf( @@ -94,6 +128,65 @@ final readonly class RetrievalDebugRunner } } + $this->assertContainsAtLeastOne( + failures: $failures, + actualValues: $documentIds, + expectedList: $assert['must_include_one_of_document_ids'] ?? [], + label: 'document_id' + ); + + $this->assertContainsAtLeastOne( + failures: $failures, + actualValues: $chunkIds, + expectedList: $assert['must_include_one_of_chunk_ids'] ?? [], + label: 'chunk_id' + ); + + $this->assertContainsNone( + failures: $failures, + actualValues: $documentIds, + forbiddenList: $assert['must_not_include_document_ids'] ?? [], + label: 'document_id' + ); + + $this->assertContainsNone( + failures: $failures, + actualValues: $chunkIds, + forbiddenList: $assert['must_not_include_chunk_ids'] ?? [], + label: 'chunk_id' + ); + + // --------------------------------------------------------- + // Text / term assertions + // --------------------------------------------------------- + $matchedAnyTerms = $this->findMatchingTerms( + haystack: $joinedText, + terms: $this->normalizeStringList($assert['must_include_any_terms'] ?? []) + ); + + $matchedAllTerms = $this->findMatchingTerms( + haystack: $joinedText, + terms: $this->normalizeStringList($assert['must_include_all_terms'] ?? []) + ); + + $requiredAnyTerms = $this->normalizeStringList($assert['must_include_any_terms'] ?? []); + if ($requiredAnyTerms !== [] && $matchedAnyTerms === []) { + $failures[] = sprintf( + 'none of the required any-terms were found in the retrieval text: [%s].', + implode(', ', $requiredAnyTerms) + ); + } + + $requiredAllTerms = $this->normalizeStringList($assert['must_include_all_terms'] ?? []); + foreach ($requiredAllTerms as $requiredTerm) { + if (!$this->containsTerm($joinedText, $requiredTerm)) { + $failures[] = sprintf( + 'required all-term "%s" was not found in the retrieval text.', + $requiredTerm + ); + } + } + return new EvalResult( caseId: $case->id, type: $case->type, @@ -108,6 +201,8 @@ final readonly class RetrievalDebugRunner 'intent' => $intent, 'document_ids' => $documentIds, 'chunk_ids' => $chunkIds, + 'matched_any_terms' => $matchedAnyTerms, + 'matched_all_terms' => $matchedAllTerms, ], ); } @@ -153,6 +248,161 @@ final readonly class RetrievalDebugRunner return array_keys($values); } + /** + * @param array> $rows + */ + private function extractJoinedText(array $rows): string + { + $parts = []; + + foreach ($rows as $row) { + $text = $row['text'] ?? null; + + if (!is_string($text)) { + continue; + } + + $text = trim($text); + + if ($text === '') { + continue; + } + + $parts[] = $text; + } + + return implode("\n\n", $parts); + } + + /** + * @param array $failures + * @param mixed $expectedList + */ + private function assertValueInList( + array &$failures, + string $actual, + mixed $expectedList, + string $label + ): void { + $expected = $this->normalizeStringList($expectedList); + + if ($expected === []) { + return; + } + + if (!in_array($actual, $expected, true)) { + $failures[] = sprintf( + '%s mismatch: expected one of [%s], got "%s".', + $label, + implode(', ', $expected), + $actual + ); + } + } + + /** + * @param array $failures + * @param array $actualValues + * @param mixed $expectedList + */ + private function assertContainsAtLeastOne( + array &$failures, + array $actualValues, + mixed $expectedList, + string $label + ): void { + $expected = $this->normalizeStringList($expectedList); + + if ($expected === []) { + return; + } + + foreach ($expected as $candidate) { + if (in_array($candidate, $actualValues, true)) { + return; + } + } + + $failures[] = sprintf( + 'none of the expected %s values were found. Expected one of [%s], got [%s].', + $label, + implode(', ', $expected), + implode(', ', $actualValues) + ); + } + + /** + * @param array $failures + * @param array $actualValues + * @param mixed $forbiddenList + */ + private function assertContainsNone( + array &$failures, + array $actualValues, + mixed $forbiddenList, + string $label + ): void { + $forbidden = $this->normalizeStringList($forbiddenList); + + if ($forbidden === []) { + return; + } + + foreach ($forbidden as $forbiddenValue) { + if (in_array($forbiddenValue, $actualValues, true)) { + $failures[] = sprintf( + 'forbidden %s "%s" was present in the retrieval results.', + $label, + $forbiddenValue + ); + } + } + } + + /** + * @param array $terms + * @return array + */ + private function findMatchingTerms(string $haystack, array $terms): array + { + $matches = []; + + foreach ($terms as $term) { + if ($this->containsTerm($haystack, $term)) { + $matches[] = $term; + } + } + + return array_values(array_unique($matches)); + } + + private function containsTerm(string $haystack, string $term): bool + { + $haystack = $this->normalizeText($haystack); + $term = $this->normalizeText($term); + + if ($term === '') { + return false; + } + + return str_contains($haystack, $term); + } + + private function normalizeText(string $value): string + { + $value = trim($value); + + if ($value === '') { + return ''; + } + + if (function_exists('mb_strtolower')) { + return mb_strtolower($value); + } + + return strtolower($value); + } + /** * @param mixed $value * @return array diff --git a/tests/evals/cases/retrieval.ndjson b/tests/evals/cases/retrieval.ndjson index 0f9efa3..629e1aa 100644 --- a/tests/evals/cases/retrieval.ndjson +++ b/tests/evals/cases/retrieval.ndjson @@ -1,4 +1,117 @@ -{"id":"retrieval_exact_doc_001","type":"retrieval","prompt":"Testomat 808","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["26129c01-c09f-4c71-9c80-7ddffb6c77fb"]}} -{"id":"retrieval_exact_doc_002","type":"retrieval","prompt":"Testomat EVO CALC","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["74fdad85-5e4e-4f08-8d95-402f3180ed55"]}} -{"id":"retrieval_exact_doc_003","type":"retrieval","prompt":"Wasserhärte Grenzwert Testomat","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["60706498-867b-41b8-8e76-63248178d265"]}} -{"id":"retrieval_noise_001","type":"retrieval","prompt":"dsgfsdgfsdgf","assert":{"max_results":0}} \ No newline at end of file +{ + "id": "retrieval_exact_doc_001", + "type": "retrieval", + "prompt": "phaseaaudit-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "5914508a-5930-4f04-892b-323881d0daa7" + ], + "must_include_any_terms": [ + "enterprise", + "governance", + "vector-service" + ] + } +} +{ + "id": "retrieval_exact_doc_002", + "type": "retrieval", + "prompt": "ragsystemoverview-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "7513fd82-eec6-4bfa-a730-41820b38b6b4" + ], + "must_include_all_terms": [ + "rag-system", + "dokumente" + ] + } +} +{ + "id": "retrieval_exact_doc_003", + "type": "retrieval", + "prompt": "matrixparams-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "25276f4c-32bb-47a5-98b3-9d81aa722d2b" + ], + "must_include_any_terms": [ + "retrievalmaxchunks", + "retrievalvectortopk", + "hard_max_chunks" + ] + } +} +{ + "id": "retrieval_exact_doc_004", + "type": "retrieval", + "prompt": "readme-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "8abe1f0d-54e6-41ad-967a-9ce8a0efc6da" + ], + "must_include_any_terms": [ + "deterministisches", + "faiss", + "vector-service" + ] + } +} +{ + "id": "retrieval_semantic_001", + "type": "retrieval", + "prompt": "wie funktioniert das system", + "assert": { + "min_results": 1, + "must_include_one_of_document_ids": [ + "7513fd82-eec6-4bfa-a730-41820b38b6b4", + "8abe1f0d-54e6-41ad-967a-9ce8a0efc6da" + ], + "must_include_any_terms": [ + "rag-system", + "dokumente", + "indexierung" + ] + } +} +{ + "id": "retrieval_semantic_002", + "type": "retrieval", + "prompt": "welche parameter beeinflussen retrieval", + "assert": { + "min_results": 1, + "must_include_one_of_document_ids": [ + "25276f4c-32bb-47a5-98b3-9d81aa722d2b", + "7513fd82-eec6-4bfa-a730-41820b38b6b4" + ], + "must_include_any_terms": [ + "retrievalmaxchunks", + "vectortopk", + "chunk" + ] + } +} +{ + "id": "retrieval_noise_001", + "type": "retrieval", + "prompt": "dsgfsdgfsdgf", + "assert": { + "max_results": 0 + } +} \ No newline at end of file