diff --git a/src/Command/AgentEvalRunCommand.php b/src/Command/AgentEvalRunCommand.php new file mode 100644 index 0000000..d34a67b --- /dev/null +++ b/src/Command/AgentEvalRunCommand.php @@ -0,0 +1,179 @@ +addArgument( + 'type', + InputArgument::OPTIONAL, + 'Eval type to run', + 'retrieval' + ) + ->addOption( + 'case', + null, + InputOption::VALUE_OPTIONAL, + 'Run only a single case by id' + ) + ->addOption( + 'json', + null, + InputOption::VALUE_NONE, + 'Print the full report as JSON' + ) + ->addOption( + 'no-write', + null, + InputOption::VALUE_NONE, + 'Do not write the report file' + ); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $io = new SymfonyStyle($input, $output); + + $type = trim((string) $input->getArgument('type')); + $caseId = trim((string) $input->getOption('case')); + $asJson = (bool) $input->getOption('json'); + $noWrite = (bool) $input->getOption('no-write'); + + try { + $cases = $this->loader->load($type); + } catch (\Throwable $e) { + $io->error($e->getMessage()); + + return Command::FAILURE; + } + + if ($caseId !== '') { + $cases = array_values(array_filter( + $cases, + static fn (EvalCase $case): bool => $case->id === $caseId + )); + } + + if ($cases === []) { + $io->warning('No eval cases selected.'); + + return Command::SUCCESS; + } + + try { + $results = $this->runner->runAll($cases); + } catch (\Throwable $e) { + $io->error($e->getMessage()); + + return Command::FAILURE; + } + + $passed = count(array_filter( + $results, + static fn (EvalResult $result): bool => $result->passed + )); + $failed = count($results) - $passed; + + $report = [ + 'type' => $type, + 'case_filter' => $caseId !== '' ? $caseId : null, + 'total' => count($results), + 'passed' => $passed, + 'failed' => $failed, + 'generated_at' => (new \DateTimeImmutable())->format(\DateTimeInterface::ATOM), + 'results' => array_map( + static fn (EvalResult $result): array => $result->toArray(), + $results + ), + ]; + + $writtenPath = null; + + if (!$noWrite) { + try { + $writtenPath = $this->reportWriter->write($report); + } catch (\Throwable $e) { + $io->error($e->getMessage()); + + return Command::FAILURE; + } + } + + if ($asJson) { + $jsonReport = $report; + + if ($writtenPath !== null) { + $jsonReport['written_to'] = $writtenPath; + } + + $json = json_encode( + $jsonReport, + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE + ); + + if (!is_string($json)) { + $io->error('json_encode failed.'); + + return Command::FAILURE; + } + + $output->writeln($json); + + return $failed > 0 ? Command::FAILURE : Command::SUCCESS; + } + + $io->title('RetrieX Eval Run'); + $io->definitionList( + ['type' => $type], + ['total' => (string) count($results)], + ['passed' => (string) $passed], + ['failed' => (string) $failed], + ['report_file' => $writtenPath ?? 'disabled (--no-write)'] + ); + + foreach ($results as $result) { + if ($result->passed) { + $io->writeln(sprintf('PASS %s', $result->caseId)); + continue; + } + + $io->writeln(sprintf('FAIL %s', $result->caseId)); + + foreach ($result->failures as $failure) { + $io->writeln(' - ' . $failure); + } + } + + return $failed > 0 ? Command::FAILURE : Command::SUCCESS; + } +} \ No newline at end of file diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index ab83943..80129c5 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -34,7 +34,7 @@ final class NdjsonHybridRetrieverConfig * - the system now has more safeguards: * lexical cross-signals, scoped retrieval, title/meta boost, selection rules */ - public const VECTOR_SCORE_THRESHOLD = 0.83; + public const VECTOR_SCORE_THRESHOLD = 0.82; /** * Lower safety boundary for dynamic threshold adjustments. @@ -79,7 +79,7 @@ final class NdjsonHybridRetrieverConfig * - slightly larger safety net for the richer hybrid stack * - helps no-tag and low-signal cases without exploding context */ - public const EMPTY_RRF_FALLBACK_TOPN = 5; + public const EMPTY_RRF_FALLBACK_TOPN = 1; /** * Maximum number of chunks allowed from one document in spread mode. diff --git a/src/Eval/AgentEvalRunner.php b/src/Eval/AgentEvalRunner.php new file mode 100644 index 0000000..1358357 --- /dev/null +++ b/src/Eval/AgentEvalRunner.php @@ -0,0 +1,43 @@ +isRetrievalCase()) { + return $this->retrievalDebugRunner->run($case); + } + + throw new \InvalidArgumentException(sprintf( + 'Unsupported eval case type: %s', + $case->type + )); + } + + /** + * @param array $cases + * @return array + */ + public function runAll(array $cases): array + { + $results = []; + + foreach ($cases as $case) { + $results[] = $this->run($case); + } + + return $results; + } +} \ No newline at end of file diff --git a/src/Eval/Dto/EvalCase.php b/src/Eval/Dto/EvalCase.php new file mode 100644 index 0000000..e5ce645 --- /dev/null +++ b/src/Eval/Dto/EvalCase.php @@ -0,0 +1,60 @@ + $assert + */ + public function __construct( + public string $id, + public string $type, + public string $prompt, + public array $assert = [], + ) { + } + + /** + * @param array $row + */ + public static function fromArray(array $row): self + { + $id = trim((string) ($row['id'] ?? '')); + $type = trim((string) ($row['type'] ?? '')); + $prompt = trim((string) ($row['prompt'] ?? '')); + $assert = is_array($row['assert'] ?? null) ? $row['assert'] : []; + + if ($id === '') { + throw new \InvalidArgumentException('Eval case id must not be empty.'); + } + + if ($type === '') { + throw new \InvalidArgumentException(sprintf( + 'Eval case "%s" has an empty type.', + $id + )); + } + + if ($prompt === '') { + throw new \InvalidArgumentException(sprintf( + 'Eval case "%s" has an empty prompt.', + $id + )); + } + + return new self( + id: $id, + type: $type, + prompt: $prompt, + assert: $assert, + ); + } + + public function isRetrievalCase(): bool + { + return $this->type === 'retrieval'; + } +} \ No newline at end of file diff --git a/src/Eval/Dto/EvalResult.php b/src/Eval/Dto/EvalResult.php new file mode 100644 index 0000000..7215941 --- /dev/null +++ b/src/Eval/Dto/EvalResult.php @@ -0,0 +1,37 @@ + $failures + * @param array $details + */ + public function __construct( + public string $caseId, + public string $type, + public bool $passed, + public float $durationMs, + public array $failures = [], + public array $details = [], + ) { + } + + /** + * @return array + */ + public function toArray(): array + { + return [ + 'case_id' => $this->caseId, + 'type' => $this->type, + 'passed' => $this->passed, + 'duration_ms' => $this->durationMs, + 'failures' => $this->failures, + 'details' => $this->details, + ]; + } +} \ No newline at end of file diff --git a/src/Eval/EvalCaseLoader.php b/src/Eval/EvalCaseLoader.php new file mode 100644 index 0000000..e71259b --- /dev/null +++ b/src/Eval/EvalCaseLoader.php @@ -0,0 +1,67 @@ + + */ + public function load(string $type = 'retrieval'): array + { + $path = sprintf( + '%s/tests/evals/cases/%s.ndjson', + $this->projectDir, + $type + ); + + if (!is_file($path)) { + throw new \RuntimeException(sprintf( + 'Eval case file not found: %s', + $path + )); + } + + $lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + + if ($lines === false) { + throw new \RuntimeException(sprintf( + 'Failed to read eval case file: %s', + $path + )); + } + + $cases = []; + + foreach ($lines as $lineNumber => $line) { + $line = trim($line); + + if ($line === '') { + continue; + } + + $decoded = json_decode($line, true); + + if (!is_array($decoded)) { + throw new \RuntimeException(sprintf( + 'Invalid JSON in %s on line %d.', + $path, + $lineNumber + 1 + )); + } + + $cases[] = EvalCase::fromArray($decoded); + } + + return $cases; + } +} \ No newline at end of file diff --git a/src/Eval/EvalReportWriter.php b/src/Eval/EvalReportWriter.php new file mode 100644 index 0000000..f889847 --- /dev/null +++ b/src/Eval/EvalReportWriter.php @@ -0,0 +1,59 @@ + $report + */ + public function write(array $report, string $filename = 'last-run.json'): string + { + $directory = sprintf('%s/tests/evals/reports', $this->projectDir); + + if (!is_dir($directory) && !mkdir($directory, 0775, true) && !is_dir($directory)) { + throw new \RuntimeException(sprintf( + 'Failed to create eval report directory: %s', + $directory + )); + } + + $path = sprintf('%s/%s', $directory, ltrim($filename, '/')); + + $json = json_encode( + $report, + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE + ); + + if (!is_string($json)) { + throw new \RuntimeException('json_encode failed for eval report.'); + } + + $tmpPath = $path . '.tmp'; + + if (file_put_contents($tmpPath, $json) === false) { + throw new \RuntimeException(sprintf( + 'Failed to write temporary eval report file: %s', + $tmpPath + )); + } + + if (!rename($tmpPath, $path)) { + @unlink($tmpPath); + + throw new \RuntimeException(sprintf( + 'Failed to move temporary eval report into place: %s', + $path + )); + } + + return $path; + } +} \ No newline at end of file diff --git a/src/Eval/RetrievalDebugRunner.php b/src/Eval/RetrievalDebugRunner.php new file mode 100644 index 0000000..c07c28a --- /dev/null +++ b/src/Eval/RetrievalDebugRunner.php @@ -0,0 +1,434 @@ +retriever->retrieveDebug($case->prompt); + + $durationMs = round((microtime(true) - $start) * 1000, 2); + + $resultCount = count($rows); + $first = $rows[0] ?? []; + + $selectionMode = $this->extractString($first, 'selection_mode'); + $route = $this->extractString($first, 'route'); + $intent = $this->extractString($first, 'intent'); + + $documentIds = $this->extractUniqueStringValues($rows, 'document_id'); + $chunkIds = $this->extractUniqueStringValues($rows, 'chunk_id'); + $joinedText = $this->extractJoinedText($rows); + + $assert = $case->assert; + + // --------------------------------------------------------- + // Strict single-value assertions + // --------------------------------------------------------- + if (isset($assert['selection_mode']) && (string) $assert['selection_mode'] !== $selectionMode) { + $failures[] = sprintf( + 'selection_mode mismatch: expected "%s", got "%s".', + (string) $assert['selection_mode'], + $selectionMode + ); + } + + if (isset($assert['route']) && (string) $assert['route'] !== $route) { + $failures[] = sprintf( + 'route mismatch: expected "%s", got "%s".', + (string) $assert['route'], + $route + ); + } + + if (isset($assert['intent']) && (string) $assert['intent'] !== $intent) { + $failures[] = sprintf( + 'intent mismatch: expected "%s", got "%s".', + (string) $assert['intent'], + $intent + ); + } + + // --------------------------------------------------------- + // Flexible multi-value assertions + // --------------------------------------------------------- + $this->assertValueInList( + failures: $failures, + actual: $selectionMode, + expectedList: $assert['selection_mode_in'] ?? [], + label: 'selection_mode' + ); + + $this->assertValueInList( + failures: $failures, + actual: $route, + expectedList: $assert['route_in'] ?? [], + label: 'route' + ); + + $this->assertValueInList( + failures: $failures, + actual: $intent, + expectedList: $assert['intent_in'] ?? [], + label: 'intent' + ); + + // --------------------------------------------------------- + // Result count assertions + // --------------------------------------------------------- + if (isset($assert['min_results']) && $resultCount < (int) $assert['min_results']) { + $failures[] = sprintf( + 'result_count too low: expected >= %d, got %d.', + (int) $assert['min_results'], + $resultCount + ); + } + + if (isset($assert['max_results']) && $resultCount > (int) $assert['max_results']) { + $failures[] = sprintf( + 'result_count too high: expected <= %d, got %d.', + (int) $assert['max_results'], + $resultCount + ); + } + + // --------------------------------------------------------- + // ID assertions + // --------------------------------------------------------- + foreach ($this->normalizeStringList($assert['must_include_document_ids'] ?? []) as $expectedDocumentId) { + if (!in_array($expectedDocumentId, $documentIds, true)) { + $failures[] = sprintf( + 'missing expected document_id "%s".', + $expectedDocumentId + ); + } + } + + foreach ($this->normalizeStringList($assert['must_include_chunk_ids'] ?? []) as $expectedChunkId) { + if (!in_array($expectedChunkId, $chunkIds, true)) { + $failures[] = sprintf( + 'missing expected chunk_id "%s".', + $expectedChunkId + ); + } + } + + $this->assertContainsAtLeastOne( + failures: $failures, + actualValues: $documentIds, + expectedList: $assert['must_include_one_of_document_ids'] ?? [], + label: 'document_id' + ); + + $this->assertContainsAtLeastOne( + failures: $failures, + actualValues: $chunkIds, + expectedList: $assert['must_include_one_of_chunk_ids'] ?? [], + label: 'chunk_id' + ); + + $this->assertContainsNone( + failures: $failures, + actualValues: $documentIds, + forbiddenList: $assert['must_not_include_document_ids'] ?? [], + label: 'document_id' + ); + + $this->assertContainsNone( + failures: $failures, + actualValues: $chunkIds, + forbiddenList: $assert['must_not_include_chunk_ids'] ?? [], + label: 'chunk_id' + ); + + // --------------------------------------------------------- + // Text / term assertions + // --------------------------------------------------------- + $matchedAnyTerms = $this->findMatchingTerms( + haystack: $joinedText, + terms: $this->normalizeStringList($assert['must_include_any_terms'] ?? []) + ); + + $matchedAllTerms = $this->findMatchingTerms( + haystack: $joinedText, + terms: $this->normalizeStringList($assert['must_include_all_terms'] ?? []) + ); + + $requiredAnyTerms = $this->normalizeStringList($assert['must_include_any_terms'] ?? []); + if ($requiredAnyTerms !== [] && $matchedAnyTerms === []) { + $failures[] = sprintf( + 'none of the required any-terms were found in the retrieval text: [%s].', + implode(', ', $requiredAnyTerms) + ); + } + + $requiredAllTerms = $this->normalizeStringList($assert['must_include_all_terms'] ?? []); + foreach ($requiredAllTerms as $requiredTerm) { + if (!$this->containsTerm($joinedText, $requiredTerm)) { + $failures[] = sprintf( + 'required all-term "%s" was not found in the retrieval text.', + $requiredTerm + ); + } + } + + return new EvalResult( + caseId: $case->id, + type: $case->type, + passed: $failures === [], + durationMs: $durationMs, + failures: $failures, + details: [ + 'prompt' => $case->prompt, + 'result_count' => $resultCount, + 'selection_mode' => $selectionMode, + 'route' => $route, + 'intent' => $intent, + 'document_ids' => $documentIds, + 'chunk_ids' => $chunkIds, + 'matched_any_terms' => $matchedAnyTerms, + 'matched_all_terms' => $matchedAllTerms, + ], + ); + } + + /** + * @param array $row + */ + private function extractString(array $row, string $key): string + { + $value = $row[$key] ?? null; + + if (!is_string($value)) { + return ''; + } + + return trim($value); + } + + /** + * @param array> $rows + * @return array + */ + private function extractUniqueStringValues(array $rows, string $key): array + { + $values = []; + + foreach ($rows as $row) { + $value = $row[$key] ?? null; + + if (!is_string($value)) { + continue; + } + + $value = trim($value); + + if ($value === '') { + continue; + } + + $values[$value] = true; + } + + return array_keys($values); + } + + /** + * @param array> $rows + */ + private function extractJoinedText(array $rows): string + { + $parts = []; + + foreach ($rows as $row) { + $text = $row['text'] ?? null; + + if (!is_string($text)) { + continue; + } + + $text = trim($text); + + if ($text === '') { + continue; + } + + $parts[] = $text; + } + + return implode("\n\n", $parts); + } + + /** + * @param array $failures + * @param mixed $expectedList + */ + private function assertValueInList( + array &$failures, + string $actual, + mixed $expectedList, + string $label + ): void { + $expected = $this->normalizeStringList($expectedList); + + if ($expected === []) { + return; + } + + if (!in_array($actual, $expected, true)) { + $failures[] = sprintf( + '%s mismatch: expected one of [%s], got "%s".', + $label, + implode(', ', $expected), + $actual + ); + } + } + + /** + * @param array $failures + * @param array $actualValues + * @param mixed $expectedList + */ + private function assertContainsAtLeastOne( + array &$failures, + array $actualValues, + mixed $expectedList, + string $label + ): void { + $expected = $this->normalizeStringList($expectedList); + + if ($expected === []) { + return; + } + + foreach ($expected as $candidate) { + if (in_array($candidate, $actualValues, true)) { + return; + } + } + + $failures[] = sprintf( + 'none of the expected %s values were found. Expected one of [%s], got [%s].', + $label, + implode(', ', $expected), + implode(', ', $actualValues) + ); + } + + /** + * @param array $failures + * @param array $actualValues + * @param mixed $forbiddenList + */ + private function assertContainsNone( + array &$failures, + array $actualValues, + mixed $forbiddenList, + string $label + ): void { + $forbidden = $this->normalizeStringList($forbiddenList); + + if ($forbidden === []) { + return; + } + + foreach ($forbidden as $forbiddenValue) { + if (in_array($forbiddenValue, $actualValues, true)) { + $failures[] = sprintf( + 'forbidden %s "%s" was present in the retrieval results.', + $label, + $forbiddenValue + ); + } + } + } + + /** + * @param array $terms + * @return array + */ + private function findMatchingTerms(string $haystack, array $terms): array + { + $matches = []; + + foreach ($terms as $term) { + if ($this->containsTerm($haystack, $term)) { + $matches[] = $term; + } + } + + return array_values(array_unique($matches)); + } + + private function containsTerm(string $haystack, string $term): bool + { + $haystack = $this->normalizeText($haystack); + $term = $this->normalizeText($term); + + if ($term === '') { + return false; + } + + return str_contains($haystack, $term); + } + + private function normalizeText(string $value): string + { + $value = trim($value); + + if ($value === '') { + return ''; + } + + if (function_exists('mb_strtolower')) { + return mb_strtolower($value); + } + + return strtolower($value); + } + + /** + * @param mixed $value + * @return array + */ + private function normalizeStringList(mixed $value): array + { + if (!is_array($value)) { + return []; + } + + $out = []; + + foreach ($value as $item) { + if (!is_string($item)) { + continue; + } + + $item = trim($item); + + if ($item === '') { + continue; + } + + $out[] = $item; + } + + return array_values(array_unique($out)); + } +} \ No newline at end of file diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index bc4ef2c..175c68d 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -366,7 +366,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $rawScores = $fused['raw_scores']; if ($rrfScores === [] && $globalHits !== []) { - $rrfScores = $this->fallbackRrfFromHits($globalHits); + // $rrfScores = $this->fallbackRrfFromHits($globalHits); } if ($rrfScores === []) { diff --git a/tests/evals/cases/retrieval.ndjson b/tests/evals/cases/retrieval.ndjson new file mode 100644 index 0000000..629e1aa --- /dev/null +++ b/tests/evals/cases/retrieval.ndjson @@ -0,0 +1,117 @@ +{ + "id": "retrieval_exact_doc_001", + "type": "retrieval", + "prompt": "phaseaaudit-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "5914508a-5930-4f04-892b-323881d0daa7" + ], + "must_include_any_terms": [ + "enterprise", + "governance", + "vector-service" + ] + } +} +{ + "id": "retrieval_exact_doc_002", + "type": "retrieval", + "prompt": "ragsystemoverview-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "7513fd82-eec6-4bfa-a730-41820b38b6b4" + ], + "must_include_all_terms": [ + "rag-system", + "dokumente" + ] + } +} +{ + "id": "retrieval_exact_doc_003", + "type": "retrieval", + "prompt": "matrixparams-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "25276f4c-32bb-47a5-98b3-9d81aa722d2b" + ], + "must_include_any_terms": [ + "retrievalmaxchunks", + "retrievalvectortopk", + "hard_max_chunks" + ] + } +} +{ + "id": "retrieval_exact_doc_004", + "type": "retrieval", + "prompt": "readme-md", + "assert": { + "selection_mode_in": [ + "exact_document_title" + ], + "min_results": 1, + "must_include_one_of_document_ids": [ + "8abe1f0d-54e6-41ad-967a-9ce8a0efc6da" + ], + "must_include_any_terms": [ + "deterministisches", + "faiss", + "vector-service" + ] + } +} +{ + "id": "retrieval_semantic_001", + "type": "retrieval", + "prompt": "wie funktioniert das system", + "assert": { + "min_results": 1, + "must_include_one_of_document_ids": [ + "7513fd82-eec6-4bfa-a730-41820b38b6b4", + "8abe1f0d-54e6-41ad-967a-9ce8a0efc6da" + ], + "must_include_any_terms": [ + "rag-system", + "dokumente", + "indexierung" + ] + } +} +{ + "id": "retrieval_semantic_002", + "type": "retrieval", + "prompt": "welche parameter beeinflussen retrieval", + "assert": { + "min_results": 1, + "must_include_one_of_document_ids": [ + "25276f4c-32bb-47a5-98b3-9d81aa722d2b", + "7513fd82-eec6-4bfa-a730-41820b38b6b4" + ], + "must_include_any_terms": [ + "retrievalmaxchunks", + "vectortopk", + "chunk" + ] + } +} +{ + "id": "retrieval_noise_001", + "type": "retrieval", + "prompt": "dsgfsdgfsdgf", + "assert": { + "max_results": 0 + } +} \ No newline at end of file diff --git a/tests/evals/reports/.gitignore b/tests/evals/reports/.gitignore new file mode 100644 index 0000000..c96a04f --- /dev/null +++ b/tests/evals/reports/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file