first test suite retrieval
This commit is contained in:
@@ -8,6 +8,7 @@ use App\Eval\AgentEvalRunner;
|
||||
use App\Eval\Dto\EvalCase;
|
||||
use App\Eval\Dto\EvalResult;
|
||||
use App\Eval\EvalCaseLoader;
|
||||
use App\Eval\EvalReportWriter;
|
||||
use Symfony\Component\Console\Attribute\AsCommand;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Input\InputArgument;
|
||||
@@ -25,6 +26,7 @@ final class AgentEvalRunCommand extends Command
|
||||
/**
 * Receives the command's collaborators through constructor promotion.
 *
 * Each promoted parameter becomes a private readonly property; the parent
 * Command constructor is then invoked without arguments.
 */
public function __construct(
    private readonly EvalCaseLoader $loader,
    private readonly AgentEvalRunner $runner,
    private readonly EvalReportWriter $reportWriter,
) {
    parent::__construct();
}
|
||||
@@ -49,6 +51,12 @@ final class AgentEvalRunCommand extends Command
|
||||
null,
|
||||
InputOption::VALUE_NONE,
|
||||
'Print the full report as JSON'
|
||||
)
|
||||
->addOption(
|
||||
'no-write',
|
||||
null,
|
||||
InputOption::VALUE_NONE,
|
||||
'Do not write the report file'
|
||||
);
|
||||
}
|
||||
|
||||
@@ -59,6 +67,7 @@ final class AgentEvalRunCommand extends Command
|
||||
$type = trim((string) $input->getArgument('type'));
|
||||
$caseId = trim((string) $input->getOption('case'));
|
||||
$asJson = (bool) $input->getOption('json');
|
||||
$noWrite = (bool) $input->getOption('no-write');
|
||||
|
||||
try {
|
||||
$cases = $this->loader->load($type);
|
||||
@@ -97,18 +106,38 @@ final class AgentEvalRunCommand extends Command
|
||||
|
||||
$report = [
|
||||
'type' => $type,
|
||||
'case_filter' => $caseId !== '' ? $caseId : null,
|
||||
'total' => count($results),
|
||||
'passed' => $passed,
|
||||
'failed' => $failed,
|
||||
'generated_at' => (new \DateTimeImmutable())->format(\DateTimeInterface::ATOM),
|
||||
'results' => array_map(
|
||||
static fn (EvalResult $result): array => $result->toArray(),
|
||||
$results
|
||||
),
|
||||
];
|
||||
|
||||
$writtenPath = null;
|
||||
|
||||
if (!$noWrite) {
|
||||
try {
|
||||
$writtenPath = $this->reportWriter->write($report);
|
||||
} catch (\Throwable $e) {
|
||||
$io->error($e->getMessage());
|
||||
|
||||
return Command::FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
if ($asJson) {
|
||||
$jsonReport = $report;
|
||||
|
||||
if ($writtenPath !== null) {
|
||||
$jsonReport['written_to'] = $writtenPath;
|
||||
}
|
||||
|
||||
$json = json_encode(
|
||||
$report,
|
||||
$jsonReport,
|
||||
JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
|
||||
);
|
||||
|
||||
@@ -128,7 +157,8 @@ final class AgentEvalRunCommand extends Command
|
||||
['type' => $type],
|
||||
['total' => (string) count($results)],
|
||||
['passed' => (string) $passed],
|
||||
['failed' => (string) $failed]
|
||||
['failed' => (string) $failed],
|
||||
['report_file' => $writtenPath ?? 'disabled (--no-write)']
|
||||
);
|
||||
|
||||
foreach ($results as $result) {
|
||||
|
||||
59
src/Eval/EvalReportWriter.php
Normal file
59
src/Eval/EvalReportWriter.php
Normal file
@@ -0,0 +1,59 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Eval;
|
||||
|
||||
/**
 * Writes an eval run report as pretty-printed JSON below the project directory.
 *
 * Reports end up in "<projectDir>/tests/evals/reports/<filename>".
 */
final readonly class EvalReportWriter
{
    public function __construct(
        private string $projectDir,
    ) {
    }

    /**
     * Encodes the report as JSON and stores it in the reports directory.
     *
     * The payload is written to "<target>.tmp" first and then renamed onto the
     * target path, so the target is only replaced once the full payload exists.
     *
     * @param array<string, mixed> $report   Report data; must be JSON-encodable.
     * @param string               $filename File name inside the reports directory;
     *                                       leading slashes are stripped.
     *
     * @return string Path of the report file that was written.
     *
     * @throws \RuntimeException When JSON encoding, directory creation, the
     *                           temporary write, or the final rename fails.
     */
    public function write(array $report, string $filename = 'last-run.json'): string
    {
        // Encode before touching the filesystem: an unencodable report should
        // not leave a freshly created directory or temp file behind.
        try {
            $json = json_encode(
                $report,
                JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
            );
        } catch (\JsonException $e) {
            // Keep the RuntimeException contract, but surface the actual reason
            // and chain the original exception for debugging.
            throw new \RuntimeException(
                sprintf('json_encode failed for eval report: %s', $e->getMessage()),
                0,
                $e
            );
        }

        $directory = sprintf('%s/tests/evals/reports', $this->projectDir);

        // The trailing is_dir() re-check tolerates another process creating the
        // directory between the first check and mkdir().
        if (!is_dir($directory) && !mkdir($directory, 0775, true) && !is_dir($directory)) {
            throw new \RuntimeException(sprintf(
                'Failed to create eval report directory: %s',
                $directory
            ));
        }

        $path = sprintf('%s/%s', $directory, ltrim($filename, '/'));
        $tmpPath = $path . '.tmp';

        if (file_put_contents($tmpPath, $json) === false) {
            // A failed write may leave a truncated temp file; remove it
            // best-effort (suppressed so cleanup cannot mask the real error).
            @unlink($tmpPath);

            throw new \RuntimeException(sprintf(
                'Failed to write temporary eval report file: %s',
                $tmpPath
            ));
        }

        if (!rename($tmpPath, $path)) {
            // Best-effort cleanup of the orphaned temp file.
            @unlink($tmpPath);

            throw new \RuntimeException(sprintf(
                'Failed to move temporary eval report into place: %s',
                $path
            ));
        }

        return $path;
    }
}
|
||||
@@ -33,9 +33,13 @@ final readonly class RetrievalDebugRunner
|
||||
|
||||
$documentIds = $this->extractUniqueStringValues($rows, 'document_id');
|
||||
$chunkIds = $this->extractUniqueStringValues($rows, 'chunk_id');
|
||||
$joinedText = $this->extractJoinedText($rows);
|
||||
|
||||
$assert = $case->assert;
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// Strict single-value assertions
|
||||
// ---------------------------------------------------------
|
||||
if (isset($assert['selection_mode']) && (string) $assert['selection_mode'] !== $selectionMode) {
|
||||
$failures[] = sprintf(
|
||||
'selection_mode mismatch: expected "%s", got "%s".',
|
||||
@@ -60,6 +64,33 @@ final readonly class RetrievalDebugRunner
|
||||
);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// Flexible multi-value assertions
|
||||
// ---------------------------------------------------------
|
||||
$this->assertValueInList(
|
||||
failures: $failures,
|
||||
actual: $selectionMode,
|
||||
expectedList: $assert['selection_mode_in'] ?? [],
|
||||
label: 'selection_mode'
|
||||
);
|
||||
|
||||
$this->assertValueInList(
|
||||
failures: $failures,
|
||||
actual: $route,
|
||||
expectedList: $assert['route_in'] ?? [],
|
||||
label: 'route'
|
||||
);
|
||||
|
||||
$this->assertValueInList(
|
||||
failures: $failures,
|
||||
actual: $intent,
|
||||
expectedList: $assert['intent_in'] ?? [],
|
||||
label: 'intent'
|
||||
);
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// Result count assertions
|
||||
// ---------------------------------------------------------
|
||||
if (isset($assert['min_results']) && $resultCount < (int) $assert['min_results']) {
|
||||
$failures[] = sprintf(
|
||||
'result_count too low: expected >= %d, got %d.',
|
||||
@@ -76,6 +107,9 @@ final readonly class RetrievalDebugRunner
|
||||
);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// ID assertions
|
||||
// ---------------------------------------------------------
|
||||
foreach ($this->normalizeStringList($assert['must_include_document_ids'] ?? []) as $expectedDocumentId) {
|
||||
if (!in_array($expectedDocumentId, $documentIds, true)) {
|
||||
$failures[] = sprintf(
|
||||
@@ -94,6 +128,65 @@ final readonly class RetrievalDebugRunner
|
||||
}
|
||||
}
|
||||
|
||||
$this->assertContainsAtLeastOne(
|
||||
failures: $failures,
|
||||
actualValues: $documentIds,
|
||||
expectedList: $assert['must_include_one_of_document_ids'] ?? [],
|
||||
label: 'document_id'
|
||||
);
|
||||
|
||||
$this->assertContainsAtLeastOne(
|
||||
failures: $failures,
|
||||
actualValues: $chunkIds,
|
||||
expectedList: $assert['must_include_one_of_chunk_ids'] ?? [],
|
||||
label: 'chunk_id'
|
||||
);
|
||||
|
||||
$this->assertContainsNone(
|
||||
failures: $failures,
|
||||
actualValues: $documentIds,
|
||||
forbiddenList: $assert['must_not_include_document_ids'] ?? [],
|
||||
label: 'document_id'
|
||||
);
|
||||
|
||||
$this->assertContainsNone(
|
||||
failures: $failures,
|
||||
actualValues: $chunkIds,
|
||||
forbiddenList: $assert['must_not_include_chunk_ids'] ?? [],
|
||||
label: 'chunk_id'
|
||||
);
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// Text / term assertions
|
||||
// ---------------------------------------------------------
|
||||
$matchedAnyTerms = $this->findMatchingTerms(
|
||||
haystack: $joinedText,
|
||||
terms: $this->normalizeStringList($assert['must_include_any_terms'] ?? [])
|
||||
);
|
||||
|
||||
$matchedAllTerms = $this->findMatchingTerms(
|
||||
haystack: $joinedText,
|
||||
terms: $this->normalizeStringList($assert['must_include_all_terms'] ?? [])
|
||||
);
|
||||
|
||||
$requiredAnyTerms = $this->normalizeStringList($assert['must_include_any_terms'] ?? []);
|
||||
if ($requiredAnyTerms !== [] && $matchedAnyTerms === []) {
|
||||
$failures[] = sprintf(
|
||||
'none of the required any-terms were found in the retrieval text: [%s].',
|
||||
implode(', ', $requiredAnyTerms)
|
||||
);
|
||||
}
|
||||
|
||||
$requiredAllTerms = $this->normalizeStringList($assert['must_include_all_terms'] ?? []);
|
||||
foreach ($requiredAllTerms as $requiredTerm) {
|
||||
if (!$this->containsTerm($joinedText, $requiredTerm)) {
|
||||
$failures[] = sprintf(
|
||||
'required all-term "%s" was not found in the retrieval text.',
|
||||
$requiredTerm
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return new EvalResult(
|
||||
caseId: $case->id,
|
||||
type: $case->type,
|
||||
@@ -108,6 +201,8 @@ final readonly class RetrievalDebugRunner
|
||||
'intent' => $intent,
|
||||
'document_ids' => $documentIds,
|
||||
'chunk_ids' => $chunkIds,
|
||||
'matched_any_terms' => $matchedAnyTerms,
|
||||
'matched_all_terms' => $matchedAllTerms,
|
||||
],
|
||||
);
|
||||
}
|
||||
@@ -153,6 +248,161 @@ final readonly class RetrievalDebugRunner
|
||||
return array_keys($values);
|
||||
}
|
||||
|
||||
/**
 * Builds one text blob from the "text" entries of the given rows.
 *
 * Rows whose "text" entry is missing, not a string, or blank after trimming
 * are ignored; the remaining trimmed texts are joined with a blank line.
 *
 * @param array<int, array<string, mixed>> $rows
 */
private function extractJoinedText(array $rows): string
{
    // Map every row to its trimmed text, using '' for rows that contribute nothing.
    $trimmedTexts = array_map(
        static function ($row): string {
            $raw = $row['text'] ?? null;

            return is_string($raw) ? trim($raw) : '';
        },
        $rows
    );

    $nonEmpty = array_filter(
        $trimmedTexts,
        static fn (string $fragment): bool => $fragment !== ''
    );

    return implode("\n\n", $nonEmpty);
}
|
||||
|
||||
/**
 * Appends a failure message when $actual is not one of the expected values.
 *
 * The expected list is passed through normalizeStringList(); if that yields
 * an empty list the check is skipped and nothing is recorded.
 *
 * @param array<int, string> $failures     Failure messages, appended to by reference.
 * @param mixed              $expectedList Raw list of accepted values from the case.
 */
private function assertValueInList(
    array &$failures,
    string $actual,
    mixed $expectedList,
    string $label
): void {
    $allowed = $this->normalizeStringList($expectedList);

    // Nothing to enforce, or the value is accepted: no failure to record.
    if ($allowed === [] || in_array($actual, $allowed, true)) {
        return;
    }

    $failures[] = sprintf(
        '%s mismatch: expected one of [%s], got "%s".',
        $label,
        implode(', ', $allowed),
        $actual
    );
}
|
||||
|
||||
/**
 * Appends a failure message when none of the expected values occurs in $actualValues.
 *
 * The expected list is passed through normalizeStringList(); if that yields
 * an empty list the check is skipped. Membership is tested strictly.
 *
 * @param array<int, string> $failures     Failure messages, appended to by reference.
 * @param array<int, string> $actualValues Values observed in the retrieval result.
 * @param mixed              $expectedList Raw list of acceptable values from the case.
 */
private function assertContainsAtLeastOne(
    array &$failures,
    array $actualValues,
    mixed $expectedList,
    string $label
): void {
    $candidates = $this->normalizeStringList($expectedList);

    if ($candidates === []) {
        return;
    }

    // Keep only the candidates that actually occur in the observed values.
    $found = array_filter(
        $candidates,
        static fn ($candidate): bool => in_array($candidate, $actualValues, true)
    );

    if ($found !== []) {
        return;
    }

    $failures[] = sprintf(
        'none of the expected %s values were found. Expected one of [%s], got [%s].',
        $label,
        implode(', ', $candidates),
        implode(', ', $actualValues)
    );
}
|
||||
|
||||
/**
 * Appends one failure message per forbidden value that occurs in $actualValues.
 *
 * The forbidden list is passed through normalizeStringList(); an empty result
 * records nothing. Membership is tested strictly.
 *
 * @param array<int, string> $failures      Failure messages, appended to by reference.
 * @param array<int, string> $actualValues  Values observed in the retrieval result.
 * @param mixed              $forbiddenList Raw list of values that must not appear.
 */
private function assertContainsNone(
    array &$failures,
    array $actualValues,
    mixed $forbiddenList,
    string $label
): void {
    // array_filter keeps the original order, so failures are reported in list order.
    $violations = array_filter(
        $this->normalizeStringList($forbiddenList),
        static fn ($entry): bool => in_array($entry, $actualValues, true)
    );

    foreach ($violations as $violation) {
        $failures[] = sprintf(
            'forbidden %s "%s" was present in the retrieval results.',
            $label,
            $violation
        );
    }
}
|
||||
|
||||
/**
 * Returns the distinct terms that containsTerm() finds in the haystack,
 * in their original order and re-indexed from zero.
 *
 * @param array<int, string> $terms
 * @return array<int, string>
 */
private function findMatchingTerms(string $haystack, array $terms): array
{
    $hits = array_filter(
        $terms,
        fn (string $term): bool => $this->containsTerm($haystack, $term)
    );

    // array_filter preserves keys; drop duplicates, then rebuild a plain list.
    return array_values(array_unique($hits));
}
|
||||
|
||||
/**
 * Tells whether the normalized term occurs as a substring of the normalized haystack.
 *
 * Both values go through normalizeText(); a term that normalizes to the
 * empty string never matches.
 */
private function containsTerm(string $haystack, string $term): bool
{
    $needle = $this->normalizeText($term);

    return $needle !== ''
        && str_contains($this->normalizeText($haystack), $needle);
}
|
||||
|
||||
/**
 * Trims the value and lower-cases it.
 *
 * Uses mb_strtolower() when the function is available and falls back to
 * strtolower() otherwise. Blank input yields an empty string.
 */
private function normalizeText(string $value): string
{
    $trimmed = trim($value);

    if ($trimmed === '') {
        return '';
    }

    return function_exists('mb_strtolower')
        ? mb_strtolower($trimmed)
        : strtolower($trimmed);
}
|
||||
|
||||
/**
|
||||
* @param mixed $value
|
||||
* @return array<int, string>
|
||||
|
||||
@@ -1,4 +1,117 @@
|
||||
{"id":"retrieval_exact_doc_001","type":"retrieval","prompt":"Testomat 808","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["26129c01-c09f-4c71-9c80-7ddffb6c77fb"]}}
|
||||
{"id":"retrieval_exact_doc_002","type":"retrieval","prompt":"Testomat EVO CALC","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["74fdad85-5e4e-4f08-8d95-402f3180ed55"]}}
|
||||
{"id":"retrieval_exact_doc_003","type":"retrieval","prompt":"Wasserhärte Grenzwert Testomat","assert":{"selection_mode":"exact_document_title","min_results":1,"must_include_document_ids":["60706498-867b-41b8-8e76-63248178d265"]}}
|
||||
{"id":"retrieval_noise_001","type":"retrieval","prompt":"dsgfsdgfsdgf","assert":{"max_results":0}}
|
||||
{
|
||||
"id": "retrieval_exact_doc_001",
|
||||
"type": "retrieval",
|
||||
"prompt": "phaseaaudit-md",
|
||||
"assert": {
|
||||
"selection_mode_in": [
|
||||
"exact_document_title"
|
||||
],
|
||||
"min_results": 1,
|
||||
"must_include_one_of_document_ids": [
|
||||
"5914508a-5930-4f04-892b-323881d0daa7"
|
||||
],
|
||||
"must_include_any_terms": [
|
||||
"enterprise",
|
||||
"governance",
|
||||
"vector-service"
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
"id": "retrieval_exact_doc_002",
|
||||
"type": "retrieval",
|
||||
"prompt": "ragsystemoverview-md",
|
||||
"assert": {
|
||||
"selection_mode_in": [
|
||||
"exact_document_title"
|
||||
],
|
||||
"min_results": 1,
|
||||
"must_include_one_of_document_ids": [
|
||||
"7513fd82-eec6-4bfa-a730-41820b38b6b4"
|
||||
],
|
||||
"must_include_all_terms": [
|
||||
"rag-system",
|
||||
"dokumente"
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
"id": "retrieval_exact_doc_003",
|
||||
"type": "retrieval",
|
||||
"prompt": "matrixparams-md",
|
||||
"assert": {
|
||||
"selection_mode_in": [
|
||||
"exact_document_title"
|
||||
],
|
||||
"min_results": 1,
|
||||
"must_include_one_of_document_ids": [
|
||||
"25276f4c-32bb-47a5-98b3-9d81aa722d2b"
|
||||
],
|
||||
"must_include_any_terms": [
|
||||
"retrievalmaxchunks",
|
||||
"retrievalvectortopk",
|
||||
"hard_max_chunks"
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
"id": "retrieval_exact_doc_004",
|
||||
"type": "retrieval",
|
||||
"prompt": "readme-md",
|
||||
"assert": {
|
||||
"selection_mode_in": [
|
||||
"exact_document_title"
|
||||
],
|
||||
"min_results": 1,
|
||||
"must_include_one_of_document_ids": [
|
||||
"8abe1f0d-54e6-41ad-967a-9ce8a0efc6da"
|
||||
],
|
||||
"must_include_any_terms": [
|
||||
"deterministisches",
|
||||
"faiss",
|
||||
"vector-service"
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
"id": "retrieval_semantic_001",
|
||||
"type": "retrieval",
|
||||
"prompt": "wie funktioniert das system",
|
||||
"assert": {
|
||||
"min_results": 1,
|
||||
"must_include_one_of_document_ids": [
|
||||
"7513fd82-eec6-4bfa-a730-41820b38b6b4",
|
||||
"8abe1f0d-54e6-41ad-967a-9ce8a0efc6da"
|
||||
],
|
||||
"must_include_any_terms": [
|
||||
"rag-system",
|
||||
"dokumente",
|
||||
"indexierung"
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
"id": "retrieval_semantic_002",
|
||||
"type": "retrieval",
|
||||
"prompt": "welche parameter beeinflussen retrieval",
|
||||
"assert": {
|
||||
"min_results": 1,
|
||||
"must_include_one_of_document_ids": [
|
||||
"25276f4c-32bb-47a5-98b3-9d81aa722d2b",
|
||||
"7513fd82-eec6-4bfa-a730-41820b38b6b4"
|
||||
],
|
||||
"must_include_any_terms": [
|
||||
"retrievalmaxchunks",
|
||||
"vectortopk",
|
||||
"chunk"
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
"id": "retrieval_noise_001",
|
||||
"type": "retrieval",
|
||||
"prompt": "dsgfsdgfsdgf",
|
||||
"assert": {
|
||||
"max_results": 0
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user