first test suite retrieval

This commit is contained in:
team2
2026-04-22 22:03:23 +02:00
parent 65e2b1917c
commit 8127d33571
8 changed files with 546 additions and 0 deletions

View File

@@ -0,0 +1,43 @@
<?php
declare(strict_types=1);
namespace App\Eval;
use App\Eval\Dto\EvalCase;
use App\Eval\Dto\EvalResult;
/**
 * Dispatches eval cases to the runner that knows how to execute them.
 *
 * Only retrieval cases are handled; every other case type is rejected.
 */
final readonly class AgentEvalRunner
{
    public function __construct(
        private RetrievalDebugRunner $retrievalDebugRunner,
    ) {
    }

    /**
     * Executes a single eval case.
     *
     * @throws \InvalidArgumentException when the case is not a retrieval case
     */
    public function run(EvalCase $case): EvalResult
    {
        // Guard first: anything that is not a retrieval case has no runner.
        if (!$case->isRetrievalCase()) {
            $message = sprintf('Unsupported eval case type: %s', $case->type);

            throw new \InvalidArgumentException($message);
        }

        return $this->retrievalDebugRunner->run($case);
    }

    /**
     * Executes every case in iteration order and returns the results as a list.
     *
     * @param array<int, EvalCase> $cases
     * @return array<int, EvalResult>
     */
    public function runAll(array $cases): array
    {
        // array_map keeps the input keys, so re-index to get a 0-based list.
        return array_values(array_map(
            fn (EvalCase $case): EvalResult => $this->run($case),
            $cases,
        ));
    }
}

60
src/Eval/Dto/EvalCase.php Normal file
View File

@@ -0,0 +1,60 @@
<?php
declare(strict_types=1);
namespace App\Eval\Dto;
/**
 * Immutable description of one eval case: an identifier, a case type,
 * the prompt to run and an optional map of assertions.
 */
final readonly class EvalCase
{
    /**
     * @param array<string, mixed> $assert
     */
    public function __construct(
        public string $id,
        public string $type,
        public string $prompt,
        public array $assert = [],
    ) {
    }

    /**
     * Builds a case from a decoded row, trimming the string fields and
     * rejecting rows whose id, type or prompt is missing or empty.
     *
     * A non-array "assert" entry is replaced by an empty array.
     *
     * @param array<string, mixed> $row
     *
     * @throws \InvalidArgumentException when id, type or prompt is empty
     */
    public static function fromArray(array $row): self
    {
        $id = self::toTrimmedString($row['id'] ?? null);
        $type = self::toTrimmedString($row['type'] ?? null);
        $prompt = self::toTrimmedString($row['prompt'] ?? null);
        $assert = is_array($row['assert'] ?? null) ? $row['assert'] : [];

        if ($id === '') {
            throw new \InvalidArgumentException('Eval case id must not be empty.');
        }

        if ($type === '') {
            throw new \InvalidArgumentException(sprintf(
                'Eval case "%s" has an empty type.',
                $id
            ));
        }

        if ($prompt === '') {
            throw new \InvalidArgumentException(sprintf(
                'Eval case "%s" has an empty prompt.',
                $id
            ));
        }

        return new self(
            id: $id,
            type: $type,
            prompt: $prompt,
            assert: $assert,
        );
    }

    /**
     * Tells whether this case has the type "retrieval".
     */
    public function isRetrievalCase(): bool
    {
        return $this->type === 'retrieval';
    }

    /**
     * Converts a raw field to a trimmed string.
     *
     * Arrays and non-stringable objects cannot be cast meaningfully (an array
     * would become the literal "Array" plus a warning), so they are treated
     * like a missing value and yield an empty string.
     */
    private static function toTrimmedString(mixed $value): string
    {
        if (is_scalar($value) || $value instanceof \Stringable) {
            return trim((string) $value);
        }

        return '';
    }
}

View File

@@ -0,0 +1,37 @@
<?php
declare(strict_types=1);
namespace App\Eval\Dto;
/**
 * Immutable outcome of one executed eval case.
 */
final readonly class EvalResult
{
    /**
     * @param array<int, string> $failures
     * @param array<string, mixed> $details
     */
    public function __construct(
        public string $caseId,
        public string $type,
        public bool $passed,
        public float $durationMs,
        public array $failures = [],
        public array $details = [],
    ) {
    }

    /**
     * Exports the result as an associative array with snake_case keys.
     *
     * @return array<string, mixed>
     */
    public function toArray(): array
    {
        $export = [];

        // Insertion order defines the key order of the exported array.
        $export['case_id'] = $this->caseId;
        $export['type'] = $this->type;
        $export['passed'] = $this->passed;
        $export['duration_ms'] = $this->durationMs;
        $export['failures'] = $this->failures;
        $export['details'] = $this->details;

        return $export;
    }
}

View File

@@ -0,0 +1,67 @@
<?php
declare(strict_types=1);
namespace App\Eval;
use App\Eval\Dto\EvalCase;
/**
 * Loads eval cases from NDJSON files (one JSON object per line) stored under
 * "<projectDir>/tests/evals/cases/<type>.ndjson".
 */
final readonly class EvalCaseLoader
{
    public function __construct(
        private string $projectDir,
    ) {
    }

    /**
     * Reads and parses the case file for the given type.
     *
     * Blank lines are ignored. Line numbers in error messages refer to the
     * physical line in the file.
     *
     * @return array<int, EvalCase>
     *
     * @throws \RuntimeException when the file is missing, unreadable, or a
     *                           line does not decode to a JSON object/array
     */
    public function load(string $type = 'retrieval'): array
    {
        $path = sprintf(
            '%s/tests/evals/cases/%s.ndjson',
            $this->projectDir,
            $type
        );

        if (!is_file($path)) {
            throw new \RuntimeException(sprintf(
                'Eval case file not found: %s',
                $path
            ));
        }

        // Deliberately no FILE_SKIP_EMPTY_LINES: that flag drops blank lines
        // before indexing, which would make the reported line numbers wrong.
        // Blank lines are skipped inside the loop instead.
        $lines = file($path, FILE_IGNORE_NEW_LINES);

        if ($lines === false) {
            throw new \RuntimeException(sprintf(
                'Failed to read eval case file: %s',
                $path
            ));
        }

        $cases = [];

        foreach ($lines as $index => $line) {
            $line = trim($line);

            if ($line === '') {
                continue;
            }

            $lineNumber = $index + 1;

            try {
                $decoded = json_decode($line, true, 512, JSON_THROW_ON_ERROR);
            } catch (\JsonException $e) {
                // Keep the parser's reason reachable through getPrevious().
                throw $this->invalidJson($path, $lineNumber, $e);
            }

            // Valid JSON scalars (e.g. `42` or `"text"`) are not usable rows.
            if (!is_array($decoded)) {
                throw $this->invalidJson($path, $lineNumber);
            }

            $cases[] = EvalCase::fromArray($decoded);
        }

        return $cases;
    }

    /**
     * Creates the exception reported for a line that is not a usable JSON row.
     */
    private function invalidJson(string $path, int $lineNumber, ?\Throwable $previous = null): \RuntimeException
    {
        return new \RuntimeException(
            sprintf('Invalid JSON in %s on line %d.', $path, $lineNumber),
            0,
            $previous,
        );
    }
}

View File

@@ -0,0 +1,184 @@
<?php
declare(strict_types=1);
namespace App\Eval;
use App\Eval\Dto\EvalCase;
use App\Eval\Dto\EvalResult;
use App\Knowledge\Retrieval\NdjsonHybridRetriever;
/**
 * Runs a retrieval eval case: sends the case prompt to the retriever's debug
 * endpoint, checks the returned rows against the case assertions and reports
 * the outcome as an EvalResult.
 */
final readonly class RetrievalDebugRunner
{
    public function __construct(
        private NdjsonHybridRetriever $retriever,
    ) {
    }

    /**
     * Executes the case and evaluates its assertions.
     *
     * Supported assertion keys: "selection_mode", "route", "intent" (compared
     * as strings against the first row), "min_results", "max_results",
     * "must_include_document_ids" and "must_include_chunk_ids".
     *
     * The reported duration covers only the retriever call.
     */
    public function run(EvalCase $case): EvalResult
    {
        $start = microtime(true);
        // NOTE(review): assumes retrieveDebug() returns a 0-indexed list of
        // associative rows - confirm against NdjsonHybridRetriever.
        $rows = $this->retriever->retrieveDebug($case->prompt);
        $durationMs = round((microtime(true) - $start) * 1000, 2);

        $resultCount = count($rows);
        // A missing or malformed first row is treated as "no metadata".
        $first = is_array($rows[0] ?? null) ? $rows[0] : [];

        $observed = [
            'selection_mode' => $this->extractString($first, 'selection_mode'),
            'route' => $this->extractString($first, 'route'),
            'intent' => $this->extractString($first, 'intent'),
        ];
        $documentIds = $this->extractUniqueStringValues($rows, 'document_id');
        $chunkIds = $this->extractUniqueStringValues($rows, 'chunk_id');

        $assert = $case->assert;
        $failures = [];

        // String-valued fields read from the first row.
        foreach ($observed as $field => $actual) {
            if (isset($assert[$field]) && (string) $assert[$field] !== $actual) {
                $failures[] = sprintf(
                    '%s mismatch: expected "%s", got "%s".',
                    $field,
                    (string) $assert[$field],
                    $actual
                );
            }
        }

        if (isset($assert['min_results']) && $resultCount < (int) $assert['min_results']) {
            $failures[] = sprintf(
                'result_count too low: expected >= %d, got %d.',
                (int) $assert['min_results'],
                $resultCount
            );
        }

        if (isset($assert['max_results']) && $resultCount > (int) $assert['max_results']) {
            $failures[] = sprintf(
                'result_count too high: expected <= %d, got %d.',
                (int) $assert['max_results'],
                $resultCount
            );
        }

        // Required ids: each expected id must appear somewhere in the rows.
        $required = [
            'document_id' => [$assert['must_include_document_ids'] ?? [], $documentIds],
            'chunk_id' => [$assert['must_include_chunk_ids'] ?? [], $chunkIds],
        ];

        foreach ($required as $label => [$expectedRaw, $found]) {
            foreach ($this->normalizeStringList($expectedRaw) as $expectedId) {
                if (!in_array($expectedId, $found, true)) {
                    $failures[] = sprintf(
                        'missing expected %s "%s".',
                        $label,
                        $expectedId
                    );
                }
            }
        }

        return new EvalResult(
            caseId: $case->id,
            type: $case->type,
            passed: $failures === [],
            durationMs: $durationMs,
            failures: $failures,
            details: [
                'prompt' => $case->prompt,
                'result_count' => $resultCount,
                'selection_mode' => $observed['selection_mode'],
                'route' => $observed['route'],
                'intent' => $observed['intent'],
                'document_ids' => $documentIds,
                'chunk_ids' => $chunkIds,
            ],
        );
    }

    /**
     * Returns the trimmed string stored under $key, or '' when the entry is
     * missing or not a string.
     *
     * @param array<string, mixed> $row
     */
    private function extractString(array $row, string $key): string
    {
        $value = $row[$key] ?? null;

        if (!is_string($value)) {
            return '';
        }

        return trim($value);
    }

    /**
     * Collects the distinct, non-empty, trimmed string values found under
     * $key across all rows, in order of first appearance.
     *
     * Values are gathered in a list rather than as array keys: PHP casts
     * numeric-string keys such as "123" to integers, which would make the
     * strict in_array() comparisons in run() fail for numeric ids.
     *
     * @param array<int, array<string, mixed>> $rows
     * @return array<int, string>
     */
    private function extractUniqueStringValues(array $rows, string $key): array
    {
        $values = [];

        foreach ($rows as $row) {
            $value = is_array($row) ? ($row[$key] ?? null) : null;

            if (!is_string($value)) {
                continue;
            }

            $value = trim($value);

            if ($value === '') {
                continue;
            }

            $values[] = $value;
        }

        return array_values(array_unique($values));
    }

    /**
     * Reduces an arbitrary value to a list of distinct, non-empty, trimmed
     * strings; anything that is not an array yields an empty list.
     *
     * @return array<int, string>
     */
    private function normalizeStringList(mixed $value): array
    {
        if (!is_array($value)) {
            return [];
        }

        $out = [];

        foreach ($value as $item) {
            if (!is_string($item)) {
                continue;
            }

            $item = trim($item);

            if ($item === '') {
                continue;
            }

            $out[] = $item;
        }

        return array_values(array_unique($out));
    }
}