p99
This commit is contained in:
@@ -37,7 +37,7 @@ final class AgentEvalRunCommand extends Command
|
||||
->addArgument(
|
||||
'type',
|
||||
InputArgument::OPTIONAL,
|
||||
'Eval type to run',
|
||||
'Eval type to run (retrieval, shop_query, followup, answer_guard)',
|
||||
'retrieval'
|
||||
)
|
||||
->addOption(
|
||||
|
||||
@@ -11,6 +11,8 @@ final readonly class AgentEvalRunner
|
||||
{
|
||||
/**
 * Wires the specialised runners this dispatcher delegates to.
 *
 * @param RetrievalDebugRunner  $retrievalDebugRunner  Runner used for retrieval cases.
 * @param ShopQueryEvalRunner   $shopQueryEvalRunner   Runner used for shop-query and follow-up cases.
 * @param AnswerGuardEvalRunner $answerGuardEvalRunner Runner used for answer-guard cases.
 */
public function __construct(
    private RetrievalDebugRunner $retrievalDebugRunner,
    private ShopQueryEvalRunner $shopQueryEvalRunner,
    private AnswerGuardEvalRunner $answerGuardEvalRunner,
) {}
|
||||
|
||||
@@ -20,6 +22,14 @@ final readonly class AgentEvalRunner
|
||||
return $this->retrievalDebugRunner->run($case);
|
||||
}
|
||||
|
||||
if ($case->isShopQueryCase() || $case->isFollowUpCase()) {
|
||||
return $this->shopQueryEvalRunner->run($case);
|
||||
}
|
||||
|
||||
if ($case->isAnswerGuardCase()) {
|
||||
return $this->answerGuardEvalRunner->run($case);
|
||||
}
|
||||
|
||||
throw new \InvalidArgumentException(sprintf(
|
||||
'Unsupported eval case type: %s',
|
||||
$case->type
|
||||
@@ -40,4 +50,4 @@ final readonly class AgentEvalRunner
|
||||
|
||||
return $results;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
32
src/Eval/AnswerGuardEvalRunner.php
Normal file
32
src/Eval/AnswerGuardEvalRunner.php
Normal file
@@ -0,0 +1,32 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Eval;
|
||||
|
||||
use App\Eval\Dto\EvalCase;
|
||||
use App\Eval\Dto\EvalResult;
|
||||
|
||||
/**
 * Runs "answer_guard" eval cases.
 *
 * The case is evaluated by the retrieval debug runner; its outcome is then
 * re-issued under the case's own type with an additional "guard_scope"
 * detail entry (presumably marking that only retrieval evidence, not a
 * generated answer, was checked - confirm against the eval documentation).
 */
final readonly class AnswerGuardEvalRunner
{
    public function __construct(
        private RetrievalDebugRunner $retrievalDebugRunner,
    ) {}

    /**
     * Executes the case through the retrieval debug runner and relabels the result.
     *
     * @param EvalCase $case Case to evaluate.
     *
     * @return EvalResult Copy of the retrieval result carrying the case's type and
     *                    the extra "guard_scope" detail.
     */
    public function run(EvalCase $case): EvalResult
    {
        $retrieval = $this->retrievalDebugRunner->run($case);

        return new EvalResult(
            caseId: $retrieval->caseId,
            type: $case->type,
            passed: $retrieval->passed,
            durationMs: $retrieval->durationMs,
            failures: $retrieval->failures,
            // array_replace() keeps all existing keys (numeric ones included) and
            // only sets/overwrites the "guard_scope" entry.
            details: array_replace(
                $retrieval->details,
                ['guard_scope' => 'retrieval_evidence_pre_answer'],
            ),
        );
    }
}
|
||||
@@ -8,12 +8,15 @@ final readonly class EvalCase
|
||||
{
|
||||
/**
 * Creates an immutable eval case.
 *
 * @param string                                         $id                 Case identifier.
 * @param string                                         $type               Case type, e.g. "retrieval", "shop_query",
 *                                                                           "followup" or "answer_guard".
 * @param string                                         $prompt             Prompt to evaluate.
 * @param array<string, mixed>                           $assert             Assertion configuration for the case.
 * @param array<int, array{prompt:string,answer:string}> $history            Earlier conversation turns.
 * @param string                                         $requestContextHint Optional request context hint; empty when unset.
 */
public function __construct(
    public string $id,
    public string $type,
    public string $prompt,
    public array $assert = [],
    public array $history = [],
    public string $requestContextHint = '',
) {}
|
||||
|
||||
@@ -26,6 +29,8 @@ final readonly class EvalCase
|
||||
$type = trim((string) ($row['type'] ?? ''));
|
||||
$prompt = trim((string) ($row['prompt'] ?? ''));
|
||||
$assert = is_array($row['assert'] ?? null) ? $row['assert'] : [];
|
||||
$history = self::normalizeHistory($row['history'] ?? []);
|
||||
$requestContextHint = trim((string) ($row['request_context_hint'] ?? ''));
|
||||
|
||||
if ($id === '') {
|
||||
throw new \InvalidArgumentException('Eval case id must not be empty.');
|
||||
@@ -50,6 +55,8 @@ final readonly class EvalCase
|
||||
type: $type,
|
||||
prompt: $prompt,
|
||||
assert: $assert,
|
||||
history: $history,
|
||||
requestContextHint: $requestContextHint,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -57,4 +64,64 @@ final readonly class EvalCase
|
||||
{
|
||||
return $this->type === 'retrieval';
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Tells whether this case has the type "shop_query".
 */
public function isShopQueryCase(): bool
{
    return 'shop_query' === $this->type;
}
|
||||
|
||||
/**
 * Tells whether this case has the type "followup".
 */
public function isFollowUpCase(): bool
{
    return 'followup' === $this->type;
}
|
||||
|
||||
/**
 * Tells whether this case has the type "answer_guard".
 */
public function isAnswerGuardCase(): bool
{
    return 'answer_guard' === $this->type;
}
|
||||
|
||||
/**
 * Normalizes a raw "history" value into a list of prompt/answer turns.
 *
 * Accepted entry shapes:
 *  - a non-empty string: becomes a turn with the placeholder prompt
 *    "Eval-Kontext" and the string as answer;
 *  - an array with "prompt" and "answer" (or "response" as fallback) keys.
 *
 * Anything else is skipped, as are entries whose prompt and answer are both
 * empty after trimming. A missing prompt is replaced by "Eval-Kontext".
 * Non-scalar prompt/answer values (nested arrays, objects) are treated as
 * empty instead of being cast to the literal string "Array" or raising an
 * error.
 *
 * @param mixed $value Raw history value; non-arrays yield an empty list.
 *
 * @return array<int, array{prompt:string,answer:string}>
 */
private static function normalizeHistory(mixed $value): array
{
    if (!is_array($value)) {
        return [];
    }

    // Only scalars have a meaningful string form; everything else counts as empty.
    $toText = static fn (mixed $raw): string => is_scalar($raw) ? trim((string) $raw) : '';

    $history = [];

    foreach ($value as $entry) {
        // A bare string is shorthand for an answer-only turn.
        if (is_string($entry)) {
            $entry = ['answer' => $entry];
        }

        if (!is_array($entry)) {
            continue;
        }

        $prompt = $toText($entry['prompt'] ?? null);
        $answer = $toText($entry['answer'] ?? $entry['response'] ?? null);

        if ($prompt === '' && $answer === '') {
            continue;
        }

        $history[] = [
            'prompt' => $prompt !== '' ? $prompt : 'Eval-Kontext',
            'answer' => $answer,
        ];
    }

    return $history;
}
|
||||
}
|
||||
|
||||
@@ -187,6 +187,25 @@ final readonly class RetrievalDebugRunner
|
||||
}
|
||||
}
|
||||
|
||||
$forbiddenTerms = $this->normalizeStringList($assert['must_not_include_terms'] ?? []);
|
||||
foreach ($forbiddenTerms as $forbiddenTerm) {
|
||||
if ($this->containsTerm($joinedText, $forbiddenTerm)) {
|
||||
$failures[] = sprintf(
|
||||
'forbidden term "%s" was present in the retrieval text.',
|
||||
$forbiddenTerm
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
foreach ($this->normalizeStringList($assert['must_not_match_patterns'] ?? []) as $pattern) {
|
||||
if (@preg_match($pattern, $joinedText) === 1) {
|
||||
$failures[] = sprintf(
|
||||
'forbidden pattern "%s" matched the retrieval text.',
|
||||
$pattern
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return new EvalResult(
|
||||
caseId: $case->id,
|
||||
type: $case->type,
|
||||
@@ -203,6 +222,7 @@ final readonly class RetrievalDebugRunner
|
||||
'chunk_ids' => $chunkIds,
|
||||
'matched_any_terms' => $matchedAnyTerms,
|
||||
'matched_all_terms' => $matchedAllTerms,
|
||||
'forbidden_terms_checked' => $this->normalizeStringList($assert['must_not_include_terms'] ?? []),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
362
src/Eval/ShopQueryEvalRunner.php
Normal file
362
src/Eval/ShopQueryEvalRunner.php
Normal file
@@ -0,0 +1,362 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Eval;
|
||||
|
||||
use App\Agent\AgentRunner;
|
||||
use App\Context\ContextService;
|
||||
use App\Eval\Dto\EvalCase;
|
||||
use App\Eval\Dto\EvalResult;
|
||||
|
||||
/**
 * Evaluates the shop query an agent run produces for an eval case.
 *
 * The runner seeds the case history for a dedicated eval user, consumes the
 * agent output until the first chunk containing the shop-meta marker, extracts
 * the query (or individual queries) from the `<code>` elements of that chunk
 * and checks them against the case's `assert` configuration.
 */
final readonly class ShopQueryEvalRunner
{
    /** Substring identifying the output chunk that carries the shop-query meta block. */
    private const SHOP_META_MARKER = 'retriex-shop-meta';

    /** Substring identifying a meta block that lists several individual queries. */
    private const MULTI_QUERY_MARKER = 'retriex-meta-query--multi';

    /** Maximum number of characters of agent output kept while waiting for the meta chunk. */
    private const TRANSCRIPT_LIMIT = 120000;

    /** Maximum number of characters of the transcript reported in the result details. */
    private const PREVIEW_LIMIT = 1200;

    /** Placeholder prompt for seeded history turns that only have an answer. */
    private const FALLBACK_PROMPT = 'Eval-Kontext';

    public function __construct(
        private AgentRunner $agentRunner,
        private ContextService $contextService,
    ) {}

    /**
     * Runs one case and returns its evaluation result.
     *
     * The eval user's history is deleted before seeding and again after the
     * agent run (in a `finally` block). An exception thrown by the agent run is
     * recorded as a failure instead of being propagated.
     *
     * @param EvalCase $case Case to evaluate.
     *
     * @return EvalResult Result whose `passed` flag is true only when no failure was recorded.
     */
    public function run(EvalCase $case): EvalResult
    {
        $start = microtime(true);
        $failures = [];
        $userId = $this->buildUserId($case);
        $transcript = '';
        $shopMeta = null;

        $this->contextService->deleteHistory($userId);
        $this->seedHistory($userId, $case->history);

        try {
            foreach ($this->agentRunner->run($case->prompt, $userId, false, $case->requestContextHint) as $chunk) {
                if (!is_string($chunk) || $chunk === '') {
                    continue;
                }

                $transcript .= $chunk . "\n";

                if (str_contains($chunk, self::SHOP_META_MARKER)) {
                    // The meta chunk is all the evaluation needs; stop consuming output.
                    $shopMeta = $this->extractShopMeta($chunk);

                    break;
                }

                // Keep only the most recent part of the transcript to bound memory use.
                if (mb_strlen($transcript, 'UTF-8') > self::TRANSCRIPT_LIMIT) {
                    $transcript = mb_substr($transcript, -self::TRANSCRIPT_LIMIT, null, 'UTF-8');
                }
            }
        } catch (\Throwable $e) {
            $failures[] = sprintf('agent run failed before shop-query meta was emitted: %s', $e->getMessage());
        } finally {
            $this->contextService->deleteHistory($userId);
        }

        $durationMs = round((microtime(true) - $start) * 1000, 2);

        if ($shopMeta === null) {
            $failures[] = 'no shop-query meta message was emitted before the runner stopped.';
            // Evaluate the assertions against an empty result so every unmet expectation is reported.
            $shopMeta = [
                'query' => '',
                'individual_queries' => [],
                'raw_html' => '',
            ];
        }

        $this->assertShopQuery($failures, $case, $shopMeta);

        return new EvalResult(
            caseId: $case->id,
            type: $case->type,
            passed: $failures === [],
            durationMs: $durationMs,
            failures: $failures,
            details: [
                'prompt' => $case->prompt,
                'history_turns' => count($case->history),
                'has_request_context_hint' => $case->requestContextHint !== '',
                'query' => $shopMeta['query'],
                'individual_queries' => $shopMeta['individual_queries'],
                'transcript_preview' => $this->previewText($transcript),
            ],
        );
    }

    /**
     * Derives the user id used for this case's history from the case id.
     *
     * Runs of characters other than ASCII letters, digits, "_" and "-" are
     * replaced by "_"; if nothing usable remains, the SHA-1 of the id is used.
     */
    private function buildUserId(EvalCase $case): string
    {
        $safeId = preg_replace('/[^a-zA-Z0-9_-]+/', '_', $case->id) ?? $case->id;
        $safeId = trim($safeId, '_');

        return 'eval_' . ($safeId !== '' ? $safeId : sha1($case->id));
    }

    /**
     * Appends the given turns to the user's history, skipping empty turns.
     *
     * @param array<int, array{prompt:string,answer:string}> $history
     */
    private function seedHistory(string $userId, array $history): void
    {
        foreach ($history as $turn) {
            $prompt = trim($turn['prompt'] ?? '');
            $answer = trim($turn['answer'] ?? '');

            if ($prompt === '' && $answer === '') {
                continue;
            }

            if ($prompt === '') {
                $prompt = self::FALLBACK_PROMPT;
            }

            $this->contextService->appendHistory($userId, $prompt, $answer);
        }
    }

    /**
     * Extracts the shop query data from the HTML of the meta chunk.
     *
     * Every non-empty `<code>` element (tags stripped, entities decoded,
     * whitespace collapsed) counts as one query; duplicates are removed. For a
     * multi-query block all of them are returned as individual queries,
     * otherwise only the first one is returned as the single query.
     *
     * @return array{query:string,individual_queries:array<int,string>,raw_html:string}
     */
    private function extractShopMeta(string $html): array
    {
        $isMultiQuery = str_contains($html, self::MULTI_QUERY_MARKER);
        $codes = [];

        if (preg_match_all('/<code>(.*?)<\/code>/su', $html, $matches) !== false) {
            foreach ($matches[1] ?? [] as $value) {
                $decoded = html_entity_decode(strip_tags((string) $value), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
                $decoded = $this->normalizeOneLine($decoded);

                if ($decoded !== '') {
                    $codes[] = $decoded;
                }
            }
        }

        $codes = array_values(array_unique($codes));

        if ($isMultiQuery) {
            return [
                'query' => '',
                'individual_queries' => $codes,
                'raw_html' => $html,
            ];
        }

        return [
            'query' => $codes[0] ?? '',
            'individual_queries' => [],
            'raw_html' => $html,
        ];
    }

    /**
     * Checks the extracted shop query data against the case's assertions and
     * appends one message to `$failures` per violated assertion.
     *
     * Term and pattern assertions are evaluated against the single query and all
     * individual queries joined by spaces. A regular expression that is invalid
     * or cannot be evaluated is reported as a failure of its own, so a broken
     * pattern can never make an assertion pass silently.
     *
     * @param array<int, string> $failures
     * @param array{query:string,individual_queries:array<int,string>,raw_html:string} $shopMeta
     */
    private function assertShopQuery(array &$failures, EvalCase $case, array $shopMeta): void
    {
        $assert = $case->assert;
        $query = $shopMeta['query'];
        $individualQueries = $shopMeta['individual_queries'];
        $joined = trim($query . ' ' . implode(' ', $individualQueries));

        $expectedQuery = $this->stringOrNull($assert['expected_query'] ?? null);
        if ($expectedQuery !== null && $this->normalizeQuery($query) !== $this->normalizeQuery($expectedQuery)) {
            $failures[] = sprintf(
                'shop query mismatch: expected "%s", got "%s".',
                $expectedQuery,
                $query
            );
        }

        $forbiddenExactQuery = $this->stringOrNull($assert['must_not_equal_query'] ?? null);
        if ($forbiddenExactQuery !== null && $this->normalizeQuery($query) === $this->normalizeQuery($forbiddenExactQuery)) {
            $failures[] = sprintf('shop query must not equal "%s".', $forbiddenExactQuery);
        }

        $expectedIndividualQueries = $this->normalizeStringList($assert['expected_individual_queries'] ?? []);
        foreach ($expectedIndividualQueries as $expectedIndividualQuery) {
            if (!$this->containsNormalizedQuery($individualQueries, $expectedIndividualQuery)) {
                $failures[] = sprintf(
                    'missing expected individual shop query "%s". Got [%s].',
                    $expectedIndividualQuery,
                    implode(', ', $individualQueries)
                );
            }
        }

        if (($assert['expected_individual_queries_exact'] ?? false) === true) {
            // Compare as sets of normalized queries: entries that only differ in
            // case/punctuation collapse to one element on both sides.
            $expected = array_unique(array_map($this->normalizeQuery(...), $expectedIndividualQueries));
            $actual = array_unique(array_map($this->normalizeQuery(...), $individualQueries));

            sort($expected);
            sort($actual);

            if ($expected !== $actual) {
                $failures[] = sprintf(
                    'individual shop queries differ from expected exact set. Expected [%s], got [%s].',
                    implode(', ', $expectedIndividualQueries),
                    implode(', ', $individualQueries)
                );
            }
        }

        if (isset($assert['min_individual_queries']) && count($individualQueries) < (int) $assert['min_individual_queries']) {
            $failures[] = sprintf(
                'too few individual shop queries: expected >= %d, got %d.',
                (int) $assert['min_individual_queries'],
                count($individualQueries)
            );
        }

        if (isset($assert['max_individual_queries']) && count($individualQueries) > (int) $assert['max_individual_queries']) {
            $failures[] = sprintf(
                'too many individual shop queries: expected <= %d, got %d.',
                (int) $assert['max_individual_queries'],
                count($individualQueries)
            );
        }

        foreach ($this->normalizeStringList($assert['must_include_terms'] ?? []) as $term) {
            if (!$this->containsTerm($joined, $term)) {
                $failures[] = sprintf('shop query output does not contain required term "%s".', $term);
            }
        }

        $requiredAnyTerms = $this->normalizeStringList($assert['must_include_any_terms'] ?? []);
        if ($requiredAnyTerms !== []) {
            $matched = false;
            foreach ($requiredAnyTerms as $term) {
                if ($this->containsTerm($joined, $term)) {
                    $matched = true;
                    break;
                }
            }

            if (!$matched) {
                $failures[] = sprintf(
                    'shop query output contains none of the required any-terms: [%s].',
                    implode(', ', $requiredAnyTerms)
                );
            }
        }

        foreach ($this->normalizeStringList($assert['must_not_include_terms'] ?? []) as $term) {
            if ($this->containsTerm($joined, $term)) {
                $failures[] = sprintf('shop query output contains forbidden term "%s".', $term);
            }
        }

        foreach ($this->normalizeStringList($assert['query_must_match_patterns'] ?? []) as $pattern) {
            $matches = $this->matchPattern($pattern, $joined);

            if ($matches === null) {
                $failures[] = sprintf('required pattern "%s" is invalid or could not be evaluated.', $pattern);
            } elseif (!$matches) {
                $failures[] = sprintf('shop query output does not match required pattern "%s".', $pattern);
            }
        }

        foreach ($this->normalizeStringList($assert['query_must_not_match_patterns'] ?? []) as $pattern) {
            $matches = $this->matchPattern($pattern, $joined);

            if ($matches === null) {
                $failures[] = sprintf('forbidden pattern "%s" is invalid or could not be evaluated.', $pattern);
            } elseif ($matches) {
                $failures[] = sprintf('shop query output matches forbidden pattern "%s".', $pattern);
            }
        }
    }

    /**
     * Applies a regular expression to a subject.
     *
     * @return bool|null True on a match, false on no match, null when
     *                   preg_match() reports an error (e.g. the pattern does not compile).
     */
    private function matchPattern(string $pattern, string $subject): ?bool
    {
        // Patterns come from eval case data; the warning emitted for a broken
        // pattern is suppressed here and surfaced to the caller as null instead.
        $result = @preg_match($pattern, $subject);

        return $result === false ? null : $result === 1;
    }

    /**
     * Tells whether any of the queries equals the needle after query normalization.
     *
     * @param array<int, string> $queries
     */
    private function containsNormalizedQuery(array $queries, string $needle): bool
    {
        $needle = $this->normalizeQuery($needle);

        foreach ($queries as $query) {
            if ($this->normalizeQuery($query) === $needle) {
                return true;
            }
        }

        return false;
    }

    /**
     * Case-insensitive substring check on normalized text; an empty term never matches.
     */
    private function containsTerm(string $haystack, string $term): bool
    {
        $haystack = $this->normalizeText($haystack);
        $term = $this->normalizeText($term);

        return $term !== '' && str_contains($haystack, $term);
    }

    /**
     * Normalizes a query for comparison: normalized text with every run of
     * characters that are neither letters nor digits replaced by a single space.
     */
    private function normalizeQuery(string $value): string
    {
        $value = $this->normalizeText($value);
        $value = preg_replace('/[^\p{L}\p{N}]+/u', ' ', $value) ?? $value;
        $value = preg_replace('/\s+/u', ' ', $value) ?? $value;

        return trim($value);
    }

    /**
     * Strips tags, decodes HTML entities, lower-cases and collapses whitespace.
     */
    private function normalizeText(string $value): string
    {
        $value = html_entity_decode(strip_tags($value), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        $value = mb_strtolower(trim($value), 'UTF-8');
        $value = preg_replace('/\s+/u', ' ', $value) ?? $value;

        return trim($value);
    }

    /**
     * Trims the value and collapses every whitespace run into a single space.
     */
    private function normalizeOneLine(string $value): string
    {
        $value = trim($value);
        $value = preg_replace('/\s+/u', ' ', $value) ?? $value;

        return trim($value);
    }

    /**
     * Returns the trimmed string, or null for non-strings and blank strings.
     */
    private function stringOrNull(mixed $value): ?string
    {
        if (!is_string($value)) {
            return null;
        }

        $value = trim($value);

        return $value !== '' ? $value : null;
    }

    /**
     * Reduces a raw value to a list of unique, trimmed, non-empty strings.
     *
     * @return array<int, string> Empty when the value is not an array.
     */
    private function normalizeStringList(mixed $value): array
    {
        if (!is_array($value)) {
            return [];
        }

        $out = [];

        foreach ($value as $item) {
            if (!is_string($item)) {
                continue;
            }

            $item = trim($item);

            if ($item === '') {
                continue;
            }

            $out[] = $item;
        }

        return array_values(array_unique($out));
    }

    /**
     * Returns a single-line preview of the text, cut to the preview limit and
     * suffixed with "..." when it was shortened.
     */
    private function previewText(string $value): string
    {
        $value = $this->normalizeOneLine($value);

        if (mb_strlen($value, 'UTF-8') <= self::PREVIEW_LIMIT) {
            return $value;
        }

        return rtrim(mb_substr($value, 0, self::PREVIEW_LIMIT, 'UTF-8')) . '...';
    }
}
|
||||
Reference in New Issue
Block a user