434 lines
12 KiB
PHP
434 lines
12 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Eval;
|
|
|
|
use App\Eval\Dto\EvalCase;
|
|
use App\Eval\Dto\EvalResult;
|
|
use App\Knowledge\Retrieval\NdjsonHybridRetriever;
|
|
|
|
/**
 * Runs one retrieval evaluation case against the retriever's debug output and
 * checks the returned rows against the case's declarative assertions.
 *
 * Assertion keys read by {@see self::run()} (all optional):
 *  - selection_mode / route / intent: exact match against the first row.
 *  - selection_mode_in / route_in / intent_in: first-row value must be in the list.
 *  - min_results / max_results: inclusive bounds on the number of rows.
 *  - must_include_document_ids / must_include_chunk_ids: every listed ID must be present.
 *  - must_include_one_of_document_ids / must_include_one_of_chunk_ids: at least one must be present.
 *  - must_not_include_document_ids / must_not_include_chunk_ids: none may be present.
 *  - must_include_any_terms / must_include_all_terms: case-insensitive substring
 *    checks against the concatenated `text` of all rows.
 */
final readonly class RetrievalDebugRunner
{
    public function __construct(
        private NdjsonHybridRetriever $retriever,
    ) {
    }

    /**
     * Executes the retrieval for the case prompt and evaluates all assertions.
     *
     * Only the retriever call is timed; assertion evaluation is excluded from
     * the reported duration.
     *
     * NOTE(review): assumes retrieveDebug() returns a list of associative-array
     * rows and that $case->assert is an array keyed by assertion name — confirm
     * against the retriever and the EvalCase DTO.
     *
     * @return EvalResult Passed when no assertion produced a failure message.
     */
    public function run(EvalCase $case): EvalResult
    {
        $start = microtime(true);
        $rows = $this->retriever->retrieveDebug($case->prompt);
        $durationMs = round((microtime(true) - $start) * 1000, 2);

        $failures = [];
        $resultCount = count($rows);

        // Routing metadata is read from the first row only.
        $first = $rows[0] ?? [];
        $selectionMode = $this->extractString($first, 'selection_mode');
        $route = $this->extractString($first, 'route');
        $intent = $this->extractString($first, 'intent');

        $documentIds = $this->extractUniqueStringValues($rows, 'document_id');
        $chunkIds = $this->extractUniqueStringValues($rows, 'chunk_id');
        $joinedText = $this->extractJoinedText($rows);

        $assert = $case->assert;

        // ---------------------------------------------------------
        // Strict single-value assertions
        // ---------------------------------------------------------
        $this->assertExactValue(
            failures: $failures,
            actual: $selectionMode,
            expected: $assert['selection_mode'] ?? null,
            label: 'selection_mode',
        );

        $this->assertExactValue(
            failures: $failures,
            actual: $route,
            expected: $assert['route'] ?? null,
            label: 'route',
        );

        $this->assertExactValue(
            failures: $failures,
            actual: $intent,
            expected: $assert['intent'] ?? null,
            label: 'intent',
        );

        // ---------------------------------------------------------
        // Flexible multi-value assertions
        // ---------------------------------------------------------
        $this->assertValueInList(
            failures: $failures,
            actual: $selectionMode,
            expectedList: $assert['selection_mode_in'] ?? [],
            label: 'selection_mode',
        );

        $this->assertValueInList(
            failures: $failures,
            actual: $route,
            expectedList: $assert['route_in'] ?? [],
            label: 'route',
        );

        $this->assertValueInList(
            failures: $failures,
            actual: $intent,
            expectedList: $assert['intent_in'] ?? [],
            label: 'intent',
        );

        // ---------------------------------------------------------
        // Result count assertions
        // ---------------------------------------------------------
        if (isset($assert['min_results']) && $resultCount < (int) $assert['min_results']) {
            $failures[] = sprintf(
                'result_count too low: expected >= %d, got %d.',
                (int) $assert['min_results'],
                $resultCount,
            );
        }

        if (isset($assert['max_results']) && $resultCount > (int) $assert['max_results']) {
            $failures[] = sprintf(
                'result_count too high: expected <= %d, got %d.',
                (int) $assert['max_results'],
                $resultCount,
            );
        }

        // ---------------------------------------------------------
        // ID assertions
        // ---------------------------------------------------------
        $this->assertContainsAll(
            failures: $failures,
            actualValues: $documentIds,
            expectedList: $assert['must_include_document_ids'] ?? [],
            label: 'document_id',
        );

        $this->assertContainsAll(
            failures: $failures,
            actualValues: $chunkIds,
            expectedList: $assert['must_include_chunk_ids'] ?? [],
            label: 'chunk_id',
        );

        $this->assertContainsAtLeastOne(
            failures: $failures,
            actualValues: $documentIds,
            expectedList: $assert['must_include_one_of_document_ids'] ?? [],
            label: 'document_id',
        );

        $this->assertContainsAtLeastOne(
            failures: $failures,
            actualValues: $chunkIds,
            expectedList: $assert['must_include_one_of_chunk_ids'] ?? [],
            label: 'chunk_id',
        );

        $this->assertContainsNone(
            failures: $failures,
            actualValues: $documentIds,
            forbiddenList: $assert['must_not_include_document_ids'] ?? [],
            label: 'document_id',
        );

        $this->assertContainsNone(
            failures: $failures,
            actualValues: $chunkIds,
            forbiddenList: $assert['must_not_include_chunk_ids'] ?? [],
            label: 'chunk_id',
        );

        // ---------------------------------------------------------
        // Text / term assertions
        // ---------------------------------------------------------
        // Each term list is normalised once and scanned once; the matched
        // lists feed both the failure checks and the result details.
        $requiredAnyTerms = $this->normalizeStringList($assert['must_include_any_terms'] ?? []);
        $requiredAllTerms = $this->normalizeStringList($assert['must_include_all_terms'] ?? []);

        $matchedAnyTerms = $this->findMatchingTerms(haystack: $joinedText, terms: $requiredAnyTerms);
        $matchedAllTerms = $this->findMatchingTerms(haystack: $joinedText, terms: $requiredAllTerms);

        if ($requiredAnyTerms !== [] && $matchedAnyTerms === []) {
            $failures[] = sprintf(
                'none of the required any-terms were found in the retrieval text: [%s].',
                implode(', ', $requiredAnyTerms),
            );
        }

        // array_diff keeps the order of the required list, so failures are
        // reported in the order the terms were declared.
        foreach (array_diff($requiredAllTerms, $matchedAllTerms) as $missingTerm) {
            $failures[] = sprintf(
                'required all-term "%s" was not found in the retrieval text.',
                $missingTerm,
            );
        }

        return new EvalResult(
            caseId: $case->id,
            type: $case->type,
            passed: $failures === [],
            durationMs: $durationMs,
            failures: $failures,
            details: [
                'prompt' => $case->prompt,
                'result_count' => $resultCount,
                'selection_mode' => $selectionMode,
                'route' => $route,
                'intent' => $intent,
                'document_ids' => $documentIds,
                'chunk_ids' => $chunkIds,
                'matched_any_terms' => $matchedAnyTerms,
                'matched_all_terms' => $matchedAllTerms,
            ],
        );
    }

    /**
     * Returns the trimmed string stored under $key, or '' when the key is
     * missing or its value is not a string.
     *
     * @param array<string, mixed> $row
     */
    private function extractString(array $row, string $key): string
    {
        $value = $row[$key] ?? null;

        return is_string($value) ? trim($value) : '';
    }

    /**
     * Collects the distinct, non-empty, trimmed string values found under $key,
     * in first-seen order. Non-string values are skipped.
     *
     * @param array<int, array<string, mixed>> $rows
     * @return array<int, string>
     */
    private function extractUniqueStringValues(array $rows, string $key): array
    {
        $seen = [];

        foreach ($rows as $row) {
            $value = $row[$key] ?? null;

            if (!is_string($value)) {
                continue;
            }

            $value = trim($value);

            if ($value === '') {
                continue;
            }

            // PHP casts canonical numeric-string keys ("42") to int, so the
            // key is used only for de-duplication. The original string is kept
            // as the value so strict in_array() checks on IDs still match.
            $seen[$value] = $value;
        }

        return array_values($seen);
    }

    /**
     * Joins the trimmed, non-empty `text` field of every row with a blank line.
     * Rows whose `text` is missing or not a string are skipped.
     *
     * @param array<int, array<string, mixed>> $rows
     */
    private function extractJoinedText(array $rows): string
    {
        $parts = [];

        foreach ($rows as $row) {
            $text = $row['text'] ?? null;

            if (!is_string($text)) {
                continue;
            }

            $text = trim($text);

            if ($text !== '') {
                $parts[] = $text;
            }
        }

        return implode("\n\n", $parts);
    }

    /**
     * Records a failure when an expected value is configured and its string
     * form differs from $actual. A null (or absent) expectation is a no-op.
     *
     * @param array<int, string> $failures
     * @param mixed $expected
     */
    private function assertExactValue(
        array &$failures,
        string $actual,
        mixed $expected,
        string $label,
    ): void {
        if ($expected === null) {
            return;
        }

        $expected = (string) $expected;

        if ($expected !== $actual) {
            $failures[] = sprintf(
                '%s mismatch: expected "%s", got "%s".',
                $label,
                $expected,
                $actual,
            );
        }
    }

    /**
     * Records a failure when $actual is not one of the expected values.
     * An empty or invalid expectation list is a no-op.
     *
     * @param array<int, string> $failures
     * @param mixed $expectedList
     */
    private function assertValueInList(
        array &$failures,
        string $actual,
        mixed $expectedList,
        string $label,
    ): void {
        $expected = $this->normalizeStringList($expectedList);

        if ($expected === [] || in_array($actual, $expected, true)) {
            return;
        }

        $failures[] = sprintf(
            '%s mismatch: expected one of [%s], got "%s".',
            $label,
            implode(', ', $expected),
            $actual,
        );
    }

    /**
     * Records one failure per expected value that is absent from $actualValues.
     *
     * @param array<int, string> $failures
     * @param array<int, string> $actualValues
     * @param mixed $expectedList
     */
    private function assertContainsAll(
        array &$failures,
        array $actualValues,
        mixed $expectedList,
        string $label,
    ): void {
        foreach ($this->normalizeStringList($expectedList) as $expectedValue) {
            if (!in_array($expectedValue, $actualValues, true)) {
                $failures[] = sprintf(
                    'missing expected %s "%s".',
                    $label,
                    $expectedValue,
                );
            }
        }
    }

    /**
     * Records a single failure when none of the expected values is present in
     * $actualValues. An empty or invalid expectation list is a no-op.
     *
     * @param array<int, string> $failures
     * @param array<int, string> $actualValues
     * @param mixed $expectedList
     */
    private function assertContainsAtLeastOne(
        array &$failures,
        array $actualValues,
        mixed $expectedList,
        string $label,
    ): void {
        $expected = $this->normalizeStringList($expectedList);

        if ($expected === []) {
            return;
        }

        foreach ($expected as $candidate) {
            if (in_array($candidate, $actualValues, true)) {
                return;
            }
        }

        $failures[] = sprintf(
            'none of the expected %s values were found. Expected one of [%s], got [%s].',
            $label,
            implode(', ', $expected),
            implode(', ', $actualValues),
        );
    }

    /**
     * Records one failure per forbidden value that is present in $actualValues.
     *
     * @param array<int, string> $failures
     * @param array<int, string> $actualValues
     * @param mixed $forbiddenList
     */
    private function assertContainsNone(
        array &$failures,
        array $actualValues,
        mixed $forbiddenList,
        string $label,
    ): void {
        foreach ($this->normalizeStringList($forbiddenList) as $forbiddenValue) {
            if (in_array($forbiddenValue, $actualValues, true)) {
                $failures[] = sprintf(
                    'forbidden %s "%s" was present in the retrieval results.',
                    $label,
                    $forbiddenValue,
                );
            }
        }
    }

    /**
     * Returns the terms that occur in $haystack as case-insensitive substrings,
     * in their original spelling and order, without duplicates.
     *
     * @param array<int, string> $terms
     * @return array<int, string>
     */
    private function findMatchingTerms(string $haystack, array $terms): array
    {
        // Normalise the haystack once rather than once per term.
        $normalizedHaystack = $this->normalizeText($haystack);
        $matches = [];

        foreach ($terms as $term) {
            $needle = $this->normalizeText($term);

            // An empty needle never counts as a match.
            if ($needle !== '' && str_contains($normalizedHaystack, $needle)) {
                $matches[] = $term;
            }
        }

        return array_values(array_unique($matches));
    }

    /**
     * Trims and lower-cases a string for case-insensitive comparison, using
     * mb_strtolower when the mbstring extension is available.
     */
    private function normalizeText(string $value): string
    {
        $value = trim($value);

        if ($value === '') {
            return '';
        }

        if (function_exists('mb_strtolower')) {
            return mb_strtolower($value);
        }

        return strtolower($value);
    }

    /**
     * Converts an arbitrary assertion value into a list of distinct, trimmed,
     * non-empty strings. Non-array input yields an empty list and non-string
     * items are dropped.
     *
     * @param mixed $value
     * @return array<int, string>
     */
    private function normalizeStringList(mixed $value): array
    {
        if (!is_array($value)) {
            return [];
        }

        $out = [];

        foreach ($value as $item) {
            if (!is_string($item)) {
                continue;
            }

            $item = trim($item);

            if ($item !== '') {
                $out[] = $item;
            }
        }

        return array_values(array_unique($out));
    }
}