Merge branch 'optimizeTags' into optimizeSystem

# Conflicts:
#	src/Config/NdjsonHybridRetrieverConfig.php
This commit is contained in:
team 1
2026-04-23 08:41:51 +02:00
11 changed files with 1001 additions and 3 deletions

View File

@@ -0,0 +1,179 @@
<?php
declare(strict_types=1);
namespace App\Command;
use App\Eval\AgentEvalRunner;
use App\Eval\Dto\EvalCase;
use App\Eval\Dto\EvalResult;
use App\Eval\EvalCaseLoader;
use App\Eval\EvalReportWriter;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
/**
 * Console entry point that loads eval cases, runs them and reports the outcome.
 *
 * Exit code is Command::FAILURE when loading, running or report writing throws,
 * when a requested --case id does not exist, or when at least one case failed.
 * It is Command::SUCCESS when every selected case passed or when the case file
 * contains no cases at all.
 */
#[AsCommand(
    name: 'mto:agent:eval:run',
    description: 'Run versioned eval cases for RetrieX'
)]
final class AgentEvalRunCommand extends Command
{
    public function __construct(
        private readonly EvalCaseLoader $loader,
        private readonly AgentEvalRunner $runner,
        private readonly EvalReportWriter $reportWriter,
    ) {
        parent::__construct();
    }

    /**
     * Declares the optional "type" argument (default "retrieval") and the
     * --case, --json and --no-write options.
     */
    protected function configure(): void
    {
        $this
            ->addArgument(
                'type',
                InputArgument::OPTIONAL,
                'Eval type to run',
                'retrieval'
            )
            ->addOption(
                'case',
                null,
                // A bare "--case" must not silently fall back to "run everything".
                InputOption::VALUE_REQUIRED,
                'Run only a single case by id'
            )
            ->addOption(
                'json',
                null,
                InputOption::VALUE_NONE,
                'Print the full report as JSON'
            )
            ->addOption(
                'no-write',
                null,
                InputOption::VALUE_NONE,
                'Do not write the report file'
            );
    }

    /**
     * Loads the cases for the requested type, optionally narrows them to one id,
     * runs them, writes the report (unless --no-write) and prints either the
     * JSON report or a human-readable summary.
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $io = new SymfonyStyle($input, $output);
        $type = trim((string) $input->getArgument('type'));
        $caseId = trim((string) $input->getOption('case'));
        $asJson = (bool) $input->getOption('json');
        $noWrite = (bool) $input->getOption('no-write');

        try {
            $cases = $this->loader->load($type);
        } catch (\Throwable $e) {
            $io->error($e->getMessage());

            return Command::FAILURE;
        }

        if ($caseId !== '') {
            $cases = array_values(array_filter(
                $cases,
                static fn (EvalCase $case): bool => $case->id === $caseId
            ));

            // An explicit filter that matches nothing is almost certainly a typo;
            // reporting success here would hide it from CI.
            if ($cases === []) {
                $io->error(sprintf(
                    'No eval case with id "%s" found for type "%s".',
                    $caseId,
                    $type
                ));

                return Command::FAILURE;
            }
        }

        if ($cases === []) {
            $io->warning('No eval cases selected.');

            return Command::SUCCESS;
        }

        try {
            $results = $this->runner->runAll($cases);
        } catch (\Throwable $e) {
            $io->error($e->getMessage());

            return Command::FAILURE;
        }

        $total = count($results);
        $passed = count(array_filter(
            $results,
            static fn (EvalResult $result): bool => $result->passed
        ));
        $failed = $total - $passed;
        $exitCode = $failed > 0 ? Command::FAILURE : Command::SUCCESS;

        $report = [
            'type' => $type,
            'case_filter' => $caseId !== '' ? $caseId : null,
            'total' => $total,
            'passed' => $passed,
            'failed' => $failed,
            'generated_at' => (new \DateTimeImmutable())->format(\DateTimeInterface::ATOM),
            'results' => array_map(
                static fn (EvalResult $result): array => $result->toArray(),
                $results
            ),
        ];

        $writtenPath = null;
        if (!$noWrite) {
            try {
                $writtenPath = $this->reportWriter->write($report);
            } catch (\Throwable $e) {
                $io->error($e->getMessage());

                return Command::FAILURE;
            }
        }

        if ($asJson) {
            $jsonReport = $report;
            // The file location is only part of the printed report, not of the file itself.
            if ($writtenPath !== null) {
                $jsonReport['written_to'] = $writtenPath;
            }

            $json = json_encode(
                $jsonReport,
                JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
            );
            if (!is_string($json)) {
                $io->error('json_encode failed.');

                return Command::FAILURE;
            }

            // Written through the raw output so no SymfonyStyle decoration is added.
            $output->writeln($json);

            return $exitCode;
        }

        $io->title('RetrieX Eval Run');
        $io->definitionList(
            ['type' => $type],
            ['total' => (string) $total],
            ['passed' => (string) $passed],
            ['failed' => (string) $failed],
            ['report_file' => $writtenPath ?? 'disabled (--no-write)']
        );

        foreach ($results as $result) {
            if ($result->passed) {
                $io->writeln(sprintf('<info>PASS</info> %s', $result->caseId));
                continue;
            }

            $io->writeln(sprintf('<error>FAIL</error> %s', $result->caseId));
            foreach ($result->failures as $failure) {
                $io->writeln(' - ' . $failure);
            }
        }

        return $exitCode;
    }
}

View File

@@ -34,7 +34,7 @@ final class NdjsonHybridRetrieverConfig
* - the system now has more safeguards: * - the system now has more safeguards:
* lexical cross-signals, scoped retrieval, title/meta boost, selection rules * lexical cross-signals, scoped retrieval, title/meta boost, selection rules
*/ */
public const VECTOR_SCORE_THRESHOLD = 0.83; public const VECTOR_SCORE_THRESHOLD = 0.82;
/** /**
* Lower safety boundary for dynamic threshold adjustments. * Lower safety boundary for dynamic threshold adjustments.
@@ -79,7 +79,7 @@ final class NdjsonHybridRetrieverConfig
* - slightly larger safety net for the richer hybrid stack * - slightly larger safety net for the richer hybrid stack
* - helps no-tag and low-signal cases without exploding context * - helps no-tag and low-signal cases without exploding context
*/ */
public const EMPTY_RRF_FALLBACK_TOPN = 5; public const EMPTY_RRF_FALLBACK_TOPN = 1;
/** /**
* Maximum number of chunks allowed from one document in spread mode. * Maximum number of chunks allowed from one document in spread mode.

View File

@@ -0,0 +1,43 @@
<?php
declare(strict_types=1);
namespace App\Eval;
use App\Eval\Dto\EvalCase;
use App\Eval\Dto\EvalResult;
/**
 * Dispatches eval cases to the runner that handles their type.
 *
 * Retrieval cases are delegated to RetrievalDebugRunner; every other type is rejected.
 */
final readonly class AgentEvalRunner
{
    public function __construct(
        private RetrievalDebugRunner $retrievalDebugRunner,
    ) {
    }

    /**
     * Executes a single eval case.
     *
     * @throws \InvalidArgumentException when the case is not a retrieval case
     */
    public function run(EvalCase $case): EvalResult
    {
        if (!$case->isRetrievalCase()) {
            throw new \InvalidArgumentException(sprintf(
                'Unsupported eval case type: %s',
                $case->type
            ));
        }

        return $this->retrievalDebugRunner->run($case);
    }

    /**
     * Executes the given cases one after another, in input order.
     *
     * The returned list is re-indexed from 0 regardless of the input keys.
     * An exception raised for any case aborts the remaining ones.
     *
     * @param array<int, EvalCase> $cases
     * @return array<int, EvalResult>
     */
    public function runAll(array $cases): array
    {
        return array_values(array_map($this->run(...), $cases));
    }
}

60
src/Eval/Dto/EvalCase.php Normal file
View File

@@ -0,0 +1,60 @@
<?php
declare(strict_types=1);
namespace App\Eval\Dto;
/**
 * Immutable description of one eval case: identifier, case type, prompt and
 * the assertion rules to check the outcome against.
 */
final readonly class EvalCase
{
    /**
     * @param array<string, mixed> $assert
     */
    public function __construct(
        public string $id,
        public string $type,
        public string $prompt,
        public array $assert = [],
    ) {
    }

    /**
     * Builds a case from a decoded row.
     *
     * "id", "type" and "prompt" are trimmed and must be non-empty. A missing or
     * non-array "assert" entry is treated as "no assertions".
     *
     * @param array<string, mixed> $row
     *
     * @throws \InvalidArgumentException when id, type or prompt is missing, empty or not a scalar
     */
    public static function fromArray(array $row): self
    {
        $id = self::scalarField($row, 'id');
        $type = self::scalarField($row, 'type');
        $prompt = self::scalarField($row, 'prompt');
        $assert = is_array($row['assert'] ?? null) ? $row['assert'] : [];

        if ($id === '') {
            throw new \InvalidArgumentException('Eval case id must not be empty.');
        }

        if ($type === '') {
            throw new \InvalidArgumentException(sprintf(
                'Eval case "%s" has an empty type.',
                $id
            ));
        }

        if ($prompt === '') {
            throw new \InvalidArgumentException(sprintf(
                'Eval case "%s" has an empty prompt.',
                $id
            ));
        }

        return new self(
            id: $id,
            type: $type,
            prompt: $prompt,
            assert: $assert,
        );
    }

    /**
     * Tells whether this case has the type "retrieval".
     */
    public function isRetrievalCase(): bool
    {
        return $this->type === 'retrieval';
    }

    /**
     * Reads a field as a trimmed string.
     *
     * Only scalars are cast. Casting an array would produce the literal string
     * "Array" (plus a warning) and slip through the emptiness checks, so any
     * non-scalar value is reported as empty instead.
     *
     * @param array<string, mixed> $row
     */
    private static function scalarField(array $row, string $key): string
    {
        $value = $row[$key] ?? null;

        return is_scalar($value) ? trim((string) $value) : '';
    }
}

View File

@@ -0,0 +1,37 @@
<?php
declare(strict_types=1);
namespace App\Eval\Dto;
/**
 * Immutable outcome of one executed eval case.
 */
final readonly class EvalResult
{
    /**
     * @param array<int, string> $failures human-readable reasons why the case did not pass
     * @param array<string, mixed> $details additional data recorded for the report
     */
    public function __construct(
        public string $caseId,
        public string $type,
        public bool $passed,
        public float $durationMs,
        public array $failures = [],
        public array $details = [],
    ) {
    }

    /**
     * Exports the result as a plain array with snake_case keys.
     *
     * @return array<string, mixed>
     */
    public function toArray(): array
    {
        $export = [];
        $export['case_id'] = $this->caseId;
        $export['type'] = $this->type;
        $export['passed'] = $this->passed;
        $export['duration_ms'] = $this->durationMs;
        $export['failures'] = $this->failures;
        $export['details'] = $this->details;

        return $export;
    }
}

View File

@@ -0,0 +1,67 @@
<?php
declare(strict_types=1);
namespace App\Eval;
use App\Eval\Dto\EvalCase;
/**
 * Loads eval cases from "<projectDir>/tests/evals/cases/<type>.ndjson".
 */
final readonly class EvalCaseLoader
{
    public function __construct(
        private string $projectDir,
    ) {
    }

    /**
     * Reads every case record of the given type.
     *
     * A record is a JSON object (or array). It normally occupies one line, but a
     * record that is spread over several consecutive lines (pretty-printed) is
     * accepted as well. Blank lines are ignored.
     *
     * @return array<int, EvalCase>
     *
     * @throws \RuntimeException when the file is missing, unreadable or contains invalid JSON;
     *                           the reported line is the line on which the broken record starts
     * @throws \InvalidArgumentException when a decoded record is rejected by EvalCase::fromArray()
     */
    public function load(string $type = 'retrieval'): array
    {
        $path = sprintf(
            '%s/tests/evals/cases/%s.ndjson',
            $this->projectDir,
            $type
        );

        if (!is_file($path)) {
            throw new \RuntimeException(sprintf(
                'Eval case file not found: %s',
                $path
            ));
        }

        // Empty lines are kept here (and skipped in the loop) so that array
        // indexes still correspond to the real line numbers of the file.
        $lines = file($path, FILE_IGNORE_NEW_LINES);
        if ($lines === false) {
            throw new \RuntimeException(sprintf(
                'Failed to read eval case file: %s',
                $path
            ));
        }

        $cases = [];
        $buffer = '';
        $recordLine = 0;

        foreach ($lines as $index => $line) {
            $line = trim($line);
            if ($line === '') {
                continue;
            }

            if ($buffer === '') {
                $recordLine = $index + 1;

                // A record can only be an object or an array; anything else can
                // never become valid by appending more lines, so fail right away.
                if ($line[0] !== '{' && $line[0] !== '[') {
                    throw $this->invalidJson($path, $recordLine);
                }

                $buffer = $line;
            } else {
                $buffer .= "\n" . $line;
            }

            // A proper prefix of a JSON object/array is never itself valid JSON,
            // so the first successful decode marks the end of the record.
            $decoded = json_decode($buffer, true);
            if (!is_array($decoded)) {
                continue;
            }

            $cases[] = EvalCase::fromArray($decoded);
            $buffer = '';
        }

        if ($buffer !== '') {
            throw $this->invalidJson($path, $recordLine);
        }

        return $cases;
    }

    /**
     * Builds the exception reported for a record that cannot be decoded.
     */
    private function invalidJson(string $path, int $line): \RuntimeException
    {
        return new \RuntimeException(sprintf(
            'Invalid JSON in %s on line %d.',
            $path,
            $line
        ));
    }
}

View File

@@ -0,0 +1,59 @@
<?php
declare(strict_types=1);
namespace App\Eval;
/**
 * Persists eval reports as pretty-printed JSON below
 * "<projectDir>/tests/evals/reports".
 */
final readonly class EvalReportWriter
{
    public function __construct(
        private string $projectDir,
    ) {
    }

    /**
     * Writes the report and returns the path of the written file.
     *
     * The JSON is first written to "<target>.tmp" and then renamed onto the
     * target, so the target is only replaced once the new content is complete.
     *
     * @param array<string, mixed> $report
     *
     * @throws \RuntimeException when the directory cannot be created, the report cannot be
     *                           encoded, or the file cannot be written or moved into place
     */
    public function write(array $report, string $filename = 'last-run.json'): string
    {
        $directory = $this->projectDir . '/tests/evals/reports';
        $this->ensureDirectory($directory);

        // Leading slashes are dropped so the name always resolves inside the directory.
        $target = $directory . '/' . ltrim($filename, '/');

        $encoded = json_encode(
            $report,
            JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
        );
        if (!is_string($encoded)) {
            throw new \RuntimeException('json_encode failed for eval report.');
        }

        $staging = $target . '.tmp';
        if (false === file_put_contents($staging, $encoded)) {
            throw new \RuntimeException(sprintf(
                'Failed to write temporary eval report file: %s',
                $staging
            ));
        }

        if (rename($staging, $target)) {
            return $target;
        }

        // Best-effort cleanup of the staging file; the rename failure is what gets reported.
        @unlink($staging);

        throw new \RuntimeException(sprintf(
            'Failed to move temporary eval report into place: %s',
            $target
        ));
    }

    /**
     * Creates the directory (recursively, mode 0775) unless it already exists.
     *
     * @throws \RuntimeException when the directory does not exist after the attempt
     */
    private function ensureDirectory(string $directory): void
    {
        if (is_dir($directory)) {
            return;
        }

        if (mkdir($directory, 0775, true)) {
            return;
        }

        // mkdir() also fails when something else created the directory in the meantime.
        if (is_dir($directory)) {
            return;
        }

        throw new \RuntimeException(sprintf(
            'Failed to create eval report directory: %s',
            $directory
        ));
    }
}

View File

@@ -0,0 +1,434 @@
<?php
declare(strict_types=1);
namespace App\Eval;
use App\Eval\Dto\EvalCase;
use App\Eval\Dto\EvalResult;
use App\Knowledge\Retrieval\NdjsonHybridRetriever;
/**
 * Runs one retrieval eval case: sends the prompt through the retriever's debug
 * entry point and checks the returned rows against the case's assertion rules.
 */
final readonly class RetrievalDebugRunner
{
    public function __construct(
        private NdjsonHybridRetriever $retriever,
    ) {
    }

    /**
     * Executes the case and collects every violated assertion.
     *
     * Supported keys of the case's "assert" map:
     *  - selection_mode, route, intent: exact match against the first row
     *  - selection_mode_in, route_in, intent_in: first-row value must be in the list
     *  - min_results, max_results: bounds on the number of returned rows
     *  - must_include_document_ids, must_include_chunk_ids: every id must be present
     *  - must_include_one_of_document_ids, must_include_one_of_chunk_ids: at least one id present
     *  - must_not_include_document_ids, must_not_include_chunk_ids: none of the ids present
     *  - must_include_any_terms, must_include_all_terms: case-insensitive substring
     *    checks against the joined "text" of all rows
     *
     * Absent keys, and list rules that contain no usable string, are not checked.
     * The measured duration covers only the retriever call.
     */
    public function run(EvalCase $case): EvalResult
    {
        $startedAt = microtime(true);
        $rows = $this->retriever->retrieveDebug($case->prompt);
        $durationMs = round((microtime(true) - $startedAt) * 1000, 2);

        $assert = $case->assert;
        $failures = [];
        $resultCount = count($rows);

        // selection_mode / route / intent are read from the first row only.
        $first = $rows[0] ?? [];
        $selectionMode = $this->extractString($first, 'selection_mode');
        $route = $this->extractString($first, 'route');
        $intent = $this->extractString($first, 'intent');

        $documentIds = $this->extractUniqueStringValues($rows, 'document_id');
        $chunkIds = $this->extractUniqueStringValues($rows, 'chunk_id');

        // Normalized once here instead of once per term.
        $searchText = $this->normalizeText($this->extractJoinedText($rows));

        // ---------------------------------------------------------
        // Strict single-value assertions
        // ---------------------------------------------------------
        $this->assertExactValue(
            failures: $failures,
            actual: $selectionMode,
            expected: $assert['selection_mode'] ?? null,
            label: 'selection_mode'
        );
        $this->assertExactValue(
            failures: $failures,
            actual: $route,
            expected: $assert['route'] ?? null,
            label: 'route'
        );
        $this->assertExactValue(
            failures: $failures,
            actual: $intent,
            expected: $assert['intent'] ?? null,
            label: 'intent'
        );

        // ---------------------------------------------------------
        // Flexible multi-value assertions
        // ---------------------------------------------------------
        $this->assertValueInList(
            failures: $failures,
            actual: $selectionMode,
            expectedList: $assert['selection_mode_in'] ?? [],
            label: 'selection_mode'
        );
        $this->assertValueInList(
            failures: $failures,
            actual: $route,
            expectedList: $assert['route_in'] ?? [],
            label: 'route'
        );
        $this->assertValueInList(
            failures: $failures,
            actual: $intent,
            expectedList: $assert['intent_in'] ?? [],
            label: 'intent'
        );

        // ---------------------------------------------------------
        // Result count assertions
        // ---------------------------------------------------------
        if (isset($assert['min_results']) && $resultCount < (int) $assert['min_results']) {
            $failures[] = sprintf(
                'result_count too low: expected >= %d, got %d.',
                (int) $assert['min_results'],
                $resultCount
            );
        }

        if (isset($assert['max_results']) && $resultCount > (int) $assert['max_results']) {
            $failures[] = sprintf(
                'result_count too high: expected <= %d, got %d.',
                (int) $assert['max_results'],
                $resultCount
            );
        }

        // ---------------------------------------------------------
        // ID assertions
        // ---------------------------------------------------------
        $this->assertContainsAll(
            failures: $failures,
            actualValues: $documentIds,
            expectedList: $assert['must_include_document_ids'] ?? [],
            label: 'document_id'
        );
        $this->assertContainsAll(
            failures: $failures,
            actualValues: $chunkIds,
            expectedList: $assert['must_include_chunk_ids'] ?? [],
            label: 'chunk_id'
        );
        $this->assertContainsAtLeastOne(
            failures: $failures,
            actualValues: $documentIds,
            expectedList: $assert['must_include_one_of_document_ids'] ?? [],
            label: 'document_id'
        );
        $this->assertContainsAtLeastOne(
            failures: $failures,
            actualValues: $chunkIds,
            expectedList: $assert['must_include_one_of_chunk_ids'] ?? [],
            label: 'chunk_id'
        );
        $this->assertContainsNone(
            failures: $failures,
            actualValues: $documentIds,
            forbiddenList: $assert['must_not_include_document_ids'] ?? [],
            label: 'document_id'
        );
        $this->assertContainsNone(
            failures: $failures,
            actualValues: $chunkIds,
            forbiddenList: $assert['must_not_include_chunk_ids'] ?? [],
            label: 'chunk_id'
        );

        // ---------------------------------------------------------
        // Text / term assertions
        // ---------------------------------------------------------
        $requiredAnyTerms = $this->normalizeStringList($assert['must_include_any_terms'] ?? []);
        $requiredAllTerms = $this->normalizeStringList($assert['must_include_all_terms'] ?? []);
        $matchedAnyTerms = $this->findMatchingTerms($searchText, $requiredAnyTerms);
        $matchedAllTerms = $this->findMatchingTerms($searchText, $requiredAllTerms);

        if ($requiredAnyTerms !== [] && $matchedAnyTerms === []) {
            $failures[] = sprintf(
                'none of the required any-terms were found in the retrieval text: [%s].',
                implode(', ', $requiredAnyTerms)
            );
        }

        foreach ($requiredAllTerms as $requiredTerm) {
            if (!in_array($requiredTerm, $matchedAllTerms, true)) {
                $failures[] = sprintf(
                    'required all-term "%s" was not found in the retrieval text.',
                    $requiredTerm
                );
            }
        }

        return new EvalResult(
            caseId: $case->id,
            type: $case->type,
            passed: $failures === [],
            durationMs: $durationMs,
            failures: $failures,
            details: [
                'prompt' => $case->prompt,
                'result_count' => $resultCount,
                'selection_mode' => $selectionMode,
                'route' => $route,
                'intent' => $intent,
                'document_ids' => $documentIds,
                'chunk_ids' => $chunkIds,
                'matched_any_terms' => $matchedAnyTerms,
                'matched_all_terms' => $matchedAllTerms,
            ],
        );
    }

    /**
     * Returns the trimmed string stored under $key, or '' when it is absent or not a string.
     *
     * @param array<string, mixed> $row
     */
    private function extractString(array $row, string $key): string
    {
        $value = $row[$key] ?? null;

        return is_string($value) ? trim($value) : '';
    }

    /**
     * Collects the distinct, non-empty, trimmed string values stored under $key,
     * in order of first appearance.
     *
     * The values are returned from a list rather than from array keys: PHP casts
     * integer-like string keys ("42") to int, which would turn such ids into
     * integers and make the strict comparisons against expected ids fail.
     *
     * @param array<int, array<string, mixed>> $rows
     * @return array<int, string>
     */
    private function extractUniqueStringValues(array $rows, string $key): array
    {
        $seen = [];
        $values = [];

        foreach ($rows as $row) {
            $value = $row[$key] ?? null;
            if (!is_string($value)) {
                continue;
            }

            $value = trim($value);
            if ($value === '' || isset($seen[$value])) {
                continue;
            }

            $seen[$value] = true;
            $values[] = $value;
        }

        return $values;
    }

    /**
     * Joins the non-empty, trimmed "text" values of all rows with blank lines.
     *
     * @param array<int, array<string, mixed>> $rows
     */
    private function extractJoinedText(array $rows): string
    {
        $parts = [];

        foreach ($rows as $row) {
            $text = $row['text'] ?? null;
            if (!is_string($text)) {
                continue;
            }

            $text = trim($text);
            if ($text !== '') {
                $parts[] = $text;
            }
        }

        return implode("\n\n", $parts);
    }

    /**
     * Records a failure when $expected is set (not null) and its string form differs from $actual.
     *
     * @param array<int, string> $failures
     */
    private function assertExactValue(
        array &$failures,
        string $actual,
        mixed $expected,
        string $label
    ): void {
        if ($expected === null) {
            return;
        }

        $expected = (string) $expected;
        if ($expected !== $actual) {
            $failures[] = sprintf(
                '%s mismatch: expected "%s", got "%s".',
                $label,
                $expected,
                $actual
            );
        }
    }

    /**
     * Records a failure when $actual is not one of the expected values.
     *
     * @param array<int, string> $failures
     */
    private function assertValueInList(
        array &$failures,
        string $actual,
        mixed $expectedList,
        string $label
    ): void {
        $expected = $this->normalizeStringList($expectedList);
        if ($expected === [] || in_array($actual, $expected, true)) {
            return;
        }

        $failures[] = sprintf(
            '%s mismatch: expected one of [%s], got "%s".',
            $label,
            implode(', ', $expected),
            $actual
        );
    }

    /**
     * Records one failure per expected value that is missing from $actualValues.
     *
     * @param array<int, string> $failures
     * @param array<int, string> $actualValues
     */
    private function assertContainsAll(
        array &$failures,
        array $actualValues,
        mixed $expectedList,
        string $label
    ): void {
        foreach ($this->normalizeStringList($expectedList) as $expectedValue) {
            if (!in_array($expectedValue, $actualValues, true)) {
                $failures[] = sprintf(
                    'missing expected %s "%s".',
                    $label,
                    $expectedValue
                );
            }
        }
    }

    /**
     * Records a failure when none of the expected values occurs in $actualValues.
     *
     * @param array<int, string> $failures
     * @param array<int, string> $actualValues
     */
    private function assertContainsAtLeastOne(
        array &$failures,
        array $actualValues,
        mixed $expectedList,
        string $label
    ): void {
        $expected = $this->normalizeStringList($expectedList);
        if ($expected === []) {
            return;
        }

        foreach ($expected as $candidate) {
            if (in_array($candidate, $actualValues, true)) {
                return;
            }
        }

        $failures[] = sprintf(
            'none of the expected %s values were found. Expected one of [%s], got [%s].',
            $label,
            implode(', ', $expected),
            implode(', ', $actualValues)
        );
    }

    /**
     * Records one failure per forbidden value that occurs in $actualValues.
     *
     * @param array<int, string> $failures
     * @param array<int, string> $actualValues
     */
    private function assertContainsNone(
        array &$failures,
        array $actualValues,
        mixed $forbiddenList,
        string $label
    ): void {
        foreach ($this->normalizeStringList($forbiddenList) as $forbiddenValue) {
            if (in_array($forbiddenValue, $actualValues, true)) {
                $failures[] = sprintf(
                    'forbidden %s "%s" was present in the retrieval results.',
                    $label,
                    $forbiddenValue
                );
            }
        }
    }

    /**
     * Returns the terms that occur in the haystack, compared case-insensitively.
     *
     * @param string $normalizedHaystack text that already went through normalizeText()
     * @param array<int, string> $terms de-duplicated terms as produced by normalizeStringList()
     * @return array<int, string> matching terms in their original spelling and order
     */
    private function findMatchingTerms(string $normalizedHaystack, array $terms): array
    {
        $matches = [];

        foreach ($terms as $term) {
            $needle = $this->normalizeText($term);
            if ($needle !== '' && str_contains($normalizedHaystack, $needle)) {
                $matches[] = $term;
            }
        }

        return $matches;
    }

    /**
     * Trims and lower-cases the value, multibyte-aware when mbstring is available.
     */
    private function normalizeText(string $value): string
    {
        $value = trim($value);
        if ($value === '') {
            return '';
        }

        return function_exists('mb_strtolower')
            ? mb_strtolower($value)
            : strtolower($value);
    }

    /**
     * Reduces an arbitrary value to a list of distinct, trimmed, non-empty strings.
     *
     * Non-array input yields an empty list; non-string items are dropped.
     *
     * @return array<int, string>
     */
    private function normalizeStringList(mixed $value): array
    {
        if (!is_array($value)) {
            return [];
        }

        $out = [];
        foreach ($value as $item) {
            if (!is_string($item)) {
                continue;
            }

            $item = trim($item);
            if ($item !== '') {
                $out[] = $item;
            }
        }

        return array_values(array_unique($out));
    }
}

View File

@@ -366,7 +366,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$rawScores = $fused['raw_scores']; $rawScores = $fused['raw_scores'];
if ($rrfScores === [] && $globalHits !== []) { if ($rrfScores === [] && $globalHits !== []) {
$rrfScores = $this->fallbackRrfFromHits($globalHits); // $rrfScores = $this->fallbackRrfFromHits($globalHits);
} }
if ($rrfScores === []) { if ($rrfScores === []) {

View File

@@ -0,0 +1,117 @@
{
"id": "retrieval_exact_doc_001",
"type": "retrieval",
"prompt": "phaseaaudit-md",
"assert": {
"selection_mode_in": [
"exact_document_title"
],
"min_results": 1,
"must_include_one_of_document_ids": [
"5914508a-5930-4f04-892b-323881d0daa7"
],
"must_include_any_terms": [
"enterprise",
"governance",
"vector-service"
]
}
}
{
"id": "retrieval_exact_doc_002",
"type": "retrieval",
"prompt": "ragsystemoverview-md",
"assert": {
"selection_mode_in": [
"exact_document_title"
],
"min_results": 1,
"must_include_one_of_document_ids": [
"7513fd82-eec6-4bfa-a730-41820b38b6b4"
],
"must_include_all_terms": [
"rag-system",
"dokumente"
]
}
}
{
"id": "retrieval_exact_doc_003",
"type": "retrieval",
"prompt": "matrixparams-md",
"assert": {
"selection_mode_in": [
"exact_document_title"
],
"min_results": 1,
"must_include_one_of_document_ids": [
"25276f4c-32bb-47a5-98b3-9d81aa722d2b"
],
"must_include_any_terms": [
"retrievalmaxchunks",
"retrievalvectortopk",
"hard_max_chunks"
]
}
}
{
"id": "retrieval_exact_doc_004",
"type": "retrieval",
"prompt": "readme-md",
"assert": {
"selection_mode_in": [
"exact_document_title"
],
"min_results": 1,
"must_include_one_of_document_ids": [
"8abe1f0d-54e6-41ad-967a-9ce8a0efc6da"
],
"must_include_any_terms": [
"deterministisches",
"faiss",
"vector-service"
]
}
}
{
"id": "retrieval_semantic_001",
"type": "retrieval",
"prompt": "wie funktioniert das system",
"assert": {
"min_results": 1,
"must_include_one_of_document_ids": [
"7513fd82-eec6-4bfa-a730-41820b38b6b4",
"8abe1f0d-54e6-41ad-967a-9ce8a0efc6da"
],
"must_include_any_terms": [
"rag-system",
"dokumente",
"indexierung"
]
}
}
{
"id": "retrieval_semantic_002",
"type": "retrieval",
"prompt": "welche parameter beeinflussen retrieval",
"assert": {
"min_results": 1,
"must_include_one_of_document_ids": [
"25276f4c-32bb-47a5-98b3-9d81aa722d2b",
"7513fd82-eec6-4bfa-a730-41820b38b6b4"
],
"must_include_any_terms": [
"retrievalmaxchunks",
"vectortopk",
"chunk"
]
}
}
{
"id": "retrieval_noise_001",
"type": "retrieval",
"prompt": "dsgfsdgfsdgf",
"assert": {
"max_results": 0
}
}

2
tests/evals/reports/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
*
!.gitignore