From 3d0092b753ed3c4d58bf068c701bc7ceea2136e7 Mon Sep 17 00:00:00 2001 From: team 1 Date: Tue, 12 May 2026 08:25:59 +0200 Subject: [PATCH] p99 --- ...EX_PATCH_99_EVAL_SUITE_EXPANSION_README.md | 157 ++++++++ src/Command/AgentEvalRunCommand.php | 2 +- src/Eval/AgentEvalRunner.php | 12 +- src/Eval/AnswerGuardEvalRunner.php | 32 ++ src/Eval/Dto/EvalCase.php | 69 +++- src/Eval/RetrievalDebugRunner.php | 20 + src/Eval/ShopQueryEvalRunner.php | 362 ++++++++++++++++++ tests/evals/cases/answer_guard.ndjson | 4 + tests/evals/cases/followup.ndjson | 4 + tests/evals/cases/shop_query.ndjson | 5 + 10 files changed, 664 insertions(+), 3 deletions(-) create mode 100644 patch_history/RETRIEX_PATCH_99_EVAL_SUITE_EXPANSION_README.md create mode 100644 src/Eval/AnswerGuardEvalRunner.php create mode 100644 src/Eval/ShopQueryEvalRunner.php create mode 100644 tests/evals/cases/answer_guard.ndjson create mode 100644 tests/evals/cases/followup.ndjson create mode 100644 tests/evals/cases/shop_query.ndjson diff --git a/patch_history/RETRIEX_PATCH_99_EVAL_SUITE_EXPANSION_README.md b/patch_history/RETRIEX_PATCH_99_EVAL_SUITE_EXPANSION_README.md new file mode 100644 index 0000000..c6763aa --- /dev/null +++ b/patch_history/RETRIEX_PATCH_99_EVAL_SUITE_EXPANSION_README.md @@ -0,0 +1,157 @@ +# RetrieX Patch p99 - Eval Suite Expansion + +## Ziel + +p99 erweitert die bisher reine Retrieval-Eval-Baseline um zusätzliche, manuell bekannte Regressionstypen aus v1.6.2: + +- Shopquery-Erzeugung +- Follow-up-Auflösung mit Chatverlauf +- Antwort-/Halluzinations-Guardrails auf Retrieval-Evidenzebene + +Der Patch ändert bewusst keine produktive RAG-, Retrieval-, Shop-, Prompt- oder Antwortlogik. Er ergänzt nur Eval-Infrastruktur und Eval-Cases. + +## Neue Eval-Typen + +### `shop_query` + +Prüft die von `AgentRunner` vorbereitete Shop-Suchquery anhand der Shop-Meta-Ausgabe. Der Runner stoppt, sobald die erste Shop-Such-Meta-Card erzeugt wurde. Dadurch werden die Query-Guards, die Routing-/History-Logik und die finalen Shopquery-Filter geprüft, ohne von der Live-Shopware-Suche abhängig zu sein. + +Beispiel: + +```bash +php bin/console mto:agent:eval:run shop_query +``` + +Cases liegen in: + +```text +tests/evals/cases/shop_query.ndjson +``` + +Abgedeckt werden unter anderem: + +- exakter Indikatorcode `Testomat 808 Indikator 300` +- Brauerei-/Brauwasser-Query-Cleanup +- Schwimmbad-Tippfehlerkorrektur +- LAB-CL-Kürzelerhalt +- SIO2-Geräteanker für Silikatüberwachung + +### `followup` + +Prüft referenzielle Shop-Folgefragen mit vorbereiteten History-Turns. Die History wird pro Eval-Case in einen isolierten temporären Eval-User geschrieben und danach wieder gelöscht. + +Beispiel: + +```bash +php bin/console mto:agent:eval:run followup +``` + +Cases liegen in: + +```text +tests/evals/cases/followup.ndjson +``` + +Abgedeckt werden unter anderem: + +- `0,02 °dH -> Testomat 808 -> Indikatortyp 300 -> was kostet der indikator` +- Wechsel vom Indikatorpreis zurück zum Hauptgerätpreis +- schwache Shop-Folgefrage `suche im shop nach der information` mit THCL-Historyanker +- Produktlink-Follow-up mit Einzelqueries statt kombinierter Multi-Produkt-Query + +### `answer_guard` + +Prüft Antwort-Guardrails vor der finalen LLM-Antwort auf Basis der Retrieval-Evidenz. Das ist absichtlich kein generativer LLM-Antworttest, sondern ein stabiler Pre-Answer-Guard gegen falsche Evidenz oder Halluzinationsrisiken. + +Beispiel: + +```bash +php bin/console mto:agent:eval:run answer_guard +``` + +Cases liegen in: + +```text +tests/evals/cases/answer_guard.ndjson +``` + +Abgedeckt werden unter anderem: + +- Noise-Prompt ohne Evidenz +- Fantasie-Medien wie Drachenblut / Mondwasser +- Lieferbedingungen dürfen nicht auf Sicherheitsdatenblätter kippen + +## Neue Assertion-Felder + +### Für `shop_query` und `followup` + +```json +{ + "expected_query": "testomat 808 300 indikator", + "must_include_terms": ["testomat", "808", "300", "indikator"], + "must_not_include_terms": ["300 s", "301", "302"], + "must_not_equal_query": "information" +} +``` + +Für Multi-Produkt-Follow-ups: + +```json +{ + "expected_individual_queries": [ + "testomat 2000 self clean", + "testomat 2000 cal", + "testomat 808" + ], + "expected_individual_queries_exact": true, + "min_individual_queries": 3, + "max_individual_queries": 3 +} +``` + +### Für `retrieval` und `answer_guard` + +`RetrievalDebugRunner` unterstützt zusätzlich: + +```json +{ + "must_not_include_terms": ["sicherheitsdatenblatt"], + "must_not_match_patterns": ["/forbidden/u"] +} +``` + +## Geänderte Dateien + +```text +src/Command/AgentEvalRunCommand.php +src/Eval/AgentEvalRunner.php +src/Eval/AnswerGuardEvalRunner.php +src/Eval/Dto/EvalCase.php +src/Eval/RetrievalDebugRunner.php +src/Eval/ShopQueryEvalRunner.php +tests/evals/cases/answer_guard.ndjson +tests/evals/cases/followup.ndjson +tests/evals/cases/shop_query.ndjson +patch_history/RETRIEX_PATCH_99_EVAL_SUITE_EXPANSION_README.md +``` + +## Nicht geändert + +- Keine Retrieval-Gewichte geändert. +- Keine Shopquery-Produktivlogik geändert. +- Keine Prompt-Regeln geändert. +- Keine YAML-Vokabularregeln geändert. +- Keine LLM-/Modellparameter geändert. +- Keine Admin-/Frontend-Logik geändert. + +## Empfohlene Validierung nach Einspielen + +```bash +php bin/console mto:agent:config:validate +php bin/console mto:agent:eval:run retrieval +php bin/console mto:agent:eval:run shop_query +php bin/console mto:agent:eval:run followup +php bin/console mto:agent:eval:run answer_guard +``` + +Wichtig: `shop_query` und `followup` laufen über den `AgentRunner` bis zur Shop-Meta-Card. Sie stoppen vor der Live-Shop-Suche, können aber je nach aktiver Konfiguration weiterhin Input-Normalisierung oder Shopquery-Optimierung über das konfigurierte LLM versuchen. Wenn das LLM nicht erreichbar ist, greift die bestehende Fallback-Logik des Agenten. diff --git a/src/Command/AgentEvalRunCommand.php b/src/Command/AgentEvalRunCommand.php index d34a67b..7dffc06 100644 --- a/src/Command/AgentEvalRunCommand.php +++ b/src/Command/AgentEvalRunCommand.php @@ -37,7 +37,7 @@ final class AgentEvalRunCommand extends Command ->addArgument( 'type', InputArgument::OPTIONAL, - 'Eval type to run', + 'Eval type to run (retrieval, shop_query, followup, answer_guard)', 'retrieval' ) ->addOption( diff --git a/src/Eval/AgentEvalRunner.php b/src/Eval/AgentEvalRunner.php index 1358357..f1af348 100644 --- a/src/Eval/AgentEvalRunner.php +++ b/src/Eval/AgentEvalRunner.php @@ -11,6 +11,8 @@ final readonly class AgentEvalRunner { public function __construct( private RetrievalDebugRunner $retrievalDebugRunner, + private ShopQueryEvalRunner $shopQueryEvalRunner, + private AnswerGuardEvalRunner $answerGuardEvalRunner, ) { } @@ -20,6 +22,14 @@ final readonly class AgentEvalRunner return $this->retrievalDebugRunner->run($case); } + if ($case->isShopQueryCase() || $case->isFollowUpCase()) { + return $this->shopQueryEvalRunner->run($case); + } + + if ($case->isAnswerGuardCase()) { + return $this->answerGuardEvalRunner->run($case); + } + throw new \InvalidArgumentException(sprintf( 'Unsupported eval case type: %s', $case->type @@ -40,4 +50,4 @@ final readonly class AgentEvalRunner return $results; } -} \ No newline at end of file +} diff --git a/src/Eval/AnswerGuardEvalRunner.php b/src/Eval/AnswerGuardEvalRunner.php new file mode 100644 index 0000000..0c71901 --- /dev/null +++ b/src/Eval/AnswerGuardEvalRunner.php @@ -0,0 +1,32 @@ +retrievalDebugRunner->run($case); + $details = $result->details; + $details['guard_scope'] = 'retrieval_evidence_pre_answer'; + + return new EvalResult( + caseId: $result->caseId, + type: $case->type, + passed: $result->passed, + durationMs: $result->durationMs, + failures: $result->failures, + details: $details, + ); + } +} diff --git a/src/Eval/Dto/EvalCase.php b/src/Eval/Dto/EvalCase.php index e5ce645..b6c18cd 100644 --- a/src/Eval/Dto/EvalCase.php +++ b/src/Eval/Dto/EvalCase.php @@ -8,12 +8,15 @@ final readonly class EvalCase { /** * @param array $assert + * @param array $history */ public function __construct( public string $id, public string $type, public string $prompt, public array $assert = [], + public array $history = [], + public string $requestContextHint = '', ) { } @@ -26,6 +29,8 @@ final readonly class EvalCase $type = trim((string) ($row['type'] ?? '')); $prompt = trim((string) ($row['prompt'] ?? '')); $assert = is_array($row['assert'] ?? null) ? $row['assert'] : []; + $history = self::normalizeHistory($row['history'] ?? []); + $requestContextHint = trim((string) ($row['request_context_hint'] ?? '')); if ($id === '') { throw new \InvalidArgumentException('Eval case id must not be empty.'); @@ -50,6 +55,8 @@ final readonly class EvalCase type: $type, prompt: $prompt, assert: $assert, + history: $history, + requestContextHint: $requestContextHint, ); } @@ -57,4 +64,64 @@ final readonly class EvalCase { return $this->type === 'retrieval'; } -} \ No newline at end of file + + public function isShopQueryCase(): bool + { + return $this->type === 'shop_query'; + } + + public function isFollowUpCase(): bool + { + return $this->type === 'followup'; + } + + public function isAnswerGuardCase(): bool + { + return $this->type === 'answer_guard'; + } + + /** + * @return array + */ + private static function normalizeHistory(mixed $value): array + { + if (!is_array($value)) { + return []; + } + + $history = []; + + foreach ($value as $entry) { + if (is_string($entry)) { + $entry = trim($entry); + + if ($entry !== '') { + $history[] = [ + 'prompt' => 'Eval-Kontext', + 'answer' => $entry, + ]; + } + + continue; + } + + if (!is_array($entry)) { + continue; + } + + $prompt = trim((string) ($entry['prompt'] ?? '')); + $answer = trim((string) ($entry['answer'] ?? $entry['response'] ?? '')); + + if ($prompt === '' && $answer === '') { + continue; + } + + $history[] = [ + 'prompt' => $prompt !== '' ? $prompt : 'Eval-Kontext', + 'answer' => $answer, + ]; + } + + return $history; + } +} diff --git a/src/Eval/RetrievalDebugRunner.php b/src/Eval/RetrievalDebugRunner.php index c07c28a..63b2129 100644 --- a/src/Eval/RetrievalDebugRunner.php +++ b/src/Eval/RetrievalDebugRunner.php @@ -187,6 +187,25 @@ final readonly class RetrievalDebugRunner } } + $forbiddenTerms = $this->normalizeStringList($assert['must_not_include_terms'] ?? []); + foreach ($forbiddenTerms as $forbiddenTerm) { + if ($this->containsTerm($joinedText, $forbiddenTerm)) { + $failures[] = sprintf( + 'forbidden term "%s" was present in the retrieval text.', + $forbiddenTerm + ); + } + } + + foreach ($this->normalizeStringList($assert['must_not_match_patterns'] ?? []) as $pattern) { + if (@preg_match($pattern, $joinedText) === 1) { + $failures[] = sprintf( + 'forbidden pattern "%s" matched the retrieval text.', + $pattern + ); + } + } + return new EvalResult( caseId: $case->id, type: $case->type, @@ -203,6 +222,7 @@ final readonly class RetrievalDebugRunner 'chunk_ids' => $chunkIds, 'matched_any_terms' => $matchedAnyTerms, 'matched_all_terms' => $matchedAllTerms, + 'forbidden_terms_checked' => $this->normalizeStringList($assert['must_not_include_terms'] ?? []), ], ); } diff --git a/src/Eval/ShopQueryEvalRunner.php b/src/Eval/ShopQueryEvalRunner.php new file mode 100644 index 0000000..4b137cd --- /dev/null +++ b/src/Eval/ShopQueryEvalRunner.php @@ -0,0 +1,362 @@ +buildUserId($case); + $transcript = ''; + $shopMeta = null; + + $this->contextService->deleteHistory($userId); + $this->seedHistory($userId, $case->history); + + try { + foreach ($this->agentRunner->run($case->prompt, $userId, false, $case->requestContextHint) as $chunk) { + if (!is_string($chunk) || $chunk === '') { + continue; + } + + $transcript .= $chunk . "\n"; + + if (!str_contains($chunk, 'retriex-shop-meta')) { + if (mb_strlen($transcript, 'UTF-8') > 120000) { + $transcript = mb_substr($transcript, -120000, null, 'UTF-8'); + } + continue; + } + + $shopMeta = $this->extractShopMeta($chunk); + break; + } + } catch (\Throwable $e) { + $failures[] = sprintf('agent run failed before shop-query meta was emitted: %s', $e->getMessage()); + } finally { + $this->contextService->deleteHistory($userId); + } + + $durationMs = round((microtime(true) - $start) * 1000, 2); + + if ($shopMeta === null) { + $failures[] = 'no shop-query meta message was emitted before the runner stopped.'; + $shopMeta = [ + 'query' => '', + 'individual_queries' => [], + 'raw_html' => '', + ]; + } + + $this->assertShopQuery($failures, $case, $shopMeta); + + return new EvalResult( + caseId: $case->id, + type: $case->type, + passed: $failures === [], + durationMs: $durationMs, + failures: $failures, + details: [ + 'prompt' => $case->prompt, + 'history_turns' => count($case->history), + 'has_request_context_hint' => $case->requestContextHint !== '', + 'query' => $shopMeta['query'], + 'individual_queries' => $shopMeta['individual_queries'], + 'transcript_preview' => $this->previewText($transcript), + ], + ); + } + + private function buildUserId(EvalCase $case): string + { + $safeId = preg_replace('/[^a-zA-Z0-9_-]+/', '_', $case->id) ?? $case->id; + $safeId = trim($safeId, '_'); + + return 'eval_' . ($safeId !== '' ? $safeId : sha1($case->id)); + } + + /** + * @param array $history + */ + private function seedHistory(string $userId, array $history): void + { + foreach ($history as $turn) { + $prompt = trim($turn['prompt'] ?? ''); + $answer = trim($turn['answer'] ?? ''); + + if ($prompt === '' && $answer === '') { + continue; + } + + if ($prompt === '') { + $prompt = 'Eval-Kontext'; + } + + $this->contextService->appendHistory($userId, $prompt, $answer); + } + } + + /** + * @return array{query:string,individual_queries:array,raw_html:string} + */ + private function extractShopMeta(string $html): array + { + $isMultiQuery = str_contains($html, 'retriex-meta-query--multi'); + $codes = []; + + if (preg_match_all('/(.*?)<\/code>/su', $html, $matches) !== false) { + foreach ($matches[1] ?? [] as $value) { + $decoded = html_entity_decode(strip_tags((string) $value), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); + $decoded = $this->normalizeOneLine($decoded); + + if ($decoded !== '') { + $codes[] = $decoded; + } + } + } + + $codes = array_values(array_unique($codes)); + + if ($isMultiQuery) { + return [ + 'query' => '', + 'individual_queries' => $codes, + 'raw_html' => $html, + ]; + } + + return [ + 'query' => $codes[0] ?? '', + 'individual_queries' => [], + 'raw_html' => $html, + ]; + } + + /** + * @param array $failures + * @param array{query:string,individual_queries:array,raw_html:string} $shopMeta + */ + private function assertShopQuery(array &$failures, EvalCase $case, array $shopMeta): void + { + $assert = $case->assert; + $query = $shopMeta['query']; + $individualQueries = $shopMeta['individual_queries']; + $joined = trim($query . ' ' . implode(' ', $individualQueries)); + + $expectedQuery = $this->stringOrNull($assert['expected_query'] ?? null); + if ($expectedQuery !== null && $this->normalizeQuery($query) !== $this->normalizeQuery($expectedQuery)) { + $failures[] = sprintf( + 'shop query mismatch: expected "%s", got "%s".', + $expectedQuery, + $query + ); + } + + $forbiddenExactQuery = $this->stringOrNull($assert['must_not_equal_query'] ?? null); + if ($forbiddenExactQuery !== null && $this->normalizeQuery($query) === $this->normalizeQuery($forbiddenExactQuery)) { + $failures[] = sprintf('shop query must not equal "%s".', $forbiddenExactQuery); + } + + $expectedIndividualQueries = $this->normalizeStringList($assert['expected_individual_queries'] ?? []); + if ($expectedIndividualQueries !== []) { + foreach ($expectedIndividualQueries as $expectedIndividualQuery) { + if (!$this->containsNormalizedQuery($individualQueries, $expectedIndividualQuery)) { + $failures[] = sprintf( + 'missing expected individual shop query "%s". Got [%s].', + $expectedIndividualQuery, + implode(', ', $individualQueries) + ); + } + } + } + + if (($assert['expected_individual_queries_exact'] ?? false) === true) { + $expected = array_map(fn(string $value): string => $this->normalizeQuery($value), $expectedIndividualQueries); + $actual = array_map(fn(string $value): string => $this->normalizeQuery($value), $individualQueries); + + sort($expected); + sort($actual); + + if ($expected !== $actual) { + $failures[] = sprintf( + 'individual shop queries differ from expected exact set. Expected [%s], got [%s].', + implode(', ', $expectedIndividualQueries), + implode(', ', $individualQueries) + ); + } + } + + if (isset($assert['min_individual_queries']) && count($individualQueries) < (int) $assert['min_individual_queries']) { + $failures[] = sprintf( + 'too few individual shop queries: expected >= %d, got %d.', + (int) $assert['min_individual_queries'], + count($individualQueries) + ); + } + + if (isset($assert['max_individual_queries']) && count($individualQueries) > (int) $assert['max_individual_queries']) { + $failures[] = sprintf( + 'too many individual shop queries: expected <= %d, got %d.', + (int) $assert['max_individual_queries'], + count($individualQueries) + ); + } + + foreach ($this->normalizeStringList($assert['must_include_terms'] ?? []) as $term) { + if (!$this->containsTerm($joined, $term)) { + $failures[] = sprintf('shop query output does not contain required term "%s".', $term); + } + } + + $requiredAnyTerms = $this->normalizeStringList($assert['must_include_any_terms'] ?? []); + if ($requiredAnyTerms !== []) { + $matched = false; + foreach ($requiredAnyTerms as $term) { + if ($this->containsTerm($joined, $term)) { + $matched = true; + break; + } + } + + if (!$matched) { + $failures[] = sprintf( + 'shop query output contains none of the required any-terms: [%s].', + implode(', ', $requiredAnyTerms) + ); + } + } + + foreach ($this->normalizeStringList($assert['must_not_include_terms'] ?? []) as $term) { + if ($this->containsTerm($joined, $term)) { + $failures[] = sprintf('shop query output contains forbidden term "%s".', $term); + } + } + + foreach ($this->normalizeStringList($assert['query_must_match_patterns'] ?? []) as $pattern) { + if (@preg_match($pattern, $joined) !== 1) { + $failures[] = sprintf('shop query output does not match required pattern "%s".', $pattern); + } + } + + foreach ($this->normalizeStringList($assert['query_must_not_match_patterns'] ?? []) as $pattern) { + if (@preg_match($pattern, $joined) === 1) { + $failures[] = sprintf('shop query output matches forbidden pattern "%s".', $pattern); + } + } + } + + /** + * @param array $queries + */ + private function containsNormalizedQuery(array $queries, string $needle): bool + { + $needle = $this->normalizeQuery($needle); + + foreach ($queries as $query) { + if ($this->normalizeQuery($query) === $needle) { + return true; + } + } + + return false; + } + + private function containsTerm(string $haystack, string $term): bool + { + $haystack = $this->normalizeText($haystack); + $term = $this->normalizeText($term); + + return $term !== '' && str_contains($haystack, $term); + } + + private function normalizeQuery(string $value): string + { + $value = $this->normalizeText($value); + $value = preg_replace('/[^\p{L}\p{N}]+/u', ' ', $value) ?? $value; + $value = preg_replace('/\s+/u', ' ', $value) ?? $value; + + return trim($value); + } + + private function normalizeText(string $value): string + { + $value = html_entity_decode(strip_tags($value), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); + $value = mb_strtolower(trim($value), 'UTF-8'); + $value = preg_replace('/\s+/u', ' ', $value) ?? $value; + + return trim($value); + } + + private function normalizeOneLine(string $value): string + { + $value = trim($value); + $value = preg_replace('/\s+/u', ' ', $value) ?? $value; + + return trim($value); + } + + private function stringOrNull(mixed $value): ?string + { + if (!is_string($value)) { + return null; + } + + $value = trim($value); + + return $value !== '' ? $value : null; + } + + /** + * @return array + */ + private function normalizeStringList(mixed $value): array + { + if (!is_array($value)) { + return []; + } + + $out = []; + + foreach ($value as $item) { + if (!is_string($item)) { + continue; + } + + $item = trim($item); + + if ($item === '') { + continue; + } + + $out[] = $item; + } + + return array_values(array_unique($out)); + } + + private function previewText(string $value): string + { + $value = $this->normalizeOneLine($value); + + if (mb_strlen($value, 'UTF-8') <= 1200) { + return $value; + } + + return rtrim(mb_substr($value, 0, 1200, 'UTF-8')) . '...'; + } +} diff --git a/tests/evals/cases/answer_guard.ndjson b/tests/evals/cases/answer_guard.ndjson new file mode 100644 index 0000000..4d78dae --- /dev/null +++ b/tests/evals/cases/answer_guard.ndjson @@ -0,0 +1,4 @@ +{"id":"answer_guard_noise_no_evidence_001","type":"answer_guard","prompt":"dsgfsdgfsdgf","assert":{"max_results":0}} +{"id":"answer_guard_mythical_medium_no_direct_evidence_001","type":"answer_guard","prompt":"gibt es einen testomat für drachenblut","assert":{"must_not_include_terms":["drachenblut"]}} +{"id":"answer_guard_lunar_water_no_direct_evidence_001","type":"answer_guard","prompt":"welcher testomat misst mondwasser im vakuum","assert":{"must_not_include_terms":["mondwasser","vakuum"]}} +{"id":"answer_guard_delivery_not_sdb_001","type":"answer_guard","prompt":"lieferbedingungen versand testomat","assert":{"min_results":1,"must_include_one_of_document_ids":["26ddf03d-9108-4a65-aa0e-a5df7613fa77"],"must_not_include_document_ids":["7166592f-85f2-425c-997b-73e323ae184d"],"must_not_include_terms":["sicherheitsdatenblatt"]}} diff --git a/tests/evals/cases/followup.ndjson b/tests/evals/cases/followup.ndjson new file mode 100644 index 0000000..1c9aaf1 --- /dev/null +++ b/tests/evals/cases/followup.ndjson @@ -0,0 +1,4 @@ +{"id":"followup_indicator_price_001","type":"followup","prompt":"was kostet der indikator","history":[{"prompt":"Was ist der niedrigste Grenzwert für die Wasserhärte, welcher mit einem Testomaten überwacht werden kann?","answer":"Der niedrigste Grenzwert für die Wasserhärte beträgt 0,02 °dH. Dieser Wert wird vom Testomat 808 gemessen."},{"prompt":"mit welchem indikator","answer":"Der niedrigste messbare Grenzwert für Wasserhärte mit dem Testomat 808 wird mit dem Indikatortyp 300 erreicht."}],"assert":{"expected_query":"testomat 808 300 indikator","must_include_terms":["testomat","808","300","indikator"],"must_not_include_terms":["300 s","301","302","303","testomat 2000"]}} +{"id":"followup_main_device_price_001","type":"followup","prompt":"und was kostet das gerät selber","history":[{"prompt":"was kostet der indikator","answer":"Shop-Suche abgeschlossen. Gesendete Suchquery: testomat 808 300 indikator. Testomat® 808 Indikator 300 500 ml, Produkt-Nummer 141001. Testomat® 808 Indikator 300 2 x 100 ml, Produkt-Nummer 140001. Der zugehörige Testomat ist Testomat 808."}],"assert":{"expected_query":"testomat 808","must_include_terms":["testomat","808"],"must_not_include_terms":["indikator","300","141001","140001"]}} +{"id":"followup_weak_shop_information_anchor_001","type":"followup","prompt":"suche im shop nach der information","history":[{"prompt":"welche grenzwerte kann der testomat 2000 thcl messen","answer":"Der relevante Produktanker ist Testomat 2000 THCL. Das Gerät ist für Chlorüberwachung / freies Chlor relevant."}],"assert":{"expected_query":"testomat 2000 thcl","must_include_terms":["testomat","2000","thcl"],"must_not_equal_query":"information","must_not_include_terms":["information"]}} +{"id":"followup_product_links_split_001","type":"followup","prompt":"gebe mir links zu den produkten aus dem shop","history":[{"prompt":"gerät zur messung Prozesswasser in medizinischen Geräten","answer":"Geeignete Produktanker sind Testomat 2000 Self Clean, Testomat 2000 CAL und Testomat 808."}],"assert":{"expected_individual_queries":["testomat 2000 self clean","testomat 2000 cal","testomat 808"],"expected_individual_queries_exact":true,"min_individual_queries":3,"max_individual_queries":3,"must_not_include_terms":["links zu aus"]}} diff --git a/tests/evals/cases/shop_query.ndjson b/tests/evals/cases/shop_query.ndjson new file mode 100644 index 0000000..dcb6bbe --- /dev/null +++ b/tests/evals/cases/shop_query.ndjson @@ -0,0 +1,5 @@ +{"id":"shop_query_indicator_exact_001","type":"shop_query","prompt":"was kostet der Testomat 808 Indikator 300","assert":{"must_include_terms":["testomat","808","300","indikator"],"must_not_include_terms":["300 s","301","302","303","gerät selber"]}} +{"id":"shop_query_brewing_water_cleanup_001","type":"shop_query","prompt":"ich möchte für brauerei das brauwasser messen","assert":{"expected_query":"brauerei brauwasser","must_include_terms":["brauerei","brauwasser"],"must_not_include_terms":["möchte","messen","think"]}} +{"id":"shop_query_swimming_pool_typo_001","type":"shop_query","prompt":"ich würde gern chlor im schwinnbad messen","assert":{"expected_query":"chlor schwimmbad","must_include_terms":["chlor","schwimmbad"],"must_not_include_terms":["schwinnbad","messen"]}} +{"id":"shop_query_lab_cl_acronym_001","type":"shop_query","prompt":"Zeige mir die Preise zu Testomat LAB CL.","assert":{"expected_query":"testomat lab cl","must_include_terms":["testomat","lab","cl"],"must_not_equal_query":"testomat"}} +{"id":"shop_query_sio2_anchor_001","type":"shop_query","prompt":"suche gerät kühlsysteme Silikatüberwachung","assert":{"expected_query":"testomat 808 sio2","must_include_terms":["testomat","808","sio2"],"must_not_include_terms":["kühlsysteme","silikatüberwachung"]}}