diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index 411d66b..501731c 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -357,6 +357,25 @@ final class NdjsonHybridRetrieverConfig { return $this->stringList('looks_like_device_words', self::LOOKS_LIKE_DEVICE_WORDS); } + /** + * Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps. + * + * @return array<string, list<string>> + */ + public function vocabularyToArray(): array + { + return [ + 'generic_product_tokens' => $this->genericProductTokens(), + 'important_short_model_tokens' => $this->importantShortModelTokens(), + 'family_descriptor_tokens' => $this->familyDescriptorTokens(), + 'looks_like_reagent_tokens' => $this->looksLikeReagentTokens(), + 'looks_like_safety_docs' => $this->looksLikeSafetyDocs(), + 'looks_like_reagent_words' => $this->looksLikeReagentWords(), + 'looks_like_document_words' => $this->looksLikeDocumentWords(), + 'looks_like_safety_words' => $this->looksLikeSafetyWords(), + 'looks_like_device_words' => $this->looksLikeDeviceWords(), + ]; + } /** * @return array diff --git a/tests/evals/cases/retrieval.ndjson b/tests/evals/cases/retrieval.ndjson index fbbd804..1dbf7bd 100644 --- a/tests/evals/cases/retrieval.ndjson +++ b/tests/evals/cases/retrieval.ndjson @@ -1,329 +1,19 @@ -{ - "id": "retrieval_exact_doc_001", - "type": "retrieval", - "prompt": "Testomat 808", - "assert": { - "selection_mode_in": [ - "exact_document_title" - ], - "min_results": 1, - "must_include_one_of_document_ids": [ - "26129c01-c09f-4c71-9c80-7ddffb6c77fb" - ], - "must_include_any_terms": [ - "testomat 808", - "0,02 °dh", - "indikator" - ] - } -} -{ - "id": "retrieval_exact_doc_002", - "type": "retrieval", - "prompt": "Testomat EVO CALC", - "assert": { - "selection_mode_in": [ - "exact_document_title" - ], - "min_results": 1, - "must_include_one_of_document_ids": [ - 
"74fdad85-5e4e-4f08-8d95-402f3180ed55" - ], - "must_include_any_terms": [ - "evo calc", - "kalibrier", - "wasserhärte" - ] - } -} -{ - "id": "retrieval_exact_doc_003", - "type": "retrieval", - "prompt": "Testomat ECO PLUS", - "assert": { - "selection_mode_in": [ - "exact_document_title" - ], - "min_results": 1, - "must_include_one_of_document_ids": [ - "bace47f9-647e-4d47-95d9-118e553c6e5a" - ], - "must_include_any_terms": [ - "eco-plus", - "intervall", - "liter" - ] - } -} -{ - "id": "retrieval_exact_doc_004", - "type": "retrieval", - "prompt": "Testomat ECO", - "assert": { - "selection_mode_in": [ - "exact_document_title" - ], - "min_results": 1, - "must_include_one_of_document_ids": [ - "3d6c2add-c643-4e96-a3e7-5eb949c41303" - ], - "must_include_any_terms": [ - "testomat eco", - "intervall", - "spülzeit" - ] - } -} -{ - "id": "retrieval_exact_doc_005", - "type": "retrieval", - "prompt": "Testomat EVO TH", - "assert": { - "selection_mode_in": [ - "exact_document_title" - ], - "min_results": 1, - "must_include_one_of_document_ids": [ - "eb91c1be-4546-4ed5-8b01-f075519d675b" - ], - "must_include_any_terms": [ - "evo th", - "online-analysenautomat", - "digitale eingänge" - ] - } -} -{ - "id": "retrieval_exact_doc_006", - "type": "retrieval", - "prompt": "Wasserhärte Grenzwert Testomat", - "assert": { - "selection_mode_in": [ - "exact_document_title" - ], - "min_results": 1, - "must_include_one_of_document_ids": [ - "60706498-867b-41b8-8e76-63248178d265" - ], - "must_include_any_terms": [ - "grenzwert", - "0,02 °dh", - "0,05 °dh" - ] - } -} - -{ - "id": "retrieval_semantic_001", - "type": "retrieval", - "prompt": "welche testomat geraete gibt es", - "assert": { - "min_results": 1, - "must_include_one_of_document_ids": [ - "cf6a1ff2-8afe-4ebe-951b-805d7324d0a3", - "26129c01-c09f-4c71-9c80-7ddffb6c77fb", - "74fdad85-5e4e-4f08-8d95-402f3180ed55" - ], - "must_include_any_terms": [ - "testomatwelt", - "geräteübersicht", - "wasserhärte" - ] - } -} -{ - "id": 
"retrieval_semantic_002", - "type": "retrieval", - "prompt": "welches testomat modell ist fuer hohe wasserhaerte geeignet", - "assert": { - "min_results": 1, - "must_include_one_of_document_ids": [ - "eb91c1be-4546-4ed5-8b01-f075519d675b", - "74fdad85-5e4e-4f08-8d95-402f3180ed55", - "26129c01-c09f-4c71-9c80-7ddffb6c77fb", - "60706498-867b-41b8-8e76-63248178d265" - ], - "must_include_any_terms": [ - "wasserhärte", - "grenzwert", - "testomat" - ] - } -} -{ - "id": "retrieval_semantic_003", - "type": "retrieval", - "prompt": "welche testomat indikatoren gibt es", - "assert": { - "min_results": 1, - "must_include_one_of_document_ids": [ - "8db60a9f-3549-4567-b914-5e3d0d9ef715", - "f0422ac8-3d60-4b6c-ab97-8eba652d9eb3", - "5ced4bcb-aa9d-4032-9eee-37a33f744476", - "a9fedf75-bccc-4100-ac59-b6f4eef01e61", - "d11948da-4e77-48e3-bab2-d32f622343de" - ], - "must_include_any_terms": [ - "indikator", - "th 2250", - "th 2005", - "tc 2050", - "tc 2100" - ] - } -} -{ - "id": "retrieval_semantic_004", - "type": "retrieval", - "prompt": "welcher testomat ist ein verschneideregler", - "assert": { - "min_results": 1, - "must_include_one_of_document_ids": [ - "7fe9342f-2ca4-41ce-bdea-410b516ef6b4" - ], - "must_include_any_terms": [ - "verschneideregler", - "motorventil", - "0/4–20 ma" - ] - } -} -{ - "id": "retrieval_semantic_005", - "type": "retrieval", - "prompt": "welcher testomat hat automatische reinigung", - "assert": { - "min_results": 1, - "must_include_one_of_document_ids": [ - "b8c3343b-931e-4994-9d53-a2130efc846f", - "51589532-a1a1-46e0-94b2-a139dce78543" - ], - "must_include_any_terms": [ - "self clean", - "reinigung", - "messkammer" - ] - } -} -{ - "id": "retrieval_semantic_006", - "type": "retrieval", - "prompt": "welches geraet ist fuer chlorueberwachung gedacht", - "assert": { - "min_results": 1, - "must_include_one_of_document_ids": [ - "1d467913-a2d9-42e6-8510-83a65aba9403" - ], - "must_include_any_terms": [ - "thcl", - "chlor", - "online-analysegerät" - ] - } -} -{ - 
"id": "retrieval_semantic_007", - "type": "retrieval", - "prompt": "gibt es ein kompaktes kosteneffizientes haerteueberwachungsgeraet", - "assert": { - "min_results": 1, - "must_include_one_of_document_ids": [ - "afcf1cd6-9b02-4828-b11f-339096a3c864", - "3d6c2add-c643-4e96-a3e7-5eb949c41303" - ], - "must_include_any_terms": [ - "eco c", - "kosteneffizient", - "härteüberwachung" - ] - } -} - -{ - "id": "retrieval_negative_001", - "type": "retrieval", - "prompt": "lieferbedingungen versand testomat", - "assert": { - "min_results": 1, - "must_include_one_of_document_ids": [ - "26ddf03d-9108-4a65-aa0e-a5df7613fa77" - ], - "must_not_include_document_ids": [ - "7166592f-85f2-425c-997b-73e323ae184d" - ] - } -} -{ - "id": "retrieval_negative_002", - "type": "retrieval", - "prompt": "testomat 2000 th 2005 sicherheitsdatenblatt", - "assert": { - "min_results": 1, - "must_include_one_of_document_ids": [ - "f0422ac8-3d60-4b6c-ab97-8eba652d9eb3", - "e3d05954-cde3-40bc-baf6-aa9a350a8aa2" - ], - "must_not_include_document_ids": [ - "26129c01-c09f-4c71-9c80-7ddffb6c77fb", - "74fdad85-5e4e-4f08-8d95-402f3180ed55" - ] - } -} -{ - "id": "retrieval_negative_003", - "type": "retrieval", - "prompt": "testomat 2000 self clean reinigungsloesung", - "assert": { - "min_results": 1, - "must_include_one_of_document_ids": [ - "51589532-a1a1-46e0-94b2-a139dce78543", - "b8c3343b-931e-4994-9d53-a2130efc846f" - ], - "must_include_any_terms": [ - "reinigungslösung", - "self clean" - ], - "must_not_include_document_ids": [ - "26129c01-c09f-4c71-9c80-7ddffb6c77fb" - ] - } -} - -{ - "id": "retrieval_short_001", - "type": "retrieval", - "prompt": "evo th", - "assert": { - "min_results": 1, - "must_include_one_of_document_ids": [ - "eb91c1be-4546-4ed5-8b01-f075519d675b", - "74fdad85-5e4e-4f08-8d95-402f3180ed55" - ], - "must_include_any_terms": [ - "evo" - ] - } -} -{ - "id": "retrieval_short_002", - "type": "retrieval", - "prompt": "808", - "assert": { - "min_results": 1, - 
"must_include_one_of_document_ids": [ - "26129c01-c09f-4c71-9c80-7ddffb6c77fb" - ], - "must_include_any_terms": [ - "808" - ] - } -} - -{ - "id": "retrieval_noise_001", - "type": "retrieval", - "prompt": "dsgfsdgfsdgf", - "assert": { - "max_results": 0 - } -} \ No newline at end of file +{"id":"retrieval_exact_doc_001","type":"retrieval","prompt":"Testomat 808","assert":{"selection_mode_in":["exact_document_title"],"min_results":1,"must_include_one_of_document_ids":["26129c01-c09f-4c71-9c80-7ddffb6c77fb"],"must_include_any_terms":["testomat 808","0,02 °dh","indikator"]}} +{"id":"retrieval_exact_doc_002","type":"retrieval","prompt":"Testomat EVO CALC","assert":{"selection_mode_in":["exact_document_title"],"min_results":1,"must_include_one_of_document_ids":["74fdad85-5e4e-4f08-8d95-402f3180ed55"],"must_include_any_terms":["evo calc","kalibrier","wasserhärte"]}} +{"id":"retrieval_exact_doc_003","type":"retrieval","prompt":"Testomat ECO PLUS","assert":{"selection_mode_in":["exact_document_title"],"min_results":1,"must_include_one_of_document_ids":["bace47f9-647e-4d47-95d9-118e553c6e5a"],"must_include_any_terms":["eco-plus","intervall","liter"]}} +{"id":"retrieval_exact_doc_004","type":"retrieval","prompt":"Testomat ECO","assert":{"selection_mode_in":["exact_document_title"],"min_results":1,"must_include_one_of_document_ids":["3d6c2add-c643-4e96-a3e7-5eb949c41303"],"must_include_any_terms":["testomat eco","intervall","spülzeit"]}} +{"id":"retrieval_exact_doc_005","type":"retrieval","prompt":"Testomat EVO TH","assert":{"selection_mode_in":["exact_document_title"],"min_results":1,"must_include_one_of_document_ids":["eb91c1be-4546-4ed5-8b01-f075519d675b"],"must_include_any_terms":["evo th","online-analysenautomat","digitale eingänge"]}} +{"id":"retrieval_exact_doc_006","type":"retrieval","prompt":"Wasserhärte Grenzwert 
Testomat","assert":{"selection_mode_in":["exact_document_title"],"min_results":1,"must_include_one_of_document_ids":["60706498-867b-41b8-8e76-63248178d265"],"must_include_any_terms":["grenzwert","0,02 °dh","0,05 °dh"]}} +{"id":"retrieval_semantic_001","type":"retrieval","prompt":"welche testomat geraete gibt es","assert":{"min_results":1,"must_include_one_of_document_ids":["cf6a1ff2-8afe-4ebe-951b-805d7324d0a3","26129c01-c09f-4c71-9c80-7ddffb6c77fb","74fdad85-5e4e-4f08-8d95-402f3180ed55"],"must_include_any_terms":["testomatwelt","geräteübersicht","wasserhärte"]}} +{"id":"retrieval_semantic_002","type":"retrieval","prompt":"welches testomat modell ist fuer hohe wasserhaerte geeignet","assert":{"min_results":1,"must_include_one_of_document_ids":["eb91c1be-4546-4ed5-8b01-f075519d675b","74fdad85-5e4e-4f08-8d95-402f3180ed55","26129c01-c09f-4c71-9c80-7ddffb6c77fb","60706498-867b-41b8-8e76-63248178d265"],"must_include_any_terms":["wasserhärte","grenzwert","testomat"]}} +{"id":"retrieval_semantic_003","type":"retrieval","prompt":"welche testomat indikatoren gibt es","assert":{"min_results":1,"must_include_one_of_document_ids":["8db60a9f-3549-4567-b914-5e3d0d9ef715","f0422ac8-3d60-4b6c-ab97-8eba652d9eb3","5ced4bcb-aa9d-4032-9eee-37a33f744476","a9fedf75-bccc-4100-ac59-b6f4eef01e61","d11948da-4e77-48e3-bab2-d32f622343de"],"must_include_any_terms":["indikator","th 2250","th 2005","tc 2050","tc 2100"]}} +{"id":"retrieval_semantic_004","type":"retrieval","prompt":"welcher testomat ist ein verschneideregler","assert":{"min_results":1,"must_include_one_of_document_ids":["7fe9342f-2ca4-41ce-bdea-410b516ef6b4"],"must_include_any_terms":["verschneideregler","motorventil","0/4–20 ma"]}} +{"id":"retrieval_semantic_005","type":"retrieval","prompt":"welcher testomat hat automatische reinigung","assert":{"min_results":1,"must_include_one_of_document_ids":["b8c3343b-931e-4994-9d53-a2130efc846f","51589532-a1a1-46e0-94b2-a139dce78543"],"must_include_any_terms":["self 
clean","reinigung","messkammer"]}} +{"id":"retrieval_semantic_006","type":"retrieval","prompt":"welches geraet ist fuer chlorueberwachung gedacht","assert":{"min_results":1,"must_include_one_of_document_ids":["1d467913-a2d9-42e6-8510-83a65aba9403"],"must_include_any_terms":["thcl","chlor","online-analysegerät"]}} +{"id":"retrieval_semantic_007","type":"retrieval","prompt":"gibt es ein kompaktes kosteneffizientes haerteueberwachungsgeraet","assert":{"min_results":1,"must_include_one_of_document_ids":["afcf1cd6-9b02-4828-b11f-339096a3c864","3d6c2add-c643-4e96-a3e7-5eb949c41303"],"must_include_any_terms":["eco c","kosteneffizient","härteüberwachung"]}} +{"id":"retrieval_negative_001","type":"retrieval","prompt":"lieferbedingungen versand testomat","assert":{"min_results":1,"must_include_one_of_document_ids":["26ddf03d-9108-4a65-aa0e-a5df7613fa77"],"must_not_include_document_ids":["7166592f-85f2-425c-997b-73e323ae184d"]}} +{"id":"retrieval_negative_002","type":"retrieval","prompt":"testomat 2000 th 2005 sicherheitsdatenblatt","assert":{"min_results":1,"must_include_one_of_document_ids":["f0422ac8-3d60-4b6c-ab97-8eba652d9eb3","e3d05954-cde3-40bc-baf6-aa9a350a8aa2"],"must_not_include_document_ids":["26129c01-c09f-4c71-9c80-7ddffb6c77fb","74fdad85-5e4e-4f08-8d95-402f3180ed55"]}} +{"id":"retrieval_negative_003","type":"retrieval","prompt":"testomat 2000 self clean reinigungsloesung","assert":{"min_results":1,"must_include_one_of_document_ids":["51589532-a1a1-46e0-94b2-a139dce78543","b8c3343b-931e-4994-9d53-a2130efc846f"],"must_include_any_terms":["reinigungslösung","self clean"],"must_not_include_document_ids":["26129c01-c09f-4c71-9c80-7ddffb6c77fb"]}} +{"id":"retrieval_short_001","type":"retrieval","prompt":"evo th","assert":{"min_results":1,"must_include_one_of_document_ids":["eb91c1be-4546-4ed5-8b01-f075519d675b","74fdad85-5e4e-4f08-8d95-402f3180ed55"],"must_include_any_terms":["evo"]}} 
+{"id":"retrieval_short_002","type":"retrieval","prompt":"808","assert":{"min_results":1,"must_include_one_of_document_ids":["26129c01-c09f-4c71-9c80-7ddffb6c77fb"],"must_include_any_terms":["808"]}} +{"id":"retrieval_noise_001","type":"retrieval","prompt":"dsgfsdgfsdgf","assert":{"max_results":0}}