Files
MtoRagSystem/src/Config/NdjsonHybridRetrieverConfig.php
2026-05-07 07:52:52 +02:00

644 lines
20 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Config;
use InvalidArgumentException;
/**
 * Typed, validating accessor over the hybrid-retriever configuration array.
 *
 * Every getter reads one key from the injected config array, validates it and
 * throws an InvalidArgumentException when the key is missing or malformed.
 * Some string lists can be overridden by a genre configuration, and the
 * vocabulary lists fall back to a vocabulary view when the config array does
 * not define them directly.
 */
final class NdjsonHybridRetrieverConfig
{
    /**
     * @param array<string, mixed> $config
     */
    public function __construct(
        private array $config = [],
        private ?DomainVocabularyConfig $vocabulary = null,
        private ?GenreConfig $genreConfig = null,
    ) {
    }

    /** Required integer `hard_max_chunks` (>= 1). */
    public function hardMaxChunks(): int
    {
        return $this->requiredInt('hard_max_chunks', 1);
    }

    /** Required integer `hard_max_vectork` (>= 1). */
    public function hardMaxVectorK(): int
    {
        return $this->requiredInt('hard_max_vectork', 1);
    }

    /** Required integer `hard_max_keywordk` (>= 1). */
    public function hardMaxKeywordK(): int
    {
        return $this->requiredInt('hard_max_keywordk', 1);
    }

    /** Required float `vector_score_threshold` in [0.0, 1.0]. */
    public function vectorScoreThreshold(): float
    {
        return $this->requiredFloat('vector_score_threshold', 0.0, 1.0);
    }

    /** Required float `threshold_floor` in [0.0, 1.0]. */
    public function thresholdFloor(): float
    {
        return $this->requiredFloat('threshold_floor', 0.0, 1.0);
    }

    /** Required float `threshold_ceil` in [0.0, 1.0]. */
    public function thresholdCeil(): float
    {
        return $this->requiredFloat('threshold_ceil', 0.0, 1.0);
    }

    /** Required float `list_bonus` (>= 1.0). */
    public function listBonus(): float
    {
        return $this->requiredFloat('list_bonus', 1.0);
    }

    /** Required integer `rrf_k` (>= 1). */
    public function rrfK(): int
    {
        return $this->requiredInt('rrf_k', 1);
    }

    /** Required float `keyword_topk_multiplier` (>= 0.1). */
    public function keywordTopKMultiplier(): float
    {
        return $this->requiredFloat('keyword_topk_multiplier', 0.1);
    }

    /** Required float `keyword_score_threshold` in [0.0, 1.0]. */
    public function keywordScoreThreshold(): float
    {
        return $this->requiredFloat('keyword_score_threshold', 0.0, 1.0);
    }

    /** Required float `keyword_rrf_weight` (>= 0.0). */
    public function keywordRrfWeight(): float
    {
        return $this->requiredFloat('keyword_rrf_weight', 0.0);
    }

    /** Required float `scoped_vector_rrf_weight` (>= 0.0). */
    public function scopedVectorRrfWeight(): float
    {
        return $this->requiredFloat('scoped_vector_rrf_weight', 0.0);
    }

    /** Required float `scoped_keyword_rrf_weight` (>= 0.0). */
    public function scopedKeywordRrfWeight(): float
    {
        return $this->requiredFloat('scoped_keyword_rrf_weight', 0.0);
    }

    /** Required integer `empty_rrf_fallback_topn` (>= 1). */
    public function emptyRrfFallbackTopN(): int
    {
        return $this->requiredInt('empty_rrf_fallback_topn', 1);
    }

    /** Required integer `max_chunks_per_doc` (>= 1). */
    public function maxChunksPerDoc(): int
    {
        return $this->requiredInt('max_chunks_per_doc', 1);
    }

    /** Required integer `min_chunk_distance` (>= 0). */
    public function minChunkDistance(): int
    {
        return $this->requiredInt('min_chunk_distance', 0);
    }

    /** Required integer `dominant_doc_window` (>= 1). */
    public function dominantDocWindow(): int
    {
        return $this->requiredInt('dominant_doc_window', 1);
    }

    /** Required integer `dominant_doc_min_hits` (>= 1). */
    public function dominantDocMinHits(): int
    {
        return $this->requiredInt('dominant_doc_min_hits', 1);
    }

    /** Required integer `dominant_doc_max_chunks` (>= 1). */
    public function dominantDocMaxChunks(): int
    {
        return $this->requiredInt('dominant_doc_max_chunks', 1);
    }

    /** Required integer `exact_document_max_chunks` (>= 1). */
    public function exactDocumentMaxChunks(): int
    {
        return $this->requiredInt('exact_document_max_chunks', 1);
    }

    /** Required integer `focused_product_window` (>= 1). */
    public function focusedProductWindow(): int
    {
        return $this->requiredInt('focused_product_window', 1);
    }

    /** Required float `focused_product_min_score` (>= 0.0). */
    public function focusedProductMinScore(): float
    {
        return $this->requiredFloat('focused_product_min_score', 0.0);
    }

    /** Required float `focused_product_min_gap` (>= 0.0). */
    public function focusedProductMinGap(): float
    {
        return $this->requiredFloat('focused_product_min_gap', 0.0);
    }

    /** Required integer `focused_product_max_chunks` (>= 1). */
    public function focusedProductMaxChunks(): int
    {
        return $this->requiredInt('focused_product_max_chunks', 1);
    }

    /** @return string[] */
    public function catalogListShortcutPatterns(): array
    {
        return $this->requiredStringList('catalog_list_shortcut_patterns');
    }

    /**
     * Genre override if it yields at least one entry, otherwise the required config map.
     *
     * @return array<string, string[]>
     */
    public function exactSelectionTokenVariantPrefixes(): array
    {
        $fromGenre = $this->genreStringListMap('retrieval_and_language.exact_selection.token_variant_prefixes');
        if ($fromGenre !== []) {
            return $fromGenre;
        }

        return $this->requiredStringListMap('exact_selection_token_variant_prefixes');
    }

    /** @return string[] */
    public function exactSelectionTokenVariantSuffixes(): array
    {
        return $this->requiredStringList('exact_selection_token_variant_suffixes');
    }

    /** @return string[] */
    public function exactSelectionIndicatorQuestionTokens(): array
    {
        return $this->genreStringListOrRequired(
            'retrieval_and_language.exact_selection.indicator_question_tokens',
            'exact_selection_indicator_question_tokens',
        );
    }

    /** @return string[] */
    public function exactSelectionIndicatorQuestionPhrases(): array
    {
        return $this->genreStringListOrRequired(
            'retrieval_and_language.exact_selection.indicator_question_phrases',
            'exact_selection_indicator_question_phrases',
        );
    }

    /** @return string[] */
    public function exactSelectionIndicatorTableHeadingPatterns(): array
    {
        return $this->genreStringListOrRequired(
            'retrieval_and_language.exact_selection.indicator_table_heading_patterns',
            'exact_selection_indicator_table_heading_patterns',
        );
    }

    /** @return string[] */
    public function exactSelectionIndicatorTableHeaderPatterns(): array
    {
        return $this->genreStringListOrRequired(
            'retrieval_and_language.exact_selection.indicator_table_header_patterns',
            'exact_selection_indicator_table_header_patterns',
        );
    }

    /** @return string[] */
    public function exactSelectionIndicatorTableRowPatterns(): array
    {
        return $this->genreStringListOrRequired(
            'retrieval_and_language.exact_selection.indicator_table_row_patterns',
            'exact_selection_indicator_table_row_patterns',
        );
    }

    /** @return string[] */
    public function exactSelectionIndicatorTableRequiredPrimaryTerms(): array
    {
        return $this->genreStringListOrRequired(
            'retrieval_and_language.exact_selection.indicator_table_required_primary_terms',
            'exact_selection_indicator_table_required_primary_terms',
        );
    }

    /** @return string[] */
    public function exactSelectionIndicatorTableRequiredContextTerms(): array
    {
        return $this->genreStringListOrRequired(
            'retrieval_and_language.exact_selection.indicator_table_required_context_terms',
            'exact_selection_indicator_table_required_context_terms',
        );
    }

    /** @return string[] */
    public function exactDetailTokens(): array
    {
        return $this->requiredStringList('exact_detail_tokens');
    }

    /** Required non-empty string `generic_exact_selection_cleanup_profile`. */
    public function genericExactSelectionCleanupProfile(): string
    {
        return $this->requiredString('generic_exact_selection_cleanup_profile');
    }

    /** @return string[] */
    public function genericExactSelectionTokens(): array
    {
        return $this->requiredStringList('generic_exact_selection_tokens');
    }

    /** @return string[] */
    public function genericProductTokens(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'generic_product_tokens',
            'vocabulary_views.generic_product_tokens'
        );
    }

    /** @return string[] */
    public function importantShortModelTokens(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'important_short_model_tokens',
            'vocabulary_views.important_short_model_tokens'
        );
    }

    /** @return string[] */
    public function familyDescriptorTokens(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'family_descriptor_tokens',
            'vocabulary_views.family_descriptor_tokens'
        );
    }

    /** @return string[] */
    public function looksLikeReagentTokens(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_reagent_tokens',
            'vocabulary_views.looks_like_reagent_tokens'
        );
    }

    /** @return string[] */
    public function looksLikeSafetyDocs(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_safety_docs',
            'vocabulary_views.looks_like_safety_docs'
        );
    }

    /** @return string[] */
    public function looksLikeReagentWords(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_reagent_words',
            'vocabulary_views.looks_like_reagent_words'
        );
    }

    /** @return string[] */
    public function looksLikeDocumentWords(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_document_words',
            'vocabulary_views.looks_like_document_words'
        );
    }

    /** @return string[] */
    public function looksLikeSafetyWords(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_safety_words',
            'vocabulary_views.looks_like_safety_words'
        );
    }

    /** @return string[] */
    public function looksLikeDeviceWords(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_device_words',
            'vocabulary_views.looks_like_device_words'
        );
    }

    /**
     * Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps.
     *
     * @return array<string, array<int, string>>
     */
    public function vocabularyToArray(): array
    {
        return [
            'generic_product_tokens' => $this->genericProductTokens(),
            'important_short_model_tokens' => $this->importantShortModelTokens(),
            'family_descriptor_tokens' => $this->familyDescriptorTokens(),
            'looks_like_reagent_tokens' => $this->looksLikeReagentTokens(),
            'looks_like_safety_docs' => $this->looksLikeSafetyDocs(),
            'looks_like_reagent_words' => $this->looksLikeReagentWords(),
            'looks_like_document_words' => $this->looksLikeDocumentWords(),
            'looks_like_safety_words' => $this->looksLikeSafetyWords(),
            'looks_like_device_words' => $this->looksLikeDeviceWords(),
        ];
    }

    /**
     * Full effective configuration; every getter is evaluated, so any missing
     * or invalid key makes this method throw.
     *
     * @return array<string, mixed>
     *
     * @throws InvalidArgumentException
     */
    public function toArray(): array
    {
        $settings = [
            'hard_max_chunks' => $this->hardMaxChunks(),
            'hard_max_vectork' => $this->hardMaxVectorK(),
            'hard_max_keywordk' => $this->hardMaxKeywordK(),
            'vector_score_threshold' => $this->vectorScoreThreshold(),
            'threshold_floor' => $this->thresholdFloor(),
            'threshold_ceil' => $this->thresholdCeil(),
            'list_bonus' => $this->listBonus(),
            'rrf_k' => $this->rrfK(),
            'keyword_topk_multiplier' => $this->keywordTopKMultiplier(),
            'keyword_score_threshold' => $this->keywordScoreThreshold(),
            'keyword_rrf_weight' => $this->keywordRrfWeight(),
            'scoped_vector_rrf_weight' => $this->scopedVectorRrfWeight(),
            'scoped_keyword_rrf_weight' => $this->scopedKeywordRrfWeight(),
            'empty_rrf_fallback_topn' => $this->emptyRrfFallbackTopN(),
            'max_chunks_per_doc' => $this->maxChunksPerDoc(),
            'min_chunk_distance' => $this->minChunkDistance(),
            'dominant_doc_window' => $this->dominantDocWindow(),
            'dominant_doc_min_hits' => $this->dominantDocMinHits(),
            'dominant_doc_max_chunks' => $this->dominantDocMaxChunks(),
            'exact_document_max_chunks' => $this->exactDocumentMaxChunks(),
            'focused_product_window' => $this->focusedProductWindow(),
            'focused_product_min_score' => $this->focusedProductMinScore(),
            'focused_product_min_gap' => $this->focusedProductMinGap(),
            'focused_product_max_chunks' => $this->focusedProductMaxChunks(),
            'catalog_list_shortcut_patterns' => $this->catalogListShortcutPatterns(),
            'exact_selection_token_variant_prefixes' => $this->exactSelectionTokenVariantPrefixes(),
            'exact_selection_token_variant_suffixes' => $this->exactSelectionTokenVariantSuffixes(),
            'exact_selection_indicator_question_tokens' => $this->exactSelectionIndicatorQuestionTokens(),
            'exact_selection_indicator_question_phrases' => $this->exactSelectionIndicatorQuestionPhrases(),
            'exact_selection_indicator_table_heading_patterns' => $this->exactSelectionIndicatorTableHeadingPatterns(),
            'exact_selection_indicator_table_header_patterns' => $this->exactSelectionIndicatorTableHeaderPatterns(),
            'exact_selection_indicator_table_row_patterns' => $this->exactSelectionIndicatorTableRowPatterns(),
            'exact_selection_indicator_table_required_primary_terms' => $this->exactSelectionIndicatorTableRequiredPrimaryTerms(),
            'exact_selection_indicator_table_required_context_terms' => $this->exactSelectionIndicatorTableRequiredContextTerms(),
            'exact_detail_tokens' => $this->exactDetailTokens(),
            'generic_exact_selection_cleanup_profile' => $this->genericExactSelectionCleanupProfile(),
            'generic_exact_selection_tokens' => $this->genericExactSelectionTokens(),
        ];

        // The vocabulary keys are disjoint from the keys above, so the array
        // union simply appends them in vocabularyToArray() order.
        return $settings + $this->vocabularyToArray();
    }

    /**
     * String list read from the genre configuration, or [] when no genre
     * configuration was injected or it returns null for the path.
     *
     * @return string[]
     */
    private function genreStringList(string $path): array
    {
        return $this->genreConfig?->getValueStringList($path) ?? [];
    }

    /**
     * Genre string list when non-empty, otherwise the required config list.
     * The config key is only read (and validated) when the genre list is empty.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException
     */
    private function genreStringListOrRequired(string $genrePath, string $configKey): array
    {
        $fromGenre = $this->genreStringList($genrePath);
        if ($fromGenre !== []) {
            return $fromGenre;
        }

        return $this->requiredStringList($configKey);
    }

    /**
     * Normalized map of string lists read from the genre configuration, or []
     * when nothing usable is configured there.
     *
     * @return array<string, string[]>
     */
    private function genreStringListMap(string $path): array
    {
        $value = $this->genreConfig?->getValueArray($path) ?? [];
        if ($value === []) {
            return [];
        }

        return $this->normalizeStringListMap($value);
    }

    /**
     * Reads a required integer and enforces the inclusive bounds.
     *
     * Integers, integral floats (60.0) and integral numeric strings ("60",
     * "1e3") are accepted. Fractional or non-finite values are rejected rather
     * than silently truncated by the integer cast.
     *
     * @throws InvalidArgumentException
     */
    private function requiredInt(string $key, int $min = PHP_INT_MIN, ?int $max = null): int
    {
        $raw = $this->requiredValue($key);
        if (!is_numeric($raw)) {
            throw $this->invalid($key, 'must be numeric');
        }

        if (!is_int($raw)) {
            $asFloat = (float) $raw;
            // (int) NAN and (int) INF both yield 0, and (int) 1.5 yields 1;
            // none of these should pass as a valid integer setting.
            if (!is_finite($asFloat) || floor($asFloat) !== $asFloat) {
                throw $this->invalid($key, 'must be an integer');
            }
        }

        $value = (int) $raw;
        if ($value < $min) {
            throw $this->invalid($key, sprintf('must be greater than or equal to %d', $min));
        }
        if ($max !== null && $value > $max) {
            throw $this->invalid($key, sprintf('must be less than or equal to %d', $max));
        }

        return $value;
    }

    /**
     * Reads a required float and enforces the inclusive bounds.
     *
     * NAN and +/-INF are rejected explicitly: every comparison with NAN is
     * false, so it would otherwise slip through both range checks.
     *
     * @throws InvalidArgumentException
     */
    private function requiredFloat(string $key, float $min = -INF, ?float $max = null): float
    {
        $raw = $this->requiredValue($key);
        if (!is_numeric($raw)) {
            throw $this->invalid($key, 'must be numeric');
        }

        $value = (float) $raw;
        if (!is_finite($value)) {
            throw $this->invalid($key, 'must be a finite number');
        }
        if ($value < $min) {
            throw $this->invalid($key, sprintf('must be greater than or equal to %s', (string) $min));
        }
        if ($max !== null && $value > $max) {
            throw $this->invalid($key, sprintf('must be less than or equal to %s', (string) $max));
        }

        return $value;
    }

    /**
     * Reads a required top-level value and returns it as a trimmed, non-empty string.
     *
     * @throws InvalidArgumentException
     */
    private function requiredString(string $key): string
    {
        return $this->nonEmptyString($key, $this->requiredValue($key));
    }

    /**
     * Casts a scalar to a trimmed string and rejects non-scalars and blank results.
     *
     * @param string $key Config key or path used in the error message.
     *
     * @throws InvalidArgumentException
     */
    private function nonEmptyString(string $key, mixed $value): string
    {
        if (!is_scalar($value)) {
            throw $this->invalid($key, 'must be a non-empty string');
        }

        $text = trim((string) $value);
        if ($text === '') {
            throw $this->invalid($key, 'must be a non-empty string');
        }

        return $text;
    }

    /**
     * Reads a required list; non-scalar and blank items are dropped, the rest
     * are trimmed and de-duplicated. At least one item must remain.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException
     */
    private function requiredStringList(string $key): array
    {
        $value = $this->requiredValue($key);
        if (!is_array($value)) {
            throw $this->invalid($key, 'must be a list of non-empty strings');
        }

        $items = $this->cleanStringList($value);
        if ($items === []) {
            throw $this->invalid($key, 'must contain at least one non-empty string');
        }

        return $items;
    }

    /**
     * Trims scalar items, drops non-scalars and blanks, and removes duplicates
     * while keeping first-occurrence order.
     *
     * @param array<array-key, mixed> $items
     *
     * @return string[]
     */
    private function cleanStringList(array $items): array
    {
        $clean = [];
        foreach ($items as $item) {
            if (!is_scalar($item)) {
                continue;
            }
            $text = trim((string) $item);
            if ($text !== '') {
                $clean[] = $text;
            }
        }

        // array_unique compares as strings by default and keeps the first
        // occurrence; array_values restores a 0..n-1 list.
        return array_values(array_unique($clean));
    }

    /**
     * Keeps only entries with a non-blank string key and an array value whose
     * cleaned list is non-empty. Keys are trimmed.
     *
     * @param array<array-key, mixed> $value
     *
     * @return array<string, string[]>
     */
    private function normalizeStringListMap(array $value): array
    {
        $out = [];
        foreach ($value as $mapKey => $items) {
            if (!is_string($mapKey) || trim($mapKey) === '' || !is_array($items)) {
                continue;
            }
            $cleanItems = $this->cleanStringList($items);
            if ($cleanItems !== []) {
                $out[trim($mapKey)] = $cleanItems;
            }
        }

        return $out;
    }

    /**
     * Reads a required map of string lists; at least one usable entry must remain
     * after normalization.
     *
     * @return array<string, string[]>
     *
     * @throws InvalidArgumentException
     */
    private function requiredStringListMap(string $key): array
    {
        $value = $this->requiredValue($key);
        if (!is_array($value)) {
            throw $this->invalid($key, 'must be a map of string lists');
        }

        $out = $this->normalizeStringListMap($value);
        if ($out === []) {
            throw $this->invalid($key, 'must contain at least one non-empty map entry');
        }

        return $out;
    }

    /**
     * Returns the list configured under $configPath when that key exists;
     * otherwise resolves the view name stored at $viewPathConfigPath and asks
     * the vocabulary for that view.
     *
     * NOTE(review): the vocabulary result is returned as-is; presumably
     * view() already yields a clean string list — confirm against
     * DomainVocabularyConfig.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException when neither source yields a non-empty list
     */
    private function configuredStringListOrVocabularyView(string $configPath, string $viewPathConfigPath): array
    {
        if ($this->hasKey($configPath)) {
            return $this->requiredStringList($configPath);
        }
        if ($this->vocabulary === null) {
            throw $this->missing($configPath);
        }

        $viewPath = $this->requiredPathString($viewPathConfigPath);
        $terms = $this->vocabulary->view($viewPath, []);
        if ($terms === []) {
            throw $this->invalid($viewPathConfigPath, sprintf('references empty vocabulary view "%s"', $viewPath));
        }

        return $terms;
    }

    /**
     * Like requiredString(), but $key is a dot-separated path into the config array.
     *
     * @throws InvalidArgumentException
     */
    private function requiredPathString(string $key): string
    {
        return $this->nonEmptyString($key, $this->requiredPathValue($key));
    }

    /**
     * Returns the value at a dot-separated path or throws when any segment is absent.
     *
     * @throws InvalidArgumentException
     */
    private function requiredPathValue(string $key): mixed
    {
        [$found, $value] = $this->resolvePath($key);
        if (!$found) {
            throw $this->missing($key);
        }

        return $value;
    }

    /** Whether a dot-separated path exists in the config array (null values count as present). */
    private function hasKey(string $key): bool
    {
        return $this->resolvePath($key)[0];
    }

    /**
     * Walks a dot-separated path through the nested config array.
     *
     * @return array{0: bool, 1: mixed} [found, value]; value is null when not found
     */
    private function resolvePath(string $key): array
    {
        $current = $this->config;
        foreach (explode('.', $key) as $segment) {
            if (!is_array($current) || !array_key_exists($segment, $current)) {
                return [false, null];
            }
            $current = $current[$segment];
        }

        return [true, $current];
    }

    /**
     * Returns a top-level config value; the key is looked up literally (no dot traversal).
     *
     * @throws InvalidArgumentException
     */
    private function requiredValue(string $key): mixed
    {
        if (!array_key_exists($key, $this->config)) {
            throw $this->missing($key);
        }

        return $this->config[$key];
    }

    /** Builds the exception for an absent key. */
    private function missing(string $key): InvalidArgumentException
    {
        return new InvalidArgumentException(sprintf('RetrieX retrieval config "%s" is missing.', $key));
    }

    /** Builds the exception for a present but unacceptable value. */
    private function invalid(string $key, string $reason): InvalidArgumentException
    {
        return new InvalidArgumentException(sprintf('RetrieX retrieval config "%s" %s.', $key, $reason));
    }
}