644 lines
20 KiB
PHP
644 lines
20 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Config;
|
|
|
|
use InvalidArgumentException;
|
|
|
|
/**
 * Validating, read-only view over the hybrid retriever configuration array.
 *
 * Nothing is validated at construction time: each accessor reads its key on
 * demand and throws an InvalidArgumentException when the key is missing or the
 * value fails validation. Some list accessors consult the optional genre
 * config or the optional domain vocabulary before/after the plain config key.
 */
final class NdjsonHybridRetrieverConfig
{
    /**
     * @param array<string, mixed>        $config      Raw retrieval settings, keyed by option name.
     * @param DomainVocabularyConfig|null $vocabulary  Optional source of vocabulary views used when a
     *                                                 token list is not configured directly.
     * @param GenreConfig|null            $genreConfig Optional genre overrides for exact-selection lists.
     */
    public function __construct(
        private array $config = [],
        private ?DomainVocabularyConfig $vocabulary = null,
        private ?GenreConfig $genreConfig = null,
    ) {
    }

    /** Required integer "hard_max_chunks", at least 1. */
    public function hardMaxChunks(): int
    {
        return $this->requiredInt('hard_max_chunks', 1);
    }

    /** Required integer "hard_max_vectork", at least 1. */
    public function hardMaxVectorK(): int
    {
        return $this->requiredInt('hard_max_vectork', 1);
    }

    /** Required integer "hard_max_keywordk", at least 1. */
    public function hardMaxKeywordK(): int
    {
        return $this->requiredInt('hard_max_keywordk', 1);
    }

    /** Required float "vector_score_threshold" within [0.0, 1.0]. */
    public function vectorScoreThreshold(): float
    {
        return $this->requiredFloat('vector_score_threshold', 0.0, 1.0);
    }

    /** Required float "threshold_floor" within [0.0, 1.0]. */
    public function thresholdFloor(): float
    {
        return $this->requiredFloat('threshold_floor', 0.0, 1.0);
    }

    /** Required float "threshold_ceil" within [0.0, 1.0]. */
    public function thresholdCeil(): float
    {
        return $this->requiredFloat('threshold_ceil', 0.0, 1.0);
    }

    /** Required float "list_bonus", at least 1.0. */
    public function listBonus(): float
    {
        return $this->requiredFloat('list_bonus', 1.0);
    }

    /** Required integer "rrf_k", at least 1. */
    public function rrfK(): int
    {
        return $this->requiredInt('rrf_k', 1);
    }

    /** Required float "keyword_topk_multiplier", at least 0.1. */
    public function keywordTopKMultiplier(): float
    {
        return $this->requiredFloat('keyword_topk_multiplier', 0.1);
    }

    /** Required float "keyword_score_threshold" within [0.0, 1.0]. */
    public function keywordScoreThreshold(): float
    {
        return $this->requiredFloat('keyword_score_threshold', 0.0, 1.0);
    }

    /** Required float "keyword_rrf_weight", at least 0.0. */
    public function keywordRrfWeight(): float
    {
        return $this->requiredFloat('keyword_rrf_weight', 0.0);
    }

    /** Required float "scoped_vector_rrf_weight", at least 0.0. */
    public function scopedVectorRrfWeight(): float
    {
        return $this->requiredFloat('scoped_vector_rrf_weight', 0.0);
    }

    /** Required float "scoped_keyword_rrf_weight", at least 0.0. */
    public function scopedKeywordRrfWeight(): float
    {
        return $this->requiredFloat('scoped_keyword_rrf_weight', 0.0);
    }

    /** Required integer "empty_rrf_fallback_topn", at least 1. */
    public function emptyRrfFallbackTopN(): int
    {
        return $this->requiredInt('empty_rrf_fallback_topn', 1);
    }

    /** Required integer "max_chunks_per_doc", at least 1. */
    public function maxChunksPerDoc(): int
    {
        return $this->requiredInt('max_chunks_per_doc', 1);
    }

    /** Required integer "min_chunk_distance", at least 0. */
    public function minChunkDistance(): int
    {
        return $this->requiredInt('min_chunk_distance', 0);
    }

    /** Required integer "dominant_doc_window", at least 1. */
    public function dominantDocWindow(): int
    {
        return $this->requiredInt('dominant_doc_window', 1);
    }

    /** Required integer "dominant_doc_min_hits", at least 1. */
    public function dominantDocMinHits(): int
    {
        return $this->requiredInt('dominant_doc_min_hits', 1);
    }

    /** Required integer "dominant_doc_max_chunks", at least 1. */
    public function dominantDocMaxChunks(): int
    {
        return $this->requiredInt('dominant_doc_max_chunks', 1);
    }

    /** Required integer "exact_document_max_chunks", at least 1. */
    public function exactDocumentMaxChunks(): int
    {
        return $this->requiredInt('exact_document_max_chunks', 1);
    }

    /** Required integer "focused_product_window", at least 1. */
    public function focusedProductWindow(): int
    {
        return $this->requiredInt('focused_product_window', 1);
    }

    /** Required float "focused_product_min_score", at least 0.0. */
    public function focusedProductMinScore(): float
    {
        return $this->requiredFloat('focused_product_min_score', 0.0);
    }

    /** Required float "focused_product_min_gap", at least 0.0. */
    public function focusedProductMinGap(): float
    {
        return $this->requiredFloat('focused_product_min_gap', 0.0);
    }

    /** Required integer "focused_product_max_chunks", at least 1. */
    public function focusedProductMaxChunks(): int
    {
        return $this->requiredInt('focused_product_max_chunks', 1);
    }

    /** @return string[] Required non-empty list "catalog_list_shortcut_patterns". */
    public function catalogListShortcutPatterns(): array
    {
        return $this->requiredStringList('catalog_list_shortcut_patterns');
    }

    /**
     * Genre-level prefix map when it yields at least one entry, otherwise the
     * required "exact_selection_token_variant_prefixes" config map.
     *
     * @return array<string, string[]>
     */
    public function exactSelectionTokenVariantPrefixes(): array
    {
        // `?:` is intentional: an empty genre result falls through to the required config key.
        return $this->genreStringListMap('retrieval_and_language.exact_selection.token_variant_prefixes')
            ?: $this->requiredStringListMap('exact_selection_token_variant_prefixes');
    }

    /** @return string[] Required non-empty list "exact_selection_token_variant_suffixes". */
    public function exactSelectionTokenVariantSuffixes(): array
    {
        return $this->requiredStringList('exact_selection_token_variant_suffixes');
    }

    /** @return string[] Genre list if non-empty, else required "exact_selection_indicator_question_tokens". */
    public function exactSelectionIndicatorQuestionTokens(): array
    {
        return $this->genreStringList('retrieval_and_language.exact_selection.indicator_question_tokens')
            ?: $this->requiredStringList('exact_selection_indicator_question_tokens');
    }

    /** @return string[] Genre list if non-empty, else required "exact_selection_indicator_question_phrases". */
    public function exactSelectionIndicatorQuestionPhrases(): array
    {
        return $this->genreStringList('retrieval_and_language.exact_selection.indicator_question_phrases')
            ?: $this->requiredStringList('exact_selection_indicator_question_phrases');
    }

    /** @return string[] Genre list if non-empty, else required "exact_selection_indicator_table_heading_patterns". */
    public function exactSelectionIndicatorTableHeadingPatterns(): array
    {
        return $this->genreStringList('retrieval_and_language.exact_selection.indicator_table_heading_patterns')
            ?: $this->requiredStringList('exact_selection_indicator_table_heading_patterns');
    }

    /** @return string[] Genre list if non-empty, else required "exact_selection_indicator_table_header_patterns". */
    public function exactSelectionIndicatorTableHeaderPatterns(): array
    {
        return $this->genreStringList('retrieval_and_language.exact_selection.indicator_table_header_patterns')
            ?: $this->requiredStringList('exact_selection_indicator_table_header_patterns');
    }

    /** @return string[] Genre list if non-empty, else required "exact_selection_indicator_table_row_patterns". */
    public function exactSelectionIndicatorTableRowPatterns(): array
    {
        return $this->genreStringList('retrieval_and_language.exact_selection.indicator_table_row_patterns')
            ?: $this->requiredStringList('exact_selection_indicator_table_row_patterns');
    }

    /** @return string[] Genre list if non-empty, else required "exact_selection_indicator_table_required_primary_terms". */
    public function exactSelectionIndicatorTableRequiredPrimaryTerms(): array
    {
        return $this->genreStringList('retrieval_and_language.exact_selection.indicator_table_required_primary_terms')
            ?: $this->requiredStringList('exact_selection_indicator_table_required_primary_terms');
    }

    /** @return string[] Genre list if non-empty, else required "exact_selection_indicator_table_required_context_terms". */
    public function exactSelectionIndicatorTableRequiredContextTerms(): array
    {
        return $this->genreStringList('retrieval_and_language.exact_selection.indicator_table_required_context_terms')
            ?: $this->requiredStringList('exact_selection_indicator_table_required_context_terms');
    }

    /** @return string[] Required non-empty list "exact_detail_tokens". */
    public function exactDetailTokens(): array
    {
        return $this->requiredStringList('exact_detail_tokens');
    }

    /** Required non-empty string "generic_exact_selection_cleanup_profile". */
    public function genericExactSelectionCleanupProfile(): string
    {
        return $this->requiredString('generic_exact_selection_cleanup_profile');
    }

    /** @return string[] Required non-empty list "generic_exact_selection_tokens". */
    public function genericExactSelectionTokens(): array
    {
        return $this->requiredStringList('generic_exact_selection_tokens');
    }

    /** @return string[] Configured "generic_product_tokens", or the referenced vocabulary view. */
    public function genericProductTokens(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'generic_product_tokens',
            'vocabulary_views.generic_product_tokens'
        );
    }

    /** @return string[] Configured "important_short_model_tokens", or the referenced vocabulary view. */
    public function importantShortModelTokens(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'important_short_model_tokens',
            'vocabulary_views.important_short_model_tokens'
        );
    }

    /** @return string[] Configured "family_descriptor_tokens", or the referenced vocabulary view. */
    public function familyDescriptorTokens(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'family_descriptor_tokens',
            'vocabulary_views.family_descriptor_tokens'
        );
    }

    /** @return string[] Configured "looks_like_reagent_tokens", or the referenced vocabulary view. */
    public function looksLikeReagentTokens(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_reagent_tokens',
            'vocabulary_views.looks_like_reagent_tokens'
        );
    }

    /** @return string[] Configured "looks_like_safety_docs", or the referenced vocabulary view. */
    public function looksLikeSafetyDocs(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_safety_docs',
            'vocabulary_views.looks_like_safety_docs'
        );
    }

    /** @return string[] Configured "looks_like_reagent_words", or the referenced vocabulary view. */
    public function looksLikeReagentWords(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_reagent_words',
            'vocabulary_views.looks_like_reagent_words'
        );
    }

    /** @return string[] Configured "looks_like_document_words", or the referenced vocabulary view. */
    public function looksLikeDocumentWords(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_document_words',
            'vocabulary_views.looks_like_document_words'
        );
    }

    /** @return string[] Configured "looks_like_safety_words", or the referenced vocabulary view. */
    public function looksLikeSafetyWords(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_safety_words',
            'vocabulary_views.looks_like_safety_words'
        );
    }

    /** @return string[] Configured "looks_like_device_words", or the referenced vocabulary view. */
    public function looksLikeDeviceWords(): array
    {
        return $this->configuredStringListOrVocabularyView(
            'looks_like_device_words',
            'vocabulary_views.looks_like_device_words'
        );
    }

    /**
     * Effective retrieval vocabulary as a dedicated structure for diagnostics and config dumps.
     *
     * @return array<string, array<int, string>>
     */
    public function vocabularyToArray(): array
    {
        return [
            'generic_product_tokens' => $this->genericProductTokens(),
            'important_short_model_tokens' => $this->importantShortModelTokens(),
            'family_descriptor_tokens' => $this->familyDescriptorTokens(),
            'looks_like_reagent_tokens' => $this->looksLikeReagentTokens(),
            'looks_like_safety_docs' => $this->looksLikeSafetyDocs(),
            'looks_like_reagent_words' => $this->looksLikeReagentWords(),
            'looks_like_document_words' => $this->looksLikeDocumentWords(),
            'looks_like_safety_words' => $this->looksLikeSafetyWords(),
            'looks_like_device_words' => $this->looksLikeDeviceWords(),
        ];
    }

    /**
     * Resolves every accessor into one flat array. Because each accessor
     * validates on read, this throws on the first missing or invalid option.
     *
     * @return array<string, mixed>
     */
    public function toArray(): array
    {
        $settings = [
            'hard_max_chunks' => $this->hardMaxChunks(),
            'hard_max_vectork' => $this->hardMaxVectorK(),
            'hard_max_keywordk' => $this->hardMaxKeywordK(),
            'vector_score_threshold' => $this->vectorScoreThreshold(),
            'threshold_floor' => $this->thresholdFloor(),
            'threshold_ceil' => $this->thresholdCeil(),
            'list_bonus' => $this->listBonus(),
            'rrf_k' => $this->rrfK(),
            'keyword_topk_multiplier' => $this->keywordTopKMultiplier(),
            'keyword_score_threshold' => $this->keywordScoreThreshold(),
            'keyword_rrf_weight' => $this->keywordRrfWeight(),
            'scoped_vector_rrf_weight' => $this->scopedVectorRrfWeight(),
            'scoped_keyword_rrf_weight' => $this->scopedKeywordRrfWeight(),
            'empty_rrf_fallback_topn' => $this->emptyRrfFallbackTopN(),
            'max_chunks_per_doc' => $this->maxChunksPerDoc(),
            'min_chunk_distance' => $this->minChunkDistance(),
            'dominant_doc_window' => $this->dominantDocWindow(),
            'dominant_doc_min_hits' => $this->dominantDocMinHits(),
            'dominant_doc_max_chunks' => $this->dominantDocMaxChunks(),
            'exact_document_max_chunks' => $this->exactDocumentMaxChunks(),
            'focused_product_window' => $this->focusedProductWindow(),
            'focused_product_min_score' => $this->focusedProductMinScore(),
            'focused_product_min_gap' => $this->focusedProductMinGap(),
            'focused_product_max_chunks' => $this->focusedProductMaxChunks(),
            'catalog_list_shortcut_patterns' => $this->catalogListShortcutPatterns(),
            'exact_selection_token_variant_prefixes' => $this->exactSelectionTokenVariantPrefixes(),
            'exact_selection_token_variant_suffixes' => $this->exactSelectionTokenVariantSuffixes(),
            'exact_selection_indicator_question_tokens' => $this->exactSelectionIndicatorQuestionTokens(),
            'exact_selection_indicator_question_phrases' => $this->exactSelectionIndicatorQuestionPhrases(),
            'exact_selection_indicator_table_heading_patterns' => $this->exactSelectionIndicatorTableHeadingPatterns(),
            'exact_selection_indicator_table_header_patterns' => $this->exactSelectionIndicatorTableHeaderPatterns(),
            'exact_selection_indicator_table_row_patterns' => $this->exactSelectionIndicatorTableRowPatterns(),
            'exact_selection_indicator_table_required_primary_terms' => $this->exactSelectionIndicatorTableRequiredPrimaryTerms(),
            'exact_selection_indicator_table_required_context_terms' => $this->exactSelectionIndicatorTableRequiredContextTerms(),
            'exact_detail_tokens' => $this->exactDetailTokens(),
            'generic_exact_selection_cleanup_profile' => $this->genericExactSelectionCleanupProfile(),
            'generic_exact_selection_tokens' => $this->genericExactSelectionTokens(),
        ];

        // The vocabulary section is appended from its single source of truth so the
        // two dumps cannot drift apart; keys are disjoint, so order is preserved.
        return array_merge($settings, $this->vocabularyToArray());
    }

    /**
     * String list from the genre config at $path, or [] when no genre config
     * is attached or it yields null.
     *
     * @return string[]
     */
    private function genreStringList(string $path): array
    {
        return $this->genreConfig?->getValueStringList($path) ?? [];
    }

    /**
     * Normalised string-list map from the genre config at $path, or [] when
     * no genre config is attached or the value is empty.
     *
     * @return array<string, string[]>
     */
    private function genreStringListMap(string $path): array
    {
        $value = $this->genreConfig?->getValueArray($path) ?? [];
        if ($value === []) {
            return [];
        }

        return $this->normalizeStringListMap($value);
    }

    /**
     * Reads a required top-level key as an integer within [$min, $max].
     *
     * Numeric strings and floats are accepted; a fractional part is dropped by
     * the integer cast. Non-finite values (NAN, INF) are rejected because they
     * have no meaningful integer representation.
     *
     * @throws InvalidArgumentException When the key is missing, not a finite number, or out of range.
     */
    private function requiredInt(string $key, int $min = PHP_INT_MIN, ?int $max = null): int
    {
        $value = $this->requiredValue($key);

        if (!is_numeric($value)) {
            throw $this->invalid($key, 'must be numeric');
        }

        // is_numeric() accepts float NAN/INF, which would otherwise be cast to a bogus int.
        if (!is_finite((float) $value)) {
            throw $this->invalid($key, 'must be a finite number');
        }

        $value = (int) $value;
        if ($value < $min) {
            throw $this->invalid($key, sprintf('must be greater than or equal to %d', $min));
        }

        if ($max !== null && $value > $max) {
            throw $this->invalid($key, sprintf('must be less than or equal to %d', $max));
        }

        return $value;
    }

    /**
     * Reads a required top-level key as a float within [$min, $max].
     *
     * Non-finite values are rejected up front: NAN compares false against both
     * bounds and INF passes any open-ended range, so neither would be caught
     * by the range checks below.
     *
     * @throws InvalidArgumentException When the key is missing, not a finite number, or out of range.
     */
    private function requiredFloat(string $key, float $min = -INF, ?float $max = null): float
    {
        $value = $this->requiredValue($key);

        if (!is_numeric($value)) {
            throw $this->invalid($key, 'must be numeric');
        }

        $value = (float) $value;
        if (!is_finite($value)) {
            throw $this->invalid($key, 'must be a finite number');
        }

        if ($value < $min) {
            throw $this->invalid($key, sprintf('must be greater than or equal to %s', (string) $min));
        }

        if ($max !== null && $value > $max) {
            throw $this->invalid($key, sprintf('must be less than or equal to %s', (string) $max));
        }

        return $value;
    }

    /**
     * Reads a required top-level key as a trimmed, non-empty string.
     *
     * @throws InvalidArgumentException When the key is missing, not scalar, or blank.
     */
    private function requiredString(string $key): string
    {
        return $this->nonEmptyString($key, $this->requiredValue($key));
    }

    /**
     * Reads a required top-level key as a de-duplicated list of trimmed,
     * non-empty strings. Non-scalar and blank items are dropped silently.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the key is missing, not an array, or has no usable item.
     */
    private function requiredStringList(string $key): array
    {
        $value = $this->requiredValue($key);

        if (!is_array($value)) {
            throw $this->invalid($key, 'must be a list of non-empty strings');
        }

        $out = $this->cleanStringList($value);
        if ($out === []) {
            throw $this->invalid($key, 'must contain at least one non-empty string');
        }

        return $out;
    }

    /**
     * Keeps only entries with a non-blank string key and an array value that
     * still has at least one item after cleaning; keys are trimmed.
     *
     * @return array<string, string[]>
     */
    private function normalizeStringListMap(array $value): array
    {
        $out = [];
        foreach ($value as $mapKey => $items) {
            if (!is_string($mapKey) || trim($mapKey) === '' || !is_array($items)) {
                continue;
            }

            $cleanItems = $this->cleanStringList($items);
            if ($cleanItems !== []) {
                $out[trim($mapKey)] = $cleanItems;
            }
        }

        return $out;
    }

    /**
     * Reads a required top-level key as a normalised map of string lists.
     *
     * @return array<string, string[]>
     *
     * @throws InvalidArgumentException When the key is missing, not an array, or has no usable entry.
     */
    private function requiredStringListMap(string $key): array
    {
        $value = $this->requiredValue($key);

        if (!is_array($value)) {
            throw $this->invalid($key, 'must be a map of string lists');
        }

        $out = $this->normalizeStringListMap($value);
        if ($out === []) {
            throw $this->invalid($key, 'must contain at least one non-empty map entry');
        }

        return $out;
    }

    /**
     * Returns the explicitly configured list at $configPath when that key
     * exists; otherwise resolves the vocabulary view whose name is stored in
     * the config at $viewPathConfigPath (dotted path).
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When neither source is available or the view is empty.
     */
    private function configuredStringListOrVocabularyView(string $configPath, string $viewPathConfigPath): array
    {
        if ($this->hasKey($configPath)) {
            return $this->requiredStringList($configPath);
        }

        if ($this->vocabulary === null) {
            throw $this->missing($configPath);
        }

        $viewPath = $this->requiredPathString($viewPathConfigPath);
        $terms = $this->vocabulary->view($viewPath, []);

        if ($terms === []) {
            throw $this->invalid($viewPathConfigPath, sprintf('references empty vocabulary view "%s"', $viewPath));
        }

        return $terms;
    }

    /**
     * Reads a required dotted-path key as a trimmed, non-empty string.
     *
     * @throws InvalidArgumentException When the path is missing, not scalar, or blank.
     */
    private function requiredPathString(string $key): string
    {
        return $this->nonEmptyString($key, $this->requiredPathValue($key));
    }

    /**
     * Returns the raw value stored at a dotted path inside the config.
     *
     * @throws InvalidArgumentException When any segment of the path is absent.
     */
    private function requiredPathValue(string $key): mixed
    {
        [$found, $value] = $this->resolvePath($key);
        if (!$found) {
            throw $this->missing($key);
        }

        return $value;
    }

    /** Whether a dotted path exists in the config (a stored null still counts as present). */
    private function hasKey(string $key): bool
    {
        return $this->resolvePath($key)[0];
    }

    /**
     * Returns the raw value of a top-level key; dots in $key are NOT treated as a path here.
     *
     * @throws InvalidArgumentException When the key is absent.
     */
    private function requiredValue(string $key): mixed
    {
        if (!array_key_exists($key, $this->config)) {
            throw $this->missing($key);
        }

        return $this->config[$key];
    }

    /**
     * Walks the config along a dot-separated path.
     *
     * @return array{0: bool, 1: mixed} Whether the path exists, and the value found (null when absent).
     */
    private function resolvePath(string $key): array
    {
        $current = $this->config;

        foreach (explode('.', $key) as $segment) {
            if (!is_array($current) || !array_key_exists($segment, $current)) {
                return [false, null];
            }

            $current = $current[$segment];
        }

        return [true, $current];
    }

    /**
     * Validates that $value is a scalar whose trimmed string form is non-empty.
     *
     * @throws InvalidArgumentException Reported against $key when the check fails.
     */
    private function nonEmptyString(string $key, mixed $value): string
    {
        if (!is_scalar($value)) {
            throw $this->invalid($key, 'must be a non-empty string');
        }

        $value = trim((string) $value);
        if ($value === '') {
            throw $this->invalid($key, 'must be a non-empty string');
        }

        return $value;
    }

    /**
     * Casts scalar items to trimmed strings, dropping non-scalars, blanks and
     * repeats while keeping first-seen order.
     *
     * @return string[]
     */
    private function cleanStringList(array $items): array
    {
        $seen = [];
        $clean = [];

        foreach ($items as $item) {
            if (!is_scalar($item)) {
                continue;
            }

            $item = trim((string) $item);
            // Key lookup replaces a linear in_array() scan; distinct strings never share an array key.
            if ($item === '' || isset($seen[$item])) {
                continue;
            }

            $seen[$item] = true;
            $clean[] = $item;
        }

        return $clean;
    }

    /** Builds the exception used for an absent config key. */
    private function missing(string $key): InvalidArgumentException
    {
        return new InvalidArgumentException(sprintf('RetrieX retrieval config "%s" is missing.', $key));
    }

    /** Builds the exception used for a present but unusable config value. */
    private function invalid(string $key, string $reason): InvalidArgumentException
    {
        return new InvalidArgumentException(sprintf('RetrieX retrieval config "%s" %s.', $key, $reason));
    }
}
|