diff --git a/RETRIEX_CONFIG_FIX_README.md b/RETRIEX_CONFIG_FIX_README.md new file mode 100644 index 0000000..c1f7ea5 --- /dev/null +++ b/RETRIEX_CONFIG_FIX_README.md @@ -0,0 +1,35 @@ +# RetrieX config hardening overlay + +Scope: +- Multi-tenant and configuration hardening only. +- No retrieval logic rewrite. +- No prompt logic rewrite. +- No security/secrets cleanup in this round. + +Install: +1. Backup the current project. +2. Extract this ZIP over the project root. +3. Clear Symfony cache. +4. Run: + bin/console mto:agent:config:dump-effective --summary + bin/console mto:agent:config:validate +5. Run the existing 1.4.2 regression tests before deployment. + +New config files: +- config/retriex/runtime.yaml +- config/retriex/index.yaml +- config/retriex/vector.yaml +- config/retriex/commerce.yaml +- config/retriex/model.yaml +- config/retriex/prompt.yaml +- config/retriex/agent.yaml +- config/retriex/retrieval.yaml + +New commands: +- mto:agent:config:dump-effective +- mto:agent:config:validate + +Notes: +- Existing mto.* parameters remain for compatibility. +- services.yaml imports config/retriex/*.yaml explicitly in stable order. +- Retrieval constants are exposed as inventory and validation baseline, but are not dynamically changed in this round. diff --git a/config/retriex/agent.yaml b/config/retriex/agent.yaml new file mode 100644 index 0000000..d369e85 --- /dev/null +++ b/config/retriex/agent.yaml @@ -0,0 +1,8 @@ +# Agent orchestration limits and user-visible source/progress labels. +# Values mirror the current 1.4.2 defaults. 
+parameters: + retriex.agent.config: + commerce_history_budget_chars: 1000 + product_search_knowledge_chunk_limit: 6 + advisory_product_search_knowledge_chunk_limit: 9 + optimized_shop_query_prefix_pattern: '/^(?:keywords?|suchquery|search\s*query|query)\s*:\s*/iu' diff --git a/config/retriex/commerce.yaml b/config/retriex/commerce.yaml new file mode 100644 index 0000000..02ad973 --- /dev/null +++ b/config/retriex/commerce.yaml @@ -0,0 +1,12 @@ +# Commerce / Shopware Store API configuration. +# The existing Commerce and Shopware services stay unchanged; these values only centralize wiring. +parameters: + retriex.commerce.enabled: true + retriex.commerce.max_shop_results: '%env(SHOPWARE_STORE_API_MAX_RESULT)%' + retriex.commerce.shop_timeout: 5 + retriex.commerce.store_api_base_url: '%env(SHOPWARE_STORE_API_BASE_URL)%' + retriex.commerce.sales_channel_access_key: '%env(SHOPWARE_SALES_CHANNEL_ACCESS_KEY)%' + + retriex.commerce.search_repair.enabled: true + retriex.commerce.search_repair.max_queries: 3 + retriex.commerce.search_repair.min_primary_results_without_repair: 2 diff --git a/config/retriex/index.yaml b/config/retriex/index.yaml new file mode 100644 index 0000000..5192d51 --- /dev/null +++ b/config/retriex/index.yaml @@ -0,0 +1,10 @@ +# Fallback ingest/index configuration. +# DB based IngestProfile remains authoritative when an active profile exists. +parameters: + retriex.index.chunk_size: 250 + retriex.index.chunk_overlap: 50 + retriex.index.embedding_model: 'intfloat/multilingual-e5-base' + retriex.index.embedding_dimension: 768 + retriex.index.scoring_version: 1 + retriex.index.index_format: 'ndjson' + retriex.index.vector_backend: 'faiss' diff --git a/config/retriex/model.yaml b/config/retriex/model.yaml new file mode 100644 index 0000000..34cbccf --- /dev/null +++ b/config/retriex/model.yaml @@ -0,0 +1,17 @@ +# Default model generation values used only when no DB value/user input overrides them. 
+parameters: + retriex.model.default_name: 'mto-model' + retriex.model.default_stream: false + retriex.model.default_temperature: 0.1 + retriex.model.default_top_k: 20 + retriex.model.default_top_p: 0.8 + retriex.model.default_repeat_penalty: 1.05 + retriex.model.default_num_ctx: 4096 + retriex.model.default_retrieval_max_chunks: 25 + retriex.model.default_retrieval_vector_top_k: 25 + + retriex.model.guardrail_min_num_ctx: 512 + retriex.model.guardrail_max_retrieval_chunks: 200 + retriex.model.guardrail_max_vector_top_k: 200 + + retriex.llm.timeout_seconds: 600 diff --git a/config/retriex/prompt.yaml b/config/retriex/prompt.yaml new file mode 100644 index 0000000..12f9f6f --- /dev/null +++ b/config/retriex/prompt.yaml @@ -0,0 +1,18 @@ +# Prompt budget and prompt rendering limits. +# Existing prompt wording/rules remain in PromptBuilderConfig for this minimal-invasive round. +parameters: + retriex.prompt.config: + budget: + chars_per_token: 4 + history_padding_chars: 400 + output_reserve_ratio: 0.25 + output_reserve_min_tokens: 768 + output_reserve_max_tokens: 6000 + safety_reserve_ratio: 0.05 + safety_reserve_min_tokens: 256 + safety_reserve_max_tokens: 1024 + min_prompt_budget_tokens: 1024 + shop_results: + max_results_in_prompt: 24 + detailed_max_count: 5 + technical_product_keyword_match_threshold: 2 diff --git a/config/retriex/retrieval.yaml b/config/retriex/retrieval.yaml new file mode 100644 index 0000000..d096a53 --- /dev/null +++ b/config/retriex/retrieval.yaml @@ -0,0 +1,28 @@ +# Current 1.4.2 retrieval constants documented as configuration inventory. +# In this round these values are exposed by config dump/validation; the retriever logic remains unchanged. 
+parameters: + retriex.retrieval.inventory: + hard_max_chunks: 6 + hard_max_vectork: 18 + hard_max_keywordk: 36 + vector_score_threshold: 0.83 + threshold_floor: 0.75 + threshold_ceil: 0.90 + list_bonus: 1.35 + rrf_k: 50 + keyword_topk_multiplier: 2.0 + keyword_score_threshold: 0.35 + keyword_rrf_weight: 1.15 + scoped_vector_rrf_weight: 1.20 + scoped_keyword_rrf_weight: 1.30 + empty_rrf_fallback_topn: 1 + max_chunks_per_doc: 2 + min_chunk_distance: 2 + dominant_doc_window: 6 + dominant_doc_min_hits: 3 + dominant_doc_max_chunks: 4 + exact_document_max_chunks: 6 + focused_product_window: 8 + focused_product_min_score: 10.0 + focused_product_min_gap: 4.0 + focused_product_max_chunks: 4 diff --git a/config/retriex/runtime.yaml b/config/retriex/runtime.yaml new file mode 100644 index 0000000..5192e16 --- /dev/null +++ b/config/retriex/runtime.yaml @@ -0,0 +1,19 @@ +# RetrieX runtime paths. +# Keep these values customer/project specific and keep service wiring generic. +parameters: + retriex.root: '%kernel.project_dir%' + + retriex.knowledge.root: '%retriex.root%/var/knowledge' + retriex.knowledge.ndjson: '%retriex.knowledge.root%/index.ndjson' + retriex.knowledge.index_meta: '%retriex.knowledge.root%/index_meta.json' + retriex.knowledge.vector_index: '%retriex.knowledge.root%/vector.index' + retriex.knowledge.vector_index_meta: '%retriex.knowledge.root%/vector.index.meta.json' + retriex.knowledge.runtime_meta: '%retriex.knowledge.root%/index_runtime.json' + retriex.knowledge.upload: '%retriex.knowledge.root%/uploads' + + retriex.knowledge.tags_ndjson: '%retriex.knowledge.root%/tags.ndjson' + retriex.knowledge.vector_tags_index: '%retriex.knowledge.root%/vector_tags.index' + retriex.knowledge.vector_tags_index_meta: '%retriex.knowledge.root%/vector_tags.index.meta.json' + + retriex.locks.dir: '%retriex.knowledge.root%/locks' + retriex.tags.rebuild_lock: '%retriex.locks.dir%/tag_rebuild.lock' diff --git a/config/retriex/vector.yaml b/config/retriex/vector.yaml new file 
mode 100644 index 0000000..d0c38ab --- /dev/null +++ b/config/retriex/vector.yaml @@ -0,0 +1,31 @@ +# Vector runtime configuration shared by Symfony and vector control commands. +parameters: + retriex.vector.script_dir: '%retriex.root%/python/vector' + retriex.vector.python_bin: '%retriex.root%/.venv/bin/python3' + retriex.vector.control_script: '%retriex.vector.script_dir%/vector_control.py' + retriex.vector.ingest_script: '%retriex.vector.script_dir%/vector_ingest.py' + retriex.vector.search_script: '%retriex.vector.script_dir%/vector_search.py' + retriex.vector.ingest_tags_script: '%retriex.vector.script_dir%/vector_ingest_tags.py' + retriex.vector.search_tags_script: '%retriex.vector.script_dir%/vector_search_tags.py' + + retriex.vector.host: '0.0.0.0' + retriex.vector.port: 8090 + retriex.vector.service_url: 'http://127.0.0.1:%retriex.vector.port%' + retriex.vector.timeout: 600 + + retriex.vector.search.min_score: 0.30 + retriex.vector.search.max_limit: 200 + retriex.vector.search.http_timeout: 10 + + retriex.vector.tags.min_score: 0.72 + retriex.vector.tags.default_limit: 8 + retriex.vector.tags.max_limit: 50 + retriex.vector.tags.http_timeout: 10 + + retriex.vector.tag_routing.default_topk: 8 + retriex.vector.tag_routing.min_best_score: 0.72 + retriex.vector.tag_routing.max_score_drop_from_best: 0.08 + retriex.vector.tag_routing.max_routing_tags: 5 + retriex.vector.tag_routing.max_candidate_docs: 80 + retriex.vector.tag_routing.multi_tag_bonus_per_extra_tag: 0.05 + retriex.vector.tag_routing.max_multi_tag_bonus: 0.15 diff --git a/rag-inprogress.zip b/rag-inprogress.zip new file mode 100644 index 0000000..e54f896 Binary files /dev/null and b/rag-inprogress.zip differ diff --git a/src/Command/ConfigDumpEffectiveCommand.php b/src/Command/ConfigDumpEffectiveCommand.php new file mode 100644 index 0000000..ed708b7 --- /dev/null +++ b/src/Command/ConfigDumpEffectiveCommand.php @@ -0,0 +1,120 @@ +addOption('summary', null, InputOption::VALUE_NONE, 'Render a compact 
summary instead of JSON.'); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $config = $this->provider->dump(); + + if ((bool) $input->getOption('summary')) { + $this->renderSummary(new SymfonyStyle($input, $output), $config); + + return Command::SUCCESS; + } + + $json = json_encode($config, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE); /* the dump can contain provider exception messages; substitute invalid UTF-8 instead of failing the whole dump */ + $output->writeln(is_string($json) ? $json : '{}'); /* keep stdout parseable even if encoding still fails */ + + return is_string($json) ? Command::SUCCESS : Command::FAILURE; /* an encoding failure must be visible in the exit code, not reported as success */ + } + + /** + * @param array $config + */ + private function renderSummary(SymfonyStyle $io, array $config): void + { + $io->title('RetrieX effective configuration'); + + $runtime = $this->section($config, 'runtime'); + $model = $this->section($config, 'model_generation'); + $index = $this->section($config, 'index'); + $retrieval = $this->section($config, 'retrieval'); + $vector = $this->section($config, 'vector'); + $commerce = $this->section($config, 'commerce'); + + $io->section('Runtime'); + $io->definitionList( + ['root' => (string) ($runtime['root'] ?? '')], + ['knowledge_root' => (string) ($runtime['knowledge_root'] ?? '')], + ['index_ndjson' => (string) ($runtime['index_ndjson'] ?? '')] + ); + + $io->section('Model'); + $io->definitionList( + ['model_name' => (string) ($model['model_name'] ?? $model['default_model_name'] ?? '')], + ['num_ctx' => (string) ($model['num_ctx'] ?? $model['default_num_ctx'] ?? '')], + ['retrieval_max_chunks' => (string) ($model['retrieval_max_chunks'] ?? $model['default_retrieval_max_chunks'] ?? '')], + ['retrieval_vector_top_k' => (string) ($model['retrieval_vector_top_k'] ?? $model['default_retrieval_vector_top_k'] ?? '')] + ); + + $io->section('Index'); + $io->definitionList( + ['chunk_size' => (string) ($index['chunk_size'] ?? $index['fallback_chunk_size'] ?? '')], + ['chunk_overlap' => (string) ($index['chunk_overlap'] ?? $index['fallback_chunk_overlap'] ?? '')], + ['embedding_model' => (string) ($index['embedding_model'] ??
$index['fallback_embedding_model'] ?? '')], + ['embedding_dimension' => (string) ($index['embedding_dimension'] ?? $index['fallback_embedding_dimension'] ?? '')] + ); + + $io->section('Retrieval'); + $io->definitionList( + ['hard_max_chunks' => (string) ($retrieval['hard_max_chunks'] ?? '')], + ['hard_max_vectork' => (string) ($retrieval['hard_max_vectork'] ?? '')], + ['vector_score_threshold' => (string) ($retrieval['vector_score_threshold'] ?? '')] + ); + + $io->section('Vector'); + $io->definitionList( + ['service_url' => (string) ($vector['service_url'] ?? '')], + ['port' => (string) ($vector['port'] ?? '')], + ['timeout' => (string) ($vector['timeout'] ?? '')] + ); + + $io->section('Commerce'); + $io->definitionList( + ['enabled' => $this->formatBool($commerce['enabled'] ?? false)], + ['max_shop_results' => (string) ($commerce['max_shop_results'] ?? '')], + ['store_api_base_url' => (string) ($commerce['store_api_base_url'] ?? '')] + ); + } + + /** + * @param array $data + * @return array + */ + private function section(array $data, string $key): array + { + return isset($data[$key]) && is_array($data[$key]) ? $data[$key] : []; + } + + private function formatBool(mixed $value): string + { + return filter_var($value, FILTER_VALIDATE_BOOLEAN) ? 'yes' : 'no'; + } +} diff --git a/src/Command/ConfigValidateCommand.php b/src/Command/ConfigValidateCommand.php new file mode 100644 index 0000000..9754f5d --- /dev/null +++ b/src/Command/ConfigValidateCommand.php @@ -0,0 +1,74 @@ +addOption('json', null, InputOption::VALUE_NONE, 'Render validation result as JSON.'); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $result = $this->provider->validate(); + + if ((bool) $input->getOption('json')) { + $json = json_encode($result, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE); + $output->writeln(is_string($json) ? $json : '{}'); + + return $result['status'] === 'OK' ? 
Command::SUCCESS : Command::FAILURE; + } + + $this->renderSummary(new SymfonyStyle($input, $output), $result); + + return $result['status'] === 'OK' ? Command::SUCCESS : Command::FAILURE; + } + + /** + * @param array{status:string, errors:list, warnings:list, config:array} $result + */ + private function renderSummary(SymfonyStyle $io, array $result): void + { + $io->title('RetrieX configuration validation'); + + if ($result['errors'] === [] && $result['warnings'] === []) { + $io->success('Configuration is valid.'); + return; + } + + if ($result['errors'] !== []) { + $io->section('Errors'); + foreach ($result['errors'] as $error) { + $io->writeln('- ' . $error); + } + } + + if ($result['warnings'] !== []) { + $io->section('Warnings'); + foreach ($result['warnings'] as $warning) { + $io->writeln('- ' . $warning); + } + } + } +} diff --git a/src/Config/ModelGenerationDefaultsConfig.php b/src/Config/ModelGenerationDefaultsConfig.php new file mode 100644 index 0000000..5b3e2c4 --- /dev/null +++ b/src/Config/ModelGenerationDefaultsConfig.php @@ -0,0 +1,66 @@ +modelName; + } + + public function isStream(): bool + { + return $this->stream; + } + + public function getTemperature(): float + { + return $this->temperature; + } + + public function getTopK(): int + { + return $this->topK; + } + + public function getTopP(): float + { + return $this->topP; + } + + public function getRepeatPenalty(): float + { + return $this->repeatPenalty; + } + + public function getNumCtx(): int + { + return $this->numCtx; + } + + public function getRetrievalMaxChunks(): int + { + return $this->retrievalMaxChunks; + } + + public function getRetrievalVectorTopK(): int + { + return $this->retrievalVectorTopK; + } +} diff --git a/src/Config/RetriexEffectiveConfigProvider.php b/src/Config/RetriexEffectiveConfigProvider.php new file mode 100644 index 0000000..aea7595 --- /dev/null +++ b/src/Config/RetriexEffectiveConfigProvider.php @@ -0,0 +1,459 @@ + + */ + public function dump(): array + { + return 
[ + 'runtime' => $this->runtimeConfig(), + 'index' => $this->indexConfig(), + 'model_generation' => $this->modelConfig(), + 'llm' => ['timeout_seconds' => $this->param('retriex.llm.timeout_seconds')], + 'retrieval' => $this->retrievalConfig(), + 'prompt' => $this->promptConfig(), + 'vector' => $this->vectorConfig(), + 'commerce' => $this->commerceConfig(), + ]; + } + + /** + * @return array{status:string, errors:list, warnings:list, config:array} + */ + public function validate(): array + { + $errors = []; + $warnings = []; + $config = $this->dump(); + + $this->validateRuntime($config['runtime'], $errors, $warnings); + $this->validateIndex($config['index'], $errors, $warnings); + $this->validateModel($config['model_generation'], $errors, $warnings); + $this->validateRetrieval($config['retrieval'], $errors, $warnings); + $this->validatePrompt($config['prompt'], $errors, $warnings); + $this->validateVector($config['vector'], $errors, $warnings); + $this->validateCommerce($config['commerce'], $errors, $warnings); + + return [ + 'status' => $errors === [] ? 
'OK' : 'ERROR', + 'errors' => $errors, + 'warnings' => $warnings, + 'config' => $config, + ]; + } + + /** + * @return array + */ + private function runtimeConfig(): array + { + return [ + 'root' => $this->param('retriex.root'), + 'knowledge_root' => $this->param('retriex.knowledge.root'), + 'index_ndjson' => $this->param('retriex.knowledge.ndjson'), + 'index_meta' => $this->param('retriex.knowledge.index_meta'), + 'runtime_meta' => $this->param('retriex.knowledge.runtime_meta'), + 'upload_dir' => $this->param('retriex.knowledge.upload'), + 'locks_dir' => $this->param('retriex.locks.dir'), + ]; + } + + /** + * @return array + */ + private function indexConfig(): array + { + try { + $index = $this->indexProvider->getConfiguration(); + + return [ + 'chunk_size' => $index->getChunkSize(), + 'chunk_overlap' => $index->getChunkOverlap(), + 'embedding_model' => $index->getEmbeddingModel(), + 'embedding_dimension' => $index->getEmbeddingDimension(), + 'scoring_version' => $index->getScoringVersion(), + 'index_format' => $index->getIndexFormat(), + 'vector_backend' => $index->getVectorBackend(), + ]; + } catch (\Throwable $e) { + return [ + 'error' => $e->getMessage(), + 'fallback_chunk_size' => $this->param('retriex.index.chunk_size'), + 'fallback_chunk_overlap' => $this->param('retriex.index.chunk_overlap'), + 'fallback_embedding_model' => $this->param('retriex.index.embedding_model'), + 'fallback_embedding_dimension' => $this->param('retriex.index.embedding_dimension'), + 'fallback_scoring_version' => $this->param('retriex.index.scoring_version'), + ]; + } + } + + /** + * @return array + */ + private function modelConfig(): array + { + try { + $model = $this->modelProvider->getActiveForModel(); + + return [ + 'model_name' => $model->getModelName(), + 'version' => $model->getVersion(), + 'active' => $model->isActive(), + 'stream' => $model->isStream(), + 'temperature' => $model->getTemperature(), + 'top_k' => $model->getTopK(), + 'top_p' => $model->getTopP(), + 
'repeat_penalty' => $model->getRepeatPenalty(), + 'num_ctx' => $model->getNumCtx(), + 'retrieval_max_chunks' => $model->getRetrievalMaxChunks(), + 'retrieval_vector_top_k' => $model->getRetrievalVectorTopK(), + ]; + } catch (\Throwable $e) { + return [ + 'error' => $e->getMessage(), + 'default_model_name' => $this->param('retriex.model.default_name'), + 'default_num_ctx' => $this->param('retriex.model.default_num_ctx'), + 'default_retrieval_max_chunks' => $this->param('retriex.model.default_retrieval_max_chunks'), + 'default_retrieval_vector_top_k' => $this->param('retriex.model.default_retrieval_vector_top_k'), + ]; + } + } + + /** + * @return array + */ + private function retrievalConfig(): array + { + return [ + 'hard_max_chunks' => NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS, + 'hard_max_vectork' => NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK, + 'hard_max_keywordk' => NdjsonHybridRetrieverConfig::HARD_MAX_KEYWORDK, + 'vector_score_threshold' => NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD, + 'threshold_floor' => NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR, + 'threshold_ceil' => NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, + 'list_bonus' => NdjsonHybridRetrieverConfig::LIST_BONUS, + 'rrf_k' => NdjsonHybridRetrieverConfig::RRF_K, + 'keyword_topk_multiplier' => NdjsonHybridRetrieverConfig::KEYWORD_TOPK_MULTIPLIER, + 'keyword_score_threshold' => NdjsonHybridRetrieverConfig::KEYWORD_SCORE_THRESHOLD, + 'keyword_rrf_weight' => NdjsonHybridRetrieverConfig::KEYWORD_RRF_WEIGHT, + 'scoped_vector_rrf_weight' => NdjsonHybridRetrieverConfig::SCOPED_VECTOR_RRF_WEIGHT, + 'scoped_keyword_rrf_weight' => NdjsonHybridRetrieverConfig::SCOPED_KEYWORD_RRF_WEIGHT, + 'empty_rrf_fallback_topn' => NdjsonHybridRetrieverConfig::EMPTY_RRF_FALLBACK_TOPN, + 'max_chunks_per_doc' => NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC, + 'min_chunk_distance' => NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE, + 'dominant_doc_window' => NdjsonHybridRetrieverConfig::DOMINANT_DOC_WINDOW, + 
'dominant_doc_min_hits' => NdjsonHybridRetrieverConfig::DOMINANT_DOC_MIN_HITS, + 'dominant_doc_max_chunks' => NdjsonHybridRetrieverConfig::DOMINANT_DOC_MAX_CHUNKS, + 'exact_document_max_chunks' => NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS, + 'focused_product_window' => NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_WINDOW, + 'focused_product_min_score' => NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_SCORE, + 'focused_product_min_gap' => NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_GAP, + 'focused_product_max_chunks' => NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MAX_CHUNKS, + 'inventory_parameter' => $this->param('retriex.retrieval.inventory', []), + ]; + } + + /** + * @return array + */ + private function promptConfig(): array + { + return [ + 'chars_per_token' => $this->promptConfig->getCharsPerToken(), + 'history_padding_chars' => $this->promptConfig->getHistoryPaddingChars(), + 'output_reserve_ratio' => $this->promptConfig->getOutputReserveRatio(), + 'output_reserve_min_tokens' => $this->promptConfig->getOutputReserveMinTokens(), + 'output_reserve_max_tokens' => $this->promptConfig->getOutputReserveMaxTokens(), + 'safety_reserve_ratio' => $this->promptConfig->getSafetyReserveRatio(), + 'safety_reserve_min_tokens' => $this->promptConfig->getSafetyReserveMinTokens(), + 'safety_reserve_max_tokens' => $this->promptConfig->getSafetyReserveMaxTokens(), + 'min_prompt_budget_tokens' => $this->promptConfig->getMinPromptBudgetTokens(), + 'max_shop_results_in_prompt' => $this->promptConfig->getMaxShopResultsInPrompt(), + 'detailed_shop_results_max_count' => $this->promptConfig->getDetailedShopResultsMaxCount(), + 'technical_product_keyword_match_threshold' => $this->promptConfig->getTechnicalProductKeywordMatchThreshold(), + ]; + } + + /** + * @return array + */ + private function vectorConfig(): array + { + return [ + 'service_url' => $this->param('retriex.vector.service_url'), + 'host' => $this->param('retriex.vector.host'), + 'port' => 
$this->param('retriex.vector.port'), + 'python_bin' => $this->param('retriex.vector.python_bin'), + 'control_script' => $this->param('retriex.vector.control_script'), + 'timeout' => $this->param('retriex.vector.timeout'), + 'search' => [ + 'min_score' => $this->param('retriex.vector.search.min_score'), + 'max_limit' => $this->param('retriex.vector.search.max_limit'), + 'http_timeout' => $this->param('retriex.vector.search.http_timeout'), + ], + 'tags' => [ + 'min_score' => $this->param('retriex.vector.tags.min_score'), + 'default_limit' => $this->param('retriex.vector.tags.default_limit'), + 'max_limit' => $this->param('retriex.vector.tags.max_limit'), + 'http_timeout' => $this->param('retriex.vector.tags.http_timeout'), + ], + 'tag_routing' => [ + 'default_topk' => $this->param('retriex.vector.tag_routing.default_topk'), + 'min_best_score' => $this->param('retriex.vector.tag_routing.min_best_score'), + 'max_score_drop_from_best' => $this->param('retriex.vector.tag_routing.max_score_drop_from_best'), + 'max_routing_tags' => $this->param('retriex.vector.tag_routing.max_routing_tags'), + 'max_candidate_docs' => $this->param('retriex.vector.tag_routing.max_candidate_docs'), + 'multi_tag_bonus_per_extra_tag' => $this->param('retriex.vector.tag_routing.multi_tag_bonus_per_extra_tag'), + 'max_multi_tag_bonus' => $this->param('retriex.vector.tag_routing.max_multi_tag_bonus'), + ], + ]; + } + + /** + * @return array Commerce parameters for the effective-config dump; the access key value is never included, only whether one is set. + */ + private function commerceConfig(): array + { + return [ + 'enabled' => $this->param('retriex.commerce.enabled'), + 'max_shop_results' => $this->param('retriex.commerce.max_shop_results'), + 'shop_timeout' => $this->param('retriex.commerce.shop_timeout'), + 'store_api_base_url' => $this->param('retriex.commerce.store_api_base_url'), + 'sales_channel_access_key_configured' => trim((string) $this->param('retriex.commerce.sales_channel_access_key', '')) !== '', /* param() yields null for a missing parameter and null !== '' is true, so a missing, null or blank key must be normalised before the comparison */ + 'search_repair' => [ + 'enabled' => $this->param('retriex.commerce.search_repair.enabled'), + 'max_queries' =>
$this->param('retriex.commerce.search_repair.max_queries'), + 'min_primary_results_without_repair' => $this->param('retriex.commerce.search_repair.min_primary_results_without_repair'), + ], + ]; + } + + /** + * @param array $runtime + * @param list $errors + * @param list $warnings + */ + private function validateRuntime(array $runtime, array &$errors, array &$warnings): void + { + foreach (['root', 'knowledge_root', 'index_ndjson', 'index_meta', 'upload_dir'] as $key) { + if (trim((string) ($runtime[$key] ?? '')) === '') { + $errors[] = 'runtime.' . $key . ' must not be empty.'; + } + } + } + + /** + * @param array $index + * @param list $errors + * @param list $warnings + */ + private function validateIndex(array $index, array &$errors, array &$warnings): void + { + if (isset($index['error'])) { + $warnings[] = 'index configuration could not be loaded from DB/provider: ' . (string) $index['error']; + return; + } + + $chunkSize = $this->asInt($index['chunk_size'] ?? null); + $chunkOverlap = $this->asInt($index['chunk_overlap'] ?? null); + + if ($chunkSize === null || $chunkSize <= 0) { + $errors[] = 'index.chunk_size must be greater than 0.'; + } + + if ($chunkOverlap === null || $chunkOverlap < 0) { + $errors[] = 'index.chunk_overlap must be greater than or equal to 0.'; + } + + if ($chunkSize !== null && $chunkOverlap !== null && $chunkOverlap >= $chunkSize) { + $errors[] = 'index.chunk_overlap must be smaller than index.chunk_size.'; + } + + if (trim((string) ($index['embedding_model'] ?? '')) === '') { + $errors[] = 'index.embedding_model must not be empty.'; + } + + if (($this->asInt($index['embedding_dimension'] ?? null) ?? 0) <= 0) { + $errors[] = 'index.embedding_dimension must be greater than 0.'; + } + + if (($this->asInt($index['scoring_version'] ?? null) ??
0) <= 0) { + $errors[] = 'index.scoring_version must be greater than 0.'; + } + } + + /** + * @param array $model + * @param list $errors + * @param list $warnings + */ + private function validateModel(array $model, array &$errors, array &$warnings): void + { + if (isset($model['error'])) { + $warnings[] = 'model configuration could not be loaded from DB/provider: ' . (string) $model['error']; + return; + } + + if (trim((string) ($model['model_name'] ?? '')) === '') { + $errors[] = 'model_generation.model_name must not be empty.'; + } + + if (($this->asInt($model['num_ctx'] ?? null) ?? 0) < 512) { + $errors[] = 'model_generation.num_ctx must be at least 512.'; + } + + if (($this->asInt($model['retrieval_max_chunks'] ?? null) ?? 0) < 1) { + $errors[] = 'model_generation.retrieval_max_chunks must be greater than 0.'; + } + + if (($this->asInt($model['retrieval_vector_top_k'] ?? null) ?? 0) < 1) { + $errors[] = 'model_generation.retrieval_vector_top_k must be greater than 0.'; + } + } + + /** + * @param array $retrieval + * @param list $errors + * @param list $warnings + */ + private function validateRetrieval(array $retrieval, array &$errors, array &$warnings): void + { + $floor = (float) $retrieval['threshold_floor']; + $threshold = (float) $retrieval['vector_score_threshold']; + $ceil = (float) $retrieval['threshold_ceil']; + + if ($floor > $threshold || $threshold > $ceil) { + $errors[] = 'retrieval threshold must satisfy threshold_floor <= vector_score_threshold <= threshold_ceil.'; + } + + if ((int) $retrieval['hard_max_chunks'] < 1) { + $errors[] = 'retrieval.hard_max_chunks must be greater than 0.'; + } + + $inventory = $retrieval['inventory_parameter'] ?? []; + if (is_array($inventory)) { + foreach ($inventory as $key => $value) { + if (array_key_exists($key, $retrieval) && $retrieval[$key] != $value) { + $warnings[] = 'retrieval.inventory.' . $key . 
' differs from active retriever constant.'; + } + } + } + } + + /** + * @param array $prompt + * @param list $errors + * @param list $warnings + */ + private function validatePrompt(array $prompt, array &$errors, array &$warnings): void + { + if ((int) $prompt['chars_per_token'] < 1) { + $errors[] = 'prompt.chars_per_token must be greater than 0.'; + } + + if ((float) $prompt['output_reserve_ratio'] < 0.0 || (float) $prompt['output_reserve_ratio'] > 1.0) { + $errors[] = 'prompt.output_reserve_ratio must be between 0 and 1.'; + } + + if ((float) $prompt['safety_reserve_ratio'] < 0.0 || (float) $prompt['safety_reserve_ratio'] > 1.0) { + $errors[] = 'prompt.safety_reserve_ratio must be between 0 and 1.'; + } + } + + /** + * @param array $vector Vector section of the effective config: service_url and port are checked, plus search.min_score and tags.min_score. + * @param list $errors + * @param list $warnings + */ + private function validateVector(array $vector, array &$errors, array &$warnings): void + { + if (trim((string) ($vector['service_url'] ?? '')) === '') { + $errors[] = 'vector.service_url must not be empty.'; + } + + if (filter_var($vector['port'] ?? null, FILTER_VALIDATE_INT, ['options' => ['min_range' => 1, 'max_range' => 65535]]) === false) { /* reject non-integer and out-of-range ports such as 70000 or '8090.7', which a bare (int) cast lets through */ + $errors[] = 'vector.port must be an integer between 1 and 65535.'; + } + + $search = is_array($vector['search'] ?? null) ? $vector['search'] : []; + $tags = is_array($vector['tags'] ?? null) ? $vector['tags'] : []; + + foreach (['search.min_score' => $search['min_score'] ?? null, 'tags.min_score' => $tags['min_score'] ?? null] as $name => $value) { + $score = $this->asFloat($value); + if ($score === null || $score < 0.0 || $score > 1.0) { + $errors[] = 'vector.' . $name . ' must be between 0 and 1.'; + } + } + } + + /** + * @param array $commerce + * @param list $errors + * @param list $warnings + */ + private function validateCommerce(array $commerce, array &$errors, array &$warnings): void + { + if (!$this->asBool($commerce['enabled'] ?? false)) { + return; + } + + if (trim((string) ($commerce['store_api_base_url'] ??
'')) === '') { + $errors[] = 'commerce.store_api_base_url must not be empty when commerce is enabled.'; + } + + if (($this->asInt($commerce['max_shop_results'] ?? null) ?? 0) < 1) { + $warnings[] = 'commerce.max_shop_results could not be resolved as a positive integer.'; + } + } + + private function param(string $name, mixed $default = null): mixed + { + if (!$this->parameters->has($name)) { + return $default; + } + + return $this->parameters->get($name); + } + + private function asInt(mixed $value): ?int + { + return is_numeric($value) ? (int) $value : null; + } + + private function asFloat(mixed $value): ?float + { + return is_numeric($value) ? (float) $value : null; + } + + private function asBool(mixed $value): bool + { + if (is_bool($value)) { + return $value; + } + + if (is_string($value)) { + return in_array(strtolower($value), ['1', 'true', 'yes', 'on'], true); + } + + return (bool) $value; + } +}