diff --git a/RETRIEX_PATCH_61_GENRE_SOURCE_OF_TRUTH_GUARD_README.md b/RETRIEX_PATCH_61_GENRE_SOURCE_OF_TRUTH_GUARD_README.md new file mode 100644 index 0000000..0cd7fd8 --- /dev/null +++ b/RETRIEX_PATCH_61_GENRE_SOURCE_OF_TRUTH_GUARD_README.md @@ -0,0 +1,141 @@ +# RetrieX Patch p61 - Genre Source-of-Truth Guard + +## Ziel + +p61 schliesst die Genre-Source-of-Truth-Arbeit nach p59/p60/p60b ab. + +Der Patch verschiebt keine fachlichen Werte mehr. Stattdessen erzwingt er per Validate/Audit, dass `genre.yaml` die zentrale Pflegequelle fuer genreabhaengige Werte bleibt und Legacy-Pfade nur noch als leere oder explizit eingefrorene Fallbacks dienen. + +## Umfang + +Geaendert bzw. ergaenzt: + +- `config/retriex/governance.yaml` +- `src/Config/GenreSourceOfTruthGuard.php` +- `src/Config/RetriexEffectiveConfigProvider.php` +- `src/Config/ConfigSourceAuditProvider.php` +- `src/Command/ConfigSourceAuditCommand.php` + +## Was der Guard prueft + +### 1. Vollstaendige Genre-Werte + +`genre.configuration_values` und `genre.adaptation_surface` muessen nicht-leere Maps sein. + +Fuer jede Gruppe in `genre.adaptation_surface` muss eine gleichnamige Gruppe in `genre.configuration_values` existieren. + +### 2. Source-Path-Abdeckung + +Genre-Wertknoten mit direkter Payload muessen selbst `source_paths` deklarieren oder von einem Parent-Knoten mit `source_paths` abgedeckt sein. + +`source_paths` muessen nicht-leere Strings sein und duerfen innerhalb eines Wertknotens nicht doppelt vorkommen. + +### 3. Gueltige Source-Pfade + +Alle `source_paths` werden gegen die roh aus `config/retriex/*.yaml` geladene RetrieX-Konfiguration aufgeloest; die effektive Konfiguration dient dabei nur als Fallback fuer die Guard-Einstellungen. + +Unbekannte Source-Pfade erzeugen jetzt einen Validate-Fehler. + +### 4. 
Legacy-Fallbacks sind leer oder eingefroren + +Fuer jeden deklarierten Source-Pfad gilt: + +- leerer Legacy-Wert: OK, Status `legacy_fallback_empty` +- nicht-leerer, in `runtime_resolved_source_paths` gelisteter (runtime-/env-aufgeloester) Pfad: OK, Status `legacy_runtime_resolved_allowed` +- nicht-leerer Legacy-Wert mit passendem Hash: OK, Status `legacy_frozen_non_empty` +- nicht-leerer Legacy-Wert ohne registrierten Hash: Fehler, Status `legacy_non_empty_unregistered` +- nicht-leerer Legacy-Wert mit abweichendem Hash: Fehler, Status `legacy_frozen_hash_mismatch` + +Damit brechen Validate/Audit, wenn neue fachliche Listen wieder ausserhalb von `genre.yaml` landen oder eingefrorene Legacy-Werte veraendert werden. + +## Warum eingefrorene Fallback-Hashes? + +Einige Legacy-Pfade enthalten weiterhin technische Fallback-/Wiring-Werte oder noch nicht entfernte Legacy-Defaults, die im aktuellen gruenen Stand nicht geloescht werden sollten. p61 friert diese Werte per SHA-256-Hash ein (SHA-256 ueber die JSON-Kodierung des Werts mit sortierten Map-Keys, Slashes/Unicode nicht escaped; registriert unter `governance.genre_source_of_truth.frozen_non_empty_legacy_source_hashes`). + +Dadurch bleibt der aktuelle Stand kompatibel, aber kuenftige Aenderungen an diesen Legacy-Pfaden muessen bewusst ueber `genre.yaml` erfolgen. + +## Audit-Ausgabe + +`mto:agent:config:audit-source --details` zeigt zusaetzlich: + +- `genre_source_of_truth_violations` +- `genre_source_of_truth_fallback_empty` +- `genre_source_of_truth_frozen_non_empty` +- Detailtabelle `Genre source-of-truth guard` + +Die Detailtabelle zeigt pro Genre-Wert: + +- Genre value path +- Legacy/effective source path +- State +- Hash + +## Validate-/Audit-Verhalten + +`mto:agent:config:validate` fuehrt den Guard aus und gibt Fehler aus, wenn die Source-of-Truth-Regeln verletzt sind. + +`mto:agent:config:audit-source` gibt bei Source-of-Truth-Fehlern jetzt `Command::FAILURE` zurueck. + +## Lokale Checks + +Ausgefuehrt: + +```bash +php -l src/Config/GenreSourceOfTruthGuard.php +php -l src/Config/RetriexEffectiveConfigProvider.php +php -l src/Config/ConfigSourceAuditProvider.php +php -l src/Command/ConfigSourceAuditCommand.php +``` + +Alle PHP-Lints: OK. + +```bash +python3 - <<'PY' +# YAML parse config/retriex/*.yaml +PY +``` + +YAML parse: OK. 
+ +Zusatzcheck: + +- Source-of-Truth-Guard-Simulation gegen den aktuellen Stand: 0 Fehler +- Ergebnisstatus der Simulation: leer/frozen/runtime erlaubt + +Nicht lokal ausfuehrbar: + +```bash +php bin/console mto:agent:config:validate +php bin/console mto:agent:regression:test +php bin/console mto:agent:config:audit-source --details +php bin/console mto:agent:config:audit-patterns --details +``` + +Grund: + +Das ZIP enthaelt kein `vendor/`. + +Fehler: + +```text +Dependencies are missing. Try running "composer install". +``` + +## Empfohlene Checks nach Einspielen + +```bash +bin/console cache:clear +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +bin/console mto:agent:config:audit-patterns --details +``` + +## Abschlussbild + +Nach p61 gilt: + +- `genre.yaml` ist die zentrale fachliche Source of Truth. +- Legacy-YAMLs sind technische Verarbeitungsschichten bzw. leere/eingefrorene Fallbacks. +- Neue fachliche Listen ausserhalb von `genre.yaml` werden durch Validate/Audit verhindert. +- Technische Runtime-/Model-/Vector-/Index-Konfiguration bleibt weiterhin ausserhalb von `genre.yaml`. 
diff --git a/config/retriex/governance.yaml b/config/retriex/governance.yaml index 1d16bb8..5838812 100644 --- a/config/retriex/governance.yaml +++ b/config/retriex/governance.yaml @@ -18,6 +18,51 @@ parameters: shop_query_context_fallback_filter_terms: [] shop_query_current_input_preservation_terms: [] vocabulary: {} + genre_source_of_truth: + enabled: true + source: genre.yaml + legacy_mode: frozen_or_empty_fallback + runtime_resolved_source_paths: + - commerce.max_shop_results + - commerce.store_api_base_url + frozen_non_empty_legacy_source_hashes: + agent.follow_up_context.commercial_table_follow_up.history_anchor_patterns: '705be92e5783ff3184b767c3d65723df169326b0bcae0d9f502c2d182bf7dcdc' + agent.follow_up_context.commercial_table_follow_up.indicator_marker_patterns: '653ef14fdbe2cfcf60ae164e4eb8d8b03b742801766a1fc2b8e7d860a59e082a' + agent.follow_up_context.commercial_table_follow_up.query_template_with_model: '5e4c77d6b9d13a753d14ca8eea3c942df3d9e8276bc2c208b4002e9464bee4e8' + agent.follow_up_context.commercial_table_follow_up.query_template_without_model: '2b5a25507a162c3b2181001ec832022e3461da3c89229fd9dd9b06ce9d686eab' + agent.input_normalization.fuzzy_routing.vocabulary_views.terms: '430f9da9d2d8d20b1367c875000f941e0b7b56f74a23fbfc8cd29aa11cd59716' + agent.no_llm_fallback.product_roles.vocabulary_views.accessory_product_keywords: '13623043e91858bf30831a28e0df23556bebe036d09eedc8fdca316d585a471f' + agent.no_llm_fallback.product_roles.vocabulary_views.main_device_request_keywords: 'ab49470c75a011355dd827381e881ed8553273cd87090fea162035eaae5f1c6e' + agent.shop_runtime.answer_constraints.length_filter: '66363cc7bf0dfe75c3991cefc70f1dfaf2f3150ab2f4f59f30f54b52d1c4d7af' + agent.shop_runtime.answer_constraints.length_sort: '1207b32e691996ca643f5bfd9b31467b70a11289ce56af356bd7f4fe78826c75' + agent.shop_runtime.attribute_cleanup.vocabulary_views.product_type_terms: '76060cf458b95b104bf3755b9c20997b7ec9b356c30bae093cb504e07434c152' + 
agent.shop_runtime.attribute_cleanup.vocabulary_views.stop_terms: '0006d5dae5955abfdca612237457f67a24ae7758d23f8e642555ba0a810d1d96' + agent.shop_runtime.context_resolution.history_anchor_enrichment.vocabulary_views.trigger_terms: '4f96642290520ec4b9fe4a42728f5d4c22641c5769bd2d7a5c8e2a4f7ba23c80' + agent.shop_runtime.query_cleanup.current_input_preservation.vocabulary_views.terms: '80c99b61717d630ef2f9a1c9971035d00cc94b863f3269ed1555d2c79a3a487a' + governance.core_pattern_audit: 'f5b3e421481dda023d53d1b1269039391ae9130e68ed5ed4fb149fc215c9b1d6' + intent.commerce.patterns.color_value_template: '271adda666a9c30ab9cc2182ce9c982b20eec89511f4292763347263f4911347' + intent.commerce.patterns.model_like_product: '2d906ee5301bcc71b3fbc0fde07e4cc176940d47a52862f01bf33e7c6ba09d55' + intent.commerce.patterns.size_extraction_template: 'fd4815472f08ffb12ac02fb69f6854299657b68384343974d9b846bc93d6e84e' + intent.commerce.patterns.size_token_value_template: '272487774285539297c1161830200db02e2933837cc044778eac354ef5217e30' + intent.commerce.patterns.size_value_template: 'f1146460915a55bd19cec354b96c77d5040419d3101d71ba7c7c1d9a4893dd3d' + intent.sales.comparison_signals: '2cfc1e320430eb86d22d2e9a6f2caca13eac701da64c694e9ed36d0e015104cc' + intent.sales.implementation_signals: 'a58c9c772f77eca186659ac3b90f6e2517e22b0d32228402902ca9c9d4a244d2' + intent.sales.objection_signals: 'dea7269eaa22d3e3a5ef2cc8d2d012b3c089e0c69d141df2c1e1e118fec6a491' + intent.sales.roi_signals: '88101a34b5e63a938055ab89bde1f73ce3bf8698f6d2793018145cdd7ae814da' + intent.sales.sales_signals: '7269ee14955e4a7c1f0360f3e0c71eaf346a3b12d10c52f5ee78314d0636de69' + language.cleanup_profiles.commerce_query: 'e45193fa1eb51b444fb1e95fcc9814c57f77f954057f7452a467e6e335b13752' + language.cleanup_profiles.rag_evidence: 'f5917e594cec7923029354157ccdc926a09637efff0041ea6df1d8002c2bf838' + language.cleanup_profiles.shop_context_fallback: 'fec1fbd755fd88fe685ea1ef88ba4a18c1290ccbfd1347d2ebf059e830175e6c' + 
search_repair.patterns.accessory_candidate_template: '7f5e3429d0bdca47515c107dbce0da1d5b50720e8a678aedd3e40876444e4403' + search_repair.patterns.accessory_or_bundle_template: '7f5e3429d0bdca47515c107dbce0da1d5b50720e8a678aedd3e40876444e4403' + search_repair.patterns.model_candidate: '7f5e3429d0bdca47515c107dbce0da1d5b50720e8a678aedd3e40876444e4403' + search_repair.patterns.model_like: '7f5e3429d0bdca47515c107dbce0da1d5b50720e8a678aedd3e40876444e4403' + search_repair.patterns.requested_accessory_code: '7f5e3429d0bdca47515c107dbce0da1d5b50720e8a678aedd3e40876444e4403' + search_repair.patterns.specificity_boost_template: '7f5e3429d0bdca47515c107dbce0da1d5b50720e8a678aedd3e40876444e4403' + shop_matching.custom_fields: '420d3a2a22034b1c76afca609f39ef204d5ccc24f7cfe45dd8ec4dc39da14a51' + shop_matching.role_guard: 'd301f39e06f588293adf2b18d6e28c9236bd0c6756fea3bf3331c31bad52b51e' + shop_matching.text.custom_field_join_separator: '658f073ee2deca9bbd5be611b83f2d8e5e1ccad5ac206e93cbbbd6dd90f238f6' + shop_matching.text.primary_secondary_separator: '6a86154696ca4475af1b75d81fca690bde4366042f9d5cb0e3b21ec09674b5e6' language: protected_stopword_terms: [] required_cleanup_profiles: diff --git a/src/Command/ConfigSourceAuditCommand.php b/src/Command/ConfigSourceAuditCommand.php index ad25e67..8bdac3f 100644 --- a/src/Command/ConfigSourceAuditCommand.php +++ b/src/Command/ConfigSourceAuditCommand.php @@ -43,7 +43,7 @@ final class ConfigSourceAuditCommand extends Command $this->renderSummary(new SymfonyStyle($input, $output), $result, (bool) $input->getOption('details')); - return Command::SUCCESS; + return ($result['status'] ?? 'UNKNOWN') === 'ERROR' ? Command::FAILURE : Command::SUCCESS; } /** @@ -65,9 +65,20 @@ final class ConfigSourceAuditCommand extends Command ['constructor_defaults' => (string) ($summary['constructor_defaults'] ?? 0)], ['constructor_defaults_without_yaml_mapping' => (string) ($summary['constructor_defaults_without_yaml_mapping'] ?? 
0)], ['genre_value_paths_with_source_paths' => (string) ($summary['genre_value_paths_with_source_paths'] ?? 0)], - ['genre_declared_source_paths' => (string) ($summary['genre_declared_source_paths'] ?? 0)] + ['genre_declared_source_paths' => (string) ($summary['genre_declared_source_paths'] ?? 0)], + ['genre_source_of_truth_violations' => (string) ($summary['genre_source_of_truth_violations'] ?? 0)], + ['genre_source_of_truth_fallback_empty' => (string) ($summary['genre_source_of_truth_fallback_empty'] ?? 0)], + ['genre_source_of_truth_frozen_non_empty' => (string) ($summary['genre_source_of_truth_frozen_non_empty'] ?? 0)] ); + $errors = is_array($result['errors'] ?? null) ? $result['errors'] : []; + if ($errors !== []) { + $io->section('Errors'); + foreach ($errors as $error) { + $io->writeln('- ' . (string) $error); + } + } + $warnings = is_array($result['warnings'] ?? null) ? $result['warnings'] : []; if ($warnings !== []) { $io->section('Warnings'); @@ -120,5 +131,26 @@ final class ConfigSourceAuditCommand extends Command $io->section('Single-genre configuration source paths'); $io->table(['Genre value path', 'Legacy/effective source path'], $genreSourceRows); } + + + $sourceOfTruthRows = []; + $sourceOfTruth = is_array($result['genre_source_of_truth'] ?? null) ? $result['genre_source_of_truth'] : []; + foreach (($sourceOfTruth['source_path_rows'] ?? []) as $item) { + if (!is_array($item)) { + continue; + } + + $sourceOfTruthRows[] = [ + (string) ($item['genre_value_path'] ?? ''), + (string) ($item['source_path'] ?? ''), + (string) ($item['state'] ?? ''), + (string) ($item['hash'] ?? 
''), + ]; + } + + if ($sourceOfTruthRows !== []) { + $io->section('Genre source-of-truth guard'); + $io->table(['Genre value path', 'Legacy/effective source path', 'State', 'Hash'], $sourceOfTruthRows); + } } } diff --git a/src/Config/ConfigSourceAuditProvider.php b/src/Config/ConfigSourceAuditProvider.php index 7434c20..a22d137 100644 --- a/src/Config/ConfigSourceAuditProvider.php +++ b/src/Config/ConfigSourceAuditProvider.php @@ -47,8 +47,10 @@ final readonly class ConfigSourceAuditProvider ], ]; - public function __construct(private string $projectDir) - { + public function __construct( + private string $projectDir, + private GenreSourceOfTruthGuard $genreSourceOfTruthGuard, + ) { } /** @@ -58,6 +60,7 @@ final readonly class ConfigSourceAuditProvider { $yamlPaths = $this->collectYamlParameterPaths(); $genreSourcePaths = $this->collectGenreConfigurationSourcePaths(); + $genreSourceOfTruth = $this->genreSourceOfTruthGuard->auditFromFiles(); $fallbackAccessors = []; $constructorDefaults = []; $phpConstants = []; @@ -124,6 +127,9 @@ final readonly class ConfigSourceAuditProvider )); $status = ($missingYamlFallbacks === [] && $phpOnlyConstants === [] && $constructorPhpDefaults === []) ? 'OK' : 'WARN'; + if (($genreSourceOfTruth['status'] ?? 'OK') === 'ERROR') { + $status = 'ERROR'; + } return [ 'status' => $status, @@ -138,12 +144,20 @@ final readonly class ConfigSourceAuditProvider 'constructor_defaults_without_yaml_mapping' => count($constructorPhpDefaults), 'genre_value_paths_with_source_paths' => count($genreSourcePaths), 'genre_declared_source_paths' => $this->countGenreDeclaredSourcePaths($genreSourcePaths), + 'genre_source_of_truth_violations' => (int) (($genreSourceOfTruth['summary']['violations'] ?? 0)), + 'genre_source_of_truth_fallback_empty' => (int) (($genreSourceOfTruth['summary']['legacy_fallback_empty'] ?? 0)), + 'genre_source_of_truth_frozen_non_empty' => (int) (($genreSourceOfTruth['summary']['legacy_frozen_non_empty'] ?? 
0)), ], - 'warnings' => $this->buildWarnings($missingYamlFallbacks, $phpOnlyConstants, $constructorPhpDefaults), + 'errors' => $genreSourceOfTruth['errors'] ?? [], + 'warnings' => array_merge( + $this->buildWarnings($missingYamlFallbacks, $phpOnlyConstants, $constructorPhpDefaults), + $genreSourceOfTruth['warnings'] ?? [] + ), 'fallback_accessors' => $fallbackAccessors, 'constructor_defaults' => $constructorDefaults, 'php_constants' => $phpConstants, 'genre_configuration_source_paths' => $genreSourcePaths, + 'genre_source_of_truth' => $genreSourceOfTruth, ]; } diff --git a/src/Config/GenreSourceOfTruthGuard.php b/src/Config/GenreSourceOfTruthGuard.php new file mode 100644 index 0000000..5029fb2 --- /dev/null +++ b/src/Config/GenreSourceOfTruthGuard.php @@ -0,0 +1,525 @@ + $genre + * @param array $effectiveConfig + * @return array{status:string, errors:list, warnings:list, summary:array, source_path_rows:list>} + */ + public function validate(array $genre, array $effectiveConfig): array + { + $rawConfig = $this->loadRawConfig(); + $rawGenre = is_array($rawConfig['genre'] ?? null) ? $rawConfig['genre'] : $genre; + $rawGovernance = is_array($rawConfig['governance'] ?? null) ? $rawConfig['governance'] : []; + + $guardConfig = $rawGovernance['genre_source_of_truth'] ?? null; + if (!is_array($guardConfig) && isset($effectiveConfig['governance']) && is_array($effectiveConfig['governance'])) { + $guardConfig = $effectiveConfig['governance']['genre_source_of_truth'] ?? []; + } + + return $this->auditConfig($rawGenre, $rawConfig, is_array($guardConfig) ? $guardConfig : []); + } + + /** + * @return array{status:string, errors:list, warnings:list, summary:array, source_path_rows:list>} + */ + public function auditFromFiles(): array + { + $config = $this->loadRawConfig(); + $genre = $config['genre'] ?? []; + $governance = $config['governance'] ?? []; + $guardConfig = is_array($governance) ? ($governance['genre_source_of_truth'] ?? 
[]) : []; + + return $this->auditConfig(is_array($genre) ? $genre : [], $config, is_array($guardConfig) ? $guardConfig : []); + } + + /** + * @param array $genre + * @param array $config + * @param array $guardConfig + * @return array{status:string, errors:list, warnings:list, summary:array, source_path_rows:list>} + */ + private function auditConfig(array $genre, array $config, array $guardConfig): array + { + $enabled = $guardConfig['enabled'] ?? true; + if ($enabled === false || $enabled === 'false' || $enabled === 0 || $enabled === '0') { + return [ + 'status' => 'DISABLED', + 'errors' => [], + 'warnings' => ['genre source-of-truth guard is disabled.'], + 'summary' => $this->emptySummary(), + 'source_path_rows' => [], + ]; + } + + $errors = []; + $warnings = []; + $rows = []; + + $configurationValues = $genre['configuration_values'] ?? null; + if (!is_array($configurationValues) || $configurationValues === []) { + $errors[] = 'genre.configuration_values must be a non-empty map for source-of-truth enforcement.'; + return [ + 'status' => 'ERROR', + 'errors' => $errors, + 'warnings' => $warnings, + 'summary' => $this->emptySummary(), + 'source_path_rows' => $rows, + ]; + } + + $adaptationSurface = $genre['adaptation_surface'] ?? 
[]; + if (!is_array($adaptationSurface) || $adaptationSurface === []) { + $errors[] = 'genre.adaptation_surface must be a non-empty map for source-of-truth enforcement.'; + } else { + foreach ($adaptationSurface as $group => $definition) { + if (!is_string($group) || trim($group) === '') { + continue; + } + if (!array_key_exists($group, $configurationValues)) { + $errors[] = sprintf('genre.configuration_values is missing required source-of-truth group for adaptation_surface.%s.', $group); + } + } + } + + $coverageErrors = $this->validateConfigurationValueCoverage($configurationValues); + array_push($errors, ...$coverageErrors); + + $declaredSourcePaths = $this->collectSourcePaths($configurationValues); + $uniqueSourcePaths = []; + foreach ($declaredSourcePaths as $valuePath => $sourcePaths) { + foreach ($sourcePaths as $sourcePath) { + $uniqueSourcePaths[$sourcePath] = true; + + $resolved = $this->valueAtPath($config, $sourcePath); + if (!$resolved['found']) { + $errors[] = sprintf('genre.configuration_values.%s references unknown source path: %s.', $valuePath, $sourcePath); + $rows[] = $this->row($valuePath, $sourcePath, 'missing', ''); + continue; + } + + $value = $resolved['value']; + if (!$this->hasNonEmptyValue($value)) { + $rows[] = $this->row($valuePath, $sourcePath, 'legacy_fallback_empty', ''); + continue; + } + + if ($this->isRuntimeResolvedSourcePath($guardConfig, $sourcePath)) { + $rows[] = $this->row($valuePath, $sourcePath, 'legacy_runtime_resolved_allowed', ''); + continue; + } + + $hash = $this->hashValue($value); + $expectedHash = $this->expectedFrozenHash($guardConfig, $sourcePath); + if ($expectedHash === null) { + $errors[] = sprintf( + 'Legacy source path %s is non-empty but is not declared as a frozen fallback. 
Move the value to genre.yaml or add an explicit frozen fallback hash.', + $sourcePath + ); + $rows[] = $this->row($valuePath, $sourcePath, 'legacy_non_empty_unregistered', $hash); + continue; + } + + if (!hash_equals($expectedHash, $hash)) { + $errors[] = sprintf( + 'Legacy source path %s changed outside genre.yaml. Expected frozen hash %s, got %s.', + $sourcePath, + $expectedHash, + $hash + ); + $rows[] = $this->row($valuePath, $sourcePath, 'legacy_frozen_hash_mismatch', $hash); + continue; + } + + $rows[] = $this->row($valuePath, $sourcePath, 'legacy_frozen_non_empty', $hash); + } + } + + foreach ($this->frozenHashes($guardConfig) as $sourcePath => $hash) { + if (!isset($uniqueSourcePaths[$sourcePath])) { + $errors[] = sprintf('governance.genre_source_of_truth.frozen_non_empty_legacy_source_hashes contains undeclared source path: %s.', $sourcePath); + } + if (!is_string($hash) || preg_match('/^[a-f0-9]{64}$/', $hash) !== 1) { + $errors[] = sprintf('governance.genre_source_of_truth.frozen_non_empty_legacy_source_hashes.%s must be a SHA-256 hex hash.', $sourcePath); + } + } + + $summary = $this->summarizeRows($rows); + $summary['configuration_value_groups'] = count($configurationValues); + $summary['source_path_value_nodes'] = count($declaredSourcePaths); + $summary['declared_source_paths'] = count($uniqueSourcePaths); + $summary['violations'] = count($errors); + + return [ + 'status' => $errors === [] ? 
'OK' : 'ERROR', + 'errors' => $errors, + 'warnings' => $warnings, + 'summary' => $summary, + 'source_path_rows' => $rows, + ]; + } + + /** + * @param array $configurationValues + * @return list + */ + private function validateConfigurationValueCoverage(array $configurationValues): array + { + $errors = []; + $this->validateCoverageRecursive($configurationValues, '', false, $errors); + + return $errors; + } + + /** + * @param array $value + * @param list $errors + */ + private function validateCoverageRecursive(array $value, string $path, bool $coveredBySourcePath, array &$errors): void + { + $sourcePaths = $value['source_paths'] ?? null; + $hasSourcePaths = is_array($sourcePaths) && $sourcePaths !== []; + if (array_key_exists('source_paths', $value) && !$hasSourcePaths && $path !== '') { + $errors[] = sprintf('genre.configuration_values.%s.source_paths must be a non-empty list when declared.', $path); + } + + if ($hasSourcePaths) { + $seen = []; + foreach ($sourcePaths as $sourcePath) { + if (!is_string($sourcePath) || trim($sourcePath) === '') { + $errors[] = sprintf('genre.configuration_values.%s.source_paths must contain only non-empty strings.', $path); + continue; + } + $sourcePath = trim($sourcePath); + if (isset($seen[$sourcePath])) { + $errors[] = sprintf('genre.configuration_values.%s.source_paths contains duplicate source path: %s.', $path, $sourcePath); + } + $seen[$sourcePath] = true; + } + } + + $covered = $coveredBySourcePath || $hasSourcePaths; + if ($path !== '' && !$covered && $this->hasDirectPayload($value)) { + $errors[] = sprintf('genre.configuration_values.%s must declare source_paths or inherit them from a parent value node.', $path); + } + + foreach ($value as $key => $child) { + if ($key === 'source_paths' || $key === 'description' || !is_string($key) || !is_array($child)) { + continue; + } + + $childPath = $path === '' ? $key : $path . '.' . 
$key; + $this->validateCoverageRecursive($child, $childPath, $covered, $errors); + } + } + + /** + * @param array $value + */ + private function hasDirectPayload(array $value): bool + { + foreach ($value as $key => $child) { + if ($key === 'source_paths' || $key === 'description') { + continue; + } + if (!is_array($child)) { + return true; + } + } + + return false; + } + + /** + * @param array $configurationValues + * @return array + */ + private function collectSourcePaths(array $configurationValues): array + { + $out = []; + $this->collectSourcePathsRecursive($configurationValues, '', $out); + + return $out; + } + + /** + * @param array $value + * @param array $out + */ + private function collectSourcePathsRecursive(array $value, string $path, array &$out): void + { + $sourcePaths = $value['source_paths'] ?? null; + if (is_array($sourcePaths) && $path !== '') { + $clean = []; + foreach ($sourcePaths as $sourcePath) { + if (!is_string($sourcePath) || trim($sourcePath) === '') { + continue; + } + $sourcePath = trim($sourcePath); + if (!in_array($sourcePath, $clean, true)) { + $clean[] = $sourcePath; + } + } + if ($clean !== []) { + $out[$path] = $clean; + } + } + + foreach ($value as $key => $child) { + if ($key === 'source_paths' || $key === 'description' || !is_string($key) || !is_array($child)) { + continue; + } + $childPath = $path === '' ? $key : $path . '.' . $key; + $this->collectSourcePathsRecursive($child, $childPath, $out); + } + } + + /** + * @param array $guardConfig + * @return array + */ + private function frozenHashes(array $guardConfig): array + { + $value = $guardConfig['frozen_non_empty_legacy_source_hashes'] ?? 
[]; + if (!is_array($value)) { + return []; + } + + $out = []; + foreach ($value as $path => $hash) { + if (!is_string($path) || !is_string($hash)) { + continue; + } + $path = trim($path); + $hash = strtolower(trim($hash)); + if ($path !== '') { + $out[$path] = $hash; + } + } + + return $out; + } + + /** @param array $guardConfig */ + private function expectedFrozenHash(array $guardConfig, string $sourcePath): ?string + { + $hashes = $this->frozenHashes($guardConfig); + + return $hashes[$sourcePath] ?? null; + } + + /** @param array $guardConfig */ + private function isRuntimeResolvedSourcePath(array $guardConfig, string $sourcePath): bool + { + $paths = $guardConfig['runtime_resolved_source_paths'] ?? []; + if (!is_array($paths)) { + return false; + } + + foreach ($paths as $path) { + if (is_string($path) && trim($path) === $sourcePath) { + return true; + } + } + + return false; + } + + /** + * @return array{found:bool, value:mixed} + */ + private function valueAtPath(array $config, string $path): array + { + $current = $config; + foreach (explode('.', $path) as $segment) { + if (!is_array($current) || !array_key_exists($segment, $current)) { + return ['found' => false, 'value' => null]; + } + $current = $current[$segment]; + } + + return ['found' => true, 'value' => $current]; + } + + private function hasNonEmptyValue(mixed $value): bool + { + if ($value === null) { + return false; + } + if (is_string($value)) { + return trim($value) !== ''; + } + if (is_scalar($value)) { + return true; + } + if (is_array($value)) { + foreach ($value as $child) { + if ($this->hasNonEmptyValue($child)) { + return true; + } + } + } + + return false; + } + + private function hashValue(mixed $value): string + { + $normalized = $this->normalizeForHash($value); + $json = json_encode($normalized, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE); + if (!is_string($json)) { + $json = 'null'; + } + + return hash('sha256', $json); + } + + private function normalizeForHash(mixed $value): 
mixed + { + if (!is_array($value)) { + return $value; + } + + if (array_is_list($value)) { + return array_map(fn (mixed $item): mixed => $this->normalizeForHash($item), $value); + } + + $normalized = []; + $keys = array_keys($value); + sort($keys, SORT_STRING); + foreach ($keys as $key) { + $normalized[(string) $key] = $this->normalizeForHash($value[$key]); + } + + return $normalized; + } + + /** + * @return array + */ + private function loadRawConfig(): array + { + $parameters = []; + $files = glob($this->projectDir . '/config/retriex/*.yaml'); + if (!is_array($files)) { + return []; + } + + sort($files); + foreach ($files as $file) { + $parsed = Yaml::parseFile($file); + if (!is_array($parsed)) { + continue; + } + $fileParameters = $parsed['parameters'] ?? []; + if (is_array($fileParameters)) { + $parameters = array_replace_recursive($parameters, $fileParameters); + } + } + + $config = []; + $parameterRoots = [ + 'retriex.agent.config' => 'agent', + 'retriex.commerce_query.config' => 'commerce_query', + 'retriex.governance.config' => 'governance', + 'retriex.intent.commerce.config' => 'intent.commerce', + 'retriex.intent.light.config' => 'intent.light', + 'retriex.intent.sales.config' => 'intent.sales', + 'retriex.intent.catalog.config' => 'intent.catalog', + 'retriex.prompt.config' => 'prompt', + 'retriex.query_enrichment.config' => 'query_enrichment', + 'retriex.retrieval.config' => 'retrieval', + 'retriex.search_repair.config' => 'search_repair', + 'retriex.shop_matching.config' => 'shop_matching', + 'retriex.stopwords.config' => 'language', + 'retriex.vocabulary.config' => 'vocabulary', + 'retriex.context.config' => 'context', + 'retriex.genre.config' => 'genre', + ]; + + foreach ($parameterRoots as $parameterName => $targetPath) { + if (!array_key_exists($parameterName, $parameters)) { + continue; + } + $this->setPath($config, $targetPath, $parameters[$parameterName]); + } + + foreach ($parameters as $parameterName => $value) { + if 
(!is_string($parameterName) || !str_starts_with($parameterName, 'retriex.') || isset($parameterRoots[$parameterName])) { + continue; + } + $this->setPath($config, substr($parameterName, strlen('retriex.')), $value); + } + + return $config; + } + + /** @param array $config */ + private function setPath(array &$config, string $path, mixed $value): void + { + $current = &$config; + foreach (explode('.', $path) as $segment) { + if (!isset($current[$segment]) || !is_array($current[$segment])) { + $current[$segment] = []; + } + $current = &$current[$segment]; + } + $current = $value; + } + + /** @return array */ + private function emptySummary(): array + { + return [ + 'configuration_value_groups' => 0, + 'source_path_value_nodes' => 0, + 'declared_source_paths' => 0, + 'legacy_fallback_empty' => 0, + 'legacy_frozen_non_empty' => 0, + 'legacy_non_empty_unregistered' => 0, + 'legacy_frozen_hash_mismatch' => 0, + 'legacy_runtime_resolved_allowed' => 0, + 'missing' => 0, + 'violations' => 0, + ]; + } + + /** + * @param list> $rows + * @return array + */ + private function summarizeRows(array $rows): array + { + $summary = $this->emptySummary(); + foreach ($rows as $row) { + $state = $row['state'] ?? ''; + if ($state !== '') { + $summary[$state] = ($summary[$state] ?? 
0) + 1; + } + } + + return $summary; + } + + /** @return array */ + private function row(string $valuePath, string $sourcePath, string $state, string $hash): array + { + return [ + 'genre_value_path' => $valuePath, + 'source_path' => $sourcePath, + 'state' => $state, + 'hash' => $hash, + ]; + } +} diff --git a/src/Config/RetriexEffectiveConfigProvider.php b/src/Config/RetriexEffectiveConfigProvider.php index 317390c..150344c 100644 --- a/src/Config/RetriexEffectiveConfigProvider.php +++ b/src/Config/RetriexEffectiveConfigProvider.php @@ -29,6 +29,7 @@ final readonly class RetriexEffectiveConfigProvider private QueryEnricherConfig $queryEnricherConfig, private GovernanceConfig $governanceConfig, private GenreConfig $genreConfig, + private GenreSourceOfTruthGuard $genreSourceOfTruthGuard, private CatalogIntentConfig $catalogIntentConfig, private ContextServiceConfig $contextServiceConfig, ) { @@ -76,6 +77,9 @@ final readonly class RetriexEffectiveConfigProvider $config = $this->dump(); $this->validateGenre($config['genre'], $config, $errors, $warnings); + $sourceOfTruth = $this->genreSourceOfTruthGuard->validate($config['genre'], $config); + array_push($errors, ...$sourceOfTruth['errors']); + array_push($warnings, ...$sourceOfTruth['warnings']); $this->validateRuntime($config['runtime'], $errors, $warnings); $this->validateIndex($config['index'], $errors, $warnings); $this->validateModel($config['model_generation'], $errors, $warnings);