From 10a3a09a63147a8e7a52592a789bb54d4e122f23 Mon Sep 17 00:00:00 2001 From: team 1 Date: Tue, 5 May 2026 14:17:54 +0200 Subject: [PATCH] p43C --- ...H_43C_REFERENCE_ANCHOR_EXTRACTOR_README.md | 53 ++++ src/Agent/AgentRunner.php | 104 +------ src/Agent/ReferenceAnchorExtractor.php | 129 ++++++++ src/Commerce/ProductRoleResolver.php | 278 ++++++++++++++++++ src/Config/AgentRunnerConfig.php | 25 +- 5 files changed, 469 insertions(+), 120 deletions(-) create mode 100644 patch_history/RETRIEX_PATCH_43C_REFERENCE_ANCHOR_EXTRACTOR_README.md create mode 100644 src/Agent/ReferenceAnchorExtractor.php create mode 100644 src/Commerce/ProductRoleResolver.php diff --git a/patch_history/RETRIEX_PATCH_43C_REFERENCE_ANCHOR_EXTRACTOR_README.md b/patch_history/RETRIEX_PATCH_43C_REFERENCE_ANCHOR_EXTRACTOR_README.md new file mode 100644 index 0000000..cc38bad --- /dev/null +++ b/patch_history/RETRIEX_PATCH_43C_REFERENCE_ANCHOR_EXTRACTOR_README.md @@ -0,0 +1,53 @@ +# RetrieX Patch 43C - Generic Reference Anchor Extractor + +## Goal + +Continue the p43 reduction work with a small, low-risk PHP cleanup: + +- move follow-up reference anchor extraction out of `AgentRunner` +- keep the behavior generic (`product_model` + `measurement_value`) instead of Testomat/hardness-specific naming +- remove legacy fallback accessors for `testomat_model_pattern` and `hardness_value_pattern` +- keep the current configured regex values unchanged + +## Changed files + +- `src/Agent/ReferenceAnchorExtractor.php` added +- `src/Agent/AgentRunner.php` delegates reference anchor extraction to the new extractor +- `src/Config/AgentRunnerConfig.php` now requires the generic keys directly: + - `follow_up_context.reference_anchor.product_model_pattern` + - `follow_up_context.reference_anchor.measurement_value_pattern` + +## Not changed + +- no YAML values changed +- no ranking logic changed +- no prompt rules changed +- no shop search scoring changed +- no new hardcoded product or phrase lists in PHP +- no admin UI + +## Expected effect + +Runtime behavior should remain the same. The patch only reduces responsibility inside `AgentRunner` and removes old Testomat/hardness-specific compatibility naming now that the generic YAML keys are already present and p43A2 is green. + +## Required checks after applying + +```bash +bin/console mto:agent:config:validate +bin/console mto:agent:regression:test +bin/console mto:agent:config:audit-source --details +bin/console mto:agent:config:audit-patterns --details +``` + +## Local checks performed while building this patch + +```bash +php -l src/Agent/ReferenceAnchorExtractor.php +php -l src/Agent/AgentRunner.php +php -l src/Config/AgentRunnerConfig.php +php -l src/Commerce/ProductRoleResolver.php +php -l src/Commerce/ShopSearchService.php +php -l src/Agent/PromptBuilder.php +``` + +YAML parse check for all files in `config/retriex/*.yaml` also passed. diff --git a/src/Agent/AgentRunner.php b/src/Agent/AgentRunner.php index f9a1677..697e043 100644 --- a/src/Agent/AgentRunner.php +++ b/src/Agent/AgentRunner.php @@ -30,6 +30,7 @@ final readonly class AgentRunner private RetrieverInterface $retriever, private ShopSearchService $shopSearchService, private SearchRepairService $searchRepairService, + private ReferenceAnchorExtractor $referenceAnchorExtractor, private CommerceIntentLite $commerceIntentLite, private OllamaClient $ollamaClient, private LoggerInterface $agentLogger, @@ -1135,7 +1136,7 @@ final readonly class AgentRunner $history = $this->contextService->buildUserContextWithinBudget($userId, 3000); $previousQuestions = $this->extractRecentUserQuestions($history, 2); - $referenceAnchors = $this->extractLatestAssistantReferenceAnchors($history); + $referenceAnchors = $this->referenceAnchorExtractor->extractLatestAssistantReferenceAnchors($history); if ($previousQuestions === [] && $referenceAnchors === []) { return $prompt; @@ -1232,74 +1233,6 @@ final readonly class AgentRunner return array_slice($questions, -$limit); } - /** - * Extracts stable reference anchors from the latest assistant answer. - * - * These anchors are only used to resolve follow-up references such as - * "der Wert" or "welcher Indikator". They are not factual evidence for - * the final answer. To avoid propagating wrong earlier answers, only the - * first explicit product-model reference and the first explicit measurement value - * are kept. Indicator names, reagent codes, prices, URLs and product - * numbers are intentionally ignored here. - * - * @return string[] - */ - private function extractLatestAssistantReferenceAnchors(string $history): array - { - $turn = $this->extractLatestHistoryTurn($history); - - if ($turn === '') { - return []; - } - - $answer = preg_replace($this->agentRunnerConfig->getFollowUpHistoryQuestionStripPattern(), '', $turn, 1) ?? ''; - $answer = trim($answer); - - if ($answer === '') { - return []; - } - - $anchors = []; - - $model = $this->extractFirstProductModelAnchor($answer); - if ($model !== '') { - $anchors[] = $model; - } - - $hardnessValue = $this->extractFirstMeasurementValueAnchor($answer); - if ($hardnessValue !== '') { - $anchors[] = $hardnessValue; - } - - return array_values(array_unique($anchors)); - } - - private function extractLatestHistoryTurn(string $history): string - { - $history = trim($history); - - if ($history === '') { - return ''; - } - - $parts = preg_split($this->agentRunnerConfig->getFollowUpHistoryTurnSplitPattern(), $history); - - if ($parts === false || $parts === []) { - return ''; - } - - $turns = array_values(array_filter( - array_map(static fn(string $part): string => trim($part), $parts), - static fn(string $part): bool => $part !== '' - )); - - if ($turns === []) { - return ''; - } - - return (string) end($turns); - } - /** * @return string[] */ @@ -1325,29 +1258,6 @@ final readonly class AgentRunner return array_reverse($turns); } - private function extractFirstProductModelAnchor(string $text): string - { - if (preg_match($this->agentRunnerConfig->getFollowUpReferenceAnchorProductModelPattern(), $text, $matches) !== 1) { - return ''; - } - - $value = $this->sanitizeHistoryQuestion(($matches[0] ?? '')); - $value = preg_replace('/\s+/u', ' ', $value) ?? $value; - - return trim(str_replace('®', '', $value)); - } - - private function extractFirstMeasurementValueAnchor(string $text): string - { - if (preg_match($this->agentRunnerConfig->getFollowUpReferenceAnchorMeasurementValuePattern(), $text, $matches) !== 1) { - return ''; - } - - $value = preg_replace('/\s+/u', ' ', ($matches[0] ?? '')) ?? ''; - - return trim($value); - } - private function sanitizeHistoryQuestion(string $question): string { $question = trim((string) preg_replace('/\s+/u', ' ', $question)); @@ -1500,7 +1410,7 @@ final readonly class AgentRunner return true; } - if ($this->extractFirstProductModelAnchor($prompt) !== '') { + if ($this->referenceAnchorExtractor->extractFirstProductModelAnchor($prompt) !== '') { return false; } @@ -1564,7 +1474,7 @@ final readonly class AgentRunner private function hasStandaloneConcreteShopSubject(string $prompt): bool { - if ($this->extractFirstProductModelAnchor($prompt) !== '') { + if ($this->referenceAnchorExtractor->extractFirstProductModelAnchor($prompt) !== '') { return true; } @@ -1622,7 +1532,7 @@ final readonly class AgentRunner return $prompt; } - if ($this->extractFirstProductModelAnchor($prompt) === '') { + if ($this->referenceAnchorExtractor->extractFirstProductModelAnchor($prompt) === '') { return $optimizedShopQuery; } @@ -2249,7 +2159,7 @@ final readonly class AgentRunner continue; } - $model = $this->extractFirstProductModelAnchor($turn); + $model = $this->referenceAnchorExtractor->extractFirstProductModelAnchor($turn); if ($model !== '') { $query = str_replace( @@ -2334,7 +2244,7 @@ final readonly class AgentRunner } } - $modelAnchor = $this->extractFirstProductModelAnchor($turn); + $modelAnchor = $this->referenceAnchorExtractor->extractFirstProductModelAnchor($turn); if ($modelAnchor !== '' && !$this->isMetaOnlyShopQuery($modelAnchor)) { return mb_strtolower($modelAnchor, 'UTF-8'); diff --git a/src/Agent/ReferenceAnchorExtractor.php b/src/Agent/ReferenceAnchorExtractor.php new file mode 100644 index 0000000..d5fba3e --- /dev/null +++ b/src/Agent/ReferenceAnchorExtractor.php @@ -0,0 +1,129 @@ +extractLatestHistoryTurn($history); + + if ($turn === '') { + return []; + } + + $answer = preg_replace($this->config->getFollowUpHistoryQuestionStripPattern(), '', $turn, 1) ?? ''; + $answer = trim($answer); + + if ($answer === '') { + return []; + } + + $anchors = []; + + $model = $this->extractFirstProductModelAnchor($answer); + if ($model !== '') { + $anchors[] = $model; + } + + $measurementValue = $this->extractFirstMeasurementValueAnchor($answer); + if ($measurementValue !== '') { + $anchors[] = $measurementValue; + } + + return array_values(array_unique($anchors)); + } + + public function extractFirstProductModelAnchor(string $text): string + { + if (preg_match($this->config->getFollowUpReferenceAnchorProductModelPattern(), $text, $matches) !== 1) { + return ''; + } + + $value = $this->sanitizeAnchor((string) ($matches[0] ?? '')); + $value = preg_replace('/\s+/u', ' ', $value) ?? $value; + + return trim(str_replace('®', '', $value)); + } + + public function extractFirstMeasurementValueAnchor(string $text): string + { + if (preg_match($this->config->getFollowUpReferenceAnchorMeasurementValuePattern(), $text, $matches) !== 1) { + return ''; + } + + $value = preg_replace('/\s+/u', ' ', (string) ($matches[0] ?? '')) ?? ''; + + return trim($value); + } + + private function extractLatestHistoryTurn(string $history): string + { + $history = trim($history); + + if ($history === '') { + return ''; + } + + $parts = preg_split($this->config->getFollowUpHistoryTurnSplitPattern(), $history); + + if ($parts === false || $parts === []) { + return ''; + } + + $turns = array_values(array_filter( + array_map(static fn(string $part): string => trim($part), $parts), + static fn(string $part): bool => $part !== '' + )); + + if ($turns === []) { + return ''; + } + + return (string) end($turns); + } + + private function sanitizeAnchor(string $value): string + { + $value = trim((string) preg_replace('/\s+/u', ' ', $value)); + + if ($value === '') { + return ''; + } + + if (mb_strlen($value, 'UTF-8') <= 500) { + return $value; + } + + return rtrim(mb_substr($value, 0, 497, 'UTF-8')) . '...'; + } +} diff --git a/src/Commerce/ProductRoleResolver.php b/src/Commerce/ProductRoleResolver.php new file mode 100644 index 0000000..2c279b4 --- /dev/null +++ b/src/Commerce/ProductRoleResolver.php @@ -0,0 +1,278 @@ +containsAnyKeyword($normalized, $accessoryIntentKeywords, $normalize, true); + $hasMainDeviceIntent = $this->containsAnyKeyword($normalized, $mainDeviceIntentKeywords, $normalize, true); + + if ($hasAccessoryIntent && !$this->matchesAnyPattern($normalized, $directMainDeviceRequestPatterns)) { + return self::ROLE_ACCESSORY_OR_CONSUMABLE; + } + + if ($hasMainDeviceIntent) { + return self::ROLE_MAIN_DEVICE; + } + + if ($hasAccessoryIntent) { + return self::ROLE_ACCESSORY_OR_CONSUMABLE; + } + + return self::ROLE_UNKNOWN; + } + + /** + * @param string[] $accessoryKeywords + * @param string[] $deviceKeywords + */ + public function resolveProductRole( + ShopProductResult $product, + array $accessoryKeywords, + array $deviceKeywords, + callable $normalize, + bool $detectAmbiguousPrimaryRole + ): string { + $primaryRole = $this->resolvePrimaryProductRole( + product: $product, + accessoryKeywords: $accessoryKeywords, + deviceKeywords: $deviceKeywords, + normalize: $normalize, + detectAmbiguousPrimaryRole: $detectAmbiguousPrimaryRole + ); + + if ($primaryRole !== self::ROLE_UNKNOWN) { + return $primaryRole; + } + + $corpus = mb_strtolower($this->buildProductCorpus($product), 'UTF-8'); + $isAccessory = $this->containsAnyKeyword($corpus, $accessoryKeywords, $normalize, true); + $isMainDevice = $this->containsAnyKeyword($corpus, $deviceKeywords, $normalize, true); + + if ($isAccessory) { + return self::ROLE_ACCESSORY_OR_CONSUMABLE; + } + + if ($isMainDevice) { + return self::ROLE_MAIN_DEVICE; + } + + return self::ROLE_UNKNOWN; + } + + /** + * @param string[] $accessoryKeywords + * @param string[] $deviceKeywords + */ + public function resolvePrimaryProductRole( + ShopProductResult $product, + array $accessoryKeywords, + array $deviceKeywords, + callable $normalize, + bool $detectAmbiguousPrimaryRole + ): string { + $primaryText = mb_strtolower($normalize($this->buildPrimaryProductIdentity($product)), 'UTF-8'); + + if ($primaryText === '') { + return self::ROLE_UNKNOWN; + } + + $isAccessory = $this->containsAnyKeyword($primaryText, $accessoryKeywords, $normalize, true); + $isMainDevice = $this->containsAnyKeyword($primaryText, $deviceKeywords, $normalize, true); + + if ($detectAmbiguousPrimaryRole) { + if ($isAccessory && !$isMainDevice) { + return self::ROLE_ACCESSORY_OR_CONSUMABLE; + } + + if ($isMainDevice && !$isAccessory) { + return self::ROLE_MAIN_DEVICE; + } + + if ($isAccessory && $isMainDevice) { + return self::ROLE_AMBIGUOUS_MIXED; + } + + return self::ROLE_UNKNOWN; + } + + if ($isAccessory) { + return self::ROLE_ACCESSORY_OR_CONSUMABLE; + } + + if ($isMainDevice) { + return self::ROLE_MAIN_DEVICE; + } + + return self::ROLE_UNKNOWN; + } + + /** + * @param string[] $accessoryKeywords + * @param string[] $deviceKeywords + */ + public function isAccessoryLikeProduct( + ShopProductResult $product, + array $accessoryKeywords, + array $deviceKeywords, + callable $normalize + ): bool { + $primaryRole = $this->resolvePrimaryProductRole( + product: $product, + accessoryKeywords: $accessoryKeywords, + deviceKeywords: $deviceKeywords, + normalize: $normalize, + detectAmbiguousPrimaryRole: true + ); + + if ($primaryRole === self::ROLE_ACCESSORY_OR_CONSUMABLE) { + return true; + } + + if ($primaryRole === self::ROLE_MAIN_DEVICE) { + return false; + } + + return $this->containsAnyKeyword( + $normalize($this->buildProductCorpus($product)), + $accessoryKeywords, + $normalize, + false + ); + } + + /** + * @param string[] $accessoryKeywords + * @param string[] $deviceKeywords + */ + public function isDeviceLikeProduct( + ShopProductResult $product, + array $accessoryKeywords, + array $deviceKeywords, + callable $normalize + ): bool { + $primaryRole = $this->resolvePrimaryProductRole( + product: $product, + accessoryKeywords: $accessoryKeywords, + deviceKeywords: $deviceKeywords, + normalize: $normalize, + detectAmbiguousPrimaryRole: true + ); + + if ($primaryRole === self::ROLE_MAIN_DEVICE) { + return true; + } + + if ($primaryRole === self::ROLE_ACCESSORY_OR_CONSUMABLE) { + return false; + } + + return $this->containsAnyKeyword( + $normalize($this->buildProductCorpus($product)), + $deviceKeywords, + $normalize, + false + ); + } + + public function resolveCompatibility(string $requestedRole, string $inferredRole): string + { + if ($requestedRole === self::ROLE_UNKNOWN || $inferredRole === self::ROLE_UNKNOWN) { + return self::COMPATIBILITY_UNKNOWN; + } + + if ($requestedRole === self::ROLE_MAIN_DEVICE && $inferredRole === self::ROLE_ACCESSORY_OR_CONSUMABLE) { + return self::COMPATIBILITY_INCOMPATIBLE_ACCESSORY_FOR_MAIN_DEVICE_REQUEST; + } + + if ($requestedRole === self::ROLE_ACCESSORY_OR_CONSUMABLE && $inferredRole === self::ROLE_MAIN_DEVICE) { + return self::COMPATIBILITY_INCOMPATIBLE_MAIN_DEVICE_FOR_ACCESSORY_REQUEST; + } + + if ($inferredRole === self::ROLE_AMBIGUOUS_MIXED) { + return self::COMPATIBILITY_AMBIGUOUS_KEEP_SEPARATE; + } + + return self::COMPATIBILITY_COMPATIBLE; + } + + /** + * @param string[] $keywords + */ + private function containsAnyKeyword(string $text, array $keywords, callable $normalize, bool $normalizeKeyword): bool + { + foreach ($keywords as $keyword) { + $keyword = (string) $keyword; + $candidate = $normalizeKeyword ? mb_strtolower($normalize($keyword), 'UTF-8') : $normalize($keyword); + + if ($candidate !== '' && str_contains($text, $candidate)) { + return true; + } + } + + return false; + } + + /** + * @param string[] $patterns + */ + private function matchesAnyPattern(string $text, array $patterns): bool + { + foreach ($patterns as $pattern) { + if (preg_match((string) $pattern, $text) === 1) { + return true; + } + } + + return false; + } + + private function buildPrimaryProductIdentity(ShopProductResult $product): string + { + return implode(' ', array_filter([ + $product->name, + $product->url, + ])); + } + + private function buildProductCorpus(ShopProductResult $product): string + { + return implode(' ', array_filter([ + $product->name, + $product->productNumber, + $product->manufacturer, + implode(' ', $product->highlights), + $product->description, + $product->customFields, + $product->url, + ])); + } +} diff --git a/src/Config/AgentRunnerConfig.php b/src/Config/AgentRunnerConfig.php index 2f4d52c..9674576 100644 --- a/src/Config/AgentRunnerConfig.php +++ b/src/Config/AgentRunnerConfig.php @@ -127,35 +127,14 @@ final class AgentRunnerConfig public function getFollowUpReferenceAnchorProductModelPattern(): string { - $value = $this->optionalValue('follow_up_context.reference_anchor.product_model_pattern'); - if (is_string($value) && trim($value) !== '') { - return $value; - } - - return $this->getRequiredString('follow_up_context.reference_anchor.testomat_model_pattern'); + return $this->getRequiredString('follow_up_context.reference_anchor.product_model_pattern'); } public function getFollowUpReferenceAnchorMeasurementValuePattern(): string { - $value = $this->optionalValue('follow_up_context.reference_anchor.measurement_value_pattern'); - if (is_string($value) && trim($value) !== '') { - return $value; - } - - return $this->getRequiredString('follow_up_context.reference_anchor.hardness_value_pattern'); + return $this->getRequiredString('follow_up_context.reference_anchor.measurement_value_pattern'); } - public function getFollowUpReferenceAnchorTestomatModelPattern(): string - { - return $this->getFollowUpReferenceAnchorProductModelPattern(); - } - - public function getFollowUpReferenceAnchorHardnessValuePattern(): string - { - return $this->getFollowUpReferenceAnchorMeasurementValuePattern(); - } - - public function getFollowUpContextPreviousUserQuestionTemplate(): string { return $this->getRequiredString('follow_up_context.context_labels.previous_user_question_template');