This commit is contained in:
team 1
2026-05-04 16:33:36 +02:00
parent 33b2b30d99
commit 387506b239
13 changed files with 198 additions and 57 deletions

View File

@@ -16,6 +16,12 @@ parameters:
max_length_ratio_percent: 150 max_length_ratio_percent: 150
heartbeat_message: 'Ich optimiere die Anfrage…' heartbeat_message: 'Ich optimiere die Anfrage…'
output_prefix_pattern: '/^(?:normalisiert|korrigiert|corrected|normalized)\s*:\s*/iu' output_prefix_pattern: '/^(?:normalisiert|korrigiert|corrected|normalized)\s*:\s*/iu'
placeholder_outputs:
- normalized user input
- corrected user input
- user input
- normalisierte nutzereingabe
- korrigierte nutzereingabe
skip_patterns: skip_patterns:
- '/https?:\/\//iu' - '/https?:\/\//iu'
- '/\bwww\./iu' - '/\bwww\./iu'
@@ -192,6 +198,15 @@ parameters:
testomat_model_pattern: '/\bTestomat(?:®)?\s+(?:\d{3,4}(?:\s+[A-Z]{2,8})?|EVO(?:\s+[A-Z]{2,6})?|ECO(?:[-\s]?(?:PLUS|C))?|DUO(?:\s+\d{3,4})?|LAB(?:\s+[A-Z]{2,6})?)\b/iu' testomat_model_pattern: '/\bTestomat(?:®)?\s+(?:\d{3,4}(?:\s+[A-Z]{2,8})?|EVO(?:\s+[A-Z]{2,6})?|ECO(?:[-\s]?(?:PLUS|C))?|DUO(?:\s+\d{3,4})?|LAB(?:\s+[A-Z]{2,6})?)\b/iu'
hardness_value_pattern: '/\b\d+(?:[,.]\d+)?\s*°\s*dH\b/iu' hardness_value_pattern: '/\b\d+(?:[,.]\d+)?\s*°\s*dH\b/iu'
followup_actions:
commerce:
Im Shop suchen: 'Suche die aktuelle Produktauswahl im Shop.'
Nur Zubehör anzeigen: 'Zeige aus der aktuellen Produktauswahl nur Zubehör.'
Nur Geräte anzeigen: 'Zeige aus der aktuellen Produktauswahl nur Geräte.'
Preis anzeigen: 'Zeige mir die Preise der aktuell relevanten Produkte.'
knowledge:
Technische Details anzeigen: 'Zeige technische Details zur aktuellen Antwort.'
messages: messages:
empty_prompt: '❌ Empty prompt.' empty_prompt: '❌ Empty prompt.'
analyze_request: 'Ich analysiere deine Anfrage...' analyze_request: 'Ich analysiere deine Anfrage...'

View File

@@ -69,6 +69,15 @@ parameters:
- tc - tc
- '0,02' - '0,02'
normalization:
# Generic language normalization tables. Keep these in YAML so PHP code
# executes normalization logic without owning language-specific lists.
ascii_transliteration:
ä: ae
ö: oe
ü: ue
ß: ss
stopword_groups: stopword_groups:
de_core: de_core:
- der - der

View File

@@ -46,6 +46,17 @@ parameters:
- messbereich - messbereich
testomat: testomat:
- testomat - testomat
exact_selection_token_variant_suffixes:
- typen
- innen
- enen
- ern
- en
- er
- es
- e
- s
- n
exact_selection_indicator_question_tokens: exact_selection_indicator_question_tokens:
- indikator - indikator
- indikatortyp - indikatortyp

View File

@@ -985,12 +985,7 @@ final readonly class AgentRunner
private function normalizeFuzzyRoutingToken(string $token): string private function normalizeFuzzyRoutingToken(string $token): string
{ {
$token = mb_strtolower(trim($token), 'UTF-8'); $token = mb_strtolower(trim($token), 'UTF-8');
$token = strtr($token, [ $token = $this->languageCleanupConfig->transliterateToAscii($token);
'ä' => 'ae',
'ö' => 'oe',
'ü' => 'ue',
'ß' => 'ss',
]);
$token = preg_replace('/[^a-z0-9]+/u', '', $token) ?? $token; $token = preg_replace('/[^a-z0-9]+/u', '', $token) ?? $token;
return trim($token); return trim($token);
@@ -1028,13 +1023,13 @@ final readonly class AgentRunner
{ {
$normalized = $this->normalizeRoutingComparisonText($candidate); $normalized = $this->normalizeRoutingComparisonText($candidate);
return in_array($normalized, [ foreach ($this->agentRunnerConfig->getInputNormalizationPlaceholderOutputs() as $placeholderOutput) {
'normalized user input', if ($normalized === $this->normalizeRoutingComparisonText($placeholderOutput)) {
'corrected user input', return true;
'user input', }
'normalisierte nutzereingabe', }
'korrigierte nutzereingabe',
], true); return false;
} }
private function normalizeRoutingComparisonText(string $value): string private function normalizeRoutingComparisonText(string $value): string
@@ -2857,12 +2852,7 @@ final readonly class AgentRunner
$value = html_entity_decode(strip_tags($value), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); $value = html_entity_decode(strip_tags($value), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
$value = mb_strtolower($value, 'UTF-8'); $value = mb_strtolower($value, 'UTF-8');
$value = str_replace(['', '', '', '', '—'], '-', $value); $value = str_replace(['', '', '', '', '—'], '-', $value);
$value = strtr($value, [ $value = $this->languageCleanupConfig->transliterateToAscii($value);
'ä' => 'ae',
'ö' => 'oe',
'ü' => 'ue',
'ß' => 'ss',
]);
$value = preg_replace('/\s+/u', ' ', $value) ?? $value; $value = preg_replace('/\s+/u', ' ', $value) ?? $value;
return trim($value); return trim($value);
@@ -3241,14 +3231,15 @@ final readonly class AgentRunner
$actions = []; $actions = [];
if ($isCommerceIntent || $hasShopResults) { if ($isCommerceIntent || $hasShopResults) {
$actions[] = ['Im Shop suchen', 'Suche die aktuelle Produktauswahl im Shop.']; foreach ($this->agentRunnerConfig->getCommerceFollowUpActions() as $label => $actionPrompt) {
$actions[] = ['Nur Zubehör anzeigen', 'Zeige aus der aktuellen Produktauswahl nur Zubehör.']; $actions[] = [$label, $actionPrompt];
$actions[] = ['Nur Geräte anzeigen', 'Zeige aus der aktuellen Produktauswahl nur Geräte.']; }
$actions[] = ['Preis anzeigen', 'Zeige mir die Preise der aktuell relevanten Produkte.'];
} }
if ($hasKnowledge || $hasShopResults) { if ($hasKnowledge || $hasShopResults) {
$actions[] = ['Technische Details anzeigen', 'Zeige technische Details zur aktuellen Antwort.']; foreach ($this->agentRunnerConfig->getKnowledgeFollowUpActions() as $label => $actionPrompt) {
$actions[] = [$label, $actionPrompt];
}
} }
if ($actions === []) { if ($actions === []) {

View File

@@ -260,6 +260,26 @@ final class AgentRunnerConfig
return $this->getRequiredStringList('input_normalization.fuzzy_routing.terms'); return $this->getRequiredStringList('input_normalization.fuzzy_routing.terms');
} }
/**
 * Returns the list configured under "input_normalization.placeholder_outputs".
 *
 * @return string[]
 */
public function getInputNormalizationPlaceholderOutputs(): array
{
    $placeholderOutputs = $this->getRequiredStringList('input_normalization.placeholder_outputs');

    return $placeholderOutputs;
}
/**
 * Returns the map configured under "followup_actions.commerce".
 *
 * @return array<string, string>
 */
public function getCommerceFollowUpActions(): array
{
    $commerceActions = $this->getRequiredStringMap('followup_actions.commerce');

    return $commerceActions;
}
/**
 * Returns the map configured under "followup_actions.knowledge".
 *
 * @return array<string, string>
 */
public function getKnowledgeFollowUpActions(): array
{
    $knowledgeActions = $this->getRequiredStringMap('followup_actions.knowledge');

    return $knowledgeActions;
}
private function getRequiredInt(string $key): int private function getRequiredInt(string $key): int
{ {
$value = $this->requiredValue($key); $value = $this->requiredValue($key);
@@ -384,6 +404,39 @@ final class AgentRunnerConfig
return $out; return $out;
} }
/**
 * Reads a required config entry and returns it as a map of trimmed strings.
 *
 * Pairs whose key or value is not scalar, or is empty after trimming, are
 * skipped. If two keys are equal after trimming, the later value wins.
 *
 * @param string $key Config key passed to requiredValue().
 *
 * @return array<string, string>
 *
 * @throws \InvalidArgumentException When the value is not an array, or when
 *                                   no usable pair remains after filtering.
 */
private function getRequiredStringMap(string $key): array
{
    $raw = $this->requiredValue($key);
    if (!is_array($raw)) {
        throw new \InvalidArgumentException(sprintf('RetrieX agent config key "%s" must be a string map.', $key));
    }

    $pairs = [];
    foreach ($raw as $rawName => $rawText) {
        if (is_scalar($rawName) && is_scalar($rawText)) {
            $name = trim((string) $rawName);
            $text = trim((string) $rawText);
            if ($name === '' || $text === '') {
                continue;
            }
            $pairs[$name] = $text;
        }
    }

    if ($pairs !== []) {
        return $pairs;
    }

    throw new \InvalidArgumentException(sprintf('RetrieX agent config key "%s" must contain at least one valid entry.', $key));
}
/** /**
* @return array<string, string> * @return array<string, string>
*/ */

View File

@@ -44,6 +44,27 @@ final class LanguageCleanupConfig
return in_array($term, $this->getProtectedTerms(), true); return in_array($term, $this->getProtectedTerms(), true);
} }
/**
 * Returns the replacement table stored under "normalization.ascii_transliteration".
 *
 * The table is requested as required, so a missing key or a table without
 * any usable entry is reported through invalid().
 *
 * @return array<string, string>
 */
public function getAsciiTransliterationMap(): array
{
    $path = 'normalization.ascii_transliteration';
    $section = $this->requiredMap('normalization');

    if (array_key_exists('ascii_transliteration', $section)) {
        return $this->stringMapFromValue($section['ascii_transliteration'], $path, true);
    }

    throw $this->invalid($path, 'is missing');
}
/**
 * Applies the configured transliteration table to the given string.
 *
 * Replacement is done with strtr(), so only substrings present as keys in
 * the table are changed; everything else is returned as-is.
 *
 * NOTE(review): the empty-table branch only matters if
 * getAsciiTransliterationMap() ever returns an empty array; it currently
 * requests the table as required.
 */
public function transliterateToAscii(string $value): string
{
    $table = $this->getAsciiTransliterationMap();

    return $table === [] ? $value : strtr($value, $table);
}
/** @return string[] */ /** @return string[] */
public function getCleanupProfileNames(): array public function getCleanupProfileNames(): array
{ {
@@ -235,6 +256,35 @@ final class LanguageCleanupConfig
return $out; return $out;
} }
/**
 * Converts a raw config value into a map of trimmed, non-empty strings.
 *
 * Pairs whose key or value is not scalar, or is empty after trimming, are
 * skipped. If two keys are equal after trimming, the later value wins.
 *
 * @param mixed  $value    Raw value; anything other than an array is rejected.
 * @param string $path     Config path handed to invalid() for error reporting.
 * @param bool   $required When true, an empty result is rejected as well.
 *
 * @return array<string, string>
 */
private function stringMapFromValue(mixed $value, string $path, bool $required): array
{
    if (!is_array($value)) {
        throw $this->invalid($path, 'must be a map of non-empty strings');
    }

    $map = [];
    foreach ($value as $rawKey => $rawItem) {
        if (is_scalar($rawKey) && is_scalar($rawItem)) {
            $cleanKey = trim((string) $rawKey);
            $cleanItem = trim((string) $rawItem);
            if ($cleanKey !== '' && $cleanItem !== '') {
                $map[$cleanKey] = $cleanItem;
            }
        }
    }

    if ($map === [] && $required) {
        throw $this->invalid($path, 'must contain at least one non-empty map entry');
    }

    return $map;
}
/** @param string[] $terms */ /** @param string[] $terms */
private function removeProtectedTerms(array $terms): array private function removeProtectedTerms(array $terms): array
{ {

View File

@@ -148,6 +148,12 @@ final class NdjsonHybridRetrieverConfig
return $this->requiredStringListMap('exact_selection_token_variant_prefixes'); return $this->requiredStringListMap('exact_selection_token_variant_prefixes');
} }
/**
 * Returns the list configured under "exact_selection_token_variant_suffixes".
 *
 * @return string[]
 */
public function exactSelectionTokenVariantSuffixes(): array
{
    $suffixes = $this->requiredStringList('exact_selection_token_variant_suffixes');

    return $suffixes;
}
/** @return string[] */ /** @return string[] */
public function exactSelectionIndicatorQuestionTokens(): array public function exactSelectionIndicatorQuestionTokens(): array
{ {
@@ -313,6 +319,7 @@ final class NdjsonHybridRetrieverConfig
'focused_product_max_chunks' => $this->focusedProductMaxChunks(), 'focused_product_max_chunks' => $this->focusedProductMaxChunks(),
'catalog_list_shortcut_patterns' => $this->catalogListShortcutPatterns(), 'catalog_list_shortcut_patterns' => $this->catalogListShortcutPatterns(),
'exact_selection_token_variant_prefixes' => $this->exactSelectionTokenVariantPrefixes(), 'exact_selection_token_variant_prefixes' => $this->exactSelectionTokenVariantPrefixes(),
'exact_selection_token_variant_suffixes' => $this->exactSelectionTokenVariantSuffixes(),
'exact_selection_indicator_question_tokens' => $this->exactSelectionIndicatorQuestionTokens(), 'exact_selection_indicator_question_tokens' => $this->exactSelectionIndicatorQuestionTokens(),
'exact_selection_indicator_question_phrases' => $this->exactSelectionIndicatorQuestionPhrases(), 'exact_selection_indicator_question_phrases' => $this->exactSelectionIndicatorQuestionPhrases(),
'exact_selection_indicator_table_heading_patterns' => $this->exactSelectionIndicatorTableHeadingPatterns(), 'exact_selection_indicator_table_heading_patterns' => $this->exactSelectionIndicatorTableHeadingPatterns(),

View File

@@ -583,6 +583,7 @@ final readonly class RetriexEffectiveConfigProvider
'max_length_ratio_percent' => $this->agentRunnerConfig->getInputNormalizationMaxLengthRatioPercent(), 'max_length_ratio_percent' => $this->agentRunnerConfig->getInputNormalizationMaxLengthRatioPercent(),
'heartbeat_message' => $this->agentRunnerConfig->getInputNormalizationHeartbeatMessage(), 'heartbeat_message' => $this->agentRunnerConfig->getInputNormalizationHeartbeatMessage(),
'output_prefix_pattern' => $this->agentRunnerConfig->getInputNormalizationOutputPrefixPattern(), 'output_prefix_pattern' => $this->agentRunnerConfig->getInputNormalizationOutputPrefixPattern(),
'placeholder_outputs' => $this->agentRunnerConfig->getInputNormalizationPlaceholderOutputs(),
'skip_patterns' => $this->agentRunnerConfig->getInputNormalizationSkipPatterns(), 'skip_patterns' => $this->agentRunnerConfig->getInputNormalizationSkipPatterns(),
'prompt' => [ 'prompt' => [
'intro' => $this->agentRunnerConfig->getInputNormalizationIntro(), 'intro' => $this->agentRunnerConfig->getInputNormalizationIntro(),
@@ -602,6 +603,10 @@ final readonly class RetriexEffectiveConfigProvider
'terms' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingTerms(), 'terms' => $this->agentRunnerConfig->getInputNormalizationFuzzyRoutingTerms(),
], ],
], ],
'followup_actions' => [
'commerce' => $this->agentRunnerConfig->getCommerceFollowUpActions(),
'knowledge' => $this->agentRunnerConfig->getKnowledgeFollowUpActions(),
],
'messages' => [ 'messages' => [
'empty_prompt' => $this->agentRunnerConfig->getEmptyPromptMessage(), 'empty_prompt' => $this->agentRunnerConfig->getEmptyPromptMessage(),
'analyze_request' => $this->agentRunnerConfig->getAnalyzeRequestMessage(), 'analyze_request' => $this->agentRunnerConfig->getAnalyzeRequestMessage(),
@@ -929,6 +934,9 @@ final readonly class RetriexEffectiveConfigProvider
return [ return [
'stopwords' => $this->stopWordsConfig->getStopWords(), 'stopwords' => $this->stopWordsConfig->getStopWords(),
'protected_terms' => $this->languageCleanupConfig->getProtectedTerms(), 'protected_terms' => $this->languageCleanupConfig->getProtectedTerms(),
'normalization' => [
'ascii_transliteration' => $this->languageCleanupConfig->getAsciiTransliterationMap(),
],
'cleanup_profile_names' => $this->languageCleanupConfig->getCleanupProfileNames(), 'cleanup_profile_names' => $this->languageCleanupConfig->getCleanupProfileNames(),
'cleanup_profiles' => $profiles, 'cleanup_profiles' => $profiles,
]; ];
@@ -1200,6 +1208,7 @@ final readonly class RetriexEffectiveConfigProvider
private function validateAgent(array $agent, array &$errors, array &$warnings): void private function validateAgent(array $agent, array &$errors, array &$warnings): void
{ {
$this->validateStringListMap($agent['messages'] ?? [], 'agent.messages', $errors, $warnings); $this->validateStringListMap($agent['messages'] ?? [], 'agent.messages', $errors, $warnings);
$this->validateStringListMap($agent['followup_actions'] ?? [], 'agent.followup_actions', $errors, $warnings);
$this->validateStringListMap($agent['source_labels'] ?? [], 'agent.source_labels', $errors, $warnings); $this->validateStringListMap($agent['source_labels'] ?? [], 'agent.source_labels', $errors, $warnings);
$this->validateStringListMap($agent['html_templates'] ?? [], 'agent.html_templates', $errors, $warnings); $this->validateStringListMap($agent['html_templates'] ?? [], 'agent.html_templates', $errors, $warnings);
@@ -1217,6 +1226,9 @@ final readonly class RetriexEffectiveConfigProvider
$errors[] = 'agent.follow_up_context.commercial_table_follow_up.query_template_without_model must not be empty.'; $errors[] = 'agent.follow_up_context.commercial_table_follow_up.query_template_without_model must not be empty.';
} }
$inputNormalization = is_array($agent['input_normalization'] ?? null) ? $agent['input_normalization'] : [];
$this->validateStringList($this->toList($inputNormalization['placeholder_outputs'] ?? []), 'agent.input_normalization.placeholder_outputs', $errors, $warnings);
$ragEvidence = is_array($agent['rag_evidence_guard'] ?? null) ? $agent['rag_evidence_guard'] : []; $ragEvidence = is_array($agent['rag_evidence_guard'] ?? null) ? $agent['rag_evidence_guard'] : [];
$ragEvidenceCleanupProfile = $ragEvidence['cleanup_profile'] ?? null; $ragEvidenceCleanupProfile = $ragEvidence['cleanup_profile'] ?? null;
if (!is_string($ragEvidenceCleanupProfile) || trim($ragEvidenceCleanupProfile) === '') { if (!is_string($ragEvidenceCleanupProfile) || trim($ragEvidenceCleanupProfile) === '') {

View File

@@ -5,6 +5,7 @@ declare(strict_types=1);
namespace App\Intent; namespace App\Intent;
use App\Config\IntentLightConfig; use App\Config\IntentLightConfig;
use App\Config\LanguageCleanupConfig;
/** /**
* IntentLite * IntentLite
@@ -20,10 +21,9 @@ final readonly class IntentLite
{ {
public function __construct( public function __construct(
private IntentLightConfig $config private IntentLightConfig $config,
) private LanguageCleanupConfig $languageCleanupConfig
{ ) {
} }
public function detectList(string $originalPrompt): array public function detectList(string $originalPrompt): array
@@ -99,16 +99,9 @@ final readonly class IntentLite
{ {
$s = mb_strtolower($s); $s = mb_strtolower($s);
// Umlaute zusätzlich absichern (falls QueryCleaner das tut) // Keep the language-specific transliteration table in YAML.
$replacements = [ // Only append an ASCII variant; do not replace the original form.
'ä' => 'ae', foreach ($this->languageCleanupConfig->getAsciiTransliterationMap() as $umlaut => $alt) {
'ö' => 'oe',
'ü' => 'ue',
'ß' => 'ss',
];
// Nur als Zusatzform speichern (nicht ersetzen!)
foreach ($replacements as $umlaut => $alt) {
if (str_contains($s, $umlaut)) { if (str_contains($s, $umlaut)) {
$s .= ' ' . str_replace($umlaut, $alt, $s); $s .= ' ' . str_replace($umlaut, $alt, $s);
break; break;

View File

@@ -4,6 +4,7 @@ declare(strict_types=1);
namespace App\Intent; namespace App\Intent;
use App\Config\LanguageCleanupConfig;
use App\Config\SalesIntentConfig; use App\Config\SalesIntentConfig;
final class SalesIntentLite final class SalesIntentLite
@@ -16,7 +17,8 @@ final class SalesIntentLite
public const ROI = 'roi'; public const ROI = 'roi';
public function __construct( public function __construct(
private readonly SalesIntentConfig $config private readonly SalesIntentConfig $config,
private readonly LanguageCleanupConfig $languageCleanupConfig
) { ) {
} }
@@ -123,11 +125,6 @@ final class SalesIntentLite
{ {
$s = mb_strtolower($s); $s = mb_strtolower($s);
return strtr($s, [ return $this->languageCleanupConfig->transliterateToAscii($s);
'ä' => 'ae',
'ö' => 'oe',
'ü' => 'ue',
'ß' => 'ss',
]);
} }
} }

View File

@@ -4,12 +4,14 @@ declare(strict_types=1);
namespace App\Knowledge\Retrieval; namespace App\Knowledge\Retrieval;
use App\Config\NdjsonHybridRetrieverConfig;
use App\Knowledge\ChunkManager; use App\Knowledge\ChunkManager;
final readonly class NdjsonChunkLookup final readonly class NdjsonChunkLookup
{ {
public function __construct( public function __construct(
private ChunkManager $chunkManager private ChunkManager $chunkManager,
private NdjsonHybridRetrieverConfig $retrieverConfig
) { ) {
} }
@@ -395,7 +397,7 @@ final readonly class NdjsonChunkLookup
$length = mb_strlen($token, 'UTF-8'); $length = mb_strlen($token, 'UTF-8');
if ($length >= 5) { if ($length >= 5) {
foreach (['innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'] as $suffix) { foreach ($this->retrieverConfig->exactSelectionTokenVariantSuffixes() as $suffix) {
if (!str_ends_with($token, $suffix)) { if (!str_ends_with($token, $suffix)) {
continue; continue;
} }

View File

@@ -843,7 +843,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$length = mb_strlen($token, 'UTF-8'); $length = mb_strlen($token, 'UTF-8');
if ($length >= 5) { if ($length >= 5) {
foreach (['typen', 'innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'] as $suffix) { foreach ($this->retrieverConfig->exactSelectionTokenVariantSuffixes() as $suffix) {
if (!str_ends_with($token, $suffix)) { if (!str_ends_with($token, $suffix)) {
continue; continue;
} }

View File

@@ -2,19 +2,20 @@
namespace App\Service; namespace App\Service;
use App\Config\LanguageCleanupConfig;
class FormatText class FormatText
{ {
/**
 * @param LanguageCleanupConfig $languageCleanupConfig Config object that slugify() reads the
 *                                                     ASCII transliteration map from.
 */
public function __construct(
    private readonly LanguageCleanupConfig $languageCleanupConfig,
) {
}
function slugify(string $text): string function slugify(string $text): string
{ {
$text = mb_strtolower($text, 'UTF-8'); $text = mb_strtolower($text, 'UTF-8');
// Umlaute ersetzen // Use YAML-backed language normalization instead of a PHP-owned list.
$replacements = [ $replacements = $this->languageCleanupConfig->getAsciiTransliterationMap();
'ä' => 'ae',
'ö' => 'oe',
'ü' => 'ue',
'ß' => 'ss'
];
$text = str_replace(array_keys($replacements), $replacements, $text); $text = str_replace(array_keys($replacements), $replacements, $text);
// Nicht erlaubte Zeichen entfernen // Nicht erlaubte Zeichen entfernen
@@ -27,4 +28,4 @@ class FormatText
return trim($text, '-'); return trim($text, '-');
} }
} }