Files
MtoRagSystem/src/Knowledge/KeywordSimilarity.php
2026-02-11 14:15:08 +01:00

88 lines
2.1 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
declare(strict_types=1);
namespace App\Knowledge;
/**
* KeywordSimilarity
*
* Deterministic and fault-tolerant comparison of two keywords.
* Returns a similarity score between 0.0 and 1.0.
*
* Design goals:
* - index.json remains unchanged
* - comparison logic is intelligent (typos, phonetics)
* - no alias or synonym lists
* - no LLM dependency
*/
final class KeywordSimilarity
{
    /**
     * Compare a query token with an index keyword.
     *
     * Both inputs are normalized (see normalize()) and then checked in order;
     * the first rule that applies determines the score:
     *
     *  - 0.0  either normalized token is shorter than 3 characters
     *  - 1.0  normalized tokens are identical
     *  - 0.85 both tokens produce the same non-empty metaphone key
     *  - 0.9  both tokens have at least 6 characters and edit distance 1
     *  - 0.8  both tokens have at least 6 characters and edit distance 2
     *  - 0.0  otherwise
     *
     * @param string $queryToken   Token from user input
     * @param string $indexKeyword Keyword from index.json
     *
     * @return float Similarity score (0.0 - 1.0)
     */
    public static function compare(string $queryToken, string $indexKeyword): float
    {
        $a = self::normalize($queryToken);
        $b = self::normalize($indexKeyword);

        $lengthA = mb_strlen($a);
        $lengthB = mb_strlen($b);

        // Guard: ignore empty or very short tokens (an empty string has length 0).
        if ($lengthA < 3 || $lengthB < 3) {
            return 0.0;
        }

        // 1. Exact match
        if ($a === $b) {
            return 1.0;
        }

        // 2. Phonetic comparison (metaphone) for misspellings that sound alike.
        // metaphone() only looks at ASCII letters, so tokens made solely of
        // digits or non-Latin letters yield an empty key. Two empty keys carry
        // no phonetic information and must not count as a match.
        $phoneticA = metaphone($a);
        if ($phoneticA !== '' && $phoneticA === metaphone($b)) {
            return 0.85;
        }

        // 3. Edit distance comparison (only for longer words, where one or two
        // edits are a small fraction of the word)
        if ($lengthA >= 6 && $lengthB >= 6) {
            $distance = self::editDistance($a, $b);
            if ($distance === 1) {
                return 0.9;
            }
            if ($distance === 2) {
                return 0.8;
            }
        }

        // No relevant match
        return 0.0;
    }

    /**
     * Normalize a keyword to ensure stable comparison.
     *
     * Trims and lowercases the value, transliterates German umlauts and sharp s
     * (in both precomposed and decomposed Unicode form) and finally removes
     * every character that is not a letter or a digit.
     */
    private static function normalize(string $value): string
    {
        $value = mb_strtolower(trim($value));

        // Transliterate BEFORE stripping: the decomposed forms consist of a base
        // letter followed by U+0308 (combining diaeresis), and the strip step
        // below would otherwise drop the mark and leave only the base letter.
        $map = [
            'ä' => 'ae',
            'ö' => 'oe',
            'ü' => 'ue',
            'ß' => 'ss',
            "a\u{0308}" => 'ae',
            "o\u{0308}" => 'oe',
            "u\u{0308}" => 'ue',
        ];
        $value = strtr($value, $map);

        // Remove non-alphanumeric characters. preg_replace() returns null on
        // failure (e.g. invalid UTF-8); treat that as an empty token.
        return preg_replace('/[^\p{L}\p{N}]/u', '', $value) ?? '';
    }

    /**
     * Levenshtein distance counted in characters rather than bytes.
     *
     * The native levenshtein() works on bytes, so replacing one multi-byte
     * character would be counted as several edits.
     */
    private static function editDistance(string $a, string $b): int
    {
        // Single-byte-only strings: byte and character distance are identical,
        // so the native implementation is exact.
        if (strlen($a) === mb_strlen($a) && strlen($b) === mb_strlen($b)) {
            return levenshtein($a, $b);
        }

        $charsA = preg_split('//u', $a, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        $charsB = preg_split('//u', $b, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        $countB = count($charsB);

        // Row-by-row dynamic programming; $previous[j] holds the distance
        // between the processed prefix of $a and the first j characters of $b.
        $previous = range(0, $countB);
        foreach ($charsA as $i => $charA) {
            $current = [$i + 1];
            foreach ($charsB as $j => $charB) {
                $substitution = $previous[$j] + ($charA === $charB ? 0 : 1);
                $deletion = $previous[$j + 1] + 1;
                $insertion = $current[$j] + 1;
                $current[] = min($substitution, $deletion, $insertion);
            }
            $previous = $current;
        }

        return $previous[$countB];
    }
}