first commit

This commit is contained in:
team 1
2026-02-11 14:15:08 +01:00
parent a4742c2c38
commit aa7d362bc3
58 changed files with 9999 additions and 0 deletions

View File

@@ -0,0 +1,87 @@
<?php
declare(strict_types=1);
namespace App\Knowledge;
/**
* KeywordSimilarity
*
* Deterministic and fault-tolerant comparison of two keywords.
* Returns a similarity score between 0.0 and 1.0.
*
* Design goals:
* - index.json remains unchanged
* - comparison logic is intelligent (typos, phonetics)
* - no alias or synonym lists
* - no LLM dependency
*/
final class KeywordSimilarity
{
    /**
     * Compare a query token with an index keyword.
     *
     * Both inputs are normalized (see normalize()) and then scored by the
     * first rule that applies:
     *
     *   1.0   normalized strings are identical
     *   0.85  both strings produce the same, non-empty metaphone key
     *   0.9   edit distance of exactly 1 (both strings >= 6 characters)
     *   0.8   edit distance of exactly 2 (both strings >= 6 characters)
     *   0.0   anything else, or either normalized string is shorter than
     *         3 characters
     *
     * The phonetic rule is evaluated before the edit-distance rule, so a pair
     * that satisfies both is scored 0.85.
     *
     * @param string $queryToken   Token from user input
     * @param string $indexKeyword Keyword from index.json
     *
     * @return float Similarity score (0.0 to 1.0)
     */
    public static function compare(string $queryToken, string $indexKeyword): float
    {
        $a = self::normalize($queryToken);
        $b = self::normalize($indexKeyword);

        $lengthA = mb_strlen($a);
        $lengthB = mb_strlen($b);

        // Guard: ignore empty or very short tokens (also covers '').
        if ($lengthA < 3 || $lengthB < 3) {
            return 0.0;
        }

        // 1. Exact match
        if ($a === $b) {
            return 1.0;
        }

        // 2. Phonetic comparison (metaphone)
        // metaphone() only encodes ASCII letters; digits and non-Latin
        // characters are skipped. Tokens without any ASCII letter therefore
        // all yield the empty key, which must not be treated as a match.
        $phoneticKey = metaphone($a);
        if ($phoneticKey !== '' && $phoneticKey === metaphone($b)) {
            return 0.85;
        }

        // 3. Edit distance comparison (only for longer words, where one or
        //    two edits are a small fraction of the word)
        if ($lengthA >= 6 && $lengthB >= 6) {
            $distance = self::editDistance($a, $b);

            if ($distance === 1) {
                return 0.9;
            }

            if ($distance === 2) {
                return 0.8;
            }
        }

        // No relevant match
        return 0.0;
    }

    /**
     * Normalize a keyword to ensure stable comparison.
     *
     * Trims and lower-cases the value, removes every character that is not a
     * Unicode letter or number, and transliterates German umlauts and sharp s
     * to their ASCII digraphs.
     *
     * If the regular expression fails (preg_replace() returns null, e.g. for
     * input that is not valid UTF-8) the value collapses to an empty string,
     * which compare() scores as 0.0.
     */
    private static function normalize(string $value): string
    {
        $value = mb_strtolower(trim($value));

        // Remove non-alphanumeric characters
        $value = preg_replace('/[^\p{L}\p{N}]/u', '', $value) ?? '';

        // Normalize German umlauts
        $map = [
            'ä' => 'ae',
            'ö' => 'oe',
            'ü' => 'ue',
            'ß' => 'ss',
        ];

        return strtr($value, $map);
    }

    /**
     * Levenshtein distance counted in characters rather than bytes.
     *
     * The native levenshtein() operates on bytes, so replacing one multibyte
     * character would be counted as several edits. Pure single-byte input is
     * delegated to the native function; everything else is split into code
     * points and solved with the standard two-row dynamic programme.
     */
    private static function editDistance(string $a, string $b): int
    {
        // Byte length equals character length only when no multibyte
        // character is present, in which case the native result is exact.
        if (strlen($a . $b) === mb_strlen($a . $b)) {
            return levenshtein($a, $b);
        }

        $charsA = preg_split('//u', $a, -1, PREG_SPLIT_NO_EMPTY);
        $charsB = preg_split('//u', $b, -1, PREG_SPLIT_NO_EMPTY);

        // Splitting failed (should not happen for normalized input); fall
        // back to the byte-based distance instead of raising an error.
        if ($charsA === false || $charsB === false) {
            return levenshtein($a, $b);
        }

        $countB = count($charsB);
        $previousRow = range(0, $countB);

        foreach ($charsA as $i => $charA) {
            $currentRow = [$i + 1];

            foreach ($charsB as $j => $charB) {
                $substitution = $previousRow[$j] + ($charA === $charB ? 0 : 1);
                $deletion = $previousRow[$j + 1] + 1;
                $insertion = $currentRow[$j] + 1;

                $currentRow[] = min($substitution, $deletion, $insertion);
            }

            $previousRow = $currentRow;
        }

        return $previousRow[$countB];
    }
}