patch 17c

This commit is contained in:
team 1
2026-05-01 20:38:10 +02:00
parent 1897fdf0eb
commit f98de3c785
4 changed files with 136 additions and 20 deletions

View File

@@ -159,14 +159,14 @@ final readonly class NdjsonChunkLookup
foreach ($documents as $document) {
$normalizedTitle = $document['normalized_title'];
if (!$this->isConfidentTitleAlphaTokenMatch($normalizedPrompt, $normalizedTitle)) {
if (!$this->isConfidentTitleTokenMatchAllowingMissingNumeric($normalizedPrompt, $normalizedTitle)) {
continue;
}
$score = 250 + mb_strlen($normalizedTitle, 'UTF-8');
$score = 350 + mb_strlen($normalizedTitle, 'UTF-8');
if (preg_match('/\d/u', $normalizedTitle) === 1) {
$score += 500;
$score += 750;
}
if ($best === null || $score > $bestScore) {
@@ -270,34 +270,40 @@ final readonly class NdjsonChunkLookup
}
/**
* Fallback for product titles where the prompt contains the significant
* alphabetic model tokens, but omits a numeric family token.
*
* This keeps prompts such as a product family plus variant suffix anchored
* to the correct document instead of falling back to broader semantic hits.
* Allows prompts such as "Testomat CAL" to resolve a document titled
* "Testomat 2000 CAL" without also allowing conflicting model numbers.
*/
private function isConfidentTitleAlphaTokenMatch(string $normalizedPrompt, string $normalizedTitle): bool
private function isConfidentTitleTokenMatchAllowingMissingNumeric(string $normalizedPrompt, string $normalizedTitle): bool
{
if ($normalizedPrompt === '' || $normalizedTitle === '') {
return false;
}
$titleTokens = $this->significantTitleTokens($normalizedTitle);
if (count($titleTokens) < 3 || preg_match('/\d/u', $normalizedTitle) !== 1) {
return false;
}
$alphaTokens = array_values(array_filter(
$titleTokens,
static fn (string $token): bool => preg_match('/\d/u', $token) !== 1
));
$numericTokens = array_values(array_filter(
$titleTokens,
static fn (string $token): bool => preg_match('/\d/u', $token) === 1
));
if (count($alphaTokens) < 2 || count($alphaTokens) === count($titleTokens)) {
if (count($alphaTokens) < 2 || $numericTokens === []) {
return false;
}
$promptTokenVariants = $this->tokenVariantLookup($normalizedPrompt);
foreach ($alphaTokens as $titleToken) {
foreach ($alphaTokens as $alphaToken) {
$matched = false;
foreach ($this->tokenVariants($titleToken) as $variant) {
foreach ($this->tokenVariants($alphaToken) as $variant) {
if (isset($promptTokenVariants[$variant])) {
$matched = true;
break;
@@ -309,7 +315,21 @@ final readonly class NdjsonChunkLookup
}
}
return true;
$promptHasNumericToken = preg_match('/\d/u', $normalizedPrompt) === 1;
if (!$promptHasNumericToken) {
return true;
}
foreach ($numericTokens as $numericToken) {
foreach ($this->tokenVariants($numericToken) as $variant) {
if (isset($promptTokenVariants[$variant])) {
return true;
}
}
}
return false;
}
/**