patch 17c
This commit is contained in:
@@ -159,14 +159,14 @@ final readonly class NdjsonChunkLookup
|
||||
foreach ($documents as $document) {
|
||||
$normalizedTitle = $document['normalized_title'];
|
||||
|
||||
if (!$this->isConfidentTitleAlphaTokenMatch($normalizedPrompt, $normalizedTitle)) {
|
||||
if (!$this->isConfidentTitleTokenMatchAllowingMissingNumeric($normalizedPrompt, $normalizedTitle)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$score = 250 + mb_strlen($normalizedTitle, 'UTF-8');
|
||||
$score = 350 + mb_strlen($normalizedTitle, 'UTF-8');
|
||||
|
||||
if (preg_match('/\d/u', $normalizedTitle) === 1) {
|
||||
$score += 500;
|
||||
$score += 750;
|
||||
}
|
||||
|
||||
if ($best === null || $score > $bestScore) {
|
||||
@@ -270,34 +270,40 @@ final readonly class NdjsonChunkLookup
|
||||
}
|
||||
|
||||
/**
|
||||
* Fallback for product titles where the prompt contains the significant
|
||||
* alphabetic model tokens, but omits a numeric family token.
|
||||
*
|
||||
* This keeps prompts such as a product family plus variant suffix anchored
|
||||
* to the correct document instead of falling back to broader semantic hits.
|
||||
* Allows prompts such as "Testomat CAL" to resolve a document titled
|
||||
* "Testomat 2000 CAL" without also allowing conflicting model numbers.
|
||||
*/
|
||||
private function isConfidentTitleAlphaTokenMatch(string $normalizedPrompt, string $normalizedTitle): bool
|
||||
private function isConfidentTitleTokenMatchAllowingMissingNumeric(string $normalizedPrompt, string $normalizedTitle): bool
|
||||
{
|
||||
if ($normalizedPrompt === '' || $normalizedTitle === '') {
|
||||
return false;
|
||||
}
|
||||
|
||||
$titleTokens = $this->significantTitleTokens($normalizedTitle);
|
||||
|
||||
if (count($titleTokens) < 3 || preg_match('/\d/u', $normalizedTitle) !== 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
$alphaTokens = array_values(array_filter(
|
||||
$titleTokens,
|
||||
static fn (string $token): bool => preg_match('/\d/u', $token) !== 1
|
||||
));
|
||||
$numericTokens = array_values(array_filter(
|
||||
$titleTokens,
|
||||
static fn (string $token): bool => preg_match('/\d/u', $token) === 1
|
||||
));
|
||||
|
||||
if (count($alphaTokens) < 2 || count($alphaTokens) === count($titleTokens)) {
|
||||
if (count($alphaTokens) < 2 || $numericTokens === []) {
|
||||
return false;
|
||||
}
|
||||
|
||||
$promptTokenVariants = $this->tokenVariantLookup($normalizedPrompt);
|
||||
|
||||
foreach ($alphaTokens as $titleToken) {
|
||||
foreach ($alphaTokens as $alphaToken) {
|
||||
$matched = false;
|
||||
|
||||
foreach ($this->tokenVariants($titleToken) as $variant) {
|
||||
foreach ($this->tokenVariants($alphaToken) as $variant) {
|
||||
if (isset($promptTokenVariants[$variant])) {
|
||||
$matched = true;
|
||||
break;
|
||||
@@ -309,7 +315,21 @@ final readonly class NdjsonChunkLookup
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
$promptHasNumericToken = preg_match('/\d/u', $normalizedPrompt) === 1;
|
||||
|
||||
if (!$promptHasNumericToken) {
|
||||
return true;
|
||||
}
|
||||
|
||||
foreach ($numericTokens as $numericToken) {
|
||||
foreach ($this->tokenVariants($numericToken) as $variant) {
|
||||
if (isset($promptTokenVariants[$variant])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user