Fix retrieval of final technical questions
This commit is contained in:
@@ -10,8 +10,7 @@ final readonly class NdjsonChunkLookup
|
||||
{
|
||||
public function __construct(
|
||||
private ChunkManager $chunkManager
|
||||
)
|
||||
{
|
||||
) {
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -64,12 +63,12 @@ final readonly class NdjsonChunkLookup
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolves the best exact document title match from the user prompt.
|
||||
* Resolves the best document-title match from the user prompt.
|
||||
*
|
||||
* Matching rules:
|
||||
* - the normalized prompt must contain the full normalized document title
|
||||
* - titles containing digits are preferred, e.g. "Testomat 808"
|
||||
* - longer exact titles win over shorter generic titles
|
||||
* - exact contiguous title matches still win
|
||||
* - if no exact match exists, all significant title tokens may match in any order
|
||||
* - token fallback is intentionally conservative and handles light German suffixes
|
||||
*
|
||||
* @return array{
|
||||
* document_id:string,
|
||||
@@ -135,6 +134,27 @@ final readonly class NdjsonChunkLookup
|
||||
}
|
||||
}
|
||||
|
||||
if ($best === null) {
|
||||
foreach ($documents as $document) {
|
||||
$normalizedTitle = $document['normalized_title'];
|
||||
|
||||
if (!$this->isConfidentTitleTokenMatch($normalizedPrompt, $normalizedTitle)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$score = 500 + mb_strlen($normalizedTitle, 'UTF-8');
|
||||
|
||||
if (preg_match('/\d/u', $normalizedTitle) === 1) {
|
||||
$score += 1000;
|
||||
}
|
||||
|
||||
if ($best === null || $score > $bestScore) {
|
||||
$best = $document;
|
||||
$bestScore = $score;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($best === null) {
|
||||
return null;
|
||||
}
|
||||
@@ -192,6 +212,115 @@ final readonly class NdjsonChunkLookup
|
||||
return count($significantTokens) >= 2 || preg_match('/\d/u', $normalizedTitle) === 1;
|
||||
}
|
||||
|
||||
/**
 * Token-based fallback check: decides whether a prompt mentions a document
 * title by its individual terms rather than as one contiguous phrase.
 *
 * Returns false when either input is empty, or when the title has fewer than
 * three significant tokens and contains no digit. Otherwise every significant
 * title token must have at least one variant (see tokenVariants()) that also
 * appears among the prompt's token variants; token order is irrelevant.
 */
private function isConfidentTitleTokenMatch(string $normalizedPrompt, string $normalizedTitle): bool
{
    if ($normalizedPrompt === '' || $normalizedTitle === '') {
        return false;
    }

    $requiredTokens = $this->significantTitleTokens($normalizedTitle);
    $hasEnoughTokens = count($requiredTokens) >= 3;

    // Short titles are only trusted when they carry a digit.
    if (!$hasEnoughTokens && preg_match('/\d/u', $normalizedTitle) !== 1) {
        return false;
    }

    $promptLookup = $this->tokenVariantLookup($normalizedPrompt);

    foreach ($requiredTokens as $requiredToken) {
        foreach ($this->tokenVariants($requiredToken) as $candidate) {
            if (isset($promptLookup[$candidate])) {
                // This title token is covered; move on to the next one.
                continue 2;
            }
        }

        // No variant of this title token occurs in the prompt.
        return false;
    }

    return true;
}
|
||||
|
||||
/**
 * Extracts the distinct tokens of a normalized title that are relevant for matching.
 *
 * The title is split on whitespace. A token is kept when it is at least three
 * characters long or contains a digit; shorter digit-free tokens are dropped.
 * Duplicates are removed, keeping the first occurrence.
 *
 * @return string[]
 */
private function significantTitleTokens(string $normalizedTitle): array
{
    $pieces = preg_split('/\s+/u', $normalizedTitle, -1, PREG_SPLIT_NO_EMPTY);

    if ($pieces === false || $pieces === []) {
        return [];
    }

    $trimmed = array_map(static fn (string $piece): string => trim($piece), $pieces);

    $significant = array_filter(
        $trimmed,
        static fn (string $piece): bool => $piece !== ''
            && (mb_strlen($piece, 'UTF-8') >= 3 || preg_match('/\d/u', $piece) === 1),
    );

    // array_filter and array_unique preserve keys, so reindex to a plain list.
    return array_values(array_unique($significant));
}
|
||||
|
||||
/**
 * Builds a membership set of every token variant found in the given text.
 *
 * The text is split on whitespace and each token is expanded through
 * tokenVariants(); every resulting variant becomes a key mapped to true.
 *
 * @return array<string,bool>
 */
private function tokenVariantLookup(string $normalizedText): array
{
    $words = preg_split('/\s+/u', $normalizedText, -1, PREG_SPLIT_NO_EMPTY);

    if ($words === false || $words === []) {
        return [];
    }

    $index = [];

    foreach ($words as $word) {
        // Array union keeps keys already present and appends the new ones.
        $index += array_fill_keys($this->tokenVariants($word), true);
    }

    return $index;
}
|
||||
|
||||
/**
 * Returns the token itself plus stems obtained by stripping one known suffix.
 *
 * The token is trimmed first; an empty token yields an empty list. Stemming is
 * only attempted for tokens of five or more characters, and a stem is only
 * accepted when at least three characters remain. The original token is always
 * the first entry of the result.
 *
 * @return string[]
 */
private function tokenVariants(string $token): array
{
    $word = trim($token);

    if ($word === '') {
        return [];
    }

    $wordLength = mb_strlen($word, 'UTF-8');

    // Short tokens are never stemmed.
    if ($wordLength < 5) {
        return [$word];
    }

    $suffixes = ['innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'];
    $forms = [$word];

    foreach ($suffixes as $ending) {
        if (str_ends_with($word, $ending)) {
            $stemLength = $wordLength - mb_strlen($ending, 'UTF-8');
            $stem = mb_substr($word, 0, $stemLength, 'UTF-8');

            if (mb_strlen($stem, 'UTF-8') >= 3) {
                $forms[] = $stem;
            }
        }
    }

    return array_values(array_unique($forms));
}
|
||||
|
||||
private function normalizeText(string $value): string
|
||||
{
|
||||
$value = mb_strtolower(trim($value), 'UTF-8');
|
||||
@@ -201,4 +330,4 @@ final readonly class NdjsonChunkLookup
|
||||
|
||||
return trim($value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user