Fix retrieval of final technical questions

This commit is contained in:
team 1
2026-04-24 11:32:54 +02:00
parent b800c1fc8f
commit 8a7cb89c5d
3 changed files with 190 additions and 7 deletions

View File

@@ -10,8 +10,7 @@ final readonly class NdjsonChunkLookup
{
public function __construct(
private ChunkManager $chunkManager
)
{
) {
}
/**
@@ -64,12 +63,12 @@ final readonly class NdjsonChunkLookup
}
/**
* Resolves the best exact document title match from the user prompt.
* Resolves the best document-title match from the user prompt.
*
* Matching rules:
* - the normalized prompt must contain the full normalized document title
* - titles containing digits are preferred, e.g. "Testomat 808"
* - longer exact titles win over shorter generic titles
* - exact contiguous title matches still win
* - if no exact match exists, all significant title tokens may match in any order
* - token fallback is intentionally conservative and handles light German suffixes
*
* @return array{
* document_id:string,
@@ -135,6 +134,27 @@ final readonly class NdjsonChunkLookup
}
}
if ($best === null) {
foreach ($documents as $document) {
$normalizedTitle = $document['normalized_title'];
if (!$this->isConfidentTitleTokenMatch($normalizedPrompt, $normalizedTitle)) {
continue;
}
$score = 500 + mb_strlen($normalizedTitle, 'UTF-8');
if (preg_match('/\d/u', $normalizedTitle) === 1) {
$score += 1000;
}
if ($best === null || $score > $bestScore) {
$best = $document;
$bestScore = $score;
}
}
}
if ($best === null) {
return null;
}
@@ -192,6 +212,115 @@ final readonly class NdjsonChunkLookup
return count($significantTokens) >= 2 || preg_match('/\d/u', $normalizedTitle) === 1;
}
/**
 * Token-based fallback check: decides whether the prompt mentions every
 * significant term of a document title, regardless of word order.
 *
 * The check is rejected up front when either input is empty, or when the
 * title yields fewer than three significant tokens and contains no digit.
 * Otherwise every significant title token must share at least one variant
 * (the token itself or one of its suffix-stripped stems) with the variants
 * derived from the prompt.
 *
 * @param string $normalizedPrompt Prompt text, split on whitespace here.
 * @param string $normalizedTitle  Document title, split on whitespace here.
 *
 * @return bool True only if all significant title tokens are found in the prompt.
 */
private function isConfidentTitleTokenMatch(string $normalizedPrompt, string $normalizedTitle): bool
{
    if ($normalizedPrompt === '' || $normalizedTitle === '') {
        return false;
    }

    $requiredTokens = $this->significantTitleTokens($normalizedTitle);

    // Short titles are only trusted when they carry a digit.
    $hasFewTokens = count($requiredTokens) < 3;
    if ($hasFewTokens && !(preg_match('/\d/u', $normalizedTitle) === 1)) {
        return false;
    }

    $promptVariants = $this->tokenVariantLookup($normalizedPrompt);

    foreach ($requiredTokens as $requiredToken) {
        // Flip the variants into keys so they can be intersected with the
        // prompt lookup; an empty intersection means this token is missing.
        $candidateKeys = array_flip($this->tokenVariants($requiredToken));
        if (array_intersect_key($candidateKeys, $promptVariants) === []) {
            return false;
        }
    }

    return true;
}
/**
 * Splits a title on whitespace and keeps only the tokens considered
 * significant: tokens of at least three characters, or tokens of any
 * length that contain a digit. Duplicates are removed, keeping the first
 * occurrence, and the result is re-indexed from zero.
 *
 * @return string[]
 */
private function significantTitleTokens(string $normalizedTitle): array
{
    $parts = preg_split('/\s+/u', $normalizedTitle, -1, PREG_SPLIT_NO_EMPTY);
    if ($parts === false || $parts === []) {
        return [];
    }

    $isSignificant = static function (string $word): bool {
        if ($word === '') {
            return false;
        }

        return mb_strlen($word, 'UTF-8') >= 3 || preg_match('/\d/u', $word) === 1;
    };

    $significant = array_filter(array_map(trim(...), $parts), $isSignificant);

    return array_values(array_unique($significant));
}
/**
 * Builds a set-like lookup of every token variant found in the given text.
 *
 * The text is split on whitespace; each token contributes itself and its
 * suffix-stripped stems (as produced by tokenVariants()) as array keys
 * mapped to true. Note that PHP stores purely numeric tokens such as
 * "808" under integer keys; isset() lookups with the string form still hit.
 *
 * @return array<string,bool>
 */
private function tokenVariantLookup(string $normalizedText): array
{
    $words = preg_split('/\s+/u', $normalizedText, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        $words = [];
    }

    $seen = [];
    foreach ($words as $word) {
        // Array union keeps keys that are already present and appends new ones.
        $seen += array_fill_keys($this->tokenVariants($word), true);
    }

    return $seen;
}
/**
 * Returns the trimmed token followed by its suffix-stripped stems.
 *
 * Stemming is only attempted for tokens of five or more characters. Each
 * suffix in the fixed list that the token ends with is cut off, and the
 * remaining stem is kept when it still has at least three characters.
 * An empty (or whitespace-only) token yields an empty list.
 *
 * @return string[]
 */
private function tokenVariants(string $token): array
{
    $word = trim($token);
    if ($word === '') {
        return [];
    }

    $wordLength = mb_strlen($word, 'UTF-8');
    if ($wordLength < 5) {
        // Too short to stem: the token is its own only variant.
        return [$word];
    }

    $forms = [$word];
    foreach (['innen', 'enen', 'ern', 'en', 'er', 'es', 'e', 's', 'n'] as $ending) {
        if (str_ends_with($word, $ending)) {
            $keep = $wordLength - mb_strlen($ending, 'UTF-8');
            $stem = mb_substr($word, 0, $keep, 'UTF-8');
            if (mb_strlen($stem, 'UTF-8') >= 3) {
                $forms[] = $stem;
            }
        }
    }

    return array_values(array_unique($forms));
}
private function normalizeText(string $value): string
{
$value = mb_strtolower(trim($value), 'UTF-8');
@@ -201,4 +330,4 @@ final readonly class NdjsonChunkLookup
return trim($value);
}
}
}