harden retrieve logic
This commit is contained in:
@@ -60,7 +60,6 @@ final readonly class PromptBuilder
|
||||
* @param ShopProductResult[] $shopResults
|
||||
* @param bool|null $fullContext
|
||||
* @param string|null $swagFullOutPut
|
||||
* @return string
|
||||
*/
|
||||
public function build(
|
||||
string $prompt,
|
||||
@@ -71,11 +70,42 @@ final readonly class PromptBuilder
|
||||
?bool $fullContext = false,
|
||||
?string $swagFullOutPut = ''
|
||||
): string {
|
||||
$prompt = $this->normalizeBlockText($prompt);
|
||||
$urlContent = $this->normalizeBlockText($urlContent);
|
||||
$swagFullOutPut = $this->normalizeNullableBlockText($swagFullOutPut);
|
||||
|
||||
$systemBlock = $this->buildSystemBlock();
|
||||
$shopBlock = $this->buildShopBlock($shopResults, $swagFullOutPut);
|
||||
$knowledgeBlock = $this->buildKnowledgeBlock($knowledgeChunks, $urlContent, $prompt);
|
||||
$userBlock = $this->buildUserBlock($prompt);
|
||||
|
||||
// Build fixed blocks first so history only receives the remaining budget.
|
||||
$fixedPrompt = $this->implodeBlocks([
|
||||
$systemBlock,
|
||||
$shopBlock,
|
||||
$knowledgeBlock,
|
||||
$userBlock,
|
||||
]);
|
||||
|
||||
$contextBlock = $this->buildContextBlock(
|
||||
userId: $userId,
|
||||
fixedPrompt: $fixedPrompt,
|
||||
fullContext: (bool) $fullContext
|
||||
);
|
||||
|
||||
return $this->implodeBlocks([
|
||||
$systemBlock,
|
||||
$shopBlock,
|
||||
$knowledgeBlock,
|
||||
$contextBlock,
|
||||
$userBlock,
|
||||
]);
|
||||
}
|
||||
|
||||
private function buildSystemBlock(): string
|
||||
{
|
||||
$now = (new DateTimeImmutable())->format('Y-m-d H:i:s');
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// 1) SYSTEM INSTRUCTIONS
|
||||
// ------------------------------------------------------------
|
||||
$activePrompt = $this->systemPromptRepository->findActive();
|
||||
|
||||
if (!$activePrompt) {
|
||||
@@ -83,46 +113,13 @@ final readonly class PromptBuilder
|
||||
}
|
||||
|
||||
$activeSystemPrompt = str_replace('{% now %}', $now, $activePrompt->getContent());
|
||||
$systemBlock = "SYSTEM:\n" . $activeSystemPrompt;
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// 2) PRIORITIZED FIXED BLOCKS
|
||||
// ------------------------------------------------------------
|
||||
$shopBlock = $this->buildShopBlock($shopResults, $swagFullOutPut);
|
||||
$knowledgeBlock = $this->buildKnowledgeBlock($knowledgeChunks, $urlContent);
|
||||
$userBlock = "USER QUESTION:\n" . $prompt;
|
||||
return "SYSTEM:\n" . $this->normalizeBlockText($activeSystemPrompt);
|
||||
}
|
||||
|
||||
// Build all fixed blocks first so history only gets the remaining budget.
|
||||
$fixedBlocks = array_filter([
|
||||
$systemBlock,
|
||||
$shopBlock,
|
||||
$knowledgeBlock,
|
||||
$userBlock,
|
||||
]);
|
||||
|
||||
$fixedPrompt = implode("\n\n", $fixedBlocks);
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// 3) CONVERSATION CONTEXT (AUTHORITATIVE, FILLS REMAINING SPACE)
|
||||
// ------------------------------------------------------------
|
||||
$contextBlock = $this->buildContextBlock(
|
||||
userId: $userId,
|
||||
fixedPrompt: $fixedPrompt,
|
||||
fullContext: (bool) $fullContext
|
||||
);
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// 4) FINAL PROMPT ASSEMBLY
|
||||
// ------------------------------------------------------------
|
||||
$blocks = array_filter([
|
||||
$systemBlock,
|
||||
$shopBlock,
|
||||
$knowledgeBlock,
|
||||
$contextBlock,
|
||||
$userBlock,
|
||||
]);
|
||||
|
||||
return implode("\n\n", $blocks);
|
||||
/**
 * Wrap the user's question in its labelled prompt section.
 *
 * @param string $prompt Question text; appended exactly as given.
 *
 * @return string The "USER QUESTION:" header line followed by the question.
 */
private function buildUserBlock(string $prompt): string
{
    $header = "USER QUESTION:\n";

    return $header . $prompt;
}
|
||||
|
||||
/**
|
||||
@@ -151,33 +148,36 @@ final readonly class PromptBuilder
|
||||
);
|
||||
}
|
||||
|
||||
$history = $this->normalizeBlockText($history);
|
||||
|
||||
if ($history === '') {
|
||||
return '';
|
||||
}
|
||||
|
||||
return
|
||||
"CONVERSATION CONTEXT (authoritative):\n" .
|
||||
"The following messages are the previous turns of this conversation.\n" .
|
||||
"They must be considered when answering the next question.\n\n" .
|
||||
"CONVERSATION CONTEXT (contextual only):\n" .
|
||||
"The following messages are previous turns of this conversation.\n" .
|
||||
"Use them to resolve references, follow-up questions, and user intent.\n" .
|
||||
"They must not override retrieved factual knowledge or live shop data.\n\n" .
|
||||
$history;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the shop block with the highest business priority.
|
||||
* Build the shop block with the highest business priority for product facts.
|
||||
*/
|
||||
private function buildShopBlock(array $shopResults, ?string $swagFullOutPut): string
|
||||
{
|
||||
$parts = [];
|
||||
|
||||
if ($swagFullOutPut !== null && trim($swagFullOutPut) !== '') {
|
||||
if ($swagFullOutPut !== null && $swagFullOutPut !== '') {
|
||||
$parts[] =
|
||||
"SHOP SEARCH QUERY:\n" .
|
||||
trim($swagFullOutPut) . "\n" .
|
||||
$swagFullOutPut . "\n" .
|
||||
"Source: Shop Search";
|
||||
}
|
||||
|
||||
if ($shopResults === []) {
|
||||
return implode("\n\n", $parts);
|
||||
return $this->implodeBlocks($parts);
|
||||
}
|
||||
|
||||
$isDetailed = count($shopResults) <= 5;
|
||||
@@ -190,19 +190,19 @@ final readonly class PromptBuilder
|
||||
|
||||
$n = $i + 1;
|
||||
$entryParts = [
|
||||
"[{$n}] " . $product->name,
|
||||
"[{$n}] " . $this->normalizeBlockText($product->name),
|
||||
];
|
||||
|
||||
if ($product->productNumber) {
|
||||
$entryParts[] = "Product number: " . $product->productNumber;
|
||||
$entryParts[] = "Product number: " . $this->normalizeBlockText($product->productNumber);
|
||||
}
|
||||
|
||||
if ($product->manufacturer) {
|
||||
$entryParts[] = "Manufacturer: " . $product->manufacturer;
|
||||
$entryParts[] = "Manufacturer: " . $this->normalizeBlockText($product->manufacturer);
|
||||
}
|
||||
|
||||
if ($product->price) {
|
||||
$entryParts[] = "Price: " . $product->price;
|
||||
$entryParts[] = "Price: " . $this->normalizeBlockText($product->price);
|
||||
}
|
||||
|
||||
if ($product->available !== null) {
|
||||
@@ -210,23 +210,27 @@ final readonly class PromptBuilder
|
||||
}
|
||||
|
||||
foreach ($product->highlights as $highlight) {
|
||||
$entryParts[] = "- " . $highlight;
|
||||
$highlight = $this->normalizeBlockText((string) $highlight);
|
||||
|
||||
if ($highlight !== '') {
|
||||
$entryParts[] = "- " . $highlight;
|
||||
}
|
||||
}
|
||||
|
||||
if ($product->url) {
|
||||
$entryParts[] = "URL: " . $product->url;
|
||||
$entryParts[] = "URL: " . $this->normalizeBlockText($product->url);
|
||||
}
|
||||
|
||||
if ($product->productImage) {
|
||||
$entryParts[] = "Product image: " . $product->productImage;
|
||||
$entryParts[] = "Product image: " . $this->normalizeBlockText($product->productImage);
|
||||
}
|
||||
|
||||
if ($isDetailed && $product->description) {
|
||||
$entryParts[] = "Description: " . $product->description;
|
||||
$entryParts[] = "Description: " . $this->normalizeBlockText($product->description);
|
||||
}
|
||||
|
||||
if ($product->customFields) {
|
||||
$entryParts[] = "Meta information: " . $product->customFields;
|
||||
$entryParts[] = "Meta information: " . $this->normalizeBlockText($product->customFields);
|
||||
}
|
||||
|
||||
$lines[] = implode("\n", $entryParts);
|
||||
@@ -235,41 +239,75 @@ final readonly class PromptBuilder
|
||||
if ($lines !== []) {
|
||||
$parts[] =
|
||||
"LIVE SHOP RESULTS (authoritative for products):\n" .
|
||||
"Use these results as authoritative for product identity, availability, pricing, and shop-visible product details.\n" .
|
||||
"If retrieved documents conflict with live shop data on product availability or price, prefer the live shop data.\n" .
|
||||
"Do not infer undocumented technical specifications from live shop data.\n\n" .
|
||||
implode("\n\n", $lines);
|
||||
}
|
||||
|
||||
return implode("\n\n", $parts);
|
||||
return $this->implodeBlocks($parts);
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the supporting knowledge block.
|
||||
* Build the knowledge block.
|
||||
*
|
||||
* Retrieved knowledge is authoritative for factual statements that are present in the sources.
|
||||
* Missing facts must not be invented.
|
||||
*/
|
||||
private function buildKnowledgeBlock(array $knowledgeChunks, string $urlContent): string
|
||||
private function buildKnowledgeBlock(array $knowledgeChunks, string $urlContent, string $prompt): string
|
||||
{
|
||||
$knowledgeParts = [];
|
||||
$isTechnicalProductQuestion = $this->isLikelyTechnicalProductQuestion($prompt);
|
||||
|
||||
if ($knowledgeChunks !== []) {
|
||||
$lines = [];
|
||||
|
||||
foreach ($knowledgeChunks as $i => $chunk) {
|
||||
$chunk = $this->normalizeBlockText((string) $chunk);
|
||||
|
||||
if ($chunk === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$n = $i + 1;
|
||||
$lines[] = "[{$n}] {$chunk}";
|
||||
}
|
||||
|
||||
$knowledgeParts[] =
|
||||
"RETRIEVED KNOWLEDGE (supporting):\n" .
|
||||
"Source: Documents\n" .
|
||||
implode("\n\n", $lines);
|
||||
if ($lines !== []) {
|
||||
$knowledgeParts[] =
|
||||
"FACT GROUNDING RULES:\n" .
|
||||
"- Use retrieved knowledge as authoritative for factual answers.\n" .
|
||||
"- Extract concrete values exactly when they are present, including units, ranges, model names, indicator names, IP classes, temperatures, pressures, dimensions, counts, relay outputs, current outputs, and error codes.\n" .
|
||||
"- Do not invent missing values.\n" .
|
||||
"- Do not replace missing values with estimates, defaults, or typical industry assumptions.\n" .
|
||||
"- Do not claim that information is missing if it appears in the provided sources.\n" .
|
||||
"- Do not compare with other products unless those products are also present in the provided sources.\n" .
|
||||
"- Prefer source-faithful wording over persuasive wording.\n" .
|
||||
"- Avoid marketing language such as 'ideal', 'perfect', 'unverzichtbar', or 'state-of-the-art'.\n" .
|
||||
"- Clearly separate explicit facts from inferences.\n" .
|
||||
"- If an inference is necessary, label it with 'Inference:'.\n" .
|
||||
($isTechnicalProductQuestion
|
||||
? "- For technical product questions, answer primarily with explicitly stated facts.\n" .
|
||||
"- Keep interpretations minimal and do not generalize application areas beyond the provided sources.\n" .
|
||||
"- If the retrieved knowledge describes one specific named product, stay within that product and do not merge related product families or variants.\n" .
|
||||
"- Prefer neutral technical wording over evaluative summaries.\n" .
|
||||
"- If a detail is not explicitly stated in the provided sources, say so plainly.\n"
|
||||
: ""
|
||||
) . "\n" .
|
||||
"RETRIEVED KNOWLEDGE (authoritative for facts):\n" .
|
||||
"Source: Documents\n" .
|
||||
implode("\n\n", $lines);
|
||||
}
|
||||
}
|
||||
|
||||
if ($urlContent !== '') {
|
||||
$knowledgeParts[] =
|
||||
"CONTENT FROM URL (supporting):\n" .
|
||||
"CONTENT FROM URL (authoritative if user-provided):\n" .
|
||||
"Source: URL\n" .
|
||||
$urlContent;
|
||||
}
|
||||
|
||||
return implode("\n\n", $knowledgeParts);
|
||||
return $this->implodeBlocks($knowledgeParts);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -309,6 +347,85 @@ final readonly class PromptBuilder
|
||||
return max(0, $remaining);
|
||||
}
|
||||
|
||||
/**
 * Join prompt blocks into one string separated by a blank line.
 *
 * Non-string entries are ignored. Every string entry is passed through
 * normalizeBlockText() and dropped when the normalized text is empty.
 *
 * @param array $blocks Candidate blocks in output order.
 *
 * @return string The surviving blocks joined with "\n\n", or '' when none survive.
 */
private function implodeBlocks(array $blocks): string
{
    $kept = [];

    foreach ($blocks as $candidate) {
        if (!is_string($candidate)) {
            continue;
        }

        $text = $this->normalizeBlockText($candidate);

        if ($text === '') {
            continue;
        }

        $kept[] = $text;
    }

    return implode("\n\n", $kept);
}
|
||||
|
||||
/**
 * Nullable variant of normalizeBlockText().
 *
 * @param string|null $value Text to normalize, or null.
 *
 * @return string|null The normalized text, or null when the input is null
 *                     or normalizes to an empty string.
 */
private function normalizeNullableBlockText(?string $value): ?string
{
    $normalized = $value === null ? '' : $this->normalizeBlockText($value);

    return $normalized !== '' ? $normalized : null;
}
|
||||
|
||||
/**
 * Normalize free text before it is embedded into a prompt block.
 *
 * Steps, in order:
 * - convert CRLF / CR line endings to LF
 * - replace non-breaking spaces (U+00A0) with regular spaces
 * - trim leading and trailing whitespace
 * - strip spaces/tabs that directly precede a line break
 * - collapse three or more consecutive line breaks into one blank line
 * - collapse runs of two or more spaces/tabs into a single space
 *
 * Trailing whitespace is stripped BEFORE line-break runs are collapsed.
 * Otherwise whitespace-only lines ("a\n \n \nb") would hide the run from
 * the collapse step and leave three consecutive line breaks behind.
 *
 * @param string $value Raw text.
 *
 * @return string Normalized text; if a regex step fails, the result of the
 *                previous step is kept.
 */
private function normalizeBlockText(string $value): string
{
    $value = str_replace(["\r\n", "\r"], "\n", $value);
    $value = str_replace("\u{00A0}", ' ', $value);
    $value = trim($value);

    // Order matters: whitespace-only lines must become empty lines first
    // so that the blank-line collapse below can see them.
    $value = preg_replace("/[ \t]+\n/", "\n", $value) ?? $value;
    $value = preg_replace("/\n{3,}/", "\n\n", $value) ?? $value;
    $value = preg_replace("/[ \t]{2,}/", ' ', $value) ?? $value;

    return $value;
}
|
||||
|
||||
/**
 * Heuristically decide whether the prompt looks like a technical product question.
 *
 * The prompt qualifies when either
 * - at least two keyword occurrences from the list below are found in the
 *   lower-cased prompt, or
 * - the prompt contains a model-like token: two or more letters, an optional
 *   whitespace character, then two to five digits (e.g. "Testomat 808").
 *
 * Keywords are matched longest-first and every matched keyword is masked out
 * of the working copy. This prevents one word from being counted twice when
 * one keyword is a substring of another ("modell" contains "model"), which
 * would otherwise satisfy the two-keyword threshold with a single word.
 *
 * @param string $prompt Prompt text to inspect.
 *
 * @return bool True when the prompt is likely a technical product question.
 */
private function isLikelyTechnicalProductQuestion(string $prompt): bool
{
    $remaining = mb_strtolower($prompt, 'UTF-8');

    $keywords = [
        'technisch',
        'technical',
        'produkt',
        'product',
        'gerät',
        'device',
        'modell',
        'model',
        'messprinzip',
        'schnittstelle',
        'relais',
        'indikator',
        'spannung',
        'strom',
        'druck',
        'temperatur',
        'schutzart',
        'fehlercode',
        'wasserhärte',
        'testomat',
    ];

    // Longest keywords first so a longer keyword claims its text before a
    // shorter keyword contained in it gets a chance to match the same text.
    usort(
        $keywords,
        static fn (string $a, string $b): int => mb_strlen($b, 'UTF-8') <=> mb_strlen($a, 'UTF-8')
    );

    $matches = 0;

    foreach ($keywords as $keyword) {
        if (!str_contains($remaining, $keyword)) {
            continue;
        }

        $matches++;

        if ($matches >= 2) {
            return true;
        }

        // Mask the matched text so overlapping keywords cannot re-count it.
        $remaining = str_replace($keyword, ' ', $remaining);
    }

    return preg_match('/\b[\p{L}]{2,}\s?\d{2,5}\b/u', $prompt) === 1;
}
|
||||
|
||||
private function clamp(int $value, int $min, int $max): int
|
||||
{
|
||||
return max($min, min($max, $value));
|
||||
|
||||
@@ -1,21 +1,68 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Config;
|
||||
|
||||
class NdjsonHybridRetrieverConfig
|
||||
final class NdjsonHybridRetrieverConfig
|
||||
{
|
||||
public const VECTOR_SCORE_THRESHOLD = 0.75;
|
||||
/**
|
||||
* Default semantic similarity threshold for vector hits.
|
||||
*
|
||||
* Chosen to stay selective enough for product-family-heavy data
|
||||
* while not cutting off too many useful fallback hits.
|
||||
*/
|
||||
public const VECTOR_SCORE_THRESHOLD = 0.80;
|
||||
|
||||
public const HARD_MAX_CHUNKS = 90;
|
||||
public const HARD_MAX_VECTORK = 250;
|
||||
/**
|
||||
* Absolute safety caps.
|
||||
*
|
||||
* These limits protect the retriever from overly large candidate sets
|
||||
* even if runtime config values are set too high.
|
||||
*/
|
||||
public const HARD_MAX_CHUNKS = 72;
|
||||
public const HARD_MAX_VECTORK = 180;
|
||||
|
||||
public const LIST_BONUS = 1.25;
|
||||
/**
|
||||
* List-style queries benefit from a slightly wider candidate pool
|
||||
* before de-duplication and final selection.
|
||||
*/
|
||||
public const LIST_BONUS = 1.40;
|
||||
|
||||
public const MAX_CHUNKS_PER_DOC = 2;
|
||||
public const MIN_CHUNK_DISTANCE = 2.5;
|
||||
public const RRF_K = 60;
|
||||
/**
|
||||
* Selection rules for cross-document semantic retrieval.
|
||||
*
|
||||
* MAX_CHUNKS_PER_DOC:
|
||||
* Keeps one document from dominating the final result in normal
|
||||
* semantic retrieval mode.
|
||||
*
|
||||
* MIN_CHUNK_DISTANCE:
|
||||
* Allows nearby chunks to be selected when they are still meaningfully
|
||||
* distinct, which is important for compact product sheets.
|
||||
*/
|
||||
public const MAX_CHUNKS_PER_DOC = 3;
|
||||
public const MIN_CHUNK_DISTANCE = 1.0;
|
||||
|
||||
public const THRESHOLD_FLOOR = 0.83;
|
||||
public const THRESHOLD_CEIL = 0.92;
|
||||
public const EMPTY_RRF_FALLBACK_TOPN = 1;
|
||||
/**
|
||||
* Reciprocal Rank Fusion constant.
|
||||
*
|
||||
* Slightly lower than classic defaults so top-ranked hits matter more.
|
||||
*/
|
||||
public const RRF_K = 50;
|
||||
|
||||
/**
|
||||
* Dynamic threshold clamp boundaries.
|
||||
*
|
||||
* The floor must stay below the default threshold, otherwise the
|
||||
* configured base threshold becomes ineffective.
|
||||
*/
|
||||
public const THRESHOLD_FLOOR = 0.78;
|
||||
public const THRESHOLD_CEIL = 0.90;
|
||||
|
||||
/**
|
||||
* Fallback breadth when strict thresholding removes all fused hits.
|
||||
*
|
||||
* More than one fallback result makes the retriever less brittle.
|
||||
*/
|
||||
public const EMPTY_RRF_FALLBACK_TOPN = 3;
|
||||
}
|
||||
@@ -1,6 +1,5 @@
|
||||
<?php
|
||||
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
@@ -26,17 +25,180 @@ final readonly class NdjsonChunkLookup
|
||||
|
||||
foreach ($this->chunkManager->streamAll() as $row) {
|
||||
$id = $row['chunk_id'] ?? null;
|
||||
|
||||
if (!is_string($id) || !isset($wanted[$id])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$found[$id] = $row;
|
||||
|
||||
if (\count($found) === \count($wanted)) {
|
||||
if (count($found) === count($wanted)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return $found;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Collect every chunk row that belongs to the given document.
 *
 * Iterates the complete chunk stream and keeps rows whose `document_id`
 * is identical to $documentId and whose `chunk_id` is a non-empty string.
 * If a chunk id occurs more than once, the later row replaces the earlier one.
 *
 * @param string $documentId Document id to match (strict comparison).
 *
 * @return array<string,array<string,mixed>> Matching rows keyed by chunk_id.
 */
public function findByDocumentId(string $documentId): array
{
    $matches = [];

    foreach ($this->chunkManager->streamAll() as $candidate) {
        $belongsToDocument = ($candidate['document_id'] ?? null) === $documentId;
        $id = $candidate['chunk_id'] ?? null;

        if ($belongsToDocument && is_string($id) && $id !== '') {
            $matches[$id] = $candidate;
        }
    }

    return $matches;
}
|
||||
|
||||
/**
 * Resolves the best exact document title match from the user prompt.
 *
 * Matching rules:
 * - the normalized prompt must contain the full normalized document title
 *   (see isConfidentTitleMatch())
 * - titles containing digits are preferred, e.g. "Testomat 808"
 * - longer exact titles win over shorter generic titles
 * - on equal score the document whose title was discovered first wins
 *
 * The document title is taken from the first streamed row of a document
 * that yields a non-empty title. Rows are collected independently of title
 * discovery, so rows streamed before the title-bearing row are still part
 * of the returned `rows`. Documents for which no row yields a title are
 * never candidates.
 *
 * Note: the whole chunk stream is read and all valid rows are held in
 * memory while matching.
 *
 * @param string $prompt Raw user prompt.
 *
 * @return array{
 *   document_id:string,
 *   document_title:string,
 *   rows:array<string,array<string,mixed>>
 * }|null Null when the prompt normalizes to '' or no title matches.
 */
public function findBestExactDocumentByPrompt(string $prompt): ?array
{
    $normalizedPrompt = $this->normalizeText($prompt);

    if ($normalizedPrompt === '') {
        return null;
    }

    /** @var array<string,array<string,array<string,mixed>>> $rowsByDocument */
    $rowsByDocument = [];
    /** @var array<string,string> $titles */
    $titles = [];

    foreach ($this->chunkManager->streamAll() as $row) {
        $documentId = $row['document_id'] ?? null;
        $chunkId = $row['chunk_id'] ?? null;

        if (!is_string($documentId) || $documentId === '' || !is_string($chunkId) || $chunkId === '') {
            continue;
        }

        // Always keep the row; the title may only show up in a later chunk.
        $rowsByDocument[$documentId][$chunkId] = $row;

        if (isset($titles[$documentId])) {
            continue;
        }

        $documentTitle = $this->extractDocumentTitle($row);

        if ($documentTitle !== '') {
            $titles[$documentId] = $documentTitle;
        }
    }

    $bestDocumentId = null;
    $bestScore = -1;

    foreach ($titles as $documentId => $documentTitle) {
        $normalizedTitle = $this->normalizeText($documentTitle);

        if (!$this->isConfidentTitleMatch($normalizedPrompt, $normalizedTitle)) {
            continue;
        }

        $score = mb_strlen($normalizedTitle, 'UTF-8');

        // Titles with digits (model numbers) outrank purely textual titles.
        if (preg_match('/\d/u', $normalizedTitle) === 1) {
            $score += 1000;
        }

        if ($score > $bestScore) {
            $bestDocumentId = $documentId;
            $bestScore = $score;
        }
    }

    if ($bestDocumentId === null) {
        return null;
    }

    return [
        // Numeric-looking ids become int array keys in PHP; restore the string.
        'document_id' => (string) $bestDocumentId,
        'document_title' => $titles[$bestDocumentId],
        'rows' => $rowsByDocument[$bestDocumentId],
    ];
}
|
||||
|
||||
/**
 * Derive a document title from a single chunk row.
 *
 * Lookup order:
 * 1. `metadata.document_title`, when it is a string that is non-empty after trimming
 * 2. a "# Produkt Titel: ..." heading line inside the row's `text`
 *    (optionally wrapped in backticks)
 *
 * @param array<string,mixed> $row Chunk row.
 *
 * @return string The trimmed title, or '' when neither source yields one.
 */
private function extractDocumentTitle(array $row): string
{
    $fromMetadata = $row['metadata']['document_title'] ?? null;

    if (is_string($fromMetadata)) {
        $fromMetadata = trim($fromMetadata);

        if ($fromMetadata !== '') {
            return $fromMetadata;
        }
    }

    $body = (string) ($row['text'] ?? '');

    if ($body === '') {
        return '';
    }

    $captured = [];
    $found = preg_match('/^#\s*Produkt\s+Titel:\s*`?([^`\n]+)`?/imu', $body, $captured);

    return $found === 1 ? trim((string) ($captured[1] ?? '')) : '';
}
|
||||
|
||||
/**
 * Decide whether a normalized title occurs in the normalized prompt
 * specifically enough to be treated as an exact document reference.
 *
 * The title must appear in the prompt delimited by spaces or the string
 * boundaries. In addition the title must either contain a digit or consist
 * of at least two tokens of three or more characters.
 *
 * @param string $normalizedPrompt Prompt text; expected to be already normalized.
 * @param string $normalizedTitle  Title text; expected to be already normalized.
 *
 * @return bool True when the title match is considered confident.
 */
private function isConfidentTitleMatch(string $normalizedPrompt, string $normalizedTitle): bool
{
    if ($normalizedPrompt === '' || $normalizedTitle === '') {
        return false;
    }

    // Padding both sides turns the substring test into a whole-word test.
    if (!str_contains(' ' . $normalizedPrompt . ' ', ' ' . $normalizedTitle . ' ')) {
        return false;
    }

    if (preg_match('/\d/u', $normalizedTitle) === 1) {
        return true;
    }

    // From here on the title has no digit, so a token can only be
    // significant through its length.
    $significantCount = 0;
    $titleTokens = preg_split('/\s+/u', $normalizedTitle, -1, PREG_SPLIT_NO_EMPTY) ?: [];

    foreach ($titleTokens as $titleToken) {
        if (mb_strlen($titleToken, 'UTF-8') >= 3) {
            $significantCount++;
        }
    }

    return $significantCount >= 2;
}
|
||||
|
||||
/**
 * Reduce text to a lower-cased, space-separated form for title matching.
 *
 * The input is trimmed and lower-cased; '-', '/' and '_' become spaces;
 * every run of characters that is neither a letter, a number nor whitespace
 * becomes one space; whitespace runs collapse to one space; the result is
 * trimmed. If a regex step fails, the result of the previous step is kept.
 *
 * @param string $value Raw text.
 *
 * @return string Normalized text, possibly ''.
 */
private function normalizeText(string $value): string
{
    $result = mb_strtolower(trim($value), 'UTF-8');
    $result = str_replace(['-', '/', '_'], ' ', $result);

    $patterns = [
        '/[^\p{L}\p{N}\s]+/u',
        '/\s+/u',
    ];

    foreach ($patterns as $pattern) {
        $replaced = preg_replace($pattern, ' ', $result);

        if ($replaced !== null) {
            $result = $replaced;
        }
    }

    return trim($result);
}
|
||||
}
|
||||
@@ -23,6 +23,7 @@ use RuntimeException;
|
||||
* Main responsibilities:
|
||||
* - detect high-level request intent
|
||||
* - optionally short-circuit to catalog list output
|
||||
* - resolve exact document-title matches before semantic retrieval
|
||||
* - run vector retrieval globally and optionally document-scoped
|
||||
* - fuse both result sets with RRF-style scoring
|
||||
* - apply selection rules for list queries vs. sales-style queries
|
||||
@@ -30,6 +31,15 @@ use RuntimeException;
|
||||
*/
|
||||
final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
/**
|
||||
* When one document clearly dominates the top-ranked window,
|
||||
* temporarily switch from "spread" mode to "dominant document" mode.
|
||||
*/
|
||||
private const DOMINANT_DOC_WINDOW = 6;
|
||||
private const DOMINANT_DOC_MIN_HITS = 3;
|
||||
private const DOMINANT_DOC_MAX_CHUNKS = 4;
|
||||
private const EXACT_DOCUMENT_MAX_CHUNKS = 6;
|
||||
|
||||
public function __construct(
|
||||
private NdjsonChunkLookup $lookup,
|
||||
private VectorSearchClient $vectorClient,
|
||||
@@ -58,6 +68,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
* - executes the full orchestration pipeline
|
||||
* - if the route resolves to a catalog list, returns the catalog block only
|
||||
* - otherwise returns the selected chunk texts
|
||||
*
|
||||
* @throws Exception
|
||||
*/
|
||||
public function retrieve(string $prompt): array
|
||||
@@ -65,12 +76,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$config = $this->requireConfig();
|
||||
$result = $this->execute($prompt, $config, false);
|
||||
|
||||
// Catalog list responses bypass normal chunk retrieval completely.
|
||||
if ($result['catalogBlock'] !== null) {
|
||||
return [$result['catalogBlock']];
|
||||
}
|
||||
|
||||
// No selected chunks means no usable retrieval result.
|
||||
if ($result['selectedChunkIds'] === []) {
|
||||
return [];
|
||||
}
|
||||
@@ -90,6 +99,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
* - fused RRF scores
|
||||
* - intent / route information
|
||||
* - threshold and list-query flags
|
||||
*
|
||||
* @throws Exception
|
||||
*/
|
||||
public function retrieveDebug(string $prompt, ?ModelGenerationConfig $config = null): array
|
||||
@@ -97,12 +107,12 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$config = $config ?? $this->requireConfig();
|
||||
$result = $this->execute($prompt, $config, true);
|
||||
|
||||
// For catalog list routes we expose a synthetic debug row.
|
||||
if ($result['catalogBlock'] !== null) {
|
||||
return [[
|
||||
'rank' => 1,
|
||||
'chunk_id' => '__CATALOG_LIST__',
|
||||
'document_id' => null,
|
||||
'chunk_index' => null,
|
||||
'raw_score' => null,
|
||||
'rrf_score' => null,
|
||||
'threshold' => 0.0,
|
||||
@@ -110,6 +120,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
'route' => $result['route'],
|
||||
'entity_label' => $result['entityLabel'],
|
||||
'is_list_query' => true,
|
||||
'selection_mode' => 'catalog_list',
|
||||
'text' => $result['catalogBlock'],
|
||||
]];
|
||||
}
|
||||
@@ -122,8 +133,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$rank = 0;
|
||||
|
||||
foreach ($result['selectedChunkIds'] as $chunkId) {
|
||||
|
||||
// Skip ids that could not be resolved to real chunk rows.
|
||||
if (!isset($result['rows'][$chunkId])) {
|
||||
continue;
|
||||
}
|
||||
@@ -134,6 +143,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
'rank' => $rank,
|
||||
'chunk_id' => $chunkId,
|
||||
'document_id' => $result['rows'][$chunkId]['document_id'] ?? null,
|
||||
'chunk_index' => $result['rows'][$chunkId]['chunk_index'] ?? null,
|
||||
'raw_score' => $result['rawScores'][$chunkId] ?? null,
|
||||
'rrf_score' => $result['rrfScores'][$chunkId] ?? null,
|
||||
'threshold' => $result['threshold'],
|
||||
@@ -141,6 +151,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
'route' => $result['route'],
|
||||
'entity_label' => $result['entityLabel'],
|
||||
'is_list_query' => $result['isListQuery'],
|
||||
'selection_mode' => $result['selectionMode'],
|
||||
'text' => trim((string)$result['rows'][$chunkId]['text']),
|
||||
];
|
||||
}
|
||||
@@ -159,8 +170,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
* 1. Detect catalog entity and sales intent
|
||||
* 2. Resolve route
|
||||
* 3. If route is a catalog list route, try direct catalog output
|
||||
* 4. Otherwise, run the normal hybrid retrieval core
|
||||
* 5. Select final chunk ids depending on query type
|
||||
* 4. If prompt matches one exact document title, use exact-document fast path
|
||||
* 5. Otherwise, run the normal hybrid retrieval core
|
||||
* 6. Select final chunk ids depending on query type
|
||||
*
|
||||
* @throws Exception
|
||||
*/
|
||||
private function execute(
|
||||
@@ -169,16 +182,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
bool $withScores
|
||||
): array
|
||||
{
|
||||
|
||||
$entityLabel = $this->catalogIntent->detect($prompt);
|
||||
$salesIntent = $this->detectSalesIntent($prompt);
|
||||
$route = $this->routeResolver->resolve($salesIntent, $entityLabel);
|
||||
|
||||
// Fast path:
|
||||
// If the route explicitly asks for a catalog list and we have an entity label,
|
||||
// we return a prebuilt catalog block instead of semantic chunk retrieval.
|
||||
if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
|
||||
|
||||
$catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);
|
||||
|
||||
if ($catalogBlock !== null) {
|
||||
@@ -187,6 +195,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
'entityLabel' => $entityLabel,
|
||||
'intent' => $salesIntent,
|
||||
'isListQuery' => true,
|
||||
'selectionMode' => 'catalog_list',
|
||||
'selectedChunkIds' => [],
|
||||
'rows' => [],
|
||||
'rrfScores' => [],
|
||||
@@ -197,15 +206,40 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
}
|
||||
|
||||
$exactDocumentMatch = $this->lookup->findBestExactDocumentByPrompt($prompt);
|
||||
|
||||
if ($exactDocumentMatch !== null) {
|
||||
$selectedChunkIds = $this->selectExactDocumentChunkIds(
|
||||
$exactDocumentMatch['rows'],
|
||||
max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS))
|
||||
);
|
||||
|
||||
if ($selectedChunkIds !== []) {
|
||||
return [
|
||||
'route' => $route,
|
||||
'entityLabel' => $entityLabel,
|
||||
'intent' => $salesIntent,
|
||||
'isListQuery' => false,
|
||||
'selectionMode' => 'exact_document_title',
|
||||
'selectedChunkIds' => $selectedChunkIds,
|
||||
'rows' => $exactDocumentMatch['rows'],
|
||||
'rrfScores' => $this->buildExactDocumentScores($selectedChunkIds),
|
||||
'rawScores' => [],
|
||||
'threshold' => 1.0,
|
||||
'catalogBlock' => null,
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
$core = $this->runCore($prompt, $config, $withScores, $salesIntent);
|
||||
|
||||
// No ranked chunks or no resolved rows means retrieval produced nothing usable.
|
||||
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
|
||||
return [
|
||||
'route' => $route,
|
||||
'entityLabel' => $entityLabel,
|
||||
'intent' => $salesIntent,
|
||||
'isListQuery' => $core['is_list_query'],
|
||||
'selectionMode' => null,
|
||||
'selectedChunkIds' => [],
|
||||
'rows' => [],
|
||||
'rrfScores' => [],
|
||||
@@ -215,18 +249,30 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
];
|
||||
}
|
||||
|
||||
// Selection strategy depends on query type:
|
||||
// - list queries prefer deduplicated chunks
|
||||
// - sales queries prefer spread across docs / chunk distance
|
||||
$selectedChunkIds = $core['is_list_query']
|
||||
? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
|
||||
: $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
|
||||
if ($core['is_list_query']) {
|
||||
$selectedChunkIds = $this->selectListChunkIds(
|
||||
$core['ranked_chunk_ids'],
|
||||
$core['rows'],
|
||||
$core['limit']
|
||||
);
|
||||
$selectionMode = 'list_deduplicated';
|
||||
} else {
|
||||
$salesSelection = $this->selectSalesChunkIds(
|
||||
$core['ranked_chunk_ids'],
|
||||
$core['rows'],
|
||||
$core['limit']
|
||||
);
|
||||
|
||||
$selectedChunkIds = $salesSelection['ids'];
|
||||
$selectionMode = $salesSelection['mode'];
|
||||
}
|
||||
|
||||
return [
|
||||
'route' => $route,
|
||||
'entityLabel' => $entityLabel,
|
||||
'intent' => $salesIntent,
|
||||
'isListQuery' => $core['is_list_query'],
|
||||
'selectionMode' => $selectionMode,
|
||||
'selectedChunkIds' => $selectedChunkIds,
|
||||
'rows' => $core['rows'],
|
||||
'rrfScores' => $core['rrf_scores'],
|
||||
@@ -252,6 +298,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
* - run global and optional scoped vector search
|
||||
* - fuse hits
|
||||
* - resolve chunk ids to chunk rows
|
||||
*
|
||||
* @throws Exception
|
||||
*/
|
||||
private function runCore(
|
||||
@@ -261,17 +308,14 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
string $salesIntent
|
||||
): array
|
||||
{
|
||||
|
||||
$limit = max(1, min($config->getRetrievalMaxChunks(), NdjsonHybridRetrieverConfig::HARD_MAX_CHUNKS));
|
||||
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
|
||||
|
||||
$isListQuery = $this->intentLite->isListQuery($prompt);
|
||||
|
||||
// The prompt is normalized first, then enriched before retrieval.
|
||||
$cleanQuery = $this->queryCleaner->clean($prompt);
|
||||
$cleanQuery = $this->queryEnricher->enrichPrompt($cleanQuery);
|
||||
|
||||
// Empty cleaned query means retrieval would be meaningless.
|
||||
if ($cleanQuery === '') {
|
||||
return [
|
||||
'limit' => $limit,
|
||||
@@ -290,22 +334,21 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$vectorTopKBase
|
||||
);
|
||||
|
||||
// Tag routing tries to narrow retrieval to relevant document ids.
|
||||
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
||||
$candidateDocIds = is_array($candidateDocIds)
|
||||
? array_values(array_unique(array_filter($candidateDocIds, 'is_string')))
|
||||
? array_values(array_unique(array_filter(
|
||||
$candidateDocIds,
|
||||
static fn(mixed $value): bool => is_string($value) && $value !== ''
|
||||
)))
|
||||
: [];
|
||||
|
||||
// Always run a global search.
|
||||
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
|
||||
|
||||
// Optionally run a scoped search if tag routing yielded document candidates.
|
||||
$scopedHits = [];
|
||||
if ($candidateDocIds !== []) {
|
||||
$scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
|
||||
}
|
||||
|
||||
// Nothing found at all.
|
||||
if ($globalHits === [] && $scopedHits === []) {
|
||||
return [
|
||||
'limit' => $limit,
|
||||
@@ -318,25 +361,19 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
];
|
||||
}
|
||||
|
||||
// Fuse global and scoped hits with optional scoped boost.
|
||||
$fused = $this->fuseHits(
|
||||
$globalHits,
|
||||
$scopedHits,
|
||||
$threshold,
|
||||
$salesIntent === SalesIntentLite::OBJECTION,
|
||||
$scopedHits !== [],
|
||||
$withScores
|
||||
);
|
||||
|
||||
$rrfScores = $fused['rrf_scores'];
|
||||
$rawScores = $fused['raw_scores'];
|
||||
|
||||
// Fallback:
|
||||
// If all hits were filtered by threshold but global hits exist,
|
||||
// derive a weak RRF ranking from the raw hit order.
|
||||
if ($rrfScores === [] && $globalHits !== []) {
|
||||
$rrfScores = $this->fallbackRrfFromHits(
|
||||
$globalHits
|
||||
);
|
||||
$rrfScores = $this->fallbackRrfFromHits($globalHits);
|
||||
}
|
||||
|
||||
if ($rrfScores === []) {
|
||||
@@ -351,11 +388,9 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
];
|
||||
}
|
||||
|
||||
// Highest fused score first.
|
||||
arsort($rrfScores);
|
||||
$rankedChunkIds = array_keys($rrfScores);
|
||||
|
||||
// Resolve the ranking to actual NDJSON chunk rows.
|
||||
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
|
||||
|
||||
return [
|
||||
@@ -381,9 +416,11 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
private function requireConfig(): ModelGenerationConfig
{
    // Resolve the active generation config; a missing config is reported
    // as an exception instead of being handed back as null.
    return $this->configRepository->findActiveForModel()
        ?? throw new RuntimeException('No active ModelGenerationConfig found.');
}
|
||||
|
||||
@@ -395,6 +432,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
private function detectSalesIntent(string $prompt): string
{
    // Read the 'intent' entry of the detector result; when it is absent
    // (or null) the DISCOVERY intent is used instead.
    return (string)(
        $this->salesIntentLite->detect($prompt)['intent'] ?? SalesIntentLite::DISCOVERY
    );
}
|
||||
|
||||
@@ -412,7 +450,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
int $vectorTopKBase
|
||||
): array
|
||||
{
|
||||
|
||||
$threshold = NdjsonHybridRetrieverConfig::VECTOR_SCORE_THRESHOLD;
|
||||
$topK = $vectorTopKBase;
|
||||
|
||||
@@ -428,7 +465,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
$topK = max(1, min($topK, NdjsonHybridRetrieverConfig::HARD_MAX_VECTORK));
|
||||
$threshold = max(NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR, min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold));
|
||||
$threshold = max(
|
||||
NdjsonHybridRetrieverConfig::THRESHOLD_FLOOR,
|
||||
min(NdjsonHybridRetrieverConfig::THRESHOLD_CEIL, $threshold)
|
||||
);
|
||||
|
||||
return [$threshold, $topK];
|
||||
}
|
||||
@@ -450,31 +490,25 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
bool $captureRaw
|
||||
): array
|
||||
{
|
||||
|
||||
$rrfScores = [];
|
||||
$rawScores = [];
|
||||
|
||||
$apply = function (array $hits, bool $boost) use (&$rrfScores, &$rawScores, $threshold, $captureRaw): void {
|
||||
|
||||
$rank = 0;
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
|
||||
// Every hit must provide a chunk id and a numeric score.
|
||||
if (!isset($hit['chunk_id'], $hit['score'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$raw = (float)$hit['score'];
|
||||
|
||||
// Threshold is applied before rank fusion.
|
||||
if ($raw < $threshold) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$chunkId = (string)$hit['chunk_id'];
|
||||
|
||||
// Store the best raw score per chunk for debug inspection.
|
||||
if ($captureRaw) {
|
||||
$rawScores[$chunkId] = max($rawScores[$chunkId] ?? 0.0, $raw);
|
||||
}
|
||||
@@ -482,12 +516,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$rank++;
|
||||
$rrf = 1.0 / (NdjsonHybridRetrieverConfig::RRF_K + $rank);
|
||||
|
||||
// Scoped result lists can get a slight relevance bonus.
|
||||
if ($boost) {
|
||||
$rrf *= 1.2;
|
||||
}
|
||||
|
||||
// Scores from multiple hit lists accumulate.
|
||||
$rrfScores[$chunkId] = ($rrfScores[$chunkId] ?? 0.0) + $rrf;
|
||||
}
|
||||
};
|
||||
@@ -513,7 +545,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$rank = 0;
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
|
||||
if (!isset($hit['chunk_id'])) {
|
||||
continue;
|
||||
}
|
||||
@@ -529,6 +560,68 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return $rrf;
|
||||
}
|
||||
|
||||
/**
 * Selects a coherent chunk window from one exact document title match.
 *
 * For exact product questions we prefer a pure document slice over
 * cross-document fusion to avoid mixing neighbouring product families.
 *
 * Rows are ordered by their integer `chunk_index`; rows without an integer
 * index sort last, and ties are broken by comparing `chunk_id` as strings.
 * Rows without a non-empty string `chunk_id` or with blank `text` are skipped.
 *
 * @param array<string,array<string,mixed>> $rows  Chunk rows of the matched document.
 * @param int                               $limit Upper bound for the selection; additionally
 *                                                 capped by self::EXACT_DOCUMENT_MAX_CHUNKS.
 * @return string[] Chunk ids in document order; empty when the effective budget is not positive.
 */
private function selectExactDocumentChunkIds(array $rows, int $limit): array
{
    $max = min($limit, self::EXACT_DOCUMENT_MAX_CHUNKS);

    // The loop below checks the budget only after appending an id, so a
    // non-positive budget has to be rejected up front.
    if ($max < 1) {
        return [];
    }

    // Rows without an integer chunk_index are pushed to the end.
    $indexOf = static fn(array $row): int => is_int($row['chunk_index'] ?? null)
        ? $row['chunk_index']
        : PHP_INT_MAX;

    // Only the values are iterated afterwards, so keys need not be preserved.
    usort($rows, static function (array $a, array $b) use ($indexOf): int {
        $aIndex = $indexOf($a);
        $bIndex = $indexOf($b);

        if ($aIndex !== $bIndex) {
            return $aIndex <=> $bIndex;
        }

        return strcmp((string)($a['chunk_id'] ?? ''), (string)($b['chunk_id'] ?? ''));
    });

    $selected = [];

    foreach ($rows as $row) {
        $chunkId = $row['chunk_id'] ?? null;
        $text = trim((string)($row['text'] ?? ''));

        if (!is_string($chunkId) || $chunkId === '' || $text === '') {
            continue;
        }

        $selected[] = $chunkId;

        if (count($selected) >= $max) {
            break;
        }
    }

    return $selected;
}
|
||||
|
||||
/**
 * Derives synthetic, strictly decreasing scores for an exact-title selection.
 *
 * The chunk at zero-based position p receives 1 / (p + 1). According to the
 * original note these values exist only to keep debug output consistent.
 * If an id occurs more than once, the score of its last occurrence wins.
 *
 * @param string[] $chunkIds
 * @return array<string,float>
 */
private function buildExactDocumentScores(array $chunkIds): array
{
    // Re-index so the position reflects list order, not the caller's keys.
    $orderedIds = array_values($chunkIds);
    $scoreByChunkId = [];

    for ($position = 0, $total = count($orderedIds); $position < $total; $position++) {
        $scoreByChunkId[(string)$orderedIds[$position]] = 1.0 / ($position + 1);
    }

    return $scoreByChunkId;
}
|
||||
|
||||
/**
|
||||
* Selection strategy for list-style queries.
|
||||
*
|
||||
@@ -543,7 +636,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$out = [];
|
||||
|
||||
foreach ($chunkIds as $id) {
|
||||
|
||||
if (!isset($rows[$id]['text'])) {
|
||||
continue;
|
||||
}
|
||||
@@ -553,7 +645,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
continue;
|
||||
}
|
||||
|
||||
// Deduplicate by normalized chunk text.
|
||||
$key = md5(mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)));
|
||||
|
||||
if (isset($seen[$key])) {
|
||||
@@ -574,18 +665,242 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
/**
 * Selection strategy for sales-oriented queries.
 *
 * Goal:
 * - avoid overloading the result with chunks from the same document
 * - avoid chunks that are too close to each other in the same document
 * - preserve top-ranked relevance while improving contextual spread
 *
 * Modes returned by this method:
 * - sales_dominant_document:
 *   used when one document clearly dominates the top hit window
 *   and coherent neighbouring chunks from that document are more
 *   useful than cross-document spread; remaining slots are then
 *   filled via fillRemainingSalesChunkIds()
 *
 * - sales_spread:
 *   default mode that spreads chunks across documents and enforces
 *   distance between chunk positions of the same document
 *
 * A third mode, exact_document_title (prompt contains one exact document
 * title and the answer should stay within that document), is never returned
 * from here; presumably it is assigned by the caller's fast path.
 *
 * @param array $chunkIds Candidate chunk ids; presumably ranked best-first by the caller.
 * @param array $rows     Chunk rows keyed by chunk id.
 * @param int   $limit    Maximum number of chunk ids to select.
 * @return array{ids: array, mode: string} Selected chunk ids and the mode that produced them.
 */
private function selectSalesChunkIds(array $chunkIds, array $rows, int $limit): array
{
    $dominantDocId = $this->detectDominantTopDocument($chunkIds, $rows);

    if ($dominantDocId !== null) {
        $dominantChunkIds = $this->selectDominantDocumentChunkIds(
            $dominantDocId,
            $chunkIds,
            $rows,
            $limit
        );

        // Only commit to the dominant-document mode when it produced chunks;
        // otherwise fall through to the default spread strategy.
        if ($dominantChunkIds !== []) {
            return [
                'ids' => $this->fillRemainingSalesChunkIds(
                    $dominantChunkIds,
                    $chunkIds,
                    $rows,
                    $limit
                ),
                'mode' => 'sales_dominant_document',
            ];
        }
    }

    return [
        'ids' => $this->selectSalesChunkIdsSpread($chunkIds, $rows, $limit),
        'mode' => 'sales_spread',
    ];
}
|
||||
|
||||
/**
 * Detects whether one document clearly dominates the first ranked window.
 *
 * This is especially useful for product-sheet style documents where
 * several adjacent chunks belong together and should be passed to the model
 * as one coherent factual block.
 *
 * Only the first self::DOMINANT_DOC_WINDOW chunk ids are inspected, and only
 * chunks with non-blank text and a non-empty string document id count.
 * A document is dominant when it either
 * - reaches self::DOMINANT_DOC_MIN_HITS hits inside the window, or
 * - has at least two hits and owns both of the first two counted positions.
 *
 * @param array $chunkIds Candidate chunk ids; presumably ranked best-first by the caller.
 * @param array $rows     Chunk rows keyed by chunk id.
 * @return string|null The dominant document id, or null when no document dominates.
 */
private function detectDominantTopDocument(array $chunkIds, array $rows): ?string
{
    $docWindow = [];

    foreach (array_slice($chunkIds, 0, self::DOMINANT_DOC_WINDOW) as $chunkId) {
        if (!isset($rows[$chunkId]['text'])) {
            continue;
        }

        $text = trim((string)$rows[$chunkId]['text']);
        $docId = $rows[$chunkId]['document_id'] ?? null;

        if ($text === '' || !is_string($docId) || $docId === '') {
            continue;
        }

        $docWindow[] = $docId;
    }

    // A single usable hit cannot establish dominance.
    if (count($docWindow) < 2) {
        return null;
    }

    $counts = array_count_values($docWindow);
    // Highest hit count first; sorting is stable on PHP >= 8.0, so ties keep
    // the order in which the documents first appeared in the window.
    arsort($counts);

    $dominantKey = array_key_first($counts);

    if ($dominantKey === null) {
        return null;
    }

    // PHP stores integer-like string keys ("42") as ints, so the key has to be
    // cast back to a string. Without the cast, purely numeric document ids
    // would never match the string ids collected in $docWindow.
    $dominantDocId = (string)$dominantKey;
    $dominantCount = $counts[$dominantKey];

    if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) {
        return $dominantDocId;
    }

    // Weaker signal: the two best counted hits both belong to this document.
    if ($dominantCount >= 2 && $docWindow[0] === $dominantDocId && $docWindow[1] === $dominantDocId) {
        return $dominantDocId;
    }

    return null;
}
|
||||
|
||||
/**
 * Picks a coherent window of chunks that all belong to the dominant document.
 *
 * Approach:
 * - collect every candidate with non-blank text whose document id equals $documentId
 * - take the chunk index of the first such candidate that has an integer index as anchor
 * - keep the candidates closest to the anchor (ties: better rank first), limited to
 *   min($limit, self::DOMINANT_DOC_MAX_CHUNKS); without an anchor, keep the best-ranked ones
 * - return the kept ids ordered by chunk index (candidates without an index last)
 *
 * @param string $documentId Id of the dominant document.
 * @param array  $chunkIds   Candidate chunk ids; the array key is used as the rank.
 * @param array  $rows       Chunk rows keyed by chunk id.
 * @param int    $limit      Maximum number of chunk ids requested by the caller.
 * @return string[] Selected chunk ids, or an empty array when the document has no usable chunk.
 */
private function selectDominantDocumentChunkIds(
    string $documentId,
    array $chunkIds,
    array $rows,
    int $limit
): array
{
    $candidates = [];
    $anchor = null;

    foreach ($chunkIds as $position => $id) {
        $row = $rows[$id] ?? null;

        if (!isset($row['text'])) {
            continue;
        }

        if (trim((string)$row['text']) === '' || ($row['document_id'] ?? null) !== $documentId) {
            continue;
        }

        $index = $row['chunk_index'] ?? null;

        if (!is_int($index)) {
            $index = null;
        }

        // The first candidate carrying an integer index becomes the anchor.
        $anchor ??= $index;

        $candidates[] = [
            'id' => (string)$id,
            'rank' => $position,
            'chunk_index' => $index,
        ];
    }

    if ($candidates === []) {
        return [];
    }

    $budget = min($limit, self::DOMINANT_DOC_MAX_CHUNKS);

    $byRank = static fn(array $left, array $right): int => $left['rank'] <=> $right['rank'];

    if ($anchor === null) {
        usort($candidates, $byRank);
    } else {
        // Closest to the anchor first; candidates without an index go last.
        usort($candidates, static function (array $left, array $right) use ($anchor, $byRank): int {
            $leftGap = $left['chunk_index'] === null ? PHP_INT_MAX : abs($left['chunk_index'] - $anchor);
            $rightGap = $right['chunk_index'] === null ? PHP_INT_MAX : abs($right['chunk_index'] - $anchor);

            return $leftGap === $rightGap
                ? $byRank($left, $right)
                : ($leftGap <=> $rightGap);
        });
    }

    $window = array_slice($candidates, 0, $budget);

    // Restore reading order for the prompt: ascending chunk index, unknown
    // indices at the end, rank as the final tie-breaker.
    usort($window, static function (array $left, array $right) use ($byRank): int {
        $leftIndex = $left['chunk_index'];
        $rightIndex = $right['chunk_index'];

        return match (true) {
            $leftIndex === null && $rightIndex === null => $byRank($left, $right),
            $leftIndex === null => 1,
            $rightIndex === null => -1,
            $leftIndex !== $rightIndex => $leftIndex <=> $rightIndex,
            default => $byRank($left, $right),
        };
    });

    return array_column($window, 'id');
}
|
||||
|
||||
/**
|
||||
* Fills the remaining sales slots after a dominant document selection.
|
||||
*
|
||||
* The already selected dominant-document chunks stay fixed.
|
||||
* Remaining slots are filled with the normal spread strategy.
|
||||
*/
|
||||
private function fillRemainingSalesChunkIds(
|
||||
array $seedChunkIds,
|
||||
array $chunkIds,
|
||||
array $rows,
|
||||
int $limit
|
||||
): array
|
||||
{
|
||||
$out = array_values(array_unique(array_map('strval', $seedChunkIds)));
|
||||
|
||||
if (count($out) >= $limit) {
|
||||
return array_slice($out, 0, $limit);
|
||||
}
|
||||
|
||||
$selected = array_fill_keys($out, true);
|
||||
$docCounter = [];
|
||||
$docChunkPositions = [];
|
||||
|
||||
foreach ($out as $chunkId) {
|
||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||
$chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
|
||||
|
||||
if (is_string($docId) && $docId !== '') {
|
||||
$docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
|
||||
|
||||
if (is_int($chunkIndex)) {
|
||||
$docChunkPositions[$docId][] = $chunkIndex;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach ($chunkIds as $chunkId) {
|
||||
if (isset($selected[$chunkId])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!isset($rows[$chunkId]['text'])) {
|
||||
continue;
|
||||
@@ -594,23 +909,80 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||
$chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
|
||||
|
||||
// Sales selection requires a valid document context.
|
||||
if (!is_string($docId)) {
|
||||
if (!is_string($docId) || $docId === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Limit how many chunks may come from the same document.
|
||||
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Enforce a minimum distance between chunk positions of the same document.
|
||||
if (is_int($chunkIndex)) {
|
||||
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
|
||||
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
|
||||
continue 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$text = trim((string)$rows[$chunkId]['text']);
|
||||
if ($text === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$out[] = (string)$chunkId;
|
||||
$selected[$chunkId] = true;
|
||||
$docCounter[$docId] = ($docCounter[$docId] ?? 0) + 1;
|
||||
|
||||
if (is_int($chunkIndex)) {
|
||||
$docChunkPositions[$docId][] = $chunkIndex;
|
||||
}
|
||||
|
||||
if (count($out) >= $limit) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return $out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Default spread selection for sales-oriented queries.
|
||||
*
|
||||
* Goal:
|
||||
* - avoid overloading the result with chunks from the same document
|
||||
* - avoid chunks that are too close to each other in the same document
|
||||
* - preserve top-ranked relevance while improving contextual spread
|
||||
*/
|
||||
private function selectSalesChunkIdsSpread(array $chunkIds, array $rows, int $limit): array
|
||||
{
|
||||
$out = [];
|
||||
$docCounter = [];
|
||||
$docChunkPositions = [];
|
||||
|
||||
foreach ($chunkIds as $chunkId) {
|
||||
if (!isset($rows[$chunkId]['text'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$docId = $rows[$chunkId]['document_id'] ?? null;
|
||||
$chunkIndex = $rows[$chunkId]['chunk_index'] ?? null;
|
||||
|
||||
if (!is_string($docId) || $docId === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (($docCounter[$docId] ?? 0) >= NdjsonHybridRetrieverConfig::MAX_CHUNKS_PER_DOC) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (is_int($chunkIndex)) {
|
||||
foreach ($docChunkPositions[$docId] ?? [] as $prevIdx) {
|
||||
if (abs($prevIdx - $chunkIndex) < NdjsonHybridRetrieverConfig::MIN_CHUNK_DISTANCE) {
|
||||
continue 2;
|
||||
}
|
||||
}
|
||||
|
||||
$docChunkPositions[$docId][] = $chunkIndex;
|
||||
}
|
||||
|
||||
@@ -638,7 +1010,6 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$out = [];
|
||||
|
||||
foreach ($chunkIds as $id) {
|
||||
|
||||
if (!isset($rows[$id]['text'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user