Remove direct chunk search; use only vector search
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
<?php
|
||||
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
@@ -9,34 +8,26 @@ use App\Vector\VectorSearchClient;
|
||||
|
||||
final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
private const VECTOR_SCORE_THRESHOLD = 0.65;
|
||||
private const VECTOR_SCORE_THRESHOLD = 0.25;
|
||||
|
||||
public function __construct(
|
||||
private readonly NdjsonKeywordSearch $keywordSearch,
|
||||
private readonly NdjsonChunkLookup $lookup,
|
||||
private readonly VectorSearchClient $vectorClient,
|
||||
private readonly int $maxChunks = 3,
|
||||
private readonly int $vectorTopK = 5,
|
||||
private readonly int $maxChunks = 10,
|
||||
private readonly int $vectorTopK = 10,
|
||||
)
|
||||
{
|
||||
}
|
||||
|
||||
public function retrieve(string $prompt, int $limit = null): array
|
||||
{
|
||||
$limit ??= $this->maxChunks;
|
||||
$limit = $this->maxChunks;
|
||||
$keywordChunks = [];
|
||||
|
||||
$terms = $this->extractTerms($prompt);
|
||||
|
||||
// 1) Keyword first
|
||||
$keywordChunks = $this->keywordSearch->search($terms, $limit);
|
||||
if (\count($keywordChunks) >= $limit) {
|
||||
return array_slice($keywordChunks, 0, $limit);
|
||||
}
|
||||
|
||||
// 2) Vector fallback / enrichment
|
||||
// Vector / enrichment
|
||||
$hits = $this->vectorClient->search($prompt, $this->vectorTopK);
|
||||
if ($hits === []) {
|
||||
return $keywordChunks;
|
||||
return $this->diversifyByDevice($keywordChunks, $limit, 1);
|
||||
}
|
||||
|
||||
$chunkIds = [];
|
||||
@@ -51,7 +42,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
}
|
||||
|
||||
if ($chunkIds === []) {
|
||||
return $keywordChunks;
|
||||
return $this->diversifyByDevice($keywordChunks, $limit, 1);
|
||||
}
|
||||
|
||||
$rows = $this->lookup->findByChunkIds($chunkIds);
|
||||
@@ -63,9 +54,9 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$keywordChunks[] = trim($rows[$id]['text']);
|
||||
}
|
||||
|
||||
// dedupe + limit
|
||||
// dedupe
|
||||
$seen = [];
|
||||
$out = [];
|
||||
$deduped = [];
|
||||
|
||||
foreach ($keywordChunks as $chunk) {
|
||||
$key = mb_strtolower(preg_replace('/\s+/u', ' ', $chunk));
|
||||
@@ -73,20 +64,13 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
continue;
|
||||
}
|
||||
$seen[$key] = true;
|
||||
$out[] = $chunk;
|
||||
if (\count($out) >= $limit) {
|
||||
break;
|
||||
}
|
||||
$deduped[] = $chunk;
|
||||
}
|
||||
|
||||
return $out;
|
||||
// diversify
|
||||
return $this->diversifyByDevice($deduped, $limit, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* minimal term extraction (we keep your old behavior)
|
||||
*
|
||||
* @return string[]
|
||||
*/
|
||||
private function extractTerms(string $text): array
|
||||
{
|
||||
$text = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));
|
||||
@@ -96,4 +80,41 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
static fn(string $w) => mb_strlen($w) > 2
|
||||
));
|
||||
}
|
||||
|
||||
/**
 * Returns the first line of a chunk with surrounding whitespace removed.
 *
 * Everything before the first "\n" counts as the first line; a chunk
 * without any line break is treated as a single line.
 */
private function extractDevice(string $chunk): string
{
    $lineBreak = strpos($chunk, "\n");
    $header = $lineBreak === false ? $chunk : substr($chunk, 0, $lineBreak);

    return trim($header);
}
|
||||
|
||||
/**
 * Limits how many chunks may share the same device label and truncates the
 * result to at most $limit chunks.
 *
 * The label of a chunk is whatever extractDevice() returns for it. Chunks are
 * visited in input order, so earlier chunks win over later ones with the same
 * label. Chunks whose label is an empty string are skipped.
 *
 * @param string[] $chunks       candidate chunks, in priority order
 * @param int      $limit        maximum number of chunks to return; a
 *                               non-positive value yields an empty list
 * @param int      $maxPerDevice maximum number of chunks kept per label
 *
 * @return string[]
 */
private function diversifyByDevice(array $chunks, int $limit, int $maxPerDevice = 1): array
{
    // The limit check inside the loop only runs after a chunk has been
    // appended, so without this guard a limit of 0 would still return one chunk.
    if ($limit <= 0) {
        return [];
    }

    $perDevice = [];
    $selected = [];

    foreach ($chunks as $chunk) {
        $device = $this->extractDevice($chunk);

        if ($device === '') {
            continue;
        }

        $used = $perDevice[$device] ?? 0;

        if ($used >= $maxPerDevice) {
            continue;
        }

        $selected[] = $chunk;
        $perDevice[$device] = $used + 1;

        if (\count($selected) >= $limit) {
            break;
        }
    }

    return $selected;
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user