new version ndjson

This commit is contained in:
team 1
2026-02-12 11:22:56 +01:00
parent 0bb0c0b42f
commit 5a52e07edc
10 changed files with 375 additions and 492 deletions

View File

@@ -0,0 +1,99 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Vector\VectorSearchClient;
/**
 * Hybrid retriever: runs a keyword search first and, when that yields fewer
 * chunks than requested, tops the result up with texts resolved from
 * vector-search hits.
 */
final class NdjsonHybridRetriever implements RetrieverInterface
{
    /** Vector hits with a score below this value are ignored. */
    private const VECTOR_SCORE_THRESHOLD = 0.65;

    /**
     * @param int $maxChunks  Default number of chunks returned when retrieve() gets no limit.
     * @param int $vectorTopK Number of hits requested from the vector client.
     */
    public function __construct(
        private readonly NdjsonKeywordSearch $keywordSearch,
        private readonly NdjsonChunkLookup $lookup,
        private readonly VectorSearchClient $vectorClient,
        private readonly int $maxChunks = 3,
        private readonly int $vectorTopK = 5,
    ) {
    }

    /**
     * Returns up to $limit chunks for the prompt.
     *
     * Keyword results come first; vector results are appended only when the
     * keyword search did not fill the limit. The merged list is de-duplicated
     * on a case- and whitespace-insensitive key.
     *
     * @param string   $prompt Free-text query.
     * @param int|null $limit  Maximum number of chunks; null falls back to $maxChunks.
     *
     * @return array Chunk texts (presumably strings from the keyword search — confirm against NdjsonKeywordSearch).
     */
    public function retrieve(string $prompt, ?int $limit = null): array
    {
        $limit ??= $this->maxChunks;

        // A non-positive limit can never be satisfied; a negative one would
        // otherwise make array_slice() drop elements from the end.
        if ($limit <= 0) {
            return [];
        }

        // 1) Keyword first
        $chunks = $this->keywordSearch->search($this->extractTerms($prompt), $limit);
        if (\count($chunks) >= $limit) {
            return array_slice($chunks, 0, $limit);
        }

        // 2) Vector fallback / enrichment
        $hits = $this->vectorClient->search($prompt, $this->vectorTopK);
        if ($hits === []) {
            return $chunks;
        }

        $chunkIds = $this->selectChunkIds($hits);
        if ($chunkIds === []) {
            return $chunks;
        }

        $rows = $this->lookup->findByChunkIds($chunkIds);
        foreach ($chunkIds as $id) {
            $text = $rows[$id]['text'] ?? null;
            if (!is_string($text)) {
                continue;
            }

            $text = trim($text);
            if ($text === '') {
                // Whitespace-only rows carry no content worth returning.
                continue;
            }

            $chunks[] = $text;
        }

        return $this->dedupe($chunks, $limit);
    }

    /**
     * Picks the chunk ids of hits that carry both fields and meet the score threshold.
     *
     * @param iterable $hits Hits exposing 'chunk_id' and 'score' entries.
     *
     * @return string[] Unique chunk ids in hit order.
     */
    private function selectChunkIds(iterable $hits): array
    {
        $ids = [];
        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }
            if ((float) $hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
                continue;
            }
            $ids[] = (string) $hit['chunk_id'];
        }

        // array_unique keeps the first occurrence, so hit order is preserved.
        return array_values(array_unique($ids));
    }

    /**
     * Removes chunks that are equal after whitespace collapsing, trimming and
     * lower-casing, keeping the first occurrence, and stops at $limit entries.
     *
     * @param array $chunks Chunk texts in priority order.
     *
     * @return array De-duplicated chunks, at most $limit of them.
     */
    private function dedupe(array $chunks, int $limit): array
    {
        $seen = [];
        $out = [];
        foreach ($chunks as $chunk) {
            // preg_replace() returns null on failure (e.g. invalid UTF-8);
            // fall back to the raw chunk instead of passing null on.
            $collapsed = preg_replace('/\s+/u', ' ', $chunk) ?? $chunk;
            $key = mb_strtolower(trim($collapsed));
            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $chunk;
            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Minimal term extraction: strips everything except letters, digits and
     * whitespace, lower-cases the text and keeps words longer than two characters.
     *
     * @return string[]
     */
    private function extractTerms(string $text): array
    {
        $normalized = mb_strtolower((string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));

        // Split on any whitespace run: the filter above keeps tabs and
        // newlines, which a plain explode(' ') would leave inside the terms.
        $words = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            return [];
        }

        return array_values(array_filter(
            $words,
            static fn (string $w): bool => mb_strlen($w) > 2,
        ));
    }
}