new version ndjson
This commit is contained in:
44
src/Knowledge/Retrieval/NdjsonChunkLookup.php
Normal file
44
src/Knowledge/Retrieval/NdjsonChunkLookup.php
Normal file
@@ -0,0 +1,44 @@
|
||||
<?php
|
||||
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Knowledge\ChunkManager;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
/**
 * Looks up chunk rows by their chunk id by scanning the rows yielded by
 * ChunkManager::streamAll().
 */
final class NdjsonChunkLookup
{
    public function __construct(
        private readonly ChunkManager $chunkManager,
    ) {
    }

    /**
     * Collects the streamed rows whose 'chunk_id' is one of the requested ids.
     *
     * The stream is read sequentially and abandoned as soon as every distinct
     * requested id has been seen. Ids that never occur in the stream are simply
     * absent from the result. Rows without a string 'chunk_id' are ignored.
     *
     * @param string[] $chunkIds RFC4122 UUID strings
     * @return array<string,array<string,mixed>> keyed by chunk_id
     */
    public function findByChunkIds(array $chunkIds): array
    {
        // Nothing requested: without this guard the early exit below can never
        // trigger (0 found rows are never counted), so the whole stream would
        // be read just to return an empty result.
        if ($chunkIds === []) {
            return [];
        }

        // Ids become array keys for O(1) membership checks; duplicates collapse.
        $wanted = array_fill_keys($chunkIds, true);
        $wantedCount = \count($wanted);
        $found = [];

        foreach ($this->chunkManager->streamAll() as $row) {
            $id = $row['chunk_id'] ?? null;
            if (!is_string($id) || !isset($wanted[$id])) {
                continue;
            }

            $found[$id] = $row;

            // Early exit as soon as every requested id has been found.
            if (\count($found) === $wantedCount) {
                break;
            }
        }

        return $found;
    }
}
|
||||
99
src/Knowledge/Retrieval/NdjsonHybridRetriever.php
Normal file
99
src/Knowledge/Retrieval/NdjsonHybridRetriever.php
Normal file
@@ -0,0 +1,99 @@
|
||||
<?php
|
||||
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Vector\VectorSearchClient;
|
||||
|
||||
/**
 * Retriever that asks the keyword search first and only falls back to the
 * vector search client when the keyword search did not fill the requested limit.
 */
final class NdjsonHybridRetriever implements RetrieverInterface
{
    /** Vector hits with a score below this value are discarded. */
    private const VECTOR_SCORE_THRESHOLD = 0.65;

    public function __construct(
        private readonly NdjsonKeywordSearch $keywordSearch,
        private readonly NdjsonChunkLookup $lookup,
        private readonly VectorSearchClient $vectorClient,
        private readonly int $maxChunks = 3,
        private readonly int $vectorTopK = 5,
    ) {
    }

    /**
     * Returns up to $limit chunk texts for the prompt.
     *
     * Keyword results come first; if there are fewer than $limit of them, the
     * texts of sufficiently scored vector hits are appended (in the order the
     * vector client returned them), then the combined list is de-duplicated
     * and truncated.
     *
     * @param string   $prompt free-text query
     * @param int|null $limit  maximum number of chunks; null falls back to the
     *                         configured $maxChunks
     * @return string[] chunk texts
     */
    public function retrieve(string $prompt, ?int $limit = null): array
    {
        $limit ??= $this->maxChunks;

        // A non-positive limit can only ever yield an empty list, so skip both
        // the keyword scan and the vector call.
        if ($limit <= 0) {
            return [];
        }

        // 1) Keyword first
        $keywordChunks = $this->keywordSearch->search($this->extractTerms($prompt), $limit);
        if (\count($keywordChunks) >= $limit) {
            return array_slice($keywordChunks, 0, $limit);
        }

        // 2) Vector fallback / enrichment
        $hits = $this->vectorClient->search($prompt, $this->vectorTopK);
        if ($hits === []) {
            return $keywordChunks;
        }

        $chunkIds = [];
        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }
            if ((float) $hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
                continue;
            }
            $chunkIds[] = (string) $hit['chunk_id'];
        }

        if ($chunkIds === []) {
            return $keywordChunks;
        }

        $rows = $this->lookup->findByChunkIds($chunkIds);

        // Iterate over $chunkIds (not $rows) so the vector hit order is kept.
        $candidates = $keywordChunks;
        foreach ($chunkIds as $id) {
            $text = $rows[$id]['text'] ?? null;
            if (!is_string($text)) {
                continue;
            }
            $candidates[] = trim($text);
        }

        // dedupe + limit
        $seen = [];
        $out = [];

        foreach ($candidates as $chunk) {
            $key = $this->dedupeKey($chunk);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
            $out[] = $chunk;

            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Minimal term extraction: strips everything except letters, digits and
     * whitespace, lowercases, splits on whitespace and keeps words longer than
     * two characters.
     *
     * @return string[]
     */
    private function extractTerms(string $text): array
    {
        $normalized = mb_strtolower((string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));

        // Split on any whitespace run: the pattern above keeps tabs and
        // newlines, so splitting on a single space would glue words on
        // different lines into one term.
        $words = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            return [];
        }

        return array_values(array_filter(
            $words,
            static fn (string $word): bool => mb_strlen($word) > 2,
        ));
    }

    /**
     * Builds the comparison key used for de-duplication: whitespace runs are
     * collapsed to a single space and the text is lowercased.
     */
    private function dedupeKey(string $chunk): string
    {
        // preg_replace() returns null on a PCRE error (e.g. malformed UTF-8
        // with the /u modifier); fall back to the raw text instead of passing
        // null to mb_strtolower(), which is a TypeError under strict_types.
        return mb_strtolower(preg_replace('/\s+/u', ' ', $chunk) ?? $chunk);
    }
}
|
||||
101
src/Knowledge/Retrieval/NdjsonKeywordSearch.php
Normal file
101
src/Knowledge/Retrieval/NdjsonKeywordSearch.php
Normal file
@@ -0,0 +1,101 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Knowledge\ChunkManager;
|
||||
use App\Knowledge\StopWords;
|
||||
|
||||
/**
 * Keyword search that scores every row yielded by ChunkManager::streamAll()
 * against a list of terms and returns the best matching chunk texts.
 */
final class NdjsonKeywordSearch
{
    public function __construct(
        private readonly ChunkManager $chunkManager,
        private readonly StopWords $stopWords,
    ) {
    }

    /**
     * Streaming keyword search over the chunk index (index.ndjson).
     *
     * Empty terms and stop words are dropped, every streamed row with a
     * non-empty string 'text' is scored, and the highest scoring texts are
     * returned after de-duplication (case- and whitespace-insensitive).
     *
     * @param string[] $terms        search terms (already lowercased)
     * @param int      $limit        maximum number of chunks to return
     * @param int      $candidateCap maximum number of scored candidates kept
     *                               in memory while streaming
     * @return string[] best chunks, highest score first
     */
    public function search(array $terms, int $limit = 3, int $candidateCap = 200): array
    {
        // No terms, no room for results or no room for candidates: the answer
        // is always empty, so do not stream the index at all. (Previously a
        // non-positive $limit still returned one chunk.)
        if ($terms === [] || $limit <= 0 || $candidateCap <= 0) {
            return [];
        }

        // Fetch the stop word list once instead of once per term.
        $stopWords = $this->stopWords->getStopWords();
        $terms = array_values(array_filter(
            $terms,
            static fn (string $term): bool => $term !== '' && !\in_array($term, $stopWords, true),
        ));

        if ($terms === []) {
            return [];
        }

        // Bounded candidate list, re-sorted and trimmed whenever it overflows.
        $best = [];
        // Lowest score still inside the trimmed list; null until the first trim.
        $floor = null;

        foreach ($this->chunkManager->streamAll() as $row) {
            $text = $row['text'] ?? null;
            if (!is_string($text) || $text === '') {
                continue;
            }

            $score = $this->scoreText($text, $terms);
            if ($score <= 0) {
                continue;
            }

            // Once the list is full, a candidate that does not beat the current
            // floor would be sorted last (usort is stable) and sliced off again,
            // so skip the append + sort entirely.
            if ($floor !== null && $score <= $floor) {
                continue;
            }

            $best[] = ['score' => $score, 'text' => trim($text)];

            // keep array bounded to avoid memory spikes
            if (\count($best) > $candidateCap) {
                usort($best, static fn (array $a, array $b): int => $b['score'] <=> $a['score']);
                $best = array_slice($best, 0, $candidateCap);
                $floor = $best[$candidateCap - 1]['score'];
            }
        }

        if ($best === []) {
            return [];
        }

        usort($best, static fn (array $a, array $b): int => $b['score'] <=> $a['score']);

        $out = [];
        $seen = [];

        foreach ($best as $candidate) {
            $key = $this->dedupeKey($candidate['text']);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
            $out[] = $candidate['text'];

            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Simple scoring: each term contained in the text counts once; terms of
     * ten or more characters count double.
     *
     * @param string[] $terms lowercased terms; empty strings are ignored
     */
    private function scoreText(string $text, array $terms): int
    {
        $content = mb_strtolower($text);
        $score = 0;

        foreach ($terms as $term) {
            // str_contains() matches '' everywhere, so empty terms must not score.
            if ($term === '') {
                continue;
            }
            if (str_contains($content, $term)) {
                $score += (mb_strlen($term) >= 10) ? 2 : 1;
            }
        }

        return $score;
    }

    /**
     * Builds the comparison key used for de-duplication: whitespace runs are
     * collapsed to a single space and the text is lowercased.
     */
    private function dedupeKey(string $text): string
    {
        // preg_replace() returns null on a PCRE error (e.g. malformed UTF-8
        // with the /u modifier); fall back to the raw text instead of passing
        // null to mb_strtolower(), which is a TypeError under strict_types.
        return mb_strtolower(preg_replace('/\s+/u', ' ', $text) ?? $text);
    }
}
|
||||
Reference in New Issue
Block a user