first commit
This commit is contained in:
42
src/Knowledge/Retrieval/CachedRetriever.php
Normal file
42
src/Knowledge/Retrieval/CachedRetriever.php
Normal file
@@ -0,0 +1,42 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use Psr\Cache\CacheItemPoolInterface;
|
||||
|
||||
/**
 * Caching decorator around another retriever.
 *
 * Results of the wrapped retriever are stored in a PSR-6 cache pool under a
 * key derived from the normalised prompt and the requested limit.
 */
final class CachedRetriever implements RetrieverInterface
{
    /**
     * @param RetrieverInterface     $inner      Retriever that is consulted on a cache miss
     * @param CacheItemPoolInterface $cache      Pool the results are stored in
     * @param int                    $ttlSeconds Lifetime passed to expiresAfter() for stored results
     */
    public function __construct(
        private RetrieverInterface $inner,
        private CacheItemPoolInterface $cache,
        private int $ttlSeconds = 600 // 10 minutes
    ) {}

    /**
     * Returns the cached chunks for the prompt/limit pair, or asks the wrapped
     * retriever and stores its answer.
     *
     * @return string[] Plain text knowledge chunks
     */
    public function retrieve(string $prompt, int $limit = 3): array
    {
        $item = $this->cache->getItem($this->buildCacheKey($prompt, $limit));

        if ($item->isHit()) {
            $cached = $item->get();

            // A payload of the wrong type would violate the array return type
            // and throw a TypeError; treat it as a miss and overwrite it below.
            if (is_array($cached)) {
                return $cached;
            }
        }

        $result = $this->inner->retrieve($prompt, $limit);

        $item->set($result);
        $item->expiresAfter($this->ttlSeconds);
        $this->cache->save($item);

        return $result;
    }

    /**
     * Builds the cache key: prompts that differ only in letter case,
     * surrounding whitespace or runs of inner whitespace map to the same key.
     */
    private function buildCacheKey(string $prompt, int $limit): string
    {
        $normalized = mb_strtolower(trim($prompt));

        // preg_replace() yields null on a PCRE error. Without the fallback every
        // such prompt would hash to the same key and share one cache entry.
        $normalized = preg_replace('/\s+/u', ' ', $normalized) ?? $normalized;

        return 'rag_retrieval_' . sha1($normalized . '|' . $limit);
    }
}
|
||||
25
src/Knowledge/Retrieval/ChunkIndexLoader.php
Normal file
25
src/Knowledge/Retrieval/ChunkIndexLoader.php
Normal file
@@ -0,0 +1,25 @@
|
||||
<?php
|
||||
// src/Knowledge/Retrieval/ChunkIndexLoader.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
/**
 * Loads the chunk index from a JSON file.
 */
final class ChunkIndexLoader
{
    /**
     * @param string $indexPath Location of the JSON index file
     */
    public function __construct(
        private string $indexPath
    ) {}

    /**
     * Decodes the index file into an array.
     *
     * An empty array is returned when the path is not a file, the file cannot
     * be read or is empty, or the content does not decode to an array.
     */
    public function load(): array
    {
        if (!is_file($this->indexPath)) {
            return [];
        }

        $contents = file_get_contents($this->indexPath);
        if ($contents === false || $contents === '') {
            return [];
        }

        $decoded = json_decode($contents, true);
        if (!is_array($decoded)) {
            return [];
        }

        return $decoded;
    }
}
|
||||
269
src/Knowledge/Retrieval/ChunkKeywordRetriever.php
Normal file
269
src/Knowledge/Retrieval/ChunkKeywordRetriever.php
Normal file
@@ -0,0 +1,269 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Knowledge\StopWords;
|
||||
use App\Knowledge\VectorSearchChunked;
|
||||
use App\Knowledge\KeywordSimilarity;
|
||||
use App\Vector\VectorSearchClient;
|
||||
|
||||
/**
 * Retrieves knowledge chunks for a prompt in up to three stages:
 * keyword matching against the chunk index, a top-up from the vector search
 * client, and finally the chunked text search when nothing matched at all.
 */
final class ChunkKeywordRetriever implements RetrieverInterface
{
    /** Maximum number of index candidates whose chunk files are read and scored. */
    private const MAX_KEYWORD_CANDIDATES = 200;

    /** Vector hits with a score below this value are ignored. */
    private const VECTOR_SCORE_THRESHOLD = 0.65;

    /** Number of hits requested from the vector search client. */
    private const VECTOR_TOP_K = 3;

    /** Minimum KeywordSimilarity score for a query term to match an index keyword. */
    private const KEYWORD_MATCH_THRESHOLD = 0.8;

    /**
     * @param string $chunksDir Directory the chunk files are read from
     * @param int    $maxChunks Limit used when retrieve() is called without one
     */
    public function __construct(
        private VectorSearchChunked $chunkedSearch,
        private ChunkIndexLoader $indexLoader,
        private StopWords $stopWords,
        private VectorSearchClient $vectorClient,
        private string $chunksDir,
        private int $maxChunks = 3,
    ) {
    }

    /**
     * {@inheritdoc}
     *
     * @param int|null $limit Maximum number of chunks; null falls back to the
     *                        configured $maxChunks. A non-positive limit yields
     *                        an empty result.
     *
     * @return string[] Plain text knowledge chunks, best score first
     */
    public function retrieve(string $prompt, ?int $limit = null): array
    {
        $limit ??= $this->maxChunks;

        if ($limit <= 0) {
            return [];
        }

        // ---------------------------------------------------------
        // 1) Prompt → search terms
        // ---------------------------------------------------------
        $queryTerms = $this->extractTerms($prompt);

        // ---------------------------------------------------------
        // 2) Keyword-based candidate discovery
        // ---------------------------------------------------------
        $result = $queryTerms !== []
            ? $this->findCandidateFiles($queryTerms)
            : ['files' => [], 'canonicalTerms' => []];

        // NOTE(review): this keeps the first candidates in index order, not the
        // best-scoring ones — confirm that is intended for large indexes.
        $candidateScores = array_slice(
            $result['files'],
            0,
            self::MAX_KEYWORD_CANDIDATES,
            true
        );

        // Canonical replacement: use the index spelling of a term where one matched.
        $effectiveTerms = array_map(
            static fn (string $term): string =>
                $result['canonicalTerms'][$term] ?? $term,
            $queryTerms
        );

        // ---------------------------------------------------------
        // 3) Keyword scoring
        // ---------------------------------------------------------
        $scored = $this->scoreKeywordCandidates($candidateScores, $effectiveTerms);

        // ---------------------------------------------------------
        // EARLY EXIT: Keyword results are sufficient
        // ---------------------------------------------------------
        if (\count($scored) >= $limit) {
            return $this->finalize($scored, $limit);
        }

        // ---------------------------------------------------------
        // 4) Vector retrieval (semantic fallback)
        // ---------------------------------------------------------
        $scored = $this->addVectorHits($scored, $prompt);

        // ---------------------------------------------------------
        // 5) Final fallback
        // ---------------------------------------------------------
        if ($scored === []) {
            return $this->fallbackSearch($prompt, $limit);
        }

        return $this->finalize($scored, $limit);
    }

    // -------------------------------------------------------------
    // KEYWORD STAGE
    // -------------------------------------------------------------

    /**
     * Reads each candidate chunk file and scores it against the terms.
     *
     * Missing, unreadable or empty files and chunks containing none of the
     * terms are skipped. The term score is weighted by the candidate's
     * index similarity score.
     *
     * @param array $candidateScores File name => index similarity score
     * @param array $terms           Terms to look for in the chunk text
     *
     * @return array File name => ['chunk' => string, 'score' => int]
     */
    private function scoreKeywordCandidates(array $candidateScores, array $terms): array
    {
        $scored = [];

        foreach ($candidateScores as $file => $similarityScore) {
            $path = $this->chunksDir . '/' . $file;
            if (!is_file($path)) {
                continue;
            }

            $chunk = file_get_contents($path);
            if ($chunk === false || $chunk === '') {
                continue;
            }

            $score = $this->scoreChunk($chunk, $terms);
            if ($score === 0) {
                continue;
            }

            $scored[$file] = [
                'chunk' => trim($chunk),
                'score' => (int) round($score * $similarityScore),
            ];
        }

        return $scored;
    }

    // -------------------------------------------------------------
    // VECTOR STAGE
    // -------------------------------------------------------------

    /**
     * Merges hits from the vector search client into the scored set.
     *
     * A hit boosts an already scored chunk, or adds its chunk file as a new
     * entry. Hits without a usable id or score, below the score threshold,
     * or whose chunk file is missing or empty are ignored.
     *
     * @param array $scored File name => ['chunk' => string, 'score' => int]
     *
     * @return array The scored set including the vector contributions
     */
    private function addVectorHits(array $scored, string $prompt): array
    {
        $vectorHits = $this->vectorClient->search($prompt, self::VECTOR_TOP_K);

        foreach ($vectorHits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }

            $chunkId = $hit['chunk_id'];
            $hitScore = $hit['score'];

            // Arithmetic on a non-numeric score would throw, and a non-scalar
            // id cannot be turned into a file name.
            if (
                (!\is_string($chunkId) && !\is_int($chunkId)) ||
                !is_numeric($hitScore) ||
                $hitScore < self::VECTOR_SCORE_THRESHOLD
            ) {
                continue;
            }

            $file = $chunkId . '.txt';
            $path = $this->chunksDir . '/' . $file;

            if (!is_file($path)) {
                continue;
            }

            $vectorBoost = (int) round($hitScore * 10);
            if ($vectorBoost <= 0) {
                continue;
            }

            if (isset($scored[$file])) {
                $scored[$file]['score'] += $vectorBoost;
                continue;
            }

            // Same rule as the keyword stage: an unreadable or empty chunk is
            // not a result.
            $raw = file_get_contents($path);
            $chunk = $raw === false ? '' : trim($raw);
            if ($chunk === '') {
                continue;
            }

            $scored[$file] = [
                'chunk' => $chunk,
                'score' => $vectorBoost,
            ];
        }

        return $scored;
    }

    // -------------------------------------------------------------
    // FINALIZATION
    // -------------------------------------------------------------

    /**
     * Orders the scored chunks by descending score, removes duplicates and
     * cuts the list to the limit.
     *
     * @param array $scored File name => ['chunk' => string, 'score' => int]
     *
     * @return string[]
     */
    private function finalize(array $scored, int $limit): array
    {
        uasort($scored, static fn (array $a, array $b): int => $b['score'] <=> $a['score']);

        return array_slice(
            $this->normalizeResults(
                array_column($scored, 'chunk')
            ),
            0,
            $limit
        );
    }

    // -------------------------------------------------------------
    // INDEX LOGIC
    // -------------------------------------------------------------

    /**
     * Finds index entries with a keyword similar to one of the terms.
     *
     * The index is decoded JSON, so malformed entries and non-string keywords
     * are skipped instead of being passed on.
     *
     * @param array $terms Query terms
     *
     * @return array ['files' => file => similarity score,
     *                'canonicalTerms' => query term => matching index keyword]
     */
    private function findCandidateFiles(array $terms): array
    {
        $index = $this->indexLoader->load();
        $files = [];
        $canonicalTerms = [];

        foreach ($index as $entry) {
            if (
                !\is_array($entry) ||
                !isset($entry['file'], $entry['keywords']) ||
                !\is_array($entry['keywords'])
            ) {
                continue;
            }

            $file = $entry['file'];
            if (!\is_string($file) && !\is_int($file)) {
                continue;
            }

            foreach ($terms as $term) {
                foreach ($entry['keywords'] as $indexKeyword) {
                    if (!\is_string($indexKeyword)) {
                        continue;
                    }

                    $score = KeywordSimilarity::compare($term, $indexKeyword);

                    if ($score >= self::KEYWORD_MATCH_THRESHOLD) {
                        $files[$file] = max(
                            $files[$file] ?? 0.0,
                            $score
                        );
                        $canonicalTerms[$term] = $indexKeyword;

                        // The first sufficiently similar pair settles this
                        // entry; its remaining terms and keywords are not
                        // compared.
                        break 2;
                    }
                }
            }
        }

        return [
            'files' => $files,
            'canonicalTerms' => $canonicalTerms,
        ];
    }

    // -------------------------------------------------------------
    // FALLBACK
    // -------------------------------------------------------------

    /**
     * Last resort: delegates to the chunked search and splits its text
     * result into chunks.
     *
     * @param int|null $limit Maximum number of chunks; null falls back to the
     *                        configured $maxChunks
     *
     * @return string[]
     */
    private function fallbackSearch(string $prompt, ?int $limit = null): array
    {
        $chunkedText = trim($this->chunkedSearch->searchAsText($prompt));
        if ($chunkedText === '') {
            return [];
        }

        return array_slice(
            $this->normalizeResults($this->splitChunks($chunkedText)),
            0,
            $limit ?? $this->maxChunks
        );
    }

    // -------------------------------------------------------------
    // SCORING
    // -------------------------------------------------------------

    /**
     * Scores a chunk by the terms it contains: 2 points for a term of ten or
     * more characters, 1 point otherwise. Stop words do not count.
     *
     * @param array $terms Terms to look for
     */
    private function scoreChunk(string $chunk, array $terms): int
    {
        $content = mb_strtolower($chunk);
        $stopWords = $this->stopWords->getStopWords();
        $score = 0;

        foreach ($terms as $term) {
            // Canonical index keywords are not guaranteed to be lowercase,
            // while the content they are matched against is.
            $needle = mb_strtolower($term);

            if (
                $needle === '' ||
                \in_array($term, $stopWords, true) ||
                \in_array($needle, $stopWords, true)
            ) {
                continue;
            }

            if (str_contains($content, $needle)) {
                $score += mb_strlen($term) >= 10 ? 2 : 1;
            }
        }

        return $score;
    }

    // -------------------------------------------------------------
    // UTIL
    // -------------------------------------------------------------

    /**
     * Turns free text into lowercase search terms of more than two characters.
     *
     * Everything except letters, digits and whitespace is removed; the rest is
     * split on any run of whitespace, so line breaks and tabs separate terms
     * as well.
     *
     * @return string[] Empty when the text cannot be processed by PCRE
     */
    private function extractTerms(string $text): array
    {
        $stripped = preg_replace('/[^\p{L}\p{N}\s]/u', '', $text);
        if ($stripped === null) {
            return [];
        }

        $words = preg_split('/\s+/u', mb_strtolower($stripped), -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            return [];
        }

        return array_values(array_filter(
            $words,
            static fn (string $w): bool => mb_strlen($w) > 2
        ));
    }

    /**
     * Splits text at blank lines into trimmed, non-empty chunks.
     *
     * @return string[]
     */
    private function splitChunks(string $text): array
    {
        return array_values(array_filter(
            array_map('trim', explode("\n\n", $text)),
            static fn (string $chunk): bool => $chunk !== ''
        ));
    }

    /**
     * Removes duplicate chunks, keeping the first occurrence. Chunks count as
     * equal when they differ only in letter case or in the amount of
     * whitespace.
     *
     * @param string[] $chunks
     *
     * @return string[]
     */
    private function normalizeResults(array $chunks): array
    {
        $seen = [];
        $out = [];

        foreach ($chunks as $chunk) {
            // preg_replace() yields null on a PCRE error; compare on the raw
            // chunk then.
            $key = mb_strtolower(preg_replace('/\s+/u', ' ', $chunk) ?? $chunk);

            if (!isset($seen[$key])) {
                $seen[$key] = true;
                $out[] = $chunk;
            }
        }

        return $out;
    }
}
|
||||
11
src/Knowledge/Retrieval/RetrieverInterface.php
Normal file
11
src/Knowledge/Retrieval/RetrieverInterface.php
Normal file
@@ -0,0 +1,11 @@
|
||||
<?php
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
/**
 * Contract for components that look up knowledge chunks for a prompt.
 */
interface RetrieverInterface
{
    /**
     * Looks up knowledge chunks for the given prompt.
     *
     * @param string $prompt Text to retrieve knowledge for
     * @param int    $limit  Requested number of chunks
     *
     * @return string[] Plain text knowledge chunks
     */
    public function retrieve(string $prompt, int $limit = 3): array;
}
|
||||
Reference in New Issue
Block a user