harden code and ingester

This commit is contained in:
team 1
2026-02-12 14:31:29 +01:00
parent 5a52e07edc
commit 994f582f35
8 changed files with 77 additions and 496 deletions

View File

@@ -1,25 +0,0 @@
<?php
// src/Knowledge/Retrieval/ChunkIndexLoader.php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
/**
 * Reads the chunk index from a JSON file on disk.
 */
final class ChunkIndexLoader
{
    public function __construct(
        private string $indexPath
    ) {}

    /**
     * Decodes the index file into a PHP array.
     *
     * Best-effort by design: a missing file, a failed or empty read, invalid
     * JSON, or a JSON document whose top level is not an array/object all
     * produce an empty array instead of an error.
     *
     * @return array the decoded JSON content, or [] when nothing usable was read
     */
    public function load(): array
    {
        if (is_file($this->indexPath)) {
            $raw = file_get_contents($this->indexPath);

            // A failed read (false) and an empty file are both treated as "no index".
            if ($raw) {
                $decoded = json_decode($raw, true);

                if (is_array($decoded)) {
                    return $decoded;
                }
            }
        }

        return [];
    }
}

View File

@@ -1,269 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Knowledge\StopWords;
use App\Knowledge\VectorSearchChunked;
use App\Knowledge\KeywordSimilarity;
use App\Vector\VectorSearchClient;
final class ChunkKeywordRetriever implements RetrieverInterface
{
private const MAX_KEYWORD_CANDIDATES = 200;
private const VECTOR_SCORE_THRESHOLD = 0.65;
private const VECTOR_TOP_K = 3;
/**
 * Wires the collaborators used by the retrieval pipeline.
 *
 * @param VectorSearchChunked $chunkedSearch queried via searchAsText() by the last-resort fallback
 * @param ChunkIndexLoader    $indexLoader   supplies the keyword index used to discover candidate files
 * @param StopWords           $stopWords     provides the stop-word list ignored while scoring chunks
 * @param VectorSearchClient  $vectorClient  queried via search() when keyword hits alone do not reach the limit
 * @param string              $chunksDir     directory prefix; chunk files are read from "$chunksDir/<file>"
 * @param int                 $maxChunks     default result limit when retrieve() receives none; also caps the fallback
 */
public function __construct(
private VectorSearchChunked $chunkedSearch,
private ChunkIndexLoader $indexLoader,
private StopWords $stopWords,
private VectorSearchClient $vectorClient,
private string $chunksDir,
private int $maxChunks = 3,
) {
}
/**
 * {@inheritdoc}
 *
 * Pipeline:
 *  1. Extract search terms from the prompt.
 *  2. Discover candidate chunk files through the keyword index.
 *  3. Score each candidate chunk against the (canonicalised) terms.
 *  4. If keyword hits do not reach the limit, add or boost chunks returned
 *     by the vector client.
 *  5. If nothing was found at all, delegate to the chunked-search fallback.
 *
 * @param string   $prompt the user query
 * @param int|null $limit  maximum number of chunks to return; null uses the
 *                         configured $maxChunks, values <= 0 yield []
 *
 * @return array list of chunk texts, highest score first
 */
public function retrieve(string $prompt, ?int $limit = null): array
{
    $limit ??= $this->maxChunks;

    // A non-positive limit can never produce results; bail out before any I/O
    // and before array_slice() could interpret a negative length.
    if ($limit <= 0) {
        return [];
    }

    // ---------------------------------------------------------
    // 1) Prompt -> search terms
    // ---------------------------------------------------------
    $queryTerms = $this->extractTerms($prompt);

    // ---------------------------------------------------------
    // 2) Keyword-based candidate discovery
    // ---------------------------------------------------------
    $result = $queryTerms !== []
        ? $this->findCandidateFiles($queryTerms)
        : ['files' => [], 'canonicalTerms' => []];

    $candidateScores = array_slice(
        $result['files'],
        0,
        self::MAX_KEYWORD_CANDIDATES,
        true
    );

    // Replace each query term by the index keyword it matched, if any.
    $canonicalTerms = $result['canonicalTerms'];
    $effectiveTerms = array_map(
        static fn (string $term): string => $canonicalTerms[$term] ?? $term,
        $queryTerms
    );

    // ---------------------------------------------------------
    // 3) Keyword scoring
    // ---------------------------------------------------------
    $scored = [];

    foreach ($candidateScores as $file => $similarityScore) {
        $path = $this->chunksDir . '/' . $file;
        if (!is_file($path)) {
            continue;
        }

        $chunk = file_get_contents($path);
        if ($chunk === false || $chunk === '') {
            continue;
        }

        $score = $this->scoreChunk($chunk, $effectiveTerms);
        if ($score === 0) {
            continue;
        }

        $scored[$file] = [
            'chunk' => trim($chunk),
            'score' => (int) round($score * $similarityScore),
        ];
    }

    // ---------------------------------------------------------
    // Early exit: keyword results are sufficient
    // ---------------------------------------------------------
    if (\count($scored) >= $limit) {
        return $this->finalize($scored, $limit);
    }

    // ---------------------------------------------------------
    // 4) Vector retrieval (semantic fallback)
    // ---------------------------------------------------------
    $vectorHits = $this->vectorClient->search($prompt, self::VECTOR_TOP_K);

    foreach ($vectorHits as $hit) {
        if (
            !isset($hit['chunk_id'], $hit['score']) ||
            $hit['score'] < self::VECTOR_SCORE_THRESHOLD
        ) {
            continue;
        }

        $file = $hit['chunk_id'] . '.txt';
        $path = $this->chunksDir . '/' . $file;
        if (!is_file($path)) {
            continue;
        }

        $vectorBoost = (int) round($hit['score'] * 10);
        if ($vectorBoost <= 0) {
            continue;
        }

        // Chunk already found by keywords: keep its text, only raise the score.
        if (isset($scored[$file])) {
            $scored[$file]['score'] += $vectorBoost;
            continue;
        }

        // New chunk: skip unreadable or blank files instead of returning an
        // empty string as a "result" (mirrors the keyword path above).
        $chunk = file_get_contents($path);
        if ($chunk === false) {
            continue;
        }

        $chunk = trim($chunk);
        if ($chunk === '') {
            continue;
        }

        $scored[$file] = [
            'chunk' => $chunk,
            'score' => $vectorBoost,
        ];
    }

    // ---------------------------------------------------------
    // 5) Final fallback
    // ---------------------------------------------------------
    if ($scored === []) {
        // The fallback caps at $maxChunks on its own; slice again so an
        // explicit, smaller $limit is honoured as well.
        return array_slice($this->fallbackSearch($prompt), 0, $limit);
    }

    return $this->finalize($scored, $limit);
}
// -------------------------------------------------------------
// FINALIZATION
// -------------------------------------------------------------
/**
 * Turns the scored map into the final result list.
 *
 * Entries are ordered by descending score, reduced to their chunk text,
 * de-duplicated, and cut to at most $limit items.
 *
 * @param array $scored map of file name => ['chunk' => string, 'score' => int]
 * @param int   $limit  maximum number of chunks to return
 *
 * @return array list of chunk texts
 */
private function finalize(array $scored, int $limit): array
{
    uasort(
        $scored,
        static function ($left, $right) {
            return $right['score'] <=> $left['score'];
        }
    );

    $rankedTexts = array_column($scored, 'chunk');
    $uniqueTexts = $this->normalizeResults($rankedTexts);

    return array_slice($uniqueTexts, 0, $limit);
}
// -------------------------------------------------------------
// INDEX LOGIC
// -------------------------------------------------------------
/**
 * Matches the query terms against the keyword index.
 *
 * Every index entry whose keywords are similar enough to at least one term
 * becomes a candidate file, rated with the best similarity found for it.
 * Each matched term is additionally mapped to the index keyword it resembles
 * most ("canonical term"); a weaker match never replaces a stronger one.
 *
 * Malformed index entries (missing or wrongly typed fields) are skipped.
 *
 * @param array $terms list of query terms
 *
 * @return array ['files' => map of file name => best similarity,
 *                'canonicalTerms' => map of query term => best matching index keyword]
 */
private function findCandidateFiles(array $terms): array
{
    // Minimum similarity for a term/keyword pair to count as a match.
    $minSimilarity = 0.8;

    $files = [];
    $canonicalTerms = [];
    // Best similarity seen so far per term, so that a later, weaker match
    // cannot overwrite a better canonical keyword.
    $bestTermScores = [];

    foreach ($this->indexLoader->load() as $entry) {
        // The index comes from decoded JSON; validate its shape before use.
        if (
            !\is_array($entry) ||
            !isset($entry['file'], $entry['keywords']) ||
            !(\is_string($entry['file']) || \is_int($entry['file'])) ||
            !\is_array($entry['keywords'])
        ) {
            continue;
        }

        $file = $entry['file'];

        foreach ($terms as $term) {
            foreach ($entry['keywords'] as $indexKeyword) {
                if (!\is_string($indexKeyword)) {
                    continue;
                }

                $score = KeywordSimilarity::compare($term, $indexKeyword);
                if ($score < $minSimilarity) {
                    continue;
                }

                $files[$file] = max($files[$file] ?? 0.0, $score);

                if ($score > ($bestTermScores[$term] ?? 0.0)) {
                    $bestTermScores[$term] = $score;
                    $canonicalTerms[$term] = $indexKeyword;
                }
            }
        }
    }

    return [
        'files' => $files,
        'canonicalTerms' => $canonicalTerms,
    ];
}
// -------------------------------------------------------------
// FALLBACK
// -------------------------------------------------------------
/**
 * Last-resort lookup through the chunked search.
 *
 * The text returned by searchAsText() is split into chunks, de-duplicated,
 * and capped at the configured $maxChunks.
 *
 * @param string $prompt the user query
 *
 * @return array list of chunk texts, or [] when the search returned only whitespace
 */
private function fallbackSearch(string $prompt): array
{
    $text = trim($this->chunkedSearch->searchAsText($prompt));

    if ($text !== '') {
        $pieces = $this->splitChunks($text);
        $unique = $this->normalizeResults($pieces);

        return array_slice($unique, 0, $this->maxChunks);
    }

    return [];
}
// -------------------------------------------------------------
// SCORING
// -------------------------------------------------------------
/**
 * Counts how many search terms occur in a chunk.
 *
 * Matching is a case-insensitive substring test. Stop words and empty terms
 * are ignored; terms of 10 or more characters count double.
 *
 * @param string $chunk raw chunk text
 * @param array  $terms list of search terms (query terms or canonical index keywords)
 *
 * @return int 0 when no term matched
 */
private function scoreChunk(string $chunk, array $terms): int
{
    $content = mb_strtolower($chunk);
    // Fetch the list once instead of once per term.
    $stopWords = $this->stopWords->getStopWords();
    $score = 0;

    foreach ($terms as $term) {
        // Canonical terms come from the index and may carry upper-case
        // letters; the content is lower-cased, so the needle must be too.
        $needle = mb_strtolower($term);

        // An empty needle would match every chunk.
        if ($needle === '') {
            continue;
        }

        if (
            \in_array($term, $stopWords, true) ||
            \in_array($needle, $stopWords, true)
        ) {
            continue;
        }

        if (str_contains($content, $needle)) {
            $score += mb_strlen($term) >= 10 ? 2 : 1;
        }
    }

    return $score;
}
// -------------------------------------------------------------
// UTIL
// -------------------------------------------------------------
/**
 * Turns free text into a list of lower-cased search terms.
 *
 * Everything except letters, digits and whitespace is removed, the rest is
 * split on any run of whitespace (spaces, tabs, line breaks), and words of
 * two characters or fewer are dropped.
 *
 * @param string $text raw user input; invalid UTF-8 bytes are scrubbed first
 *
 * @return array list of terms, in order of appearance
 */
private function extractTerms(string $text): array
{
    // With the /u modifier preg_replace() returns null on malformed UTF-8,
    // which would raise a TypeError further down; scrub the input first.
    $clean = preg_replace(
        '/[^\p{L}\p{N}\s]/u',
        '',
        mb_scrub($text, 'UTF-8')
    ) ?? '';

    // Split on every kind of whitespace the pattern above lets through,
    // not only on the space character.
    $words = preg_split('/\s+/u', mb_strtolower($clean), -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        return [];
    }

    return array_values(array_filter(
        $words,
        static fn (string $w): bool => mb_strlen($w) > 2
    ));
}
/**
 * Splits text into chunks at blank lines ("\n\n").
 *
 * Each piece is trimmed; pieces that end up empty are discarded.
 *
 * @param string $text text containing zero or more chunks
 *
 * @return array list of non-empty, trimmed chunks
 */
private function splitChunks(string $text): array
{
    $chunks = [];

    foreach (explode("\n\n", $text) as $piece) {
        $piece = trim($piece);

        if ($piece !== '') {
            $chunks[] = $piece;
        }
    }

    return $chunks;
}
/**
 * Removes duplicate chunks while keeping the original order.
 *
 * Two chunks are duplicates when they are equal after collapsing whitespace
 * runs to a single space and lower-casing. The first occurrence wins and is
 * returned unmodified.
 *
 * @param array $chunks list of chunk texts
 *
 * @return array list of unique chunk texts
 */
private function normalizeResults(array $chunks): array
{
    $seen = [];
    $out = [];

    foreach ($chunks as $chunk) {
        // preg_replace() yields null when the chunk is not valid UTF-8;
        // fall back to the raw text so one bad file cannot abort retrieval.
        $collapsed = preg_replace('/\s+/u', ' ', $chunk) ?? $chunk;
        $key = mb_strtolower($collapsed);

        if (!isset($seen[$key])) {
            $seen[$key] = true;
            $out[] = $chunk;
        }
    }

    return $out;
}
}