new version ndjson

This commit is contained in:
team 1
2026-02-12 11:22:56 +01:00
parent 0bb0c0b42f
commit 5a52e07edc
10 changed files with 375 additions and 492 deletions

View File

@@ -0,0 +1,44 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Knowledge\ChunkManager;
use Symfony\Component\Uid\Uuid;
/**
 * Resolves chunk rows by ID by scanning the rows yielded by
 * ChunkManager::streamAll().
 */
final class NdjsonChunkLookup
{
    public function __construct(
        private readonly ChunkManager $chunkManager
    ) {
    }

    /**
     * Collects the rows whose `chunk_id` is one of the requested IDs.
     *
     * Rows are consumed one at a time and the scan stops as soon as every
     * distinct requested ID has been seen. IDs that never show up in the
     * stream are simply absent from the result. If an ID occurs more than
     * once before the scan stops, the last row seen wins.
     *
     * @param string[] $chunkIds RFC4122 UUID strings
     * @return array<string,array<string,mixed>> keyed by chunk_id
     */
    public function findByChunkIds(array $chunkIds): array
    {
        // Nothing requested: skip the scan. Without this guard no row can
        // ever match, the early-exit check is never reached and the whole
        // stream would be read for an empty result.
        if ($chunkIds === []) {
            return [];
        }

        // Flip into a set for O(1) membership tests; duplicates collapse here.
        $wanted = array_fill_keys($chunkIds, true);
        $wantedCount = \count($wanted);

        $found = [];
        foreach ($this->chunkManager->streamAll() as $row) {
            $id = $row['chunk_id'] ?? null;
            if (!is_string($id) || !isset($wanted[$id])) {
                continue;
            }

            $found[$id] = $row;

            // Early exit as soon as every requested ID has been found.
            if (\count($found) === $wantedCount) {
                break;
            }
        }

        return $found;
    }
}

View File

@@ -0,0 +1,99 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Vector\VectorSearchClient;
/**
 * Keyword-first retriever with a vector-search fallback.
 *
 * The keyword search is asked first; only when it yields fewer chunks than
 * requested is the vector client queried, and its sufficiently-scored hits
 * are resolved to chunk texts and appended.
 */
final class NdjsonHybridRetriever implements RetrieverInterface
{
    /** Vector hits scoring below this value are ignored. */
    private const VECTOR_SCORE_THRESHOLD = 0.65;

    public function __construct(
        private readonly NdjsonKeywordSearch $keywordSearch,
        private readonly NdjsonChunkLookup $lookup,
        private readonly VectorSearchClient $vectorClient,
        private readonly int $maxChunks = 3,
        private readonly int $vectorTopK = 5,
    ) {
    }

    /**
     * Returns up to $limit chunk texts for the prompt.
     *
     * @param string   $prompt free-text query
     * @param int|null $limit  maximum number of chunks; null falls back to
     *                         the configured $maxChunks
     * @return string[] keyword matches first, then vector matches
     */
    public function retrieve(string $prompt, ?int $limit = null): array
    {
        $limit ??= $this->maxChunks;

        // A non-positive limit can only ever produce an empty result, so
        // avoid scanning the index at all.
        if ($limit <= 0) {
            return [];
        }

        // 1) Keyword first
        $keywordChunks = $this->keywordSearch->search($this->extractTerms($prompt), $limit);
        if (\count($keywordChunks) >= $limit) {
            return array_slice($keywordChunks, 0, $limit);
        }

        // 2) Vector fallback / enrichment
        $hits = $this->vectorClient->search($prompt, $this->vectorTopK);
        if ($hits === []) {
            return $keywordChunks;
        }

        $chunkIds = [];
        foreach ($hits as $hit) {
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }
            if ((float) $hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
                continue;
            }
            $chunkIds[] = (string) $hit['chunk_id'];
        }
        if ($chunkIds === []) {
            return $keywordChunks;
        }

        // Iterate $chunkIds (not $rows) so the vector hit order is preserved.
        $rows = $this->lookup->findByChunkIds($chunkIds);
        $candidates = $keywordChunks;
        foreach ($chunkIds as $id) {
            $text = $rows[$id]['text'] ?? null;
            if (!is_string($text)) {
                continue;
            }
            $text = trim($text);
            // A whitespace-only row would otherwise take up one of the
            // $limit slots with an empty chunk.
            if ($text === '') {
                continue;
            }
            $candidates[] = $text;
        }

        // dedupe + limit
        $seen = [];
        $out = [];
        foreach ($candidates as $chunk) {
            $key = $this->dedupeKey($chunk);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
            $out[] = $chunk;
            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Builds the comparison key used for de-duplication: whitespace runs
     * collapsed to a single space, then lowercased.
     */
    private function dedupeKey(string $chunk): string
    {
        // preg_replace() returns null on a PCRE error (e.g. malformed UTF-8);
        // fall back to the raw chunk instead of passing null on, which would
        // raise a TypeError under strict_types.
        $collapsed = preg_replace('/\s+/u', ' ', $chunk) ?? $chunk;

        return mb_strtolower($collapsed);
    }

    /**
     * Minimal term extraction: strips everything that is not a letter,
     * digit or whitespace, lowercases, and keeps words longer than two
     * characters.
     *
     * @return string[]
     */
    private function extractTerms(string $text): array
    {
        $stripped = preg_replace('/[^\p{L}\p{N}\s]/u', '', $text) ?? '';

        // Split on any whitespace run: the strip step keeps tabs and
        // newlines, so splitting on a literal space alone would fuse words
        // from multi-line prompts into a single term.
        $words = preg_split('/\s+/u', mb_strtolower($stripped), -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return array_values(array_filter(
            $words,
            static fn (string $w): bool => mb_strlen($w) > 2,
        ));
    }
}

View File

@@ -0,0 +1,101 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Knowledge\ChunkManager;
use App\Knowledge\StopWords;
/**
 * Keyword search over the rows yielded by ChunkManager::streamAll().
 */
final class NdjsonKeywordSearch
{
    public function __construct(
        private readonly ChunkManager $chunkManager,
        private readonly StopWords $stopWords,
    ) {
    }

    /**
     * Streaming keyword search over index.ndjson.
     *
     * Every row with a non-empty string `text` is scored against the terms;
     * the highest-scoring, de-duplicated texts are returned.
     *
     * @param string[] $terms        (already lowercased)
     * @param int      $limit        maximum number of chunks to return
     * @param int      $candidateCap maximum number of scored candidates
     *                               considered for the final ranking
     * @return string[] best chunks, highest score first
     */
    public function search(array $terms, int $limit = 3, int $candidateCap = 200): array
    {
        // Nothing can be returned for a non-positive limit or without terms;
        // previously a limit of 0 still produced one chunk because the limit
        // was only checked after the first result had been added.
        if ($limit <= 0 || $terms === []) {
            return [];
        }

        // Fetch the stop-word list once instead of once per term.
        $stopWords = $this->stopWords->getStopWords();
        $terms = array_values(array_filter(
            $terms,
            static fn (string $t): bool => $t !== '' && !\in_array($t, $stopWords, true),
        ));
        if ($terms === []) {
            return [];
        }

        $byScoreDesc = static fn (array $a, array $b): int => $b['score'] <=> $a['score'];

        // Candidate buffer of ['score' => int, 'text' => string]. It is cut
        // back to the top $candidateCap entries only once it has grown to
        // more than twice the cap, so the sort cost is amortised rather than
        // paid on every matching row. usort() is stable, so ties keep
        // arrival order and the surviving set does not depend on when the
        // trimming happens.
        $trimThreshold = 2 * $candidateCap;
        $best = [];
        foreach ($this->chunkManager->streamAll() as $row) {
            $text = $row['text'] ?? null;
            if (!is_string($text) || $text === '') {
                continue;
            }

            $score = $this->scoreText($text, $terms);
            if ($score <= 0) {
                continue;
            }

            $best[] = ['score' => $score, 'text' => trim($text)];

            // keep array bounded to avoid memory spikes
            if (\count($best) > $trimThreshold) {
                usort($best, $byScoreDesc);
                $best = array_slice($best, 0, $candidateCap);
            }
        }
        if ($best === []) {
            return [];
        }

        usort($best, $byScoreDesc);
        $best = array_slice($best, 0, $candidateCap);

        $out = [];
        $seen = [];
        foreach ($best as $candidate) {
            // Dedupe key: whitespace collapsed, lowercased. preg_replace()
            // returns null on a PCRE error; fall back to the raw text rather
            // than handing null to mb_strtolower() under strict_types.
            $collapsed = preg_replace('/\s+/u', ' ', $candidate['text']) ?? $candidate['text'];
            $key = mb_strtolower($collapsed);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
            $out[] = $candidate['text'];
            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Simple scoring: one point per term contained in the text (presence,
     * not number of occurrences); terms of ten or more characters count
     * double.
     *
     * @param string[] $terms
     */
    private function scoreText(string $text, array $terms): int
    {
        $content = mb_strtolower($text);
        $score = 0;
        foreach ($terms as $term) {
            // str_contains() is true for an empty needle; never score it.
            if ($term === '') {
                continue;
            }
            if (str_contains($content, $term)) {
                $score += (mb_strlen($term) >= 10) ? 2 : 1;
            }
        }

        return $score;
    }
}

View File

@@ -1,121 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Knowledge;
use Psr\Log\LoggerInterface;
/**
* VectorSearchChunked
*
* Chunk-based retrieval service for long-form knowledge documents.
* This is a lightweight, deterministic runtime reader for
* precomputed knowledge chunks.
*
* Design principles:
* - No runtime indexing
* - No ML dependencies
* - Deterministic and fast
* - Hard limits to protect prompt size
*
* This service is intentionally simple and can later be replaced
* by a real vector database without changing the AgentRunner.
*/
final class VectorSearchChunked
{
    /**
     * Directory containing chunked knowledge files. Starts out relative and
     * is prefixed with the project directory in the constructor.
     */
    private string $dataDir = 'var/knowledge/chunks';

    /**
     * Maximum number of chunks to return.
     */
    private int $maxChunks = 3;

    /**
     * @param string $projectDir base directory the chunk directory is resolved against
     */
    public function __construct(
        private string $projectDir,
    ) {
        $this->dataDir = $this->projectDir . '/' . $this->dataDir;
    }

    /**
     * Returns concatenated relevant chunks as plain text.
     *
     * Reads the `*.txt` files in the chunk directory and keeps the trimmed
     * content of those containing at least one prompt keyword, stopping
     * once $maxChunks chunks have been collected.
     *
     * @param string $prompt
     * @return string matching chunks joined by a blank line, or '' when the
     *                directory is missing, the prompt yields no keywords, or
     *                nothing matches
     */
    public function searchAsText(string $prompt): string
    {
        if (!is_dir($this->dataDir)) {
            return '';
        }

        $keywords = $this->extractKeywords(mb_strtolower($prompt));
        if ($keywords === []) {
            return '';
        }

        // glob() returns false on error; treat that as "no files" rather
        // than handing a bool to foreach.
        $files = glob($this->dataDir . '/*.txt') ?: [];

        $matches = [];
        foreach ($files as $file) {
            $content = file_get_contents($file);
            if ($content === false) {
                continue;
            }
            if (!$this->matchesKeywords(mb_strtolower($content), $keywords)) {
                continue;
            }

            $matches[] = trim($content);
            if (\count($matches) >= $this->maxChunks) {
                break;
            }
        }

        return implode("\n\n", $matches);
    }

    /**
     * Extracts simple keywords from the prompt.
     *
     * Splits on runs of non-word characters and keeps the distinct words of
     * at least four characters. This is a lightweight heuristic replacement
     * for full vector or embedding-based search.
     *
     * @return string[]
     */
    private function extractKeywords(string $prompt): array
    {
        $words = preg_split('/\W+/u', $prompt, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            return [];
        }

        $longEnough = array_filter(
            $words,
            static fn (string $word): bool => mb_strlen($word) >= 4,
        );

        return array_values(array_unique($longEnough));
    }

    /**
     * Checks whether the content matches at least one keyword.
     *
     * @param string[] $keywords
     */
    private function matchesKeywords(string $content, array $keywords): bool
    {
        foreach ($keywords as $keyword) {
            if (str_contains($content, $keyword)) {
                return true;
            }
        }

        return false;
    }
}