first commit
This commit is contained in:
58
src/Knowledge/Ingest/ChunkIndexWriter.php
Normal file
58
src/Knowledge/Ingest/ChunkIndexWriter.php
Normal file
@@ -0,0 +1,58 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/ChunkIndexWriter.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
/**
 * Maintains the JSON chunk index on disk.
 *
 * The index file at $indexPath holds a JSON list of entries. Entries are
 * associative arrays; the lookup in hasSourceHash() reads their "source"
 * and "sourceHash" keys.
 */
final class ChunkIndexWriter
{
    public function __construct(
        private string $indexPath
    ) {}

    /**
     * Appends one entry to the index and rewrites the index file.
     *
     * @param array<string, mixed> $entry
     *
     * @throws \RuntimeException if the index cannot be encoded or written
     */
    public function add(array $entry): void
    {
        $index = $this->load();
        $index[] = $entry;
        $this->save($index);
    }

    /**
     * Tells whether an entry with exactly this source name and hash exists.
     */
    public function hasSourceHash(string $source, string $hash): bool
    {
        foreach ($this->load() as $entry) {
            if (
                ($entry['source'] ?? null) === $source &&
                ($entry['sourceHash'] ?? null) === $hash
            ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Reads the index; a missing, empty or undecodable file yields [].
     */
    private function load(): array
    {
        if (!is_file($this->indexPath)) {
            return [];
        }

        $json = file_get_contents($this->indexPath);
        $data = $json ? json_decode($json, true) : null;

        return is_array($data) ? $data : [];
    }

    /**
     * Serialises the index and writes it to $indexPath.
     *
     * @throws \RuntimeException if the directory cannot be created, the data
     *                           cannot be encoded, or the file cannot be written
     */
    private function save(array $index): void
    {
        $dir = dirname($this->indexPath);
        // The trailing is_dir() tolerates another process creating the directory first.
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException("Could not create index directory: {$dir}");
        }

        // Encode before touching the file: a failed encode used to hand `false`
        // to file_put_contents(), which truncated the existing index.
        try {
            $json = json_encode(
                $index,
                JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
            );
        } catch (\JsonException $e) {
            throw new \RuntimeException('Could not encode chunk index: ' . $e->getMessage(), 0, $e);
        }

        if (file_put_contents($this->indexPath, $json, LOCK_EX) === false) {
            throw new \RuntimeException("Could not write chunk index: {$this->indexPath}");
        }
    }
}
|
||||
149
src/Knowledge/Ingest/ChunkWriter.php
Normal file
149
src/Knowledge/Ingest/ChunkWriter.php
Normal file
@@ -0,0 +1,149 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/ChunkWriter.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
|
||||
use App\Knowledge\StopWords;
|
||||
|
||||
/**
 * Writes text chunks to individual files and registers each one in the
 * manifest file and in the keyword index.
 */
final class ChunkWriter
{
    /** Maximum number of keywords stored per chunk. */
    private const MAX_KEYWORDS = 25;

    /** Words shorter than this are never used as keywords. */
    private const MIN_KEYWORD_LENGTH = 4;

    public function __construct(
        private string $chunksDir,
        private string $manifestPath,
        private ChunkIndexWriter $indexWriter,
        private StopWords $stopWords,
    ) {
    }

    /**
     * Writes every chunk to "<base>__<timestamp>__<NNNN>.txt" inside the chunks
     * directory, adds one index entry per chunk and saves the manifest once
     * at the end.
     *
     * @param string[] $chunks
     * @return string[] written filenames
     *
     * @throws \RuntimeException if a directory, chunk file or the manifest cannot be written
     */
    public function write(string $sourceName, array $chunks, string $sourceHash): array
    {
        $this->ensureDirectory($this->chunksDir);

        $manifest = $this->loadManifest();
        $written = [];

        $base = $this->safeBase($sourceName);
        $ts = date('Ymd_His');

        // array_values() guarantees integer positions even for keyed input.
        foreach (array_values($chunks) as $i => $chunk) {
            $filename = "{$base}__{$ts}__" . str_pad((string) $i, 4, '0', STR_PAD_LEFT) . ".txt";
            $path = rtrim($this->chunksDir, '/') . '/' . $filename;

            $header = $this->buildHeader(
                source: $sourceName,
                index: $i
            );

            if (file_put_contents($path, $header . "\n\n" . $chunk) === false) {
                throw new \RuntimeException("Could not write chunk file: {$path}");
            }

            $written[] = $filename;
            $chars = mb_strlen($chunk);

            $manifest[] = [
                'file' => $filename,
                'source' => $sourceName,
                'index' => $i,
                'chars' => $chars,
                'createdAt' => date('c'),
            ];

            $this->indexWriter->add([
                'file' => $filename,
                'source' => $sourceName,
                'sourceHash' => $sourceHash,
                'keywords' => $this->extractKeywords($chunk),
                'chars' => $chars,
            ]);
        }

        $this->saveManifest($manifest);

        return $written;
    }

    /**
     * Derives a filesystem-safe, lowercase base name from the source name.
     * Falls back to "chunk" when nothing usable remains (e.g. a name made up
     * only of non-latin characters).
     */
    private function safeBase(string $name): string
    {
        $name = pathinfo($name, PATHINFO_FILENAME);
        $name = mb_strtolower($name);
        $name = preg_replace('/[^a-z0-9\-_]+/u', '-', $name);
        $name = trim((string) $name, '-');

        return $name !== '' ? $name : 'chunk';
    }

    /**
     * Reads the manifest; a missing, empty or undecodable file yields [].
     */
    private function loadManifest(): array
    {
        if (!is_file($this->manifestPath)) {
            return [];
        }
        $json = file_get_contents($this->manifestPath);
        $data = $json ? json_decode($json, true) : null;

        return is_array($data) ? $data : [];
    }

    /**
     * Writes the manifest as pretty-printed JSON.
     *
     * @throws \RuntimeException if the manifest cannot be encoded or written
     */
    private function saveManifest(array $manifest): void
    {
        $this->ensureDirectory(dirname($this->manifestPath));

        // Encode first so a failed encode can never truncate the existing manifest.
        try {
            $json = json_encode(
                $manifest,
                JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
            );
        } catch (\JsonException $e) {
            throw new \RuntimeException('Could not encode chunk manifest: ' . $e->getMessage(), 0, $e);
        }

        if (file_put_contents($this->manifestPath, $json, LOCK_EX) === false) {
            throw new \RuntimeException("Could not write chunk manifest: {$this->manifestPath}");
        }
    }

    /**
     * Creates the directory (recursively) when it does not exist yet.
     *
     * @throws \RuntimeException if the directory cannot be created
     */
    private function ensureDirectory(string $dir): void
    {
        // The trailing is_dir() tolerates another process creating the directory first.
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException("Could not create directory: {$dir}");
        }
    }

    /**
     * Builds the header line placed at the top of every chunk file.
     * The chunk number shown is 1-based.
     */
    private function buildHeader(string $source, int $index): string
    {
        return sprintf(
            '[Quelle: %s | Abschnitt: Chunk %d]',
            $source,
            $index + 1
        );
    }

    /**
     * Extracts up to MAX_KEYWORDS distinct keywords from the chunk text,
     * in order of first appearance.
     *
     * @return string[]
     */
    private function extractKeywords(string $text): array
    {
        // 1) Lowercase
        $text = mb_strtolower($text);

        // 2) Remove URLs (very important)
        $text = preg_replace('#https?://\S+#u', ' ', $text) ?? '';

        // 3) Newlines & tabs -> space
        $text = str_replace(["\r", "\n", "\t"], ' ', $text);

        // 4) Separators -> space (do NOT delete them, or adjacent words would merge)
        $text = preg_replace('/[\/\.\,\:\;\-\_\(\)\[\]\{\}]/u', ' ', $text) ?? '';

        // 5) Drop everything that is not a letter, digit or whitespace
        $text = preg_replace('/[^\p{L}\p{N}\s]/u', '', $text) ?? '';

        // 6) + 7) Split on whitespace runs; empty pieces are discarded
        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        // Hash set for O(1) stop-word checks, built once per chunk.
        $stopWordSet = array_fill_keys($this->stopWords->getStopWords() ?? [], true);

        // 8) Filter
        $keywords = [];
        foreach ($words as $word) {
            if (mb_strlen($word) < self::MIN_KEYWORD_LENGTH) {
                continue;
            }
            if (isset($stopWordSet[$word])) {
                continue;
            }
            $keywords[] = $word;
        }

        // De-duplicate BEFORE limiting; slicing first let repeated words
        // push later unique keywords out of the list.
        return array_slice(array_values(array_unique($keywords)), 0, self::MAX_KEYWORDS);
    }
}
|
||||
37
src/Knowledge/Ingest/DocumentLoader.php
Normal file
37
src/Knowledge/Ingest/DocumentLoader.php
Normal file
@@ -0,0 +1,37 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/DocumentLoader.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
/**
 * Loads source documents from disk as plain text.
 */
final class DocumentLoader
{
    /**
     * Returns the content of a supported document (.txt or .md, case-insensitive).
     *
     * @throws \RuntimeException if the file is missing, has an unsupported
     *                           extension, or cannot be read
     */
    public function load(string $path): string
    {
        if (!is_file($path)) {
            throw new \RuntimeException("File not found: {$path}");
        }

        $extension = mb_strtolower(pathinfo($path, PATHINFO_EXTENSION));

        // Planned for later: dedicated loaders for 'pdf' and 'docx'.
        if (in_array($extension, ['txt', 'md'], true)) {
            return $this->loadText($path);
        }

        throw new \RuntimeException("Unsupported file type: .{$extension}");
    }

    /**
     * Reads the whole file into a string.
     *
     * @throws \RuntimeException if reading fails
     */
    private function loadText(string $path): string
    {
        $raw = file_get_contents($path);

        return $raw !== false
            ? $raw
            : throw new \RuntimeException("Could not read file: {$path}");
    }
}
|
||||
39
src/Knowledge/Ingest/KnowledgeIngestService.php
Normal file
39
src/Knowledge/Ingest/KnowledgeIngestService.php
Normal file
@@ -0,0 +1,39 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/KnowledgeIngestService.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
/**
 * Ingests a single file: load, optionally tidy whitespace, skip content that
 * is already indexed, then chunk and write.
 */
final class KnowledgeIngestService
{
    public function __construct(
        private DocumentLoader $loader,
        private SimpleChunker $chunker,
        private ChunkWriter $writer,
        private ChunkIndexWriter $indexWriter,
    ) {
    }

    /**
     * Ingests the file at $path.
     *
     * With $optimize enabled, runs of three or more newlines are reduced to
     * two and trailing spaces/tabs are stripped from every line before the
     * content hash is taken.
     *
     * @return string[] written chunk filenames; empty when the index already
     *                  holds this source name with the same content hash
     */
    public function ingestFile(string $path, bool $optimize = false): array
    {
        $content = $this->loader->load($path);

        if ($optimize) {
            // Both replacements are applied in order, exactly like two separate calls.
            $content = preg_replace(
                ["/\n{3,}/", "/[ \t]+$/m"],
                ["\n\n", ""],
                $content
            );
        }

        $name = basename($path);
        $hash = sha1($content);

        if (!$this->indexWriter->hasSourceHash($name, $hash)) {
            return $this->writer->write($name, $this->chunker->chunk($content), $hash);
        }

        return [];
    }
}
|
||||
146
src/Knowledge/Ingest/SimpleChunker.php
Normal file
146
src/Knowledge/Ingest/SimpleChunker.php
Normal file
@@ -0,0 +1,146 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/SimpleChunker.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
/**
 * Splits text into overlapping, word-bounded chunks and prefers to end a
 * chunk on a paragraph or line-ending sentence boundary.
 */
final class SimpleChunker
{
    public function __construct(
        private int $maxWords = 180,
        private int $overlapWords = 30
    ) {}

    /**
     * Chunks the text so that every word ends up in at least one chunk.
     *
     * Each window holds at most $maxWords words. A window that is not the
     * last one may be shortened to a natural boundary; the following window
     * then starts $overlapWords words before the point actually reached.
     *
     * @return string[]
     */
    public function chunk(string $text): array
    {
        $text = $this->normalize($text);
        if ($text === '') {
            return [];
        }

        // Split into tokens: words and the whitespace runs between them are both kept
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        if (!$tokens) {
            return [];
        }

        // Word position -> token position
        $wordTokenIndexes = [];
        foreach ($tokens as $i => $token) {
            if (!preg_match('/^\s+$/u', $token)) {
                $wordTokenIndexes[] = $i;
            }
        }

        $totalWords = count($wordTokenIndexes);
        if ($totalWords === 0) {
            return [];
        }

        // Guard against nonsensical configuration (zero/negative sizes).
        $maxWords = max(1, $this->maxWords);
        $overlapWords = max(0, $this->overlapWords);

        $chunks = [];
        $wordPos = 0;
        // Number of leading words already contained in an emitted chunk.
        $coveredWords = 0;

        while ($wordPos < $totalWords) {
            $wordEnd = min($wordPos + $maxWords, $totalWords);

            $tokenStart = $wordTokenIndexes[$wordPos];
            $tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;

            // Only windows that are followed by more text are boundary-cut;
            // cutting the final window would silently drop the document tail.
            if ($wordEnd < $totalWords) {
                $cut = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);

                // Translate the token cut back into a word position.
                $cutWordEnd = $wordEnd;
                while ($cutWordEnd > $wordPos && $wordTokenIndexes[$cutWordEnd - 1] >= $cut) {
                    $cutWordEnd--;
                }

                // Accept the boundary only if the chunk still contributes new
                // words; otherwise keep the hard cut.
                if ($cutWordEnd > $coveredWords) {
                    $tokenEnd = $cut;
                    $wordEnd = $cutWordEnd;
                }
            }

            $chunk = trim(implode('', array_slice(
                $tokens,
                $tokenStart,
                $tokenEnd - $tokenStart
            )));

            if ($chunk !== '') {
                $chunks[] = $chunk;
            }

            if ($wordEnd >= $totalWords) {
                break;
            }

            $coveredWords = $wordEnd;

            // Continue from the words actually emitted, not from the window's
            // hard limit, so nothing between the cut and the limit is skipped.
            // If the overlap would not move forward, continue without overlap.
            $next = $wordEnd - $overlapWords;
            $wordPos = $next > $wordPos ? $next : $wordEnd;
        }

        return $this->dedupe($chunks);
    }

    /**
     * Unifies line endings, collapses space/tab runs to one space and
     * reduces three or more consecutive newlines to two.
     */
    private function normalize(string $text): string
    {
        $text = str_replace(["\r\n", "\r"], "\n", $text);
        // preg_replace() returns null on error (e.g. invalid UTF-8); keep the input then.
        $text = preg_replace("/[ \t]+/u", " ", $text) ?? $text;
        $text = preg_replace("/\n{3,}/u", "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Move cut backwards to a natural boundary if possible.
     * Rules:
     * - A chunk that starts with a "- " list marker keeps the hard cut
     * - Sentence end only if followed by a line break
     * - Paragraph breaks always allowed
     *
     * @param string[] $tokens
     * @return int exclusive token index at which the chunk should end
     */
    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
    {
        // Tokens never contain whitespace, so the list marker "-" and the
        // space after it are two separate tokens.
        if (
            ($tokens[$start] ?? '') === '-' &&
            str_starts_with($tokens[$start + 1] ?? '', ' ')
        ) {
            return $end;
        }

        for ($i = $end - 1; $i > $start; $i--) {
            $token = $tokens[$i];

            // Paragraph boundary: a whitespace run containing a blank line
            if (substr_count($token, "\n") >= 2) {
                return $i + 1;
            }

            // Sentence boundary only if followed by newline
            if (
                preg_match('/[.!?]\s*$/u', $token) &&
                isset($tokens[$i + 1]) &&
                str_contains($tokens[$i + 1], "\n")
            ) {
                return $i + 1;
            }
        }

        return $end;
    }

    /**
     * Drops chunks that equal an earlier chunk when compared
     * case-insensitively with whitespace runs collapsed.
     *
     * @param string[] $chunks
     * @return string[]
     */
    private function dedupe(array $chunks): array
    {
        $seen = [];
        $out = [];

        foreach ($chunks as $chunk) {
            $trimmed = trim($chunk);
            $key = mb_strtolower(preg_replace('/\s+/u', ' ', $trimmed) ?? $trimmed);

            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $chunk;
        }

        return $out;
    }
}
|
||||
35
src/Knowledge/KeywordMapper.php
Normal file
35
src/Knowledge/KeywordMapper.php
Normal file
@@ -0,0 +1,35 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge;
|
||||
|
||||
/**
 * KeywordMapper
 *
 * Replaces a short or ambiguous prompt with a richer, comma-separated list of
 * related terms before it is handed to retrieval or embedding pipelines.
 *
 * This is a direct port of prompt_mapping.py.
 */
final class KeywordMapper
{
    /** Lowercase prompt => expanded variant. */
    private const EXPANSIONS = [
        'ki' => 'künstliche Intelligenz, AI, Projekte, Modelle, Agenten, ki',
        'shop' => 'Shopware, Onlineshop, Webshop, Commerce-System',
        'shops' => 'Shopware, Webshops, Verkaufsplattformen',
        'agentur' => 'Agentur, Firma, Unternehmen, mitho media',
        'api' => 'Schnittstelle, API, Anbindung, Integration',
        'plugin' => 'Shopware Plugin, Erweiterung, Modul, Funktion',
    ];

    /**
     * Returns the expansion when the whole prompt (trimmed, case-insensitive)
     * equals a known key; otherwise returns the prompt unchanged.
     */
    public function map(string $prompt): string
    {
        $lookup = mb_strtolower(trim($prompt));

        if (array_key_exists($lookup, self::EXPANSIONS)) {
            return self::EXPANSIONS[$lookup];
        }

        return $prompt;
    }
}
|
||||
87
src/Knowledge/KeywordSimilarity.php
Normal file
87
src/Knowledge/KeywordSimilarity.php
Normal file
@@ -0,0 +1,87 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge;
|
||||
|
||||
/**
 * KeywordSimilarity
 *
 * Deterministic and fault-tolerant comparison of two keywords.
 * Returns a similarity score between 0.0 and 1.0.
 *
 * Design goals:
 * - index.json remains unchanged
 * - comparison logic is intelligent (typos, phonetics)
 * - no alias or synonym lists
 * - no LLM dependency
 */
final class KeywordSimilarity
{
    /**
     * Compare a query token with an index keyword.
     *
     * Scores: 1.0 exact match after normalization, 0.85 identical metaphone
     * key, 0.9 / 0.8 for an edit distance of 1 / 2 (only when both words have
     * at least 6 characters), otherwise 0.0. Tokens shorter than 3 characters
     * always score 0.0.
     *
     * @param string $queryToken   Token from user input
     * @param string $indexKeyword Keyword from index.json
     *
     * @return float Similarity score (0.0 – 1.0)
     */
    public static function compare(string $queryToken, string $indexKeyword): float
    {
        $a = self::normalize($queryToken);
        $b = self::normalize($indexKeyword);

        // Guard: ignore empty or very short tokens
        if ($a === '' || $b === '' || mb_strlen($a) < 3 || mb_strlen($b) < 3) {
            return 0.0;
        }

        // 1. Exact match
        if ($a === $b) {
            return 1.0;
        }

        // 2. Phonetic comparison (metaphone)
        // Useful for: showpare → shopware, shopvare → shopware
        // metaphone() yields '' for tokens without ASCII letters (digits,
        // non-latin scripts); two empty keys must not count as a match,
        // otherwise "2023" and "2024" would be treated as similar.
        $phoneticA = metaphone($a);
        if ($phoneticA !== '' && $phoneticA === metaphone($b)) {
            return 0.85;
        }

        // 3. Edit distance comparison (only for longer words)
        if (mb_strlen($a) >= 6 && mb_strlen($b) >= 6) {
            $distance = levenshtein($a, $b);

            if ($distance === 1) {
                return 0.9;
            }

            if ($distance === 2) {
                return 0.8;
            }
        }

        // No relevant match
        return 0.0;
    }

    /**
     * Normalize a keyword to ensure stable comparison: lowercase, strip
     * everything that is not a letter or digit, transliterate German umlauts.
     */
    private static function normalize(string $value): string
    {
        $value = mb_strtolower(trim($value));

        // Remove non-alphanumeric characters
        $value = preg_replace('/[^\p{L}\p{N}]/u', '', $value) ?? '';

        // Normalize German umlauts
        $map = [
            'ä' => 'ae',
            'ö' => 'oe',
            'ü' => 'ue',
            'ß' => 'ss',
        ];

        return strtr($value, $map);
    }
}
|
||||
42
src/Knowledge/Retrieval/CachedRetriever.php
Normal file
42
src/Knowledge/Retrieval/CachedRetriever.php
Normal file
@@ -0,0 +1,42 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use Psr\Cache\CacheItemPoolInterface;
|
||||
|
||||
/**
 * Retriever decorator that stores the inner retriever's results in a PSR-6
 * cache pool, keyed by the normalized prompt and the limit.
 */
final class CachedRetriever implements RetrieverInterface
{
    public function __construct(
        private RetrieverInterface $inner,
        private CacheItemPoolInterface $cache,
        private int $ttlSeconds = 600 // 10 minutes
    ) {}

    /**
     * Returns the cached result on a hit; on a miss delegates to the inner
     * retriever, stores the result for $ttlSeconds and returns it.
     */
    public function retrieve(string $prompt, int $limit = 3): array
    {
        $cacheItem = $this->cache->getItem($this->buildCacheKey($prompt, $limit));

        if (!$cacheItem->isHit()) {
            $chunks = $this->inner->retrieve($prompt, $limit);

            $cacheItem->set($chunks);
            $cacheItem->expiresAfter($this->ttlSeconds);
            $this->cache->save($cacheItem);

            return $chunks;
        }

        return $cacheItem->get();
    }

    /**
     * Builds the cache key from the trimmed, lowercased prompt with collapsed
     * whitespace plus the limit, hashed with SHA-1.
     */
    private function buildCacheKey(string $prompt, int $limit): string
    {
        $collapsed = preg_replace('/\s+/u', ' ', mb_strtolower(trim($prompt)));

        return 'rag_retrieval_' . sha1($collapsed . '|' . $limit);
    }
}
|
||||
25
src/Knowledge/Retrieval/ChunkIndexLoader.php
Normal file
25
src/Knowledge/Retrieval/ChunkIndexLoader.php
Normal file
@@ -0,0 +1,25 @@
|
||||
<?php
|
||||
// src/Knowledge/Retrieval/ChunkIndexLoader.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
/**
 * Read-only access to the JSON chunk index.
 */
final class ChunkIndexLoader
{
    public function __construct(
        private string $indexPath
    ) {}

    /**
     * Returns the decoded index, or [] when the file is missing, empty,
     * unreadable, or does not decode to an array.
     */
    public function load(): array
    {
        if (is_file($this->indexPath)) {
            $raw = file_get_contents($this->indexPath);

            if ($raw) {
                $decoded = json_decode($raw, true);

                if (is_array($decoded)) {
                    return $decoded;
                }
            }
        }

        return [];
    }
}
|
||||
269
src/Knowledge/Retrieval/ChunkKeywordRetriever.php
Normal file
269
src/Knowledge/Retrieval/ChunkKeywordRetriever.php
Normal file
@@ -0,0 +1,269 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Knowledge\StopWords;
|
||||
use App\Knowledge\VectorSearchChunked;
|
||||
use App\Knowledge\KeywordSimilarity;
|
||||
use App\Vector\VectorSearchClient;
|
||||
|
||||
/**
 * Retrieves knowledge chunks in three stages: keyword lookup in the chunk
 * index, vector search when the keyword stage yields too few chunks, and a
 * plain text scan as last resort.
 */
final class ChunkKeywordRetriever implements RetrieverInterface
{
    private const MAX_KEYWORD_CANDIDATES = 200;
    private const VECTOR_SCORE_THRESHOLD = 0.65;
    private const VECTOR_TOP_K = 3;

    /** Minimum KeywordSimilarity score for a term/keyword pair to count as a match. */
    private const MIN_KEYWORD_SIMILARITY = 0.8;

    public function __construct(
        private VectorSearchChunked $chunkedSearch,
        private ChunkIndexLoader $indexLoader,
        private StopWords $stopWords,
        private VectorSearchClient $vectorClient,
        private string $chunksDir,
        private int $maxChunks = 3,
    ) {
    }

    /**
     * {@inheritdoc}
     *
     * @param int|null $limit maximum number of chunks; null uses $maxChunks
     */
    public function retrieve(string $prompt, ?int $limit = null): array
    {
        $limit ??= $this->maxChunks;

        // ---------------------------------------------------------
        // 1) Prompt → search terms
        // ---------------------------------------------------------
        $queryTerms = $this->extractTerms($prompt);

        // ---------------------------------------------------------
        // 2) Keyword-based candidate discovery
        // ---------------------------------------------------------
        $result = $queryTerms !== []
            ? $this->findCandidateFiles($queryTerms)
            : ['files' => [], 'canonicalTerms' => []];

        $candidateScores = array_slice(
            $result['files'],
            0,
            self::MAX_KEYWORD_CANDIDATES,
            true
        );

        // Canonical replacement: a fuzzy-matched query term is swapped for the
        // index keyword it matched, so the substring scoring below can hit.
        $effectiveTerms = array_map(
            static fn (string $term): string =>
                $result['canonicalTerms'][$term] ?? $term,
            $queryTerms
        );

        // ---------------------------------------------------------
        // 3) Keyword scoring
        // ---------------------------------------------------------
        $scored = [];

        foreach ($candidateScores as $file => $similarityScore) {
            $path = $this->chunksDir . '/' . $file;
            if (!is_file($path)) {
                continue;
            }

            $chunk = file_get_contents($path);
            if ($chunk === false || $chunk === '') {
                continue;
            }

            $score = $this->scoreChunk($chunk, $effectiveTerms);
            if ($score === 0) {
                continue;
            }

            $scored[$file] = [
                'chunk' => trim($chunk),
                'score' => (int) round($score * $similarityScore),
            ];
        }

        // ---------------------------------------------------------
        // EARLY EXIT: keyword results are sufficient
        // ---------------------------------------------------------
        if (\count($scored) >= $limit) {
            return $this->finalize($scored, $limit);
        }

        // ---------------------------------------------------------
        // 4) Vector retrieval (semantic fallback)
        // ---------------------------------------------------------
        $vectorHits = $this->vectorClient->search($prompt, self::VECTOR_TOP_K);

        foreach ($vectorHits as $hit) {
            if (
                !isset($hit['chunk_id'], $hit['score']) ||
                $hit['score'] < self::VECTOR_SCORE_THRESHOLD
            ) {
                continue;
            }

            $file = $hit['chunk_id'] . '.txt';
            $path = $this->chunksDir . '/' . $file;

            if (!is_file($path)) {
                continue;
            }

            $baseScore = $scored[$file]['score'] ?? 0;

            $vectorBoost = (int) round($hit['score'] * 10);

            if ($vectorBoost <= 0) {
                continue;
            }

            $chunk = $scored[$file]['chunk']
                ?? trim((string) file_get_contents($path));

            // An unreadable or empty chunk file carries no knowledge.
            if ($chunk === '') {
                continue;
            }

            $scored[$file] = [
                'chunk' => $chunk,
                'score' => $baseScore + $vectorBoost,
            ];
        }

        // ---------------------------------------------------------
        // 5) Final fallback
        // ---------------------------------------------------------
        if ($scored === []) {
            return $this->fallbackSearch($prompt, $limit);
        }

        return $this->finalize($scored, $limit);
    }

    // -------------------------------------------------------------
    // FINALIZATION
    // -------------------------------------------------------------

    /**
     * Orders chunks by descending score, removes duplicates and applies the limit.
     *
     * @param array<string, array{chunk: string, score: int}> $scored
     * @return string[]
     */
    private function finalize(array $scored, int $limit): array
    {
        uasort($scored, static fn (array $a, array $b): int => $b['score'] <=> $a['score']);

        return array_slice(
            $this->normalizeResults(
                array_column($scored, 'chunk')
            ),
            0,
            $limit
        );
    }

    // -------------------------------------------------------------
    // INDEX LOGIC
    // -------------------------------------------------------------

    /**
     * Matches every query term against every keyword of every index entry.
     *
     * Previously the scan stopped at the first matching term of an entry
     * (`break 2`), so later terms never received a canonical keyword and the
     * file score was not the true maximum. Now every term is evaluated.
     *
     * @param string[] $terms
     * @return array{files: array<string, float>, canonicalTerms: array<string, string>}
     *         "files" maps chunk file => best similarity found;
     *         "canonicalTerms" maps query term => best-matching index keyword
     */
    private function findCandidateFiles(array $terms): array
    {
        $index = $this->indexLoader->load();
        $files = [];
        $canonicalTerms = [];
        $canonicalScores = [];

        foreach ($index as $entry) {
            if (!isset($entry['file'], $entry['keywords']) || !is_array($entry['keywords'])) {
                continue;
            }

            foreach ($terms as $term) {
                foreach ($entry['keywords'] as $indexKeyword) {
                    if (!is_string($indexKeyword)) {
                        continue;
                    }

                    $score = KeywordSimilarity::compare($term, $indexKeyword);
                    if ($score < self::MIN_KEYWORD_SIMILARITY) {
                        continue;
                    }

                    $files[$entry['file']] = max(
                        $files[$entry['file']] ?? 0.0,
                        $score
                    );

                    // Keep the closest keyword seen so far for this term.
                    if ($score > ($canonicalScores[$term] ?? 0.0)) {
                        $canonicalScores[$term] = $score;
                        $canonicalTerms[$term] = $indexKeyword;
                    }

                    // An exact match cannot be improved within this entry.
                    if ($score >= 1.0) {
                        break;
                    }
                }
            }
        }

        return [
            'files' => $files,
            'canonicalTerms' => $canonicalTerms,
        ];
    }

    // -------------------------------------------------------------
    // FALLBACK
    // -------------------------------------------------------------

    /**
     * Last resort: plain text scan via VectorSearchChunked.
     *
     * @param int|null $limit maximum number of chunks; null uses $maxChunks
     * @return string[]
     */
    private function fallbackSearch(string $prompt, ?int $limit = null): array
    {
        $chunkedText = trim($this->chunkedSearch->searchAsText($prompt));
        if ($chunkedText === '') {
            return [];
        }

        return array_slice(
            $this->normalizeResults($this->splitChunks($chunkedText)),
            0,
            $limit ?? $this->maxChunks
        );
    }

    // -------------------------------------------------------------
    // SCORING
    // -------------------------------------------------------------

    /**
     * Counts the non-stop-word terms contained in the chunk (case-insensitive
     * substring match); terms of 10+ characters count double.
     *
     * @param string[] $terms
     */
    private function scoreChunk(string $chunk, array $terms): int
    {
        $content = mb_strtolower($chunk);
        // Fetched once per chunk instead of once per term.
        $stopWords = $this->stopWords->getStopWords();
        $score = 0;

        foreach ($terms as $term) {
            if (
                !\in_array($term, $stopWords, true) &&
                str_contains($content, $term)
            ) {
                $score += mb_strlen($term) >= 10 ? 2 : 1;
            }
        }

        return $score;
    }

    // -------------------------------------------------------------
    // UTIL
    // -------------------------------------------------------------

    /**
     * Lowercases the text, strips punctuation and returns all words longer
     * than two characters. Splits on any whitespace run, so words separated
     * by newlines or tabs are no longer glued together.
     *
     * @return string[]
     */
    private function extractTerms(string $text): array
    {
        // preg_replace() returns null on invalid UTF-8; treat that as "no terms".
        $clean = preg_replace('/[^\p{L}\p{N}\s]/u', '', $text) ?? '';
        $words = preg_split('/\s+/u', mb_strtolower($clean), -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return array_values(array_filter(
            $words,
            static fn (string $w): bool => mb_strlen($w) > 2
        ));
    }

    /**
     * Splits text on blank lines into trimmed, non-empty pieces.
     *
     * @return string[]
     */
    private function splitChunks(string $text): array
    {
        return array_values(array_filter(
            array_map('trim', explode("\n\n", $text)),
            static fn (string $chunk): bool => $chunk !== ''
        ));
    }

    /**
     * Removes chunks that equal an earlier chunk when compared
     * case-insensitively with whitespace runs collapsed.
     *
     * @param string[] $chunks
     * @return string[]
     */
    private function normalizeResults(array $chunks): array
    {
        $seen = [];
        $out = [];

        foreach ($chunks as $chunk) {
            $key = mb_strtolower(preg_replace('/\s+/u', ' ', $chunk) ?? $chunk);
            if (!isset($seen[$key])) {
                $seen[$key] = true;
                $out[] = $chunk;
            }
        }

        return $out;
    }
}
|
||||
11
src/Knowledge/Retrieval/RetrieverInterface.php
Normal file
11
src/Knowledge/Retrieval/RetrieverInterface.php
Normal file
@@ -0,0 +1,11 @@
|
||||
<?php
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
interface RetrieverInterface
|
||||
{
|
||||
/**
 * Retrieves knowledge chunks for a prompt.
 *
 * @param string $prompt Prompt text to retrieve knowledge for
 * @param int    $limit  Requested number of chunks; the implementations
 *                       visible alongside this interface treat it as an
 *                       upper bound on the returned list
 *
 * @return string[] Plain text knowledge chunks
 */
|
||||
public function retrieve(string $prompt, int $limit = 3): array;
|
||||
}
|
||||
1863
src/Knowledge/StopWords.php
Normal file
1863
src/Knowledge/StopWords.php
Normal file
File diff suppressed because it is too large
Load Diff
121
src/Knowledge/VectorSearchChunked.php
Normal file
121
src/Knowledge/VectorSearchChunked.php
Normal file
@@ -0,0 +1,121 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge;
|
||||
|
||||
use Psr\Log\LoggerInterface;
|
||||
|
||||
/**
 * VectorSearchChunked
 *
 * Chunk-based retrieval service for long-form knowledge documents.
 * This is a lightweight, deterministic runtime reader for
 * precomputed knowledge chunks.
 *
 * Design principles:
 * - No runtime indexing
 * - No ML dependencies
 * - Deterministic and fast
 * - Hard limits to protect prompt size
 *
 * This service is intentionally simple and can later be replaced
 * by a real vector database without changing the AgentRunner.
 */
final class VectorSearchChunked
{
    /**
     * Directory containing chunked knowledge files. Starts as a path relative
     * to the project directory and is made absolute in the constructor.
     */
    private string $dataDir = 'var/knowledge/chunks';

    /**
     * Maximum number of chunks to return.
     */
    private int $maxChunks = 3;

    public function __construct(
        private string $projectDir,
    ) {
        // rtrim() avoids a doubled slash when the project dir ends with "/".
        $this->dataDir = rtrim($this->projectDir, '/') . '/' . $this->dataDir;
    }

    /**
     * Returns concatenated relevant chunks as plain text.
     *
     * Scans the *.txt files of the data directory in glob() order and
     * collects up to $maxChunks files that contain at least one prompt
     * keyword (case-insensitive substring match). The trimmed file contents
     * are joined with a blank line.
     *
     * @param string $prompt
     * @return string '' when the directory is missing, the prompt has no
     *                keyword of 4+ characters, or nothing matches
     */
    public function searchAsText(string $prompt): string
    {
        if (!is_dir($this->dataDir)) {
            return '';
        }

        $keywords = $this->extractKeywords(mb_strtolower($prompt));

        if ($keywords === []) {
            return '';
        }

        // glob() returns false on error; treat that as "no files".
        $files = glob($this->dataDir . '/*.txt') ?: [];

        $matches = [];

        foreach ($files as $file) {
            $content = file_get_contents($file);
            if ($content === false) {
                continue;
            }

            if ($this->matchesKeywords(mb_strtolower($content), $keywords)) {
                $matches[] = trim($content);
            }

            if (count($matches) >= $this->maxChunks) {
                break;
            }
        }

        return implode("\n\n", $matches);
    }

    /**
     * Extracts simple keywords from the prompt: distinct words of at least
     * four characters, in order of first appearance.
     *
     * This is a lightweight heuristic replacement for
     * full vector or embedding-based search.
     *
     * @return string[]
     */
    private function extractKeywords(string $prompt): array
    {
        $words = preg_split('/\W+/u', $prompt, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            return [];
        }

        $keywords = [];
        foreach ($words as $word) {
            if (mb_strlen($word) >= 4) {
                $keywords[] = $word;
            }
        }

        return array_values(array_unique($keywords));
    }

    /**
     * Checks whether the content matches at least one keyword.
     *
     * @param string[] $keywords
     */
    private function matchesKeywords(string $content, array $keywords): bool
    {
        foreach ($keywords as $keyword) {
            if (str_contains($content, $keyword)) {
                return true;
            }
        }

        return false;
    }
}
|
||||
Reference in New Issue
Block a user