first commit

2026-02-11 14:15:08 +01:00
parent a4742c2c38
commit aa7d362bc3
58 changed files with 9999 additions and 0 deletions
--- a/src/Knowledge/Ingest/ChunkIndexWriter.php
+++ b/src/Knowledge/Ingest/ChunkIndexWriter.php
@@ -0,0 +1,58 @@
+<?php
+// src/Knowledge/Ingest/ChunkIndexWriter.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Ingest;
+
+/**
+ * Maintains the JSON chunk index: a flat list of entries, one per written chunk.
+ *
+ * The index file is re-read and re-written completely on every add().
+ */
+final class ChunkIndexWriter
+{
+    public function __construct(
+        private string $indexPath
+    ) {}
+
+    /**
+     * Appends one entry to the index file.
+     *
+     * @param array $entry Index entry, stored as given
+     *
+     * @throws \RuntimeException If the index cannot be encoded or written
+     */
+    public function add(array $entry): void
+    {
+        $index = $this->load();
+        $index[] = $entry;
+        $this->save($index);
+    }
+
+    /**
+     * Reads the index from disk.
+     *
+     * A missing, unreadable, empty or non-array index is treated as an empty index.
+     */
+    private function load(): array
+    {
+        if (!is_file($this->indexPath)) {
+            return [];
+        }
+
+        $json = file_get_contents($this->indexPath);
+        $data = $json ? json_decode($json, true) : null;
+
+        return is_array($data) ? $data : [];
+    }
+
+    /**
+     * Persists the complete index.
+     *
+     * The payload is encoded before the file is touched, so an encoding failure
+     * (for example invalid UTF-8 in an entry) can no longer truncate an existing index.
+     *
+     * @throws \RuntimeException If the directory cannot be created, or the index
+     *                           cannot be encoded or written
+     */
+    private function save(array $index): void
+    {
+        $dir = dirname($this->indexPath);
+
+        // The trailing is_dir() covers the case where the directory appears
+        // between the first check and the mkdir() call.
+        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
+            throw new \RuntimeException("Could not create index directory: {$dir}");
+        }
+
+        try {
+            $json = json_encode(
+                $index,
+                JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
+            );
+        } catch (\JsonException $e) {
+            throw new \RuntimeException(
+                "Could not encode chunk index: {$e->getMessage()}",
+                previous: $e
+            );
+        }
+
+        // LOCK_EX keeps other writers out while this write is in progress.
+        if (file_put_contents($this->indexPath, $json, LOCK_EX) === false) {
+            throw new \RuntimeException("Could not write chunk index: {$this->indexPath}");
+        }
+    }
+
+    /**
+     * Tells whether the index already contains an entry for the given source
+     * name with exactly the given content hash.
+     */
+    public function hasSourceHash(string $source, string $hash): bool
+    {
+        foreach ($this->load() as $entry) {
+            if (
+                ($entry['source'] ?? null) === $source &&
+                ($entry['sourceHash'] ?? null) === $hash
+            ) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
--- a/src/Knowledge/Ingest/ChunkWriter.php
+++ b/src/Knowledge/Ingest/ChunkWriter.php
@@ -0,0 +1,149 @@
+<?php
+// src/Knowledge/Ingest/ChunkWriter.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Ingest;
+
+
+use App\Knowledge\StopWords;
+
+/**
+ * Writes text chunks to disk and records each one in the manifest file
+ * and in the keyword index.
+ */
+final class ChunkWriter
+{
+    /** Words shorter than this are never used as keywords. */
+    private const MIN_KEYWORD_LENGTH = 4;
+
+    /** Maximum number of distinct keywords stored per chunk. */
+    private const MAX_KEYWORDS = 25;
+
+    public function __construct(
+        private string           $chunksDir,
+        private string           $manifestPath,
+        private ChunkIndexWriter $indexWriter,
+        private StopWords        $stopWords,
+    )
+    {
+    }
+
+    /**
+     * Writes every chunk to its own text file (prefixed with a source header),
+     * adds an index entry per chunk and finally saves the extended manifest.
+     *
+     * @param string[] $chunks
+     * @return string[] written filenames
+     *
+     * @throws \RuntimeException If a directory, a chunk file or the manifest cannot be written.
+     *                           Chunks written before the failure stay on disk and in the index.
+     */
+    public function write(string $sourceName, array $chunks, string $sourceHash): array
+    {
+        $this->ensureDirectory($this->chunksDir);
+
+        $manifest = $this->loadManifest();
+        $written = [];
+
+        $base = $this->safeBase($sourceName);
+        $ts = date('Ymd_His');
+
+        foreach ($chunks as $i => $chunk) {
+            $filename = "{$base}__{$ts}__" . str_pad((string)$i, 4, '0', STR_PAD_LEFT) . ".txt";
+            $path = rtrim($this->chunksDir, '/') . '/' . $filename;
+
+            $header = $this->buildHeader(
+                source: $sourceName,
+                index: $i
+            );
+
+            // Fail before indexing: an index entry must never point to a file
+            // that was not written.
+            if (file_put_contents($path, $header . "\n\n" . $chunk) === false) {
+                throw new \RuntimeException("Could not write chunk file: {$path}");
+            }
+
+            $written[] = $filename;
+            $chars = mb_strlen($chunk);
+
+            $manifest[] = [
+                'file' => $filename,
+                'source' => $sourceName,
+                'index' => $i,
+                'chars' => $chars,
+                'createdAt' => date('c'),
+            ];
+
+            $this->indexWriter->add([
+                'file' => $filename,
+                'source' => $sourceName,
+                'sourceHash' => $sourceHash,
+                'keywords' => $this->extractKeywords($chunk),
+                'chars' => $chars,
+            ]);
+        }
+
+        $this->saveManifest($manifest);
+        return $written;
+    }
+
+    /**
+     * Creates the directory (recursively) when it does not exist yet.
+     *
+     * @throws \RuntimeException If the directory cannot be created
+     */
+    private function ensureDirectory(string $dir): void
+    {
+        // The trailing is_dir() covers the case where the directory appears
+        // between the first check and the mkdir() call.
+        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
+            throw new \RuntimeException("Could not create directory: {$dir}");
+        }
+    }
+
+    /**
+     * Derives a filename-safe base from the source name: extension removed,
+     * lowercased, every run of characters outside a-z, 0-9, "-" and "_"
+     * replaced by a single "-", outer dashes trimmed.
+     */
+    private function safeBase(string $name): string
+    {
+        $name = pathinfo($name, PATHINFO_FILENAME);
+        $name = mb_strtolower($name);
+        $name = preg_replace('/[^a-z0-9\-_]+/u', '-', $name);
+        return trim((string)$name, '-');
+    }
+
+    /**
+     * Reads the manifest; a missing, unreadable or non-array manifest counts as empty.
+     */
+    private function loadManifest(): array
+    {
+        if (!is_file($this->manifestPath)) {
+            return [];
+        }
+        $json = file_get_contents($this->manifestPath);
+        $data = $json ? json_decode($json, true) : null;
+        return is_array($data) ? $data : [];
+    }
+
+    /**
+     * Persists the complete manifest.
+     *
+     * @throws \RuntimeException If the manifest cannot be encoded or written
+     */
+    private function saveManifest(array $manifest): void
+    {
+        $this->ensureDirectory(dirname($this->manifestPath));
+
+        try {
+            // Encode first so that an encoding failure cannot truncate the file.
+            $json = json_encode(
+                $manifest,
+                JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
+            );
+        } catch (\JsonException $e) {
+            throw new \RuntimeException(
+                "Could not encode manifest: {$e->getMessage()}",
+                previous: $e
+            );
+        }
+
+        if (file_put_contents($this->manifestPath, $json) === false) {
+            throw new \RuntimeException("Could not write manifest: {$this->manifestPath}");
+        }
+    }
+
+    /**
+     * Builds the header line placed at the top of each chunk file.
+     * The chunk number shown is one-based.
+     */
+    private function buildHeader(string $source, int $index): string
+    {
+        return sprintf(
+            '[Quelle: %s | Abschnitt: Chunk %d]',
+            $source,
+            $index + 1
+        );
+    }
+
+    /**
+     * Extracts up to MAX_KEYWORDS distinct keywords from the chunk text,
+     * in order of first appearance.
+     *
+     * Duplicates are dropped before the cap is applied, so repeated words
+     * no longer use up keyword slots.
+     *
+     * @return string[]
+     */
+    private function extractKeywords(string $text): array
+    {
+        // 1) Lowercase
+        $text = mb_strtolower($text);
+
+        // 2) Remove URLs so their fragments do not become keywords
+        $text = (string) preg_replace('#https?://\S+#u', ' ', $text);
+
+        // 3) Newlines and tabs become spaces
+        $text = str_replace(["\r", "\n", "\t"], ' ', $text);
+
+        // 4) Separators become spaces (not removed, so joined words are split)
+        $text = (string) preg_replace('/[\/\.\,\:\;\-\_\(\)\[\]\{\}]/u', ' ', $text);
+
+        // 5) Drop everything that is not a letter, digit or whitespace
+        $text = (string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text);
+
+        // 6) Normalize whitespace
+        $text = trim((string) preg_replace('/\s+/u', ' ', $text));
+
+        // Fetch the stop-word list once instead of once per word.
+        $stopWords = $this->stopWords->getStopWords() ?? [];
+
+        // 7) Filter and de-duplicate, stopping once the cap is reached
+        $keywords = [];
+
+        foreach (explode(' ', $text) as $word) {
+            if (mb_strlen($word) < self::MIN_KEYWORD_LENGTH) {
+                continue;
+            }
+            if (in_array($word, $keywords, true) || in_array($word, $stopWords, true)) {
+                continue;
+            }
+
+            $keywords[] = $word;
+
+            if (count($keywords) >= self::MAX_KEYWORDS) {
+                break;
+            }
+        }
+
+        return $keywords;
+    }
+}
--- a/src/Knowledge/Ingest/DocumentLoader.php
+++ b/src/Knowledge/Ingest/DocumentLoader.php
@@ -0,0 +1,37 @@
+<?php
+// src/Knowledge/Ingest/DocumentLoader.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Ingest;
+
+/**
+ * Reads source documents from disk as plain text.
+ */
+final class DocumentLoader
+{
+    /** Extensions that are read verbatim as text. */
+    private const TEXT_EXTENSIONS = ['txt', 'md'];
+
+    /**
+     * Returns the content of the file at $path.
+     *
+     * The extension is matched case-insensitively. Other formats
+     * (pdf, docx) are planned but not implemented yet.
+     *
+     * @throws \RuntimeException If the file does not exist, has an unsupported
+     *                           extension, or cannot be read
+     */
+    public function load(string $path): string
+    {
+        if (!is_file($path)) {
+            throw new \RuntimeException("File not found: {$path}");
+        }
+
+        $extension = mb_strtolower(pathinfo($path, PATHINFO_EXTENSION));
+
+        if (!in_array($extension, self::TEXT_EXTENSIONS, true)) {
+            throw new \RuntimeException("Unsupported file type: .{$extension}");
+        }
+
+        return $this->loadText($path);
+    }
+
+    /**
+     * Reads a text file completely.
+     *
+     * @throws \RuntimeException If reading fails
+     */
+    private function loadText(string $path): string
+    {
+        $content = file_get_contents($path);
+
+        return $content !== false
+            ? $content
+            : throw new \RuntimeException("Could not read file: {$path}");
+    }
+}
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -0,0 +1,39 @@
+<?php
+// src/Knowledge/Ingest/KnowledgeIngestService.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Ingest;
+
+/**
+ * Ingests one document: load, optionally tidy whitespace, chunk and write.
+ */
+final class KnowledgeIngestService
+{
+    public function __construct(
+        private DocumentLoader   $loader,
+        private SimpleChunker    $chunker,
+        private ChunkWriter      $writer,
+        private ChunkIndexWriter $indexWriter,
+    )
+    {
+    }
+
+    /**
+     * Ingests the file at $path.
+     *
+     * With $optimize, runs of three or more newlines are collapsed to two and
+     * trailing spaces/tabs are stripped from every line before hashing.
+     * When the index already holds this file name with the same content hash,
+     * nothing is written and an empty list is returned.
+     *
+     * @return string[] written chunk filenames
+     */
+    public function ingestFile(string $path, bool $optimize = false): array
+    {
+        $content = $this->loader->load($path);
+
+        if ($optimize) {
+            // Patterns are applied in order: blank-line collapse, then trailing whitespace.
+            $content = preg_replace(
+                ["/\n{3,}/", "/[ \t]+$/m"],
+                ["\n\n", ""],
+                $content
+            );
+        }
+
+        $name = basename($path);
+        $hash = sha1($content);
+
+        $alreadyIngested = $this->indexWriter->hasSourceHash($name, $hash);
+        if ($alreadyIngested) {
+            return [];
+        }
+
+        return $this->writer->write($name, $this->chunker->chunk($content), $hash);
+    }
+}
--- a/src/Knowledge/Ingest/SimpleChunker.php
+++ b/src/Knowledge/Ingest/SimpleChunker.php
@@ -0,0 +1,146 @@
+<?php
+// src/Knowledge/Ingest/SimpleChunker.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Ingest;
+
+/**
+ * Splits text into overlapping, word-limited chunks and prefers to cut at
+ * paragraph or sentence boundaries.
+ */
+final class SimpleChunker
+{
+    /**
+     * @param int $maxWords     Maximum number of words per chunk (at least 1)
+     * @param int $overlapWords Words repeated at the start of the next chunk
+     *                          (0 or more, and fewer than $maxWords)
+     *
+     * @throws \InvalidArgumentException For values that would stop the chunking
+     *                                   loop from ever advancing
+     */
+    public function __construct(
+        private int $maxWords = 180,
+        private int $overlapWords = 30
+    ) {
+        if ($maxWords < 1) {
+            throw new \InvalidArgumentException('maxWords must be at least 1.');
+        }
+
+        if ($overlapWords < 0 || $overlapWords >= $maxWords) {
+            throw new \InvalidArgumentException(
+                'overlapWords must be zero or greater and smaller than maxWords.'
+            );
+        }
+    }
+
+    /**
+     * Chunks the text.
+     *
+     * Every window except the last may be shortened to a natural boundary. The next
+     * window then starts $overlapWords before the actual cut, so no words are
+     * skipped. The last window is never shortened, because its tail would be lost.
+     *
+     * @return string[]
+     */
+    public function chunk(string $text): array
+    {
+        $text = $this->normalize($text);
+        if ($text === '') {
+            return [];
+        }
+
+        // Split into tokens: words + whitespace preserved
+        $tokens = preg_split(
+            '/(\s+)/u',
+            $text,
+            -1,
+            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
+        );
+
+        if (!$tokens) {
+            return [];
+        }
+
+        // Build word index → token index mapping
+        $wordTokenIndexes = [];
+        foreach ($tokens as $i => $token) {
+            if (!preg_match('/^\s+$/u', $token)) {
+                $wordTokenIndexes[] = $i;
+            }
+        }
+
+        $totalWords = count($wordTokenIndexes);
+        if ($totalWords === 0) {
+            return [];
+        }
+
+        $chunks = [];
+        $wordPos = 0;
+
+        while ($wordPos < $totalWords) {
+            $wordEnd = min($wordPos + $this->maxWords, $totalWords);
+            $isLastWindow = $wordEnd >= $totalWords;
+
+            $tokenStart = $wordTokenIndexes[$wordPos];
+            $tokenEnd   = $wordTokenIndexes[$wordEnd - 1] + 1;
+
+            if (!$isLastWindow) {
+                // The cut must stay behind this token so the chunk keeps more than
+                // $overlapWords words; otherwise the next window could not advance.
+                $floor = $wordTokenIndexes[$wordPos + $this->overlapWords];
+
+                $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd, $floor);
+
+                // Re-derive the word count from the actual cut position.
+                while ($wordTokenIndexes[$wordEnd - 1] >= $tokenEnd) {
+                    $wordEnd--;
+                }
+            }
+
+            $chunk = trim(implode('', array_slice(
+                $tokens,
+                $tokenStart,
+                $tokenEnd - $tokenStart
+            )));
+
+            if ($chunk !== '') {
+                $chunks[] = $chunk;
+            }
+
+            if ($isLastWindow) {
+                break;
+            }
+
+            // $wordEnd > $wordPos + $overlapWords holds here, so this always advances.
+            $wordPos = $wordEnd - $this->overlapWords;
+        }
+
+        return $this->dedupe($chunks);
+    }
+
+    /**
+     * Unifies line endings, collapses runs of spaces/tabs to one space and
+     * runs of three or more newlines to two, then trims.
+     */
+    private function normalize(string $text): string
+    {
+        $text = str_replace(["\r\n", "\r"], "\n", $text);
+        $text = preg_replace("/[ \t]+/u", " ", $text);
+        $text = preg_replace("/\n{3,}/u", "\n\n", (string) $text);
+
+        return trim((string) $text);
+    }
+
+    /**
+     * Move cut backwards to a natural boundary if possible.
+     * Rules:
+     * - A window that starts on a markdown list marker ("-") is not shortened
+     * - Sentence end only if followed by a line break
+     * - Paragraph breaks always allowed
+     *
+     * @param string[] $tokens Word and whitespace tokens
+     * @param int      $start  Token index of the first word of the window
+     * @param int      $end    Exclusive token index of the unadjusted cut
+     * @param int      $floor  Token index that must stay inside the chunk
+     *
+     * @return int Exclusive token index of the cut
+     */
+    private function adjustCutToBoundary(array $tokens, int $start, int $end, int $floor): int
+    {
+        // Tokens never contain whitespace, so a list item "- Foo" starts with
+        // the bare marker token "-".
+        if (($tokens[$start] ?? '') === '-') {
+            return $end;
+        }
+
+        for ($i = $end - 1; $i > $floor; $i--) {
+
+            // Paragraph boundary
+            if ($tokens[$i] === "\n\n") {
+                return $i + 1;
+            }
+
+            // Sentence boundary only if followed by newline
+            if (
+                preg_match('/[.!?]\s*$/u', $tokens[$i]) &&
+                isset($tokens[$i + 1]) &&
+                str_contains($tokens[$i + 1], "\n")
+            ) {
+                return $i + 1;
+            }
+        }
+
+        return $end;
+    }
+
+    /**
+     * Drops chunks that equal an earlier chunk when compared case-insensitively
+     * with whitespace collapsed; the first occurrence is kept.
+     *
+     * @param string[] $chunks
+     * @return string[]
+     */
+    private function dedupe(array $chunks): array
+    {
+        $seen = [];
+        $out  = [];
+
+        foreach ($chunks as $chunk) {
+            $key = mb_strtolower(
+                (string) preg_replace('/\s+/u', ' ', trim($chunk))
+            );
+
+            if (isset($seen[$key])) {
+                continue;
+            }
+
+            $seen[$key] = true;
+            $out[] = $chunk;
+        }
+
+        return $out;
+    }
+}
--- a/src/Knowledge/KeywordMapper.php
+++ b/src/Knowledge/KeywordMapper.php
@@ -0,0 +1,35 @@
+<?php
+
+declare(strict_types=1);
+
+namespace App\Knowledge;
+
+/**
+ * KeywordMapper
+ *
+ * Replaces a prompt that consists of exactly one known short keyword with a
+ * richer comma-separated list of related terms, for use by retrieval or
+ * embedding pipelines. Any other prompt passes through untouched.
+ *
+ * This is a direct port of prompt_mapping.py.
+ */
+final class KeywordMapper
+{
+    /** Lowercase keyword => expanded variant. */
+    private const EXPANSIONS = [
+        'ki'      => 'künstliche Intelligenz, AI, Projekte, Modelle, Agenten, ki',
+        'shop'    => 'Shopware, Onlineshop, Webshop, Commerce-System',
+        'shops'   => 'Shopware, Webshops, Verkaufsplattformen',
+        'agentur' => 'Agentur, Firma, Unternehmen, mitho media',
+        'api'     => 'Schnittstelle, API, Anbindung, Integration',
+        'plugin'  => 'Shopware Plugin, Erweiterung, Modul, Funktion',
+    ];
+
+    /**
+     * Returns the expansion for the trimmed, lowercased prompt when one
+     * exists; otherwise returns the prompt exactly as given.
+     */
+    public function map(string $prompt): string
+    {
+        $lookup = mb_strtolower(trim($prompt));
+
+        if (array_key_exists($lookup, self::EXPANSIONS)) {
+            return self::EXPANSIONS[$lookup];
+        }
+
+        return $prompt;
+    }
+}
--- a/src/Knowledge/KeywordSimilarity.php
+++ b/src/Knowledge/KeywordSimilarity.php
@@ -0,0 +1,87 @@
+<?php
+
+declare(strict_types=1);
+
+namespace App\Knowledge;
+
+/**
+ * KeywordSimilarity
+ *
+ * Deterministic and fault-tolerant comparison of two keywords.
+ * Returns a similarity score between 0.0 and 1.0.
+ *
+ * Design goals:
+ * - index.json remains unchanged
+ * - comparison logic is intelligent (typos, phonetics)
+ * - no alias or synonym lists
+ * - no LLM dependency
+ */
+final class KeywordSimilarity
+{
+    /**
+     * Compare a query token with an index keyword.
+     *
+     * Scores: 1.0 exact match after normalization, 0.9 / 0.8 for an edit
+     * distance of 1 / 2 (both words at least 6 characters), 0.85 for an equal
+     * phonetic key, 0.0 otherwise. The phonetic check runs before the edit
+     * distance check.
+     *
+     * @param string $queryToken   Token from user input
+     * @param string $indexKeyword Keyword from index.json
+     *
+     * @return float Similarity score (0.0 – 1.0)
+     */
+    public static function compare(string $queryToken, string $indexKeyword): float
+    {
+        $a = self::normalize($queryToken);
+        $b = self::normalize($indexKeyword);
+
+        // Guard: ignore empty or very short tokens
+        if (mb_strlen($a) < 3 || mb_strlen($b) < 3) {
+            return 0.0;
+        }
+
+        // 1. Exact match
+        if ($a === $b) {
+            return 1.0;
+        }
+
+        // 2. Phonetic comparison (metaphone)
+        // metaphone() yields an empty key for tokens without ASCII letters
+        // (e.g. "2023"). Two empty keys carry no phonetic information and
+        // must not count as a match.
+        $phoneticA = metaphone($a);
+        if ($phoneticA !== '' && $phoneticA === metaphone($b)) {
+            return 0.85;
+        }
+
+        // 3. Edit distance comparison (only for longer words)
+        if (mb_strlen($a) >= 6 && mb_strlen($b) >= 6) {
+            $distance = levenshtein($a, $b);
+
+            if ($distance === 1) {
+                return 0.9;
+            }
+
+            if ($distance === 2) {
+                return 0.8;
+            }
+        }
+
+        // No relevant match
+        return 0.0;
+    }
+
+    /**
+     * Normalize a keyword to ensure stable comparison: trim, lowercase,
+     * strip everything except letters and digits, transliterate German
+     * umlauts and sharp s.
+     */
+    private static function normalize(string $value): string
+    {
+        $value = mb_strtolower(trim($value));
+
+        // Remove non-alphanumeric characters
+        $value = preg_replace('/[^\p{L}\p{N}]/u', '', $value) ?? '';
+
+        // Normalize German umlauts
+        $map = [
+            'ä' => 'ae',
+            'ö' => 'oe',
+            'ü' => 'ue',
+            'ß' => 'ss',
+        ];
+
+        return strtr($value, $map);
+    }
+}
--- a/src/Knowledge/Retrieval/CachedRetriever.php
+++ b/src/Knowledge/Retrieval/CachedRetriever.php
@@ -0,0 +1,42 @@
+<?php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Retrieval;
+
+use Psr\Cache\CacheItemPoolInterface;
+
+/**
+ * Retriever decorator that stores the inner retriever's results in a
+ * PSR-6 cache pool for a fixed time.
+ */
+final class CachedRetriever implements RetrieverInterface
+{
+    public function __construct(
+        private RetrieverInterface    $inner,
+        private CacheItemPoolInterface $cache,
+        private int                   $ttlSeconds = 600 // 10 minutes
+    ) {}
+
+    /**
+     * Serves the result from the cache when present; otherwise asks the
+     * inner retriever and caches its answer for $ttlSeconds.
+     */
+    public function retrieve(string $prompt, int $limit = 3): array
+    {
+        $cacheItem = $this->cache->getItem(
+            $this->buildCacheKey($prompt, $limit)
+        );
+
+        if (!$cacheItem->isHit()) {
+            $fresh = $this->inner->retrieve($prompt, $limit);
+
+            $cacheItem->set($fresh);
+            $cacheItem->expiresAfter($this->ttlSeconds);
+            $this->cache->save($cacheItem);
+
+            return $fresh;
+        }
+
+        return $cacheItem->get();
+    }
+
+    /**
+     * Derives the cache key from the trimmed, lowercased, whitespace-collapsed
+     * prompt and the limit, so prompts that differ only in case or spacing
+     * share one cache entry.
+     */
+    private function buildCacheKey(string $prompt, int $limit): string
+    {
+        $collapsed = preg_replace('/\s+/u', ' ', mb_strtolower(trim($prompt)));
+
+        return 'rag_retrieval_' . sha1("{$collapsed}|{$limit}");
+    }
+}
--- a/src/Knowledge/Retrieval/ChunkIndexLoader.php
+++ b/src/Knowledge/Retrieval/ChunkIndexLoader.php
@@ -0,0 +1,25 @@
+<?php
+// src/Knowledge/Retrieval/ChunkIndexLoader.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Retrieval;
+
+/**
+ * Read-only access to the JSON chunk index.
+ */
+final class ChunkIndexLoader
+{
+    public function __construct(
+        private string $indexPath
+    ) {}
+
+    /**
+     * Returns the decoded index.
+     *
+     * Yields an empty array when the file is missing, unreadable, empty,
+     * or does not decode to an array.
+     */
+    public function load(): array
+    {
+        $raw = is_file($this->indexPath)
+            ? file_get_contents($this->indexPath)
+            : false;
+
+        if (!$raw) {
+            return [];
+        }
+
+        $decoded = json_decode($raw, true);
+        if (!is_array($decoded)) {
+            return [];
+        }
+
+        return $decoded;
+    }
+}
--- a/src/Knowledge/Retrieval/ChunkKeywordRetriever.php
+++ b/src/Knowledge/Retrieval/ChunkKeywordRetriever.php
@@ -0,0 +1,269 @@
+<?php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Retrieval;
+
+use App\Knowledge\StopWords;
+use App\Knowledge\VectorSearchChunked;
+use App\Knowledge\KeywordSimilarity;
+use App\Vector\VectorSearchClient;
+
+/**
+ * Retrieves knowledge chunks for a prompt in three stages: keyword lookup in
+ * the chunk index, vector search when the keyword stage yields too few
+ * chunks, and a plain text scan as the last resort.
+ */
+final class ChunkKeywordRetriever implements RetrieverInterface
+{
+    private const MAX_KEYWORD_CANDIDATES = 200;
+    private const VECTOR_SCORE_THRESHOLD = 0.65;
+    private const VECTOR_TOP_K = 3;
+
+    /** Minimum KeywordSimilarity score for a query term to select an index entry. */
+    private const MIN_KEYWORD_SIMILARITY = 0.8;
+
+    public function __construct(
+        private VectorSearchChunked $chunkedSearch,
+        private ChunkIndexLoader    $indexLoader,
+        private StopWords           $stopWords,
+        private VectorSearchClient  $vectorClient,
+        private string              $chunksDir,
+        private int                 $maxChunks = 3,
+    ) {
+    }
+
+    /**
+     * {@inheritdoc}
+     *
+     * @param int|null $limit Maximum number of chunks; null uses the configured $maxChunks.
+     *                        The limit applies to every stage, including the final fallback.
+     */
+    public function retrieve(string $prompt, ?int $limit = null): array
+    {
+        $limit ??= $this->maxChunks;
+
+        // ---------------------------------------------------------
+        // 1) Prompt → search terms
+        // ---------------------------------------------------------
+        $queryTerms = $this->extractTerms($prompt);
+
+        // ---------------------------------------------------------
+        // 2) Keyword-based candidate discovery
+        // ---------------------------------------------------------
+        $result = $queryTerms !== []
+            ? $this->findCandidateFiles($queryTerms)
+            : ['files' => [], 'canonicalTerms' => []];
+
+        $candidateScores = array_slice(
+            $result['files'],
+            0,
+            self::MAX_KEYWORD_CANDIDATES,
+            true
+        );
+
+        // Canonical replacement: use the index spelling of fuzzily matched terms
+        $effectiveTerms = array_map(
+            static fn (string $term): string =>
+                $result['canonicalTerms'][$term] ?? $term,
+            $queryTerms
+        );
+
+        // ---------------------------------------------------------
+        // 3) Keyword scoring
+        // ---------------------------------------------------------
+        $scored = [];
+
+        foreach ($candidateScores as $file => $similarityScore) {
+            $path = $this->chunksDir . '/' . $file;
+            if (!is_file($path)) {
+                continue;
+            }
+
+            $chunk = file_get_contents($path);
+            if ($chunk === false || $chunk === '') {
+                continue;
+            }
+
+            $score = $this->scoreChunk($chunk, $effectiveTerms);
+            if ($score === 0) {
+                continue;
+            }
+
+            $scored[$file] = [
+                'chunk' => trim($chunk),
+                'score' => (int) round($score * $similarityScore),
+            ];
+        }
+
+        // ---------------------------------------------------------
+        // EARLY EXIT: keyword results are sufficient
+        // ---------------------------------------------------------
+        if (\count($scored) >= $limit) {
+            return $this->finalize($scored, $limit);
+        }
+
+        // ---------------------------------------------------------
+        // 4) Vector retrieval (semantic fallback)
+        // ---------------------------------------------------------
+        $vectorHits = $this->vectorClient->search($prompt, self::VECTOR_TOP_K);
+
+        foreach ($vectorHits as $hit) {
+            if (
+                !isset($hit['chunk_id'], $hit['score']) ||
+                $hit['score'] < self::VECTOR_SCORE_THRESHOLD
+            ) {
+                continue;
+            }
+
+            $file = $hit['chunk_id'] . '.txt';
+            $path = $this->chunksDir . '/' . $file;
+
+            if (!is_file($path)) {
+                continue;
+            }
+
+            $baseScore = $scored[$file]['score'] ?? 0;
+
+            $vectorBoost = (int) round($hit['score'] * 10);
+
+            // Only reachable if the threshold constant is lowered; kept as a guard.
+            if ($vectorBoost <= 0) {
+                continue;
+            }
+
+            $chunk = $scored[$file]['chunk']
+                ?? trim((string) file_get_contents($path));
+
+            // An unreadable or empty file must not become an empty result chunk.
+            if ($chunk === '') {
+                continue;
+            }
+
+            $scored[$file] = [
+                'chunk' => $chunk,
+                'score' => $baseScore + $vectorBoost,
+            ];
+        }
+
+        // ---------------------------------------------------------
+        // 5) Final fallback
+        // ---------------------------------------------------------
+        if ($scored === []) {
+            return $this->fallbackSearch($prompt, $limit);
+        }
+
+        return $this->finalize($scored, $limit);
+    }
+
+    // -------------------------------------------------------------
+    // FINALIZATION
+    // -------------------------------------------------------------
+    /**
+     * Orders chunks by descending score, removes duplicates and applies the limit.
+     *
+     * @param array<string, array{chunk: string, score: int}> $scored
+     *
+     * @return string[]
+     */
+    private function finalize(array $scored, int $limit): array
+    {
+        uasort($scored, static fn (array $a, array $b): int => $b['score'] <=> $a['score']);
+
+        return array_slice(
+            $this->normalizeResults(
+                array_column($scored, 'chunk')
+            ),
+            0,
+            $limit
+        );
+    }
+
+    // -------------------------------------------------------------
+    // INDEX LOGIC
+    // -------------------------------------------------------------
+    /**
+     * Finds index entries with at least one keyword similar to a query term.
+     *
+     * Every term is compared with every keyword of an entry. A file therefore
+     * gets its best similarity score, and each term is mapped to its
+     * best-matching index keyword, independent of the order of the terms.
+     *
+     * @param string[] $terms
+     *
+     * @return array{files: array<string, float>, canonicalTerms: array<string, string>}
+     */
+    private function findCandidateFiles(array $terms): array
+    {
+        $index = $this->indexLoader->load();
+        $files = [];
+        $canonicalTerms = [];
+        $canonicalScores = [];
+
+        foreach ($index as $entry) {
+            if (!isset($entry['file'], $entry['keywords']) || !\is_array($entry['keywords'])) {
+                continue;
+            }
+
+            foreach ($terms as $term) {
+                foreach ($entry['keywords'] as $indexKeyword) {
+                    // compare() is strictly typed; skip malformed index values.
+                    if (!\is_string($indexKeyword)) {
+                        continue;
+                    }
+
+                    $score = KeywordSimilarity::compare($term, $indexKeyword);
+                    if ($score < self::MIN_KEYWORD_SIMILARITY) {
+                        continue;
+                    }
+
+                    $files[$entry['file']] = max(
+                        $files[$entry['file']] ?? 0.0,
+                        $score
+                    );
+
+                    // Keep the best match per term; on ties the first one wins.
+                    if ($score > ($canonicalScores[$term] ?? 0.0)) {
+                        $canonicalScores[$term] = $score;
+                        $canonicalTerms[$term] = $indexKeyword;
+                    }
+                }
+            }
+        }
+
+        return [
+            'files' => $files,
+            'canonicalTerms' => $canonicalTerms,
+        ];
+    }
+
+    // -------------------------------------------------------------
+    // FALLBACK
+    // -------------------------------------------------------------
+    /**
+     * Last resort: plain text scan, split on blank lines.
+     *
+     * @return string[] At most $limit chunks
+     */
+    private function fallbackSearch(string $prompt, int $limit): array
+    {
+        $chunkedText = trim($this->chunkedSearch->searchAsText($prompt));
+        if ($chunkedText === '') {
+            return [];
+        }
+
+        return array_slice(
+            $this->normalizeResults($this->splitChunks($chunkedText)),
+            0,
+            $limit
+        );
+    }
+
+    // -------------------------------------------------------------
+    // SCORING
+    // -------------------------------------------------------------
+    /**
+     * Counts the non-stop-word terms contained in the chunk; terms of ten or
+     * more characters count double.
+     *
+     * @param string[] $terms
+     */
+    private function scoreChunk(string $chunk, array $terms): int
+    {
+        $content = mb_strtolower($chunk);
+        // Fetched once per chunk; null-safe like the keyword extraction at ingest time.
+        $stopWords = $this->stopWords->getStopWords() ?? [];
+        $score = 0;
+
+        foreach ($terms as $term) {
+            if (
+                !\in_array($term, $stopWords, true) &&
+                str_contains($content, $term)
+            ) {
+                $score += mb_strlen($term) >= 10 ? 2 : 1;
+            }
+        }
+
+        return $score;
+    }
+
+    // -------------------------------------------------------------
+    // UTIL
+    // -------------------------------------------------------------
+    /**
+     * Turns the prompt into lowercase search terms longer than two characters.
+     *
+     * Terms are split on any whitespace, so newlines and tabs separate words
+     * just like spaces do.
+     *
+     * @return string[]
+     */
+    private function extractTerms(string $text): array
+    {
+        $text = mb_strtolower(
+            (string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text)
+        );
+
+        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
+
+        return array_values(array_filter(
+            $words,
+            static fn (string $w): bool => mb_strlen($w) > 2
+        ));
+    }
+
+    /**
+     * Splits text on blank lines into trimmed, non-empty chunks.
+     *
+     * @return string[]
+     */
+    private function splitChunks(string $text): array
+    {
+        return array_values(array_filter(
+            array_map('trim', explode("\n\n", $text)),
+            static fn (string $chunk): bool => $chunk !== ''
+        ));
+    }
+
+    /**
+     * Removes chunks that repeat an earlier chunk when compared
+     * case-insensitively with whitespace collapsed; order is preserved.
+     *
+     * @param string[] $chunks
+     *
+     * @return string[]
+     */
+    private function normalizeResults(array $chunks): array
+    {
+        $seen = [];
+        $out = [];
+
+        foreach ($chunks as $chunk) {
+            $key = mb_strtolower((string) preg_replace('/\s+/u', ' ', $chunk));
+            if (!isset($seen[$key])) {
+                $seen[$key] = true;
+                $out[] = $chunk;
+            }
+        }
+
+        return $out;
+    }
+}
--- a/src/Knowledge/Retrieval/RetrieverInterface.php
+++ b/src/Knowledge/Retrieval/RetrieverInterface.php
@@ -0,0 +1,11 @@
+<?php
+
+namespace App\Knowledge\Retrieval;
+
+interface RetrieverInterface
+{
+    /**
+     * Retrieves knowledge chunks for the given prompt.
+     *
+     * @param string $prompt Prompt to find chunks for
+     * @param int    $limit  Number of chunks requested; defaults to 3
+     *
+     * @return string[]  Plain text knowledge chunks
+     */
+    public function retrieve(string $prompt, int $limit = 3): array;
+}
--- a/src/Knowledge/StopWords.php
+++ b/src/Knowledge/StopWords.php
--- a/src/Knowledge/VectorSearchChunked.php
+++ b/src/Knowledge/VectorSearchChunked.php
@@ -0,0 +1,121 @@
+<?php
+
+declare(strict_types=1);
+
+namespace App\Knowledge;
+
+use Psr\Log\LoggerInterface;
+
+/**
+ * VectorSearchChunked
+ *
+ * Chunk-based retrieval service for long-form knowledge documents.
+ * This is a lightweight, deterministic runtime reader for
+ * precomputed knowledge chunks.
+ *
+ * Design principles:
+ * - No runtime indexing
+ * - No ML dependencies
+ * - Deterministic and fast
+ * - Hard limits to protect prompt size
+ *
+ * This service is intentionally simple and can later be replaced
+ * by a real vector database without changing the AgentRunner.
+ */
+final class VectorSearchChunked
+{
+    /**
+     * Directory containing chunked knowledge files.
+     * Relative to the project directory until the constructor resolves it.
+     */
+    private string $dataDir = 'var/knowledge/chunks';
+
+    /**
+     * Maximum number of chunks to return.
+     */
+    private int $maxChunks = 3;
+
+    public function __construct(
+        private string          $projectDir,
+    )
+    {
+        $this->dataDir = $this->projectDir . '/' . $this->dataDir;
+    }
+
+    /**
+     * Returns concatenated relevant chunks as plain text.
+     *
+     * Scans the *.txt files of the data directory in glob() order and collects
+     * the first $maxChunks files that contain at least one prompt keyword
+     * (case-insensitive substring match). Chunks are joined by a blank line.
+     *
+     * @param string $prompt
+     * @return string Empty when the directory is missing or unreadable, the prompt
+     *                has no keyword of four or more characters, or nothing matches
+     */
+    public function searchAsText(string $prompt): string
+    {
+        if (!is_dir($this->dataDir)) {
+            return '';
+        }
+
+        $keywords = $this->extractKeywords(mb_strtolower($prompt));
+
+        if ($keywords === []) {
+            return '';
+        }
+
+        // glob() returns false on error; iterating that would emit a warning.
+        $files = glob($this->dataDir . '/*.txt');
+        if ($files === false) {
+            return '';
+        }
+
+        $matches = [];
+
+        foreach ($files as $file) {
+            $content = file_get_contents($file);
+            if ($content === false) {
+                continue;
+            }
+
+            if ($this->matchesKeywords(mb_strtolower($content), $keywords)) {
+                $matches[] = trim($content);
+            }
+
+            if (count($matches) >= $this->maxChunks) {
+                break;
+            }
+        }
+
+        return implode("\n\n", $matches);
+    }
+
+    /**
+     * Extracts simple keywords from the prompt: distinct words of at least
+     * four characters, split on non-word characters.
+     *
+     * This is a lightweight heuristic replacement for
+     * full vector or embedding-based search.
+     *
+     * @return string[]
+     */
+    private function extractKeywords(string $prompt): array
+    {
+        $words = preg_split('/\W+/u', $prompt, -1, PREG_SPLIT_NO_EMPTY);
+        if ($words === false) {
+            return [];
+        }
+
+        $keywords = [];
+        foreach ($words as $word) {
+            if (mb_strlen($word) >= 4) {
+                $keywords[] = $word;
+            }
+        }
+
+        return array_values(array_unique($keywords));
+    }
+
+    /**
+     * Checks whether the content matches at least one keyword.
+     *
+     * @param string[] $keywords
+     */
+    private function matchesKeywords(string $content, array $keywords): bool
+    {
+        foreach ($keywords as $keyword) {
+            if (str_contains($content, $keyword)) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+}