diff --git a/.gitignore b/.gitignore
index 48ad1c7..ad15f48 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,8 @@
/src/Vector/vector.index
/src/Vector/vector_meta.json
/var/locks
+/var/agent-history
+/var/cache
.env.local
test.*
# ---> Symfony
diff --git a/config/services.yaml b/config/services.yaml
index 2f4b633..8f933e1 100644
--- a/config/services.yaml
+++ b/config/services.yaml
@@ -1,130 +1,118 @@
-# This file is the entry point to configure your own services.
-# Files in the packages/ subdirectory configure your dependencies.
-#
-# https://symfony.com/doc/current/best_practices.html
+# ------------------------------------------------------------
+# Parameters
+# ------------------------------------------------------------
parameters:
- mto.index.chunk_size: 800
- mto.index.chunk_overlap: 100
- mto.index.embedding_model: 'nomic-embed-text'
- mto.index.embedding_dimension: 768
- mto.index.scoring_version: 1
- mto.vector.python_bin: '/var/www/html/src/Vector/.venv/bin/python'
- mto.vector.ingest_script: '/src/Vector/vector_ingest.py'
- mto.vector.timeout: 600
+ mto.index.chunk_size: 800
+ mto.index.chunk_overlap: 100
+ mto.index.embedding_model: 'nomic-embed-text'
+ mto.index.embedding_dimension: 768
+ mto.index.scoring_version: 1
+
+ mto.vector.python_bin: '/var/www/html/src/Vector/.venv/bin/python'
+ mto.vector.ingest_script: '/src/Vector/vector_ingest.py'
+ mto.vector.timeout: 600
+
+# ------------------------------------------------------------
+# Services
+# ------------------------------------------------------------
services:
- # ------------------------------------------------------------
- # Default service configuration
- # ------------------------------------------------------------
- _defaults:
- autowire: true
- autoconfigure: true
+ # ------------------------------------------------------------
+ # Default service configuration
+ # ------------------------------------------------------------
+ _defaults:
+ autowire: true
+ autoconfigure: true
- # Bind the agent-specific Monolog channel explicitly
- bind:
- Psr\Log\LoggerInterface $agentLogger: '@monolog.logger.agent'
- string $projectDir: '%kernel.project_dir%'
+ bind:
+ Psr\Log\LoggerInterface $agentLogger: '@monolog.logger.agent'
+ string $projectDir: '%kernel.project_dir%'
- # ------------------------------------------------------------
- # Automatically register all services in src/
- # ------------------------------------------------------------
- App\:
- resource: '../src/'
- exclude:
- - '../src/DependencyInjection/'
- - '../src/Entity/'
- - '../src/Kernel.php'
+ # ------------------------------------------------------------
+ # Automatically register all services in src/
+ # ------------------------------------------------------------
+ App\:
+ resource: '../src/'
+ exclude:
+ - '../src/DependencyInjection/'
+ - '../src/Entity/'
+ - '../src/Kernel.php'
- # ------------------------------------------------------------
- # AI Agent – Infrastructure
- # ------------------------------------------------------------
- App\Infrastructure\OllamaClient:
- arguments:
- $apiUrl: '%env(AI_LLM_API_URL)%'
- $model: '%env(AI_LLM_MODEL)%'
- $timeoutSeconds: '%env(int:AI_LLM_TIMEOUT)%'
+ App\Controller\:
+ resource: '../src/Controller/'
+ tags: ['controller.service_arguments']
- # ------------------------------------------------------------
- # AI Agent – Context & state
- # ------------------------------------------------------------
- App\Context\ContextService:
- arguments:
- $historyDir: '%env(AI_HISTORY_DIR)%'
- $projectDir: '%kernel.project_dir%'
+ # ------------------------------------------------------------
+ # AI Agent – Infrastructure
+ # ------------------------------------------------------------
+ App\Infrastructure\OllamaClient:
+ arguments:
+ $apiUrl: '%env(AI_LLM_API_URL)%'
+ $model: '%env(AI_LLM_MODEL)%'
+ $timeoutSeconds: '%env(int:AI_LLM_TIMEOUT)%'
- # ------------------------------------------------------------
- # AI Agent – Debug & logging flags
- # ------------------------------------------------------------
- App\Agent\AgentRunner:
- arguments:
- $debug: '%env(bool:AI_DEBUG)%'
- $logPrompt: '%env(bool:AI_LOG_PROMPT)%'
- $logContext: '%env(bool:AI_LOG_CONTEXT)%'
+ # ------------------------------------------------------------
+ # AI Agent – Context & Runner
+ # ------------------------------------------------------------
+ App\Context\ContextService:
+ arguments:
+ $historyDir: '%env(AI_HISTORY_DIR)%'
+ $projectDir: '%kernel.project_dir%'
- App\Controller\:
- resource: '../src/Controller/'
- tags: [ 'controller.service_arguments' ]
+ App\Agent\AgentRunner:
+ arguments:
+ $debug: '%env(bool:AI_DEBUG)%'
+ $logPrompt: '%env(bool:AI_LOG_PROMPT)%'
+ $logContext: '%env(bool:AI_LOG_CONTEXT)%'
- # ------------------------------------------------------------
- # AI Agent – Knowledge
- # ------------------------------------------------------------
- App\Knowledge\Retrieval\ChunkKeywordRetriever:
- arguments:
- $chunksDir: '%kernel.project_dir%/var/knowledge/chunks'
+ # ------------------------------------------------------------
+ # NDJSON Retrieval Stack (FINAL ARCHITECTURE)
+ # ------------------------------------------------------------
- App\Knowledge\Retrieval\CachedRetriever:
- arguments:
- $inner: '@App\Knowledge\Retrieval\ChunkKeywordRetriever'
- $cache: '@cache.app'
- $ttlSeconds: 600
+ App\Knowledge\Retrieval\NdjsonChunkLookup: ~
- App\Knowledge\Retrieval\RetrieverInterface:
- alias: App\Knowledge\Retrieval\CachedRetriever
+ App\Knowledge\Retrieval\NdjsonKeywordSearch: ~
- App\Knowledge\Ingest\ChunkWriter:
- arguments:
- $chunksDir: '%kernel.project_dir%/var/knowledge/chunks'
- $manifestPath: '%kernel.project_dir%/var/knowledge/manifest.json'
+ App\Knowledge\Retrieval\NdjsonHybridRetriever:
+ arguments:
+ $maxChunks: 3
+ $vectorTopK: 5
- App\Knowledge\Ingest\ChunkIndexWriter:
- arguments:
- $indexPath: '%kernel.project_dir%/var/knowledge/index.json'
+ App\Knowledge\Retrieval\CachedRetriever:
+ arguments:
+ $inner: '@App\Knowledge\Retrieval\NdjsonHybridRetriever'
+ $cache: '@cache.app'
+ $ttlSeconds: 600
- App\Knowledge\Retrieval\ChunkIndexLoader:
- arguments:
- $indexPath: '%kernel.project_dir%/var/knowledge/index.json'
+ App\Knowledge\Retrieval\RetrieverInterface:
+ alias: App\Knowledge\Retrieval\CachedRetriever
- App\Command\KnowledgeIngestCommand:
- arguments:
- $uploadsDir: '%kernel.project_dir%/var/knowledge/uploads'
+ # ------------------------------------------------------------
+ # Vector Search (FAISS NDJSON-based)
+ # ------------------------------------------------------------
- App\Vector\VectorSearchClient:
- arguments:
- $vectorDir: '%kernel.project_dir%/src/Vector'
+ App\Vector\VectorSearchClient:
+ arguments:
+ $vectorDir: '%kernel.project_dir%/var/knowledge'
- App\Command\VectorIngestCommand:
- arguments:
- $vectorDir: '%kernel.project_dir%/src/Vector'
- $projectDir: '%kernel.project_dir%'
+ App\Vector\VectorIndexBuilder:
+ arguments:
+ $pythonBin: '%mto.vector.python_bin%'
+ $relativeScriptPath: '%mto.vector.ingest_script%'
+ $timeoutSeconds: '%mto.vector.timeout%'
- App\Command\VectorInstallCommand:
- arguments:
- $vectorDir: '%kernel.project_dir%/src/Vector'
+ # ------------------------------------------------------------
+ # Index Configuration (Guardrails)
+ # ------------------------------------------------------------
- App\Index\IndexConfiguration:
- arguments:
- $chunkSize: '%mto.index.chunk_size%'
- $chunkOverlap: '%mto.index.chunk_overlap%'
- $embeddingModel: '%mto.index.embedding_model%'
- $embeddingDimension: '%mto.index.embedding_dimension%'
- $scoringVersion: '%mto.index.scoring_version%'
- $indexFormat: 'ndjson'
- $vectorBackend: 'faiss'
-
- App\Vector\VectorIndexBuilder:
- arguments:
- $pythonBin: '%mto.vector.python_bin%'
- $relativeScriptPath: '%mto.vector.ingest_script%'
- $timeoutSeconds: '%mto.vector.timeout%'
\ No newline at end of file
+ App\Index\IndexConfiguration:
+ arguments:
+ $chunkSize: '%mto.index.chunk_size%'
+ $chunkOverlap: '%mto.index.chunk_overlap%'
+ $embeddingModel: '%mto.index.embedding_model%'
+ $embeddingDimension: '%mto.index.embedding_dimension%'
+ $scoringVersion: '%mto.index.scoring_version%'
+ $indexFormat: 'ndjson'
+ $vectorBackend: 'faiss'
diff --git a/public/assets/js/base.js b/public/assets/js/base.js
index 645fc16..0619e89 100644
--- a/public/assets/js/base.js
+++ b/public/assets/js/base.js
@@ -113,12 +113,16 @@ document.addEventListener('DOMContentLoaded', () => {
const text = dataLines.join('\n');
if (text === '[DONE]') {
- // Finales Rendering mit Normalisierung
+
+ // 🔥 Final render flush
if (renderTimer) {
clearTimeout(renderTimer);
renderTimer = null;
}
+
+ bubble.innerHTML = renderMarkdown(raw);
chatEl.scrollTop = chatEl.scrollHeight;
+
abort = true;
return;
}
diff --git a/src/Command/KnowledgeIngestCommand.php b/src/Command/KnowledgeIngestCommand.php
index 081c0a6..25d9e65 100644
--- a/src/Command/KnowledgeIngestCommand.php
+++ b/src/Command/KnowledgeIngestCommand.php
@@ -1,28 +1,25 @@
addArgument(
- 'file',
- InputArgument::OPTIONAL,
- 'Path to a single .txt/.md file'
- )
- ->addOption(
- 'all',
- null,
- InputOption::VALUE_NONE,
- 'Ingest all .md files from the uploads directory'
- )
- ->addOption(
- 'optimize',
- 'o',
- InputOption::VALUE_NONE,
- 'Optimize chunks for retrieval quality'
- );
+ ->addArgument('versionId', InputArgument::REQUIRED, 'UUID of DocumentVersion')
+ ->addArgument('userId', InputArgument::REQUIRED, 'UUID of user triggering ingest');
}
protected function execute(InputInterface $input, OutputInterface $output): int
{
- $files = [];
- $optimize = (bool) $input->getOption('optimize');
+ $versionId = $input->getArgument('versionId');
+ $userId = $input->getArgument('userId');
- if ($input->getOption('all')) {
- if (!is_dir($this->uploadsDir)) {
- $output->writeln('❌ uploads directory not found');
- return Command::FAILURE;
- }
+ $version = $this->em->getRepository(DocumentVersion::class)->find($versionId);
+ $user = $this->em->getRepository(User::class)->find($userId);
- $finder = new Finder();
- $finder
- ->files()
- ->in($this->uploadsDir)
- ->name('*.md');
-
- if (!$finder->hasResults()) {
- $output->writeln('ℹ️ No .md files found in uploads/');
- return Command::SUCCESS;
- }
-
- foreach ($finder as $file) {
- $files[] = $file->getRealPath();
- }
-
- $output->writeln(sprintf(
- '📂 Ingesting %d markdown files from uploads (%s)',
- count($files),
- $optimize ? 'optimized' : 'standard'
- ));
- } else {
- $file = $input->getArgument('file');
-
- if (!$file) {
- $output->writeln('❌ Either provide a file or use --all');
- return Command::FAILURE;
- }
-
- $files[] = (string) $file;
+ if (!$version || !$user) {
+ $output->writeln('Version or User not found.');
+ return Command::FAILURE;
}
- $totalWritten = 0;
+ $output->writeln('Starting ingest...');
- foreach ($files as $filePath) {
- $output->writeln('➡️ Ingesting: ' . $filePath);
+ $this->ingestFlow->ingestDocumentVersion($version, $user);
- $written = $this->ingest->ingestFile(
- $filePath,
- optimize: $optimize
- );
-
- $totalWritten += count($written);
-
- foreach ($written as $chunk) {
- $output->writeln(' - ' . $chunk);
- }
- }
-
- $output->writeln('');
- $output->writeln('✅ Total written chunks: ' . $totalWritten);
+ $output->writeln('Ingest completed.');
return Command::SUCCESS;
}
diff --git a/src/Command/VectorIngestCommand.php b/src/Command/VectorIngestCommand.php
index 823b6ad..5c84759 100644
--- a/src/Command/VectorIngestCommand.php
+++ b/src/Command/VectorIngestCommand.php
@@ -4,86 +4,27 @@ declare(strict_types=1);
namespace App\Command;
+use App\Vector\VectorIndexBuilder;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
-#[AsCommand(
- name: 'mto:agent:vector:ingest',
- description: 'Builds the FAISS vector index from index.json'
-)]
-final class VectorIngestCommand extends Command
+#[AsCommand(name: 'mto:agent:vector:rebuild')]
+class VectorIngestCommand extends Command
{
public function __construct(
- private readonly string $vectorDir,
- private readonly string $projectDir
+ private readonly VectorIndexBuilder $builder
) {
parent::__construct();
}
protected function execute(InputInterface $input, OutputInterface $output): int
{
- $vectorDir = rtrim($this->vectorDir, '/');
+ $output->writeln('Rebuilding vector index...');
+ $this->builder->rebuildFromNdjson();
+ $output->writeln('Done.');
- if (!is_dir($vectorDir)) {
- $output->writeln('Vector directory not found');
- return Command::FAILURE;
- }
-
- $script = $vectorDir . '/vector_ingest.py';
-
- if (!is_file($script)) {
- $output->writeln('vector_ingest.py not found');
- return Command::FAILURE;
- }
-
- // -------------------------------------------------
- // Enforce venv usage
- // -------------------------------------------------
- $venvPython = $vectorDir . '/.venv/bin/python';
-
- if (!is_file($venvPython)) {
- $output->writeln('No Python virtual environment found.');
- $output->writeln('Run first:');
- $output->writeln(' php bin/console mto:agent:vector:install');
- return Command::FAILURE;
- }
-
- $knowledgeDir = rtrim($this->projectDir, '/') . '/var/knowledge';
-
- if (!is_dir($knowledgeDir)) {
- $output->writeln('Knowledge directory not found:');
- $output->writeln($knowledgeDir);
- return Command::FAILURE;
- }
-
- $output->writeln('Building FAISS vector index…');
- $output->writeln(sprintf(
- 'Vector dir: %s',
- $vectorDir
- ));
- $output->writeln(sprintf(
- 'Knowledge dir: %s',
- $knowledgeDir
- ));
-
- $cmd = sprintf(
- '%s %s %s %s 2>&1',
- escapeshellarg($venvPython),
- escapeshellarg($script),
- escapeshellarg($vectorDir),
- escapeshellarg($knowledgeDir)
- );
-
- exec($cmd, $out, $exitCode);
-
- foreach ($out as $line) {
- $output->writeln($line);
- }
-
- return $exitCode === 0
- ? Command::SUCCESS
- : Command::FAILURE;
+ return Command::SUCCESS;
}
}
diff --git a/src/Command/VectorInstallCommand.php b/src/Command/VectorInstallCommand.php
deleted file mode 100644
index 6b17ba1..0000000
--- a/src/Command/VectorInstallCommand.php
+++ /dev/null
@@ -1,114 +0,0 @@
-vectorDir)) {
- $output->writeln('Vector directory not found');
- return Command::FAILURE;
- }
-
- $vectorDir = rtrim($this->vectorDir, '/');
- $venvDir = $vectorDir . '/.venv';
- $venvPython = $venvDir . '/bin/python';
-
- // -------------------------------------------------
- // 1) Create venv if missing
- // -------------------------------------------------
- if (!is_dir($venvDir)) {
- $output->writeln('Creating Python virtual environment…');
-
- $cmd = sprintf(
- 'python3 -m venv %s 2>&1',
- escapeshellarg($venvDir)
- );
-
- exec($cmd, $out, $exitCode);
-
- foreach ($out as $line) {
- $output->writeln($line);
- }
-
- if ($exitCode !== 0 || !is_file($venvPython)) {
- $output->writeln('');
- $output->writeln('Failed to create Python virtual environment.');
- $output->writeln('Ensure that python3-venv is installed on the system.');
- return Command::FAILURE;
- }
- } else {
- $output->writeln('Using existing Python virtual environment');
- }
-
- // -------------------------------------------------
- // 2) Ensure pip exists inside venv
- // -------------------------------------------------
- $cmd = sprintf(
- '%s -m pip --version 2>&1',
- escapeshellarg($venvPython)
- );
-
- exec($cmd, $out, $exitCode);
-
- if ($exitCode !== 0) {
- $output->writeln('');
- $output->writeln('The existing virtual environment has no pip.');
- $output->writeln('This usually means it was created before python3-pip was installed.');
- $output->writeln('Fix:');
- $output->writeln(sprintf(' rm -rf %s', $venvDir));
- $output->writeln(' php bin/console mto:agent:vector:install');
- return Command::FAILURE;
- }
-
- // -------------------------------------------------
- // 3) Install / update dependencies
- // -------------------------------------------------
- $output->writeln('Installing vector dependencies…');
-
- $cmd = sprintf(
- '%s -m pip install --upgrade faiss-cpu sentence-transformers 2>&1',
- escapeshellarg($venvPython)
- );
-
- exec($cmd, $out, $exitCode);
-
- foreach ($out as $line) {
- $output->writeln($line);
- }
-
- if ($exitCode !== 0) {
- $output->writeln('Dependency installation failed');
- return Command::FAILURE;
- }
-
- $output->writeln('');
- $output->writeln('Vector dependencies installed successfully');
- $output->writeln(sprintf('venv: %s', $venvDir));
-
- return Command::SUCCESS;
- }
-}
diff --git a/src/Knowledge/Retrieval/NdjsonChunkLookup.php b/src/Knowledge/Retrieval/NdjsonChunkLookup.php
new file mode 100644
index 0000000..7fd8ae4
--- /dev/null
+++ b/src/Knowledge/Retrieval/NdjsonChunkLookup.php
@@ -0,0 +1,44 @@
+> keyed by chunk_id
+ */
+    public function findByChunkIds(array $chunkIds): array
+    {
+        // Nothing requested: skip the scan. Without this guard no row can ever
+        // match, so the early-exit check below is never reached and the whole
+        // stream is read just to return an empty result.
+        if ($chunkIds === []) {
+            return [];
+        }
+
+        // Flip the id list into a set; duplicates collapse, so its size is the
+        // number of distinct ids we are looking for.
+        $wanted = array_fill_keys($chunkIds, true);
+        $found = [];
+
+        foreach ($this->chunkManager->streamAll() as $row) {
+            $id = $row['chunk_id'] ?? null;
+            if (!is_string($id) || !isset($wanted[$id])) {
+                continue;
+            }
+
+            // A later row with the same chunk_id overwrites an earlier one.
+            $found[$id] = $row;
+
+            // Stop streaming as soon as every requested id has been found.
+            if (\count($found) === \count($wanted)) {
+                break;
+            }
+        }
+
+        return $found;
+    }
+}
diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
new file mode 100644
index 0000000..83e15c5
--- /dev/null
+++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php
@@ -0,0 +1,99 @@
+maxChunks;
+
+ $terms = $this->extractTerms($prompt);
+
+ // 1) Keyword first
+ $keywordChunks = $this->keywordSearch->search($terms, $limit);
+ if (\count($keywordChunks) >= $limit) {
+ return array_slice($keywordChunks, 0, $limit);
+ }
+
+ // 2) Vector fallback / enrichment
+ $hits = $this->vectorClient->search($prompt, $this->vectorTopK);
+ if ($hits === []) {
+ return $keywordChunks;
+ }
+
+ $chunkIds = [];
+ foreach ($hits as $hit) {
+ if (!isset($hit['chunk_id'], $hit['score'])) {
+ continue;
+ }
+ if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
+ continue;
+ }
+ $chunkIds[] = (string)$hit['chunk_id'];
+ }
+
+ if ($chunkIds === []) {
+ return $keywordChunks;
+ }
+
+ $rows = $this->lookup->findByChunkIds($chunkIds);
+
+ foreach ($chunkIds as $id) {
+ if (!isset($rows[$id]['text']) || !is_string($rows[$id]['text'])) {
+ continue;
+ }
+ $keywordChunks[] = trim($rows[$id]['text']);
+ }
+
+ // dedupe + limit
+ $seen = [];
+ $out = [];
+
+ foreach ($keywordChunks as $chunk) {
+ $key = mb_strtolower(preg_replace('/\s+/u', ' ', $chunk));
+ if (isset($seen[$key])) {
+ continue;
+ }
+ $seen[$key] = true;
+ $out[] = $chunk;
+ if (\count($out) >= $limit) {
+ break;
+ }
+ }
+
+ return $out;
+ }
+
+    /**
+     * Minimal term extraction for the keyword search.
+     *
+     * Removes every character that is not a letter, digit or whitespace,
+     * lowercases the result, splits it on whitespace and keeps only terms
+     * longer than two characters.
+     *
+     * @return string[]
+     */
+    private function extractTerms(string $text): array
+    {
+        $text = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));
+
+        // Split on any whitespace run, not just a single space: the filter
+        // above keeps newlines and tabs, and they must separate terms too.
+        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
+        if ($words === false) {
+            return [];
+        }
+
+        return array_values(array_filter(
+            $words,
+            static fn(string $w) => mb_strlen($w) > 2
+        ));
+    }
+}
diff --git a/src/Knowledge/Retrieval/NdjsonKeywordSearch.php b/src/Knowledge/Retrieval/NdjsonKeywordSearch.php
new file mode 100644
index 0000000..e15c91d
--- /dev/null
+++ b/src/Knowledge/Retrieval/NdjsonKeywordSearch.php
@@ -0,0 +1,101 @@
+stopWords->getStopWords(), true);
+ }));
+
+ if ($terms === []) {
+ return [];
+ }
+
+ // bounded candidate list of ['score' => int, 'text' => string] rows; sorted and truncated to $candidateCap when it grows past the cap
+ $best = [];
+
+ foreach ($this->chunkManager->streamAll() as $row) {
+ $text = $row['text'] ?? null;
+ if (!is_string($text) || $text === '') {
+ continue;
+ }
+
+ $score = $this->scoreText($text, $terms);
+ if ($score <= 0) {
+ continue;
+ }
+
+ $best[] = ['score' => $score, 'text' => trim($text)];
+
+ // keep array bounded to avoid memory spikes
+ if (\count($best) > $candidateCap) {
+ usort($best, fn($a, $b) => $b['score'] <=> $a['score']);
+ $best = array_slice($best, 0, $candidateCap);
+ }
+ }
+
+ if ($best === []) {
+ return [];
+ }
+
+ usort($best, fn($a, $b) => $b['score'] <=> $a['score']);
+
+ $out = [];
+ $seen = [];
+
+ foreach ($best as $row) {
+ $key = mb_strtolower(preg_replace('/\s+/u', ' ', $row['text']));
+ if (isset($seen[$key])) {
+ continue;
+ }
+ $seen[$key] = true;
+ $out[] = $row['text'];
+
+ if (\count($out) >= $limit) {
+ break;
+ }
+ }
+
+ return $out;
+ }
+
+    /**
+     * Scores a chunk against the search terms.
+     *
+     * The text is lowercased once; every non-empty term found in it as a
+     * substring adds 1 point, or 2 points when the term is at least ten
+     * characters long. Returns 0 when nothing matches.
+     */
+    private function scoreText(string $text, array $terms): int
+    {
+        $haystack = mb_strtolower($text);
+
+        return array_reduce(
+            $terms,
+            static function (int $total, $needle) use ($haystack): int {
+                // Empty terms and terms absent from the text contribute nothing.
+                if ($needle === '' || !str_contains($haystack, $needle)) {
+                    return $total;
+                }
+
+                return $total + (mb_strlen($needle) >= 10 ? 2 : 1);
+            },
+            0
+        );
+    }
+}
diff --git a/src/Knowledge/VectorSearchChunked.php b/src/Knowledge/VectorSearchChunked.php
deleted file mode 100644
index a94a956..0000000
--- a/src/Knowledge/VectorSearchChunked.php
+++ /dev/null
@@ -1,121 +0,0 @@
-dataDir = $this->projectDir . '/' . $this->dataDir;
- }
- /**
- * Returns concatenated relevant chunks as plain text.
- *
- * @param string $prompt
- * @return string
- */
- public function searchAsText(string $prompt): string
- {
-
- if (!is_dir($this->dataDir)) {
- return '';
- }
-
- $promptLower = mb_strtolower($prompt);
- $keywords = $this->extractKeywords($promptLower);
-
- if ($keywords === []) {
- return '';
- }
-
- $matches = [];
-
- foreach (glob($this->dataDir . '/*.txt') as $file) {
- $content = file_get_contents($file);
- if ($content === false) {
- continue;
- }
-
- $contentLower = mb_strtolower($content);
-
- if ($this->matchesKeywords($contentLower, $keywords)) {
- $matches[] = trim($content);
- }
-
- if (count($matches) >= $this->maxChunks) {
- break;
- }
- }
-
- return implode("\n\n", $matches);
- }
-
- /**
- * Extracts simple keywords from the prompt.
- *
- * This is a lightweight heuristic replacement for
- * full vector or embedding-based search.
- */
- private function extractKeywords(string $prompt): array
- {
- $words = preg_split('/\W+/u', $prompt, -1, PREG_SPLIT_NO_EMPTY);
- if ($words === false) {
- return [];
- }
-
- $keywords = [];
- foreach ($words as $word) {
- if (mb_strlen($word) >= 4) {
- $keywords[] = $word;
- }
- }
-
- return array_values(array_unique($keywords));
- }
-
- /**
- * Checks whether the content matches at least one keyword.
- */
- private function matchesKeywords(string $content, array $keywords): bool
- {
- foreach ($keywords as $keyword) {
- if (str_contains($content, $keyword)) {
- return true;
- }
- }
-
- return false;
- }
-}