From 5a52e07edc056af1bad5f571ce51259760905df5 Mon Sep 17 00:00:00 2001 From: team 1 Date: Thu, 12 Feb 2026 11:22:56 +0100 Subject: [PATCH] new version ndjson --- .gitignore | 2 + config/services.yaml | 204 +++++++++--------- public/assets/js/base.js | 6 +- src/Command/KnowledgeIngestCommand.php | 101 ++------- src/Command/VectorIngestCommand.php | 75 +------ src/Command/VectorInstallCommand.php | 114 ---------- src/Knowledge/Retrieval/NdjsonChunkLookup.php | 44 ++++ .../Retrieval/NdjsonHybridRetriever.php | 99 +++++++++ .../Retrieval/NdjsonKeywordSearch.php | 101 +++++++++ src/Knowledge/VectorSearchChunked.php | 121 ----------- 10 files changed, 375 insertions(+), 492 deletions(-) delete mode 100644 src/Command/VectorInstallCommand.php create mode 100644 src/Knowledge/Retrieval/NdjsonChunkLookup.php create mode 100644 src/Knowledge/Retrieval/NdjsonHybridRetriever.php create mode 100644 src/Knowledge/Retrieval/NdjsonKeywordSearch.php delete mode 100644 src/Knowledge/VectorSearchChunked.php diff --git a/.gitignore b/.gitignore index 48ad1c7..ad15f48 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ /src/Vector/vector.index /src/Vector/vector_meta.json /var/locks +/var/agent-history +/var/cache .env.local test.* # ---> Symfony diff --git a/config/services.yaml b/config/services.yaml index 2f4b633..8f933e1 100644 --- a/config/services.yaml +++ b/config/services.yaml @@ -1,130 +1,118 @@ -# This file is the entry point to configure your own services. -# Files in the packages/ subdirectory configure your dependencies. -# -# https://symfony.com/doc/current/best_practices.html +# ------------------------------------------------------------ +# Parameters +# ------------------------------------------------------------ parameters: - mto.index.chunk_size: 800 - mto.index.chunk_overlap: 100 - mto.index.embedding_model: 'nomic-embed-text' - mto.index.embedding_dimension: 768 - mto.index.scoring_version: 1 - mto.vector.python_bin: '/var/www/html/src/Vector/.venv/bin/python' - mto.vector.ingest_script: '/src/Vector/vector_ingest.py' - mto.vector.timeout: 600 + mto.index.chunk_size: 800 + mto.index.chunk_overlap: 100 + mto.index.embedding_model: 'nomic-embed-text' + mto.index.embedding_dimension: 768 + mto.index.scoring_version: 1 + + mto.vector.python_bin: '/var/www/html/src/Vector/.venv/bin/python' + mto.vector.ingest_script: '/src/Vector/vector_ingest.py' + mto.vector.timeout: 600 + +# ------------------------------------------------------------ +# Services +# ------------------------------------------------------------ services: - # ------------------------------------------------------------ - # Default service configuration - # ------------------------------------------------------------ - _defaults: - autowire: true - autoconfigure: true + # ------------------------------------------------------------ + # Default service configuration + # ------------------------------------------------------------ + _defaults: + autowire: true + autoconfigure: true - # Bind the agent-specific Monolog channel explicitly - bind: - Psr\Log\LoggerInterface $agentLogger: '@monolog.logger.agent' - string $projectDir: '%kernel.project_dir%' + bind: + Psr\Log\LoggerInterface $agentLogger: '@monolog.logger.agent' + string $projectDir: '%kernel.project_dir%' - # ------------------------------------------------------------ - # Automatically register all services in src/ - # ------------------------------------------------------------ - App\: - resource: '../src/' - exclude: - - '../src/DependencyInjection/' - - '../src/Entity/' - - '../src/Kernel.php' + # ------------------------------------------------------------ + # Automatically register all services in src/ + # ------------------------------------------------------------ + App\: + resource: '../src/' + exclude: + - '../src/DependencyInjection/' + - '../src/Entity/' + - '../src/Kernel.php' - # ------------------------------------------------------------ - # AI Agent – Infrastructure - # ------------------------------------------------------------ - App\Infrastructure\OllamaClient: - arguments: - $apiUrl: '%env(AI_LLM_API_URL)%' - $model: '%env(AI_LLM_MODEL)%' - $timeoutSeconds: '%env(int:AI_LLM_TIMEOUT)%' + App\Controller\: + resource: '../src/Controller/' + tags: ['controller.service_arguments'] - # ------------------------------------------------------------ - # AI Agent – Context & state - # ------------------------------------------------------------ - App\Context\ContextService: - arguments: - $historyDir: '%env(AI_HISTORY_DIR)%' - $projectDir: '%kernel.project_dir%' + # ------------------------------------------------------------ + # AI Agent – Infrastructure + # ------------------------------------------------------------ + App\Infrastructure\OllamaClient: + arguments: + $apiUrl: '%env(AI_LLM_API_URL)%' + $model: '%env(AI_LLM_MODEL)%' + $timeoutSeconds: '%env(int:AI_LLM_TIMEOUT)%' - # ------------------------------------------------------------ - # AI Agent – Debug & logging flags - # ------------------------------------------------------------ - App\Agent\AgentRunner: - arguments: - $debug: '%env(bool:AI_DEBUG)%' - $logPrompt: '%env(bool:AI_LOG_PROMPT)%' - $logContext: '%env(bool:AI_LOG_CONTEXT)%' + # ------------------------------------------------------------ + # AI Agent – Context & Runner + # ------------------------------------------------------------ + App\Context\ContextService: + arguments: + $historyDir: '%env(AI_HISTORY_DIR)%' + $projectDir: '%kernel.project_dir%' - App\Controller\: - resource: '../src/Controller/' - tags: [ 'controller.service_arguments' ] + App\Agent\AgentRunner: + arguments: + $debug: '%env(bool:AI_DEBUG)%' + $logPrompt: '%env(bool:AI_LOG_PROMPT)%' + $logContext: '%env(bool:AI_LOG_CONTEXT)%' - # ------------------------------------------------------------ - # AI Agent – Knowledge - # ------------------------------------------------------------ - App\Knowledge\Retrieval\ChunkKeywordRetriever: - arguments: - $chunksDir: '%kernel.project_dir%/var/knowledge/chunks' + # ------------------------------------------------------------ + # NDJSON Retrieval Stack (FINAL ARCHITECTURE) + # ------------------------------------------------------------ - App\Knowledge\Retrieval\CachedRetriever: - arguments: - $inner: '@App\Knowledge\Retrieval\ChunkKeywordRetriever' - $cache: '@cache.app' - $ttlSeconds: 600 + App\Knowledge\Retrieval\NdjsonChunkLookup: ~ - App\Knowledge\Retrieval\RetrieverInterface: - alias: App\Knowledge\Retrieval\CachedRetriever + App\Knowledge\Retrieval\NdjsonKeywordSearch: ~ - App\Knowledge\Ingest\ChunkWriter: - arguments: - $chunksDir: '%kernel.project_dir%/var/knowledge/chunks' - $manifestPath: '%kernel.project_dir%/var/knowledge/manifest.json' + App\Knowledge\Retrieval\NdjsonHybridRetriever: + arguments: + $maxChunks: 3 + $vectorTopK: 5 - App\Knowledge\Ingest\ChunkIndexWriter: - arguments: - $indexPath: '%kernel.project_dir%/var/knowledge/index.json' + App\Knowledge\Retrieval\CachedRetriever: + arguments: + $inner: '@App\Knowledge\Retrieval\NdjsonHybridRetriever' + $cache: '@cache.app' + $ttlSeconds: 600 - App\Knowledge\Retrieval\ChunkIndexLoader: - arguments: - $indexPath: '%kernel.project_dir%/var/knowledge/index.json' + App\Knowledge\Retrieval\RetrieverInterface: + alias: App\Knowledge\Retrieval\CachedRetriever - App\Command\KnowledgeIngestCommand: - arguments: - $uploadsDir: '%kernel.project_dir%/var/knowledge/uploads' + # ------------------------------------------------------------ + # Vector Search (FAISS NDJSON-based) + # ------------------------------------------------------------ - App\Vector\VectorSearchClient: - arguments: - $vectorDir: '%kernel.project_dir%/src/Vector' + App\Vector\VectorSearchClient: + arguments: + $vectorDir: '%kernel.project_dir%/var/knowledge' - App\Command\VectorIngestCommand: - arguments: - $vectorDir: '%kernel.project_dir%/src/Vector' - $projectDir: '%kernel.project_dir%' + App\Vector\VectorIndexBuilder: + arguments: + $pythonBin: '%mto.vector.python_bin%' + $relativeScriptPath: '%mto.vector.ingest_script%' + $timeoutSeconds: '%mto.vector.timeout%' - App\Command\VectorInstallCommand: - arguments: - $vectorDir: '%kernel.project_dir%/src/Vector' + # ------------------------------------------------------------ + # Index Configuration (Guardrails) + # ------------------------------------------------------------ - App\Index\IndexConfiguration: - arguments: - $chunkSize: '%mto.index.chunk_size%' - $chunkOverlap: '%mto.index.chunk_overlap%' - $embeddingModel: '%mto.index.embedding_model%' - $embeddingDimension: '%mto.index.embedding_dimension%' - $scoringVersion: '%mto.index.scoring_version%' - $indexFormat: 'ndjson' - $vectorBackend: 'faiss' - - App\Vector\VectorIndexBuilder: - arguments: - $pythonBin: '%mto.vector.python_bin%' - $relativeScriptPath: '%mto.vector.ingest_script%' - $timeoutSeconds: '%mto.vector.timeout%' \ No newline at end of file + App\Index\IndexConfiguration: + arguments: + $chunkSize: '%mto.index.chunk_size%' + $chunkOverlap: '%mto.index.chunk_overlap%' + $embeddingModel: '%mto.index.embedding_model%' + $embeddingDimension: '%mto.index.embedding_dimension%' + $scoringVersion: '%mto.index.scoring_version%' + $indexFormat: 'ndjson' + $vectorBackend: 'faiss' diff --git a/public/assets/js/base.js b/public/assets/js/base.js index 645fc16..0619e89 100644 --- a/public/assets/js/base.js +++ b/public/assets/js/base.js @@ -113,12 +113,16 @@ document.addEventListener('DOMContentLoaded', () => { const text = dataLines.join('\n'); if (text === '[DONE]') { - // Finales Rendering mit Normalisierung + + // 🔥 Final render flush if (renderTimer) { clearTimeout(renderTimer); renderTimer = null; } + + bubble.innerHTML = renderMarkdown(raw); chatEl.scrollTop = chatEl.scrollHeight; + abort = true; return; } diff --git a/src/Command/KnowledgeIngestCommand.php b/src/Command/KnowledgeIngestCommand.php index 081c0a6..25d9e65 100644 --- a/src/Command/KnowledgeIngestCommand.php +++ b/src/Command/KnowledgeIngestCommand.php @@ -1,28 +1,25 @@ addArgument( - 'file', - InputArgument::OPTIONAL, - 'Path to a single .txt/.md file' - ) - ->addOption( - 'all', - null, - InputOption::VALUE_NONE, - 'Ingest all .md files from the uploads directory' - ) - ->addOption( - 'optimize', - 'o', - InputOption::VALUE_NONE, - 'Optimize chunks for retrieval quality' - ); + ->addArgument('versionId', InputArgument::REQUIRED, 'UUID of DocumentVersion') + ->addArgument('userId', InputArgument::REQUIRED, 'UUID of user triggering ingest'); } protected function execute(InputInterface $input, OutputInterface $output): int { - $files = []; - $optimize = (bool) $input->getOption('optimize'); + $versionId = $input->getArgument('versionId'); + $userId = $input->getArgument('userId'); - if ($input->getOption('all')) { - if (!is_dir($this->uploadsDir)) { - $output->writeln('❌ uploads directory not found'); - return Command::FAILURE; - } + $version = $this->em->getRepository(DocumentVersion::class)->find($versionId); + $user = $this->em->getRepository(User::class)->find($userId); - $finder = new Finder(); - $finder - ->files() - ->in($this->uploadsDir) - ->name('*.md'); - - if (!$finder->hasResults()) { - $output->writeln('ℹ️ No .md files found in uploads/'); - return Command::SUCCESS; - } - - foreach ($finder as $file) { - $files[] = $file->getRealPath(); - } - - $output->writeln(sprintf( - '📂 Ingesting %d markdown files from uploads (%s)', - count($files), - $optimize ? 'optimized' : 'standard' - )); - } else { - $file = $input->getArgument('file'); - - if (!$file) { - $output->writeln('❌ Either provide a file or use --all'); - return Command::FAILURE; - } - - $files[] = (string) $file; + if (!$version || !$user) { + $output->writeln('Version or User not found.'); + return Command::FAILURE; } - $totalWritten = 0; + $output->writeln('Starting ingest...'); - foreach ($files as $filePath) { - $output->writeln('➡️ Ingesting: ' . $filePath); + $this->ingestFlow->ingestDocumentVersion($version, $user); - $written = $this->ingest->ingestFile( - $filePath, - optimize: $optimize - ); - - $totalWritten += count($written); - - foreach ($written as $chunk) { - $output->writeln(' - ' . $chunk); - } - } - - $output->writeln(''); - $output->writeln('✅ Total written chunks: ' . $totalWritten); + $output->writeln('Ingest completed.'); return Command::SUCCESS; } diff --git a/src/Command/VectorIngestCommand.php b/src/Command/VectorIngestCommand.php index 823b6ad..5c84759 100644 --- a/src/Command/VectorIngestCommand.php +++ b/src/Command/VectorIngestCommand.php @@ -4,86 +4,27 @@ declare(strict_types=1); namespace App\Command; +use App\Vector\VectorIndexBuilder; use Symfony\Component\Console\Attribute\AsCommand; use Symfony\Component\Console\Command\Command; use Symfony\Component\Console\Input\InputInterface; use Symfony\Component\Console\Output\OutputInterface; -#[AsCommand( - name: 'mto:agent:vector:ingest', - description: 'Builds the FAISS vector index from index.json' -)] -final class VectorIngestCommand extends Command +#[AsCommand(name: 'mto:agent:vector:rebuild')] +class VectorIngestCommand extends Command { public function __construct( - private readonly string $vectorDir, - private readonly string $projectDir + private readonly VectorIndexBuilder $builder ) { parent::__construct(); } protected function execute(InputInterface $input, OutputInterface $output): int { - $vectorDir = rtrim($this->vectorDir, '/'); + $output->writeln('Rebuilding vector index...'); + $this->builder->rebuildFromNdjson(); + $output->writeln('Done.'); - if (!is_dir($vectorDir)) { - $output->writeln('Vector directory not found'); - return Command::FAILURE; - } - - $script = $vectorDir . '/vector_ingest.py'; - - if (!is_file($script)) { - $output->writeln('vector_ingest.py not found'); - return Command::FAILURE; - } - - // ------------------------------------------------- - // Enforce venv usage - // ------------------------------------------------- - $venvPython = $vectorDir . '/.venv/bin/python'; - - if (!is_file($venvPython)) { - $output->writeln('No Python virtual environment found.'); - $output->writeln('Run first:'); - $output->writeln(' php bin/console mto:agent:vector:install'); - return Command::FAILURE; - } - - $knowledgeDir = rtrim($this->projectDir, '/') . '/var/knowledge'; - - if (!is_dir($knowledgeDir)) { - $output->writeln('Knowledge directory not found:'); - $output->writeln($knowledgeDir); - return Command::FAILURE; - } - - $output->writeln('Building FAISS vector index…'); - $output->writeln(sprintf( - 'Vector dir: %s', - $vectorDir - )); - $output->writeln(sprintf( - 'Knowledge dir: %s', - $knowledgeDir - )); - - $cmd = sprintf( - '%s %s %s %s 2>&1', - escapeshellarg($venvPython), - escapeshellarg($script), - escapeshellarg($vectorDir), - escapeshellarg($knowledgeDir) - ); - - exec($cmd, $out, $exitCode); - - foreach ($out as $line) { - $output->writeln($line); - } - - return $exitCode === 0 - ? Command::SUCCESS - : Command::FAILURE; + return Command::SUCCESS; } } diff --git a/src/Command/VectorInstallCommand.php b/src/Command/VectorInstallCommand.php deleted file mode 100644 index 6b17ba1..0000000 --- a/src/Command/VectorInstallCommand.php +++ /dev/null @@ -1,114 +0,0 @@ -vectorDir)) { - $output->writeln('Vector directory not found'); - return Command::FAILURE; - } - - $vectorDir = rtrim($this->vectorDir, '/'); - $venvDir = $vectorDir . '/.venv'; - $venvPython = $venvDir . '/bin/python'; - - // ------------------------------------------------- - // 1) Create venv if missing - // ------------------------------------------------- - if (!is_dir($venvDir)) { - $output->writeln('Creating Python virtual environment…'); - - $cmd = sprintf( - 'python3 -m venv %s 2>&1', - escapeshellarg($venvDir) - ); - - exec($cmd, $out, $exitCode); - - foreach ($out as $line) { - $output->writeln($line); - } - - if ($exitCode !== 0 || !is_file($venvPython)) { - $output->writeln(''); - $output->writeln('Failed to create Python virtual environment.'); - $output->writeln('Ensure that python3-venv is installed on the system.'); - return Command::FAILURE; - } - } else { - $output->writeln('Using existing Python virtual environment'); - } - - // ------------------------------------------------- - // 2) Ensure pip exists inside venv - // ------------------------------------------------- - $cmd = sprintf( - '%s -m pip --version 2>&1', - escapeshellarg($venvPython) - ); - - exec($cmd, $out, $exitCode); - - if ($exitCode !== 0) { - $output->writeln(''); - $output->writeln('The existing virtual environment has no pip.'); - $output->writeln('This usually means it was created before python3-pip was installed.'); - $output->writeln('Fix:'); - $output->writeln(sprintf(' rm -rf %s', $venvDir)); - $output->writeln(' php bin/console mto:agent:vector:install'); - return Command::FAILURE; - } - - // ------------------------------------------------- - // 3) Install / update dependencies - // ------------------------------------------------- - $output->writeln('Installing vector dependencies…'); - - $cmd = sprintf( - '%s -m pip install --upgrade faiss-cpu sentence-transformers 2>&1', - escapeshellarg($venvPython) - ); - - exec($cmd, $out, $exitCode); - - foreach ($out as $line) { - $output->writeln($line); - } - - if ($exitCode !== 0) { - $output->writeln('Dependency installation failed'); - return Command::FAILURE; - } - - $output->writeln(''); - $output->writeln('Vector dependencies installed successfully'); - $output->writeln(sprintf('venv: %s', $venvDir)); - - return Command::SUCCESS; - } -} diff --git a/src/Knowledge/Retrieval/NdjsonChunkLookup.php b/src/Knowledge/Retrieval/NdjsonChunkLookup.php new file mode 100644 index 0000000..7fd8ae4 --- /dev/null +++ b/src/Knowledge/Retrieval/NdjsonChunkLookup.php @@ -0,0 +1,44 @@ +> keyed by chunk_id + */ + public function findByChunkIds(array $chunkIds): array + { + $wanted = array_fill_keys($chunkIds, true); + $found = []; + + foreach ($this->chunkManager->streamAll() as $row) { + $id = $row['chunk_id'] ?? null; + if (!is_string($id) || !isset($wanted[$id])) { + continue; + } + + $found[$id] = $row; + + // Early exit sobald alle gefunden + if (\count($found) === \count($wanted)) { + break; + } + } + + return $found; + } +} diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php new file mode 100644 index 0000000..83e15c5 --- /dev/null +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -0,0 +1,99 @@ +maxChunks; + + $terms = $this->extractTerms($prompt); + + // 1) Keyword first + $keywordChunks = $this->keywordSearch->search($terms, $limit); + if (\count($keywordChunks) >= $limit) { + return array_slice($keywordChunks, 0, $limit); + } + + // 2) Vector fallback / enrichment + $hits = $this->vectorClient->search($prompt, $this->vectorTopK); + if ($hits === []) { + return $keywordChunks; + } + + $chunkIds = []; + foreach ($hits as $hit) { + if (!isset($hit['chunk_id'], $hit['score'])) { + continue; + } + if ((float)$hit['score'] < self::VECTOR_SCORE_THRESHOLD) { + continue; + } + $chunkIds[] = (string)$hit['chunk_id']; + } + + if ($chunkIds === []) { + return $keywordChunks; + } + + $rows = $this->lookup->findByChunkIds($chunkIds); + + foreach ($chunkIds as $id) { + if (!isset($rows[$id]['text']) || !is_string($rows[$id]['text'])) { + continue; + } + $keywordChunks[] = trim($rows[$id]['text']); + } + + // dedupe + limit + $seen = []; + $out = []; + + foreach ($keywordChunks as $chunk) { + $key = mb_strtolower(preg_replace('/\s+/u', ' ', $chunk)); + if (isset($seen[$key])) { + continue; + } + $seen[$key] = true; + $out[] = $chunk; + if (\count($out) >= $limit) { + break; + } + } + + return $out; + } + + /** + * minimal term extraction (we keep your old behavior) + * + * @return string[] + */ + private function extractTerms(string $text): array + { + $text = mb_strtolower((string)preg_replace('/[^\p{L}\p{N}\s]/u', '', $text)); + + return array_values(array_filter( + explode(' ', $text), + static fn(string $w) => mb_strlen($w) > 2 + )); + } +} diff --git a/src/Knowledge/Retrieval/NdjsonKeywordSearch.php b/src/Knowledge/Retrieval/NdjsonKeywordSearch.php new file mode 100644 index 0000000..e15c91d --- /dev/null +++ b/src/Knowledge/Retrieval/NdjsonKeywordSearch.php @@ -0,0 +1,101 @@ +stopWords->getStopWords(), true); + })); + + if ($terms === []) { + return []; + } + + // bounded min-heap (score => chunkText) + $best = []; + + foreach ($this->chunkManager->streamAll() as $row) { + $text = $row['text'] ?? null; + if (!is_string($text) || $text === '') { + continue; + } + + $score = $this->scoreText($text, $terms); + if ($score <= 0) { + continue; + } + + $best[] = ['score' => $score, 'text' => trim($text)]; + + // keep array bounded to avoid memory spikes + if (\count($best) > $candidateCap) { + usort($best, fn($a, $b) => $b['score'] <=> $a['score']); + $best = array_slice($best, 0, $candidateCap); + } + } + + if ($best === []) { + return []; + } + + usort($best, fn($a, $b) => $b['score'] <=> $a['score']); + + $out = []; + $seen = []; + + foreach ($best as $row) { + $key = mb_strtolower(preg_replace('/\s+/u', ' ', $row['text'])); + if (isset($seen[$key])) { + continue; + } + $seen[$key] = true; + $out[] = $row['text']; + + if (\count($out) >= $limit) { + break; + } + } + + return $out; + } + + /** + * Simple scoring: count matches, weight long terms slightly. + */ + private function scoreText(string $text, array $terms): int + { + $content = mb_strtolower($text); + $score = 0; + + foreach ($terms as $term) { + if ($term === '') { + continue; + } + if (str_contains($content, $term)) { + $score += (mb_strlen($term) >= 10) ? 2 : 1; + } + } + + return $score; + } +} diff --git a/src/Knowledge/VectorSearchChunked.php b/src/Knowledge/VectorSearchChunked.php deleted file mode 100644 index a94a956..0000000 --- a/src/Knowledge/VectorSearchChunked.php +++ /dev/null @@ -1,121 +0,0 @@ -dataDir = $this->projectDir . '/' . $this->dataDir; - } - /** - * Returns concatenated relevant chunks as plain text. - * - * @param string $prompt - * @return string - */ - public function searchAsText(string $prompt): string - { - - if (!is_dir($this->dataDir)) { - return ''; - } - - $promptLower = mb_strtolower($prompt); - $keywords = $this->extractKeywords($promptLower); - - if ($keywords === []) { - return ''; - } - - $matches = []; - - foreach (glob($this->dataDir . '/*.txt') as $file) { - $content = file_get_contents($file); - if ($content === false) { - continue; - } - - $contentLower = mb_strtolower($content); - - if ($this->matchesKeywords($contentLower, $keywords)) { - $matches[] = trim($content); - } - - if (count($matches) >= $this->maxChunks) { - break; - } - } - - return implode("\n\n", $matches); - } - - /** - * Extracts simple keywords from the prompt. - * - * This is a lightweight heuristic replacement for - * full vector or embedding-based search. - */ - private function extractKeywords(string $prompt): array - { - $words = preg_split('/\W+/u', $prompt, -1, PREG_SPLIT_NO_EMPTY); - if ($words === false) { - return []; - } - - $keywords = []; - foreach ($words as $word) { - if (mb_strlen($word) >= 4) { - $keywords[] = $word; - } - } - - return array_values(array_unique($keywords)); - } - - /** - * Checks whether the content matches at least one keyword. - */ - private function matchesKeywords(string $content, array $keywords): bool - { - foreach ($keywords as $keyword) { - if (str_contains($content, $keyword)) { - return true; - } - } - - return false; - } -}