new version ndjson
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -5,6 +5,8 @@
|
|||||||
/src/Vector/vector.index
|
/src/Vector/vector.index
|
||||||
/src/Vector/vector_meta.json
|
/src/Vector/vector_meta.json
|
||||||
/var/locks
|
/var/locks
|
||||||
|
/var/agent-history
|
||||||
|
/var/cache
|
||||||
.env.local
|
.env.local
|
||||||
test.*
|
test.*
|
||||||
# ---> Symfony
|
# ---> Symfony
|
||||||
|
|||||||
@@ -1,130 +1,118 @@
|
|||||||
# This file is the entry point to configure your own services.
|
# ------------------------------------------------------------
|
||||||
# Files in the packages/ subdirectory configure your dependencies.
|
# Parameters
|
||||||
#
|
# ------------------------------------------------------------
|
||||||
# https://symfony.com/doc/current/best_practices.html
|
|
||||||
|
|
||||||
parameters:
|
parameters:
|
||||||
mto.index.chunk_size: 800
|
mto.index.chunk_size: 800
|
||||||
mto.index.chunk_overlap: 100
|
mto.index.chunk_overlap: 100
|
||||||
mto.index.embedding_model: 'nomic-embed-text'
|
mto.index.embedding_model: 'nomic-embed-text'
|
||||||
mto.index.embedding_dimension: 768
|
mto.index.embedding_dimension: 768
|
||||||
mto.index.scoring_version: 1
|
mto.index.scoring_version: 1
|
||||||
mto.vector.python_bin: '/var/www/html/src/Vector/.venv/bin/python'
|
|
||||||
mto.vector.ingest_script: '/src/Vector/vector_ingest.py'
|
mto.vector.python_bin: '/var/www/html/src/Vector/.venv/bin/python'
|
||||||
mto.vector.timeout: 600
|
mto.vector.ingest_script: '/src/Vector/vector_ingest.py'
|
||||||
|
mto.vector.timeout: 600
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
# Services
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
services:
|
services:
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# Default service configuration
|
# Default service configuration
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
_defaults:
|
_defaults:
|
||||||
autowire: true
|
autowire: true
|
||||||
autoconfigure: true
|
autoconfigure: true
|
||||||
|
|
||||||
# Bind the agent-specific Monolog channel explicitly
|
bind:
|
||||||
bind:
|
Psr\Log\LoggerInterface $agentLogger: '@monolog.logger.agent'
|
||||||
Psr\Log\LoggerInterface $agentLogger: '@monolog.logger.agent'
|
string $projectDir: '%kernel.project_dir%'
|
||||||
string $projectDir: '%kernel.project_dir%'
|
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# Automatically register all services in src/
|
# Automatically register all services in src/
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
App\:
|
App\:
|
||||||
resource: '../src/'
|
resource: '../src/'
|
||||||
exclude:
|
exclude:
|
||||||
- '../src/DependencyInjection/'
|
- '../src/DependencyInjection/'
|
||||||
- '../src/Entity/'
|
- '../src/Entity/'
|
||||||
- '../src/Kernel.php'
|
- '../src/Kernel.php'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
App\Controller\:
|
||||||
# AI Agent – Infrastructure
|
resource: '../src/Controller/'
|
||||||
# ------------------------------------------------------------
|
tags: ['controller.service_arguments']
|
||||||
App\Infrastructure\OllamaClient:
|
|
||||||
arguments:
|
|
||||||
$apiUrl: '%env(AI_LLM_API_URL)%'
|
|
||||||
$model: '%env(AI_LLM_MODEL)%'
|
|
||||||
$timeoutSeconds: '%env(int:AI_LLM_TIMEOUT)%'
|
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# AI Agent – Context & state
|
# AI Agent – Infrastructure
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
App\Context\ContextService:
|
App\Infrastructure\OllamaClient:
|
||||||
arguments:
|
arguments:
|
||||||
$historyDir: '%env(AI_HISTORY_DIR)%'
|
$apiUrl: '%env(AI_LLM_API_URL)%'
|
||||||
$projectDir: '%kernel.project_dir%'
|
$model: '%env(AI_LLM_MODEL)%'
|
||||||
|
$timeoutSeconds: '%env(int:AI_LLM_TIMEOUT)%'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# AI Agent – Debug & logging flags
|
# AI Agent – Context & Runner
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
App\Agent\AgentRunner:
|
App\Context\ContextService:
|
||||||
arguments:
|
arguments:
|
||||||
$debug: '%env(bool:AI_DEBUG)%'
|
$historyDir: '%env(AI_HISTORY_DIR)%'
|
||||||
$logPrompt: '%env(bool:AI_LOG_PROMPT)%'
|
$projectDir: '%kernel.project_dir%'
|
||||||
$logContext: '%env(bool:AI_LOG_CONTEXT)%'
|
|
||||||
|
|
||||||
App\Controller\:
|
App\Agent\AgentRunner:
|
||||||
resource: '../src/Controller/'
|
arguments:
|
||||||
tags: [ 'controller.service_arguments' ]
|
$debug: '%env(bool:AI_DEBUG)%'
|
||||||
|
$logPrompt: '%env(bool:AI_LOG_PROMPT)%'
|
||||||
|
$logContext: '%env(bool:AI_LOG_CONTEXT)%'
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# AI Agent – Knowledge
|
# NDJSON Retrieval Stack (FINAL ARCHITECTURE)
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
App\Knowledge\Retrieval\ChunkKeywordRetriever:
|
|
||||||
arguments:
|
|
||||||
$chunksDir: '%kernel.project_dir%/var/knowledge/chunks'
|
|
||||||
|
|
||||||
App\Knowledge\Retrieval\CachedRetriever:
|
App\Knowledge\Retrieval\NdjsonChunkLookup: ~
|
||||||
arguments:
|
|
||||||
$inner: '@App\Knowledge\Retrieval\ChunkKeywordRetriever'
|
|
||||||
$cache: '@cache.app'
|
|
||||||
$ttlSeconds: 600
|
|
||||||
|
|
||||||
App\Knowledge\Retrieval\RetrieverInterface:
|
App\Knowledge\Retrieval\NdjsonKeywordSearch: ~
|
||||||
alias: App\Knowledge\Retrieval\CachedRetriever
|
|
||||||
|
|
||||||
App\Knowledge\Ingest\ChunkWriter:
|
App\Knowledge\Retrieval\NdjsonHybridRetriever:
|
||||||
arguments:
|
arguments:
|
||||||
$chunksDir: '%kernel.project_dir%/var/knowledge/chunks'
|
$maxChunks: 3
|
||||||
$manifestPath: '%kernel.project_dir%/var/knowledge/manifest.json'
|
$vectorTopK: 5
|
||||||
|
|
||||||
App\Knowledge\Ingest\ChunkIndexWriter:
|
App\Knowledge\Retrieval\CachedRetriever:
|
||||||
arguments:
|
arguments:
|
||||||
$indexPath: '%kernel.project_dir%/var/knowledge/index.json'
|
$inner: '@App\Knowledge\Retrieval\NdjsonHybridRetriever'
|
||||||
|
$cache: '@cache.app'
|
||||||
|
$ttlSeconds: 600
|
||||||
|
|
||||||
App\Knowledge\Retrieval\ChunkIndexLoader:
|
App\Knowledge\Retrieval\RetrieverInterface:
|
||||||
arguments:
|
alias: App\Knowledge\Retrieval\CachedRetriever
|
||||||
$indexPath: '%kernel.project_dir%/var/knowledge/index.json'
|
|
||||||
|
|
||||||
App\Command\KnowledgeIngestCommand:
|
# ------------------------------------------------------------
|
||||||
arguments:
|
# Vector Search (FAISS NDJSON-based)
|
||||||
$uploadsDir: '%kernel.project_dir%/var/knowledge/uploads'
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
App\Vector\VectorSearchClient:
|
App\Vector\VectorSearchClient:
|
||||||
arguments:
|
arguments:
|
||||||
$vectorDir: '%kernel.project_dir%/src/Vector'
|
$vectorDir: '%kernel.project_dir%/var/knowledge'
|
||||||
|
|
||||||
App\Command\VectorIngestCommand:
|
App\Vector\VectorIndexBuilder:
|
||||||
arguments:
|
arguments:
|
||||||
$vectorDir: '%kernel.project_dir%/src/Vector'
|
$pythonBin: '%mto.vector.python_bin%'
|
||||||
$projectDir: '%kernel.project_dir%'
|
$relativeScriptPath: '%mto.vector.ingest_script%'
|
||||||
|
$timeoutSeconds: '%mto.vector.timeout%'
|
||||||
|
|
||||||
App\Command\VectorInstallCommand:
|
# ------------------------------------------------------------
|
||||||
arguments:
|
# Index Configuration (Guardrails)
|
||||||
$vectorDir: '%kernel.project_dir%/src/Vector'
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
App\Index\IndexConfiguration:
|
App\Index\IndexConfiguration:
|
||||||
arguments:
|
arguments:
|
||||||
$chunkSize: '%mto.index.chunk_size%'
|
$chunkSize: '%mto.index.chunk_size%'
|
||||||
$chunkOverlap: '%mto.index.chunk_overlap%'
|
$chunkOverlap: '%mto.index.chunk_overlap%'
|
||||||
$embeddingModel: '%mto.index.embedding_model%'
|
$embeddingModel: '%mto.index.embedding_model%'
|
||||||
$embeddingDimension: '%mto.index.embedding_dimension%'
|
$embeddingDimension: '%mto.index.embedding_dimension%'
|
||||||
$scoringVersion: '%mto.index.scoring_version%'
|
$scoringVersion: '%mto.index.scoring_version%'
|
||||||
$indexFormat: 'ndjson'
|
$indexFormat: 'ndjson'
|
||||||
$vectorBackend: 'faiss'
|
$vectorBackend: 'faiss'
|
||||||
|
|
||||||
App\Vector\VectorIndexBuilder:
|
|
||||||
arguments:
|
|
||||||
$pythonBin: '%mto.vector.python_bin%'
|
|
||||||
$relativeScriptPath: '%mto.vector.ingest_script%'
|
|
||||||
$timeoutSeconds: '%mto.vector.timeout%'
|
|
||||||
|
|||||||
@@ -113,12 +113,16 @@ document.addEventListener('DOMContentLoaded', () => {
|
|||||||
const text = dataLines.join('\n');
|
const text = dataLines.join('\n');
|
||||||
|
|
||||||
if (text === '[DONE]') {
|
if (text === '[DONE]') {
|
||||||
// Finales Rendering mit Normalisierung
|
|
||||||
|
// 🔥 Final render flush
|
||||||
if (renderTimer) {
|
if (renderTimer) {
|
||||||
clearTimeout(renderTimer);
|
clearTimeout(renderTimer);
|
||||||
renderTimer = null;
|
renderTimer = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bubble.innerHTML = renderMarkdown(raw);
|
||||||
chatEl.scrollTop = chatEl.scrollHeight;
|
chatEl.scrollTop = chatEl.scrollHeight;
|
||||||
|
|
||||||
abort = true;
|
abort = true;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,28 +1,25 @@
|
|||||||
<?php
|
<?php
|
||||||
// src/Command/KnowledgeIngestCommand.php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
declare(strict_types=1);
|
||||||
|
|
||||||
namespace App\Command;
|
namespace App\Command;
|
||||||
|
|
||||||
use App\Knowledge\Ingest\KnowledgeIngestService;
|
use App\Entity\DocumentVersion;
|
||||||
|
use App\Entity\User;
|
||||||
|
use App\Ingest\IngestFlow;
|
||||||
|
use Doctrine\ORM\EntityManagerInterface;
|
||||||
use Symfony\Component\Console\Attribute\AsCommand;
|
use Symfony\Component\Console\Attribute\AsCommand;
|
||||||
use Symfony\Component\Console\Command\Command;
|
use Symfony\Component\Console\Command\Command;
|
||||||
use Symfony\Component\Console\Input\InputArgument;
|
use Symfony\Component\Console\Input\InputArgument;
|
||||||
use Symfony\Component\Console\Input\InputInterface;
|
use Symfony\Component\Console\Input\InputInterface;
|
||||||
use Symfony\Component\Console\Input\InputOption;
|
|
||||||
use Symfony\Component\Console\Output\OutputInterface;
|
use Symfony\Component\Console\Output\OutputInterface;
|
||||||
use Symfony\Component\Finder\Finder;
|
|
||||||
|
|
||||||
#[AsCommand(
|
#[AsCommand(name: 'mto:agent:ingest:version')]
|
||||||
name: 'mto:agent:knowledge:ingest',
|
class KnowledgeIngestCommand extends Command
|
||||||
description: 'Ingest one or multiple markdown/text documents into file-based knowledge chunks'
|
|
||||||
)]
|
|
||||||
final class KnowledgeIngestCommand extends Command
|
|
||||||
{
|
{
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly KnowledgeIngestService $ingest,
|
private readonly IngestFlow $ingestFlow,
|
||||||
private readonly string $uploadsDir,
|
private readonly EntityManagerInterface $em,
|
||||||
) {
|
) {
|
||||||
parent::__construct();
|
parent::__construct();
|
||||||
}
|
}
|
||||||
@@ -30,86 +27,28 @@ final class KnowledgeIngestCommand extends Command
|
|||||||
protected function configure(): void
|
protected function configure(): void
|
||||||
{
|
{
|
||||||
$this
|
$this
|
||||||
->addArgument(
|
->addArgument('versionId', InputArgument::REQUIRED, 'UUID of DocumentVersion')
|
||||||
'file',
|
->addArgument('userId', InputArgument::REQUIRED, 'UUID of user triggering ingest');
|
||||||
InputArgument::OPTIONAL,
|
|
||||||
'Path to a single .txt/.md file'
|
|
||||||
)
|
|
||||||
->addOption(
|
|
||||||
'all',
|
|
||||||
null,
|
|
||||||
InputOption::VALUE_NONE,
|
|
||||||
'Ingest all .md files from the uploads directory'
|
|
||||||
)
|
|
||||||
->addOption(
|
|
||||||
'optimize',
|
|
||||||
'o',
|
|
||||||
InputOption::VALUE_NONE,
|
|
||||||
'Optimize chunks for retrieval quality'
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected function execute(InputInterface $input, OutputInterface $output): int
|
protected function execute(InputInterface $input, OutputInterface $output): int
|
||||||
{
|
{
|
||||||
$files = [];
|
$versionId = $input->getArgument('versionId');
|
||||||
$optimize = (bool) $input->getOption('optimize');
|
$userId = $input->getArgument('userId');
|
||||||
|
|
||||||
if ($input->getOption('all')) {
|
$version = $this->em->getRepository(DocumentVersion::class)->find($versionId);
|
||||||
if (!is_dir($this->uploadsDir)) {
|
$user = $this->em->getRepository(User::class)->find($userId);
|
||||||
$output->writeln('<error>❌ uploads directory not found</error>');
|
|
||||||
return Command::FAILURE;
|
|
||||||
}
|
|
||||||
|
|
||||||
$finder = new Finder();
|
if (!$version || !$user) {
|
||||||
$finder
|
$output->writeln('<error>Version or User not found.</error>');
|
||||||
->files()
|
return Command::FAILURE;
|
||||||
->in($this->uploadsDir)
|
|
||||||
->name('*.md');
|
|
||||||
|
|
||||||
if (!$finder->hasResults()) {
|
|
||||||
$output->writeln('<comment>ℹ️ No .md files found in uploads/</comment>');
|
|
||||||
return Command::SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach ($finder as $file) {
|
|
||||||
$files[] = $file->getRealPath();
|
|
||||||
}
|
|
||||||
|
|
||||||
$output->writeln(sprintf(
|
|
||||||
'📂 Ingesting %d markdown files from uploads (%s)',
|
|
||||||
count($files),
|
|
||||||
$optimize ? 'optimized' : 'standard'
|
|
||||||
));
|
|
||||||
} else {
|
|
||||||
$file = $input->getArgument('file');
|
|
||||||
|
|
||||||
if (!$file) {
|
|
||||||
$output->writeln('<error>❌ Either provide a file or use --all</error>');
|
|
||||||
return Command::FAILURE;
|
|
||||||
}
|
|
||||||
|
|
||||||
$files[] = (string) $file;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$totalWritten = 0;
|
$output->writeln('Starting ingest...');
|
||||||
|
|
||||||
foreach ($files as $filePath) {
|
$this->ingestFlow->ingestDocumentVersion($version, $user);
|
||||||
$output->writeln('➡️ Ingesting: ' . $filePath);
|
|
||||||
|
|
||||||
$written = $this->ingest->ingestFile(
|
$output->writeln('<info>Ingest completed.</info>');
|
||||||
$filePath,
|
|
||||||
optimize: $optimize
|
|
||||||
);
|
|
||||||
|
|
||||||
$totalWritten += count($written);
|
|
||||||
|
|
||||||
foreach ($written as $chunk) {
|
|
||||||
$output->writeln(' - ' . $chunk);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$output->writeln('');
|
|
||||||
$output->writeln('✅ Total written chunks: ' . $totalWritten);
|
|
||||||
|
|
||||||
return Command::SUCCESS;
|
return Command::SUCCESS;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,86 +4,27 @@ declare(strict_types=1);
|
|||||||
|
|
||||||
namespace App\Command;
|
namespace App\Command;
|
||||||
|
|
||||||
|
use App\Vector\VectorIndexBuilder;
|
||||||
use Symfony\Component\Console\Attribute\AsCommand;
|
use Symfony\Component\Console\Attribute\AsCommand;
|
||||||
use Symfony\Component\Console\Command\Command;
|
use Symfony\Component\Console\Command\Command;
|
||||||
use Symfony\Component\Console\Input\InputInterface;
|
use Symfony\Component\Console\Input\InputInterface;
|
||||||
use Symfony\Component\Console\Output\OutputInterface;
|
use Symfony\Component\Console\Output\OutputInterface;
|
||||||
|
|
||||||
#[AsCommand(
|
#[AsCommand(name: 'mto:agent:vector:rebuild')]
|
||||||
name: 'mto:agent:vector:ingest',
|
class VectorIngestCommand extends Command
|
||||||
description: 'Builds the FAISS vector index from index.json'
|
|
||||||
)]
|
|
||||||
final class VectorIngestCommand extends Command
|
|
||||||
{
|
{
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly string $vectorDir,
|
private readonly VectorIndexBuilder $builder
|
||||||
private readonly string $projectDir
|
|
||||||
) {
|
) {
|
||||||
parent::__construct();
|
parent::__construct();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected function execute(InputInterface $input, OutputInterface $output): int
|
protected function execute(InputInterface $input, OutputInterface $output): int
|
||||||
{
|
{
|
||||||
$vectorDir = rtrim($this->vectorDir, '/');
|
$output->writeln('Rebuilding vector index...');
|
||||||
|
$this->builder->rebuildFromNdjson();
|
||||||
|
$output->writeln('Done.');
|
||||||
|
|
||||||
if (!is_dir($vectorDir)) {
|
return Command::SUCCESS;
|
||||||
$output->writeln('<error>Vector directory not found</error>');
|
|
||||||
return Command::FAILURE;
|
|
||||||
}
|
|
||||||
|
|
||||||
$script = $vectorDir . '/vector_ingest.py';
|
|
||||||
|
|
||||||
if (!is_file($script)) {
|
|
||||||
$output->writeln('<error>vector_ingest.py not found</error>');
|
|
||||||
return Command::FAILURE;
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------
|
|
||||||
// Enforce venv usage
|
|
||||||
// -------------------------------------------------
|
|
||||||
$venvPython = $vectorDir . '/.venv/bin/python';
|
|
||||||
|
|
||||||
if (!is_file($venvPython)) {
|
|
||||||
$output->writeln('<error>No Python virtual environment found.</error>');
|
|
||||||
$output->writeln('<comment>Run first:</comment>');
|
|
||||||
$output->writeln('<info> php bin/console mto:agent:vector:install</info>');
|
|
||||||
return Command::FAILURE;
|
|
||||||
}
|
|
||||||
|
|
||||||
$knowledgeDir = rtrim($this->projectDir, '/') . '/var/knowledge';
|
|
||||||
|
|
||||||
if (!is_dir($knowledgeDir)) {
|
|
||||||
$output->writeln('<error>Knowledge directory not found:</error>');
|
|
||||||
$output->writeln($knowledgeDir);
|
|
||||||
return Command::FAILURE;
|
|
||||||
}
|
|
||||||
|
|
||||||
$output->writeln('<info>Building FAISS vector index…</info>');
|
|
||||||
$output->writeln(sprintf(
|
|
||||||
'<comment>Vector dir:</comment> %s',
|
|
||||||
$vectorDir
|
|
||||||
));
|
|
||||||
$output->writeln(sprintf(
|
|
||||||
'<comment>Knowledge dir:</comment> %s',
|
|
||||||
$knowledgeDir
|
|
||||||
));
|
|
||||||
|
|
||||||
$cmd = sprintf(
|
|
||||||
'%s %s %s %s 2>&1',
|
|
||||||
escapeshellarg($venvPython),
|
|
||||||
escapeshellarg($script),
|
|
||||||
escapeshellarg($vectorDir),
|
|
||||||
escapeshellarg($knowledgeDir)
|
|
||||||
);
|
|
||||||
|
|
||||||
exec($cmd, $out, $exitCode);
|
|
||||||
|
|
||||||
foreach ($out as $line) {
|
|
||||||
$output->writeln($line);
|
|
||||||
}
|
|
||||||
|
|
||||||
return $exitCode === 0
|
|
||||||
? Command::SUCCESS
|
|
||||||
: Command::FAILURE;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,114 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\Command;
|
|
||||||
|
|
||||||
use Symfony\Component\Console\Attribute\AsCommand;
|
|
||||||
use Symfony\Component\Console\Command\Command;
|
|
||||||
use Symfony\Component\Console\Input\InputInterface;
|
|
||||||
use Symfony\Component\Console\Output\OutputInterface;
|
|
||||||
|
|
||||||
/**
 * Creates the Python virtual environment for the vector tooling and installs
 * its pip dependencies.
 *
 * System requirements (once per environment):
 *   sudo apt update
 *   sudo apt install -y python3-venv python3-pip
 */
#[AsCommand(
    name: 'mto:agent:vector:install',
    description: 'Creates a Python venv and installs vector dependencies'
)]
final class VectorInstallCommand extends Command
{
    /**
     * @param string $vectorDir Directory in which the `.venv` folder is created.
     */
    public function __construct(
        private readonly string $vectorDir
    ) {
        parent::__construct();
    }

    /**
     * Runs the three installation steps: create the venv if it is missing,
     * verify that pip is available inside it, then install/upgrade the
     * dependencies.
     *
     * @return int Command::SUCCESS when all steps succeed, Command::FAILURE otherwise.
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        if (!is_dir($this->vectorDir)) {
            $output->writeln('<error>Vector directory not found</error>');

            return Command::FAILURE;
        }

        $vectorDir = rtrim($this->vectorDir, '/');
        $venvDir = $vectorDir . '/.venv';
        $venvPython = $venvDir . '/bin/python';

        // -------------------------------------------------
        // 1) Create venv if missing
        // -------------------------------------------------
        if (!is_dir($venvDir)) {
            $output->writeln('<info>Creating Python virtual environment…</info>');

            [$exitCode, $lines] = $this->run(sprintf(
                'python3 -m venv %s 2>&1',
                escapeshellarg($venvDir)
            ));
            $output->writeln($lines);

            if ($exitCode !== 0 || !is_file($venvPython)) {
                $output->writeln('');
                $output->writeln('<error>Failed to create Python virtual environment.</error>');
                $output->writeln('<comment>Ensure that python3-venv is installed on the system.</comment>');

                return Command::FAILURE;
            }
        } else {
            $output->writeln('<info>Using existing Python virtual environment</info>');
        }

        // -------------------------------------------------
        // 2) Ensure pip exists inside venv
        // -------------------------------------------------
        // Only the exit code matters here; the version banner is not shown.
        [$exitCode] = $this->run(sprintf(
            '%s -m pip --version 2>&1',
            escapeshellarg($venvPython)
        ));

        if ($exitCode !== 0) {
            $output->writeln('');
            $output->writeln('<error>The existing virtual environment has no pip.</error>');
            $output->writeln('<comment>This usually means it was created before python3-pip was installed.</comment>');
            $output->writeln('<comment>Fix:</comment>');
            $output->writeln(sprintf('<info> rm -rf %s</info>', $venvDir));
            $output->writeln('<info> php bin/console mto:agent:vector:install</info>');

            return Command::FAILURE;
        }

        // -------------------------------------------------
        // 3) Install / update dependencies
        // -------------------------------------------------
        $output->writeln('<info>Installing vector dependencies…</info>');

        [$exitCode, $lines] = $this->run(sprintf(
            '%s -m pip install --upgrade faiss-cpu sentence-transformers 2>&1',
            escapeshellarg($venvPython)
        ));
        $output->writeln($lines);

        if ($exitCode !== 0) {
            $output->writeln('<error>Dependency installation failed</error>');

            return Command::FAILURE;
        }

        $output->writeln('');
        $output->writeln('<info>Vector dependencies installed successfully</info>');
        $output->writeln(sprintf('<comment>venv:</comment> %s', $venvDir));

        return Command::SUCCESS;
    }

    /**
     * Executes a shell command and returns its exit code and output lines.
     *
     * A fresh array is used for every call: exec() appends to a non-empty
     * output array, so reusing one variable across commands would replay the
     * output of earlier commands.
     *
     * @return array{0: int, 1: list<string>} exit code, then captured output lines
     */
    private function run(string $command): array
    {
        $lines = [];
        $exitCode = 0;
        exec($command, $lines, $exitCode);

        return [$exitCode, $lines];
    }
}
|
|
||||||
44
src/Knowledge/Retrieval/NdjsonChunkLookup.php
Normal file
44
src/Knowledge/Retrieval/NdjsonChunkLookup.php
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Knowledge\Retrieval;
|
||||||
|
|
||||||
|
use App\Knowledge\ChunkManager;
|
||||||
|
use Symfony\Component\Uid\Uuid;
|
||||||
|
|
||||||
|
/**
 * Resolves chunk IDs to their full rows by scanning the rows yielded by
 * ChunkManager::streamAll().
 */
final class NdjsonChunkLookup
{
    public function __construct(
        private readonly ChunkManager $chunkManager,
    ) {
    }

    /**
     * Collects the rows whose `chunk_id` is among the requested IDs.
     *
     * Scanning stops as soon as every distinct requested ID has been found.
     * IDs that never appear in the stream are simply absent from the result.
     *
     * @param string[] $chunkIds RFC4122 UUID strings
     * @return array<string,array<string,mixed>> keyed by chunk_id
     */
    public function findByChunkIds(array $chunkIds): array
    {
        // Nothing requested: skip the scan. Without this guard the loop would
        // read the whole stream, because the early exit below is only
        // evaluated after a match.
        if ($chunkIds === []) {
            return [];
        }

        // Hash set for O(1) membership; duplicate IDs collapse into one key.
        $wanted = array_fill_keys($chunkIds, true);
        $wantedCount = \count($wanted);
        $found = [];

        foreach ($this->chunkManager->streamAll() as $row) {
            $id = $row['chunk_id'] ?? null;
            if (!is_string($id) || !isset($wanted[$id])) {
                continue;
            }

            $found[$id] = $row;

            // Early exit as soon as every requested ID has been found.
            if (\count($found) === $wantedCount) {
                break;
            }
        }

        return $found;
    }
}
|
||||||
99
src/Knowledge/Retrieval/NdjsonHybridRetriever.php
Normal file
99
src/Knowledge/Retrieval/NdjsonHybridRetriever.php
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Knowledge\Retrieval;
|
||||||
|
|
||||||
|
use App\Vector\VectorSearchClient;
|
||||||
|
|
||||||
|
/**
 * Hybrid retriever: keyword search first, vector search as fallback/enrichment.
 *
 * Keyword hits are returned directly when they already fill the requested
 * limit. Otherwise vector hits scoring at least VECTOR_SCORE_THRESHOLD are
 * resolved to chunk texts, appended after the keyword hits, de-duplicated and
 * capped at the limit.
 */
final class NdjsonHybridRetriever implements RetrieverInterface
{
    /** Vector hits with a score below this value are discarded. */
    private const VECTOR_SCORE_THRESHOLD = 0.65;

    /**
     * @param int $maxChunks  Default result limit when retrieve() gets none.
     * @param int $vectorTopK Number of hits requested from the vector client.
     */
    public function __construct(
        private readonly NdjsonKeywordSearch $keywordSearch,
        private readonly NdjsonChunkLookup $lookup,
        private readonly VectorSearchClient $vectorClient,
        private readonly int $maxChunks = 3,
        private readonly int $vectorTopK = 5,
    ) {
    }

    /**
     * Returns up to $limit chunk texts for the prompt, keyword hits first.
     *
     * @param string   $prompt Free-text query.
     * @param int|null $limit  Maximum number of chunks; null uses $maxChunks.
     * @return string[]
     */
    public function retrieve(string $prompt, ?int $limit = null): array
    {
        $limit ??= $this->maxChunks;

        // 1) Keyword first
        $chunks = $this->keywordSearch->search($this->extractTerms($prompt), $limit);
        if (\count($chunks) >= $limit) {
            return array_slice($chunks, 0, $limit);
        }

        // 2) Vector fallback / enrichment
        $hits = $this->vectorClient->search($prompt, $this->vectorTopK);
        if ($hits === []) {
            return $chunks;
        }

        $chunkIds = [];
        foreach ($hits as $hit) {
            // Skip hits that lack an ID or score, or that score too low.
            if (!isset($hit['chunk_id'], $hit['score'])) {
                continue;
            }
            if ((float) $hit['score'] < self::VECTOR_SCORE_THRESHOLD) {
                continue;
            }
            $chunkIds[] = (string) $hit['chunk_id'];
        }

        if ($chunkIds === []) {
            return $chunks;
        }

        // Append vector texts in hit order, after the keyword results.
        $rows = $this->lookup->findByChunkIds($chunkIds);
        foreach ($chunkIds as $id) {
            $text = $rows[$id]['text'] ?? null;
            if (is_string($text)) {
                $chunks[] = trim($text);
            }
        }

        // dedupe + limit
        $seen = [];
        $out = [];
        foreach ($chunks as $chunk) {
            $key = $this->dedupeKey($chunk);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
            $out[] = $chunk;

            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Builds the comparison key used for de-duplication: whitespace runs are
     * collapsed to one space and the text is lowercased.
     *
     * preg_replace() returns null on a PCRE error (for example invalid UTF-8
     * with the /u modifier); the raw chunk is used then, so mb_strtolower()
     * is never handed null under strict_types.
     */
    private function dedupeKey(string $chunk): string
    {
        return mb_strtolower(preg_replace('/\s+/u', ' ', $chunk) ?? $chunk);
    }

    /**
     * Minimal term extraction: strips everything except letters, digits and
     * whitespace, lowercases, and keeps words longer than two characters.
     *
     * Words are split on any whitespace run, so terms separated by newlines
     * or tabs are not fused into a single term.
     *
     * @return string[]
     */
    private function extractTerms(string $text): array
    {
        $clean = mb_strtolower((string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text));
        $words = preg_split('/\s+/u', $clean, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return array_values(array_filter(
            $words,
            static fn (string $word): bool => mb_strlen($word) > 2,
        ));
    }
}
|
||||||
101
src/Knowledge/Retrieval/NdjsonKeywordSearch.php
Normal file
101
src/Knowledge/Retrieval/NdjsonKeywordSearch.php
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Knowledge\Retrieval;
|
||||||
|
|
||||||
|
use App\Knowledge\ChunkManager;
|
||||||
|
use App\Knowledge\StopWords;
|
||||||
|
|
||||||
|
/**
 * Keyword scoring over the chunk rows yielded by ChunkManager::streamAll().
 */
final class NdjsonKeywordSearch
{
    public function __construct(
        private readonly ChunkManager $chunkManager,
        private readonly StopWords $stopWords,
    ) {
    }

    /**
     * Streaming keyword search over index.ndjson.
     *
     * Stop words and empty strings are removed from $terms, every row with a
     * non-empty `text` is scored, and the best distinct texts are returned.
     *
     * @param string[] $terms        (already lowercased)
     * @param int      $limit        Maximum number of chunks returned.
     * @param int      $candidateCap Maximum number of scored candidates kept
     *                               in memory while streaming.
     * @return string[] best chunks, highest score first
     */
    public function search(array $terms, int $limit = 3, int $candidateCap = 200): array
    {
        // A non-positive limit can never yield results; the collection loop
        // below would otherwise still emit one chunk before checking it.
        if ($limit < 1) {
            return [];
        }

        // Fetch the stop-word list once instead of once per term.
        $stopWords = $this->stopWords->getStopWords();
        $terms = array_values(array_filter(
            $terms,
            static fn (string $term): bool => $term !== '' && !\in_array($term, $stopWords, true),
        ));

        if ($terms === []) {
            return [];
        }

        $byScoreDesc = static fn (array $a, array $b): int => $b['score'] <=> $a['score'];

        // Candidate list of ['score' => int, 'text' => string], truncated to
        // the top $candidateCap entries whenever it grows past the cap.
        $best = [];

        foreach ($this->chunkManager->streamAll() as $row) {
            $text = $row['text'] ?? null;
            if (!is_string($text) || $text === '') {
                continue;
            }

            $score = $this->scoreText($text, $terms);
            if ($score <= 0) {
                continue;
            }

            $best[] = ['score' => $score, 'text' => trim($text)];

            // keep array bounded to avoid memory spikes
            if (\count($best) > $candidateCap) {
                usort($best, $byScoreDesc);
                $best = array_slice($best, 0, $candidateCap);
            }
        }

        if ($best === []) {
            return [];
        }

        usort($best, $byScoreDesc);

        $out = [];
        $seen = [];

        foreach ($best as $candidate) {
            // De-duplicate on whitespace-collapsed, lowercased text. Fall back
            // to the raw text when preg_replace() returns null on a PCRE error.
            $key = mb_strtolower(preg_replace('/\s+/u', ' ', $candidate['text']) ?? $candidate['text']);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
            $out[] = $candidate['text'];

            if (\count($out) >= $limit) {
                break;
            }
        }

        return $out;
    }

    /**
     * Simple scoring: one point per term contained in the text, two points
     * for terms of ten or more characters.
     *
     * @param string[] $terms
     */
    private function scoreText(string $text, array $terms): int
    {
        $content = mb_strtolower($text);
        $score = 0;

        foreach ($terms as $term) {
            if ($term === '') {
                continue;
            }
            if (str_contains($content, $term)) {
                $score += (mb_strlen($term) >= 10) ? 2 : 1;
            }
        }

        return $score;
    }
}
|
||||||
@@ -1,121 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\Knowledge;
|
|
||||||
|
|
||||||
use Psr\Log\LoggerInterface;
|
|
||||||
|
|
||||||
/**
 * VectorSearchChunked
 *
 * Chunk-based retrieval service for long-form knowledge documents.
 * This is a lightweight, deterministic runtime reader for
 * precomputed knowledge chunks.
 *
 * Design principles:
 * - No runtime indexing
 * - No ML dependencies
 * - Deterministic and fast
 * - Hard limits to protect prompt size
 *
 * This service is intentionally simple and can later be replaced
 * by a real vector database without changing the AgentRunner.
 */
final class VectorSearchChunked
{
    /**
     * Directory containing chunked knowledge files. Starts as a path relative
     * to the project directory and is made absolute in the constructor.
     */
    private string $dataDir = 'var/knowledge/chunks';

    /**
     * Maximum number of chunks to return.
     */
    private int $maxChunks = 3;

    /**
     * @param string $projectDir Project root that the chunk directory is resolved against.
     */
    public function __construct(
        private string $projectDir,
    ) {
        $this->dataDir = $this->projectDir . '/' . $this->dataDir;
    }

    /**
     * Returns concatenated relevant chunks as plain text.
     *
     * A chunk is relevant when it contains at least one keyword of the
     * prompt. At most $maxChunks chunks are returned, trimmed and joined by a
     * blank line. An empty string is returned when the chunk directory is
     * missing, the prompt yields no keywords, or nothing matches.
     *
     * @param string $prompt
     * @return string
     */
    public function searchAsText(string $prompt): string
    {
        if (!is_dir($this->dataDir)) {
            return '';
        }

        $keywords = $this->extractKeywords(mb_strtolower($prompt));
        if ($keywords === []) {
            return '';
        }

        // glob() returns false on error; treat that as "no chunk files"
        // instead of handing false to foreach.
        $files = glob($this->dataDir . '/*.txt') ?: [];

        $matches = [];
        foreach ($files as $file) {
            $content = file_get_contents($file);
            if ($content === false) {
                continue;
            }

            if ($this->matchesKeywords(mb_strtolower($content), $keywords)) {
                $matches[] = trim($content);
            }

            if (\count($matches) >= $this->maxChunks) {
                break;
            }
        }

        return implode("\n\n", $matches);
    }

    /**
     * Extracts simple keywords from the prompt: distinct words of at least
     * four characters.
     *
     * This is a lightweight heuristic replacement for
     * full vector or embedding-based search.
     *
     * @return string[]
     */
    private function extractKeywords(string $prompt): array
    {
        $words = preg_split('/\W+/u', $prompt, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            return [];
        }

        $keywords = array_filter(
            $words,
            static fn (string $word): bool => mb_strlen($word) >= 4,
        );

        return array_values(array_unique($keywords));
    }

    /**
     * Checks whether the content matches at least one keyword.
     *
     * @param string[] $keywords
     */
    private function matchesKeywords(string $content, array $keywords): bool
    {
        foreach ($keywords as $keyword) {
            if (str_contains($content, $keyword)) {
                return true;
            }
        }

        return false;
    }
}
|
|
||||||
Reference in New Issue
Block a user