new version ndjson

This commit is contained in:
team 1
2026-02-12 11:22:56 +01:00
parent 0bb0c0b42f
commit 5a52e07edc
10 changed files with 375 additions and 492 deletions

View File

@@ -1,28 +1,25 @@
<?php
// src/Command/KnowledgeIngestCommand.php
declare(strict_types=1);
namespace App\Command;
use App\Knowledge\Ingest\KnowledgeIngestService;
use App\Entity\DocumentVersion;
use App\Entity\User;
use App\Ingest\IngestFlow;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Finder\Finder;
#[AsCommand(
name: 'mto:agent:knowledge:ingest',
description: 'Ingest one or multiple markdown/text documents into file-based knowledge chunks'
)]
final class KnowledgeIngestCommand extends Command
#[AsCommand(name: 'mto:agent:ingest:version')]
class KnowledgeIngestCommand extends Command
{
/**
 * Receives the command's collaborators as promoted, read-only constructor properties.
 *
 * NOTE(review): this listing shows removed and added diff lines together without +/- markers.
 * The KnowledgeIngestService/$uploadsDir pair and the IngestFlow/EntityManagerInterface pair
 * presumably belong to different revisions of the signature — confirm against the repository.
 */
public function __construct(
private readonly KnowledgeIngestService $ingest,
private readonly string $uploadsDir,
private readonly IngestFlow $ingestFlow,
private readonly EntityManagerInterface $em,
) {
parent::__construct();
}
@@ -30,86 +27,28 @@ final class KnowledgeIngestCommand extends Command
/**
 * Declares the command-line inputs of the command.
 *
 * Inputs visible in this listing: an optional "file" argument, the "--all" and
 * "--optimize" (-o) flags, and the required "versionId" and "userId" arguments.
 *
 * NOTE(review): removed and added diff lines appear together without +/- markers; the
 * fluent chain is terminated by ");" before the two addArgument() calls for
 * versionId/userId, so these presumably belong to a different revision than the
 * file/all/optimize inputs — confirm against the repository.
 */
protected function configure(): void
{
$this
->addArgument(
'file',
InputArgument::OPTIONAL,
'Path to a single .txt/.md file'
)
->addOption(
'all',
null,
InputOption::VALUE_NONE,
'Ingest all .md files from the uploads directory'
)
->addOption(
'optimize',
'o',
InputOption::VALUE_NONE,
'Optimize chunks for retrieval quality'
);
->addArgument('versionId', InputArgument::REQUIRED, 'UUID of DocumentVersion')
->addArgument('userId', InputArgument::REQUIRED, 'UUID of user triggering ingest');
}
/**
 * Runs the ingest and reports progress on the console.
 *
 * NOTE(review): removed and added diff lines appear together here without +/- markers, so
 * two variants of the body are mixed:
 *  - one collects file paths (every *.md file under the uploads directory when --all is set,
 *    otherwise the "file" argument), passes each path to KnowledgeIngestService::ingestFile()
 *    and prints every returned entry followed by a total count;
 *  - the other looks up a DocumentVersion and a User by the given ids, fails when either is
 *    missing, and hands both to IngestFlow::ingestDocumentVersion().
 * Which lines belong to which revision should be confirmed against the repository.
 *
 * @return int Command::SUCCESS or Command::FAILURE.
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
$files = [];
$optimize = (bool) $input->getOption('optimize');
$versionId = $input->getArgument('versionId');
$userId = $input->getArgument('userId');
if ($input->getOption('all')) {
if (!is_dir($this->uploadsDir)) {
$output->writeln('<error>❌ uploads directory not found</error>');
return Command::FAILURE;
}
$version = $this->em->getRepository(DocumentVersion::class)->find($versionId);
$user = $this->em->getRepository(User::class)->find($userId);
$finder = new Finder();
$finder
->files()
->in($this->uploadsDir)
->name('*.md');
// An empty uploads directory is reported but not treated as a failure.
if (!$finder->hasResults()) {
$output->writeln('<comment> No .md files found in uploads/</comment>');
return Command::SUCCESS;
}
foreach ($finder as $file) {
$files[] = $file->getRealPath();
}
$output->writeln(sprintf(
'📂 Ingesting %d markdown files from uploads (%s)',
count($files),
$optimize ? 'optimized' : 'standard'
));
} else {
$file = $input->getArgument('file');
if (!$file) {
$output->writeln('<error>❌ Either provide a file or use --all</error>');
return Command::FAILURE;
}
$files[] = (string) $file;
// Abort when either repository lookup produced a falsy result.
if (!$version || !$user) {
$output->writeln('<error>Version or User not found.</error>');
return Command::FAILURE;
}
$totalWritten = 0;
$output->writeln('Starting ingest...');
foreach ($files as $filePath) {
$output->writeln('➡️ Ingesting: ' . $filePath);
$this->ingestFlow->ingestDocumentVersion($version, $user);
$written = $this->ingest->ingestFile(
$filePath,
optimize: $optimize
);
$totalWritten += count($written);
foreach ($written as $chunk) {
$output->writeln(' - ' . $chunk);
}
}
$output->writeln('');
$output->writeln('✅ Total written chunks: ' . $totalWritten);
$output->writeln('<info>Ingest completed.</info>');
return Command::SUCCESS;
}

View File

@@ -4,86 +4,27 @@ declare(strict_types=1);
namespace App\Command;
use App\Vector\VectorIndexBuilder;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
/**
 * Console command that (re)builds the vector index.
 *
 * NOTE(review): removed and added diff lines appear together in this listing without +/-
 * markers. One variant runs vector_ingest.py with the interpreter at
 * "<vectorDir>/.venv/bin/python", passing the vector directory and "<projectDir>/var/knowledge";
 * the other delegates to VectorIndexBuilder::rebuildFromNdjson(). Confirm against the
 * repository which lines survive the commit.
 */
#[AsCommand(
name: 'mto:agent:vector:ingest',
description: 'Builds the FAISS vector index from index.json'
)]
final class VectorIngestCommand extends Command
#[AsCommand(name: 'mto:agent:vector:rebuild')]
class VectorIngestCommand extends Command
{
// Collaborators are injected as promoted, read-only constructor properties.
public function __construct(
private readonly string $vectorDir,
private readonly string $projectDir
private readonly VectorIndexBuilder $builder
) {
parent::__construct();
}
/**
 * Rebuilds the index and reports the outcome on the console.
 *
 * @return int Command::SUCCESS or Command::FAILURE.
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
$vectorDir = rtrim($this->vectorDir, '/');
$output->writeln('Rebuilding vector index...');
$this->builder->rebuildFromNdjson();
$output->writeln('Done.');
if (!is_dir($vectorDir)) {
$output->writeln('<error>Vector directory not found</error>');
return Command::FAILURE;
}
$script = $vectorDir . '/vector_ingest.py';
if (!is_file($script)) {
$output->writeln('<error>vector_ingest.py not found</error>');
return Command::FAILURE;
}
// -------------------------------------------------
// Enforce venv usage
// -------------------------------------------------
$venvPython = $vectorDir . '/.venv/bin/python';
if (!is_file($venvPython)) {
$output->writeln('<error>No Python virtual environment found.</error>');
$output->writeln('<comment>Run first:</comment>');
$output->writeln('<info> php bin/console mto:agent:vector:install</info>');
return Command::FAILURE;
}
$knowledgeDir = rtrim($this->projectDir, '/') . '/var/knowledge';
if (!is_dir($knowledgeDir)) {
$output->writeln('<error>Knowledge directory not found:</error>');
$output->writeln($knowledgeDir);
return Command::FAILURE;
}
$output->writeln('<info>Building FAISS vector index…</info>');
$output->writeln(sprintf(
'<comment>Vector dir:</comment> %s',
$vectorDir
));
$output->writeln(sprintf(
'<comment>Knowledge dir:</comment> %s',
$knowledgeDir
));
// Every argument is shell-escaped; stderr is merged into stdout so it is echoed below.
$cmd = sprintf(
'%s %s %s %s 2>&1',
escapeshellarg($venvPython),
escapeshellarg($script),
escapeshellarg($vectorDir),
escapeshellarg($knowledgeDir)
);
exec($cmd, $out, $exitCode);
foreach ($out as $line) {
$output->writeln($line);
}
return $exitCode === 0
? Command::SUCCESS
: Command::FAILURE;
return Command::SUCCESS;
}
}

View File

@@ -1,114 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Command;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
/**
 * Console command that prepares the Python environment used by the vector tooling.
 *
 * It creates a virtual environment in "<vectorDir>/.venv" when that directory is missing,
 * verifies that pip can be run inside it, and then installs or upgrades the
 * faiss-cpu and sentence-transformers packages.
 *
 * System requirements (once per environment):
 *   sudo apt update
 *   sudo apt install -y python3-venv python3-pip
 */
#[AsCommand(
    name: 'mto:agent:vector:install',
    description: 'Creates a Python venv and installs vector dependencies'
)]
final class VectorInstallCommand extends Command
{
    /**
     * @param string $vectorDir Directory in whose ".venv" subdirectory the virtual environment lives.
     */
    public function __construct(
        private readonly string $vectorDir
    ) {
        parent::__construct();
    }

    /**
     * Creates the venv if needed, checks pip, and installs the vector dependencies.
     *
     * @return int Command::SUCCESS when every step succeeded, Command::FAILURE otherwise.
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        if (!is_dir($this->vectorDir)) {
            $output->writeln('<error>Vector directory not found</error>');

            return Command::FAILURE;
        }

        $vectorDir = rtrim($this->vectorDir, '/');
        $venvDir = $vectorDir . '/.venv';
        $venvPython = $venvDir . '/bin/python';

        // -------------------------------------------------
        // 1) Create venv if missing
        // -------------------------------------------------
        if (!is_dir($venvDir)) {
            $output->writeln('<info>Creating Python virtual environment…</info>');

            $exitCode = $this->runCommand(
                sprintf('python3 -m venv %s 2>&1', escapeshellarg($venvDir)),
                $output
            );

            // A zero exit code is not enough: the interpreter must actually exist afterwards.
            if ($exitCode !== 0 || !is_file($venvPython)) {
                $output->writeln('');
                $output->writeln('<error>Failed to create Python virtual environment.</error>');
                $output->writeln('<comment>Ensure that python3-venv is installed on the system.</comment>');

                return Command::FAILURE;
            }
        } else {
            $output->writeln('<info>Using existing Python virtual environment</info>');
        }

        // -------------------------------------------------
        // 2) Ensure pip exists inside venv
        // -------------------------------------------------
        // Only the exit code matters here, so the command output is not echoed.
        $exitCode = $this->runCommand(
            sprintf('%s -m pip --version 2>&1', escapeshellarg($venvPython))
        );

        if ($exitCode !== 0) {
            $output->writeln('');
            $output->writeln('<error>The existing virtual environment has no pip.</error>');
            $output->writeln('<comment>This usually means it was created before python3-pip was installed.</comment>');
            $output->writeln('<comment>Fix:</comment>');
            $output->writeln(sprintf('<info> rm -rf %s</info>', $venvDir));
            $output->writeln('<info> php bin/console mto:agent:vector:install</info>');

            return Command::FAILURE;
        }

        // -------------------------------------------------
        // 3) Install / update dependencies
        // -------------------------------------------------
        $output->writeln('<info>Installing vector dependencies…</info>');

        $exitCode = $this->runCommand(
            sprintf(
                '%s -m pip install --upgrade faiss-cpu sentence-transformers 2>&1',
                escapeshellarg($venvPython)
            ),
            $output
        );

        if ($exitCode !== 0) {
            $output->writeln('<error>Dependency installation failed</error>');

            return Command::FAILURE;
        }

        $output->writeln('');
        $output->writeln('<info>Vector dependencies installed successfully</info>');
        $output->writeln(sprintf('<comment>venv:</comment> %s', $venvDir));

        return Command::SUCCESS;
    }

    /**
     * Runs a shell command and optionally echoes its output lines.
     *
     * exec() appends to an array that already holds elements, so each call starts from a
     * fresh buffer; reusing one variable across calls would print earlier output again.
     *
     * @param string               $command Fully escaped command line to execute.
     * @param OutputInterface|null $output  Where to echo the captured lines; null discards them.
     *
     * @return int Exit code reported by the command.
     */
    private function runCommand(string $command, ?OutputInterface $output = null): int
    {
        $lines = [];
        $exitCode = 0;
        exec($command, $lines, $exitCode);

        if ($output !== null) {
            foreach ($lines as $line) {
                $output->writeln($line);
            }
        }

        return $exitCode;
    }
}