first commit
This commit is contained in:
136
src/Agent/AgentRunner.php
Normal file
136
src/Agent/AgentRunner.php
Normal file
@@ -0,0 +1,136 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Agent;
|
||||
|
||||
use App\Context\ContextService;
|
||||
use App\Context\UrlAnalyzer;
|
||||
use App\Infrastructure\OllamaClient;
|
||||
use App\Knowledge\Retrieval\RetrieverInterface;
|
||||
use Generator;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use Throwable;
|
||||
use App\Agent\StreamChunker;
|
||||
|
||||
/**
 * Runs a single agent turn end to end.
 *
 * run() extracts URL content from the prompt, retrieves knowledge chunks,
 * builds the final prompt, streams the model answer (with think-suppression
 * and chunking) and finally appends the completed turn to the history.
 */
final readonly class AgentRunner
{
    public function __construct(
        private PromptBuilder $promptBuilder,
        private ThinkSuppressor $thinkSuppressor,
        private ContextService $contextService,
        private UrlAnalyzer $urlAnalyzer,
        private RetrieverInterface $retriever,
        private OllamaClient $ollamaClient,
        private LoggerInterface $agentLogger,
        private bool $debug,
        private bool $logPrompt,
        private bool $logContext,
    ) {}

    /**
     * Streams the answer for a user prompt as a sequence of text chunks.
     *
     * A prompt that is empty after trimming yields a single error message.
     * Any Throwable raised inside the processing pipeline is logged and
     * replaced by a generic error message appended to the stream.
     *
     * @param string $prompt Raw user input
     * @param string $userId Identifier used for history lookup and persistence
     *
     * @return Generator<int, string>
     */
    public function run(string $prompt, string $userId): Generator
    {
        $prompt = trim($prompt);

        if ($prompt === '') {
            yield '❌ Empty prompt.';

            return;
        }

        $this->agentLogger->info('Agent run started', [
            'userId' => $userId,
        ]);

        try {
            // 1) Context strategy: only the regular (recent) history is used.
            $includeFullContext = false;

            // 2) Extract URL content (if the prompt contains a URL).
            $urlContent = $this->urlAnalyzer->extractContentFromPrompt($prompt);

            // 3) Retrieve RAG knowledge.
            $knowledgeChunks = $this->retriever->retrieve($prompt);

            // 4) Build the final prompt.
            $finalPrompt = $this->promptBuilder->build(
                prompt: $prompt,
                userId: $userId,
                urlContent: $urlContent,
                knowledgeChunks: $knowledgeChunks,
                fullContext: $includeFullContext,
            );

            if ($this->debug && $this->logPrompt) {
                $this->agentLogger->debug($finalPrompt);
            }

            if ($this->debug && $this->logContext) {
                $this->agentLogger->debug('Conversation context snapshot', [
                    'context' => $this->contextService->buildUserContext(
                        $userId,
                        $includeFullContext
                    ),
                ]);
            }

            // 5) Stream tokens from the LLM backend (chunked streaming).
            // The suppressor is an injected, stateful object that is reused
            // for every run; without a reset, the state left behind by the
            // previous answer would disable suppression for this one.
            $this->thinkSuppressor->reset();

            $fullOutput = '';
            $chunker = new StreamChunker();

            foreach ($this->ollamaClient->stream($finalPrompt) as $token) {
                $cleanToken = $this->thinkSuppressor->filter($token);

                if ($cleanToken === '') {
                    continue;
                }

                // Keep collecting the complete answer (needed for the history).
                $fullOutput .= $cleanToken;

                // Hand the token to the chunker; it decides when to emit.
                $chunk = $chunker->push($cleanToken);
                if ($chunk !== null) {
                    yield $chunk;
                }
            }

            // Emit whatever is still buffered in the chunker.
            $finalChunk = $chunker->flush();
            if ($finalChunk !== null) {
                yield $finalChunk;
            }

            // 6) Persist conversation history.
            $this->contextService->appendHistory(
                $userId,
                $prompt,
                $fullOutput
            );

            $this->agentLogger->info('Agent run finished', [
                'userId' => $userId,
                'outputLength' => mb_strlen($fullOutput),
                'contextMode' => 'recent',
            ]);
        } catch (Throwable $e) {
            $this->agentLogger->error('Agent run failed', [
                'userId' => $userId,
                'exception' => $e,
            ]);

            yield "\n❌ An internal error occurred while processing the request.";
        }
    }
}
|
||||
136
src/Agent/PromptBuilder.php
Normal file
136
src/Agent/PromptBuilder.php
Normal file
@@ -0,0 +1,136 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Agent;
|
||||
|
||||
use App\Context\ContextService;
|
||||
use App\Context\UrlAnalyzer;
|
||||
use DateTimeImmutable;
|
||||
|
||||
/**
 * Assembles the final prompt string sent to the LLM.
 *
 * The prompt consists of up to four blocks, separated by blank lines:
 * system instructions, conversation context, supporting knowledge
 * (retrieved chunks and/or URL content) and the user question.
 */
final class PromptBuilder
{
    public function __construct(
        private readonly ContextService $contextService,
        private readonly UrlAnalyzer $urlAnalyzer,
    ) {
    }

    /**
     * Build the final prompt string for the LLM.
     *
     * @param string   $prompt          User question, placed last in the prompt
     * @param string   $userId          Identifier used to load the conversation history
     * @param string   $urlContent      Text extracted from a URL; empty string when there is none
     * @param string[] $knowledgeChunks Retrieved knowledge snippets; keys are ignored
     * @param bool     $fullContext     Passed through to ContextService::buildUserContext()
     *
     * @return string Non-empty blocks joined by a blank line
     */
    public function build(
        string $prompt,
        string $userId,
        string $urlContent,
        array $knowledgeChunks,
        bool $fullContext = false,
    ): string {
        $now = (new DateTimeImmutable())->format('Y-m-d H:i:s');

        // ------------------------------------------------------------
        // 1) SYSTEM INSTRUCTIONS
        // ------------------------------------------------------------
        $systemLines = [
            'You are a conversational AI assistant.',
            'Respond clearly, precisely, and in context of the ongoing conversation.',
            'The conversation context is authoritative and must be respected.',
            'External knowledge is supporting information only.',
            'If the user asks for contact details such as phone number, email address, postal address or contact person, and the provided context contains such information, answer explicitly with the concrete data.',
            'Do not omit contact details.',
            'It is allowed and desired to quote contact data verbatim if it appears in the context.',
            "Current date and time: {$now}",
            '',
            'IMPORTANT FORMATTING RULES:',
            '- Always answer in valid Markdown.',
            '- Use headings, lists, and paragraphs where appropriate.',
            '- Insert line breaks early and often.',
            '- Never write long paragraphs without newlines.',
            '- Each list item must start on a new line.',
            '- Prefer short paragraphs over dense text blocks.',
            '',
            'IMPORTANT LANGUAGE RULES:',
            '- If the user input contains misspellings, silently use the correct canonical terms in your answer.',
            '- Never mention, explain, or point out spelling mistakes.',
            '- Do not ask clarifying questions about possible misspellings.',
            '- Do not repeat or quote misspelled terms from the user input.',
            '- Always use the correct technical spelling found in the provided context.',
            '- Answer directly and confidently using always correct canonical terminology.',
        ];

        $systemBlock = "SYSTEM:\n" . implode("\n", $systemLines);

        // ------------------------------------------------------------
        // 2) CONVERSATION CONTEXT (AUTHORITATIVE)
        // ------------------------------------------------------------
        $history = $this->contextService->buildUserContext(
            userId: $userId,
            full: $fullContext
        );

        $contextBlock = '';
        if ($history !== '') {
            $contextBlock =
                "CONVERSATION CONTEXT (authoritative):\n" .
                "The following messages are the previous turns of this conversation.\n" .
                "They must be considered when answering the next question.\n\n" .
                $history;
        }

        // ------------------------------------------------------------
        // 3) EXTERNAL KNOWLEDGE (SUPPORTING)
        // ------------------------------------------------------------
        $knowledgeParts = [];

        if ($knowledgeChunks !== []) {
            $lines = [];

            // Number by position, not by array key: a retriever may hand over
            // sparse or string keys, which must not leak into the labels.
            foreach (array_values($knowledgeChunks) as $position => $chunk) {
                $n = $position + 1;
                $lines[] = "[{$n}] {$chunk}";
            }

            $knowledgeParts[] =
                "RETRIEVED KNOWLEDGE (supporting):\n" .
                implode("\n\n", $lines);
        }

        if ($urlContent !== '') {
            $knowledgeParts[] =
                "CONTENT FROM URL (supporting):\n" .
                $urlContent;
        }

        $knowledgeBlock = implode("\n\n", $knowledgeParts);

        // ------------------------------------------------------------
        // 4) USER QUESTION
        // ------------------------------------------------------------
        $userBlock =
            "USER QUESTION:\n" .
            $prompt;

        // ------------------------------------------------------------
        // 5) FINAL PROMPT ASSEMBLY
        // ------------------------------------------------------------
        // Blocks that were not populated are empty strings and are dropped.
        $blocks = array_filter(
            [
                $systemBlock,
                $contextBlock,
                $knowledgeBlock,
                $userBlock,
            ],
            static fn (string $block): bool => $block !== '',
        );

        return implode("\n\n", $blocks);
    }
}
|
||||
61
src/Agent/StreamChunker.php
Normal file
61
src/Agent/StreamChunker.php
Normal file
@@ -0,0 +1,61 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Agent;
|
||||
|
||||
/**
 * Groups streamed tokens into larger chunks before they are emitted.
 *
 * Tokens are buffered until a natural boundary is reached (blank line,
 * sentence end, completed list item) or the buffer reaches the minimum
 * chunk size. While a Markdown code fence is open, nothing is emitted.
 */
final class StreamChunker
{
    private const FENCE = '```';

    /** Text collected since the last emitted chunk. */
    private string $buffer = '';

    /** True while an odd number of code fences has been seen. */
    private bool $insideCodeBlock = false;

    /** Buffer length (in characters) that forces a flush outside code blocks. */
    private int $minChunkSize = 120;

    /**
     * Trailing backticks (at most two) of the input seen so far that may be
     * the beginning of a fence completed by the next token.
     */
    private string $fenceCarry = '';

    /**
     * Adds a token to the buffer.
     *
     * @param string $token Next piece of streamed text
     *
     * @return string|null The buffered chunk when a flush boundary is reached, otherwise null
     */
    public function push(string $token): ?string
    {
        $this->buffer .= $token;
        $this->trackCodeFences($token);

        if (!$this->shouldFlush()) {
            return null;
        }

        $out = $this->buffer;
        $this->buffer = '';

        return $out;
    }

    /**
     * Returns and clears whatever is still buffered.
     *
     * @return string|null Remaining text, or null when the buffer is empty
     */
    public function flush(): ?string
    {
        if ($this->buffer === '') {
            return null;
        }

        $out = $this->buffer;
        $this->buffer = '';

        return $out;
    }

    /**
     * Updates the code-block state for a newly received token.
     *
     * Every fence in the token is counted (a token may open and close a block
     * at once), and backticks left over at the end are carried into the next
     * call so that a fence split across two tokens is still recognised.
     */
    private function trackCodeFences(string $token): void
    {
        $segments = explode(self::FENCE, $this->fenceCarry . $token);
        $fenceCount = count($segments) - 1;

        if ($fenceCount % 2 === 1) {
            $this->insideCodeBlock = !$this->insideCodeBlock;
        }

        // The last segment contains no complete fence, so it ends with at
        // most two backticks.
        $tail = $segments[$fenceCount];
        $trailingBackticks = strlen($tail) - strlen(rtrim($tail, '`'));
        $this->fenceCarry = str_repeat('`', $trailingBackticks);
    }

    /**
     * Decides whether the current buffer should be emitted.
     */
    private function shouldFlush(): bool
    {
        // Never split an open code block.
        if ($this->insideCodeBlock) {
            return false;
        }

        // Paragraph break.
        if (str_ends_with($this->buffer, "\n\n")) {
            return true;
        }

        // Sentence end followed by whitespace.
        if (preg_match('/[.!?]\s$/', $this->buffer)) {
            return true;
        }

        // A completed "-" or "*" list item.
        if (preg_match('/\n[-*] .+\n$/', $this->buffer)) {
            return true;
        }

        return mb_strlen($this->buffer) >= $this->minChunkSize;
    }
}
|
||||
88
src/Agent/ThinkSuppressor.php
Normal file
88
src/Agent/ThinkSuppressor.php
Normal file
@@ -0,0 +1,88 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Agent;
|
||||
|
||||
/**
 * ThinkSuppressor
 *
 * Streaming filter that removes the first <think>...</think> section from a
 * token stream.
 *
 * Key properties:
 * - Tags are recognised even when they are split across tokens
 * - Text before the opening tag and after the closing tag is preserved,
 *   including when it shares a token with the tag
 * - Only a few bytes of look-behind are kept; the response is never buffered
 * - Stateful per stream: reset() must be called before each new stream
 *
 * Limitation: tokens are passed through as soon as they arrive. When an
 * opening tag is split across tokens, the fragment that arrived before the
 * tag became recognisable has already been returned to the caller.
 */
final class ThinkSuppressor
{
    private const OPEN_TAG = '<think>';
    private const CLOSE_TAG = '</think>';

    /** Look-behind size: one byte less than the longest tag. */
    private const CARRY_LENGTH = 7;

    /** Indicates whether the stream is currently inside a <think> block. */
    private bool $insideThink = false;

    /** Indicates whether the think section has been fully closed. */
    private bool $thinkSectionCompleted = false;

    /**
     * Tail of the input seen so far (after the last recognised tag), kept to
     * detect tags that are fragmented across tokens.
     */
    private string $rollingBuffer = '';

    /**
     * Filters a single token from the LLM stream.
     *
     * @param string $token Raw token from the LLM
     *
     * @return string Cleaned token safe for user output; may be empty
     */
    public function filter(string $token): string
    {
        // Once the think section is closed, only stray closing tags are stripped.
        if ($this->thinkSectionCompleted) {
            return str_replace(self::CLOSE_TAG, '', $token);
        }

        $carryLength = strlen($this->rollingBuffer);
        $scan = $this->rollingBuffer . $token;
        $output = '';
        $searchFrom = 0;

        if (!$this->insideThink) {
            $open = strpos($scan, self::OPEN_TAG);

            if ($open === false) {
                $this->rollingBuffer = substr($scan, -self::CARRY_LENGTH);

                return $token;
            }

            // Keep the part of this token that precedes the tag. The carry
            // itself was already returned by an earlier call.
            if ($open > $carryLength) {
                $output = substr($scan, $carryLength, $open - $carryLength);
            }

            $this->insideThink = true;
            $searchFrom = $open + strlen(self::OPEN_TAG);
        }

        $close = strpos($scan, self::CLOSE_TAG, $searchFrom);

        if ($close === false) {
            // Still inside the think section: suppress, but remember the tail
            // in case the closing tag is split across tokens.
            $this->rollingBuffer = substr(substr($scan, $searchFrom), -self::CARRY_LENGTH);

            return $output;
        }

        $this->insideThink = false;
        $this->thinkSectionCompleted = true;
        $this->rollingBuffer = '';

        // The closing tag is longer than the carry, so everything after it
        // belongs to the current token and has not been emitted yet.
        $rest = substr($scan, $close + strlen(self::CLOSE_TAG));

        // Emit a single line break after the think section ends.
        return $output . "\n" . str_replace(self::CLOSE_TAG, '', $rest);
    }

    /**
     * Resets the suppressor state.
     * Must be called before starting a new stream.
     */
    public function reset(): void
    {
        $this->insideThink = false;
        $this->thinkSectionCompleted = false;
        $this->rollingBuffer = '';
    }
}
|
||||
84
src/Command/AgentCliCommand.php
Normal file
84
src/Command/AgentCliCommand.php
Normal file
@@ -0,0 +1,84 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use App\Agent\AgentRunner;
|
||||
use Symfony\Component\Console\Attribute\AsCommand;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Input\InputArgument;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use Symfony\Component\Console\Style\SymfonyStyle;
|
||||
|
||||
/**
 * AgentCliCommand
 *
 * Interactive CLI interface for the AI agent.
 * Symfony-native, streaming-first implementation.
 *
 * Responsibilities:
 * - Read user input from the console
 * - Stream chunks from the AgentRunner
 * - Render streamed output to the terminal
 *
 * The AgentRunner is the single owner of:
 * - Think suppression
 * - Context handling
 * - Streaming semantics
 */
#[AsCommand(
    name: 'mto:agent:chat',
    description: 'Start an interactive CLI chat with the AI agent'
)]
final class AgentCliCommand extends Command
{
    public function __construct(
        private readonly AgentRunner $agentRunner,
    ) {
        parent::__construct();
    }

    protected function configure(): void
    {
        $this
            ->addArgument('user-id', InputArgument::OPTIONAL, 'User/session identifier', 'cli');
    }

    /**
     * Runs the question/answer loop until the user enters nothing or "exit".
     *
     * @return int Always Command::SUCCESS
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $io = new SymfonyStyle($input, $output);
        $userId = (string) $input->getArgument('user-id');

        $io->success('AI Agent CLI started. Press Ctrl+C or type "exit" to quit.');
        $io->writeln('');

        while (true) {
            $prompt = $io->ask('Question');

            if ($prompt === null) {
                // No answer was given: leave the loop.
                $io->writeln('');

                return Command::SUCCESS;
            }

            $prompt = trim($prompt);

            if ($prompt === '' || strtolower($prompt) === 'exit') {
                $io->writeln('');

                return Command::SUCCESS;
            }

            $io->writeln('');
            $io->writeln('<info>Answer:</info>');

            foreach ($this->agentRunner->run($prompt, $userId) as $token) {
                // Model output is arbitrary text: write it raw so that
                // anything resembling a console style tag (e.g. "<info>") is
                // printed verbatim instead of being interpreted.
                $output->write($token, false, OutputInterface::OUTPUT_RAW);
            }

            $io->writeln('');
            $io->writeln('');
        }
    }
}
|
||||
116
src/Command/KnowledgeIngestCommand.php
Normal file
116
src/Command/KnowledgeIngestCommand.php
Normal file
@@ -0,0 +1,116 @@
|
||||
<?php
|
||||
// src/Command/KnowledgeIngestCommand.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use App\Knowledge\Ingest\KnowledgeIngestService;
|
||||
use Symfony\Component\Console\Attribute\AsCommand;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Input\InputArgument;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use Symfony\Component\Finder\Finder;
|
||||
|
||||
#[AsCommand(
    name: 'mto:agent:knowledge:ingest',
    description: 'Ingest one or multiple markdown/text documents into file-based knowledge chunks'
)]
final class KnowledgeIngestCommand extends Command
{
    /**
     * @param KnowledgeIngestService $ingest     Service that turns a file into knowledge chunks
     * @param string                 $uploadsDir Directory scanned when --all is used
     */
    public function __construct(
        private readonly KnowledgeIngestService $ingest,
        private readonly string $uploadsDir,
    ) {
        parent::__construct();
    }

    protected function configure(): void
    {
        $this
            ->addArgument(
                'file',
                InputArgument::OPTIONAL,
                'Path to a single .txt/.md file'
            )
            ->addOption(
                'all',
                null,
                InputOption::VALUE_NONE,
                'Ingest all .md files from the uploads directory'
            )
            ->addOption(
                'optimize',
                'o',
                InputOption::VALUE_NONE,
                'Optimize chunks for retrieval quality'
            );
    }

    /**
     * Ingests either every .md file below the uploads directory (--all) or
     * the single file given as argument, and prints the written chunks.
     *
     * @return int Command::FAILURE when the uploads directory is missing or
     *             neither a file nor --all was given, otherwise Command::SUCCESS
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $files = [];
        $optimize = (bool) $input->getOption('optimize');

        if ($input->getOption('all')) {
            if (!is_dir($this->uploadsDir)) {
                $output->writeln('<error>❌ uploads directory not found</error>');

                return Command::FAILURE;
            }

            $finder = new Finder();
            $finder
                ->files()
                ->in($this->uploadsDir)
                ->name('*.md');

            if (!$finder->hasResults()) {
                $output->writeln('<comment>ℹ️ No .md files found in uploads/</comment>');

                return Command::SUCCESS;
            }

            foreach ($finder as $file) {
                // getRealPath() returns false when the path cannot be
                // resolved; fall back to the path as found so that a string
                // is always passed on.
                $realPath = $file->getRealPath();
                $files[] = $realPath !== false ? $realPath : $file->getPathname();
            }

            $output->writeln(sprintf(
                '📂 Ingesting %d markdown files from uploads (%s)',
                count($files),
                $optimize ? 'optimized' : 'standard'
            ));
        } else {
            $file = $input->getArgument('file');

            if (!$file) {
                $output->writeln('<error>❌ Either provide a file or use --all</error>');

                return Command::FAILURE;
            }

            $files[] = (string) $file;
        }

        $totalWritten = 0;

        foreach ($files as $filePath) {
            $output->writeln('➡️ Ingesting: ' . $filePath);

            $written = $this->ingest->ingestFile(
                $filePath,
                optimize: $optimize
            );

            $totalWritten += count($written);

            foreach ($written as $chunk) {
                $output->writeln(' - ' . $chunk);
            }
        }

        $output->writeln('');
        $output->writeln('✅ Total written chunks: ' . $totalWritten);

        return Command::SUCCESS;
    }
}
|
||||
89
src/Command/VectorIngestCommand.php
Normal file
89
src/Command/VectorIngestCommand.php
Normal file
@@ -0,0 +1,89 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use Symfony\Component\Console\Attribute\AsCommand;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
|
||||
#[AsCommand(
    name: 'mto:agent:vector:ingest',
    description: 'Builds the FAISS vector index from index.json'
)]
final class VectorIngestCommand extends Command
{
    /**
     * @param string $vectorDir  Directory expected to contain vector_ingest.py and the .venv
     * @param string $projectDir Project root; the knowledge directory is <projectDir>/var/knowledge
     */
    public function __construct(
        private readonly string $vectorDir,
        private readonly string $projectDir
    ) {
        parent::__construct();
    }

    /**
     * Verifies that all prerequisites exist, then runs vector_ingest.py with
     * the virtual environment's interpreter and relays its output.
     *
     * @return int Command::SUCCESS when the script exits with code 0, otherwise Command::FAILURE
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $baseDir = rtrim($this->vectorDir, '/');

        if (!is_dir($baseDir)) {
            $output->writeln('<error>Vector directory not found</error>');

            return Command::FAILURE;
        }

        $ingestScript = $baseDir . '/vector_ingest.py';

        if (!is_file($ingestScript)) {
            $output->writeln('<error>vector_ingest.py not found</error>');

            return Command::FAILURE;
        }

        // The script must run with the interpreter of the local venv.
        $interpreter = $baseDir . '/.venv/bin/python';

        if (!is_file($interpreter)) {
            $output->writeln([
                '<error>No Python virtual environment found.</error>',
                '<comment>Run first:</comment>',
                '<info> php bin/console mto:agent:vector:install</info>',
            ]);

            return Command::FAILURE;
        }

        $knowledgePath = rtrim($this->projectDir, '/') . '/var/knowledge';

        if (!is_dir($knowledgePath)) {
            $output->writeln('<error>Knowledge directory not found:</error>');
            $output->writeln($knowledgePath);

            return Command::FAILURE;
        }

        $output->writeln('<info>Building FAISS vector index…</info>');
        $output->writeln(sprintf('<comment>Vector dir:</comment> %s', $baseDir));
        $output->writeln(sprintf('<comment>Knowledge dir:</comment> %s', $knowledgePath));

        // Interpreter, script and both directories, each shell-escaped;
        // stderr is merged into stdout.
        $escaped = array_map(
            escapeshellarg(...),
            [$interpreter, $ingestScript, $baseDir, $knowledgePath],
        );
        $commandLine = implode(' ', $escaped) . ' 2>&1';

        exec($commandLine, $scriptOutput, $status);

        foreach ($scriptOutput as $outputLine) {
            $output->writeln($outputLine);
        }

        if ($status === 0) {
            return Command::SUCCESS;
        }

        return Command::FAILURE;
    }
}
|
||||
114
src/Command/VectorInstallCommand.php
Normal file
114
src/Command/VectorInstallCommand.php
Normal file
@@ -0,0 +1,114 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use Symfony\Component\Console\Attribute\AsCommand;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
|
||||
/**
 * System requirements (once per environment):
 * sudo apt update
 * sudo apt install -y python3-venv python3-pip
 */
#[AsCommand(
    name: 'mto:agent:vector:install',
    description: 'Creates a Python venv and installs vector dependencies'
)]
final class VectorInstallCommand extends Command
{
    /**
     * @param string $vectorDir Directory in which the .venv is created
     */
    public function __construct(
        private readonly string $vectorDir
    ) {
        parent::__construct();
    }

    /**
     * Creates the virtual environment when it is missing, checks that pip is
     * available inside it and installs/upgrades the vector dependencies.
     *
     * @return int Command::SUCCESS when all steps succeed, otherwise Command::FAILURE
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        if (!is_dir($this->vectorDir)) {
            $output->writeln('<error>Vector directory not found</error>');

            return Command::FAILURE;
        }

        $vectorDir = rtrim($this->vectorDir, '/');
        $venvDir = $vectorDir . '/.venv';
        $venvPython = $venvDir . '/bin/python';

        // -------------------------------------------------
        // 1) Create venv if missing
        // -------------------------------------------------
        if (!is_dir($venvDir)) {
            $output->writeln('<info>Creating Python virtual environment…</info>');

            $cmd = sprintf(
                'python3 -m venv %s 2>&1',
                escapeshellarg($venvDir)
            );

            // exec() appends to an existing array, so every call starts with
            // an empty one to keep the output of different commands apart.
            $out = [];
            exec($cmd, $out, $exitCode);

            foreach ($out as $line) {
                $output->writeln($line);
            }

            if ($exitCode !== 0 || !is_file($venvPython)) {
                $output->writeln('');
                $output->writeln('<error>Failed to create Python virtual environment.</error>');
                $output->writeln('<comment>Ensure that python3-venv is installed on the system.</comment>');

                return Command::FAILURE;
            }
        } else {
            $output->writeln('<info>Using existing Python virtual environment</info>');
        }

        // -------------------------------------------------
        // 2) Ensure pip exists inside venv
        // -------------------------------------------------
        $cmd = sprintf(
            '%s -m pip --version 2>&1',
            escapeshellarg($venvPython)
        );

        // Only the exit code matters here; the output is not shown.
        $out = [];
        exec($cmd, $out, $exitCode);

        if ($exitCode !== 0) {
            $output->writeln('');
            $output->writeln('<error>The existing virtual environment has no pip.</error>');
            $output->writeln('<comment>This usually means it was created before python3-pip was installed.</comment>');
            $output->writeln('<comment>Fix:</comment>');
            $output->writeln(sprintf('<info> rm -rf %s</info>', $venvDir));
            $output->writeln('<info> php bin/console mto:agent:vector:install</info>');

            return Command::FAILURE;
        }

        // -------------------------------------------------
        // 3) Install / update dependencies
        // -------------------------------------------------
        $output->writeln('<info>Installing vector dependencies…</info>');

        $cmd = sprintf(
            '%s -m pip install --upgrade faiss-cpu sentence-transformers 2>&1',
            escapeshellarg($venvPython)
        );

        // Without the reset, the lines collected in steps 1 and 2 would be
        // printed again together with the pip install output.
        $out = [];
        exec($cmd, $out, $exitCode);

        foreach ($out as $line) {
            $output->writeln($line);
        }

        if ($exitCode !== 0) {
            $output->writeln('<error>Dependency installation failed</error>');

            return Command::FAILURE;
        }

        $output->writeln('');
        $output->writeln('<info>Vector dependencies installed successfully</info>');
        $output->writeln(sprintf('<comment>venv:</comment> %s', $venvDir));

        return Command::SUCCESS;
    }
}
|
||||
126
src/Context/ContextService.php
Normal file
126
src/Context/ContextService.php
Normal file
@@ -0,0 +1,126 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Context;
|
||||
|
||||
/**
 * ContextService
 *
 * Manages conversational history persistence and retrieval.
 *
 * Responsibilities:
 * - Persist completed conversation turns (append-only)
 * - Provide recent or extended conversation context
 * - Resolve history storage paths safely
 *
 * Non-responsibilities:
 * - No follow-up detection
 * - No prompt semantics
 * - No interpretation of user intent
 *
 * Context levels:
 * - Regular context: last N lines (default)
 * - Full context: extended history for special cases
 */
final class ContextService
{
    /** Absolute directory that holds one history file per user. */
    private string $historyDir;

    /**
     * Number of lines included in regular context.
     * Intended for normal conversational continuity.
     */
    private int $maxRegularLines = 20;

    /**
     * Number of lines included in full context.
     * Intended for exceptional or diagnostic scenarios.
     */
    private int $maxFullLines = 500;

    /**
     * @param string $historyDir History directory; a path not starting with "/"
     *                           is resolved relative to $projectDir
     * @param string $projectDir Project root used to resolve relative paths
     *
     * @throws \RuntimeException When the history directory does not exist and cannot be created
     */
    public function __construct(
        string $historyDir,
        string $projectDir,
    ) {
        /**
         * Normalize history directory:
         * - Allow relative paths in env (e.g. "var/agent-history")
         * - Always resolve to an absolute path based on project root
         */
        $historyDir = rtrim($historyDir, '/');

        if (!str_starts_with($historyDir, '/')) {
            $historyDir = rtrim($projectDir, '/') . '/' . ltrim($historyDir, '/');
        }

        $this->historyDir = $historyDir;

        // Ensure the directory exists. The second is_dir() check covers the
        // case where another process created it between the first check and
        // mkdir(); any other failure is reported instead of being ignored.
        if (
            !is_dir($this->historyDir)
            && !mkdir($this->historyDir, 0777, true)
            && !is_dir($this->historyDir)
        ) {
            throw new \RuntimeException(
                sprintf('History directory "%s" could not be created.', $this->historyDir)
            );
        }
    }

    /**
     * Returns the conversation context for a given user.
     *
     * @param string $userId Stable client identifier
     * @param bool   $full   Whether to load extended history
     *
     * @return string The last lines of the history file joined by "\n";
     *                empty string when no history can be read
     */
    public function buildUserContext(string $userId, bool $full = false): string
    {
        $path = $this->getHistoryPath($userId);

        if (!is_file($path)) {
            return '';
        }

        $lines = file($path, FILE_IGNORE_NEW_LINES);
        if ($lines === false) {
            return '';
        }

        $maxLines = $full ? $this->maxFullLines : $this->maxRegularLines;
        $selected = array_slice($lines, -$maxLines);

        return implode("\n", $selected);
    }

    /**
     * Appends a completed interaction to the user's history.
     *
     * Format (append-only):
     * Question: <user prompt>
     * <assistant response>
     */
    public function appendHistory(string $userId, string $prompt, string $response): void
    {
        $path = $this->getHistoryPath($userId);

        $entry = "Question: {$prompt}\n{$response}\n";

        // LOCK_EX requests an exclusive lock for the duration of the write.
        file_put_contents($path, $entry, FILE_APPEND | LOCK_EX);
    }

    /**
     * Deletes the complete conversation history for a user.
     */
    public function deleteHistory(string $userId): void
    {
        $path = $this->getHistoryPath($userId);

        if (is_file($path)) {
            unlink($path);
        }
    }

    /**
     * Resolves the absolute history file path for a user.
     *
     * Every character outside [a-zA-Z0-9_-] is replaced by "_", so the user
     * id cannot introduce path separators into the file name.
     */
    private function getHistoryPath(string $userId): string
    {
        $safeUserId = preg_replace('/[^a-zA-Z0-9_-]/', '_', $userId);

        return $this->historyDir . '/' . $safeUserId . '.txt';
    }
}
|
||||
120
src/Context/UrlAnalyzer.php
Normal file
120
src/Context/UrlAnalyzer.php
Normal file
@@ -0,0 +1,120 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Context;
|
||||
|
||||
use RuntimeException;
|
||||
|
||||
/**
|
||||
* UrlAnalyzer
|
||||
*
|
||||
* Extracts and analyzes URL content from user prompts in a production-safe way.
|
||||
*
|
||||
* Responsibilities:
|
||||
* - Detect the first URL inside a prompt
|
||||
* - Fetch remote content with strict limits
|
||||
* - Clean and normalize readable text
|
||||
* - Identify trusted internal domains based on URL host
|
||||
*
|
||||
* Design constraints:
|
||||
* - No framework dependencies
|
||||
* - No prompt or agent logic
|
||||
* - Defensive against slow or large responses
|
||||
*/
|
||||
final class UrlAnalyzer
|
||||
{
|
||||
private int $timeoutSeconds = 20;
|
||||
private int $maxChars = 5000;
|
||||
|
||||
/**
|
||||
* List of trusted internal domains.
|
||||
* Used for marking content as authoritative.
|
||||
*/
|
||||
private array $internalDomains = [
|
||||
'mitho-media.de',
|
||||
];
|
||||
|
||||
/**
 * Extracts readable text from the first URL found in a prompt.
 *
 * The page is read up to a fixed byte limit, script/style blocks and all
 * remaining markup are removed, whitespace is collapsed and the result is
 * cut to the configured maximum number of characters.
 *
 * NOTE(review): the URL is taken from the prompt and fetched as is; no
 * restriction on the target host is visible in this method. Confirm whether
 * requests to internal addresses need to be blocked.
 *
 * @param string $prompt
 * @return string Cleaned page text or empty string on failure
 */
public function extractContentFromPrompt(string $prompt): string
{
    if (!preg_match('~https?://\S+|www\.\S+~i', $prompt, $matches)) {
        return '';
    }

    // "\S+" also captures punctuation that merely ends the sentence or
    // quotation the URL appears in; strip it before fetching.
    $url = rtrim($matches[0], ".,;:!?'\"");
    if (!str_starts_with($url, 'http')) {
        $url = 'https://' . $url;
    }

    $parts = parse_url($url);
    if ($parts === false || empty($parts['host'])) {
        return '';
    }

    $context = stream_context_create([
        'http' => [
            'timeout' => $this->timeoutSeconds,
            'user_agent' => 'mithoAgent/1.0',
            'ignore_errors' => true,
        ],
    ]);

    // Failure to open the URL is an expected outcome and handled below.
    $handle = @fopen($url, 'rb', false, $context);
    if ($handle === false) {
        return '';
    }

    try {
        $html = '';
        while (!feof($handle) && strlen($html) < $this->maxChars * 2) {
            $piece = fread($handle, 1024);

            // Stop on a read error or when no more data is delivered, so a
            // failing stream cannot keep the loop spinning.
            if ($piece === false || $piece === '') {
                break;
            }

            $html .= $piece;
        }
    } finally {
        fclose($handle);
    }

    if ($html === '') {
        return '';
    }

    // Remove script and style blocks. Because reading stops at a byte limit,
    // the last block may lack its closing tag; it is removed up to the end
    // of the input so that its content does not end up in the text.
    $html = preg_replace('~<script[^>]*>.*?(?:</script>|\z)~is', '', $html) ?? $html;
    $html = preg_replace('~<style[^>]*>.*?(?:</style>|\z)~is', '', $html) ?? $html;

    // Strip remaining HTML.
    $text = strip_tags($html);

    // The byte limit can cut a multibyte character in half (and pages may
    // not be UTF-8 at all). Invalid sequences would make the /u pattern
    // below fail, leaving the whitespace untouched, so replace them first.
    $text = mb_scrub($text, 'UTF-8');

    // Normalize whitespace.
    $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

    return mb_substr(trim($text), 0, $this->maxChars);
}
|
||||
|
||||
/**
|
||||
* Determines whether a URL belongs to a trusted internal domain.
|
||||
*
|
||||
* @param string $url
|
||||
* @return bool
|
||||
*/
|
||||
public function isInternalDomainUrl(string $url): bool
|
||||
{
|
||||
$parts = parse_url($url);
|
||||
if ($parts === false || empty($parts['host'])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
$host = mb_strtolower($parts['host']);
|
||||
|
||||
foreach ($this->internalDomains as $domain) {
|
||||
if ($host === $domain || str_ends_with($host, '.' . $domain)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
0
src/Controller/.gitignore
vendored
Normal file
0
src/Controller/.gitignore
vendored
Normal file
115
src/Controller/AskSseController.php
Normal file
115
src/Controller/AskSseController.php
Normal file
@@ -0,0 +1,115 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Controller;
|
||||
|
||||
use App\Agent\AgentRunner;
|
||||
use App\Http\ClientIdResolver;
|
||||
use Symfony\Component\HttpFoundation\Request;
|
||||
use Symfony\Component\HttpFoundation\Response;
|
||||
use Symfony\Component\HttpFoundation\StreamedResponse;
|
||||
use Symfony\Component\Routing\Annotation\Route;
|
||||
|
||||
/**
 * Streams the agent's answer to the client as Server-Sent Events.
 */
final readonly class AskSseController
{
    public function __construct(
        private AgentRunner $agentRunner,
        private ClientIdResolver $clientIdResolver,
    ) {}

    /**
     * Reads {"prompt": "..."} from the JSON request body and streams the
     * agent output chunk by chunk, followed by a final "done" event.
     */
    #[Route('/ask-sse', name: 'ask_sse', methods: ['POST'])]
    public function stream(Request $request): StreamedResponse
    {
        // The body is untrusted: it may be invalid JSON, a scalar, or carry a
        // non-scalar "prompt". All of those are treated as an empty prompt.
        $data = json_decode($request->getContent(), true);
        $rawPrompt = is_array($data) ? ($data['prompt'] ?? '') : '';
        $prompt = is_scalar($rawPrompt) ? trim((string) $rawPrompt) : '';

        $cookieResponse = new Response();
        $clientId = $this->clientIdResolver->resolve($request, $cookieResponse);

        $response = new StreamedResponse(
            function () use ($prompt, $clientId): void {
                // Disable all PHP output buffering. Stop if a buffer cannot be
                // removed instead of looping forever.
                for ($level = ob_get_level(); $level > 0; $level = ob_get_level()) {
                    if (!ob_end_flush()) {
                        break;
                    }
                }

                // SSE prelude
                echo "retry: 3000\n\n";
                flush();

                if ($prompt === '') {
                    $this->sendEvent('error', 'Empty prompt');
                    return;
                }

                // Send chunks as they arrive; newlines inside a chunk are preserved.
                foreach ($this->agentRunner->run($prompt, $clientId) as $chunk) {
                    // Normalize line endings
                    $this->sendData(str_replace(["\r\n", "\r"], "\n", $chunk));
                }

                // Signal completion
                $this->sendEvent('done', '[DONE]');
            },
            200,
            [
                'Content-Type' => 'text/event-stream; charset=utf-8',
                'Cache-Control' => 'no-cache, no-store, must-revalidate',
                'Connection' => 'keep-alive',
                'X-Accel-Buffering' => 'no',
            ]
        );

        // Attach cookies to the response object so they are emitted together
        // with the other headers. Calling header() from inside the stream
        // callback is too late once any output has been flushed.
        foreach ($cookieResponse->headers->getCookies() as $cookie) {
            $response->headers->setCookie($cookie);
        }

        return $response;
    }

    /**
     * Sends one unnamed SSE message while keeping the Markdown structure.
     *
     * SSE allows several "data:" lines per event, so every line of the chunk
     * is emitted as its own "data:" line.
     */
    private function sendData(string $data): void
    {
        foreach (explode("\n", $data) as $line) {
            echo 'data: ' . $line . "\n";
        }

        // Blank line = end of the SSE message.
        // NOTE(review): this emits two extra blank lines beyond the one that is
        // strictly needed; kept as-is so the wire format the client sees does not change.
        echo "\n\n";
        flush();
    }

    /**
     * Sends a named SSE event with a single-line payload.
     */
    private function sendEvent(string $event, string $data): void
    {
        // Line breaks would split the payload into several fields.
        $safe = str_replace(["\r", "\n"], ' ', $data);

        echo "event: {$event}\n";
        echo "data: {$safe}\n\n";
        flush();
    }
}
|
||||
127
src/Controller/HistoryController.php
Normal file
127
src/Controller/HistoryController.php
Normal file
@@ -0,0 +1,127 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Controller;
|
||||
|
||||
use App\Context\ContextService;
|
||||
use App\Http\ClientIdResolver;
|
||||
use Symfony\Component\HttpFoundation\JsonResponse;
|
||||
use Symfony\Component\HttpFoundation\Request;
|
||||
use Symfony\Component\HttpFoundation\Response;
|
||||
use Symfony\Component\Routing\Annotation\Route;
|
||||
|
||||
/**
|
||||
* HistoryController
|
||||
*
|
||||
* Read-only and destructive endpoints for conversation history.
|
||||
*
|
||||
* Responsibilities:
|
||||
* - Expose stored chat history for frontend reload
|
||||
* - Allow explicit deletion of the current client's history
|
||||
*
|
||||
* Identity handling:
|
||||
* - Client identity is resolved exclusively via ClientIdResolver
|
||||
* - No user identifiers are accepted from the request
|
||||
*/
|
||||
final class HistoryController
{
    /** Prefix that marks a user message line in the stored history. */
    private const USER_PREFIX = 'Question: ';

    public function __construct(
        private readonly ContextService $contextService,
        private readonly ClientIdResolver $clientIdResolver,
    ) {}

    /**
     * Returns the full conversation history for the current client
     * in a frontend-friendly structure: a list of
     * ['role' => 'user'|'assistant', 'text' => string] entries.
     */
    #[Route('/history', name: 'chat_history', methods: ['GET'])]
    public function history(Request $request): JsonResponse
    {
        // Resolve client ID (cookie-based)
        $response = new Response();
        $clientId = $this->clientIdResolver->resolve($request, $response);

        $raw = $this->contextService->buildUserContext($clientId, full: true);

        return $this->jsonWithCookies(
            $raw === '' ? [] : $this->parseHistory($raw),
            $response
        );
    }

    /**
     * Deletes the complete conversation history for the current client.
     */
    #[Route('/history/delete', name: 'delete_history', methods: ['POST'])]
    public function delete(Request $request): JsonResponse
    {
        // Resolve client ID (cookie-based)
        $response = new Response();
        $clientId = $this->clientIdResolver->resolve($request, $response);

        $this->contextService->deleteHistory($clientId);

        return $this->jsonWithCookies(
            [
                'status' => 'ok',
                'message' => 'History deleted',
            ],
            $response
        );
    }

    /**
     * Splits the raw history text into alternating user/assistant messages.
     *
     * A line starting with "Question: " opens a user message; every line up to
     * the next such line belongs to the assistant answer. Blank lines inside an
     * answer are kept so Markdown paragraphs and code blocks survive a reload;
     * leading and trailing blank lines are trimmed.
     *
     * @return list<array{role: string, text: string}>
     */
    private function parseHistory(string $raw): array
    {
        $messages = [];
        $assistantLines = [];

        foreach (explode("\n", $raw) as $line) {
            if (!str_starts_with($line, self::USER_PREFIX)) {
                $assistantLines[] = $line;
                continue;
            }

            // A new question closes the assistant answer collected so far.
            $assistant = $this->buildAssistantMessage($assistantLines);
            if ($assistant !== null) {
                $messages[] = $assistant;
            }
            $assistantLines = [];

            $messages[] = [
                'role' => 'user',
                'text' => trim(substr($line, strlen(self::USER_PREFIX))),
            ];
        }

        // Flush trailing assistant output
        $assistant = $this->buildAssistantMessage($assistantLines);
        if ($assistant !== null) {
            $messages[] = $assistant;
        }

        return $messages;
    }

    /**
     * Joins buffered answer lines into one assistant message.
     *
     * @param string[] $lines
     * @return array{role: string, text: string}|null Null when the buffer holds no visible text
     */
    private function buildAssistantMessage(array $lines): ?array
    {
        $text = trim(implode("\n", $lines));

        return $text === '' ? null : ['role' => 'assistant', 'text' => $text];
    }

    /**
     * Helper to return JSON responses while forwarding cookies.
     */
    private function jsonWithCookies(array $data, Response $cookieResponse): JsonResponse
    {
        $json = new JsonResponse($data);

        foreach ($cookieResponse->headers->getCookies() as $cookie) {
            $json->headers->setCookie($cookie);
        }

        return $json;
    }
}
|
||||
46
src/Http/ClientIdResolver.php
Normal file
46
src/Http/ClientIdResolver.php
Normal file
@@ -0,0 +1,46 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Http;
|
||||
|
||||
use Symfony\Component\HttpFoundation\Cookie;
|
||||
use Symfony\Component\HttpFoundation\Request;
|
||||
use Symfony\Component\HttpFoundation\Response;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
/**
|
||||
* ClientIdResolver
|
||||
*
|
||||
* Resolves a stable, anonymous client identifier for browser-based users.
|
||||
* The identifier is stored as an HttpOnly cookie.
|
||||
*/
|
||||
final class ClientIdResolver
{
    private const COOKIE_NAME = 'ai_client_id';

    /**
     * Returns the client id from the request cookie, or issues a new one.
     *
     * When a new id is generated, the matching cookie is attached to
     * $response; the caller is responsible for sending that cookie.
     *
     * @param Request  $request  Incoming request that may carry the id cookie
     * @param Response $response Receives the Set-Cookie header for a new id
     *
     * @return string UUID identifying the client
     */
    public function resolve(Request $request, Response $response): string
    {
        $clientId = $request->cookies->get(self::COOKIE_NAME);

        // The cookie is client-controlled. Only values that look like the UUIDs
        // issued below are accepted; anything else is replaced by a fresh id.
        if (is_string($clientId) && Uuid::isValid($clientId)) {
            return $clientId;
        }

        $clientId = Uuid::v4()->toRfc4122();

        $response->headers->setCookie(
            new Cookie(
                name: self::COOKIE_NAME,
                value: $clientId,
                expire: strtotime('+1 year'),
                path: '/',
                // Mark the cookie as secure whenever the request arrived over HTTPS.
                secure: $request->isSecure(),
                httpOnly: true,
                sameSite: Cookie::SAMESITE_LAX,
            )
        );

        return $clientId;
    }
}
|
||||
148
src/Infrastructure/OllamaClient.php
Normal file
148
src/Infrastructure/OllamaClient.php
Normal file
@@ -0,0 +1,148 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Infrastructure;
|
||||
|
||||
use Generator;
|
||||
use JsonException;
|
||||
use RuntimeException;
|
||||
use Throwable;
|
||||
|
||||
/**
|
||||
* OllamaClient
|
||||
*
|
||||
* Production-ready streaming client for Ollama-compatible LLM backends.
|
||||
*
|
||||
* Key properties:
|
||||
* - True live streaming (tokens are yielded while the request is running)
|
||||
* - PHP-safe (no yield inside cURL callbacks)
|
||||
* - Works for both HTTP streaming and CLI usage
|
||||
* - Deterministic and resource-safe
|
||||
*
|
||||
* Implementation strategy:
|
||||
* - Use curl_multi_* to keep control of the execution loop
|
||||
* - Accumulate partial chunks into a rolling buffer
|
||||
* - Extract JSON lines incrementally
|
||||
* - Yield tokens immediately when they arrive
|
||||
*/
|
||||
final class OllamaClient
{
    private string $apiUrl;
    private string $model;
    private int $timeoutSeconds;

    /**
     * @param string $apiUrl         Endpoint that accepts the generate request
     * @param string $model          Model name sent in the request payload
     * @param int    $timeoutSeconds Total request timeout (CURLOPT_TIMEOUT)
     */
    public function __construct(
        string $apiUrl,
        string $model,
        int $timeoutSeconds,
    ) {
        $this->apiUrl = $apiUrl;
        $this->model = $model;
        $this->timeoutSeconds = $timeoutSeconds;
    }

    /**
     * Streams tokens from the LLM backend in real time.
     *
     * @param string $prompt Fully constructed prompt
     *
     * @return Generator<string>
     * @throws JsonException    When the request payload cannot be encoded
     * @throws RuntimeException When cURL cannot be set up, the transfer fails,
     *                          or the backend reports an error
     */
    public function stream(string $prompt): Generator
    {
        $payload = json_encode([
            'model' => $this->model,
            'prompt' => $prompt,
            'stream' => true,
        ], JSON_THROW_ON_ERROR);

        $buffer = '';
        $done = false;

        $ch = curl_init($this->apiUrl);
        if ($ch === false) {
            throw new RuntimeException('Failed to initialize cURL');
        }

        curl_setopt_array($ch, [
            CURLOPT_POST => true,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_RETURNTRANSFER => false,
            CURLOPT_TIMEOUT => $this->timeoutSeconds,
            // Only collect data here; yielding happens in the loop below.
            CURLOPT_WRITEFUNCTION => static function ($curl, string $data) use (&$buffer): int {
                $buffer .= $data;
                return strlen($data);
            },
        ]);

        $mh = curl_multi_init();
        if ($mh === false) {
            curl_close($ch);
            throw new RuntimeException('Failed to initialize cURL multi handle');
        }

        curl_multi_add_handle($mh, $ch);

        try {
            do {
                // Execute the multi handle
                do {
                    $status = curl_multi_exec($mh, $running);
                } while ($status === CURLM_CALL_MULTI_PERFORM);

                // Consume every complete line currently in the buffer
                while (($pos = strpos($buffer, "\n")) !== false) {
                    $line = trim(substr($buffer, 0, $pos));
                    $buffer = substr($buffer, $pos + 1);

                    $json = $this->decodeLine($line);
                    if ($json === null) {
                        continue;
                    }

                    $this->throwOnBackendError($json);

                    if (isset($json['response']) && is_string($json['response'])) {
                        yield $json['response'];
                    }

                    if (!empty($json['done'])) {
                        $done = true;
                    }
                }

                // Wait for network activity
                if ($running && !$done && curl_multi_select($mh, 0.2) === -1) {
                    // select() could not wait; back off briefly instead of spinning.
                    usleep(10_000);
                }
            } while ($running && !$done);

            // Flush remaining buffer (last line without trailing newline)
            if (!$done && trim($buffer) !== '') {
                $json = $this->decodeLine(trim($buffer));

                if ($json !== null) {
                    $this->throwOnBackendError($json);

                    if (isset($json['response']) && is_string($json['response'])) {
                        yield $json['response'];
                    }
                }
            }

            // For handles driven through curl_multi_*, the transfer result is only
            // available via curl_multi_info_read(); curl_errno() on the easy
            // handle stays 0 until then, which would hide connection failures.
            while (($info = curl_multi_info_read($mh)) !== false) {
                if ($info['result'] !== CURLE_OK) {
                    $error = curl_error($ch);
                    if ($error === '') {
                        $error = (string) curl_strerror($info['result']);
                    }

                    throw new RuntimeException('LLM connection error: ' . $error);
                }
            }
        } finally {
            curl_multi_remove_handle($mh, $ch);
            curl_multi_close($mh);
            curl_close($ch);
        }
    }

    /**
     * Decodes one line of the response stream.
     *
     * @return array<mixed>|null Decoded object, or null for blank / malformed lines
     */
    private function decodeLine(string $line): ?array
    {
        if ($line === '') {
            return null;
        }

        try {
            $decoded = json_decode($line, true, flags: JSON_THROW_ON_ERROR);
        } catch (JsonException) {
            // Malformed lines are skipped so one bad line does not abort the stream.
            return null;
        }

        return is_array($decoded) ? $decoded : null;
    }

    /**
     * Turns a backend error payload ({"error": "..."}) into an exception
     * instead of silently producing an empty answer.
     *
     * @param array<mixed> $json
     *
     * @throws RuntimeException
     */
    private function throwOnBackendError(array $json): void
    {
        if (isset($json['error']) && is_string($json['error']) && $json['error'] !== '') {
            throw new RuntimeException('LLM backend error: ' . $json['error']);
        }
    }
}
|
||||
11
src/Kernel.php
Normal file
11
src/Kernel.php
Normal file
@@ -0,0 +1,11 @@
|
||||
<?php
|
||||
|
||||
namespace App;
|
||||
|
||||
use Symfony\Bundle\FrameworkBundle\Kernel\MicroKernelTrait;
|
||||
use Symfony\Component\HttpKernel\Kernel as BaseKernel;
|
||||
|
||||
/**
 * Application kernel; all behavior comes from the base kernel and MicroKernelTrait.
 */
class Kernel extends BaseKernel
{
    use MicroKernelTrait;
}
|
||||
58
src/Knowledge/Ingest/ChunkIndexWriter.php
Normal file
58
src/Knowledge/Ingest/ChunkIndexWriter.php
Normal file
@@ -0,0 +1,58 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/ChunkIndexWriter.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
/**
 * Maintains the JSON chunk index: a flat list of entries, one per chunk.
 */
final class ChunkIndexWriter
{
    public function __construct(
        private string $indexPath
    ) {}

    /**
     * Appends one entry to the index file.
     *
     * @param array<string, mixed> $entry
     *
     * @throws \JsonException    When the index cannot be encoded
     * @throws \RuntimeException When the index directory or file cannot be written
     */
    public function add(array $entry): void
    {
        $index = $this->load();
        $index[] = $entry;
        $this->save($index);
    }

    /**
     * Tells whether an entry with this source name and content hash already exists.
     */
    public function hasSourceHash(string $source, string $hash): bool
    {
        foreach ($this->load() as $entry) {
            if (
                ($entry['source'] ?? null) === $source &&
                ($entry['sourceHash'] ?? null) === $hash
            ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Reads the index; a missing, empty or undecodable file counts as an empty index.
     */
    private function load(): array
    {
        if (!is_file($this->indexPath)) {
            return [];
        }

        $json = file_get_contents($this->indexPath);
        $data = $json ? json_decode($json, true) : null;

        return is_array($data) ? $data : [];
    }

    /**
     * Writes the complete index back to disk.
     */
    private function save(array $index): void
    {
        $dir = dirname($this->indexPath);
        // The second is_dir() covers a concurrent process creating the directory.
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException("Could not create index directory: {$dir}");
        }

        // Without JSON_THROW_ON_ERROR a failed encode returns false, which would
        // overwrite the existing index with an empty file. Invalid UTF-8 (e.g. in
        // keywords) is substituted rather than treated as fatal.
        $json = json_encode(
            $index,
            JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE | JSON_THROW_ON_ERROR
        );

        if (file_put_contents($this->indexPath, $json, LOCK_EX) === false) {
            throw new \RuntimeException("Could not write index file: {$this->indexPath}");
        }
    }
}
|
||||
149
src/Knowledge/Ingest/ChunkWriter.php
Normal file
149
src/Knowledge/Ingest/ChunkWriter.php
Normal file
@@ -0,0 +1,149 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/ChunkWriter.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
|
||||
use App\Knowledge\StopWords;
|
||||
|
||||
/**
 * Persists chunks as text files and records each one in the manifest and the keyword index.
 */
final class ChunkWriter
{
    private const MAX_KEYWORDS = 25;
    private const MIN_KEYWORD_LENGTH = 4;

    public function __construct(
        private string $chunksDir,
        private string $manifestPath,
        private ChunkIndexWriter $indexWriter,
        private StopWords $stopWords,
    ) {
    }

    /**
     * Writes every chunk to its own file and registers it.
     *
     * @param string   $sourceName Name of the source document
     * @param string[] $chunks     Chunk texts, keyed by their position
     * @param string   $sourceHash Hash of the source content, stored in the index
     *
     * @return string[] written filenames
     *
     * @throws \RuntimeException When a directory or file cannot be written
     * @throws \JsonException    When the manifest cannot be encoded
     */
    public function write(string $sourceName, array $chunks, string $sourceHash): array
    {
        $this->ensureDirectory($this->chunksDir);

        $manifest = $this->loadManifest();
        $written = [];

        $base = $this->safeBase($sourceName);
        $ts = date('Ymd_His');

        foreach ($chunks as $i => $chunk) {
            $filename = "{$base}__{$ts}__" . str_pad((string) $i, 4, '0', STR_PAD_LEFT) . '.txt';
            $path = rtrim($this->chunksDir, '/') . '/' . $filename;

            $header = $this->buildHeader(
                source: $sourceName,
                index: $i
            );

            // Do not register a chunk whose file could not be written.
            if (file_put_contents($path, $header . "\n\n" . $chunk) === false) {
                throw new \RuntimeException("Could not write chunk file: {$path}");
            }

            $written[] = $filename;

            $manifest[] = [
                'file' => $filename,
                'source' => $sourceName,
                'index' => $i,
                'chars' => mb_strlen($chunk),
                'createdAt' => date('c'),
            ];

            $this->indexWriter->add([
                'file' => $filename,
                'source' => $sourceName,
                'sourceHash' => $sourceHash,
                'keywords' => $this->extractKeywords($chunk),
                'chars' => mb_strlen($chunk),
            ]);
        }

        $this->saveManifest($manifest);

        return $written;
    }

    /**
     * Creates the directory (recursively) when it does not exist yet.
     */
    private function ensureDirectory(string $dir): void
    {
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException("Could not create directory: {$dir}");
        }
    }

    /**
     * Derives a lowercase, filesystem-friendly base name from the source name.
     */
    private function safeBase(string $name): string
    {
        $name = pathinfo($name, PATHINFO_FILENAME);
        $name = mb_strtolower($name);
        $name = preg_replace('/[^a-z0-9\-_]+/u', '-', $name);

        return trim((string) $name, '-');
    }

    /**
     * Reads the manifest; a missing or undecodable file counts as empty.
     */
    private function loadManifest(): array
    {
        if (!is_file($this->manifestPath)) {
            return [];
        }

        $json = file_get_contents($this->manifestPath);
        $data = $json ? json_decode($json, true) : null;

        return is_array($data) ? $data : [];
    }

    /**
     * Writes the complete manifest back to disk.
     */
    private function saveManifest(array $manifest): void
    {
        $this->ensureDirectory(dirname($this->manifestPath));

        // A failed encode must not silently replace the manifest with an empty file.
        $json = json_encode(
            $manifest,
            JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE | JSON_THROW_ON_ERROR
        );

        if (file_put_contents($this->manifestPath, $json) === false) {
            throw new \RuntimeException("Could not write manifest file: {$this->manifestPath}");
        }
    }

    /**
     * Builds the header line placed at the top of every chunk file (1-based chunk number).
     */
    private function buildHeader(string $source, int $index): string
    {
        return sprintf(
            '[Quelle: %s | Abschnitt: Chunk %d]',
            $source,
            $index + 1
        );
    }

    /**
     * Extracts up to 25 distinct keywords (4+ characters, no stop words) in order of appearance.
     *
     * @return string[]
     */
    private function extractKeywords(string $text): array
    {
        // 1) Lowercase
        $text = mb_strtolower($text);

        // Each preg_replace() returns null on failure (e.g. invalid UTF-8 with /u);
        // fall back to the unmodified text instead of passing null on.

        // 2) Remove URLs (very important)
        $text = preg_replace('#https?://\S+#u', ' ', $text) ?? $text;

        // 3) Newlines & tabs -> space
        $text = str_replace(["\r", "\n", "\t"], ' ', $text);

        // 4) Separators -> space (do NOT delete them, or words would merge)
        $text = preg_replace('/[\/\.\,\:\;\-\_\(\)\[\]\{\}]/u', ' ', $text) ?? $text;

        // 5) Drop everything that is not a letter, digit or whitespace
        $text = preg_replace('/[^\p{L}\p{N}\s]/u', '', $text) ?? $text;

        // 6) Normalize whitespace
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;
        $text = trim($text);

        // 7) Extract words
        $words = explode(' ', $text);

        // 8) Filter + de-duplicate, then cap. De-duplicating before the cap keeps
        //    repeated words from using up the keyword budget.
        $stopWords = $this->stopWords->getStopWords() ?? [];
        $seen = [];
        $keywords = [];

        foreach ($words as $word) {
            if (mb_strlen($word) < self::MIN_KEYWORD_LENGTH || isset($seen[$word])) {
                continue;
            }
            if (in_array($word, $stopWords, true)) {
                continue;
            }

            $seen[$word] = true;
            $keywords[] = $word;

            if (count($keywords) >= self::MAX_KEYWORDS) {
                break;
            }
        }

        return $keywords;
    }
}
|
||||
37
src/Knowledge/Ingest/DocumentLoader.php
Normal file
37
src/Knowledge/Ingest/DocumentLoader.php
Normal file
@@ -0,0 +1,37 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/DocumentLoader.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
/**
 * Loads the raw text of a source document from disk.
 */
final class DocumentLoader
{
    /**
     * Returns the content of a .txt or .md file.
     *
     * @throws \RuntimeException When the file is missing, unreadable or of an unsupported type
     */
    public function load(string $path): string
    {
        if (!is_file($path)) {
            throw new \RuntimeException("File not found: {$path}");
        }

        $extension = mb_strtolower(pathinfo($path, PATHINFO_EXTENSION));

        // Further formats (pdf, docx) are planned but not implemented yet.
        if ($extension === 'txt' || $extension === 'md') {
            return $this->loadText($path);
        }

        throw new \RuntimeException("Unsupported file type: .{$extension}");
    }

    /**
     * Reads a plain-text file completely into memory.
     */
    private function loadText(string $path): string
    {
        $contents = file_get_contents($path);

        return $contents !== false
            ? $contents
            : throw new \RuntimeException("Could not read file: {$path}");
    }
}
|
||||
39
src/Knowledge/Ingest/KnowledgeIngestService.php
Normal file
39
src/Knowledge/Ingest/KnowledgeIngestService.php
Normal file
@@ -0,0 +1,39 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/KnowledgeIngestService.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
/**
 * Orchestrates ingestion of one file: load, optionally tidy, chunk, write.
 */
final class KnowledgeIngestService
{
    public function __construct(
        private DocumentLoader $loader,
        private SimpleChunker $chunker,
        private ChunkWriter $writer,
        private ChunkIndexWriter $indexWriter,
    ) {
    }

    /**
     * Ingests a single file unless identical content from the same source name is already indexed.
     *
     * @param string $path     File to ingest
     * @param bool   $optimize Collapse runs of blank lines and strip trailing spaces/tabs first
     *
     * @return string[] written chunk filenames (empty when the content was already indexed)
     */
    public function ingestFile(string $path, bool $optimize = false): array
    {
        $content = $this->loader->load($path);

        if ($optimize) {
            // Three or more consecutive newlines become exactly two ...
            $content = preg_replace("/\n{3,}/", "\n\n", $content);
            // ... and trailing spaces/tabs are removed from every line.
            $content = preg_replace("/[ \t]+$/m", "", $content);
        }

        $name = basename($path);
        // The hash is taken after the optional clean-up, i.e. of the text that gets chunked.
        $hash = sha1($content);

        $alreadyIndexed = $this->indexWriter->hasSourceHash($name, $hash);

        return $alreadyIndexed
            ? []
            : $this->writer->write($name, $this->chunker->chunk($content), $hash);
    }
}
|
||||
146
src/Knowledge/Ingest/SimpleChunker.php
Normal file
146
src/Knowledge/Ingest/SimpleChunker.php
Normal file
@@ -0,0 +1,146 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/SimpleChunker.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
/**
 * Splits text into overlapping, word-limited chunks, preferring to cut at
 * paragraph or line-ending sentence boundaries.
 */
final class SimpleChunker
{
    /**
     * @param int $maxWords     Maximum number of words per chunk (values below 1 are treated as 1)
     * @param int $overlapWords Words repeated at the start of the next chunk (negative values are treated as 0)
     */
    public function __construct(
        private int $maxWords = 180,
        private int $overlapWords = 30
    ) {}

    /**
     * Chunks the text. Every word of the input ends up in at least one chunk.
     *
     * @return string[]
     */
    public function chunk(string $text): array
    {
        $text = $this->normalize($text);
        if ($text === '') {
            return [];
        }

        // Split into tokens: words + whitespace preserved
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        if (!$tokens) {
            return [];
        }

        // Build word index -> token index mapping
        $wordTokenIndexes = [];
        foreach ($tokens as $i => $token) {
            if (!preg_match('/^\s+$/u', $token)) {
                $wordTokenIndexes[] = $i;
            }
        }

        $totalWords = count($wordTokenIndexes);
        if ($totalWords === 0) {
            return [];
        }

        $maxWords = max(1, $this->maxWords);
        $overlapWords = max(0, $this->overlapWords);

        $chunks = [];
        $wordPos = 0;

        while ($wordPos < $totalWords) {
            $wordEnd = min($wordPos + $maxWords, $totalWords);

            $tokenStart = $wordTokenIndexes[$wordPos];
            $tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;

            // Intelligent cut (sentence / paragraph aware)
            $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);

            // The cut may have moved backwards. Pull the word cursor back to the
            // last word that is really inside this chunk; otherwise the words
            // between the adjusted cut and the original end would be skipped.
            while ($wordEnd > $wordPos + 1 && $wordTokenIndexes[$wordEnd - 1] >= $tokenEnd) {
                $wordEnd--;
            }

            $chunk = trim(implode('', array_slice(
                $tokens,
                $tokenStart,
                $tokenEnd - $tokenStart
            )));

            if ($chunk !== '') {
                $chunks[] = $chunk;
            }

            if ($wordEnd >= $totalWords) {
                break;
            }

            // Step back by the overlap, but always move forward: if the overlap
            // would not advance the cursor, continue directly after this chunk.
            $next = $wordEnd - $overlapWords;
            $wordPos = $next > $wordPos ? $next : $wordEnd;
        }

        return $this->dedupe($chunks);
    }

    /**
     * Unifies line endings, collapses spaces/tabs and limits blank-line runs to one blank line.
     */
    private function normalize(string $text): string
    {
        $text = str_replace(["\r\n", "\r"], "\n", $text);
        $text = preg_replace("/[ \t]+/u", " ", $text);
        $text = preg_replace("/\n{3,}/u", "\n\n", $text);

        return trim((string) $text);
    }

    /**
     * Move cut backwards to a natural boundary if possible.
     * Rules:
     * - Chunks that start with a markdown list marker ("-") are never shortened
     * - Sentence end only if followed by a line break
     * - Paragraph breaks always allowed
     *
     * @param string[] $tokens Word and whitespace tokens
     * @param int      $start  Index of the first token of the chunk
     * @param int      $end    Index one past the last token of the chunk
     *
     * @return int Adjusted end index, with $start < result <= $end
     */
    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
    {
        // Detect markdown list context (e.g. "- Foo: Bar"). Tokens are split on
        // whitespace, so the list marker is a token of its own.
        $startToken = $tokens[$start] ?? '';
        if ($startToken === '-') {
            // Keep list blocks intact
            return $end;
        }

        for ($i = $end - 1; $i > $start; $i--) {
            // Paragraph boundary
            if ($tokens[$i] === "\n\n") {
                return $i + 1;
            }

            // Sentence boundary only if followed by newline
            if (
                preg_match('/[.!?]\s*$/u', $tokens[$i]) &&
                isset($tokens[$i + 1]) &&
                str_contains($tokens[$i + 1], "\n")
            ) {
                return $i + 1;
            }
        }

        return $end;
    }

    /**
     * Removes chunks that equal an earlier one, ignoring case and whitespace differences.
     *
     * @param string[] $chunks
     * @return string[]
     */
    private function dedupe(array $chunks): array
    {
        $seen = [];
        $out = [];

        foreach ($chunks as $chunk) {
            $trimmed = trim($chunk);
            // preg_replace() returns null on failure; compare the trimmed text then.
            $key = mb_strtolower(preg_replace('/\s+/u', ' ', $trimmed) ?? $trimmed);

            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $chunk;
        }

        return $out;
    }
}
|
||||
35
src/Knowledge/KeywordMapper.php
Normal file
35
src/Knowledge/KeywordMapper.php
Normal file
@@ -0,0 +1,35 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge;
|
||||
|
||||
/**
|
||||
* KeywordMapper
|
||||
*
|
||||
* Expands short or ambiguous prompts into richer semantic variants
|
||||
* before they are passed into retrieval or embedding pipelines.
|
||||
*
|
||||
* This is a direct port of prompt_mapping.py.
|
||||
*/
|
||||
final class KeywordMapper
{
    /** Exact (lowercase) prompt => expanded search phrase. */
    private array $map = [
        'ki' => 'künstliche Intelligenz, AI, Projekte, Modelle, Agenten, ki',
        'shop' => 'Shopware, Onlineshop, Webshop, Commerce-System',
        'shops' => 'Shopware, Webshops, Verkaufsplattformen',
        'agentur' => 'Agentur, Firma, Unternehmen, mitho media',
        'api' => 'Schnittstelle, API, Anbindung, Integration',
        'plugin' => 'Shopware Plugin, Erweiterung, Modul, Funktion',
    ];

    /**
     * Returns the expanded phrase when the whole prompt (trimmed, case-insensitive)
     * equals a known keyword; otherwise returns the prompt unchanged.
     */
    public function map(string $prompt): string
    {
        $lookupKey = mb_strtolower(trim($prompt));

        if (!isset($this->map[$lookupKey])) {
            return $prompt;
        }

        return $this->map[$lookupKey];
    }
}
|
||||
87
src/Knowledge/KeywordSimilarity.php
Normal file
87
src/Knowledge/KeywordSimilarity.php
Normal file
@@ -0,0 +1,87 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge;
|
||||
|
||||
/**
|
||||
* KeywordSimilarity
|
||||
*
|
||||
* Deterministic and fault-tolerant comparison of two keywords.
|
||||
* Returns a similarity score between 0.0 and 1.0.
|
||||
*
|
||||
* Design goals:
|
||||
* - index.json remains unchanged
|
||||
* - comparison logic is intelligent (typos, phonetics)
|
||||
* - no alias or synonym lists
|
||||
* - no LLM dependency
|
||||
*/
|
||||
final class KeywordSimilarity
{
    /**
     * Compare a query token with an index keyword.
     *
     * Scores: 1.0 exact match after normalization, 0.85 same metaphone code,
     * 0.9 / 0.8 edit distance 1 / 2 (both words at least 6 characters),
     * 0.0 otherwise or when either token is shorter than 3 characters.
     *
     * @param string $queryToken   Token from user input
     * @param string $indexKeyword Keyword from index.json
     *
     * @return float Similarity score (0.0 – 1.0)
     */
    public static function compare(string $queryToken, string $indexKeyword): float
    {
        $a = self::normalize($queryToken);
        $b = self::normalize($indexKeyword);

        // Guard: ignore empty or very short tokens
        if ($a === '' || $b === '' || mb_strlen($a) < 3 || mb_strlen($b) < 3) {
            return 0.0;
        }

        // 1. Exact match
        if ($a === $b) {
            return 1.0;
        }

        // 2. Phonetic comparison (metaphone)
        // Useful for: showpare → shopware, shopvare → shopware
        // metaphone() returns '' for tokens without Latin letters (e.g. "2023");
        // two empty codes carry no phonetic information and must not match.
        $phoneticA = metaphone($a);
        if ($phoneticA !== '' && $phoneticA === metaphone($b)) {
            return 0.85;
        }

        // 3. Edit distance comparison (only for longer words)
        if (mb_strlen($a) >= 6 && mb_strlen($b) >= 6) {
            // NOTE(review): levenshtein() counts bytes, so a multi-byte character
            // that differs may count as more than one edit.
            $distance = levenshtein($a, $b);

            if ($distance === 1) {
                return 0.9;
            }

            if ($distance === 2) {
                return 0.8;
            }
        }

        // No relevant match
        return 0.0;
    }

    /**
     * Normalize a keyword to ensure stable comparison: lowercase, letters and
     * digits only, German umlauts and ß transliterated.
     */
    private static function normalize(string $value): string
    {
        $value = mb_strtolower(trim($value));

        // Remove non-alphanumeric characters
        $value = preg_replace('/[^\p{L}\p{N}]/u', '', $value) ?? '';

        // Normalize German umlauts
        $map = [
            'ä' => 'ae',
            'ö' => 'oe',
            'ü' => 'ue',
            'ß' => 'ss',
        ];

        return strtr($value, $map);
    }
}
|
||||
42
src/Knowledge/Retrieval/CachedRetriever.php
Normal file
42
src/Knowledge/Retrieval/CachedRetriever.php
Normal file
@@ -0,0 +1,42 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use Psr\Cache\CacheItemPoolInterface;
|
||||
|
||||
/**
 * Caching decorator: serves retrieval results from a PSR-6 pool and
 * delegates to the wrapped retriever on a miss.
 */
final class CachedRetriever implements RetrieverInterface
{
    public function __construct(
        private RetrieverInterface $inner,
        private CacheItemPoolInterface $cache,
        private int $ttlSeconds = 600 // 10 minutes
    ) {}

    /**
     * Returns the cached result for this prompt/limit pair, computing and storing it on a miss.
     */
    public function retrieve(string $prompt, int $limit = 3): array
    {
        $cacheItem = $this->cache->getItem($this->buildCacheKey($prompt, $limit));

        if (!$cacheItem->isHit()) {
            $chunks = $this->inner->retrieve($prompt, $limit);

            $cacheItem->set($chunks);
            $cacheItem->expiresAfter($this->ttlSeconds);
            $this->cache->save($cacheItem);

            return $chunks;
        }

        return $cacheItem->get();
    }

    /**
     * Builds the cache key from the lowercased, whitespace-collapsed prompt and the limit.
     */
    private function buildCacheKey(string $prompt, int $limit): string
    {
        $canonical = preg_replace('/\s+/u', ' ', mb_strtolower(trim($prompt)));

        return 'rag_retrieval_' . sha1("{$canonical}|{$limit}");
    }
}
|
||||
25
src/Knowledge/Retrieval/ChunkIndexLoader.php
Normal file
25
src/Knowledge/Retrieval/ChunkIndexLoader.php
Normal file
@@ -0,0 +1,25 @@
|
||||
<?php
|
||||
// src/Knowledge/Retrieval/ChunkIndexLoader.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
/**
 * Reads the JSON chunk index from disk.
 */
final class ChunkIndexLoader
{
    /**
     * @param string $indexPath Path of the JSON index file
     */
    public function __construct(
        private string $indexPath
    ) {}

    /**
     * Decode the index file into an array.
     *
     * A missing file, an unreadable or empty file, invalid JSON and any JSON
     * value that does not decode to an array all yield an empty array.
     */
    public function load(): array
    {
        if (!is_file($this->indexPath)) {
            return [];
        }

        $raw = file_get_contents($this->indexPath);
        if (!$raw) {
            return [];
        }

        $decoded = json_decode($raw, true);
        if (!is_array($decoded)) {
            return [];
        }

        return $decoded;
    }
}
|
||||
269
src/Knowledge/Retrieval/ChunkKeywordRetriever.php
Normal file
269
src/Knowledge/Retrieval/ChunkKeywordRetriever.php
Normal file
@@ -0,0 +1,269 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use App\Knowledge\StopWords;
|
||||
use App\Knowledge\VectorSearchChunked;
|
||||
use App\Knowledge\KeywordSimilarity;
|
||||
use App\Vector\VectorSearchClient;
|
||||
|
||||
/**
 * Hybrid retriever: keyword index first, vector search as a semantic
 * supplement, and a plain chunk scan as the last resort.
 */
final class ChunkKeywordRetriever implements RetrieverInterface
{
    private const MAX_KEYWORD_CANDIDATES = 200;
    private const VECTOR_SCORE_THRESHOLD = 0.65;
    private const VECTOR_TOP_K = 3;

    /** Minimum KeywordSimilarity score for a query term to count as an index hit. */
    private const MIN_KEYWORD_SIMILARITY = 0.8;

    public function __construct(
        private VectorSearchChunked $chunkedSearch,
        private ChunkIndexLoader $indexLoader,
        private StopWords $stopWords,
        private VectorSearchClient $vectorClient,
        private string $chunksDir,
        private int $maxChunks = 3,
    ) {
    }

    /**
     * {@inheritdoc}
     *
     * @param int|null $limit Maximum number of chunks; null falls back to the
     *                        configured $maxChunks
     */
    public function retrieve(string $prompt, ?int $limit = null): array
    {
        $limit ??= $this->maxChunks;

        if ($limit <= 0) {
            return [];
        }

        // ---------------------------------------------------------
        // 1) Prompt → search terms
        // ---------------------------------------------------------
        $queryTerms = $this->extractTerms($prompt);

        // ---------------------------------------------------------
        // 2) Keyword-based candidate discovery
        // ---------------------------------------------------------
        $result = $queryTerms !== []
            ? $this->findCandidateFiles($queryTerms)
            : ['files' => [], 'canonicalTerms' => []];

        // Order by similarity before capping so the cap keeps the strongest
        // candidates instead of whichever happen to come first in the index.
        $candidateScores = $result['files'];
        arsort($candidateScores);
        $candidateScores = array_slice(
            $candidateScores,
            0,
            self::MAX_KEYWORD_CANDIDATES,
            true
        );

        // Canonical replacement: use the index spelling of a (possibly
        // misspelled) query term when scoring chunk contents.
        $effectiveTerms = array_map(
            static fn (string $term): string =>
                $result['canonicalTerms'][$term] ?? $term,
            $queryTerms
        );

        // ---------------------------------------------------------
        // 3) Keyword scoring
        // ---------------------------------------------------------
        $scored = [];

        foreach ($candidateScores as $file => $similarityScore) {
            $path = $this->chunksDir . '/' . $file;
            if (!is_file($path)) {
                continue;
            }

            $chunk = file_get_contents($path);
            if ($chunk === false || $chunk === '') {
                continue;
            }

            $score = $this->scoreChunk($chunk, $effectiveTerms);
            if ($score === 0) {
                continue;
            }

            $scored[$file] = [
                'chunk' => trim($chunk),
                'score' => (int) round($score * $similarityScore),
            ];
        }

        // ---------------------------------------------------------
        // EARLY EXIT: keyword results are sufficient
        // ---------------------------------------------------------
        if (\count($scored) >= $limit) {
            return $this->finalize($scored, $limit);
        }

        // ---------------------------------------------------------
        // 4) Vector retrieval (semantic fallback)
        // ---------------------------------------------------------
        foreach ($this->vectorClient->search($prompt, self::VECTOR_TOP_K) as $hit) {
            if (
                !\is_array($hit) ||
                !isset($hit['chunk_id'], $hit['score']) ||
                !\is_scalar($hit['chunk_id']) ||
                !(\is_int($hit['score']) || \is_float($hit['score'])) ||
                $hit['score'] < self::VECTOR_SCORE_THRESHOLD
            ) {
                continue;
            }

            $file = $this->resolveChunkFile((string) $hit['chunk_id']);
            if ($file === null) {
                continue;
            }

            $chunk = $scored[$file]['chunk']
                ?? trim((string) file_get_contents($this->chunksDir . '/' . $file));

            if ($chunk === '') {
                continue;
            }

            // The score passed the threshold above, so the boost is always
            // positive; it is added on top of any keyword score for the file.
            $scored[$file] = [
                'chunk' => $chunk,
                'score' => ($scored[$file]['score'] ?? 0) + (int) round($hit['score'] * 10),
            ];
        }

        // ---------------------------------------------------------
        // 5) Final fallback
        // ---------------------------------------------------------
        if ($scored === []) {
            return $this->fallbackSearch($prompt, $limit);
        }

        return $this->finalize($scored, $limit);
    }

    // -------------------------------------------------------------
    // FINALIZATION
    // -------------------------------------------------------------

    /**
     * Sort by score (highest first), drop duplicate chunks, apply the limit.
     *
     * @param array<array-key, array{chunk: string, score: int}> $scored
     *
     * @return string[]
     */
    private function finalize(array $scored, int $limit): array
    {
        uasort($scored, static fn (array $a, array $b): int => $b['score'] <=> $a['score']);

        return array_slice(
            $this->normalizeResults(
                array_column($scored, 'chunk')
            ),
            0,
            $limit
        );
    }

    // -------------------------------------------------------------
    // INDEX LOGIC
    // -------------------------------------------------------------

    /**
     * Match query terms against the keyword index.
     *
     * @param string[] $terms
     *
     * @return array{files: array<array-key, float>, canonicalTerms: array<array-key, string>}
     *         'files' maps each matching file to its best similarity score;
     *         'canonicalTerms' maps a query term to the best-matching index keyword
     */
    private function findCandidateFiles(array $terms): array
    {
        $index = $this->indexLoader->load();
        $files = [];
        $canonicalTerms = [];
        $canonicalScores = [];

        foreach ($index as $entry) {
            if (
                !\is_array($entry) ||
                !isset($entry['file'], $entry['keywords']) ||
                !\is_scalar($entry['file']) ||
                !\is_array($entry['keywords'])
            ) {
                continue;
            }

            $file = $entry['file'];

            foreach ($terms as $term) {
                foreach ($entry['keywords'] as $indexKeyword) {
                    if (!\is_string($indexKeyword)) {
                        continue;
                    }

                    $score = KeywordSimilarity::compare($term, $indexKeyword);
                    if ($score < self::MIN_KEYWORD_SIMILARITY) {
                        continue;
                    }

                    // Keep the best score seen for the file across all terms.
                    $files[$file] = max($files[$file] ?? 0.0, $score);

                    // Only a strictly better match may replace the canonical
                    // spelling, so an exact hit is never overwritten by a
                    // fuzzy one found in a later entry.
                    if ($score > ($canonicalScores[$term] ?? 0.0)) {
                        $canonicalScores[$term] = $score;
                        $canonicalTerms[$term] = $indexKeyword;
                    }

                    // An exact match cannot be improved for this term.
                    if ($score >= 1.0) {
                        break;
                    }
                }
            }
        }

        return [
            'files' => $files,
            'canonicalTerms' => $canonicalTerms,
        ];
    }

    /**
     * Map a vector hit id to a chunk file name relative to the chunks directory.
     *
     * The id is tried with a ".txt" suffix first and then verbatim, so both
     * extension-less ids and ids that already are file names resolve.
     */
    private function resolveChunkFile(string $chunkId): ?string
    {
        foreach ([$chunkId . '.txt', $chunkId] as $candidate) {
            if (is_file($this->chunksDir . '/' . $candidate)) {
                return $candidate;
            }
        }

        return null;
    }

    // -------------------------------------------------------------
    // FALLBACK
    // -------------------------------------------------------------

    /**
     * Last resort: plain keyword scan over all chunks.
     *
     * @return string[]
     */
    private function fallbackSearch(string $prompt, int $limit): array
    {
        $chunkedText = trim($this->chunkedSearch->searchAsText($prompt));
        if ($chunkedText === '') {
            return [];
        }

        return array_slice(
            $this->normalizeResults($this->splitChunks($chunkedText)),
            0,
            $limit
        );
    }

    // -------------------------------------------------------------
    // SCORING
    // -------------------------------------------------------------

    /**
     * Count how many non-stop-word terms occur in the chunk; terms of ten or
     * more characters count double.
     *
     * @param string[] $terms
     */
    private function scoreChunk(string $chunk, array $terms): int
    {
        $content = mb_strtolower($chunk);
        $stopWords = $this->stopWords->getStopWords();
        $score = 0;

        foreach ($terms as $term) {
            // Canonical terms come from the index and may contain upper-case
            // letters; the content is lower-cased, so the needle must be too.
            $needle = mb_strtolower($term);

            if (
                $needle !== '' &&
                !\in_array($needle, $stopWords, true) &&
                str_contains($content, $needle)
            ) {
                $score += mb_strlen($needle) >= 10 ? 2 : 1;
            }
        }

        return $score;
    }

    // -------------------------------------------------------------
    // UTIL
    // -------------------------------------------------------------

    /**
     * Split a prompt into lower-cased words of at least three characters.
     *
     * @return string[]
     */
    private function extractTerms(string $text): array
    {
        $clean = preg_replace('/[^\p{L}\p{N}\s]/u', '', $text);
        if ($clean === null) {
            // PCRE failure (e.g. invalid UTF-8): no usable terms.
            return [];
        }

        // Split on any whitespace so newlines and tabs separate words too.
        $words = preg_split('/\s+/u', mb_strtolower($clean), -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return array_values(array_filter(
            $words,
            static fn (string $w): bool => mb_strlen($w) > 2
        ));
    }

    /**
     * Split concatenated chunk text on blank lines.
     *
     * @return string[]
     */
    private function splitChunks(string $text): array
    {
        return array_values(array_filter(
            array_map(static fn (string $part): string => trim($part), explode("\n\n", $text)),
            static fn (string $chunk): bool => $chunk !== ''
        ));
    }

    /**
     * Remove chunks that differ only in case or whitespace, keeping order.
     *
     * @param string[] $chunks
     *
     * @return string[]
     */
    private function normalizeResults(array $chunks): array
    {
        $seen = [];
        $out = [];

        foreach ($chunks as $chunk) {
            $key = mb_strtolower(preg_replace('/\s+/u', ' ', $chunk) ?? $chunk);
            if (!isset($seen[$key])) {
                $seen[$key] = true;
                $out[] = $chunk;
            }
        }

        return $out;
    }
}
|
||||
11
src/Knowledge/Retrieval/RetrieverInterface.php
Normal file
11
src/Knowledge/Retrieval/RetrieverInterface.php
Normal file
@@ -0,0 +1,11 @@
|
||||
<?php
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
/**
 * Contract for services that look up knowledge chunks for a prompt.
 */
interface RetrieverInterface
{
    /**
     * Retrieve knowledge chunks for the given prompt.
     *
     * @param string $prompt Prompt text to search for
     * @param int    $limit  Requested number of chunks
     *
     * @return string[] Plain text knowledge chunks
     */
    public function retrieve(string $prompt, int $limit = 3): array;
}
|
||||
1863
src/Knowledge/StopWords.php
Normal file
1863
src/Knowledge/StopWords.php
Normal file
File diff suppressed because it is too large
Load Diff
121
src/Knowledge/VectorSearchChunked.php
Normal file
121
src/Knowledge/VectorSearchChunked.php
Normal file
@@ -0,0 +1,121 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge;
|
||||
|
||||
use Psr\Log\LoggerInterface;
|
||||
|
||||
/**
 * VectorSearchChunked
 *
 * Chunk-based retrieval service for long-form knowledge documents.
 * This is a lightweight, deterministic runtime reader for
 * precomputed knowledge chunks.
 *
 * Design principles:
 * - No runtime indexing
 * - No ML dependencies
 * - Deterministic and fast
 * - Hard limits to protect prompt size
 *
 * This service is intentionally simple and can later be replaced
 * by a real vector database without changing the AgentRunner.
 */
final class VectorSearchChunked
{
    /**
     * Directory containing chunked knowledge files. Declared relative to the
     * project directory and made absolute in the constructor.
     */
    private string $dataDir = 'var/knowledge/chunks';

    /**
     * Maximum number of chunks to return.
     */
    private int $maxChunks = 3;

    /**
     * @param string $projectDir Project root the chunk directory is resolved against
     */
    public function __construct(
        private string $projectDir,
    ) {
        $this->dataDir = $this->projectDir . '/' . $this->dataDir;
    }

    /**
     * Returns concatenated relevant chunks as plain text.
     *
     * Scans the *.txt files of the chunk directory and collects the first
     * $maxChunks files that contain at least one prompt keyword
     * (case-insensitive). Chunks are joined with a blank line.
     *
     * @param string $prompt
     * @return string Empty string when the directory is missing, the prompt
     *                yields no keywords, or nothing matches
     */
    public function searchAsText(string $prompt): string
    {
        if (!is_dir($this->dataDir)) {
            return '';
        }

        $keywords = $this->extractKeywords(mb_strtolower($prompt));
        if ($keywords === []) {
            return '';
        }

        // glob() returns false on error; treat that like "no files" instead
        // of handing a boolean to foreach.
        $files = glob($this->dataDir . '/*.txt') ?: [];

        $matches = [];

        foreach ($files as $file) {
            $content = file_get_contents($file);
            if ($content === false) {
                continue;
            }

            if ($this->matchesKeywords(mb_strtolower($content), $keywords)) {
                $matches[] = trim($content);
            }

            if (count($matches) >= $this->maxChunks) {
                break;
            }
        }

        return implode("\n\n", $matches);
    }

    /**
     * Extracts simple keywords from the prompt.
     *
     * This is a lightweight heuristic replacement for
     * full vector or embedding-based search: the prompt is split on
     * non-word characters and words of four or more characters are kept.
     *
     * @return string[] Unique keywords in order of first appearance
     */
    private function extractKeywords(string $prompt): array
    {
        $words = preg_split('/\W+/u', $prompt, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            return [];
        }

        $keywords = [];
        foreach ($words as $word) {
            if (mb_strlen($word) >= 4) {
                $keywords[] = $word;
            }
        }

        return array_values(array_unique($keywords));
    }

    /**
     * Checks whether the content matches at least one keyword.
     *
     * @param string[] $keywords
     */
    private function matchesKeywords(string $content, array $keywords): bool
    {
        foreach ($keywords as $keyword) {
            if (str_contains($content, $keyword)) {
                return true;
            }
        }

        return false;
    }
}
|
||||
55
src/Vector/VectorSearchClient.php
Normal file
55
src/Vector/VectorSearchClient.php
Normal file
@@ -0,0 +1,55 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Vector;
|
||||
|
||||
use Psr\Log\LoggerInterface;
|
||||
|
||||
/**
 * Runs the vector_search.py helper script and decodes its JSON output.
 */
final class VectorSearchClient
{
    /**
     * @param string          $vectorDir   Directory containing vector_search.py
     *                                     (and optionally a .venv)
     * @param LoggerInterface $agentLogger Logger for diagnostics
     */
    public function __construct(
        private readonly string $vectorDir,
        private LoggerInterface $agentLogger,
    ) {
    }

    /**
     * Execute the search script for the given query.
     *
     * Any failure (missing script, process error, non-zero exit code,
     * unparsable output) is logged and results in an empty array.
     *
     * @param string $query Search text passed to the script as first argument
     * @param int    $limit Maximum number of hits requested from the script
     *
     * @return array Decoded JSON output of the script
     */
    public function search(string $query, int $limit = 5): array
    {
        $baseDir = rtrim($this->vectorDir, '/');
        $script = $baseDir . '/vector_search.py';

        $this->agentLogger->info("Run vector search script $script");

        if (!is_file($script) || $limit < 1) {
            return [];
        }

        // -------------------------------------------------
        // Determine Python interpreter (venv preferred)
        // -------------------------------------------------
        $venvPython = $baseDir . '/.venv/bin/python';
        $pythonBin = is_file($venvPython) ? $venvPython : 'python3';

        // stderr goes to a temp file instead of being merged into stdout:
        // library warnings printed there would otherwise corrupt the JSON.
        // A file (not a pipe) is used so a chatty child can never block on a
        // full, unread pipe.
        $stderrFile = tempnam(sys_get_temp_dir(), 'vsearch_');
        if ($stderrFile === false) {
            $this->agentLogger->warning('Vector search skipped: cannot create temp file for stderr');

            return [];
        }

        try {
            // Array form: no shell is involved, so no argument escaping is needed.
            $process = proc_open(
                [$pythonBin, $script, $query, (string) $limit],
                [
                    1 => ['pipe', 'w'],
                    2 => ['file', $stderrFile, 'w'],
                ],
                $pipes
            );

            if (!\is_resource($process)) {
                $this->agentLogger->warning('Vector search process could not be started');

                return [];
            }

            $stdout = trim((string) stream_get_contents($pipes[1]));
            fclose($pipes[1]);
            $exitCode = proc_close($process);
            $stderr = trim((string) file_get_contents($stderrFile));
        } finally {
            if (is_file($stderrFile)) {
                unlink($stderrFile);
            }
        }

        if ($exitCode !== 0 || $stdout === '') {
            $this->agentLogger->warning('Vector search failed', [
                'exitCode' => $exitCode,
                'stdout' => $stdout,
                'stderr' => $stderr,
            ]);

            return [];
        }

        $this->agentLogger->info($stdout);

        try {
            $decoded = json_decode($stdout, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            $this->agentLogger->warning('Vector search returned invalid JSON', [
                'error' => $e->getMessage(),
            ]);

            return [];
        }

        // Valid JSON that is not a list/object (e.g. a bare string) would
        // violate the declared return type.
        return \is_array($decoded) ? $decoded : [];
    }
}
|
||||
89
src/Vector/vector_ingest.py
Normal file
89
src/Vector/vector_ingest.py
Normal file
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python3
"""Build the FAISS vector index for the knowledge chunks.

Usage: vector_ingest.py <vectorDir> <knowledgeDir>

Reads <knowledgeDir>/index.json, embeds every chunk file it references from
<knowledgeDir>/chunks/, and writes <vectorDir>/vector.index together with
<vectorDir>/vector_meta.json (the chunk file names, in index order).

Exit codes: 2 bad arguments, 10/11 missing Python modules, 20 index.json
missing, 21 no chunks loaded, 22 index.json unreadable or malformed.
"""

import sys
import json
from pathlib import Path

# ---------------------------------------------------------
# Argument handling
# ---------------------------------------------------------
if len(sys.argv) < 3:
    print("ERROR: Missing arguments (vectorDir, knowledgeDir)")
    sys.exit(2)

vector_dir = Path(sys.argv[1]).resolve()
knowledge_dir = Path(sys.argv[2]).resolve()

index_json = knowledge_dir / "index.json"
index_out = vector_dir / "vector.index"
meta_out = vector_dir / "vector_meta.json"

# ---------------------------------------------------------
# Dependency checks
# ---------------------------------------------------------
# Broad except on purpose: a broken native library surfaces as something
# other than ImportError, and either way the module is unusable.
try:
    import faiss
except Exception:
    print("ERROR: Python module 'faiss' not found.")
    sys.exit(10)

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    print("ERROR: Python module 'sentence-transformers' not found.")
    sys.exit(11)

# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
if not index_json.is_file():
    print(f"ERROR: index.json not found at {index_json}")
    sys.exit(20)

# ---------------------------------------------------------
# Load chunks from index.json
# ---------------------------------------------------------
try:
    with open(index_json, "r", encoding="utf-8") as f:
        data = json.load(f)
except (OSError, ValueError) as exc:
    # ValueError covers both JSONDecodeError and UnicodeDecodeError.
    print(f"ERROR: Cannot read index.json: {exc}")
    sys.exit(22)

if not isinstance(data, list):
    print("ERROR: index.json must contain a list of entries")
    sys.exit(22)

texts = []
ids = []

for entry in data:
    # Skip malformed entries instead of crashing on them.
    if not isinstance(entry, dict) or not isinstance(entry.get("file"), str):
        continue

    chunk_path = knowledge_dir / "chunks" / entry["file"]
    if not chunk_path.is_file():
        continue

    text = chunk_path.read_text(encoding="utf-8").strip()
    if not text:
        continue

    texts.append(text)
    ids.append(entry["file"])

if not texts:
    print("ERROR: No chunks loaded from index.json")
    sys.exit(21)

# ---------------------------------------------------------
# Build vector index
# ---------------------------------------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
# Unit-length embeddings + inner-product index => scores are cosine similarities.
embeddings = model.encode(texts, normalize_embeddings=True)

dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# The output directory may not exist yet on a first run.
vector_dir.mkdir(parents=True, exist_ok=True)

faiss.write_index(index, str(index_out))

# Position i in this list is the chunk file for vector i of the index.
with open(meta_out, "w", encoding="utf-8") as f:
    json.dump(ids, f)

print(f"Indexed {len(ids)} chunks.")
|
||||
72
src/Vector/vector_search.py
Normal file
72
src/Vector/vector_search.py
Normal file
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python3
"""Query the FAISS index stored next to this script.

Usage: vector_search.py <query> <limit>

On success, prints a JSON list of {"chunk_id": ..., "score": ...} objects to
stdout and exits with 0. Errors are reported on stderr with a non-zero exit
code (2 bad arguments, 10/11 missing Python modules, 20 index files missing),
so stdout only ever carries JSON.
"""

import sys
import json
from pathlib import Path


def fail(message, code):
    """Print an error to stderr and terminate with the given exit code."""
    print(f"ERROR: {message}", file=sys.stderr)
    sys.exit(code)


# ---------------------------------------------------------
# Argument handling
# ---------------------------------------------------------
if len(sys.argv) < 3:
    fail("Missing arguments (query, limit)", 2)

query = sys.argv[1]

try:
    limit = int(sys.argv[2])
except ValueError:
    fail(f"Invalid limit: {sys.argv[2]!r}", 2)

vector_dir = Path(__file__).resolve().parent
index_path = vector_dir / "vector.index"
meta_path = vector_dir / "vector_meta.json"

# ---------------------------------------------------------
# Dependency checks (controlled)
# ---------------------------------------------------------
# Broad except on purpose: a broken native library surfaces as something
# other than ImportError, and either way the module is unusable.
try:
    import faiss
except Exception:
    fail("Python module 'faiss' not found.", 10)

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    fail("Python module 'sentence-transformers' not found.", 11)

# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
if not index_path.is_file() or not meta_path.is_file():
    fail("Vector index not found. Run vector ingest first.", 20)

# ---------------------------------------------------------
# Load index and metadata
# ---------------------------------------------------------
index = faiss.read_index(str(index_path))

with open(meta_path, "r", encoding="utf-8") as f:
    ids = json.load(f)

# Never ask for more neighbours than the index holds, and never for fewer
# than one; with nothing to search, an empty result is the correct answer.
k = min(limit, index.ntotal)
if k < 1:
    print(json.dumps([]))
    sys.exit(0)

# ---------------------------------------------------------
# Search
# ---------------------------------------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
query_vec = model.encode([query], normalize_embeddings=True)

scores, indices = index.search(query_vec, k)

results = []
for score, idx in zip(scores[0], indices[0]):
    idx = int(idx)

    # -1 marks "no neighbour"; an index beyond the meta list means the meta
    # file is stale relative to the index, so the hit cannot be named.
    if idx < 0 or idx >= len(ids):
        continue

    results.append({
        "chunk_id": ids[idx],
        "score": float(score)
    })

print(json.dumps(results))
|
||||
Reference in New Issue
Block a user