first commit

This commit is contained in:
team 1
2026-02-11 14:15:08 +01:00
parent a4742c2c38
commit aa7d362bc3
58 changed files with 9999 additions and 0 deletions

136
src/Agent/AgentRunner.php Normal file
View File

@@ -0,0 +1,136 @@
<?php
declare(strict_types=1);
namespace App\Agent;
use App\Context\ContextService;
use App\Context\UrlAnalyzer;
use App\Infrastructure\OllamaClient;
use App\Knowledge\Retrieval\RetrieverInterface;
use Generator;
use Psr\Log\LoggerInterface;
use Throwable;
use App\Agent\StreamChunker;
/**
 * Orchestrates a single agent turn.
 *
 * Builds the final prompt, streams the model output back to the caller in
 * chunks and appends the finished exchange to the conversation history.
 */
final readonly class AgentRunner
{
    public function __construct(
        private PromptBuilder $promptBuilder,
        private ThinkSuppressor $thinkSuppressor,
        private ContextService $contextService,
        private UrlAnalyzer $urlAnalyzer,
        private RetrieverInterface $retriever,
        private OllamaClient $ollamaClient,
        private LoggerInterface $agentLogger,
        private bool $debug,
        private bool $logPrompt,
        private bool $logContext,
    ) {}

    /**
     * Runs one prompt through the agent and streams the answer.
     *
     * Any Throwable raised while processing is logged and replaced by a
     * generic error message, so the generator itself does not throw.
     *
     * @param string $prompt Raw user input (trimmed before use)
     * @param string $userId Identifier used for history lookup and persistence
     *
     * @return Generator<int, string> Text chunks in output order
     */
    public function run(string $prompt, string $userId): Generator
    {
        $prompt = trim($prompt);
        if ($prompt === '') {
            yield '❌ Empty prompt.';
            return;
        }

        $this->agentLogger->info('Agent run started', [
            'userId' => $userId,
        ]);

        try {
            // ---------------------------------------------------------
            // 1) Context strategy
            // ---------------------------------------------------------
            $includeFullContext = false;

            // ---------------------------------------------------------
            // 2) Extract URL content (if present)
            // ---------------------------------------------------------
            $urlContent = $this->urlAnalyzer->extractContentFromPrompt($prompt);

            // ---------------------------------------------------------
            // 3) Retrieve RAG knowledge
            // ---------------------------------------------------------
            $knowledgeChunks = $this->retriever->retrieve($prompt);

            // ---------------------------------------------------------
            // 4) Build final prompt
            // ---------------------------------------------------------
            $finalPrompt = $this->promptBuilder->build(
                prompt: $prompt,
                userId: $userId,
                urlContent: $urlContent,
                knowledgeChunks: $knowledgeChunks,
                fullContext: $includeFullContext
            );

            if ($this->debug && $this->logPrompt) {
                $this->agentLogger->debug($finalPrompt);
            }

            if ($this->debug && $this->logContext) {
                $this->agentLogger->debug('Conversation context snapshot', [
                    'context' => $this->contextService->buildUserContext(
                        $userId,
                        $includeFullContext
                    ),
                ]);
            }

            // ---------------------------------------------------------
            // 5) Stream tokens from the LLM backend (chunked streaming)
            // ---------------------------------------------------------
            // The suppressor is an injected object that keeps state between
            // filter() calls. Without a reset, the "think section completed"
            // flag of a previous run on the same instance (e.g. the CLI
            // loop) would let the next <think> block pass through.
            $this->thinkSuppressor->reset();

            $fullOutput = '';
            $chunker = new StreamChunker();

            foreach ($this->ollamaClient->stream($finalPrompt) as $token) {
                $cleanToken = $this->thinkSuppressor->filter($token);
                if ($cleanToken === '') {
                    continue;
                }

                // Keep collecting the complete answer (needed for the history)
                $fullOutput .= $cleanToken;

                // Feed the token into the chunker; it decides when to emit
                $chunk = $chunker->push($cleanToken);
                if ($chunk !== null) {
                    yield $chunk;
                }
            }

            // Emit whatever is still buffered once the stream has ended
            $finalChunk = $chunker->flush();
            if ($finalChunk !== null) {
                yield $finalChunk;
            }

            // ---------------------------------------------------------
            // 6) Persist conversation history
            // ---------------------------------------------------------
            $this->contextService->appendHistory(
                $userId,
                $prompt,
                $fullOutput
            );

            $this->agentLogger->info('Agent run finished', [
                'userId' => $userId,
                'outputLength' => mb_strlen($fullOutput),
                'contextMode' => 'recent',
            ]);
        } catch (Throwable $e) {
            $this->agentLogger->error('Agent run failed', [
                'userId' => $userId,
                'exception' => $e,
            ]);

            yield "\n❌ An internal error occurred while processing the request.";
        }
    }
}

136
src/Agent/PromptBuilder.php Normal file
View File

@@ -0,0 +1,136 @@
<?php
declare(strict_types=1);
namespace App\Agent;
use App\Context\ContextService;
use App\Context\UrlAnalyzer;
use DateTimeImmutable;
/**
 * Assembles the final LLM prompt from system rules, conversation history,
 * supporting knowledge and the current user question.
 */
final class PromptBuilder
{
    public function __construct(
        private readonly ContextService $contextService,
        private readonly UrlAnalyzer $urlAnalyzer,
    ) {
    }

    /**
     * Build the final prompt string for the LLM.
     *
     * Blocks are emitted in the order system, conversation context,
     * knowledge, user question. Empty blocks are left out and the remaining
     * ones are separated by a blank line.
     *
     * @param string   $prompt          Current user question
     * @param string   $userId          Identifier used to load the conversation history
     * @param string   $urlContent      Text extracted from a URL, or '' if none
     * @param string[] $knowledgeChunks Retrieved knowledge snippets (any keys)
     * @param bool     $fullContext     Whether to load the extended history
     */
    public function build(
        string $prompt,
        string $userId,
        string $urlContent,
        array $knowledgeChunks,
        bool $fullContext = false,
    ): string {
        $now = (new DateTimeImmutable())->format('Y-m-d H:i:s');

        // ------------------------------------------------------------
        // 1) SYSTEM INSTRUCTIONS
        // ------------------------------------------------------------
        $systemLines = [
            'You are a conversational AI assistant.',
            'Respond clearly, precisely, and in context of the ongoing conversation.',
            'The conversation context is authoritative and must be respected.',
            'External knowledge is supporting information only.',
            'If the user asks for contact details such as phone number, email address, postal address or contact person, and the provided context contains such information, answer explicitly with the concrete data.',
            'Do not omit contact details.',
            'It is allowed and desired to quote contact data verbatim if it appears in the context.',
            "Current date and time: {$now}",
            '',
            'IMPORTANT FORMATTING RULES:',
            '- Always answer in valid Markdown.',
            '- Use headings, lists, and paragraphs where appropriate.',
            '- Insert line breaks early and often.',
            '- Never write long paragraphs without newlines.',
            '- Each list item must start on a new line.',
            '- Prefer short paragraphs over dense text blocks.',
            '',
            'IMPORTANT LANGUAGE RULES:',
            '- If the user input contains misspellings, silently use the correct canonical terms in your answer.',
            '- Never mention, explain, or point out spelling mistakes.',
            '- Do not ask clarifying questions about possible misspellings.',
            '- Do not repeat or quote misspelled terms from the user input.',
            '- Always use the correct technical spelling found in the provided context.',
            '- Answer directly and confidently using always correct canonical terminology.',
        ];
        $systemBlock = "SYSTEM:\n" . implode("\n", $systemLines);

        // ------------------------------------------------------------
        // 2) CONVERSATION CONTEXT (AUTHORITATIVE)
        // ------------------------------------------------------------
        $history = $this->contextService->buildUserContext(
            userId: $userId,
            full: $fullContext
        );

        $contextBlock = '';
        if ($history !== '') {
            $contextBlock =
                "CONVERSATION CONTEXT (authoritative):\n" .
                "The following messages are the previous turns of this conversation.\n" .
                "They must be considered when answering the next question.\n\n" .
                $history;
        }

        // ------------------------------------------------------------
        // 3) EXTERNAL KNOWLEDGE (SUPPORTING)
        // ------------------------------------------------------------
        $knowledgeParts = [];

        if ($knowledgeChunks !== []) {
            $lines = [];
            // Number by position, not by the incoming key: sparse keys would
            // leave gaps and string keys would raise a TypeError on "+ 1".
            foreach (array_values($knowledgeChunks) as $position => $chunk) {
                $n = $position + 1;
                $lines[] = "[{$n}] {$chunk}";
            }

            $knowledgeParts[] =
                "RETRIEVED KNOWLEDGE (supporting):\n" .
                implode("\n\n", $lines);
        }

        if ($urlContent !== '') {
            $knowledgeParts[] =
                "CONTENT FROM URL (supporting):\n" .
                $urlContent;
        }

        $knowledgeBlock = implode("\n\n", $knowledgeParts);

        // ------------------------------------------------------------
        // 4) USER QUESTION
        // ------------------------------------------------------------
        $userBlock =
            "USER QUESTION:\n" .
            $prompt;

        // ------------------------------------------------------------
        // 5) FINAL PROMPT ASSEMBLY
        // ------------------------------------------------------------
        $blocks = array_filter(
            [
                $systemBlock,
                $contextBlock,
                $knowledgeBlock,
                $userBlock,
            ],
            static fn (string $block): bool => $block !== '',
        );

        return implode("\n\n", $blocks);
    }
}

View File

@@ -0,0 +1,61 @@
<?php
declare(strict_types=1);
namespace App\Agent;
/**
 * Groups streamed LLM tokens into larger, display-friendly chunks.
 *
 * Tokens are buffered until a natural boundary is reached (paragraph end,
 * sentence end, completed list item) or the buffer reaches a minimum size.
 * Nothing is emitted while a fenced Markdown code block is open, so a code
 * block is always delivered in one piece.
 */
final class StreamChunker
{
    /** Markdown code fence marker. */
    private const FENCE = '```';

    /** Text collected since the last emitted chunk. */
    private string $buffer = '';

    /** Whether the end of the buffer currently lies inside a fenced code block. */
    private bool $insideCodeBlock = false;

    /**
     * Whether a code block was already open where the current buffer starts.
     * Only true after an explicit flush() in the middle of a code block.
     */
    private bool $fenceOpenAtBufferStart = false;

    /** Buffer length (in characters) that forces a chunk even without a boundary. */
    private int $minChunkSize = 120;

    /**
     * Adds a token to the buffer.
     *
     * @param string $token Next piece of streamed text
     *
     * @return string|null The completed chunk, or null while still buffering
     */
    public function push(string $token): ?string
    {
        $this->buffer .= $token;

        // Derive the fence state from the whole buffer instead of the single
        // token: this counts two fences arriving in one token correctly and
        // also sees a fence that was split across several tokens.
        $oddFenceCount = substr_count($this->buffer, self::FENCE) % 2 === 1;
        $this->insideCodeBlock = $this->fenceOpenAtBufferStart !== $oddFenceCount;

        if (!$this->shouldFlush()) {
            return null;
        }

        return $this->drain();
    }

    /**
     * Emits whatever is still buffered, regardless of boundaries.
     *
     * @return string|null Remaining text, or null if the buffer is empty
     */
    public function flush(): ?string
    {
        if ($this->buffer === '') {
            return null;
        }

        return $this->drain();
    }

    /**
     * Empties the buffer and returns its content, remembering whether a
     * code block is still open at the cut.
     */
    private function drain(): string
    {
        $out = $this->buffer;
        $this->buffer = '';
        $this->fenceOpenAtBufferStart = $this->insideCodeBlock;

        return $out;
    }

    /**
     * Decides whether the current buffer ends on a boundary worth emitting.
     */
    private function shouldFlush(): bool
    {
        if ($this->insideCodeBlock) {
            return false;
        }

        // One or two trailing backticks may be the first part of a fence
        // whose remainder arrives with the next token; wait for it so the
        // fence is not cut in half by a flush.
        if (preg_match('/(?<!`)`{1,2}\z/', $this->buffer) === 1) {
            return false;
        }

        // Paragraph break
        if (str_ends_with($this->buffer, "\n\n")) {
            return true;
        }

        // Sentence end followed by whitespace
        if (preg_match('/[.!?]\s$/', $this->buffer) === 1) {
            return true;
        }

        // Completed list item
        if (preg_match('/\n[-*] .+\n$/', $this->buffer) === 1) {
            return true;
        }

        return mb_strlen($this->buffer) >= $this->minChunkSize;
    }
}

View File

@@ -0,0 +1,88 @@
<?php
declare(strict_types=1);
namespace App\Agent;
/**
 * ThinkSuppressor
 *
 * Streaming-safe suppressor for internal <think>...</think> sections.
 *
 * Key properties:
 * - Handles token fragmentation (partial tags across tokens)
 * - Keeps text that shares a token with a tag (before <think>, after </think>)
 * - Stateful per stream; call reset() before starting a new stream
 * - Does not buffer full responses (at most a partial tag is held back)
 * - Only the first think section is suppressed; afterwards stray closing
 *   tags are stripped and everything else passes through
 */
final class ThinkSuppressor
{
    private const OPEN_TAG = '<think>';
    private const CLOSE_TAG = '</think>';

    /** Indicates whether the stream is currently inside a <think> block. */
    private bool $insideThink = false;

    /** Indicates whether the think section has been fully closed. */
    private bool $thinkSectionCompleted = false;

    /**
     * Text received but not yet decided on: either the beginning of a
     * possible tag at the end of the last token, or nothing.
     */
    private string $pending = '';

    /**
     * Filters a single token from the LLM stream.
     *
     * A trailing fragment that could be the start of a tag (e.g. "<th") is
     * held back until the next token shows whether it really is one.
     *
     * @param string $token Raw token from the LLM
     *
     * @return string Cleaned text safe for user output (may be empty)
     */
    public function filter(string $token): string
    {
        // After the think section only stray closing tags are removed.
        if ($this->thinkSectionCompleted) {
            return str_replace(self::CLOSE_TAG, '', $token);
        }

        $this->pending .= $token;
        $output = '';

        if (!$this->insideThink) {
            $openPos = strpos($this->pending, self::OPEN_TAG);

            if ($openPos === false) {
                // No complete tag yet: emit everything except a trailing
                // fragment that might still grow into "<think>".
                $held = $this->partialTagLength($this->pending, self::OPEN_TAG);
                $emitLength = strlen($this->pending) - $held;
                $output = substr($this->pending, 0, $emitLength);
                $this->pending = substr($this->pending, $emitLength);

                return $output;
            }

            // Text in front of the tag belongs to the visible answer.
            $output = substr($this->pending, 0, $openPos);
            $this->pending = substr($this->pending, $openPos + strlen(self::OPEN_TAG));
            $this->insideThink = true;
        }

        $closePos = strpos($this->pending, self::CLOSE_TAG);

        if ($closePos === false) {
            // Still thinking: drop the content, keep only a possible
            // beginning of the closing tag.
            $held = $this->partialTagLength($this->pending, self::CLOSE_TAG);
            $this->pending = $held > 0 ? substr($this->pending, -$held) : '';

            return $output;
        }

        $afterThink = substr($this->pending, $closePos + strlen(self::CLOSE_TAG));
        $this->pending = '';
        $this->insideThink = false;
        $this->thinkSectionCompleted = true;

        // Emit a single line break after the think section, followed by any
        // answer text that arrived in the same token as the closing tag.
        return $output . "\n" . str_replace(self::CLOSE_TAG, '', $afterThink);
    }

    /**
     * Returns text that is still held back as a possible tag fragment.
     *
     * Intended to be called once after the last token so that an answer
     * ending in e.g. "<" is not lost. Returns '' while inside a think block.
     */
    public function flush(): string
    {
        $rest = $this->insideThink ? '' : $this->pending;
        $this->pending = '';

        return $rest;
    }

    /**
     * Resets the suppressor state.
     * Must be called before starting a new stream.
     */
    public function reset(): void
    {
        $this->insideThink = false;
        $this->thinkSectionCompleted = false;
        $this->pending = '';
    }

    /**
     * Length of the longest suffix of $text that is a proper prefix of $tag.
     */
    private function partialTagLength(string $text, string $tag): int
    {
        $max = min(strlen($text), strlen($tag) - 1);

        for ($length = $max; $length > 0; $length--) {
            if (str_ends_with($text, substr($tag, 0, $length))) {
                return $length;
            }
        }

        return 0;
    }
}

View File

@@ -0,0 +1,84 @@
<?php
declare(strict_types=1);
namespace App\Command;
use App\Agent\AgentRunner;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
/**
* AgentCliCommand
*
* Interactive CLI interface for the AI agent.
* Symfony-native, streaming-first implementation.
*
* Responsibilities:
* - Read user input from STDIN
* - Stream tokens from the AgentRunner
* - Render streamed output to the terminal
*
* The AgentRunner is the single owner of:
* - Think suppression
* - Context handling
* - Streaming semantics
*/
#[AsCommand(
    name: 'mto:agent:chat',
    description: 'Start an interactive CLI chat with the AI agent'
)]
final class AgentCliCommand extends Command
{
    public function __construct(
        private readonly AgentRunner $agentRunner,
    ) {
        parent::__construct();
    }

    /**
     * Declares the optional session identifier argument (defaults to "cli").
     */
    protected function configure(): void
    {
        $this->addArgument(
            'user-id',
            InputArgument::OPTIONAL,
            'User/session identifier',
            'cli'
        );
    }

    /**
     * Runs the question/answer loop until the user leaves.
     *
     * The loop ends successfully when no answer is given, when the answer is
     * blank, or when the user types "exit" (case-insensitive).
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $style = new SymfonyStyle($input, $output);
        $sessionId = (string) $input->getArgument('user-id');

        $style->success('AI Agent CLI started. Press Ctrl+C or type "exit" to quit.');
        $style->writeln('');

        for (;;) {
            $answer = $style->ask('Question');

            // A missing answer is treated exactly like a blank one.
            $question = $answer === null ? '' : trim($answer);

            if ($question === '' || strtolower($question) === 'exit') {
                $style->writeln('');

                return Command::SUCCESS;
            }

            $style->writeln('');
            $style->writeln('<info>Answer:</info>');

            // Write each streamed piece as-is, without added line breaks.
            foreach ($this->agentRunner->run($question, $sessionId) as $piece) {
                $output->write($piece);
            }

            $style->writeln('');
            $style->writeln('');
        }
    }
}

View File

@@ -0,0 +1,116 @@
<?php
// src/Command/KnowledgeIngestCommand.php
declare(strict_types=1);
namespace App\Command;
use App\Knowledge\Ingest\KnowledgeIngestService;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Finder\Finder;
#[AsCommand(
    name: 'mto:agent:knowledge:ingest',
    description: 'Ingest one or multiple markdown/text documents into file-based knowledge chunks'
)]
final class KnowledgeIngestCommand extends Command
{
    public function __construct(
        private readonly KnowledgeIngestService $ingest,
        private readonly string $uploadsDir,
    ) {
        parent::__construct();
    }

    /**
     * Declares the single-file argument and the --all / --optimize switches.
     */
    protected function configure(): void
    {
        $this->addArgument(
            'file',
            InputArgument::OPTIONAL,
            'Path to a single .txt/.md file'
        );
        $this->addOption(
            'all',
            null,
            InputOption::VALUE_NONE,
            'Ingest all .md files from the uploads directory'
        );
        $this->addOption(
            'optimize',
            'o',
            InputOption::VALUE_NONE,
            'Optimize chunks for retrieval quality'
        );
    }

    /**
     * Ingests either the given file or, with --all, every .md file found in
     * the uploads directory, and prints the written chunks.
     *
     * With --all the file argument is ignored.
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $useOptimizer = (bool) $input->getOption('optimize');
        $targets = [];

        if (!$input->getOption('all')) {
            // Single-file mode: the argument is mandatory here.
            $single = $input->getArgument('file');
            if (!$single) {
                $output->writeln('<error>❌ Either provide a file or use --all</error>');

                return Command::FAILURE;
            }

            $targets[] = (string) $single;
        } else {
            if (!is_dir($this->uploadsDir)) {
                $output->writeln('<error>❌ uploads directory not found</error>');

                return Command::FAILURE;
            }

            $markdownFiles = (new Finder())
                ->files()
                ->in($this->uploadsDir)
                ->name('*.md');

            if (!$markdownFiles->hasResults()) {
                $output->writeln('<comment> No .md files found in uploads/</comment>');

                return Command::SUCCESS;
            }

            foreach ($markdownFiles as $entry) {
                $targets[] = $entry->getRealPath();
            }

            $mode = $useOptimizer ? 'optimized' : 'standard';
            $output->writeln(sprintf(
                '📂 Ingesting %d markdown files from uploads (%s)',
                count($targets),
                $mode
            ));
        }

        $chunkTotal = 0;

        foreach ($targets as $target) {
            $output->writeln('➡️ Ingesting: ' . $target);

            $chunks = $this->ingest->ingestFile(
                $target,
                optimize: $useOptimizer
            );
            $chunkTotal += count($chunks);

            foreach ($chunks as $chunkName) {
                $output->writeln(' - ' . $chunkName);
            }
        }

        $output->writeln('');
        $output->writeln('✅ Total written chunks: ' . $chunkTotal);

        return Command::SUCCESS;
    }
}

View File

@@ -0,0 +1,89 @@
<?php
declare(strict_types=1);
namespace App\Command;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
#[AsCommand(
    name: 'mto:agent:vector:ingest',
    description: 'Builds the FAISS vector index from index.json'
)]
final class VectorIngestCommand extends Command
{
    public function __construct(
        private readonly string $vectorDir,
        private readonly string $projectDir
    ) {
        parent::__construct();
    }

    /**
     * Runs vector_ingest.py with the interpreter of the local virtual
     * environment and relays its output.
     *
     * Fails early if the vector directory, the script, the venv interpreter
     * or the knowledge directory is missing. The command result mirrors the
     * exit code of the script.
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $baseDir = rtrim($this->vectorDir, '/');
        if (!is_dir($baseDir)) {
            $output->writeln('<error>Vector directory not found</error>');

            return Command::FAILURE;
        }

        $ingestScript = $baseDir . '/vector_ingest.py';
        if (!is_file($ingestScript)) {
            $output->writeln('<error>vector_ingest.py not found</error>');

            return Command::FAILURE;
        }

        // Only the interpreter inside the venv is accepted.
        $interpreter = $baseDir . '/.venv/bin/python';
        if (!is_file($interpreter)) {
            $output->writeln('<error>No Python virtual environment found.</error>');
            $output->writeln('<comment>Run first:</comment>');
            $output->writeln('<info> php bin/console mto:agent:vector:install</info>');

            return Command::FAILURE;
        }

        $knowledgeDir = rtrim($this->projectDir, '/') . '/var/knowledge';
        if (!is_dir($knowledgeDir)) {
            $output->writeln('<error>Knowledge directory not found:</error>');
            $output->writeln($knowledgeDir);

            return Command::FAILURE;
        }

        $output->writeln('<info>Building FAISS vector index…</info>');
        $output->writeln(sprintf('<comment>Vector dir:</comment> %s', $baseDir));
        $output->writeln(sprintf('<comment>Knowledge dir:</comment> %s', $knowledgeDir));

        // Every path is shell-escaped individually; stderr is merged into
        // stdout so error text of the script is shown as well.
        $shellArgs = array_map(
            escapeshellarg(...),
            [$interpreter, $ingestScript, $baseDir, $knowledgeDir]
        );
        $command = sprintf('%s %s %s %s 2>&1', ...$shellArgs);

        $lines = [];
        exec($command, $lines, $status);

        foreach ($lines as $line) {
            $output->writeln($line);
        }

        if ($status === 0) {
            return Command::SUCCESS;
        }

        return Command::FAILURE;
    }
}

View File

@@ -0,0 +1,114 @@
<?php
declare(strict_types=1);
namespace App\Command;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
/**
* System requirements (once per environment):
* sudo apt update
* sudo apt install -y python3-venv python3-pip
*/
#[AsCommand(
    name: 'mto:agent:vector:install',
    description: 'Creates a Python venv and installs vector dependencies'
)]
final class VectorInstallCommand extends Command
{
    public function __construct(
        private readonly string $vectorDir
    ) {
        parent::__construct();
    }

    /**
     * Creates the virtual environment if needed, verifies that pip is
     * available inside it and installs/updates the vector dependencies.
     *
     * @return int Command::SUCCESS, or Command::FAILURE as soon as one step fails
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        if (!is_dir($this->vectorDir)) {
            $output->writeln('<error>Vector directory not found</error>');

            return Command::FAILURE;
        }

        $vectorDir = rtrim($this->vectorDir, '/');
        $venvDir = $vectorDir . '/.venv';
        $venvPython = $venvDir . '/bin/python';

        // -------------------------------------------------
        // 1) Create venv if missing
        // -------------------------------------------------
        if (!is_dir($venvDir)) {
            $output->writeln('<info>Creating Python virtual environment…</info>');

            $exitCode = $this->runShell(
                sprintf('python3 -m venv %s 2>&1', escapeshellarg($venvDir)),
                $output
            );

            if ($exitCode !== 0 || !is_file($venvPython)) {
                $output->writeln('');
                $output->writeln('<error>Failed to create Python virtual environment.</error>');
                $output->writeln('<comment>Ensure that python3-venv is installed on the system.</comment>');

                return Command::FAILURE;
            }
        } else {
            $output->writeln('<info>Using existing Python virtual environment</info>');
        }

        // -------------------------------------------------
        // 2) Ensure pip exists inside venv
        // -------------------------------------------------
        // Only the exit code matters here; the version text is not shown.
        $exitCode = $this->runShell(
            sprintf('%s -m pip --version 2>&1', escapeshellarg($venvPython))
        );

        if ($exitCode !== 0) {
            $output->writeln('');
            $output->writeln('<error>The existing virtual environment has no pip.</error>');
            $output->writeln('<comment>This usually means it was created before python3-pip was installed.</comment>');
            $output->writeln('<comment>Fix:</comment>');
            $output->writeln(sprintf('<info> rm -rf %s</info>', $venvDir));
            $output->writeln('<info> php bin/console mto:agent:vector:install</info>');

            return Command::FAILURE;
        }

        // -------------------------------------------------
        // 3) Install / update dependencies
        // -------------------------------------------------
        $output->writeln('<info>Installing vector dependencies…</info>');

        $exitCode = $this->runShell(
            sprintf(
                '%s -m pip install --upgrade faiss-cpu sentence-transformers 2>&1',
                escapeshellarg($venvPython)
            ),
            $output
        );

        if ($exitCode !== 0) {
            $output->writeln('<error>Dependency installation failed</error>');

            return Command::FAILURE;
        }

        $output->writeln('');
        $output->writeln('<info>Vector dependencies installed successfully</info>');
        $output->writeln(sprintf('<comment>venv:</comment> %s', $venvDir));

        return Command::SUCCESS;
    }

    /**
     * Executes a shell command and optionally relays its output lines.
     *
     * A fresh array is used for every call: exec() appends to an array that
     * already has elements, which previously caused the output of earlier
     * steps to be printed again after later ones.
     *
     * @param string               $command Complete, already escaped command line
     * @param OutputInterface|null $output  Where to print the command output, or null to discard it
     *
     * @return int Exit code of the command
     */
    private function runShell(string $command, ?OutputInterface $output = null): int
    {
        $lines = [];
        $exitCode = 0;
        exec($command, $lines, $exitCode);

        if ($output !== null) {
            foreach ($lines as $line) {
                $output->writeln($line);
            }
        }

        return $exitCode;
    }
}

View File

@@ -0,0 +1,126 @@
<?php
declare(strict_types=1);
namespace App\Context;
/**
* ContextService
*
* Manages conversational history persistence and retrieval.
*
* Responsibilities:
* - Persist completed conversation turns (append-only)
* - Provide recent or extended conversation context
* - Resolve history storage paths safely
*
* Non-responsibilities:
* - No follow-up detection
* - No prompt semantics
* - No interpretation of user intent
*
* Context levels:
* - Regular context: last N lines (default)
* - Full context: extended history for special cases
*/
final class ContextService
{
    /** Lines returned for the regular (default) context. */
    private const REGULAR_LINE_LIMIT = 20;

    /** Lines returned for the extended ("full") context. */
    private const FULL_LINE_LIMIT = 500;

    /** Absolute directory that holds one history file per user. */
    private string $historyDir;

    /**
     * @param string $historyDir History directory; a path not starting with
     *                           "/" is resolved below $projectDir
     * @param string $projectDir Project root used for relative history paths
     */
    public function __construct(
        string $historyDir,
        string $projectDir,
    ) {
        $normalized = rtrim($historyDir, '/');

        $this->historyDir = str_starts_with($normalized, '/')
            ? $normalized
            : rtrim($projectDir, '/') . '/' . ltrim($normalized, '/');

        // Create the directory (including parents) on first use.
        if (!is_dir($this->historyDir)) {
            mkdir($this->historyDir, 0777, true);
        }
    }

    /**
     * Returns the tail of a user's stored history as one string.
     *
     * @param string $userId Stable client identifier
     * @param bool   $full   true for the extended limit, false for the regular one
     *
     * @return string The last lines joined by "\n"; '' if there is no
     *                readable history file
     */
    public function buildUserContext(string $userId, bool $full = false): string
    {
        $file = $this->getHistoryPath($userId);

        $lines = is_file($file) ? file($file, FILE_IGNORE_NEW_LINES) : false;
        if ($lines === false) {
            return '';
        }

        $limit = $full ? self::FULL_LINE_LIMIT : self::REGULAR_LINE_LIMIT;

        return implode("\n", array_slice($lines, -$limit));
    }

    /**
     * Appends one finished exchange to the user's history file.
     *
     * Written format:
     *   Question: <user prompt>
     *   <assistant response>
     */
    public function appendHistory(string $userId, string $prompt, string $response): void
    {
        file_put_contents(
            $this->getHistoryPath($userId),
            "Question: {$prompt}\n{$response}\n",
            FILE_APPEND | LOCK_EX
        );
    }

    /**
     * Removes the user's history file if it exists.
     */
    public function deleteHistory(string $userId): void
    {
        $file = $this->getHistoryPath($userId);

        if (!is_file($file)) {
            return;
        }

        unlink($file);
    }

    /**
     * Maps a user id to its history file.
     *
     * Every character outside [a-zA-Z0-9_-] becomes "_", so the id cannot
     * introduce path separators.
     */
    private function getHistoryPath(string $userId): string
    {
        $fileName = preg_replace('/[^a-zA-Z0-9_-]/', '_', $userId) . '.txt';

        return $this->historyDir . '/' . $fileName;
    }
}

120
src/Context/UrlAnalyzer.php Normal file
View File

@@ -0,0 +1,120 @@
<?php
declare(strict_types=1);
namespace App\Context;
use RuntimeException;
/**
* UrlAnalyzer
*
* Extracts and analyzes URL content from user prompts in a production-safe way.
*
* Responsibilities:
* - Detect the first URL inside a prompt
* - Fetch remote content with strict limits
* - Clean and normalize readable text
* - Identify trusted internal domains based on URL host
*
* Design constraints:
* - No framework dependencies
* - No prompt or agent logic
* - Defensive against slow or large responses
*/
final class UrlAnalyzer
{
    /** Timeout for the HTTP request, in seconds. */
    private int $timeoutSeconds = 20;

    /** Maximum number of characters of cleaned text that is returned. */
    private int $maxChars = 5000;

    /**
     * List of trusted internal domains.
     * Used for marking content as authoritative.
     *
     * @var list<string>
     */
    private array $internalDomains = [
        'mitho-media.de',
    ];

    /**
     * Extracts readable text from the first URL found in a prompt.
     *
     * NOTE(review): the URL comes straight from user input and is fetched
     * without any host restriction, so loopback or internal addresses can
     * be requested as well (SSRF). Consider an allow/deny list.
     *
     * @param string $prompt
     *
     * @return string Cleaned page text or empty string on failure
     */
    public function extractContentFromPrompt(string $prompt): string
    {
        $url = $this->findFirstUrl($prompt);
        if ($url === null) {
            return '';
        }

        $html = $this->fetchLimited($url);
        if ($html === '') {
            return '';
        }

        // Remove script and style blocks
        $html = preg_replace('~<script[^>]*>.*?</script>~is', '', $html) ?? $html;
        $html = preg_replace('~<style[^>]*>.*?</style>~is', '', $html) ?? $html;

        // Strip remaining HTML and normalize whitespace
        $text = strip_tags($html);
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return mb_substr(trim($text), 0, $this->maxChars);
    }

    /**
     * Determines whether a URL belongs to a trusted internal domain.
     *
     * Matches the domain itself and any of its subdomains.
     *
     * @param string $url
     *
     * @return bool
     */
    public function isInternalDomainUrl(string $url): bool
    {
        $parts = parse_url($url);
        if ($parts === false || empty($parts['host'])) {
            return false;
        }

        $host = mb_strtolower($parts['host']);

        foreach ($this->internalDomains as $domain) {
            if ($host === $domain || str_ends_with($host, '.' . $domain)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Finds the first http(s):// or www. URL in the prompt.
     *
     * @return string|null Absolute URL with scheme, or null if none is usable
     */
    private function findFirstUrl(string $prompt): ?string
    {
        if (preg_match('~https?://\S+|www\.\S+~i', $prompt, $matches) !== 1) {
            return null;
        }

        $url = $this->stripTrailingPunctuation($matches[0]);

        // The pattern above is case-insensitive, so the scheme test has to
        // be as well; otherwise "HTTPS://…" would get a second scheme.
        if (preg_match('~^https?://~i', $url) !== 1) {
            $url = 'https://' . $url;
        }

        $parts = parse_url($url);
        if ($parts === false || empty($parts['host'])) {
            return null;
        }

        return $url;
    }

    /**
     * Removes sentence punctuation that \S+ captured after the URL,
     * e.g. "https://example.com/page." or "(see https://example.com)".
     *
     * A closing bracket is only removed when the URL has no matching opener.
     */
    private function stripTrailingPunctuation(string $url): string
    {
        do {
            $before = $url;
            $url = rtrim($url, '.,;:!?"\'>');

            foreach ([')' => '(', ']' => '['] as $close => $open) {
                if (str_ends_with($url, $close) && !str_contains($url, $open)) {
                    $url = substr($url, 0, -1);
                }
            }
        } while ($url !== $before);

        return $url;
    }

    /**
     * Downloads at most about twice $maxChars bytes from the URL.
     *
     * @return string Raw response body (possibly truncated); '' on failure
     */
    private function fetchLimited(string $url): string
    {
        $context = stream_context_create([
            'http' => [
                'timeout' => $this->timeoutSeconds,
                'user_agent' => 'mithoAgent/1.0',
                'ignore_errors' => true,
            ],
        ]);

        // Best effort: an unreachable URL simply yields no content, so the
        // warning fopen() would raise is intentionally suppressed.
        $handle = @fopen($url, 'rb', false, $context);
        if ($handle === false) {
            return '';
        }

        $byteLimit = $this->maxChars * 2;
        $html = '';

        try {
            while (!feof($handle) && strlen($html) < $byteLimit) {
                $chunk = fread($handle, 1024);

                // Stop on a read error instead of looping on it.
                if ($chunk === false) {
                    break;
                }

                // An empty read after a timeout will not produce more data.
                if ($chunk === '' && (stream_get_meta_data($handle)['timed_out'] ?? false)) {
                    break;
                }

                $html .= $chunk;
            }
        } finally {
            fclose($handle);
        }

        return $html;
    }
}

0
src/Controller/.gitignore vendored Normal file
View File

View File

@@ -0,0 +1,115 @@
<?php
declare(strict_types=1);
namespace App\Controller;
use App\Agent\AgentRunner;
use App\Http\ClientIdResolver;
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\HttpFoundation\StreamedResponse;
use Symfony\Component\Routing\Annotation\Route;
/**
 * Streams agent answers to the browser as Server-Sent Events.
 */
final readonly class AskSseController
{
    public function __construct(
        private AgentRunner $agentRunner,
        private ClientIdResolver $clientIdResolver,
    ) {}

    /**
     * Accepts a JSON body with a "prompt" field and streams the answer.
     *
     * Each answer chunk is sent as an unnamed SSE message; the stream ends
     * with a "done" event, or an "error" event if the prompt is empty.
     */
    #[Route('/ask-sse', name: 'ask_sse', methods: ['POST'])]
    public function stream(Request $request): StreamedResponse
    {
        $data = json_decode($request->getContent(), true);

        // Anything other than a JSON object with a scalar "prompt" counts
        // as an empty prompt.
        $rawPrompt = is_array($data) ? ($data['prompt'] ?? '') : '';
        $prompt = is_scalar($rawPrompt) ? trim((string) $rawPrompt) : '';

        $cookieResponse = new Response();
        $clientId = $this->clientIdResolver->resolve($request, $cookieResponse);

        $response = new StreamedResponse(
            function () use ($prompt, $clientId): void {
                // ---------------------------------------------------------
                // Disable all PHP output buffering
                // ---------------------------------------------------------
                while (ob_get_level() > 0) {
                    ob_end_flush();
                }

                // ---------------------------------------------------------
                // SSE prelude
                // ---------------------------------------------------------
                echo "retry: 3000\n\n";
                flush();

                if ($prompt === '') {
                    $this->sendEvent('error', 'Empty prompt');
                    return;
                }

                // ---------------------------------------------------------
                // Send chunks directly (line breaks are preserved)
                // ---------------------------------------------------------
                foreach ($this->agentRunner->run($prompt, $clientId) as $chunk) {
                    // Normalize line endings
                    $chunk = str_replace(["\r\n", "\r"], "\n", $chunk);

                    $this->sendData($chunk);
                }

                // ---------------------------------------------------------
                // Signal completion
                // ---------------------------------------------------------
                $this->sendEvent('done', '[DONE]');
            },
            200,
            [
                'Content-Type' => 'text/event-stream; charset=utf-8',
                'Cache-Control' => 'no-cache, no-store, must-revalidate',
                'Connection' => 'keep-alive',
                'X-Accel-Buffering' => 'no',
            ]
        );

        // Attach the cookies to the response itself so they are sent with
        // the regular headers. Calling header() inside the callback ran
        // after the output buffers had been flushed, at which point headers
        // may already be on the wire and the cookie would be lost.
        foreach ($cookieResponse->headers->getCookies() as $cookie) {
            $response->headers->setCookie($cookie);
        }

        return $response;
    }

    /**
     * Sends one unnamed SSE message while keeping the Markdown structure.
     *
     * SSE allows several "data:" lines per event, so every line of the chunk
     * is sent as its own data line.
     */
    private function sendData(string $data): void
    {
        // Split on "\n" and send each line separately
        $lines = explode("\n", $data);
        foreach ($lines as $line) {
            echo 'data: ' . $line . "\n";
        }

        // Blank line = end of the SSE message
        echo "\n\n";
        flush();
    }

    /**
     * Sends a named SSE event. Line breaks in the payload become spaces.
     */
    private function sendEvent(string $event, string $data): void
    {
        $safe = str_replace(["\r", "\n"], ' ', $data);
        echo "event: {$event}\n";
        echo "data: {$safe}\n\n";
        flush();
    }
}

View File

@@ -0,0 +1,127 @@
<?php
declare(strict_types=1);
namespace App\Controller;
use App\Context\ContextService;
use App\Http\ClientIdResolver;
use Symfony\Component\HttpFoundation\JsonResponse;
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\Routing\Annotation\Route;
/**
* HistoryController
*
* Read-only and destructive endpoints for conversation history.
*
* Responsibilities:
* - Expose stored chat history for frontend reload
* - Allow explicit deletion of the current client's history
*
* Identity handling:
* - Client identity is resolved exclusively via ClientIdResolver
* - No user identifiers are accepted from the request
*/
final class HistoryController
{
    public function __construct(
        private readonly ContextService $contextService,
        private readonly ClientIdResolver $clientIdResolver,
    ) {}

    /**
     * Returns the full conversation history for the current client
     * in a frontend-friendly structure.
     */
    #[Route('/history', name: 'chat_history', methods: ['GET'])]
    public function history(Request $request): JsonResponse
    {
        // Resolve client ID (cookie-based)
        $response = new Response();
        $clientId = $this->clientIdResolver->resolve($request, $response);

        $raw = $this->contextService->buildUserContext($clientId, full: true);
        if ($raw === '') {
            return $this->jsonWithCookies([], $response);
        }

        return $this->jsonWithCookies($this->parseMessages($raw), $response);
    }

    /**
     * Deletes the complete conversation history for the current client.
     */
    #[Route('/history/delete', name: 'delete_history', methods: ['POST'])]
    public function delete(Request $request): JsonResponse
    {
        // Resolve client ID (cookie-based)
        $response = new Response();
        $clientId = $this->clientIdResolver->resolve($request, $response);

        $this->contextService->deleteHistory($clientId);

        return $this->jsonWithCookies(
            [
                'status' => 'ok',
                'message' => 'History deleted',
            ],
            $response
        );
    }

    /**
     * Splits the stored history text into user and assistant messages.
     *
     * A line starting with "Question: " is a user message; all lines up to
     * the next such line form one assistant message. Blank lines inside an
     * answer are kept, because Markdown needs them to separate paragraphs
     * and lists; only leading and trailing whitespace is trimmed.
     *
     * @return list<array{role: string, text: string}>
     */
    private function parseMessages(string $raw): array
    {
        $messages = [];
        $assistantLines = [];

        foreach (explode("\n", $raw) as $line) {
            if (!str_starts_with($line, 'Question: ')) {
                // Assistant output (can span multiple lines)
                $assistantLines[] = $line;
                continue;
            }

            // Flush previous assistant output
            $assistant = $this->assistantMessage($assistantLines);
            if ($assistant !== null) {
                $messages[] = $assistant;
            }
            $assistantLines = [];

            $messages[] = [
                'role' => 'user',
                // 10 = length of the "Question: " prefix
                'text' => trim(substr($line, 10)),
            ];
        }

        // Flush trailing assistant output
        $assistant = $this->assistantMessage($assistantLines);
        if ($assistant !== null) {
            $messages[] = $assistant;
        }

        return $messages;
    }

    /**
     * Builds an assistant message from collected lines.
     *
     * @param list<string> $lines
     *
     * @return array{role: string, text: string}|null null if there is no visible text
     */
    private function assistantMessage(array $lines): ?array
    {
        $text = trim(implode("\n", $lines));
        if ($text === '') {
            return null;
        }

        return [
            'role' => 'assistant',
            'text' => $text,
        ];
    }

    /**
     * Helper to return JSON responses while forwarding cookies.
     */
    private function jsonWithCookies(array $data, Response $cookieResponse): JsonResponse
    {
        $json = new JsonResponse($data);

        foreach ($cookieResponse->headers->getCookies() as $cookie) {
            $json->headers->setCookie($cookie);
        }

        return $json;
    }
}

View File

@@ -0,0 +1,46 @@
<?php
declare(strict_types=1);
namespace App\Http;
use Symfony\Component\HttpFoundation\Cookie;
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\Uid\Uuid;
/**
* ClientIdResolver
*
* Resolves a stable, anonymous client identifier for browser-based users.
* The identifier is stored as an HttpOnly cookie.
*/
final class ClientIdResolver
{
    private const COOKIE_NAME = 'ai_client_id';

    /**
     * Returns the client id from the request cookie, or issues a new one.
     *
     * The cookie is client-controlled, so it is only accepted when it is a
     * well-formed UUID, the format this class issues. Any other value
     * (e.g. a guessed identifier such as "cli") is ignored and replaced,
     * which keeps browser clients from selecting arbitrary histories.
     *
     * @param Request  $request  Incoming request carrying the cookie, if any
     * @param Response $response Receives the Set-Cookie header when a new id is issued
     *
     * @return string The resolved or newly generated client id
     */
    public function resolve(Request $request, Response $response): string
    {
        $candidate = $request->cookies->get(self::COOKIE_NAME);

        if (is_string($candidate) && Uuid::isValid($candidate)) {
            return $candidate;
        }

        $clientId = Uuid::v4()->toRfc4122();

        $response->headers->setCookie(
            new Cookie(
                name: self::COOKIE_NAME,
                value: $clientId,
                expire: strtotime('+1 year'),
                path: '/',
                // Mark the cookie secure whenever the request itself is HTTPS
                secure: $request->isSecure(),
                httpOnly: true,
                sameSite: Cookie::SAMESITE_LAX
            )
        );

        return $clientId;
    }
}

View File

@@ -0,0 +1,148 @@
<?php
declare(strict_types=1);
namespace App\Infrastructure;
use Generator;
use JsonException;
use RuntimeException;
use Throwable;
/**
 * OllamaClient
 *
 * Streaming client for Ollama-compatible LLM backends.
 *
 * Implementation strategy:
 * - Use curl_multi_* to keep control of the execution loop
 * - Accumulate partial chunks into a rolling buffer
 * - Extract newline-delimited JSON objects incrementally
 * - Yield tokens as soon as a complete line has arrived
 *   (no yield inside the cURL write callback)
 */
final class OllamaClient
{
    private string $apiUrl;
    private string $model;
    private int $timeoutSeconds;

    public function __construct(
        string $apiUrl,
        string $model,
        int $timeoutSeconds,
    ) {
        $this->apiUrl = $apiUrl;
        $this->model = $model;
        $this->timeoutSeconds = $timeoutSeconds;
    }

    /**
     * Streams tokens from the LLM backend while the request is running.
     *
     * @param string $prompt Fully constructed prompt
     *
     * @return Generator<string>
     *
     * @throws JsonException    If the request payload cannot be encoded
     * @throws RuntimeException If cURL cannot be initialised, the transfer
     *                          fails, or the backend answers with HTTP >= 400
     */
    public function stream(string $prompt): Generator
    {
        $payload = json_encode([
            'model' => $this->model,
            'prompt' => $prompt,
            'stream' => true,
        ], JSON_THROW_ON_ERROR);

        $buffer = '';
        $done = false;

        $ch = curl_init($this->apiUrl);
        if ($ch === false) {
            throw new RuntimeException('Failed to initialize cURL');
        }

        curl_setopt_array($ch, [
            CURLOPT_POST => true,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_RETURNTRANSFER => false,
            CURLOPT_TIMEOUT => $this->timeoutSeconds,
            // The callback only appends to the buffer; parsing and yielding
            // happen in the generator body below.
            CURLOPT_WRITEFUNCTION => static function ($curl, string $data) use (&$buffer): int {
                $buffer .= $data;

                return strlen($data);
            },
        ]);

        $mh = curl_multi_init();
        if ($mh === false) {
            curl_close($ch);
            throw new RuntimeException('Failed to initialize cURL multi handle');
        }

        curl_multi_add_handle($mh, $ch);

        try {
            $running = 0;

            do {
                // Drive the transfer
                do {
                    $status = curl_multi_exec($mh, $running);
                } while ($status === CURLM_CALL_MULTI_PERFORM);

                if ($status !== CURLM_OK) {
                    throw new RuntimeException('LLM connection error: ' . curl_multi_strerror($status));
                }

                // Consume every complete line currently in the buffer
                while (($pos = strpos($buffer, "\n")) !== false) {
                    $json = $this->decodeLine(substr($buffer, 0, $pos));
                    $buffer = substr($buffer, $pos + 1);

                    if ($json === null) {
                        continue;
                    }

                    if (isset($json['response'])) {
                        yield $json['response'];
                    }

                    if (!empty($json['done'])) {
                        $done = true;
                    }
                }

                // Wait for network activity; back off briefly if select fails
                // so the loop cannot spin at full speed.
                if ($running && !$done && curl_multi_select($mh, 0.2) === -1) {
                    usleep(1000);
                }
            } while ($running && !$done);

            // Flush a trailing line that was not newline-terminated
            if (!$done) {
                $json = $this->decodeLine($buffer);
                if ($json !== null && isset($json['response'])) {
                    yield $json['response'];
                }
            }

            // With the multi interface, per-transfer errors are reported via
            // curl_multi_info_read(); curl_errno($ch) alone stays 0.
            while (($info = curl_multi_info_read($mh)) !== false) {
                if ($info['msg'] === CURLMSG_DONE && $info['result'] !== CURLE_OK) {
                    throw new RuntimeException(
                        'LLM connection error: ' . curl_strerror($info['result'])
                    );
                }
            }

            $httpStatus = (int) curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
            if ($httpStatus >= 400) {
                throw new RuntimeException('LLM backend returned HTTP status ' . $httpStatus);
            }
        } finally {
            curl_multi_remove_handle($mh, $ch);
            curl_multi_close($mh);
            curl_close($ch);
        }
    }

    /**
     * Decodes one line of the stream; returns null for blank or malformed lines.
     *
     * @return array<mixed>|null
     */
    private function decodeLine(string $line): ?array
    {
        $line = trim($line);
        if ($line === '') {
            return null;
        }

        try {
            $decoded = json_decode($line, true, flags: JSON_THROW_ON_ERROR);
        } catch (JsonException) {
            // Malformed lines are skipped on purpose (best-effort stream parsing).
            return null;
        }

        return is_array($decoded) ? $decoded : null;
    }
}

11
src/Kernel.php Normal file
View File

@@ -0,0 +1,11 @@
<?php
namespace App;
use Symfony\Bundle\FrameworkBundle\Kernel\MicroKernelTrait;
use Symfony\Component\HttpKernel\Kernel as BaseKernel;
/**
 * Application kernel.
 *
 * Extends Symfony's base HttpKernel class and pulls in MicroKernelTrait;
 * it declares no members of its own.
 */
class Kernel extends BaseKernel
{
use MicroKernelTrait;
}

View File

@@ -0,0 +1,58 @@
<?php
// src/Knowledge/Ingest/ChunkIndexWriter.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
/**
 * Maintains the JSON chunk index file (a list of entry arrays).
 *
 * Every mutation re-reads the file, appends, and writes the whole list back.
 */
final class ChunkIndexWriter
{
    public function __construct(
        private string $indexPath
    ) {}

    /**
     * Appends one entry to the index and persists it.
     *
     * @param array<string, mixed> $entry
     *
     * @throws \JsonException    If the index cannot be encoded (e.g. invalid UTF-8)
     * @throws \RuntimeException If the index directory or file cannot be written
     */
    public function add(array $entry): void
    {
        $index = $this->load();
        $index[] = $entry;
        $this->save($index);
    }

    /**
     * Tells whether an entry with the given source name and source hash exists.
     */
    public function hasSourceHash(string $source, string $hash): bool
    {
        foreach ($this->load() as $entry) {
            if (
                ($entry['source'] ?? null) === $source &&
                ($entry['sourceHash'] ?? null) === $hash
            ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Reads the index; a missing, empty, or undecodable file yields [].
     */
    private function load(): array
    {
        if (!is_file($this->indexPath)) {
            return [];
        }

        $json = file_get_contents($this->indexPath);
        $data = $json ? json_decode($json, true) : null;

        return is_array($data) ? $data : [];
    }

    /**
     * Writes the complete index back to disk.
     *
     * Encoding happens before the file is touched, so an encoding failure
     * can no longer truncate an existing index.
     */
    private function save(array $index): void
    {
        $dir = dirname($this->indexPath);
        // Re-check is_dir() after mkdir() to tolerate a concurrent creation.
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException("Could not create index directory: {$dir}");
        }

        $json = json_encode(
            $index,
            JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
        );

        if (file_put_contents($this->indexPath, $json, LOCK_EX) === false) {
            throw new \RuntimeException("Could not write index file: {$this->indexPath}");
        }
    }
}

View File

@@ -0,0 +1,149 @@
<?php
// src/Knowledge/Ingest/ChunkWriter.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
use App\Knowledge\StopWords;
/**
 * Writes text chunks to individual files and records them in the
 * manifest and in the keyword index.
 */
final class ChunkWriter
{
    /** Maximum number of distinct keywords stored per chunk. */
    private const MAX_KEYWORDS = 25;

    public function __construct(
        private string $chunksDir,
        private string $manifestPath,
        private ChunkIndexWriter $indexWriter,
        private StopWords $stopWords,
    )
    {
    }

    /**
     * Persists every chunk as "<base>__<timestamp>__<nnnn>.txt", prefixed
     * with a source header line, and registers it in manifest and index.
     *
     * @param string[] $chunks
     * @return string[] written filenames
     *
     * @throws \RuntimeException If a directory or file cannot be written
     * @throws \JsonException    If the manifest cannot be encoded
     */
    public function write(string $sourceName, array $chunks, string $sourceHash): array
    {
        $this->ensureDirectory($this->chunksDir);

        $manifest = $this->loadManifest();
        $written = [];

        $base = $this->safeBase($sourceName);
        $ts = date('Ymd_His');

        foreach ($chunks as $i => $chunk) {
            $filename = "{$base}__{$ts}__" . str_pad((string)$i, 4, '0', STR_PAD_LEFT) . ".txt";
            $path = rtrim($this->chunksDir, '/') . '/' . $filename;

            $header = $this->buildHeader(
                source: $sourceName,
                index: $i
            );

            if (file_put_contents($path, $header . "\n\n" . $chunk) === false) {
                throw new \RuntimeException("Could not write chunk file: {$path}");
            }

            $written[] = $filename;

            $manifest[] = [
                'file' => $filename,
                'source' => $sourceName,
                'index' => $i,
                'chars' => mb_strlen($chunk),
                'createdAt' => date('c'),
            ];

            $this->indexWriter->add([
                'file' => $filename,
                'source' => $sourceName,
                'sourceHash' => $sourceHash,
                'keywords' => $this->extractKeywords($chunk),
                'chars' => mb_strlen($chunk),
            ]);
        }

        $this->saveManifest($manifest);

        return $written;
    }

    /**
     * Derives a lowercase, filesystem-friendly base name from the source
     * name: everything outside [a-z0-9-_] collapses to a single '-'.
     */
    private function safeBase(string $name): string
    {
        $name = pathinfo($name, PATHINFO_FILENAME);
        $name = mb_strtolower($name);
        $name = preg_replace('/[^a-z0-9\-_]+/u', '-', $name);

        return trim((string)$name, '-');
    }

    /**
     * Reads the manifest; a missing, empty, or undecodable file yields [].
     */
    private function loadManifest(): array
    {
        if (!is_file($this->manifestPath)) {
            return [];
        }

        $json = file_get_contents($this->manifestPath);
        $data = $json ? json_decode($json, true) : null;

        return is_array($data) ? $data : [];
    }

    /**
     * Writes the manifest; encoding errors throw instead of truncating the file.
     */
    private function saveManifest(array $manifest): void
    {
        $this->ensureDirectory(dirname($this->manifestPath));

        $json = json_encode(
            $manifest,
            JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
        );

        if (file_put_contents($this->manifestPath, $json) === false) {
            throw new \RuntimeException("Could not write manifest: {$this->manifestPath}");
        }
    }

    /**
     * Creates the directory (recursively) when it does not exist yet.
     */
    private function ensureDirectory(string $dir): void
    {
        // Re-check is_dir() after mkdir() to tolerate a concurrent creation.
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException("Could not create directory: {$dir}");
        }
    }

    /**
     * Builds the header line placed at the top of each chunk file
     * (chunk numbers are shown 1-based).
     */
    private function buildHeader(string $source, int $index): string
    {
        return sprintf(
            '[Quelle: %s | Abschnitt: Chunk %d]',
            $source,
            $index + 1
        );
    }

    /**
     * Extracts up to MAX_KEYWORDS distinct keywords from the chunk text.
     *
     * Words shorter than four characters and stop words are dropped.
     * De-duplication happens before the limit is applied so that repeated
     * words do not use up keyword slots.
     *
     * @return string[]
     */
    private function extractKeywords(string $text): array
    {
        // 1) Lowercase
        $text = mb_strtolower($text);

        // 2) Remove URLs
        $text = (string) preg_replace('#https?://\S+#u', ' ', $text);

        // 3) Newlines and tabs become spaces
        $text = str_replace(["\r", "\n", "\t"], ' ', $text);

        // 4) Separators become spaces (replaced, not deleted, so words stay apart)
        $text = (string) preg_replace('/[\/\.\,\:\;\-\_\(\)\[\]\{\}]/u', ' ', $text);

        // 5) Strip everything that is not a letter, digit, or whitespace
        $text = (string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text);

        // 6) Normalise whitespace
        $text = trim((string) preg_replace('/\s+/u', ' ', $text));

        // 7) Split into words
        $words = explode(' ', $text);

        // 8) Filter (stop-word list is fetched once, not per word)
        $stopWords = $this->stopWords->getStopWords() ?? [];
        $keywords = [];

        foreach ($words as $word) {
            if (mb_strlen($word) < 4) {
                continue;
            }

            if (in_array($word, $stopWords, true)) {
                continue;
            }

            $keywords[] = $word;
        }

        // 9) De-duplicate first, then apply the limit
        return array_slice(array_values(array_unique($keywords)), 0, self::MAX_KEYWORDS);
    }
}

View File

@@ -0,0 +1,37 @@
<?php
// src/Knowledge/Ingest/DocumentLoader.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
/**
 * Loads the raw text of a knowledge document from disk.
 *
 * Only plain-text formats (.txt, .md) are handled; other extensions are rejected.
 */
final class DocumentLoader
{
    /**
     * Returns the contents of the file at $path.
     *
     * @throws \RuntimeException If the file is missing, unreadable, or of an unsupported type
     */
    public function load(string $path): string
    {
        if (!is_file($path)) {
            throw new \RuntimeException("File not found: {$path}");
        }

        $extension = mb_strtolower(pathinfo($path, PATHINFO_EXTENSION));

        // Further formats (pdf, docx) are not implemented yet.
        if (in_array($extension, ['txt', 'md'], true)) {
            return $this->loadText($path);
        }

        throw new \RuntimeException("Unsupported file type: .{$extension}");
    }

    /**
     * Reads a plain-text file in one go.
     */
    private function loadText(string $path): string
    {
        $raw = file_get_contents($path);

        return $raw !== false
            ? $raw
            : throw new \RuntimeException("Could not read file: {$path}");
    }
}

View File

@@ -0,0 +1,39 @@
<?php
// src/Knowledge/Ingest/KnowledgeIngestService.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
/**
 * Orchestrates ingestion of one document: load, optionally tidy,
 * skip if already indexed, chunk, and write.
 */
final class KnowledgeIngestService
{
    public function __construct(
        private DocumentLoader $loader,
        private SimpleChunker $chunker,
        private ChunkWriter $writer,
        private ChunkIndexWriter $indexWriter,
    )
    {
    }

    /**
     * Ingests the file at $path.
     *
     * With $optimize, runs of three or more newlines are collapsed to two
     * and trailing spaces/tabs are removed before hashing and chunking.
     * Returns [] when the index already holds this source name with the
     * same content hash.
     *
     * @return string[] written chunk filenames
     */
    public function ingestFile(string $path, bool $optimize = false): array
    {
        $content = $this->loader->load($path);

        if ($optimize) {
            $content = preg_replace("/\n{3,}/", "\n\n", $content);
            $content = preg_replace("/[ \t]+$/m", "", $content);
        }

        $fingerprint = sha1($content);
        $origin = basename($path);

        if (!$this->indexWriter->hasSourceHash($origin, $fingerprint)) {
            return $this->writer->write(
                $origin,
                $this->chunker->chunk($content),
                $fingerprint
            );
        }

        return [];
    }
}

View File

@@ -0,0 +1,146 @@
<?php
// src/Knowledge/Ingest/SimpleChunker.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
/**
 * Splits text into overlapping, word-limited chunks, preferring to cut
 * at paragraph or sentence boundaries.
 */
final class SimpleChunker
{
    /**
     * @param int $maxWords     Maximum number of words per chunk
     * @param int $overlapWords Words repeated at the start of the next chunk
     *                          after a hard (non-boundary) cut
     */
    public function __construct(
        private int $maxWords = 180,
        private int $overlapWords = 30
    ) {}

    /**
     * Chunks the given text.
     *
     * Every word of the normalised input ends up in at least one chunk:
     * the next window is derived from where the current chunk actually
     * ended, not from the unadjusted hard cut.
     *
     * @return string[]
     */
    public function chunk(string $text): array
    {
        $text = $this->normalize($text);
        if ($text === '') {
            return [];
        }

        // Split into tokens: words + whitespace preserved
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        if (!$tokens) {
            return [];
        }

        // Build word index → token index mapping
        $wordTokenIndexes = [];
        foreach ($tokens as $i => $token) {
            if (!preg_match('/^\s+$/u', $token)) {
                $wordTokenIndexes[] = $i;
            }
        }

        $totalWords = count($wordTokenIndexes);
        if ($totalWords === 0) {
            return [];
        }

        // Guard against non-positive configuration values.
        $maxWords = max(1, $this->maxWords);
        $overlap = max(0, $this->overlapWords);

        $chunks = [];
        $wordPos = 0;
        $coveredUpTo = 0; // number of leading words already contained in emitted chunks

        while ($wordPos < $totalWords) {
            $hardEnd = min($wordPos + $maxWords, $totalWords);

            $tokenStart = $wordTokenIndexes[$wordPos];
            $hardTokenEnd = $wordTokenIndexes[$hardEnd - 1] + 1;

            // Boundary-aware cut (may move the end backwards)
            $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $hardTokenEnd);

            // Translate the adjusted token cut back into a word position.
            $wordEnd = $hardEnd;
            while ($wordEnd > $wordPos + 1 && $wordTokenIndexes[$wordEnd - 1] >= $tokenEnd) {
                $wordEnd--;
            }

            // Skip windows that lie entirely inside what was already emitted.
            if ($wordEnd > $coveredUpTo) {
                $chunk = trim(implode('', array_slice(
                    $tokens,
                    $tokenStart,
                    $tokenEnd - $tokenStart
                )));

                if ($chunk !== '') {
                    $chunks[] = $chunk;
                }

                $coveredUpTo = $wordEnd;
            }

            if ($wordEnd >= $totalWords) {
                break;
            }

            // After a cut at a natural boundary, continue right behind it;
            // overlap is only applied after a hard cut.
            $next = $tokenEnd < $hardTokenEnd ? $wordEnd : $wordEnd - $overlap;

            // Always make progress, even if overlap >= chunk size.
            if ($next <= $wordPos) {
                $next = $wordEnd;
            }

            $wordPos = $next;
        }

        return $this->dedupe($chunks);
    }

    /**
     * Unifies line endings, collapses runs of spaces/tabs to one space and
     * runs of three or more newlines to two, then trims.
     */
    private function normalize(string $text): string
    {
        $text = str_replace(["\r\n", "\r"], "\n", $text);
        $text = preg_replace("/[ \t]+/u", " ", $text);
        $text = preg_replace("/\n{3,}/u", "\n\n", (string) $text);

        return trim((string) $text);
    }

    /**
     * Move cut backwards to a natural boundary if possible.
     * Rules:
     * - Paragraph breaks ("\n\n" tokens) are always allowed
     * - A sentence end counts only if followed by a line break
     *
     * @param string[] $tokens
     *
     * @return int Exclusive token index at which the chunk ends
     */
    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
    {
        // Intended to keep markdown list blocks intact (e.g. "- Foo: Bar").
        // NOTE(review): word tokens never contain a space, so this pattern
        // cannot match as written; behaviour is left unchanged — confirm intent.
        $startToken = $tokens[$start] ?? '';
        if (preg_match('/^- /u', ltrim($startToken))) {
            return $end;
        }

        for ($i = $end - 1; $i > $start; $i--) {
            // Paragraph boundary
            if ($tokens[$i] === "\n\n") {
                return $i + 1;
            }

            // Sentence boundary only if followed by newline
            if (
                preg_match('/[.!?]\s*$/u', $tokens[$i]) &&
                isset($tokens[$i + 1]) &&
                str_contains($tokens[$i + 1], "\n")
            ) {
                return $i + 1;
            }
        }

        return $end;
    }

    /**
     * Removes chunks that are identical after case-folding and whitespace
     * collapsing, keeping the first occurrence.
     *
     * @param string[] $chunks
     * @return string[]
     */
    private function dedupe(array $chunks): array
    {
        $seen = [];
        $out = [];

        foreach ($chunks as $chunk) {
            $key = mb_strtolower(
                (string) preg_replace('/\s+/u', ' ', trim($chunk))
            );

            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $chunk;
        }

        return $out;
    }
}

View File

@@ -0,0 +1,35 @@
<?php
declare(strict_types=1);
namespace App\Knowledge;
/**
 * KeywordMapper
 *
 * Replaces a short prompt that exactly matches a known keyword with a
 * richer, comma-separated expansion before it is passed on to retrieval.
 *
 * This is a direct port of prompt_mapping.py.
 */
final class KeywordMapper
{
    /** @var array<string, string> lowercase keyword => expansion */
    private array $map = [
        'ki' => 'künstliche Intelligenz, AI, Projekte, Modelle, Agenten, ki',
        'shop' => 'Shopware, Onlineshop, Webshop, Commerce-System',
        'shops' => 'Shopware, Webshops, Verkaufsplattformen',
        'agentur' => 'Agentur, Firma, Unternehmen, mitho media',
        'api' => 'Schnittstelle, API, Anbindung, Integration',
        'plugin' => 'Shopware Plugin, Erweiterung, Modul, Funktion',
    ];

    /**
     * Returns the expansion for the trimmed, lowercased prompt when one is
     * registered; otherwise the prompt is returned untouched.
     */
    public function map(string $prompt): string
    {
        $lookup = mb_strtolower(trim($prompt));

        if (isset($this->map[$lookup])) {
            return $this->map[$lookup];
        }

        return $prompt;
    }
}

View File

@@ -0,0 +1,87 @@
<?php
declare(strict_types=1);
namespace App\Knowledge;
/**
 * KeywordSimilarity
 *
 * Deterministic and fault-tolerant comparison of two keywords.
 * Returns a similarity score between 0.0 and 1.0.
 *
 * Design goals:
 * - index.json remains unchanged
 * - comparison logic tolerates typos and similar-sounding spellings
 * - no alias or synonym lists
 * - no LLM dependency
 */
final class KeywordSimilarity
{
    /**
     * Compare a query token with an index keyword.
     *
     * Scores: 1.0 exact match after normalisation, 0.85 identical metaphone
     * key, 0.9 / 0.8 for an edit distance of 1 / 2 (only when both words
     * have at least six characters), otherwise 0.0.
     *
     * @param string $queryToken   Token from user input
     * @param string $indexKeyword Keyword from index.json
     *
     * @return float Similarity score (0.0 – 1.0)
     */
    public static function compare(string $queryToken, string $indexKeyword): float
    {
        $a = self::normalize($queryToken);
        $b = self::normalize($indexKeyword);

        // Guard: ignore empty or very short tokens
        if (mb_strlen($a) < 3 || mb_strlen($b) < 3) {
            return 0.0;
        }

        // 1. Exact match
        if ($a === $b) {
            return 1.0;
        }

        // 2. Phonetic comparison (metaphone)
        // metaphone() yields '' for tokens without ASCII letters (e.g. pure
        // numbers); two empty keys must not count as a phonetic match.
        $phoneticA = metaphone($a);
        if ($phoneticA !== '' && $phoneticA === metaphone($b)) {
            return 0.85;
        }

        // 3. Edit distance comparison (only for longer words)
        if (mb_strlen($a) >= 6 && mb_strlen($b) >= 6) {
            $distance = levenshtein($a, $b);

            if ($distance === 1) {
                return 0.9;
            }

            if ($distance === 2) {
                return 0.8;
            }
        }

        // No relevant match
        return 0.0;
    }

    /**
     * Normalize a keyword to ensure stable comparison: trim, lowercase,
     * strip everything but letters and digits, transliterate German umlauts.
     */
    private static function normalize(string $value): string
    {
        $value = mb_strtolower(trim($value));

        // Remove non-alphanumeric characters
        $value = preg_replace('/[^\p{L}\p{N}]/u', '', $value) ?? '';

        // Normalize German umlauts
        $map = [
            'ä' => 'ae',
            'ö' => 'oe',
            'ü' => 'ue',
            'ß' => 'ss',
        ];

        return strtr($value, $map);
    }
}

View File

@@ -0,0 +1,42 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use Psr\Cache\CacheItemPoolInterface;
/**
 * Caching decorator for a retriever: results are stored per
 * (normalised prompt, limit) pair in a PSR-6 cache pool.
 */
final class CachedRetriever implements RetrieverInterface
{
    public function __construct(
        private RetrieverInterface $inner,
        private CacheItemPoolInterface $cache,
        private int $ttlSeconds = 600 // 10 minutes
    ) {}

    /**
     * Serves the cached result when present; otherwise delegates to the
     * inner retriever and stores its result for $ttlSeconds.
     */
    public function retrieve(string $prompt, int $limit = 3): array
    {
        $cacheItem = $this->cache->getItem($this->buildCacheKey($prompt, $limit));

        if (!$cacheItem->isHit()) {
            $fresh = $this->inner->retrieve($prompt, $limit);

            $cacheItem->set($fresh);
            $cacheItem->expiresAfter($this->ttlSeconds);
            $this->cache->save($cacheItem);

            return $fresh;
        }

        return $cacheItem->get();
    }

    /**
     * Derives the cache key from the trimmed, lowercased,
     * whitespace-collapsed prompt plus the limit.
     */
    private function buildCacheKey(string $prompt, int $limit): string
    {
        $collapsed = preg_replace('/\s+/u', ' ', mb_strtolower(trim($prompt)));

        return 'rag_retrieval_' . sha1($collapsed . '|' . $limit);
    }
}

View File

@@ -0,0 +1,25 @@
<?php
// src/Knowledge/Retrieval/ChunkIndexLoader.php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
/**
 * Read-only access to the JSON chunk index file.
 */
final class ChunkIndexLoader
{
    public function __construct(
        private string $indexPath
    ) {}

    /**
     * Returns the decoded index, or [] when the file is missing, empty,
     * or does not decode to an array.
     */
    public function load(): array
    {
        if (!is_file($this->indexPath)) {
            return [];
        }

        $raw = file_get_contents($this->indexPath);
        if (!$raw) {
            return [];
        }

        $decoded = json_decode($raw, true);
        if (!is_array($decoded)) {
            return [];
        }

        return $decoded;
    }
}

View File

@@ -0,0 +1,269 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Retrieval;
use App\Knowledge\StopWords;
use App\Knowledge\VectorSearchChunked;
use App\Knowledge\KeywordSimilarity;
use App\Vector\VectorSearchClient;
/**
 * Retrieves knowledge chunks for a prompt in three stages:
 * keyword index lookup, vector search as a semantic supplement, and a
 * plain directory scan as the last fallback.
 */
final class ChunkKeywordRetriever implements RetrieverInterface
{
    private const MAX_KEYWORD_CANDIDATES = 200;
    private const VECTOR_SCORE_THRESHOLD = 0.65;
    private const VECTOR_TOP_K = 3;

    public function __construct(
        private VectorSearchChunked $chunkedSearch,
        private ChunkIndexLoader $indexLoader,
        private StopWords $stopWords,
        private VectorSearchClient $vectorClient,
        private string $chunksDir,
        private int $maxChunks = 3,
    ) {
    }

    /**
     * {@inheritdoc}
     *
     * @param int|null $limit Maximum number of chunks; null uses the configured default
     */
    public function retrieve(string $prompt, ?int $limit = null): array
    {
        $limit ??= $this->maxChunks;

        // ---------------------------------------------------------
        // 1) Prompt → search terms
        // ---------------------------------------------------------
        $queryTerms = $this->extractTerms($prompt);

        // ---------------------------------------------------------
        // 2) Keyword-based candidate discovery
        // ---------------------------------------------------------
        $result = $queryTerms !== []
            ? $this->findCandidateFiles($queryTerms)
            : ['files' => [], 'canonicalTerms' => []];

        $candidateScores = array_slice(
            $result['files'],
            0,
            self::MAX_KEYWORD_CANDIDATES,
            true
        );

        // Replace (possibly misspelled) query terms by the matched index keyword
        $effectiveTerms = array_map(
            static fn (string $term): string =>
                $result['canonicalTerms'][$term] ?? $term,
            $queryTerms
        );

        // ---------------------------------------------------------
        // 3) Keyword scoring
        // ---------------------------------------------------------
        $scored = $this->scoreCandidates($candidateScores, $effectiveTerms);

        // Early exit: keyword results are sufficient
        if (\count($scored) >= $limit) {
            return $this->finalize($scored, $limit);
        }

        // ---------------------------------------------------------
        // 4) Vector retrieval (semantic fallback)
        // ---------------------------------------------------------
        $scored = $this->mergeVectorHits(
            $scored,
            $this->vectorClient->search($prompt, self::VECTOR_TOP_K)
        );

        // ---------------------------------------------------------
        // 5) Final fallback
        // ---------------------------------------------------------
        if ($scored === []) {
            return $this->fallbackSearch($prompt, $limit);
        }

        return $this->finalize($scored, $limit);
    }

    // -------------------------------------------------------------
    // KEYWORD SCORING
    // -------------------------------------------------------------

    /**
     * Reads each candidate chunk and scores it against the search terms.
     *
     * @param array<string, float> $candidateScores file name => index similarity
     * @param string[]             $terms
     *
     * @return array<string, array{chunk: string, score: int}>
     */
    private function scoreCandidates(array $candidateScores, array $terms): array
    {
        $scored = [];

        foreach ($candidateScores as $file => $similarityScore) {
            $path = $this->chunksDir . '/' . $file;
            if (!is_file($path)) {
                continue;
            }

            $chunk = file_get_contents($path);
            if ($chunk === false || $chunk === '') {
                continue;
            }

            $score = $this->scoreChunk($chunk, $terms);
            if ($score === 0) {
                continue;
            }

            $scored[$file] = [
                'chunk' => trim($chunk),
                'score' => (int) round($score * $similarityScore),
            ];
        }

        return $scored;
    }

    // -------------------------------------------------------------
    // VECTOR MERGE
    // -------------------------------------------------------------

    /**
     * Adds vector hits above the score threshold to the scored set, or
     * boosts chunks that are already present.
     *
     * @param array<string, array{chunk: string, score: int}> $scored
     * @param array<mixed>                                    $vectorHits
     *
     * @return array<string, array{chunk: string, score: int}>
     */
    private function mergeVectorHits(array $scored, array $vectorHits): array
    {
        foreach ($vectorHits as $hit) {
            if (
                !\is_array($hit) ||
                !isset($hit['chunk_id'], $hit['score']) ||
                !is_numeric($hit['score']) ||
                $hit['score'] < self::VECTOR_SCORE_THRESHOLD
            ) {
                continue;
            }

            $file = $this->resolveChunkFile((string) $hit['chunk_id']);
            $path = $this->chunksDir . '/' . $file;

            if (!is_file($path)) {
                continue;
            }

            $vectorBoost = (int) round($hit['score'] * 10);
            if ($vectorBoost <= 0) {
                continue;
            }

            $chunk = $scored[$file]['chunk'] ?? null;
            if ($chunk === null) {
                $raw = file_get_contents($path);
                if ($raw === false || trim($raw) === '') {
                    continue;
                }
                $chunk = trim($raw);
            }

            $scored[$file] = [
                'chunk' => $chunk,
                'score' => ($scored[$file]['score'] ?? 0) + $vectorBoost,
            ];
        }

        return $scored;
    }

    /**
     * Maps a vector-search chunk ID to a chunk file name.
     *
     * IDs that already end in ".txt" are used as-is (appending the suffix
     * again would never resolve to an existing file); the suffix is only
     * added when missing. basename() keeps the lookup inside the chunks
     * directory.
     */
    private function resolveChunkFile(string $chunkId): string
    {
        $file = basename($chunkId);

        return str_ends_with($file, '.txt') ? $file : $file . '.txt';
    }

    // -------------------------------------------------------------
    // FINALIZATION
    // -------------------------------------------------------------

    /**
     * Sorts by score (descending), de-duplicates, and applies the limit.
     *
     * @return string[]
     */
    private function finalize(array $scored, int $limit): array
    {
        uasort($scored, static fn (array $a, array $b): int => $b['score'] <=> $a['score']);

        return array_slice(
            $this->normalizeResults(
                array_column($scored, 'chunk')
            ),
            0,
            $limit
        );
    }

    // -------------------------------------------------------------
    // INDEX LOGIC
    // -------------------------------------------------------------

    /**
     * Finds index entries with a keyword similar (score >= 0.8) to a term.
     *
     * @param string[] $terms
     *
     * @return array{files: array<string, float>, canonicalTerms: array<string, string>}
     */
    private function findCandidateFiles(array $terms): array
    {
        $index = $this->indexLoader->load();

        $files = [];
        $canonicalTerms = [];

        foreach ($index as $entry) {
            if (
                !\is_array($entry) ||
                !isset($entry['file'], $entry['keywords']) ||
                !\is_string($entry['file']) ||
                !\is_array($entry['keywords'])
            ) {
                continue;
            }

            foreach ($terms as $term) {
                foreach ($entry['keywords'] as $indexKeyword) {
                    if (!\is_string($indexKeyword)) {
                        continue;
                    }

                    $score = KeywordSimilarity::compare($term, $indexKeyword);

                    if ($score >= 0.8) {
                        $files[$entry['file']] = max(
                            $files[$entry['file']] ?? 0.0,
                            $score
                        );

                        $canonicalTerms[$term] = $indexKeyword;

                        // NOTE(review): this leaves the entry after its first
                        // matching term, so later terms are not compared
                        // against this entry — confirm that is intended.
                        break 2;
                    }
                }
            }
        }

        return [
            'files' => $files,
            'canonicalTerms' => $canonicalTerms,
        ];
    }

    // -------------------------------------------------------------
    // FALLBACK
    // -------------------------------------------------------------

    /**
     * Last resort: plain keyword scan over the chunk directory.
     *
     * @return string[]
     */
    private function fallbackSearch(string $prompt, int $limit): array
    {
        $chunkedText = trim($this->chunkedSearch->searchAsText($prompt));

        if ($chunkedText === '') {
            return [];
        }

        return array_slice(
            $this->normalizeResults($this->splitChunks($chunkedText)),
            0,
            $limit
        );
    }

    // -------------------------------------------------------------
    // SCORING
    // -------------------------------------------------------------

    /**
     * Counts how many non-stop-word terms occur in the chunk; terms of ten
     * or more characters count double.
     */
    private function scoreChunk(string $chunk, array $terms): int
    {
        $content = mb_strtolower($chunk);
        $stopWords = $this->stopWords->getStopWords();
        $score = 0;

        foreach ($terms as $term) {
            if (
                !\in_array($term, $stopWords, true) &&
                str_contains($content, $term)
            ) {
                $score += mb_strlen($term) >= 10 ? 2 : 1;
            }
        }

        return $score;
    }

    // -------------------------------------------------------------
    // UTIL
    // -------------------------------------------------------------

    /**
     * Lowercases the text, strips punctuation, and returns all words longer
     * than two characters. Splits on any whitespace run so that prompts
     * containing newlines or tabs do not produce merged terms.
     *
     * @return string[]
     */
    private function extractTerms(string $text): array
    {
        $clean = mb_strtolower(
            (string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text)
        );

        $words = preg_split('/\s+/u', $clean, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return array_values(array_filter(
            $words,
            static fn (string $w): bool => mb_strlen($w) > 2
        ));
    }

    /**
     * Splits concatenated fallback text on blank lines.
     *
     * @return string[]
     */
    private function splitChunks(string $text): array
    {
        return array_values(array_filter(
            array_map('trim', explode("\n\n", $text)),
            static fn (string $chunk): bool => $chunk !== ''
        ));
    }

    /**
     * Drops chunks that are identical after case-folding and whitespace
     * collapsing, keeping the first occurrence.
     *
     * @param string[] $chunks
     *
     * @return string[]
     */
    private function normalizeResults(array $chunks): array
    {
        $seen = [];
        $out = [];

        foreach ($chunks as $chunk) {
            $key = mb_strtolower((string) preg_replace('/\s+/u', ' ', $chunk));
            if (!isset($seen[$key])) {
                $seen[$key] = true;
                $out[] = $chunk;
            }
        }

        return $out;
    }
}

View File

@@ -0,0 +1,11 @@
<?php
namespace App\Knowledge\Retrieval;
interface RetrieverInterface
{
/**
* Retrieves knowledge chunks for the given prompt.
*
* @param string $prompt Prompt text to retrieve knowledge for
* @param int    $limit  Chunk limit requested by the caller (presumably an upper bound — confirm with implementations)
*
* @return string[] Plain text knowledge chunks
*/
public function retrieve(string $prompt, int $limit = 3): array;
}

1863
src/Knowledge/StopWords.php Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,121 @@
<?php
declare(strict_types=1);
namespace App\Knowledge;
use Psr\Log\LoggerInterface;
/**
 * VectorSearchChunked
 *
 * Chunk-based retrieval service for long-form knowledge documents.
 * This is a lightweight runtime reader for precomputed knowledge chunks:
 * it scans the chunk directory and returns the first files that contain
 * at least one keyword of the prompt.
 *
 * Design principles:
 * - No runtime indexing
 * - No ML dependencies
 * - Hard limit on the number of returned chunks to protect prompt size
 */
final class VectorSearchChunked
{
    /**
     * Directory containing chunked knowledge files
     * (made absolute in the constructor).
     */
    private string $dataDir = 'var/knowledge/chunks';

    /**
     * Maximum number of chunks to return.
     */
    private int $maxChunks = 3;

    public function __construct(
        private string $projectDir,
    )
    {
        $this->dataDir = $this->projectDir . '/' . $this->dataDir;
    }

    /**
     * Returns concatenated relevant chunks as plain text.
     *
     * @param string $prompt
     * @return string Matching chunks joined by a blank line; '' when the
     *                directory is missing, the prompt has no usable
     *                keyword, or nothing matches
     */
    public function searchAsText(string $prompt): string
    {
        if (!is_dir($this->dataDir)) {
            return '';
        }

        $keywords = $this->extractKeywords(mb_strtolower($prompt));

        if ($keywords === []) {
            return '';
        }

        // glob() returns false on error; treat that like "no files".
        $files = glob($this->dataDir . '/*.txt') ?: [];

        $matches = [];

        foreach ($files as $file) {
            $content = file_get_contents($file);
            if ($content === false) {
                continue;
            }

            if ($this->matchesKeywords(mb_strtolower($content), $keywords)) {
                $matches[] = trim($content);
            }

            if (count($matches) >= $this->maxChunks) {
                break;
            }
        }

        return implode("\n\n", $matches);
    }

    /**
     * Extracts simple keywords from the prompt: unique words of at least
     * four characters.
     *
     * This is a lightweight heuristic replacement for
     * full vector or embedding-based search.
     *
     * @return string[]
     */
    private function extractKeywords(string $prompt): array
    {
        $words = preg_split('/\W+/u', $prompt, -1, PREG_SPLIT_NO_EMPTY);

        if ($words === false) {
            return [];
        }

        $keywords = [];

        foreach ($words as $word) {
            if (mb_strlen($word) >= 4) {
                $keywords[] = $word;
            }
        }

        return array_values(array_unique($keywords));
    }

    /**
     * Checks whether the content matches at least one keyword.
     */
    private function matchesKeywords(string $content, array $keywords): bool
    {
        foreach ($keywords as $keyword) {
            if (str_contains($content, $keyword)) {
                return true;
            }
        }

        return false;
    }
}

View File

@@ -0,0 +1,55 @@
<?php
declare(strict_types=1);
namespace App\Vector;
use Psr\Log\LoggerInterface;
/**
 * Runs the vector_search.py script found in the vector directory and
 * returns its decoded JSON output.
 */
final class VectorSearchClient
{
    public function __construct(
        private readonly string $vectorDir,
        private LoggerInterface $agentLogger,
    ) {
    }

    /**
     * Executes the vector search for $query.
     *
     * Returns [] when the script is missing, exits non-zero, prints
     * nothing, or prints something that is not a JSON array/object.
     *
     * @param string $query Search text passed to the script
     * @param int    $limit Maximum number of hits requested (clamped to >= 1)
     *
     * @return array<mixed> Decoded script output
     */
    public function search(string $query, int $limit = 5): array
    {
        $baseDir = rtrim($this->vectorDir, '/');
        $script = $baseDir . '/vector_search.py';
        $this->agentLogger->info("Run vector search script $script");

        if (!is_file($script)) {
            return [];
        }

        // -------------------------------------------------
        // Determine Python interpreter (venv preferred)
        // -------------------------------------------------
        $venvPython = $baseDir . '/.venv/bin/python';
        $pythonBin = is_file($venvPython) ? $venvPython : 'python3';

        // stderr is deliberately NOT merged into stdout: warnings printed
        // by the Python libraries would otherwise corrupt the JSON payload.
        $cmd = sprintf(
            '%s %s %s %d',
            escapeshellarg($pythonBin),
            escapeshellarg($script),
            escapeshellarg($query),
            max(1, $limit)
        );

        $out = [];
        $exitCode = 0;
        exec($cmd, $out, $exitCode);

        if ($exitCode !== 0 || $out === []) {
            return [];
        }

        $json = implode("\n", $out);
        $this->agentLogger->info($json);

        try {
            $decoded = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException) {
            // Unparseable output is treated as "no results" (best effort).
            return [];
        }

        return \is_array($decoded) ? $decoded : [];
    }
}

View File

@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""Build the vector index for the knowledge chunks.

Command-line arguments: <vectorDir> <knowledgeDir>

Reads <knowledgeDir>/index.json, embeds every chunk file listed there
(looked up under <knowledgeDir>/chunks) and writes "vector.index" plus
"vector_meta.json" (the list of chunk file names) into <vectorDir>.

Exit codes: 2 missing arguments, 10 faiss missing, 11 sentence-transformers
missing, 20 index.json missing, 21 no chunk could be loaded.
"""
import sys
import json
from pathlib import Path

# ---------------------------------------------------------
# Argument handling
# ---------------------------------------------------------
if len(sys.argv) < 3:
    print("ERROR: Missing arguments (vectorDir, knowledgeDir)")
    sys.exit(2)

vector_dir = Path(sys.argv[1]).resolve()
knowledge_dir = Path(sys.argv[2]).resolve()

index_json = knowledge_dir / "index.json"
faiss_target = vector_dir / "vector.index"
meta_target = vector_dir / "vector_meta.json"

# ---------------------------------------------------------
# Dependency checks (the imports below also bind the names used later)
# ---------------------------------------------------------
try:
    import faiss
except Exception:
    print("ERROR: Python module 'faiss' not found.")
    sys.exit(10)

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    print("ERROR: Python module 'sentence-transformers' not found.")
    sys.exit(11)

# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
if not index_json.is_file():
    print(f"ERROR: index.json not found at {index_json}")
    sys.exit(20)

# ---------------------------------------------------------
# Load chunks from index.json
# ---------------------------------------------------------
records = json.loads(index_json.read_text(encoding="utf-8"))

chunks_root = knowledge_dir / "chunks"
corpus = []
chunk_ids = []

for record in records:
    if "file" in record:
        candidate = chunks_root / record["file"]
        if candidate.is_file():
            body = candidate.read_text(encoding="utf-8").strip()
            if body:
                corpus.append(body)
                chunk_ids.append(record["file"])

if not corpus:
    print("ERROR: No chunks loaded from index.json")
    sys.exit(21)

# ---------------------------------------------------------
# Build vector index
# ---------------------------------------------------------
encoder = SentenceTransformer("all-MiniLM-L6-v2")
vectors = encoder.encode(corpus, normalize_embeddings=True)

# Inner-product index over normalised embeddings
flat_index = faiss.IndexFlatIP(vectors.shape[1])
flat_index.add(vectors)
faiss.write_index(flat_index, str(faiss_target))

# Position i in this list corresponds to vector i in the index
meta_target.write_text(json.dumps(chunk_ids), encoding="utf-8")

print(f"Indexed {len(chunk_ids)} chunks.")

View File

@@ -0,0 +1,72 @@
#!/usr/bin/env python3
"""Query the vector index and print the hits as JSON.

Command-line arguments: <query> <limit>

Loads "vector.index" and "vector_meta.json" from the directory this script
lives in, embeds the query, and prints a JSON list of
{"chunk_id": ..., "score": ...} objects to stdout.

Exit codes: 2 missing arguments, 10 faiss missing, 11 sentence-transformers
missing, 20 index files missing.
"""
import sys
import json
from pathlib import Path

# ---------------------------------------------------------
# Argument handling
# ---------------------------------------------------------
if len(sys.argv) < 3:
    print("ERROR: Missing arguments (query, limit)")
    sys.exit(2)

query = sys.argv[1]
limit = int(sys.argv[2])

script_dir = Path(__file__).resolve().parent
faiss_file = script_dir / "vector.index"
meta_file = script_dir / "vector_meta.json"

# ---------------------------------------------------------
# Dependency checks (the imports below also bind the names used later)
# ---------------------------------------------------------
try:
    import faiss
except Exception:
    print("ERROR: Python module 'faiss' not found.")
    sys.exit(10)

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    print("ERROR: Python module 'sentence-transformers' not found.")
    sys.exit(11)

# ---------------------------------------------------------
# File checks
# ---------------------------------------------------------
if not (faiss_file.is_file() and meta_file.is_file()):
    print("ERROR: Vector index not found. Run vector ingest first.")
    sys.exit(20)

# ---------------------------------------------------------
# Load model and index
# ---------------------------------------------------------
encoder = SentenceTransformer("all-MiniLM-L6-v2")
query_vec = encoder.encode([query], normalize_embeddings=True)

flat_index = faiss.read_index(str(faiss_file))
chunk_ids = json.loads(meta_file.read_text(encoding="utf-8"))

# ---------------------------------------------------------
# Search
# ---------------------------------------------------------
scores, positions = flat_index.search(query_vec, limit)

# Positions equal to -1 are skipped
hits = [
    {"chunk_id": chunk_ids[position], "score": float(similarity)}
    for similarity, position in zip(scores[0], positions[0])
    if position != -1
]

print(json.dumps(hits))