first commit

This commit is contained in:
team 1
2026-02-11 14:15:08 +01:00
parent a4742c2c38
commit aa7d362bc3
58 changed files with 9999 additions and 0 deletions

View File

@@ -0,0 +1,58 @@
<?php
// src/Knowledge/Ingest/ChunkIndexWriter.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
/**
 * Maintains a JSON index file that holds one entry per written chunk.
 *
 * The complete index is read from disk and rewritten on every add() call.
 */
final class ChunkIndexWriter
{
    /**
     * @param string $indexPath Path of the JSON index file; it is created on the first write.
     */
    public function __construct(
        private string $indexPath
    ) {}

    /**
     * Appends one entry to the index and persists the whole index.
     *
     * @param array $entry Entry to append; stored as given.
     *
     * @throws \RuntimeException If the index cannot be encoded or written.
     */
    public function add(array $entry): void
    {
        $index = $this->load();
        $index[] = $entry;
        $this->save($index);
    }

    /**
     * Reads the index from disk.
     *
     * A missing, unreadable, empty or non-array index file is treated as an empty index.
     *
     * @return array Decoded index entries.
     */
    private function load(): array
    {
        if (!is_file($this->indexPath)) {
            return [];
        }

        $json = file_get_contents($this->indexPath);
        $data = $json ? json_decode($json, true) : null;

        return is_array($data) ? $data : [];
    }

    /**
     * Encodes the index and writes it to the index file, creating the directory if needed.
     *
     * @param array $index All index entries.
     *
     * @throws \RuntimeException If encoding, directory creation or the write fails.
     */
    private function save(array $index): void
    {
        // Encode before touching the file: a failed json_encode() used to yield `false`,
        // which was then written as an empty file and wiped the existing index.
        try {
            $json = json_encode(
                $index,
                JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
            );
        } catch (\JsonException $e) {
            throw new \RuntimeException(
                "Could not encode chunk index: {$e->getMessage()}",
                0,
                $e
            );
        }

        $dir = dirname($this->indexPath);
        // The trailing is_dir() covers the case where the directory appeared between check and mkdir().
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException("Could not create index directory: {$dir}");
        }

        if (file_put_contents($this->indexPath, $json) === false) {
            throw new \RuntimeException("Could not write chunk index: {$this->indexPath}");
        }
    }

    /**
     * Tells whether the index already holds an entry for this source with this hash.
     *
     * @param string $source Value compared against each entry's 'source' key.
     * @param string $hash   Value compared against each entry's 'sourceHash' key.
     *
     * @return bool True if at least one entry matches both values.
     */
    public function hasSourceHash(string $source, string $hash): bool
    {
        foreach ($this->load() as $entry) {
            if (
                ($entry['source'] ?? null) === $source &&
                ($entry['sourceHash'] ?? null) === $hash
            ) {
                return true;
            }
        }

        return false;
    }
}

View File

@@ -0,0 +1,149 @@
<?php
// src/Knowledge/Ingest/ChunkWriter.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
use App\Knowledge\StopWords;
/**
 * Writes text chunks to individual files and records each one in a manifest
 * file and in the chunk index.
 */
final class ChunkWriter
{
    /** Maximum number of keywords stored per chunk. */
    private const MAX_KEYWORDS = 25;

    /** Words shorter than this many characters are never keywords. */
    private const MIN_KEYWORD_LENGTH = 4;

    /**
     * @param string           $chunksDir    Directory that receives the chunk files.
     * @param string           $manifestPath Path of the JSON manifest file.
     * @param ChunkIndexWriter $indexWriter  Receives one index entry per chunk.
     * @param StopWords        $stopWords    Provides the words excluded from keywords.
     */
    public function __construct(
        private string $chunksDir,
        private string $manifestPath,
        private ChunkIndexWriter $indexWriter,
        private StopWords $stopWords,
    )
    {
    }

    /**
     * Writes every chunk to its own file and registers it in manifest and index.
     *
     * @param string   $sourceName Name of the source document.
     * @param string[] $chunks     Chunk texts; the array key is used as the chunk index.
     * @param string   $sourceHash Hash of the source text, stored in each index entry.
     *
     * @return string[] written filenames
     *
     * @throws \RuntimeException If a directory, chunk file or the manifest cannot be written.
     */
    public function write(string $sourceName, array $chunks, string $sourceHash): array
    {
        $this->ensureDirectory($this->chunksDir);

        $manifest = $this->loadManifest();
        $written = [];
        $base = $this->safeBase($sourceName);
        // One timestamp per call so all chunks of a run share the same filename prefix.
        $ts = date('Ymd_His');

        foreach ($chunks as $i => $chunk) {
            $filename = "{$base}__{$ts}__" . str_pad((string)$i, 4, '0', STR_PAD_LEFT) . ".txt";
            $path = rtrim($this->chunksDir, '/') . '/' . $filename;
            $header = $this->buildHeader(
                source: $sourceName,
                index: $i
            );

            if (file_put_contents($path, $header . "\n\n" . $chunk) === false) {
                throw new \RuntimeException("Could not write chunk file: {$path}");
            }

            $chars = mb_strlen($chunk);
            $written[] = $filename;
            $manifest[] = [
                'file' => $filename,
                'source' => $sourceName,
                'index' => $i,
                'chars' => $chars,
                'createdAt' => date('c'),
            ];
            $this->indexWriter->add([
                'file' => $filename,
                'source' => $sourceName,
                'sourceHash' => $sourceHash,
                'keywords' => $this->extractKeywords($chunk),
                'chars' => $chars,
            ]);
        }

        $this->saveManifest($manifest);

        return $written;
    }

    /**
     * Creates the directory (recursively) unless it already exists.
     *
     * @throws \RuntimeException If the directory cannot be created.
     */
    private function ensureDirectory(string $dir): void
    {
        // The trailing is_dir() covers the case where the directory appeared between check and mkdir().
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException("Could not create directory: {$dir}");
        }
    }

    /**
     * Derives a filename-safe base from a source name: extension dropped, lowercased,
     * every run of characters outside a-z, 0-9, '-' and '_' replaced by a single '-'.
     */
    private function safeBase(string $name): string
    {
        $name = pathinfo($name, PATHINFO_FILENAME);
        $name = mb_strtolower($name);
        $name = preg_replace('/[^a-z0-9\-_]+/u', '-', $name);

        return trim((string)$name, '-');
    }

    /**
     * Reads the manifest; a missing, empty or non-array manifest counts as empty.
     *
     * @return array Decoded manifest entries.
     */
    private function loadManifest(): array
    {
        if (!is_file($this->manifestPath)) {
            return [];
        }

        $json = file_get_contents($this->manifestPath);
        $data = $json ? json_decode($json, true) : null;

        return is_array($data) ? $data : [];
    }

    /**
     * Writes the manifest as pretty-printed JSON, creating its directory if needed.
     *
     * @throws \RuntimeException If the manifest cannot be encoded or written.
     */
    private function saveManifest(array $manifest): void
    {
        // Encode first so that an encoding failure never truncates the existing manifest.
        try {
            $json = json_encode(
                $manifest,
                JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
            );
        } catch (\JsonException $e) {
            throw new \RuntimeException(
                "Could not encode manifest: {$e->getMessage()}",
                0,
                $e
            );
        }

        $this->ensureDirectory(dirname($this->manifestPath));

        if (file_put_contents($this->manifestPath, $json) === false) {
            throw new \RuntimeException("Could not write manifest: {$this->manifestPath}");
        }
    }

    /**
     * Builds the header line placed at the top of each chunk file.
     *
     * @param int $index Zero-based chunk index; printed one-based.
     */
    private function buildHeader(string $source, int $index): string
    {
        return sprintf(
            '[Quelle: %s | Abschnitt: Chunk %d]',
            $source,
            $index + 1
        );
    }

    /**
     * Extracts up to MAX_KEYWORDS distinct keywords from a chunk, in order of first appearance.
     *
     * @return string[] Lowercased keywords without stop words.
     */
    private function extractKeywords(string $text): array
    {
        // 1) Lowercase
        $text = mb_strtolower($text);
        // 2) Remove URLs. preg_replace() returns null on a PCRE error (e.g. invalid UTF-8);
        //    the casts turn that into "no keywords" instead of a TypeError under strict types.
        $text = (string) preg_replace('#https?://\S+#u', ' ', $text);
        // 3) Newlines and tabs become spaces
        $text = str_replace(["\r", "\n", "\t"], ' ', $text);
        // 4) Separators become spaces (not removed, so adjacent words stay apart)
        $text = (string) preg_replace('/[\/\.\,\:\;\-\_\(\)\[\]\{\}]/u', ' ', $text);
        // 5) Drop everything that is not a letter, digit or whitespace
        $text = (string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text);
        // 6) Normalize whitespace
        $text = (string) preg_replace('/\s+/u', ' ', $text);
        $text = trim($text);

        // Build the stop-word lookup once per call instead of once per word.
        // Only string entries can ever equal a word under strict comparison.
        $stopLookup = array_flip(
            array_filter($this->stopWords->getStopWords() ?? [], 'is_string')
        );

        // 7) Filter and de-duplicate BEFORE applying the limit, so that repeated
        //    words do not use up keyword slots.
        $keywords = [];
        foreach (explode(' ', $text) as $word) {
            if (mb_strlen($word) < self::MIN_KEYWORD_LENGTH) {
                continue;
            }
            if (isset($stopLookup[$word]) || isset($keywords[$word])) {
                continue;
            }
            $keywords[$word] = true;
            if (count($keywords) >= self::MAX_KEYWORDS) {
                break;
            }
        }

        // Numeric-looking words become integer array keys; convert them back to strings.
        return array_map('strval', array_keys($keywords));
    }
}

View File

@@ -0,0 +1,37 @@
<?php
// src/Knowledge/Ingest/DocumentLoader.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
/**
 * Reads a document from disk and returns its text content.
 */
final class DocumentLoader
{
    /** File extensions (lowercase) that are read as plain text. */
    private const TEXT_EXTENSIONS = ['txt', 'md'];

    /**
     * Loads the file at the given path as text.
     *
     * The extension is compared case-insensitively. Only plain-text types are
     * supported so far; PDF and DOCX loaders are planned for later.
     *
     * @param string $path Path of the file to load.
     *
     * @return string The file content.
     *
     * @throws \RuntimeException If the path is not a file, the type is unsupported
     *                           or the file cannot be read.
     */
    public function load(string $path): string
    {
        if (is_file($path) === false) {
            throw new \RuntimeException("File not found: {$path}");
        }

        $extension = mb_strtolower(pathinfo($path, PATHINFO_EXTENSION));

        if (!in_array($extension, self::TEXT_EXTENSIONS, true)) {
            throw new \RuntimeException("Unsupported file type: .{$extension}");
        }

        return $this->readPlainText($path);
    }

    /**
     * Returns the raw content of a text file.
     *
     * @throws \RuntimeException If reading fails.
     */
    private function readPlainText(string $file): string
    {
        $raw = file_get_contents($file);

        if ($raw !== false) {
            return $raw;
        }

        throw new \RuntimeException("Could not read file: {$file}");
    }
}

View File

@@ -0,0 +1,39 @@
<?php
// src/Knowledge/Ingest/KnowledgeIngestService.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
/**
 * Ingests a single document: load, optionally tidy whitespace, chunk, and write.
 */
final class KnowledgeIngestService
{
    public function __construct(
        private DocumentLoader $loader,
        private SimpleChunker $chunker,
        private ChunkWriter $writer,
        private ChunkIndexWriter $indexWriter,
    )
    {
    }

    /**
     * Ingests the file at the given path.
     *
     * The source hash is the SHA-1 of the text after the optional whitespace
     * clean-up, and the source name is the basename of the path. If the index
     * already holds that name/hash pair, nothing is chunked or written.
     *
     * @param string $path     Path of the document to ingest.
     * @param bool   $optimize When true, runs of three or more newlines are collapsed
     *                         to two, then trailing spaces/tabs are stripped per line.
     *
     * @return string[] written chunk filenames
     */
    public function ingestFile(string $path, bool $optimize = false): array
    {
        $content = $this->loader->load($path);

        if ($optimize === true) {
            $content = preg_replace("/\n{3,}/", "\n\n", $content);
            $content = preg_replace("/[ \t]+$/m", "", $content);
        }

        $name = basename($path);
        $hash = sha1($content);

        // Chunking only happens when this exact content has not been indexed yet.
        return $this->indexWriter->hasSourceHash($name, $hash)
            ? []
            : $this->writer->write($name, $this->chunker->chunk($content), $hash);
    }
}

View File

@@ -0,0 +1,146 @@
<?php
// src/Knowledge/Ingest/SimpleChunker.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
/**
 * Splits text into overlapping, word-limited chunks, preferring to end a chunk
 * at a paragraph break or at a sentence that ends a line.
 */
final class SimpleChunker
{
    /**
     * @param int $maxWords     Maximum number of words per chunk (values below 1 are treated as 1).
     * @param int $overlapWords Words repeated at the start of the next chunk when a chunk was cut
     *                          at the word limit (clamped to the range 0..maxWords-1).
     */
    public function __construct(
        private int $maxWords = 180,
        private int $overlapWords = 30
    ) {}

    /**
     * Splits the text into chunks.
     *
     * Every word of the normalized text ends up in at least one chunk. A chunk that
     * ends on a natural boundary is followed by a chunk starting right after that
     * boundary; a chunk cut at the word limit is followed by one that repeats the
     * last overlapWords words. Chunks that are identical after case and whitespace
     * normalization are returned only once.
     *
     * @return string[]
     */
    public function chunk(string $text): array
    {
        $text = $this->normalize($text);
        if ($text === '') {
            return [];
        }

        // Split into tokens: words + whitespace preserved
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        if (!$tokens) {
            return [];
        }

        // Build word index → token index mapping
        $wordTokenIndexes = [];
        foreach ($tokens as $i => $token) {
            if (!preg_match('/^\s+$/u', $token)) {
                $wordTokenIndexes[] = $i;
            }
        }
        $totalWords = count($wordTokenIndexes);
        if ($totalWords === 0) {
            return [];
        }

        // Clamp the configuration so each window holds at least one word and the
        // cursor always advances; overlap >= maxWords would otherwise loop forever.
        $maxWords = max(1, $this->maxWords);
        $overlapWords = max(0, min($this->overlapWords, $maxWords - 1));

        $chunks = [];
        $wordPos = 0;
        while ($wordPos < $totalWords) {
            $windowEnd = min($wordPos + $maxWords, $totalWords);
            $reachesEnd = $windowEnd >= $totalWords;
            $wordEnd = $windowEnd;
            $tokenStart = $wordTokenIndexes[$wordPos];
            $tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;

            // The end of the text is itself a natural boundary: the final window is
            // never cut back, otherwise its tail would be dropped.
            if (!$reachesEnd) {
                // Intelligent cut (sentence / paragraph aware)
                $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);

                // Pull the word cursor back to the words actually inside the chunk so
                // the next chunk continues from the real cut, not the planned one.
                while ($wordEnd > $wordPos + 1 && $wordTokenIndexes[$wordEnd - 1] >= $tokenEnd) {
                    $wordEnd--;
                }
            }

            $chunk = trim(implode('', array_slice(
                $tokens,
                $tokenStart,
                $tokenEnd - $tokenStart
            )));
            if ($chunk !== '') {
                $chunks[] = $chunk;
            }

            if ($reachesEnd) {
                break;
            }

            // After a boundary cut, continue right behind the boundary; after a hard
            // cut at the word limit, step back by the overlap.
            $next = $wordEnd < $windowEnd ? $wordEnd : $wordEnd - $overlapWords;
            $wordPos = max($wordPos + 1, $next);
        }

        return $this->dedupe($chunks);
    }

    /**
     * Normalizes line endings to "\n", collapses runs of spaces/tabs to one space,
     * collapses three or more consecutive newlines to two, and trims the result.
     */
    private function normalize(string $text): string
    {
        $text = str_replace(["\r\n", "\r"], "\n", $text);
        $text = preg_replace("/[ \t]+/u", " ", $text);
        $text = preg_replace("/\n{3,}/u", "\n\n", $text);

        return trim((string) $text);
    }

    /**
     * Move cut backwards to a natural boundary if possible.
     *
     * Scans from the proposed end back towards the start and returns the position
     * right after the first boundary found:
     * - a whitespace token containing at least two newlines (paragraph break)
     * - a word ending in '.', '!' or '?' whose following token contains a newline
     *
     * @param array $tokens Alternating word and whitespace tokens.
     * @param int   $start  Token index of the first word of the chunk.
     * @param int   $end    Proposed exclusive end token index.
     *
     * @return int Exclusive end token index, greater than $start and at most $end.
     */
    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
    {
        // Detect markdown list context (e.g. "- Foo: Bar") and keep the cut unchanged.
        // NOTE(review): as called from chunk(), $start always points at a word token,
        // which contains no whitespace, so the pattern "- " cannot match here and this
        // guard looks unreachable — confirm the intended list handling before changing it.
        $startToken = $tokens[$start] ?? '';
        if (preg_match('/^- /u', ltrim($startToken))) {
            return $end;
        }

        for ($i = $end - 1; $i > $start; $i--) {
            // Paragraph boundary. Counting newlines also catches tokens such as
            // "\n \n", which normalize() leaves behind for whitespace-only lines.
            if (substr_count($tokens[$i], "\n") >= 2) {
                return $i + 1;
            }
            // Sentence boundary only if followed by newline
            if (
                preg_match('/[.!?]\s*$/u', $tokens[$i]) &&
                isset($tokens[$i + 1]) &&
                str_contains($tokens[$i + 1], "\n")
            ) {
                return $i + 1;
            }
        }

        return $end;
    }

    /**
     * Removes chunks that equal an earlier chunk when compared case-insensitively
     * with all whitespace runs collapsed; the first occurrence is kept.
     *
     * @param string[] $chunks
     * @return string[]
     */
    private function dedupe(array $chunks): array
    {
        $seen = [];
        $out = [];
        foreach ($chunks as $chunk) {
            $key = mb_strtolower(
                preg_replace('/\s+/u', ' ', trim($chunk))
            );
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
            $out[] = $chunk;
        }

        return $out;
    }
}