first commit

2026-02-11 14:15:08 +01:00
parent a4742c2c38
commit aa7d362bc3
58 changed files with 9999 additions and 0 deletions
--- a/src/Knowledge/Ingest/ChunkIndexWriter.php
+++ b/src/Knowledge/Ingest/ChunkIndexWriter.php
@@ -0,0 +1,58 @@
+<?php
+// src/Knowledge/Ingest/ChunkIndexWriter.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Ingest;
+
+/**
+ * Maintains a JSON index of written chunks in a single file.
+ *
+ * The index is a flat list of entry arrays. Every add() reloads the file,
+ * appends one entry and rewrites the whole file.
+ */
+final class ChunkIndexWriter
+{
+    /**
+     * @param string $indexPath Path of the JSON index file (created on first add()).
+     */
+    public function __construct(
+        private string $indexPath
+    ) {}
+
+    /**
+     * Appends one entry to the index and persists it.
+     *
+     * @param array $entry Entry to append; hasSourceHash() reads its "source" and "sourceHash" keys.
+     * @throws \RuntimeException If the index cannot be encoded or written.
+     */
+    public function add(array $entry): void
+    {
+        $index = $this->load();
+        $index[] = $entry;
+        $this->save($index);
+    }
+
+    /**
+     * Reads the index from disk.
+     *
+     * A missing, unreadable, empty or non-array JSON file yields an empty index.
+     */
+    private function load(): array
+    {
+        if (!is_file($this->indexPath)) {
+            return [];
+        }
+
+        $json = file_get_contents($this->indexPath);
+        $data = $json ? json_decode($json, true) : null;
+
+        return is_array($data) ? $data : [];
+    }
+
+    /**
+     * Writes the complete index to disk, creating the parent directory if needed.
+     *
+     * @throws \RuntimeException If the directory cannot be created, or the index cannot be encoded or written.
+     */
+    private function save(array $index): void
+    {
+        $dir = dirname($this->indexPath);
+        // is_dir() is re-checked after a failed mkdir() because the directory may have appeared in between.
+        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
+            throw new \RuntimeException("Could not create directory: {$dir}");
+        }
+
+        // Encode before touching the file: a failed encode must never overwrite
+        // the existing index with an empty string.
+        try {
+            $json = json_encode(
+                $index,
+                JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
+            );
+        } catch (\JsonException $e) {
+            throw new \RuntimeException("Could not encode chunk index: {$e->getMessage()}", 0, $e);
+        }
+
+        if (file_put_contents($this->indexPath, $json) === false) {
+            throw new \RuntimeException("Could not write chunk index: {$this->indexPath}");
+        }
+    }
+
+    /**
+     * Tells whether the index already holds an entry with this source name and content hash.
+     */
+    public function hasSourceHash(string $source, string $hash): bool
+    {
+        foreach ($this->load() as $entry) {
+            if (
+                ($entry['source'] ?? null) === $source &&
+                ($entry['sourceHash'] ?? null) === $hash
+            ) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
--- a/src/Knowledge/Ingest/ChunkWriter.php
+++ b/src/Knowledge/Ingest/ChunkWriter.php
@@ -0,0 +1,149 @@
+<?php
+// src/Knowledge/Ingest/ChunkWriter.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Ingest;
+
+
+use App\Knowledge\StopWords;
+
+/**
+ * Writes text chunks to individual files and records each one in a JSON
+ * manifest and in the chunk index.
+ */
+final class ChunkWriter
+{
+    /** Words shorter than this many characters are not used as keywords. */
+    private const MIN_KEYWORD_LENGTH = 4;
+
+    /** Maximum number of keywords stored per chunk. */
+    private const MAX_KEYWORDS = 25;
+
+    /**
+     * @param string           $chunksDir    Directory that receives the chunk files.
+     * @param string           $manifestPath Path of the JSON manifest file.
+     * @param ChunkIndexWriter $indexWriter  Receives one index entry per written chunk.
+     * @param StopWords        $stopWords    Source of the words excluded from keywords.
+     */
+    public function __construct(
+        private string           $chunksDir,
+        private string           $manifestPath,
+        private ChunkIndexWriter $indexWriter,
+        private StopWords        $stopWords,
+    )
+    {
+    }
+
+    /**
+     * Writes every chunk to "<base>__<timestamp>__<index>.txt", prefixed with a
+     * source header line, and appends it to the manifest and the index.
+     *
+     * @param string[] $chunks
+     * @return string[] written filenames
+     * @throws \RuntimeException If a directory, a chunk file or the manifest cannot be written.
+     */
+    public function write(string $sourceName, array $chunks, string $sourceHash): array
+    {
+        $this->ensureDirectory($this->chunksDir);
+
+        $manifest = $this->loadManifest();
+        $written = [];
+
+        $base = $this->safeBase($sourceName);
+        $ts = date('Ymd_His');
+
+        foreach ($chunks as $i => $chunk) {
+            $filename = "{$base}__{$ts}__" . str_pad((string)$i, 4, '0', STR_PAD_LEFT) . ".txt";
+            $path = rtrim($this->chunksDir, '/') . '/' . $filename;
+
+            $header = $this->buildHeader(
+                source: $sourceName,
+                index: $i
+            );
+
+            // Stop here on failure so that a chunk that was never stored is not
+            // recorded in the manifest or the index.
+            if (file_put_contents($path, $header . "\n\n" . $chunk) === false) {
+                throw new \RuntimeException("Could not write chunk file: {$path}");
+            }
+
+            $written[] = $filename;
+            $chars = mb_strlen($chunk);
+
+            $manifest[] = [
+                'file' => $filename,
+                'source' => $sourceName,
+                'index' => $i,
+                'chars' => $chars,
+                'createdAt' => date('c'),
+            ];
+
+            $this->indexWriter->add([
+                'file' => $filename,
+                'source' => $sourceName,
+                'sourceHash' => $sourceHash,
+                'keywords' => $this->extractKeywords($chunk),
+                'chars' => $chars,
+            ]);
+        }
+
+        $this->saveManifest($manifest);
+        return $written;
+    }
+
+    /**
+     * Derives a filename-safe base from a source name: extension removed,
+     * lowercased, every run of characters outside [a-z0-9-_] replaced by "-".
+     */
+    private function safeBase(string $name): string
+    {
+        $name = pathinfo($name, PATHINFO_FILENAME);
+        $name = mb_strtolower($name);
+        $name = preg_replace('/[^a-z0-9\-_]+/u', '-', $name);
+        return trim((string)$name, '-');
+    }
+
+    /**
+     * Creates the directory (recursively) when it does not exist yet.
+     *
+     * @throws \RuntimeException If the directory cannot be created.
+     */
+    private function ensureDirectory(string $dir): void
+    {
+        // is_dir() is re-checked after a failed mkdir() because the directory may have appeared in between.
+        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
+            throw new \RuntimeException("Could not create directory: {$dir}");
+        }
+    }
+
+    /**
+     * Reads the manifest; a missing, unreadable, empty or non-array file yields an empty list.
+     */
+    private function loadManifest(): array
+    {
+        if (!is_file($this->manifestPath)) {
+            return [];
+        }
+        $json = file_get_contents($this->manifestPath);
+        $data = $json ? json_decode($json, true) : null;
+        return is_array($data) ? $data : [];
+    }
+
+    /**
+     * Writes the complete manifest to disk.
+     *
+     * @throws \RuntimeException If the manifest cannot be encoded or written.
+     */
+    private function saveManifest(array $manifest): void
+    {
+        $this->ensureDirectory(dirname($this->manifestPath));
+
+        // Encode first so that a failed encode cannot overwrite the existing manifest.
+        try {
+            $json = json_encode(
+                $manifest,
+                JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
+            );
+        } catch (\JsonException $e) {
+            throw new \RuntimeException("Could not encode manifest: {$e->getMessage()}", 0, $e);
+        }
+
+        if (file_put_contents($this->manifestPath, $json) === false) {
+            throw new \RuntimeException("Could not write manifest: {$this->manifestPath}");
+        }
+    }
+
+    /**
+     * Builds the header line placed at the top of a chunk file.
+     *
+     * @param int $index Zero-based chunk index; shown one-based in the header.
+     */
+    private function buildHeader(string $source, int $index): string
+    {
+        return sprintf(
+            '[Quelle: %s | Abschnitt: Chunk %d]',
+            $source,
+            $index + 1
+        );
+    }
+
+    /**
+     * Extracts up to MAX_KEYWORDS distinct keywords from a chunk, in order of
+     * first appearance.
+     *
+     * @return string[]
+     */
+    private function extractKeywords(string $text): array
+    {
+        // 1) Lowercase
+        $text = mb_strtolower($text);
+
+        // 2) Drop URLs so their fragments do not become keywords
+        $text = (string) preg_replace('#https?://\S+#u', ' ', $text);
+
+        // 3) Newlines and tabs become spaces
+        $text = str_replace(["\r", "\n", "\t"], ' ', $text);
+
+        // 4) Separators become spaces (not removed, so adjacent words stay apart)
+        $text = (string) preg_replace('/[\/\.\,\:\;\-\_\(\)\[\]\{\}]/u', ' ', $text);
+
+        // 5) Remove everything that is not a letter, digit or whitespace
+        $text = (string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text);
+
+        // 6) Normalise whitespace
+        $text = trim((string) preg_replace('/\s+/u', ' ', $text));
+
+        // Fetched once instead of once per word.
+        $stopWords = $this->stopWords->getStopWords() ?? [];
+
+        // 7) Filter and de-duplicate BEFORE applying the limit, so repeated
+        //    words do not use up keyword slots.
+        $seen = [];
+        $keywords = [];
+
+        foreach (explode(' ', $text) as $word) {
+            if (mb_strlen($word) < self::MIN_KEYWORD_LENGTH) {
+                continue;
+            }
+            if (isset($seen[$word]) || in_array($word, $stopWords, true)) {
+                continue;
+            }
+
+            $seen[$word] = true;
+            $keywords[] = $word;
+
+            if (count($keywords) >= self::MAX_KEYWORDS) {
+                break;
+            }
+        }
+
+        return $keywords;
+    }
+}
--- a/src/Knowledge/Ingest/DocumentLoader.php
+++ b/src/Knowledge/Ingest/DocumentLoader.php
@@ -0,0 +1,37 @@
+<?php
+// src/Knowledge/Ingest/DocumentLoader.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Ingest;
+
+/**
+ * Reads source documents from disk as raw text.
+ *
+ * Only the plain-text extensions "txt" and "md" are handled.
+ */
+final class DocumentLoader
+{
+    /**
+     * Returns the content of the document at $path.
+     *
+     * @throws \RuntimeException If the file does not exist, its extension is
+     *                           not supported, or it cannot be read.
+     */
+    public function load(string $path): string
+    {
+        if (is_file($path) === false) {
+            throw new \RuntimeException("File not found: {$path}");
+        }
+
+        // The extension is compared case-insensitively.
+        $extension = mb_strtolower(pathinfo($path, PATHINFO_EXTENSION));
+
+        // Planned for later: pdf, docx.
+        if ($extension === 'txt' || $extension === 'md') {
+            return $this->readPlainText($path);
+        }
+
+        throw new \RuntimeException("Unsupported file type: .{$extension}");
+    }
+
+    /**
+     * Reads the whole file into a string.
+     *
+     * @throws \RuntimeException If the file cannot be read.
+     */
+    private function readPlainText(string $path): string
+    {
+        $raw = file_get_contents($path);
+
+        if ($raw !== false) {
+            return $raw;
+        }
+
+        throw new \RuntimeException("Could not read file: {$path}");
+    }
+}
--- a/src/Knowledge/Ingest/KnowledgeIngestService.php
+++ b/src/Knowledge/Ingest/KnowledgeIngestService.php
@@ -0,0 +1,39 @@
+<?php
+// src/Knowledge/Ingest/KnowledgeIngestService.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Ingest;
+
+/**
+ * Runs the ingest pipeline for a single file: load, optionally tidy
+ * whitespace, skip if already indexed, chunk, write.
+ */
+final class KnowledgeIngestService
+{
+    public function __construct(
+        private DocumentLoader   $loader,
+        private SimpleChunker    $chunker,
+        private ChunkWriter      $writer,
+        private ChunkIndexWriter $indexWriter,
+    )
+    {
+    }
+
+    /**
+     * Ingests the file at $path.
+     *
+     * With $optimize enabled, runs of three or more newlines are collapsed to
+     * two and trailing spaces/tabs are stripped from every line before the
+     * content hash is computed.
+     *
+     * @return string[] written chunk filenames; empty when the index already
+     *                  holds this file name with the same content hash
+     */
+    public function ingestFile(string $path, bool $optimize = false): array
+    {
+        $text = $this->loader->load($path);
+
+        if ($optimize) {
+            $cleanups = [
+                "/\n{3,}/" => "\n\n",
+                "/[ \t]+$/m" => "",
+            ];
+            foreach ($cleanups as $pattern => $replacement) {
+                $text = preg_replace($pattern, $replacement, $text);
+            }
+        }
+
+        $hash = sha1($text);
+        $name = basename($path);
+
+        $alreadyIngested = $this->indexWriter->hasSourceHash($name, $hash);
+        if (!$alreadyIngested) {
+            return $this->writer->write($name, $this->chunker->chunk($text), $hash);
+        }
+
+        return [];
+    }
+}
--- a/src/Knowledge/Ingest/SimpleChunker.php
+++ b/src/Knowledge/Ingest/SimpleChunker.php
@@ -0,0 +1,146 @@
+<?php
+// src/Knowledge/Ingest/SimpleChunker.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Ingest;
+
+/**
+ * Splits text into overlapping, word-limited chunks, preferring to end a
+ * chunk at a paragraph break or at a sentence that ends a line.
+ */
+final class SimpleChunker
+{
+    /**
+     * @param int $maxWords     Maximum number of words per chunk (values below 1 are treated as 1).
+     * @param int $overlapWords Words repeated at the start of the next chunk
+     *                          (limited to the range 0 .. maxWords - 1).
+     */
+    public function __construct(
+        private int $maxWords = 180,
+        private int $overlapWords = 30
+    ) {}
+
+    /**
+     * Chunks the text. Every word of the normalised text ends up in at least
+     * one chunk; chunks that are identical apart from case and whitespace are
+     * returned only once.
+     *
+     * @return string[]
+     */
+    public function chunk(string $text): array
+    {
+        $text = $this->normalize($text);
+        if ($text === '') {
+            return [];
+        }
+
+        // Split into tokens: words + whitespace preserved
+        $tokens = preg_split(
+            '/(\s+)/u',
+            $text,
+            -1,
+            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
+        );
+
+        if (!$tokens) {
+            return [];
+        }
+
+        // Build word index → token index mapping
+        $wordTokenIndexes = [];
+        foreach ($tokens as $i => $token) {
+            if (!preg_match('/^\s+$/u', $token)) {
+                $wordTokenIndexes[] = $i;
+            }
+        }
+
+        $totalWords = count($wordTokenIndexes);
+        if ($totalWords === 0) {
+            return [];
+        }
+
+        // Keep the configuration in a range where every iteration advances:
+        // an overlap >= the window size would never make progress.
+        $maxWords = max(1, $this->maxWords);
+        $overlapWords = max(0, min($this->overlapWords, $maxWords - 1));
+
+        $chunks = [];
+        $wordPos = 0;
+
+        while ($wordPos < $totalWords) {
+            $wordEnd = min($wordPos + $maxWords, $totalWords);
+
+            $tokenStart = $wordTokenIndexes[$wordPos];
+            $tokenEnd   = $wordTokenIndexes[$wordEnd - 1] + 1;
+
+            // Intelligent cut (sentence / paragraph aware)
+            $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);
+
+            $chunk = trim(implode('', array_slice(
+                $tokens,
+                $tokenStart,
+                $tokenEnd - $tokenStart
+            )));
+
+            if ($chunk !== '') {
+                $chunks[] = $chunk;
+            }
+
+            // The boundary adjustment may have moved the cut backwards, so
+            // work out which word the chunk really ended before. The chunk
+            // always contains at least its first word.
+            $cutWord = $wordEnd;
+            while ($cutWord > $wordPos + 1 && $wordTokenIndexes[$cutWord - 1] >= $tokenEnd) {
+                $cutWord--;
+            }
+
+            if ($cutWord >= $totalWords) {
+                break;
+            }
+
+            if ($wordEnd >= $totalWords) {
+                // Last window, but the cut was moved back: emit the remainder
+                // as its own chunk instead of dropping it.
+                $wordPos = $cutWord;
+                continue;
+            }
+
+            // Usual step: re-read $overlapWords words, but never start beyond
+            // the actual cut, otherwise the words in between would be lost.
+            $wordPos = min($wordEnd - $overlapWords, $cutWord);
+        }
+
+        return $this->dedupe($chunks);
+    }
+
+    /**
+     * Normalises line endings to "\n", collapses runs of spaces/tabs to one
+     * space and runs of three or more newlines to two, then trims.
+     */
+    private function normalize(string $text): string
+    {
+        $text = str_replace(["\r\n", "\r"], "\n", $text);
+        $text = preg_replace("/[ \t]+/u", " ", $text);
+        $text = preg_replace("/\n{3,}/u", "\n\n", $text);
+
+        return trim((string) $text);
+    }
+
+    /**
+     * Move cut backwards to a natural boundary if possible.
+     * Rules:
+     * - A chunk that starts with a markdown list marker ("- ") is not shortened
+     * - Sentence end only if followed by a line break
+     * - Paragraph breaks always allowed
+     *
+     * @param string[] $tokens Alternating word and whitespace tokens.
+     * @param int      $start  Index of the first token of the chunk.
+     * @param int      $end    Exclusive end index proposed for the chunk.
+     * @return int Exclusive end index to use; always greater than $start.
+     */
+    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
+    {
+        // Detect markdown list context (e.g. "- Foo: Bar"). Word tokens never
+        // contain whitespace, so the marker is a bare "-" token followed by a
+        // whitespace token that begins with a space.
+        if (
+            ($tokens[$start] ?? '') === '-' &&
+            isset($tokens[$start + 1]) &&
+            str_starts_with($tokens[$start + 1], ' ')
+        ) {
+            // Keep list blocks intact
+            return $end;
+        }
+
+        for ($i = $end - 1; $i > $start; $i--) {
+
+            // Paragraph boundary
+            if ($tokens[$i] === "\n\n") {
+                return $i + 1;
+            }
+
+            // Sentence boundary only if followed by newline
+            if (
+                preg_match('/[.!?]\s*$/u', $tokens[$i]) &&
+                isset($tokens[$i + 1]) &&
+                str_contains($tokens[$i + 1], "\n")
+            ) {
+                return $i + 1;
+            }
+        }
+
+        return $end;
+    }
+
+    /**
+     * Removes chunks that equal an earlier chunk when compared
+     * case-insensitively with whitespace runs collapsed; the first occurrence
+     * is kept unchanged.
+     *
+     * @param string[] $chunks
+     * @return string[]
+     */
+    private function dedupe(array $chunks): array
+    {
+        $seen = [];
+        $out  = [];
+
+        foreach ($chunks as $chunk) {
+            $key = mb_strtolower(
+                (string) preg_replace('/\s+/u', ' ', trim($chunk))
+            );
+
+            if (isset($seen[$key])) {
+                continue;
+            }
+
+            $seen[$key] = true;
+            $out[] = $chunk;
+        }
+
+        return $out;
+    }
+}