new ingest and profile settings

2026-02-16 14:38:02 +01:00
parent ece93e4cb4
commit 8666b05570
15 changed files with 655 additions and 199 deletions
--- a/src/Knowledge/Ingest/SimpleChunker.php
+++ b/src/Knowledge/Ingest/SimpleChunker.php
@@ -5,22 +5,31 @@ declare(strict_types=1);

 namespace App\Knowledge\Ingest;

+use App\Index\IndexConfigurationProvider;
+
 final class SimpleChunker
 {
+    private IndexConfigurationProvider $configurationProvider;
+
    public function __construct(
-        private int $maxWords = 180,
-        private int $overlapWords = 30
-    ) {}
+        IndexConfigurationProvider $configurationProvider
+    ) {
+        $this->configurationProvider = $configurationProvider;
+    }

    /** @return string[] */
    public function chunk(string $text): array
    {
+        $config = $this->configurationProvider->getConfiguration();
+
+        $maxWords = $config->getChunkSize();
+        $overlapWords = $config->getChunkOverlap();
+
        $text = $this->normalize($text);
        if ($text === '') {
            return [];
        }

-        // Split into tokens: words + whitespace preserved
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
@@ -32,7 +41,6 @@ final class SimpleChunker
            return [];
        }

-        // Build word index → token index mapping
        $wordTokenIndexes = [];
        foreach ($tokens as $i => $token) {
            if (!preg_match('/^\s+$/u', $token)) {
@@ -49,12 +57,11 @@ final class SimpleChunker
        $wordPos = 0;

        while ($wordPos < $totalWords) {
-            $wordEnd = min($wordPos + $this->maxWords, $totalWords);
+            $wordEnd = min($wordPos + $maxWords, $totalWords);

            $tokenStart = $wordTokenIndexes[$wordPos];
            $tokenEnd   = $wordTokenIndexes[$wordEnd - 1] + 1;

-            // Intelligent cut (sentence / paragraph aware)
            $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);

            $chunk = trim(implode('', array_slice(
@@ -71,7 +78,7 @@ final class SimpleChunker
                break;
            }

-            $wordPos = max(0, $wordEnd - $this->overlapWords);
+            $wordPos = max(0, $wordEnd - $overlapWords);
        }

        return $this->dedupe($chunks);
@@ -86,30 +93,19 @@ final class SimpleChunker
        return trim((string) $text);
    }

-    /**
-     * Move cut backwards to a natural boundary if possible.
-     * Rules:
-     * - Never cut inside markdown list items
-     * - Sentence end only if followed by a line break
-     * - Paragraph breaks always allowed
-     */
    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
    {
-        // Detect markdown list context (e.g. "- Foo: Bar")
        $startToken = $tokens[$start] ?? '';
        if (preg_match('/^- /u', ltrim($startToken))) {
-            // Keep list blocks intact
            return $end;
        }

        for ($i = $end - 1; $i > $start; $i--) {

-            // Paragraph boundary
            if ($tokens[$i] === "\n\n") {
                return $i + 1;
            }

-            // Sentence boundary only if followed by newline
            if (
                preg_match('/[.!?]\s*$/u', $tokens[$i]) &&
                isset($tokens[$i + 1]) &&