first commit

2026-02-11 14:15:08 +01:00
parent a4742c2c38
commit aa7d362bc3
58 changed files with 9999 additions and 0 deletions
--- a/src/Knowledge/Ingest/SimpleChunker.php
+++ b/src/Knowledge/Ingest/SimpleChunker.php
@@ -0,0 +1,146 @@
+<?php
+// src/Knowledge/Ingest/SimpleChunker.php
+
+declare(strict_types=1);
+
+namespace App\Knowledge\Ingest;
+
+/**
+ * Splits plain text / markdown into overlapping, word-bounded chunks.
+ *
+ * The text is tokenised into alternating word and whitespace tokens. A window
+ * of at most `$maxWords` words slides over the words; where possible the end
+ * of a window is pulled back to a paragraph or sentence boundary. Consecutive
+ * chunks share up to `$overlapWords` words, and no word is ever skipped.
+ */
+final class SimpleChunker
+{
+    /**
+     * @param int $maxWords     Maximum number of words per chunk; values below 1 are raised to 1.
+     * @param int $overlapWords Words repeated at the start of the following chunk; clamped to
+     *                          the range 0..($maxWords - 1) so every chunk contributes new words.
+     */
+    public function __construct(
+        private int $maxWords = 180,
+        private int $overlapWords = 30
+    ) {
+        // An overlap >= maxWords (or a non-positive maxWords) would prevent the
+        // window from ever advancing, so clamp instead of looping forever.
+        $this->maxWords     = max(1, $maxWords);
+        $this->overlapWords = max(0, min($overlapWords, $this->maxWords - 1));
+    }
+
+    /**
+     * Split $text into chunks of at most `$maxWords` words.
+     *
+     * Chunks that are identical after case and whitespace folding are
+     * returned only once (first occurrence wins).
+     *
+     * @return string[] Trimmed, non-empty chunks in document order.
+     */
+    public function chunk(string $text): array
+    {
+        $text = $this->normalize($text);
+        if ($text === '') {
+            return [];
+        }
+
+        // Split into tokens: words + whitespace runs, both preserved.
+        $tokens = preg_split(
+            '/(\s+)/u',
+            $text,
+            -1,
+            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
+        );
+
+        if ($tokens === false || $tokens === []) {
+            return [];
+        }
+
+        // Word index → token index mapping.
+        $wordTokenIndexes = [];
+        foreach ($tokens as $i => $token) {
+            if (preg_match('/^\s+$/u', $token) !== 1) {
+                $wordTokenIndexes[] = $i;
+            }
+        }
+
+        $totalWords = count($wordTokenIndexes);
+        if ($totalWords === 0) {
+            return [];
+        }
+
+        $chunks  = [];
+        $wordPos = 0; // first word of the current window
+        $prevEnd = 0; // exclusive word index at which the previous chunk ended
+
+        while ($wordPos < $totalWords) {
+            $wordEnd = min($wordPos + $this->maxWords, $totalWords);
+
+            $tokenStart = $wordTokenIndexes[$wordPos];
+            $tokenEnd   = $wordTokenIndexes[$wordEnd - 1] + 1;
+
+            // The end of the text is already a natural boundary; pulling the
+            // last cut backwards would silently drop the tail of the document.
+            if ($wordEnd < $totalWords) {
+                // The cut must stay behind the first word that the previous
+                // chunk did not cover, otherwise this chunk would consist of
+                // overlap only.
+                $floor = max($tokenStart + 1, $wordTokenIndexes[$prevEnd]);
+
+                $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd, $floor);
+
+                // Re-sync the word cursor with the adjusted cut so the words
+                // behind it are picked up by the next chunk instead of lost.
+                while ($wordEnd > $wordPos + 1 && $wordTokenIndexes[$wordEnd - 1] >= $tokenEnd) {
+                    $wordEnd--;
+                }
+            }
+
+            $chunk = trim(implode('', array_slice(
+                $tokens,
+                $tokenStart,
+                $tokenEnd - $tokenStart
+            )));
+
+            if ($chunk !== '') {
+                $chunks[] = $chunk;
+            }
+
+            if ($wordEnd >= $totalWords) {
+                break;
+            }
+
+            $prevEnd = $wordEnd;
+            // Step back by the overlap, but always move the window forward.
+            $wordPos = max($wordPos + 1, $wordEnd - $this->overlapWords);
+        }
+
+        return $this->dedupe($chunks);
+    }
+
+    /**
+     * Canonicalise line endings and whitespace before tokenising.
+     *
+     * Invalid UTF-8 sequences are substituted first; otherwise the `/u`
+     * regexes used by this class fail and the whole input would be discarded.
+     */
+    private function normalize(string $text): string
+    {
+        if (!mb_check_encoding($text, 'UTF-8')) {
+            $text = mb_scrub($text, 'UTF-8');
+        }
+
+        $text = str_replace(["\r\n", "\r"], "\n", $text);
+        // Fall back to the unmodified text should PCRE report an error.
+        $text = preg_replace("/[ \t]+/u", " ", $text) ?? $text;
+        $text = preg_replace("/\n{3,}/u", "\n\n", $text) ?? $text;
+
+        return trim($text);
+    }
+
+    /**
+     * Move cut backwards to a natural boundary if possible.
+     * Rules:
+     * - A chunk that starts on a markdown list marker ("- ") is never shortened
+     * - Sentence end only if followed by a line break
+     * - Paragraph breaks (whitespace containing a blank line) always allowed
+     *
+     * @param string[] $tokens Alternating word / whitespace tokens.
+     * @param int      $start  Token index of the first word of the chunk.
+     * @param int      $end    Exclusive token index of the unadjusted cut.
+     * @param int|null $floor  Lowest token index that may act as the boundary;
+     *                         defaults to `$start + 1`.
+     *
+     * @return int Exclusive token index of the (possibly earlier) cut.
+     */
+    private function adjustCutToBoundary(array $tokens, int $start, int $end, ?int $floor = null): int
+    {
+        if ($this->startsWithListItem($tokens, $start)) {
+            // Keep list blocks intact
+            return $end;
+        }
+
+        $floor = max($floor ?? $start + 1, $start + 1);
+
+        for ($i = $end - 1; $i >= $floor; $i--) {
+            $token = $tokens[$i];
+
+            // Paragraph boundary: only whitespace tokens can contain newlines,
+            // so two of them mean a blank line (optionally padded with spaces).
+            if (substr_count($token, "\n") >= 2) {
+                return $i + 1;
+            }
+
+            // Sentence boundary only if followed by newline
+            if (
+                preg_match('/[.!?]\s*$/u', $token) === 1 &&
+                isset($tokens[$i + 1]) &&
+                str_contains($tokens[$i + 1], "\n")
+            ) {
+                return $i + 1;
+            }
+        }
+
+        return $end;
+    }
+
+    /**
+     * Whether the token at $start is a "-" list marker that opens a line.
+     *
+     * Tokens never contain whitespace, so the marker and the space behind it
+     * are two separate tokens and have to be checked individually.
+     *
+     * @param string[] $tokens
+     */
+    private function startsWithListItem(array $tokens, int $start): bool
+    {
+        if (($tokens[$start] ?? '') !== '-') {
+            return false;
+        }
+
+        $after = $tokens[$start + 1] ?? '';
+        if ($after === '' || str_contains($after, "\n")) {
+            return false;
+        }
+
+        return $start === 0 || str_contains($tokens[$start - 1], "\n");
+    }
+
+    /**
+     * Drop chunks that equal an earlier chunk once case and whitespace runs
+     * are folded; the first occurrence is kept unchanged.
+     *
+     * @param string[] $chunks
+     * @return string[]
+     */
+    private function dedupe(array $chunks): array
+    {
+        $seen = [];
+        $out  = [];
+
+        foreach ($chunks as $chunk) {
+            $trimmed = trim($chunk);
+            // preg_replace() yields null on a PCRE error; compare the raw text then.
+            $key = mb_strtolower(preg_replace('/\s+/u', ' ', $trimmed) ?? $trimmed);
+
+            if (isset($seen[$key])) {
+                continue;
+            }
+
+            $seen[$key] = true;
+            $out[] = $chunk;
+        }
+
+        return $out;
+    }
+}