MtoRagSystem/src/Knowledge/Ingest/SimpleChunker.php

<?php
// src/Knowledge/Ingest/SimpleChunker.php

declare(strict_types=1);

namespace App\Knowledge\Ingest;

use App\Index\IndexConfigurationProvider;
use App\Knowledge\Text\TextNormalizer;

/**
 * Splits normalized text into chunks bounded by a configurable word count.
 *
 * Paragraphs (separated by two or more newlines) are packed greedily into
 * chunks. A paragraph that on its own exceeds the word limit is split by a
 * word-based fallback that supports overlapping windows and tries to cut at
 * sentence boundaries.
 */
final readonly class SimpleChunker
{
    public function __construct(
        private IndexConfigurationProvider $configurationProvider,
        private TextNormalizer $textNormalizer
    ) {}

    /**
     * Chunks the given text.
     *
     * The chunk size and overlap are read from the index configuration; the
     * size is clamped to at least 1 word and the overlap to at least 0 words.
     * The overlap is only applied inside the word-based fallback, not between
     * paragraph-packed chunks.
     *
     * @return string[] Chunks in document order, with duplicates removed.
     *                  Empty when the normalized text is empty or cannot be split.
     */
    public function chunk(string $text): array
    {
        $config = $this->configurationProvider->getConfiguration();

        $maxWords     = max(1, $config->getChunkSize());
        $overlapWords = max(0, $config->getChunkOverlap());

        $text = $this->textNormalizer->normalize($text);
        if ($text === '') {
            return [];
        }

        // Paragraph-based pre-structuring.
        $paragraphs = preg_split('/\n{2,}/u', $text);
        if (!$paragraphs) {
            return [];
        }

        $chunks = [];
        $currentChunk = '';
        $currentWordCount = 0;

        foreach ($paragraphs as $paragraph) {
            $paragraph = trim($paragraph);
            if ($paragraph === '') {
                continue;
            }

            $paragraphWordCount = $this->countWords($paragraph);

            // Paragraph larger than maxWords: use the word-based fallback.
            if ($paragraphWordCount > $maxWords) {
                // Flush what has been collected so far to keep document order.
                if ($currentChunk !== '') {
                    $chunks[] = trim($currentChunk);
                    $currentChunk = '';
                    $currentWordCount = 0;
                }

                foreach ($this->chunkByWords($paragraph, $maxWords, $overlapWords) as $subChunk) {
                    $chunks[] = $subChunk;
                }

                continue;
            }

            // Paragraph fits into the current chunk.
            if ($currentWordCount + $paragraphWordCount <= $maxWords) {
                $currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
                $currentWordCount += $paragraphWordCount;
                continue;
            }

            // Paragraph does not fit: flush and start a new chunk with it.
            if ($currentChunk !== '') {
                $chunks[] = trim($currentChunk);
            }

            $currentChunk = $paragraph;
            $currentWordCount = $paragraphWordCount;
        }

        if ($currentChunk !== '') {
            $chunks[] = trim($currentChunk);
        }

        return $this->dedupe($chunks);
    }

    // ======================================================
    // Word-based fallback
    // ======================================================

    /**
     * Splits text into windows of at most $maxWords words.
     *
     * Consecutive windows share up to $overlapWords words. Original whitespace
     * inside a window is preserved; each window is trimmed. The cut of a window
     * may be moved back to a sentence boundary, in which case the next window
     * continues from the actual cut so that no words are skipped.
     *
     * @return string[]
     */
    private function chunkByWords(string $text, int $maxWords, int $overlapWords): array
    {
        // Defensive clamps; the window arithmetic below needs maxWords >= 1.
        $maxWords     = max(1, $maxWords);
        $overlapWords = max(0, $overlapWords);

        // Keep the whitespace runs as their own tokens so the original spacing
        // can be reproduced when the tokens are joined again.
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        if (!$tokens) {
            return [];
        }

        // Positions (within $tokens) of all non-whitespace tokens, i.e. words.
        $wordTokenIndexes = [];
        foreach ($tokens as $i => $token) {
            if (!preg_match('/^\s+$/u', $token)) {
                $wordTokenIndexes[] = $i;
            }
        }

        $totalWords = count($wordTokenIndexes);
        if ($totalWords === 0) {
            return [];
        }

        $chunks = [];
        $wordPos = 0;

        while ($wordPos < $totalWords) {
            $wordEnd = min($wordPos + $maxWords, $totalWords);

            $tokenStart = $wordTokenIndexes[$wordPos];
            $tokenEnd = $this->adjustCutToBoundary(
                $tokens,
                $tokenStart,
                $wordTokenIndexes[$wordEnd - 1] + 1
            );

            // The boundary adjustment may have shortened the window. Pull the
            // word cursor back to the real cut; otherwise the words between the
            // adjusted cut and the nominal window end would never be emitted.
            while ($wordEnd > $wordPos + 1 && $wordTokenIndexes[$wordEnd - 1] >= $tokenEnd) {
                $wordEnd--;
            }

            $chunk = trim(implode('', array_slice(
                $tokens,
                $tokenStart,
                $tokenEnd - $tokenStart
            )));

            if ($chunk !== '') {
                $chunks[] = $chunk;
            }

            if ($wordEnd >= $totalWords) {
                break;
            }

            // Always advance by at least one word. Without this, an overlap
            // that is >= the window length would restart at the same position
            // forever.
            $wordPos = max($wordPos + 1, $wordEnd - $overlapWords);
        }

        return $chunks;
    }

    /**
     * Moves the end of a token window back to the last sentence or paragraph
     * boundary inside the window, if there is one.
     *
     * @param string[] $tokens Word and whitespace tokens.
     * @param int      $start  Index of the first token of the window.
     * @param int      $end    Exclusive end index of the window.
     *
     * @return int The (possibly reduced) exclusive end index; always > $start
     *             when $end > $start.
     */
    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
    {
        // Guard for list starts: leave the cut untouched.
        // NOTE(review): the pattern requires whitespace after the dash, so it
        // cannot match a token from chunkByWords(), whose tokens are either
        // pure whitespace or contain no whitespace at all — confirm intent.
        $startToken = $tokens[$start] ?? '';
        if (preg_match('/^\s*-\s+/u', $startToken)) {
            return $end;
        }

        // Scan backwards for a paragraph or sentence end. The start token
        // itself is never a cut candidate, so the window keeps at least it.
        for ($i = $end - 1; $i > $start; $i--) {
            if ($tokens[$i] === "\n\n") {
                return $i + 1;
            }

            // Sentence-ending punctuation that is followed by a line break.
            if (
                preg_match('/[.!?]\s*$/u', $tokens[$i]) &&
                isset($tokens[$i + 1]) &&
                str_contains($tokens[$i + 1], "\n")
            ) {
                return $i + 1;
            }
        }

        return $end;
    }

    /**
     * Counts whitespace-separated words.
     *
     * Empty pieces are discarded, so an empty or whitespace-only string yields
     * 0 and leading/trailing whitespace never adds a phantom word. This matches
     * how chunkByWords() identifies words.
     */
    private function countWords(string $text): int
    {
        $parts = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on a PCRE error (e.g. invalid UTF-8).
        return $parts === false ? 0 : count($parts);
    }

    /**
     * Removes duplicate chunks, keeping the first occurrence of each.
     *
     * Two chunks count as duplicates when they are equal after trimming,
     * collapsing whitespace runs to a single space, and lowercasing. The
     * returned chunks are the original, unmodified strings.
     *
     * @param string[] $chunks
     *
     * @return string[]
     */
    private function dedupe(array $chunks): array
    {
        $seen = [];
        $out  = [];

        foreach ($chunks as $chunk) {
            $normalized = preg_replace('/\s+/u', ' ', trim($chunk));
            if ($normalized === null) {
                // PCRE error: the chunk cannot be normalized and is skipped.
                continue;
            }

            $key = mb_strtolower($normalized);

            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $chunk;
        }

        return $out;
    }
}