MtoRagSystem/src/Knowledge/Ingest/SimpleChunker.php

<?php
// src/Knowledge/Ingest/SimpleChunker.php

declare(strict_types=1);

namespace App\Knowledge\Ingest;

use App\Index\IndexConfigurationProvider;
use App\Knowledge\Text\TextNormalizer;

/**
 * Splits text into overlapping chunks measured in whitespace-separated words.
 *
 * The text is normalized, tokenized into alternating word / whitespace tokens,
 * and then walked with a sliding window. Each window may be shortened so that
 * it ends on a paragraph break or at the end of a sentence that is followed by
 * a line break. Original whitespace inside a chunk is preserved; only the
 * chunk's outer whitespace is trimmed.
 */
final readonly class SimpleChunker
{
    public function __construct(
        private IndexConfigurationProvider $configurationProvider,
        private TextNormalizer $textNormalizer,
    ) {
    }

    /**
     * Chunk the given text according to the configured chunk size and overlap.
     *
     * Every word of the normalized text ends up in at least one chunk, even
     * when a window is shortened to a sentence/paragraph boundary. Chunks that
     * are equal after case-folding and whitespace collapsing are returned once.
     *
     * @param string $text Raw text; it is normalized before being split.
     *
     * @return string[] Chunks in document order; empty when there is nothing to chunk.
     */
    public function chunk(string $text): array
    {
        $config = $this->configurationProvider->getConfiguration();

        // Clamp the configuration: a window must hold at least one word and the
        // overlap must be strictly smaller than the window, otherwise the cursor
        // below could never move forward and the loop would not terminate.
        $maxWords = max(1, $config->getChunkSize());
        $overlapWords = min(max(0, $config->getChunkOverlap()), $maxWords - 1);

        $text = $this->textNormalizer->normalize($text);
        if ($text === '') {
            return [];
        }

        // Capture the delimiters so whitespace runs stay in the token stream and
        // a chunk can be rebuilt with its original spacing.
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        // preg_split() returns false on a PCRE failure (e.g. malformed UTF-8).
        if ($tokens === false || $tokens === []) {
            return [];
        }

        // Positions (within $tokens) of the tokens that are words.
        $wordTokenIndexes = [];
        foreach ($tokens as $index => $token) {
            if (preg_match('/^\s+$/u', $token) !== 1) {
                $wordTokenIndexes[] = $index;
            }
        }

        $totalWords = count($wordTokenIndexes);
        if ($totalWords === 0) {
            return [];
        }

        $chunks = [];
        $wordPos = 0;
        // Number of leading words that are already part of an emitted chunk.
        $coveredWords = 0;

        while ($wordPos < $totalWords) {
            $windowEnd = min($wordPos + $maxWords, $totalWords);

            $tokenStart = $wordTokenIndexes[$wordPos];
            $hardTokenEnd = $wordTokenIndexes[$windowEnd - 1] + 1;

            $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $hardTokenEnd);

            // The boundary adjustment may have pulled the cut back. Re-sync the
            // word cursor with the token cut so the words after the cut are
            // picked up by the next window instead of being skipped.
            $wordEnd = $windowEnd;
            while ($wordEnd > $wordPos + 1 && $wordTokenIndexes[$wordEnd - 1] >= $tokenEnd) {
                $wordEnd--;
            }

            // If the boundary lies inside the region shared with the previous
            // chunk, cutting there would add no new words; use the full window.
            if ($wordEnd <= $coveredWords) {
                $wordEnd = $windowEnd;
                $tokenEnd = $hardTokenEnd;
            }
            $coveredWords = $wordEnd;

            $chunk = trim(implode('', array_slice(
                $tokens,
                $tokenStart,
                $tokenEnd - $tokenStart
            )));

            if ($chunk !== '') {
                $chunks[] = $chunk;
            }

            if ($wordEnd >= $totalWords) {
                break;
            }

            // Step back by the overlap, but always advance by at least one word.
            $wordPos = max($wordPos + 1, $wordEnd - $overlapWords);
        }

        return $this->dedupe($chunks);
    }

    /**
     * Move the end of a window back to the closest preceding natural boundary.
     *
     * Scanning backwards from the end of the window, the cut is placed right
     * after the first token that is either a "\n\n" whitespace run or a word
     * ending in ".", "!" or "?" whose following token contains a line break.
     * The first token of the window is never a cut candidate, so the result
     * always keeps at least that token.
     *
     * @param string[] $tokens Alternating word / whitespace tokens.
     * @param int      $start  Index of the first token of the window.
     * @param int      $end    Exclusive end index of the window.
     *
     * @return int Exclusive end index to cut at; $end when no boundary is found.
     */
    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
    {
        // A window that opens on a list bullet ("- item") keeps its full length.
        // Word tokens never contain whitespace, so the bullet appears as a bare
        // "-" token followed by a whitespace token that begins with a space.
        if (($tokens[$start] ?? '') === '-' && str_starts_with($tokens[$start + 1] ?? '', ' ')) {
            return $end;
        }

        for ($i = $end - 1; $i > $start; $i--) {
            $token = $tokens[$i];

            // Paragraph break: keep the break inside the span (trimmed later).
            if ($token === "\n\n") {
                return $i + 1;
            }

            // End of a sentence that is directly followed by a line break.
            if (
                preg_match('/[.!?]\s*$/u', $token) === 1
                && str_contains($tokens[$i + 1] ?? '', "\n")
            ) {
                return $i + 1;
            }
        }

        return $end;
    }

    /**
     * Remove repeated chunks while keeping the first occurrence and the order.
     *
     * Two chunks count as equal when they match after trimming, collapsing
     * whitespace runs to a single space and lower-casing.
     *
     * @param string[] $chunks
     *
     * @return string[]
     */
    private function dedupe(array $chunks): array
    {
        $seen = [];
        $unique = [];

        foreach ($chunks as $chunk) {
            $trimmed = trim($chunk);

            // preg_replace() yields null on a PCRE failure; fall back to the
            // trimmed text rather than passing null to mb_strtolower().
            $key = mb_strtolower(preg_replace('/\s+/u', ' ', $trimmed) ?? $trimmed);

            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $unique[] = $chunk;
        }

        return $unique;
    }
}