configurationProvider->getConfiguration();
        $maxWords = $config->getChunkSize();
        $overlapWords = $config->getChunkOverlap();

        $text = $this->textNormalizer->normalize($text);
        if ($text === '') {
            return [];
        }

        // Split on whitespace runs but keep those runs as tokens, so a chunk can
        // be rebuilt with its original spacing by concatenating a token slice.
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        // preg_split() yields false on a PCRE error; an empty list is treated the same way.
        if (!$tokens) {
            return [];
        }

        // Positions inside $tokens that hold words (anything that is not pure whitespace).
        $wordTokenIndexes = [];
        foreach ($tokens as $i => $token) {
            if (!preg_match('/^\s+$/u', $token)) {
                $wordTokenIndexes[] = $i;
            }
        }

        $totalWords = count($wordTokenIndexes);
        if ($totalWords === 0) {
            return [];
        }

        $chunks = [];
        $wordPos = 0;

        // Sliding window over words: each window holds up to $maxWords words and
        // the next one starts $overlapWords words before the previous window's end.
        // NOTE(review): when $overlapWords >= $maxWords the window start does not
        // move forward, so this loop relies on the configuration keeping the
        // overlap below the chunk size (and the chunk size positive) - confirm.
        while ($wordPos < $totalWords) {
            $wordEnd = min($wordPos + $maxWords, $totalWords);

            $tokenStart = $wordTokenIndexes[$wordPos];
            $tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
            // NOTE(review): the cut may be pulled back to an earlier boundary here,
            // while the next window is still derived from $wordEnd; words between
            // the adjusted cut and ($wordEnd - $overlapWords) can then end up in
            // no chunk - confirm this is intended.
            $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);

            $chunk = trim(implode('', array_slice(
                $tokens,
                $tokenStart,
                $tokenEnd - $tokenStart
            )));
            if ($chunk !== '') {
                $chunks[] = $chunk;
            }

            if ($wordEnd >= $totalWords) {
                break;
            }

            $wordPos = max(0, $wordEnd - $overlapWords);
        }

        return $this->dedupe($chunks);
    }

    /**
     * Pulls a proposed chunk end back to the last paragraph or sentence break
     * found inside the chunk.
     *
     * Tokens are scanned backwards from the last token of the slice down to
     * (but excluding) the start token. The cut is placed right after the first
     * token found that is either exactly "\n\n", or ends with ".", "!" or "?"
     * and is followed by a token containing a newline. A chunk that begins with
     * a "- " list marker is left untouched.
     *
     * @param string[] $tokens Token list the chunk is sliced from
     * @param int      $start  Index of the chunk's first token (inclusive)
     * @param int      $end    Proposed end index (exclusive)
     *
     * @return int End index (exclusive) to use; $end when no boundary is found
     */
    private function adjustCutToBoundary(array $tokens, int $start, int $end): int
    {
        // The tokenizer above emits whitespace as separate tokens, so a list
        // marker arrives as a bare "-" followed by its own whitespace token and
        // a single token can never contain "- ". Inspect the start token joined
        // with its successor so the marker is actually recognised.
        $head = ltrim(($tokens[$start] ?? '') . ($tokens[$start + 1] ?? ''));
        if (preg_match('/^- /u', $head) === 1) {
            return $end;
        }

        for ($i = $end - 1; $i > $start; $i--) {
            $token = $tokens[$i];

            // Paragraph break: cut right after the blank-line token.
            if ($token === "\n\n") {
                return $i + 1;
            }

            // Sentence end immediately followed by a line break.
            $endsSentence = preg_match('/[.!?]\s*$/u', $token) === 1;
            if ($endsSentence && isset($tokens[$i + 1]) && str_contains($tokens[$i + 1], "\n")) {
                return $i + 1;
            }
        }

        return $end;
    }

    /**
     * Drops chunks that repeat an earlier chunk, comparing case-insensitively
     * and with every whitespace run collapsed to a single space.
     *
     * The first occurrence is kept unchanged and the input order is preserved.
     *
     * @param string[] $chunks
     *
     * @return string[]
     */
    private function dedupe(array $chunks): array
    {
        $seen = [];
        $out = [];

        foreach ($chunks as $chunk) {
            $trimmed = trim($chunk);
            // preg_replace() returns null on a PCRE error (for example malformed
            // UTF-8 under the /u modifier); fall back to the trimmed text so
            // mb_strtolower() is never handed null.
            $collapsed = preg_replace('/\s+/u', ' ', $trimmed) ?? $trimmed;
            $key = mb_strtolower($collapsed);

            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $chunk;
        }

        return $out;
    }
}