209 lines
5.3 KiB
PHP
209 lines
5.3 KiB
PHP
<?php
|
|
// src/Knowledge/Ingest/SimpleChunker.php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Knowledge\Ingest;
|
|
|
|
use App\Index\IndexConfigurationProvider;
|
|
use App\Knowledge\Text\TextNormalizer;
|
|
|
|
/**
 * Splits normalized text into word-bounded chunks.
 *
 * Paragraphs (separated by blank lines) are packed together while they fit
 * into the configured chunk size. A paragraph that is larger than the chunk
 * size on its own is split by words, with the configured overlap between
 * consecutive windows.
 */
final readonly class SimpleChunker
{
    public function __construct(
        private IndexConfigurationProvider $configurationProvider,
        private TextNormalizer $textNormalizer,
    ) {}

    /**
     * Chunks the given text according to the current index configuration.
     *
     * Steps: normalize the text, split it into paragraphs on runs of two or
     * more newlines, pack consecutive paragraphs (joined by a blank line)
     * into chunks of at most the configured number of words, and finally
     * remove chunks that are duplicates up to case and whitespace.
     *
     * The configured overlap is only applied inside paragraphs that exceed
     * the chunk size and therefore go through the word-based fallback.
     *
     * @return string[] Chunks in document order; empty when the normalized
     *                  text is empty or cannot be split into paragraphs.
     */
    public function chunk(string $text): array
    {
        $config = $this->configurationProvider->getConfiguration();

        // Clamp the configured values so the window is never empty and the
        // overlap is never negative.
        $maxWords = max(1, $config->getChunkSize());
        $overlapWords = max(0, $config->getChunkOverlap());

        $text = $this->textNormalizer->normalize($text);
        if ($text === '') {
            return [];
        }

        // Paragraph-based pre-structure: blank lines separate paragraphs.
        $paragraphs = preg_split('/\n{2,}/u', $text);
        if (!$paragraphs) {
            return [];
        }

        $chunks = [];
        $currentChunk = '';
        $currentWordCount = 0;

        foreach ($paragraphs as $paragraph) {
            $paragraph = trim($paragraph);
            if ($paragraph === '') {
                continue;
            }

            $paragraphWordCount = $this->countWords($paragraph);

            // Paragraph larger than maxWords: flush what we have and fall
            // back to word-based splitting for this paragraph.
            if ($paragraphWordCount > $maxWords) {
                if ($currentChunk !== '') {
                    $chunks[] = trim($currentChunk);
                    $currentChunk = '';
                    $currentWordCount = 0;
                }

                foreach ($this->chunkByWords($paragraph, $maxWords, $overlapWords) as $subChunk) {
                    $chunks[] = $subChunk;
                }

                continue;
            }

            // Paragraph fits into the chunk currently being assembled.
            if ($currentWordCount + $paragraphWordCount <= $maxWords) {
                $currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
                $currentWordCount += $paragraphWordCount;
                continue;
            }

            // Paragraph does not fit: flush and start a new chunk with it.
            if ($currentChunk !== '') {
                $chunks[] = trim($currentChunk);
            }

            $currentChunk = $paragraph;
            $currentWordCount = $paragraphWordCount;
        }

        if ($currentChunk !== '') {
            $chunks[] = trim($currentChunk);
        }

        return $this->dedupe($chunks);
    }

    // ======================================================
    // Word-based fallback
    // ======================================================

    /**
     * Splits a text into windows of at most $maxWords words.
     *
     * Each window after the first starts $overlapWords words before the end
     * of the previous one. A window that does not reach the end of the text
     * may be shortened to a paragraph or sentence boundary; the following
     * window then continues from the shortened end, so no words are skipped.
     * Whitespace between the words of a chunk is kept as in the input.
     *
     * @param int $maxWords     Window size in words; values below 1 are treated as 1.
     * @param int $overlapWords Words shared between consecutive windows; clamped to
     *                          the range 0..$maxWords-1 so every window advances.
     *
     * @return string[]
     */
    private function chunkByWords(string $text, int $maxWords, int $overlapWords): array
    {
        $maxWords = max(1, $maxWords);
        // An overlap of a full window (or more) would never move forward.
        $overlapWords = max(0, min($overlapWords, $maxWords - 1));

        // Alternating word / whitespace tokens; whitespace runs are kept so
        // the original spacing can be reproduced inside a chunk.
        $tokens = preg_split(
            '/(\s+)/u',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        if (!$tokens) {
            return [];
        }

        // Token indexes of all non-whitespace tokens, i.e. the words.
        $wordTokenIndexes = [];
        foreach ($tokens as $i => $token) {
            if (!preg_match('/^\s+$/u', $token)) {
                $wordTokenIndexes[] = $i;
            }
        }

        $totalWords = count($wordTokenIndexes);
        if ($totalWords === 0) {
            return [];
        }

        $chunks = [];
        $wordPos = 0;

        while ($wordPos < $totalWords) {
            $wordEnd = min($wordPos + $maxWords, $totalWords);

            $tokenStart = $wordTokenIndexes[$wordPos];
            $tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;

            // Only look for a nicer cut when text remains after this window;
            // the final window is emitted whole so its tail is never lost.
            if ($wordEnd < $totalWords) {
                // The chunk must keep more words than the overlap. Otherwise
                // the next window would not advance, or a boundary inside the
                // carried-over overlap would be selected again.
                $minLastWord = min($wordPos + $overlapWords, $wordEnd - 1);

                $tokenEnd = $this->adjustCutToBoundary(
                    $tokens,
                    $tokenStart,
                    $tokenEnd,
                    $wordTokenIndexes[$minLastWord] - 1
                );

                // Re-sync the word cursor with the (possibly earlier) cut so
                // the words behind it are picked up by the next window.
                while ($wordEnd > $wordPos + 1 && $wordTokenIndexes[$wordEnd - 1] >= $tokenEnd) {
                    $wordEnd--;
                }
            }

            $chunk = trim(implode('', array_slice(
                $tokens,
                $tokenStart,
                $tokenEnd - $tokenStart
            )));

            if ($chunk !== '') {
                $chunks[] = $chunk;
            }

            if ($wordEnd >= $totalWords) {
                break;
            }

            // Step back by the overlap, but always advance by at least one word.
            $wordPos = max($wordPos + 1, $wordEnd - $overlapWords);
        }

        return $chunks;
    }

    /**
     * Moves an exclusive token cut position back to the nearest preceding
     * paragraph or sentence boundary, if there is one.
     *
     * A boundary is either a whitespace token that is exactly a blank line
     * ("\n\n"), or a token ending in '.', '!' or '?' whose following token
     * contains a newline. The search runs backwards from $end.
     *
     * @param string[] $tokens Alternating word / whitespace tokens.
     * @param int      $start  Index of the first token of the chunk.
     * @param int      $end    Exclusive end index of the chunk.
     * @param int|null $floor  Tokens at or below this index are not considered
     *                         as boundary; defaults to $start.
     *
     * @return int The adjusted exclusive end index, or $end when no boundary
     *             was found or the chunk starts with a list item.
     */
    private function adjustCutToBoundary(array $tokens, int $start, int $end, ?int $floor = null): int
    {
        // Protection for list beginnings: a chunk starting with "- " is not
        // shortened. The dash and the whitespace after it are separate
        // tokens, so both are inspected together.
        $opening = ($tokens[$start] ?? '') . ($tokens[$start + 1] ?? '');
        if (preg_match('/^\s*-\s+/u', $opening)) {
            return $end;
        }

        $lowerBound = max($start, $floor ?? $start);

        // Walk backwards looking for a paragraph or sentence end.
        for ($i = $end - 1; $i > $lowerBound; $i--) {
            if ($tokens[$i] === "\n\n") {
                return $i + 1;
            }

            if (
                preg_match('/[.!?]\s*$/u', $tokens[$i]) &&
                isset($tokens[$i + 1]) &&
                str_contains($tokens[$i + 1], "\n")
            ) {
                return $i + 1;
            }
        }

        return $end;
    }

    /**
     * Counts the whitespace-separated words of a text.
     *
     * Empty pieces are skipped, so an empty or whitespace-only string counts
     * as 0 words and the result matches the tokenization of chunkByWords().
     * Returns 0 when the text cannot be split (preg_split() failure).
     */
    private function countWords(string $text): int
    {
        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        return $words === false ? 0 : count($words);
    }

    /**
     * Removes chunks that were already seen, comparing case-insensitively
     * and with all whitespace runs collapsed to a single space.
     *
     * The first occurrence is kept unmodified and the input order is
     * preserved. A chunk whose whitespace normalization fails is skipped.
     *
     * @param string[] $chunks
     *
     * @return string[]
     */
    private function dedupe(array $chunks): array
    {
        $seen = [];
        $out = [];

        foreach ($chunks as $chunk) {
            $normalized = preg_replace('/\s+/u', ' ', trim($chunk));
            if ($normalized === null) {
                continue;
            }

            $key = mb_strtolower($normalized);

            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $chunk;
        }

        return $out;
    }
}