optimize ingesting documents
This commit is contained in:
@@ -10,10 +10,9 @@ use App\Knowledge\Text\TextNormalizer;
|
||||
|
||||
final readonly class SimpleChunker
|
||||
{
|
||||
|
||||
/**
 * Wires the chunker's two collaborators as promoted, read-only properties.
 *
 * @param IndexConfigurationProvider $configurationProvider Supplies the configuration from which
 *                                                          chunk size and chunk overlap are read.
 * @param TextNormalizer             $textNormalizer        Normalizes the input text before it is split.
 */
public function __construct(
    private IndexConfigurationProvider $configurationProvider,
    // Declared exactly once: a repeated promoted parameter is a compile-time error.
    private TextNormalizer $textNormalizer,
) {
}
|
||||
@@ -23,7 +22,7 @@ final readonly class SimpleChunker
|
||||
{
|
||||
$config = $this->configurationProvider->getConfiguration();
|
||||
|
||||
$maxWords = $config->getChunkSize();
|
||||
$maxWords = $config->getChunkSize();
|
||||
$overlapWords = $config->getChunkOverlap();
|
||||
|
||||
$text = $this->textNormalizer->normalize($text);
|
||||
@@ -31,6 +30,74 @@ final readonly class SimpleChunker
|
||||
return [];
|
||||
}
|
||||
|
||||
// ======================================================
|
||||
// HYBRID: Erst Absatzbasiert sammeln
|
||||
// ======================================================
|
||||
|
||||
$paragraphs = preg_split('/\n{2,}/u', $text);
|
||||
if (!$paragraphs) {
|
||||
return [];
|
||||
}
|
||||
|
||||
$chunks = [];
|
||||
$currentChunk = '';
|
||||
$currentWordCount = 0;
|
||||
|
||||
foreach ($paragraphs as $paragraph) {
|
||||
|
||||
$paragraph = trim($paragraph);
|
||||
if ($paragraph === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$paragraphWordCount = $this->countWords($paragraph);
|
||||
|
||||
// Falls einzelner Absatz größer als maxWords → Fallback
|
||||
if ($paragraphWordCount > $maxWords) {
|
||||
|
||||
if ($currentChunk !== '') {
|
||||
$chunks[] = trim($currentChunk);
|
||||
$currentChunk = '';
|
||||
$currentWordCount = 0;
|
||||
}
|
||||
|
||||
foreach ($this->chunkByWords($paragraph, $maxWords, $overlapWords) as $subChunk) {
|
||||
$chunks[] = $subChunk;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Absatz passt noch in aktuellen Chunk
|
||||
if ($currentWordCount + $paragraphWordCount <= $maxWords) {
|
||||
$currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
|
||||
$currentWordCount += $paragraphWordCount;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Flush aktueller Chunk
|
||||
if ($currentChunk !== '') {
|
||||
$chunks[] = trim($currentChunk);
|
||||
}
|
||||
|
||||
$currentChunk = $paragraph;
|
||||
$currentWordCount = $paragraphWordCount;
|
||||
}
|
||||
|
||||
if ($currentChunk !== '') {
|
||||
$chunks[] = trim($currentChunk);
|
||||
}
|
||||
|
||||
return $this->dedupe($chunks);
|
||||
}
|
||||
|
||||
// ======================================================
|
||||
// Word-based fallback (original logic retained)
|
||||
// ======================================================
|
||||
|
||||
/** @return string[] */
|
||||
private function chunkByWords(string $text, int $maxWords, int $overlapWords): array
|
||||
{
|
||||
$tokens = preg_split(
|
||||
'/(\s+)/u',
|
||||
$text,
|
||||
@@ -61,7 +128,7 @@ final readonly class SimpleChunker
|
||||
$wordEnd = min($wordPos + $maxWords, $totalWords);
|
||||
|
||||
$tokenStart = $wordTokenIndexes[$wordPos];
|
||||
$tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
|
||||
$tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
|
||||
|
||||
$tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);
|
||||
|
||||
@@ -82,7 +149,7 @@ final readonly class SimpleChunker
|
||||
$wordPos = max(0, $wordEnd - $overlapWords);
|
||||
}
|
||||
|
||||
return $this->dedupe($chunks);
|
||||
return $chunks;
|
||||
}
|
||||
|
||||
private function adjustCutToBoundary(array $tokens, int $start, int $end): int
|
||||
@@ -110,11 +177,17 @@ final readonly class SimpleChunker
|
||||
return $end;
|
||||
}
|
||||
|
||||
/**
 * Counts the whitespace-separated words in a text.
 *
 * Empty tokens are discarded while splitting, so an empty or whitespace-only
 * string yields 0. Splitting without that flag returns a single empty token
 * for '' and would report one word for it.
 *
 * @param string $text Text to measure.
 *
 * @return int Number of non-empty tokens; 0 when the split itself fails.
 */
private function countWords(string $text): int
{
    // PREG_SPLIT_NO_EMPTY drops the empty pieces produced by leading or
    // trailing whitespace, which makes a separate trim() unnecessary.
    $parts = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on failure; treat that as "no words".
    return $parts === false ? 0 : count($parts);
}
|
||||
|
||||
/** @param string[] $chunks @return string[] */
|
||||
private function dedupe(array $chunks): array
|
||||
{
|
||||
$seen = [];
|
||||
$out = [];
|
||||
$out = [];
|
||||
|
||||
foreach ($chunks as $chunk) {
|
||||
$key = mb_strtolower(
|
||||
@@ -131,4 +204,4 @@ final readonly class SimpleChunker
|
||||
|
||||
return $out;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user