optimize system and cleanup
This commit is contained in:
@@ -13,27 +13,22 @@ final readonly class SimpleChunker
|
||||
public function __construct(
|
||||
private IndexConfigurationProvider $configurationProvider,
|
||||
private TextNormalizer $textNormalizer
|
||||
)
|
||||
{
|
||||
}
|
||||
) {}
|
||||
|
||||
/** @return string[] */
|
||||
public function chunk(string $text): array
|
||||
{
|
||||
$config = $this->configurationProvider->getConfiguration();
|
||||
|
||||
$maxWords = $config->getChunkSize();
|
||||
$overlapWords = $config->getChunkOverlap();
|
||||
$maxWords = max(1, $config->getChunkSize());
|
||||
$overlapWords = max(0, $config->getChunkOverlap());
|
||||
|
||||
$text = $this->textNormalizer->normalize($text);
|
||||
if ($text === '') {
|
||||
return [];
|
||||
}
|
||||
|
||||
// ======================================================
|
||||
// HYBRID: Erst Absatzbasiert sammeln
|
||||
// ======================================================
|
||||
|
||||
// Absatzbasierte Vorstruktur
|
||||
$paragraphs = preg_split('/\n{2,}/u', $text);
|
||||
if (!$paragraphs) {
|
||||
return [];
|
||||
@@ -52,7 +47,7 @@ final readonly class SimpleChunker
|
||||
|
||||
$paragraphWordCount = $this->countWords($paragraph);
|
||||
|
||||
// Falls einzelner Absatz größer als maxWords → Fallback
|
||||
// Absatz größer als maxWords → Wort-Fallback
|
||||
if ($paragraphWordCount > $maxWords) {
|
||||
|
||||
if ($currentChunk !== '') {
|
||||
@@ -68,14 +63,14 @@ final readonly class SimpleChunker
|
||||
continue;
|
||||
}
|
||||
|
||||
// Absatz passt noch in aktuellen Chunk
|
||||
// Absatz passt in aktuellen Chunk
|
||||
if ($currentWordCount + $paragraphWordCount <= $maxWords) {
|
||||
$currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
|
||||
$currentWordCount += $paragraphWordCount;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Flush aktueller Chunk
|
||||
// Flush
|
||||
if ($currentChunk !== '') {
|
||||
$chunks[] = trim($currentChunk);
|
||||
}
|
||||
@@ -92,7 +87,7 @@ final readonly class SimpleChunker
|
||||
}
|
||||
|
||||
// ======================================================
|
||||
// Wortbasierter Fallback (Original-Logik beibehalten)
|
||||
// Wortbasierter Fallback
|
||||
// ======================================================
|
||||
|
||||
/** @return string[] */
|
||||
@@ -125,6 +120,7 @@ final readonly class SimpleChunker
|
||||
$wordPos = 0;
|
||||
|
||||
while ($wordPos < $totalWords) {
|
||||
|
||||
$wordEnd = min($wordPos + $maxWords, $totalWords);
|
||||
|
||||
$tokenStart = $wordTokenIndexes[$wordPos];
|
||||
@@ -154,11 +150,13 @@ final readonly class SimpleChunker
|
||||
|
||||
private function adjustCutToBoundary(array $tokens, int $start, int $end): int
|
||||
{
|
||||
// Schutz für Listenanfänge
|
||||
$startToken = $tokens[$start] ?? '';
|
||||
if (preg_match('/^- /u', ltrim($startToken))) {
|
||||
if (preg_match('/^\s*-\s+/u', $startToken)) {
|
||||
return $end;
|
||||
}
|
||||
|
||||
// Rückwärts prüfen auf Absatz- oder Satzende
|
||||
for ($i = $end - 1; $i > $start; $i--) {
|
||||
|
||||
if ($tokens[$i] === "\n\n") {
|
||||
@@ -190,9 +188,13 @@ final readonly class SimpleChunker
|
||||
$out = [];
|
||||
|
||||
foreach ($chunks as $chunk) {
|
||||
$key = mb_strtolower(
|
||||
preg_replace('/\s+/u', ' ', trim($chunk))
|
||||
);
|
||||
|
||||
$normalized = preg_replace('/\s+/u', ' ', trim($chunk));
|
||||
if ($normalized === null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$key = mb_strtolower($normalized);
|
||||
|
||||
if (isset($seen[$key])) {
|
||||
continue;
|
||||
|
||||
Reference in New Issue
Block a user