optimize ingesting documents

This commit is contained in:
team2
2026-02-28 22:48:01 +01:00
parent 54ce057ef0
commit 509ba83ac0
6 changed files with 335 additions and 21 deletions

View File

@@ -7,14 +7,16 @@ namespace App\Knowledge\Ingest;
use App\Entity\DocumentVersion;
use App\Knowledge\Text\TextNormalizer;
use App\Repository\DocumentVersionRepository;
use App\Ingest\DocumentSanitizer;
final readonly class KnowledgeIngestService
final readonly class KnowledgeIngestService
{
public function __construct(
private DocumentLoader $loader,
private SimpleChunker $chunker,
private DocumentVersionRepository $versionRepo,
private TextNormalizer $textNormalizer
private TextNormalizer $textNormalizer,
private DocumentSanitizer $documentSanitizer, // ✅ NEU
)
{
}
@@ -26,8 +28,13 @@ final readonly class KnowledgeIngestService
*/
public function buildChunkRecords(DocumentVersion $version): iterable
{
// 1⃣ Rohtext laden
$text = $this->loader->load($version->getFilePath());
// 2⃣ 🔥 Deterministische Vorverarbeitung (NEU)
$text = $this->documentSanitizer->sanitize($text);
// 3⃣ Chunking
$chunks = $this->chunker->chunk($text);
$doc = $version->getDocument();
@@ -41,7 +48,6 @@ final readonly class KnowledgeIngestService
foreach ($chunks as $chunkText) {
if ($title !== '' && !str_starts_with($chunkText, $title)) {
//title with backticks
$chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
}
@@ -57,13 +63,13 @@ final readonly class KnowledgeIngestService
);
yield [
'chunk_id' => $chunkId,
'chunk_id' => $chunkId,
'document_id' => $documentId,
'version_id' => $versionId,
'version_id' => $versionId,
'chunk_index' => $index++,
'text' => $chunkText,
'checksum' => sha1($chunkText),
'metadata' => $this->buildMetadata($version),
'text' => $chunkText,
'checksum' => sha1($chunkText),
'metadata' => $this->buildMetadata($version),
];
}
}

View File

@@ -10,10 +10,9 @@ use App\Knowledge\Text\TextNormalizer;
final readonly class SimpleChunker
{
public function __construct(
private IndexConfigurationProvider $configurationProvider,
private TextNormalizer $textNormalizer
private TextNormalizer $textNormalizer
)
{
}
@@ -23,7 +22,7 @@ final readonly class SimpleChunker
{
$config = $this->configurationProvider->getConfiguration();
$maxWords = $config->getChunkSize();
$maxWords = $config->getChunkSize();
$overlapWords = $config->getChunkOverlap();
$text = $this->textNormalizer->normalize($text);
@@ -31,6 +30,74 @@ final readonly class SimpleChunker
return [];
}
// ======================================================
// HYBRID: Erst Absatzbasiert sammeln
// ======================================================
$paragraphs = preg_split('/\n{2,}/u', $text);
if (!$paragraphs) {
return [];
}
$chunks = [];
$currentChunk = '';
$currentWordCount = 0;
foreach ($paragraphs as $paragraph) {
$paragraph = trim($paragraph);
if ($paragraph === '') {
continue;
}
$paragraphWordCount = $this->countWords($paragraph);
// Falls einzelner Absatz größer als maxWords → Fallback
if ($paragraphWordCount > $maxWords) {
if ($currentChunk !== '') {
$chunks[] = trim($currentChunk);
$currentChunk = '';
$currentWordCount = 0;
}
foreach ($this->chunkByWords($paragraph, $maxWords, $overlapWords) as $subChunk) {
$chunks[] = $subChunk;
}
continue;
}
// Absatz passt noch in aktuellen Chunk
if ($currentWordCount + $paragraphWordCount <= $maxWords) {
$currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
$currentWordCount += $paragraphWordCount;
continue;
}
// Flush aktueller Chunk
if ($currentChunk !== '') {
$chunks[] = trim($currentChunk);
}
$currentChunk = $paragraph;
$currentWordCount = $paragraphWordCount;
}
if ($currentChunk !== '') {
$chunks[] = trim($currentChunk);
}
return $this->dedupe($chunks);
}
// ======================================================
// Wortbasierter Fallback (Original-Logik beibehalten)
// ======================================================
/** @return string[] */
private function chunkByWords(string $text, int $maxWords, int $overlapWords): array
{
$tokens = preg_split(
'/(\s+)/u',
$text,
@@ -61,7 +128,7 @@ final readonly class SimpleChunker
$wordEnd = min($wordPos + $maxWords, $totalWords);
$tokenStart = $wordTokenIndexes[$wordPos];
$tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
$tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
$tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);
@@ -82,7 +149,7 @@ final readonly class SimpleChunker
$wordPos = max(0, $wordEnd - $overlapWords);
}
return $this->dedupe($chunks);
return $chunks;
}
private function adjustCutToBoundary(array $tokens, int $start, int $end): int
@@ -110,11 +177,17 @@ final readonly class SimpleChunker
return $end;
}
/**
 * Counts the whitespace-separated words in a text.
 *
 * Empty pieces produced by the split are discarded, so an empty or
 * whitespace-only input yields 0 and leading/trailing whitespace that
 * trim() does not strip never inflates the count.
 *
 * @param string $text Text whose words are counted.
 *
 * @return int Number of non-empty tokens; 0 when the text has none or
 *             when preg_split() fails and returns false.
 */
private function countWords(string $text): int
{
    // PREG_SPLIT_NO_EMPTY drops the '' piece that preg_split() would
    // otherwise return for an empty subject or around edge whitespace.
    $words = preg_split('/\s+/u', trim($text), -1, PREG_SPLIT_NO_EMPTY);

    return $words === false ? 0 : count($words);
}
/** @param string[] $chunks @return string[] */
private function dedupe(array $chunks): array
{
$seen = [];
$out = [];
$out = [];
foreach ($chunks as $chunk) {
$key = mb_strtolower(
@@ -131,4 +204,4 @@ final readonly class SimpleChunker
return $out;
}
}
}