optimize ingesting documents

This commit is contained in:
team2
2026-02-28 22:48:01 +01:00
parent 54ce057ef0
commit 509ba83ac0
6 changed files with 335 additions and 21 deletions

View File

@@ -96,8 +96,8 @@ print("Encoding embeddings...")
embeddings = model.encode( embeddings = model.encode(
texts, texts,
normalize_embeddings=True, normalize_embeddings=True,
show_progress_bar=True, show_progress_bar=False,
batch_size=64 batch_size=128
) )
embeddings = np.array(embeddings).astype("float32") embeddings = np.array(embeddings).astype("float32")

View File

@@ -109,8 +109,8 @@ if not texts:
embeddings = model.encode( embeddings = model.encode(
texts, texts,
normalize_embeddings=True, normalize_embeddings=True,
show_progress_bar=False, show_progress_bar=True,
batch_size=64 batch_size=128
) )
embeddings = np.array(embeddings).astype("float32") embeddings = np.array(embeddings).astype("float32")

View File

@@ -11,8 +11,10 @@ use Symfony\Component\Uid\Uuid;
final readonly class ChunkWriteService final readonly class ChunkWriteService
{ {
public function __construct( public function __construct(
private ChunkManager $chunkManager, private ChunkManager $chunkManager
) {} )
{
}
public function countAllChunks(): int public function countAllChunks(): int
{ {
@@ -41,4 +43,5 @@ final readonly class ChunkWriteService
{ {
$this->chunkManager->rewriteAll($allChunks); $this->chunkManager->rewriteAll($allChunks);
} }
} }

View File

@@ -0,0 +1,232 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
/**
 * DocumentSanitizer
 *
 * Goal (deterministic, minimally invasive):
 * - Strips typical PDF/DOC extraction artifacts BEFORE chunking:
 *   - table-of-contents (TOC) blocks
 *   - page numbers / "Seite X von Y"
 *   - recurring header/footer lines
 *   - dot-leader lines (".... 12")
 *
 * Guardrails:
 * - no semantic rewriting
 * - no randomness
 * - no removal of genuine running-text paragraphs
 */
final class DocumentSanitizer
{
    /** Lines of this many bytes or more are never treated as a repeated header/footer. */
    private const MAX_HEADER_LEN = 120;

    /** Minimum number of identical (trimmed) occurrences for a line to count as header/footer. */
    private const REPEAT_HEADER_MIN_COUNT = 3;

    /** Minimum byte length for a line to be considered a running-text paragraph. */
    private const MIN_PARAGRAPH_LEN = 120;

    /** Case-insensitive marker that opens a TOC block. */
    private const TOC_MARKER = 'inhaltsverzeichnis';

    /** Patterns for isolated page-number lines; each is applied to an already trimmed line. */
    private const PAGE_NUMBER_PATTERNS = [
        // "Seite 3" / "Seite 3 von 20"
        '/^seite\s+\d+(\s+von\s+\d+)?$/iu',
        // "Page 12" / "Page 12 of 34"
        '/^page\s+\d+(\s+of\s+\d+)?$/iu',
        // "- 4 -" / "4"
        '/^[-]?\s?\d{1,3}\s?[-]?$/u',
    ];

    /**
     * Runs all cleanup passes over $text and returns the trimmed result.
     *
     * Order matters: the coarse TOC block is removed first, then the
     * line-based artifacts (page numbers, dot leaders, repeated headers),
     * and finally runs of blank lines are collapsed.
     *
     * @param string $text raw document text
     *
     * @return string sanitized text; '' for empty input
     */
    public function sanitize(string $text): string
    {
        if ($text === '') {
            return '';
        }

        $text = $this->normalizeLineEndings($text);
        $text = $this->removeToc($text);
        $text = $this->removePageNumbers($text);
        $text = $this->removeDotLeaderLines($text);
        $text = $this->removeRepeatedHeaders($text);
        $text = $this->cleanupWhitespace($text);

        return trim($text);
    }

    /**
     * Converts "\r\n" and lone "\r" to "\n" so that every later pass can split on "\n".
     */
    private function normalizeLineEndings(string $text): string
    {
        return str_replace(["\r\n", "\r"], "\n", $text);
    }

    /**
     * Removes a TOC block that starts at an "Inhaltsverzeichnis" line and ends
     * at the first paragraph-like line.
     *
     * Heuristic:
     * - Start: a non-empty line containing the TOC marker (case-insensitive)
     *   that is not itself paragraph-like.
     * - Inside the block every line is dropped (blank lines included) until a
     *   paragraph-like line is found; that line is kept and ends the block.
     *
     * Guardrail: if the input ends before any paragraph-like line is found,
     * the end of the TOC could not be determined. In that case only the
     * heading and the lines that positively look like TOC entries stay
     * removed; every other line seen since the heading is restored in its
     * original order instead of being lost.
     */
    private function removeToc(string $text): string
    {
        $kept = [];
        // Lines skipped inside the current TOC block that do not look like TOC entries.
        $held = [];
        $inToc = false;

        foreach (explode("\n", $text) as $line) {
            $trimmed = trim($line);

            if (!$inToc) {
                if ($this->looksLikeTocHeading($trimmed)) {
                    $inToc = true;
                    $held = [];
                    continue;
                }

                $kept[] = $line;
                continue;
            }

            // Positive TOC entries are always dropped, even in the fallback case.
            if ($this->looksLikeDotLeaderLine($trimmed) || $this->looksLikeNumberedTocLine($trimmed)) {
                continue;
            }

            if ($this->looksLikeParagraph($trimmed)) {
                // Running text begins: the block is confirmed, discard what was held back.
                $inToc = false;
                $held = [];
                $kept[] = $line;
                continue;
            }

            $held[] = $line;
        }

        if ($inToc) {
            // Unterminated block: nothing was appended to $kept since the heading,
            // so appending the held lines preserves the original order.
            $kept = array_merge($kept, $held);
        }

        return implode("\n", $kept);
    }

    /**
     * Removes isolated page-number lines ("Seite 3 von 20", "Page 12", "- 4 -", "4").
     *
     * Only a whole (trimmed) line is matched; blank lines and running text are kept.
     */
    private function removePageNumbers(string $text): string
    {
        return $this->rejectLines(
            $text,
            fn (string $trimmedLine): bool => $this->looksLikePageNumberLine($trimmedLine),
        );
    }

    /**
     * Removes dot-leader lines anywhere in the text (not only inside a TOC),
     * e.g. "Kapitel ......... 12".
     */
    private function removeDotLeaderLines(string $text): string
    {
        return $this->rejectLines(
            $text,
            fn (string $trimmedLine): bool => $this->looksLikeDotLeaderLine($trimmedLine),
        );
    }

    /**
     * Removes recurring header/footer lines.
     *
     * A line is removed when its trimmed form is shorter than MAX_HEADER_LEN
     * bytes and occurs at least REPEAT_HEADER_MIN_COUNT times in the text.
     * Counting uses the trimmed form, so differing indentation does not
     * matter. Blank lines are kept.
     */
    private function removeRepeatedHeaders(string $text): string
    {
        $counts = array_count_values(array_map('trim', explode("\n", $text)));

        return $this->rejectLines(
            $text,
            fn (string $trimmedLine): bool => strlen($trimmedLine) < self::MAX_HEADER_LEN
                && ($counts[$trimmedLine] ?? 0) >= self::REPEAT_HEADER_MIN_COUNT,
        );
    }

    /**
     * Collapses runs of three or more newlines into exactly two.
     *
     * If the regex engine reports an error the text is returned unchanged
     * rather than being replaced by an empty string.
     */
    private function cleanupWhitespace(string $text): string
    {
        return preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;
    }

    /**
     * Shared line filter: drops every non-blank line whose trimmed form
     * satisfies $shouldRemove and keeps all other lines (blank ones included)
     * untouched.
     *
     * @param callable(string): bool $shouldRemove receives the trimmed, non-empty line
     */
    private function rejectLines(string $text, callable $shouldRemove): string
    {
        $kept = [];

        foreach (explode("\n", $text) as $line) {
            $trimmed = trim($line);

            if ($trimmed !== '' && $shouldRemove($trimmed)) {
                continue;
            }

            $kept[] = $line;
        }

        return implode("\n", $kept);
    }

    // =========================================================
    // Heuristics (isolated, testable)
    // =========================================================

    /**
     * True for a non-empty line that contains the TOC marker and is not
     * itself paragraph-like (a long sentence merely mentioning the word is
     * running text, not a heading).
     */
    private function looksLikeTocHeading(string $trimmedLine): bool
    {
        return $trimmedLine !== ''
            && stripos($trimmedLine, self::TOC_MARKER) !== false
            && !$this->looksLikeParagraph($trimmedLine);
    }

    /**
     * True when the line is at least MIN_PARAGRAPH_LEN bytes long and contains a period.
     */
    private function looksLikeParagraph(string $trimmedLine): bool
    {
        return strlen($trimmedLine) >= self::MIN_PARAGRAPH_LEN && str_contains($trimmedLine, '.');
    }

    /**
     * True when the line matches one of PAGE_NUMBER_PATTERNS.
     */
    private function looksLikePageNumberLine(string $trimmedLine): bool
    {
        foreach (self::PAGE_NUMBER_PATTERNS as $pattern) {
            if (preg_match($pattern, $trimmedLine) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * "Text ..... 12": some text, at least five consecutive dots, a number at the end.
     */
    private function looksLikeDotLeaderLine(string $trimmedLine): bool
    {
        return preg_match('/^.+\.{5,}\s*\d+$/u', $trimmedLine) === 1;
    }

    /**
     * "2.1 Kapitelname 12" / "3 Kapitelname 7": chapter number, text, number at the end.
     */
    private function looksLikeNumberedTocLine(string $trimmedLine): bool
    {
        return preg_match('/^\d+(\.\d+)*\s+.+\s+\d+$/u', $trimmedLine) === 1;
    }
}

View File

@@ -7,14 +7,16 @@ namespace App\Knowledge\Ingest;
use App\Entity\DocumentVersion; use App\Entity\DocumentVersion;
use App\Knowledge\Text\TextNormalizer; use App\Knowledge\Text\TextNormalizer;
use App\Repository\DocumentVersionRepository; use App\Repository\DocumentVersionRepository;
use App\Ingest\DocumentSanitizer;
final readonly class KnowledgeIngestService final readonly class KnowledgeIngestService
{ {
public function __construct( public function __construct(
private DocumentLoader $loader, private DocumentLoader $loader,
private SimpleChunker $chunker, private SimpleChunker $chunker,
private DocumentVersionRepository $versionRepo, private DocumentVersionRepository $versionRepo,
private TextNormalizer $textNormalizer private TextNormalizer $textNormalizer,
private DocumentSanitizer $documentSanitizer, // ✅ NEU
) )
{ {
} }
@@ -26,8 +28,13 @@ final readonly class KnowledgeIngestService
*/ */
public function buildChunkRecords(DocumentVersion $version): iterable public function buildChunkRecords(DocumentVersion $version): iterable
{ {
// 1⃣ Rohtext laden
$text = $this->loader->load($version->getFilePath()); $text = $this->loader->load($version->getFilePath());
// 2⃣ 🔥 Deterministische Vorverarbeitung (NEU)
$text = $this->documentSanitizer->sanitize($text);
// 3⃣ Chunking
$chunks = $this->chunker->chunk($text); $chunks = $this->chunker->chunk($text);
$doc = $version->getDocument(); $doc = $version->getDocument();
@@ -41,7 +48,6 @@ final readonly class KnowledgeIngestService
foreach ($chunks as $chunkText) { foreach ($chunks as $chunkText) {
if ($title !== '' && !str_starts_with($chunkText, $title)) { if ($title !== '' && !str_starts_with($chunkText, $title)) {
//title with backticks
$chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText; $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
} }
@@ -57,13 +63,13 @@ final readonly class KnowledgeIngestService
); );
yield [ yield [
'chunk_id' => $chunkId, 'chunk_id' => $chunkId,
'document_id' => $documentId, 'document_id' => $documentId,
'version_id' => $versionId, 'version_id' => $versionId,
'chunk_index' => $index++, 'chunk_index' => $index++,
'text' => $chunkText, 'text' => $chunkText,
'checksum' => sha1($chunkText), 'checksum' => sha1($chunkText),
'metadata' => $this->buildMetadata($version), 'metadata' => $this->buildMetadata($version),
]; ];
} }
} }

View File

@@ -10,10 +10,9 @@ use App\Knowledge\Text\TextNormalizer;
final readonly class SimpleChunker final readonly class SimpleChunker
{ {
public function __construct( public function __construct(
private IndexConfigurationProvider $configurationProvider, private IndexConfigurationProvider $configurationProvider,
private TextNormalizer $textNormalizer private TextNormalizer $textNormalizer
) )
{ {
} }
@@ -23,7 +22,7 @@ final readonly class SimpleChunker
{ {
$config = $this->configurationProvider->getConfiguration(); $config = $this->configurationProvider->getConfiguration();
$maxWords = $config->getChunkSize(); $maxWords = $config->getChunkSize();
$overlapWords = $config->getChunkOverlap(); $overlapWords = $config->getChunkOverlap();
$text = $this->textNormalizer->normalize($text); $text = $this->textNormalizer->normalize($text);
@@ -31,6 +30,74 @@ final readonly class SimpleChunker
return []; return [];
} }
// ======================================================
// HYBRID: Erst Absatzbasiert sammeln
// ======================================================
$paragraphs = preg_split('/\n{2,}/u', $text);
if (!$paragraphs) {
return [];
}
$chunks = [];
$currentChunk = '';
$currentWordCount = 0;
foreach ($paragraphs as $paragraph) {
$paragraph = trim($paragraph);
if ($paragraph === '') {
continue;
}
$paragraphWordCount = $this->countWords($paragraph);
// Falls einzelner Absatz größer als maxWords → Fallback
if ($paragraphWordCount > $maxWords) {
if ($currentChunk !== '') {
$chunks[] = trim($currentChunk);
$currentChunk = '';
$currentWordCount = 0;
}
foreach ($this->chunkByWords($paragraph, $maxWords, $overlapWords) as $subChunk) {
$chunks[] = $subChunk;
}
continue;
}
// Absatz passt noch in aktuellen Chunk
if ($currentWordCount + $paragraphWordCount <= $maxWords) {
$currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
$currentWordCount += $paragraphWordCount;
continue;
}
// Flush aktueller Chunk
if ($currentChunk !== '') {
$chunks[] = trim($currentChunk);
}
$currentChunk = $paragraph;
$currentWordCount = $paragraphWordCount;
}
if ($currentChunk !== '') {
$chunks[] = trim($currentChunk);
}
return $this->dedupe($chunks);
}
// ======================================================
// Wortbasierter Fallback (Original-Logik beibehalten)
// ======================================================
/** @return string[] */
private function chunkByWords(string $text, int $maxWords, int $overlapWords): array
{
$tokens = preg_split( $tokens = preg_split(
'/(\s+)/u', '/(\s+)/u',
$text, $text,
@@ -61,7 +128,7 @@ final readonly class SimpleChunker
$wordEnd = min($wordPos + $maxWords, $totalWords); $wordEnd = min($wordPos + $maxWords, $totalWords);
$tokenStart = $wordTokenIndexes[$wordPos]; $tokenStart = $wordTokenIndexes[$wordPos];
$tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1; $tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
$tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd); $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);
@@ -82,7 +149,7 @@ final readonly class SimpleChunker
$wordPos = max(0, $wordEnd - $overlapWords); $wordPos = max(0, $wordEnd - $overlapWords);
} }
return $this->dedupe($chunks); return $chunks;
} }
private function adjustCutToBoundary(array $tokens, int $start, int $end): int private function adjustCutToBoundary(array $tokens, int $start, int $end): int
@@ -110,11 +177,17 @@ final readonly class SimpleChunker
return $end; return $end;
} }
/**
 * Counts the whitespace-separated words in $text.
 *
 * Empty pieces are discarded by PREG_SPLIT_NO_EMPTY, so an empty or
 * whitespace-only string yields 0, and leading/trailing whitespace never
 * adds a phantom word, including whitespace that trim() does not strip
 * but the pattern matches.
 *
 * @return int number of words; 0 when the split fails
 */
private function countWords(string $text): int
{
    $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

    return $words === false ? 0 : count($words);
}
/** @param string[] $chunks @return string[] */ /** @param string[] $chunks @return string[] */
private function dedupe(array $chunks): array private function dedupe(array $chunks): array
{ {
$seen = []; $seen = [];
$out = []; $out = [];
foreach ($chunks as $chunk) { foreach ($chunks as $chunk) {
$key = mb_strtolower( $key = mb_strtolower(