Optimize document ingestion

This commit is contained in:
team2
2026-02-28 22:48:01 +01:00
parent 54ce057ef0
commit 509ba83ac0
6 changed files with 335 additions and 21 deletions

View File

@@ -7,14 +7,16 @@ namespace App\Knowledge\Ingest;
use App\Entity\DocumentVersion;
use App\Knowledge\Text\TextNormalizer;
use App\Repository\DocumentVersionRepository;
use App\Ingest\DocumentSanitizer;
final readonly class KnowledgeIngestService
final readonly class KnowledgeIngestService
{
public function __construct(
private DocumentLoader $loader,
private SimpleChunker $chunker,
private DocumentVersionRepository $versionRepo,
private TextNormalizer $textNormalizer
private TextNormalizer $textNormalizer,
private DocumentSanitizer $documentSanitizer, // ✅ NEU
)
{
}
@@ -26,8 +28,13 @@ final readonly class KnowledgeIngestService
*/
public function buildChunkRecords(DocumentVersion $version): iterable
{
// 1️⃣ Rohtext laden
$text = $this->loader->load($version->getFilePath());
// 2️⃣ 🔥 Deterministische Vorverarbeitung (NEU)
$text = $this->documentSanitizer->sanitize($text);
// 3️⃣ Chunking
$chunks = $this->chunker->chunk($text);
$doc = $version->getDocument();
@@ -41,7 +48,6 @@ final readonly class KnowledgeIngestService
foreach ($chunks as $chunkText) {
if ($title !== '' && !str_starts_with($chunkText, $title)) {
//title with backticks
$chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
}
@@ -57,13 +63,13 @@ final readonly class KnowledgeIngestService
);
yield [
'chunk_id' => $chunkId,
'chunk_id' => $chunkId,
'document_id' => $documentId,
'version_id' => $versionId,
'version_id' => $versionId,
'chunk_index' => $index++,
'text' => $chunkText,
'checksum' => sha1($chunkText),
'metadata' => $this->buildMetadata($version),
'text' => $chunkText,
'checksum' => sha1($chunkText),
'metadata' => $this->buildMetadata($version),
];
}
}