Optimize document ingestion: sanitize loaded text before chunking
This commit is contained in:
@@ -7,14 +7,16 @@ namespace App\Knowledge\Ingest;
|
||||
use App\Entity\DocumentVersion;
|
||||
use App\Knowledge\Text\TextNormalizer;
|
||||
use App\Repository\DocumentVersionRepository;
|
||||
use App\Ingest\DocumentSanitizer;
|
||||
|
||||
final readonly class KnowledgeIngestService
|
||||
final readonly class KnowledgeIngestService
|
||||
{
|
||||
public function __construct(
|
||||
private DocumentLoader $loader,
|
||||
private SimpleChunker $chunker,
|
||||
private DocumentVersionRepository $versionRepo,
|
||||
private TextNormalizer $textNormalizer
|
||||
private TextNormalizer $textNormalizer,
|
||||
private DocumentSanitizer $documentSanitizer, // ✅ NEU
|
||||
)
|
||||
{
|
||||
}
|
||||
@@ -26,8 +28,13 @@ final readonly class KnowledgeIngestService
|
||||
*/
|
||||
public function buildChunkRecords(DocumentVersion $version): iterable
|
||||
{
|
||||
// 1️⃣ Rohtext laden
|
||||
$text = $this->loader->load($version->getFilePath());
|
||||
|
||||
// 2️⃣ 🔥 Deterministische Vorverarbeitung (NEU)
|
||||
$text = $this->documentSanitizer->sanitize($text);
|
||||
|
||||
// 3️⃣ Chunking
|
||||
$chunks = $this->chunker->chunk($text);
|
||||
|
||||
$doc = $version->getDocument();
|
||||
@@ -41,7 +48,6 @@ final readonly class KnowledgeIngestService
|
||||
foreach ($chunks as $chunkText) {
|
||||
|
||||
if ($title !== '' && !str_starts_with($chunkText, $title)) {
|
||||
//title with backticks
|
||||
$chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
|
||||
}
|
||||
|
||||
@@ -57,13 +63,13 @@ final readonly class KnowledgeIngestService
|
||||
);
|
||||
|
||||
yield [
|
||||
'chunk_id' => $chunkId,
|
||||
'chunk_id' => $chunkId,
|
||||
'document_id' => $documentId,
|
||||
'version_id' => $versionId,
|
||||
'version_id' => $versionId,
|
||||
'chunk_index' => $index++,
|
||||
'text' => $chunkText,
|
||||
'checksum' => sha1($chunkText),
|
||||
'metadata' => $this->buildMetadata($version),
|
||||
'text' => $chunkText,
|
||||
'checksum' => sha1($chunkText),
|
||||
'metadata' => $this->buildMetadata($version),
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user