Files
MtoRagSystem/src/Knowledge/Ingest/KnowledgeIngestService.php
2026-02-28 22:48:01 +01:00

109 lines
3.1 KiB
PHP
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
use App\Entity\DocumentVersion;
use App\Knowledge\Text\TextNormalizer;
use App\Repository\DocumentVersionRepository;
use App\Ingest\DocumentSanitizer;
/**
 * Turns stored document versions into chunk records for local ingest.
 *
 * Pipeline per version: load the raw text, run it through the document
 * sanitizer, split it into chunks, then emit one associative record per chunk
 * carrying identifiers, text, a checksum, and document metadata.
 */
final readonly class KnowledgeIngestService
{
    public function __construct(
        private DocumentLoader $loader,
        private SimpleChunker $chunker,
        private DocumentVersionRepository $versionRepo,
        private TextNormalizer $textNormalizer,
        private DocumentSanitizer $documentSanitizer,
    ) {
    }

    /**
     * Local ingest: lazily yields one record per chunk of the given version.
     *
     * Each record has the keys `chunk_id`, `document_id`, `version_id`,
     * `chunk_index`, `text`, `checksum`, and `metadata`.
     *
     * @return iterable<array<string,mixed>>
     */
    public function buildChunkRecords(DocumentVersion $version): iterable
    {
        // 1) Load the raw text of the version's file.
        $rawText = $this->loader->load($version->getFilePath());

        // 2) Pre-process the text before it is split.
        $cleanText = $this->documentSanitizer->sanitize($rawText);

        // 3) Split into chunks.
        $chunks = $this->chunker->chunk($cleanText);

        $doc = $version->getDocument();
        $documentId = $doc->getId()->toRfc4122();
        $versionId = $version->getId()->toRfc4122();
        $title = trim((string) $doc->getTitle());

        // The metadata depends only on the version, so build it once instead
        // of once per chunk.
        $metadata = $this->buildMetadata($version);

        $index = 0;
        foreach ($chunks as $chunkText) {
            // Prefix the document title unless the chunk already starts with it.
            if ($title !== '' && !str_starts_with($chunkText, $title)) {
                $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
            }
            $chunkText = trim($chunkText);

            // The chunk ID is derived from document ID, version ID, and the
            // normalized chunk text only.
            // NOTE(review): two chunks with identical normalized text inside
            // the same version therefore get the same chunk_id — confirm that
            // downstream consumers tolerate this.
            $normalizedForId = $this->textNormalizer->normalize($chunkText);
            $chunkId = sha1($documentId . '|' . $versionId . '|' . $normalizedForId);

            yield [
                'chunk_id' => $chunkId,
                'document_id' => $documentId,
                'version_id' => $versionId,
                'chunk_index' => $index++,
                'text' => $chunkText,
                // Checksum is taken over the emitted (trimmed, non-normalized) text.
                'checksum' => sha1($chunkText),
                'metadata' => $metadata,
            ];
        }
    }

    /**
     * Global reindex: yields the chunk records of every version returned by
     * the repository's iterateActiveVersions().
     *
     * @return iterable<array<string,mixed>>
     */
    public function buildAllActiveChunkRecords(): iterable
    {
        foreach ($this->versionRepo->iterateActiveVersions() as $version) {
            yield from $this->buildChunkRecords($version);
        }
    }

    /**
     * Collects descriptive metadata for a version; entries whose value is
     * null or an empty string are dropped.
     *
     * @return array<string,mixed>
     */
    private function buildMetadata(DocumentVersion $version): array
    {
        $doc = $version->getDocument();

        // Prefer getTitle(), fall back to getName(); null if neither exists.
        $title = match (true) {
            method_exists($doc, 'getTitle') => $doc->getTitle(),
            method_exists($doc, 'getName') => $doc->getName(),
            default => null,
        };

        $versionNumber = method_exists($version, 'getVersionNumber')
            ? $version->getVersionNumber()
            : null;

        return array_filter(
            [
                'document_title' => $title,
                'version_number' => $versionNumber,
                'file_path' => $version->getFilePath(),
            ],
            static fn (mixed $value): bool => $value !== null && $value !== '',
        );
    }
}