109 lines
3.1 KiB
PHP
109 lines
3.1 KiB
PHP
<?php
|
||
|
||
declare(strict_types=1);
|
||
|
||
namespace App\Knowledge\Ingest;
|
||
|
||
use App\Entity\DocumentVersion;
|
||
use App\Knowledge\Text\TextNormalizer;
|
||
use App\Repository\DocumentVersionRepository;
|
||
use App\Ingest\DocumentSanitizer;
|
||
|
||
/**
 * Builds chunk records for knowledge documents.
 *
 * For a document version the service loads the file text, passes it through the
 * sanitizer, splits it with the chunker and yields one associative-array record
 * per chunk. Serialising those records (e.g. to NDJSON) is left to the caller.
 */
final readonly class KnowledgeIngestService
{
    public function __construct(
        private DocumentLoader $loader,
        private SimpleChunker $chunker,
        private DocumentVersionRepository $versionRepo,
        private TextNormalizer $textNormalizer,
        private DocumentSanitizer $documentSanitizer,
    ) {
    }

    /**
     * Local ingest: yields the chunk records of a single document version.
     *
     * Every record contains `chunk_id`, `document_id`, `version_id`,
     * `chunk_index`, `text`, `checksum` and `metadata`. The chunk id is the SHA-1
     * of document id, version id and the normalized chunk text, so the same input
     * always produces the same id.
     *
     * @return iterable<array<string,mixed>>
     */
    public function buildChunkRecords(DocumentVersion $version): iterable
    {
        // 1) Load the raw text of the version's file.
        $text = $this->loader->load($version->getFilePath());

        // 2) Preprocess the text before it is split.
        $text = $this->documentSanitizer->sanitize($text);

        // 3) Split into chunks.
        $chunks = $this->chunker->chunk($text);

        $doc = $version->getDocument();

        // NOTE(review): assumes both ids are non-null here (entities already
        // persisted) — confirm against the entity definitions.
        $documentId = $doc->getId()->toRfc4122();
        $versionId = $version->getId()->toRfc4122();

        $title = trim((string) $doc->getTitle());

        // The metadata depends only on the version, not on the chunk, so it is
        // built once instead of once per yielded record.
        $metadata = $this->buildMetadata($version);

        $index = 0;

        foreach ($chunks as $chunkText) {
            // Prefix the document title unless the chunk already begins with it.
            if ($title !== '' && !str_starts_with($chunkText, $title)) {
                $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
            }

            $chunkText = trim($chunkText);

            // The id is derived from the normalized text, while the checksum
            // below is taken over the exact text that is emitted.
            // NOTE(review): the chunk index is not part of the hash, so two
            // chunks of one version with the same normalized text share an id.
            $normalizedForId = $this->textNormalizer->normalize($chunkText);
            $chunkId = sha1($documentId . '|' . $versionId . '|' . $normalizedForId);

            yield [
                'chunk_id' => $chunkId,
                'document_id' => $documentId,
                'version_id' => $versionId,
                'chunk_index' => $index++,
                'text' => $chunkText,
                'checksum' => sha1($chunkText),
                'metadata' => $metadata,
            ];
        }
    }

    /**
     * Global reindex: yields the chunk records of every active version.
     *
     * Records are re-yielded one by one instead of being delegated with
     * `yield from`. `yield from` keeps the keys of the inner generator, and each
     * per-version generator starts at key 0 again; collecting the result with
     * iterator_to_array() (which preserves keys by default) would then overwrite
     * the records of earlier versions. Re-yielding gives continuous keys.
     *
     * @return iterable<array<string,mixed>>
     */
    public function buildAllActiveChunkRecords(): iterable
    {
        foreach ($this->versionRepo->iterateActiveVersions() as $version) {
            foreach ($this->buildChunkRecords($version) as $record) {
                yield $record;
            }
        }
    }

    /**
     * Collects the descriptive metadata attached to every chunk of a version.
     *
     * Entries whose value is null or an empty string are removed, so the result
     * may contain any subset of `document_title`, `version_number`, `file_path`.
     *
     * @return array<string,mixed>
     */
    private function buildMetadata(DocumentVersion $version): array
    {
        $doc = $version->getDocument();

        // Prefer getTitle(), fall back to getName(), otherwise no title at all.
        $title = match (true) {
            method_exists($doc, 'getTitle') => $doc->getTitle(),
            method_exists($doc, 'getName') => $doc->getName(),
            default => null,
        };

        $versionNumber = method_exists($version, 'getVersionNumber')
            ? $version->getVersionNumber()
            : null;

        return array_filter(
            [
                'document_title' => $title,
                'version_number' => $versionNumber,
                'file_path' => $version->getFilePath(),
            ],
            static fn (mixed $value): bool => $value !== null && $value !== '',
        );
    }
}