> */
    public function buildChunkRecords(DocumentVersion $version): iterable
    {
        // 1) Load the raw text of the file this version points to.
        $text = $this->loader->load($version->getFilePath());

        // 2) Deterministic preprocessing (new step) before the text is split.
        $text = $this->documentSanitizer->sanitize($text);

        // 3) Chunking.
        $chunks = $this->chunker->chunk($text);

        $doc = $version->getDocument();
        $documentId = $doc->getId()->toRfc4122();
        $versionId = $version->getId()->toRfc4122();
        $title = trim((string) $doc->getTitle());

        // The metadata depends only on the version, not on the individual
        // chunk, so it is built once here instead of once per chunk.
        $metadata = $this->buildMetadata($version);

        // Explicit counter: chunk_index always starts at 0 and increases by one
        // per yielded record, regardless of the keys $chunks iterates with.
        $index = 0;

        foreach ($chunks as $chunkText) {
            // Prefix the chunk with a title heading unless it already starts
            // with the title (or the document has no title).
            if ($title !== '' && !str_starts_with($chunkText, $title)) {
                $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
            }

            $chunkText = trim($chunkText);

            // Deterministic chunk ID derived from document ID, version ID and
            // the normalized chunk text.
            // NOTE(review): the chunk index is not part of the hash, so two
            // chunks of the same version with identical normalized text get
            // the same chunk_id - confirm that consumers tolerate this.
            $normalizedForId = $this->textNormalizer->normalize($chunkText);
            $chunkId = sha1($documentId . '|' . $versionId . '|' . $normalizedForId);

            yield [
                'chunk_id' => $chunkId,
                'document_id' => $documentId,
                'version_id' => $versionId,
                'chunk_index' => $index++,
                'text' => $chunkText,
                // Checksum of the exact (non-normalized) text that is emitted.
                'checksum' => sha1($chunkText),
                'metadata' => $metadata,
            ];
        }
    }

    /**
     * Global reindex: yields the chunk records of every version returned by
     * the version repository's iterateActiveVersions().
     *
     * The records of each version are produced by buildChunkRecords() and are
     * forwarded lazily via "yield from". Because "yield from" keeps the keys of
     * the inner generators, keys repeat across versions; callers that collect
     * the result with iterator_to_array() should pass preserve_keys: false.
     */
    public function buildAllActiveChunkRecords(): iterable
    {
        foreach ($this->versionRepo->iterateActiveVersions() as $version) {
            yield from $this->buildChunkRecords($version);
        }
    }

    /**
     * Collects the descriptive metadata attached to every chunk of a version.
     *
     * Possible keys are "document_title", "version_number" and "file_path";
     * entries whose value is null or an empty string are left out.
     *
     * @return array<string, mixed>
     */
    private function buildMetadata(DocumentVersion $version): array
    {
        $doc = $version->getDocument();

        // Prefer getTitle(); fall back to getName() when the document object
        // does not provide a title accessor.
        $title = null;
        if (method_exists($doc, 'getTitle')) {
            $title = $doc->getTitle();
        } elseif (method_exists($doc, 'getName')) {
            $title = $doc->getName();
        }

        // The version number is optional: only read it when the accessor exists.
        $versionNumber = method_exists($version, 'getVersionNumber')
            ? $version->getVersionNumber()
            : null;

        // Drop only null and '' so that other falsy values (e.g. 0) survive.
        return array_filter(
            [
                'document_title' => $title,
                'version_number' => $versionNumber,
                'file_path' => $version->getFilePath(),
            ],
            static fn (mixed $value): bool => $value !== null && $value !== '',
        );
    }
}