stash light
This commit is contained in:
@@ -1,39 +1,93 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/KnowledgeIngestService.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
use App\Entity\DocumentVersion;
|
||||
use App\Repository\DocumentVersionRepository;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
final class KnowledgeIngestService
|
||||
{
|
||||
/**
 * Wires the collaborators used to turn document versions into chunk records.
 *
 * @param DocumentLoader            $loader      Loads the text of a file given its path.
 * @param SimpleChunker             $chunker     Splits a text into chunks.
 * @param DocumentVersionRepository $versionRepo Source of the active document versions for the global reindex.
 */
public function __construct(
    private DocumentLoader $loader,
    private SimpleChunker $chunker,
    private DocumentVersionRepository $versionRepo,
) {
}
|
||||
|
||||
/**
 * Local ingest: builds the NDJSON-ready chunk records for exactly this version.
 *
 * Loads the file behind the version, normalizes its whitespace, splits it into
 * chunks and yields one record per chunk. The method is a generator, so nothing
 * is loaded or chunked until the caller starts iterating.
 *
 * Each record carries a freshly generated random (v4) chunk id, so re-running
 * the ingest for the same version yields different chunk ids.
 *
 * @param DocumentVersion $version The version whose file is ingested.
 *
 * @return iterable<array<string,mixed>>
 */
public function buildChunkRecords(DocumentVersion $version): iterable
{
    $text = $this->optimizeText($this->loader->load($version->getFilePath()));
    $chunks = $this->chunker->chunk($text);

    $documentId = $version->getDocument()->getId()->toRfc4122();
    $versionId = $version->getId()->toRfc4122();

    // Metadata depends only on the version, so build it once instead of once per chunk.
    $metadata = $this->buildMetadata($version);

    // Explicit counter: chunk_index must be 0-based and gapless regardless of
    // whatever keys the chunker happens to return.
    $index = 0;

    foreach ($chunks as $chunkText) {
        yield [
            'chunk_id' => Uuid::v4()->toRfc4122(),
            'document_id' => $documentId,
            'version_id' => $versionId,
            'chunk_index' => $index++,
            'text' => $chunkText,
            'checksum' => sha1($chunkText),
            'metadata' => $metadata,
        ];
    }
}
|
||||
|
||||
/**
 * Global reindex: streams the chunk records of all active versions.
 *
 * Versions are taken from DocumentVersionRepository::iterateActiveVersions()
 * and each one is expanded through buildChunkRecords(), which is a generator,
 * so records are handed to the caller one at a time rather than collected in
 * an array. NOTE(review): overall memory use also depends on
 * iterateActiveVersions() and the chunker being lazy - confirm.
 *
 * Records are re-yielded individually instead of delegating with `yield from`:
 * `yield from` preserves the inner generator's keys, and every per-version
 * generator starts again at key 0, so iterator_to_array() on the combined
 * result would silently overwrite the records of earlier versions. Yielding
 * each record here gives the outer generator its own sequential keys.
 *
 * @return iterable<array<string,mixed>>
 */
public function buildAllActiveChunkRecords(): iterable
{
    foreach ($this->versionRepo->iterateActiveVersions() as $version) {
        foreach ($this->buildChunkRecords($version) as $record) {
            yield $record;
        }
    }
}
|
||||
|
||||
/**
 * Normalizes whitespace in a document text before it is chunked.
 *
 * Two passes:
 *  1. strip trailing spaces and tabs from every line,
 *  2. collapse runs of three or more newlines into a single blank line.
 *
 * Trailing whitespace is stripped first so that whitespace-only lines become
 * empty lines and are included in the newline collapse.
 *
 * preg_replace() returns null when PCRE reports an error; in that case the
 * text from the previous step is kept, so the method always returns a string
 * instead of violating its return type under strict_types.
 *
 * @param string $text Raw document text.
 *
 * @return string The whitespace-normalized text.
 */
private function optimizeText(string $text): string
{
    $text = preg_replace("/[ \t]+$/m", '', $text) ?? $text;

    return preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;
}
|
||||
|
||||
/**
 * Collects the optional metadata attached to every chunk record of a version.
 *
 * Possible keys are `document_title`, `version_number` and `file_path`.
 * Entries whose value is null or an empty string are left out.
 *
 * @param DocumentVersion $version The version being described.
 *
 * @return array<string,mixed>
 */
private function buildMetadata(DocumentVersion $version): array
{
    $document = $version->getDocument();

    // Title lookup: getTitle() is used whenever the method exists; getName()
    // is only consulted when there is no getTitle() at all.
    $documentTitle = match (true) {
        method_exists($document, 'getTitle') => $document->getTitle(),
        method_exists($document, 'getName') => $document->getName(),
        default => null,
    };

    $versionNumber = null;
    if (method_exists($version, 'getVersionNumber')) {
        $versionNumber = $version->getVersionNumber();
    }

    $candidates = [
        'document_title' => $documentTitle,
        'version_number' => $versionNumber,
        'file_path' => $version->getFilePath(),
    ];

    // Keep only the entries that actually carry a value.
    $metadata = [];
    foreach ($candidates as $field => $value) {
        if ($value === null || $value === '') {
            continue;
        }

        $metadata[$field] = $value;
    }

    return $metadata;
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user