stash light

This commit is contained in:
team 1
2026-02-12 10:03:52 +01:00
parent 5b650a8f28
commit 0bb0c0b42f
51 changed files with 6864 additions and 72 deletions

View File

@@ -1,39 +1,93 @@
<?php
// src/Knowledge/Ingest/KnowledgeIngestService.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
use App\Entity\DocumentVersion;
use App\Repository\DocumentVersionRepository;
use Symfony\Component\Uid\Uuid;
final class KnowledgeIngestService
{
/**
 * Receives the service's collaborators as promoted private constructor properties.
 *
 * NOTE(review): two parameter lists (and two body openers) appear back to back here —
 * one with $writer/$indexWriter, one with $versionRepo. This looks like removed and
 * added diff lines rendered together; confirm which list is the current one.
 */
public function __construct(
private DocumentLoader $loader,
private SimpleChunker $chunker,
private ChunkWriter $writer,
private ChunkIndexWriter $indexWriter,
)
{
private DocumentLoader $loader,
private SimpleChunker $chunker,
private DocumentVersionRepository $versionRepo,
) {
}
/** @return string[] written chunk filenames */
public function ingestFile(string $path, bool $optimize = false): array
/**
 * Local ingest: produces the records (intended for NDJSON output) for exactly this version.
 *
 * Each yielded record carries a fresh random chunk id (UUID v4), the document id and
 * version id in RFC 4122 form, a zero-based chunk index, the chunk text, the SHA-1 of
 * that text, and the metadata array returned by buildMetadata().
 *
 * NOTE(review): the body below references $path, $optimize, $sourceName, $sourceHash,
 * $this->indexWriter and $this->writer, none of which belong to this signature. Together
 * with the stray ingestFile() signature above, this looks like removed and added diff
 * lines rendered together — confirm against the committed file.
 *
 * @return iterable<array<string,mixed>>
 */
public function buildChunkRecords(DocumentVersion $version): iterable
{
$text = $this->loader->load($path);
if ($optimize) {
$text = preg_replace("/\n{3,}/", "\n\n", $text);
$text = preg_replace("/[ \t]+$/m", "", $text);
}
$sourceHash = sha1($text);
$sourceName = basename($path);
if ($this->indexWriter->hasSourceHash($sourceName, $sourceHash)) {
return [];
}
// Load the version's file and normalise its whitespace before chunking.
$text = $this->loader->load($version->getFilePath());
$text = $this->optimizeText($text);
$chunks = $this->chunker->chunk($text);
return $this->writer->write($sourceName, $chunks, $sourceHash);
// Resolve both ids once; they are identical for every chunk of this version.
$documentId = $version->getDocument()->getId()->toRfc4122();
$versionId = $version->getId()->toRfc4122();
$index = 0;
foreach ($chunks as $chunkText) {
yield [
// A new random id is generated on every run, so chunk ids differ between runs.
'chunk_id' => Uuid::v4()->toRfc4122(),
'document_id' => $documentId,
'version_id' => $versionId,
'chunk_index' => $index++,
'text' => $chunkText,
'checksum' => sha1($chunkText),
// NOTE(review): buildMetadata() reads only $version, so this call is
// loop-invariant and could be computed once before the loop.
'metadata' => $this->buildMetadata($version),
];
}
}
/**
 * Global reindex: streams the chunk records of every active document version.
 *
 * Versions are pulled one at a time from the repository iterator and each
 * version's records are passed straight through, so this method itself
 * buffers nothing. Overall memory use still depends on how
 * iterateActiveVersions() and the document loader behave.
 *
 * Records are re-yielded individually instead of using `yield from`:
 * `yield from` preserves the inner generator's keys, and buildChunkRecords()
 * restarts its keys at 0 for every version. The combined stream would then
 * contain duplicate keys, and a caller using iterator_to_array() with its
 * default key preservation would silently lose records. Yielding each
 * record here gives the stream unique, sequential integer keys.
 *
 * @return iterable<array<string,mixed>>
 */
public function buildAllActiveChunkRecords(): iterable
{
    foreach ($this->versionRepo->iterateActiveVersions() as $version) {
        foreach ($this->buildChunkRecords($version) as $record) {
            // Plain yield: the outer generator assigns the next free integer key.
            yield $record;
        }
    }
}
/**
 * Normalises whitespace in loaded document text before it is chunked.
 *
 * Trailing spaces and tabs are stripped from every line first, then runs of
 * three or more newlines are collapsed to exactly two. The order matters:
 * stripping first turns whitespace-only lines into empty ones, so they are
 * included when blank lines are collapsed.
 *
 * @throws \RuntimeException when PCRE fails and preg_replace() returns null
 */
private function optimizeText(string $text): string
{
    // Applied in order: strip trailing whitespace, then collapse blank lines.
    $steps = [
        "/[ \t]+$/m" => "",
        "/\n{3,}/" => "\n\n",
    ];

    foreach ($steps as $pattern => $replacement) {
        $result = preg_replace($pattern, $replacement, $text);

        // preg_replace() signals failure (e.g. backtrack limit) with null;
        // report it explicitly instead of letting null reach the return type.
        if ($result === null) {
            throw new \RuntimeException(
                sprintf('Whitespace normalisation failed: %s', preg_last_error_msg()),
            );
        }

        $text = $result;
    }

    return $text;
}
/**
 * Assembles the descriptive metadata attached to each chunk record.
 *
 * The document title comes from getTitle() when the document has that method,
 * otherwise from getName() when it has that one; the version number is read
 * only when the version exposes getVersionNumber(). Entries whose value is
 * null or an empty string are left out of the result.
 *
 * @return array<string,mixed>
 */
private function buildMetadata(DocumentVersion $version): array
{
    $document = $version->getDocument();

    // The first accessor that exists on the document supplies the title.
    $title = match (true) {
        method_exists($document, 'getTitle') => $document->getTitle(),
        method_exists($document, 'getName') => $document->getName(),
        default => null,
    };

    $versionNumber = null;
    if (method_exists($version, 'getVersionNumber')) {
        $versionNumber = $version->getVersionNumber();
    }

    $candidates = [
        'document_title' => $title,
        'version_number' => $versionNumber,
        'file_path' => $version->getFilePath(),
    ];

    // Keep only entries that carry a value; key order is preserved.
    $metadata = [];
    foreach ($candidates as $key => $value) {
        if ($value === null || $value === '') {
            continue;
        }
        $metadata[$key] = $value;
    }

    return $metadata;
}
}