Clean up and harden document and ingest service

This commit is contained in:
team 1
2026-02-17 15:42:09 +01:00
parent 2981716c3d
commit 0b96ce6188
5 changed files with 236 additions and 240 deletions

View File

@@ -9,6 +9,7 @@ use App\Index\IndexMetaManager;
use App\Knowledge\ChunkManager;
use App\Knowledge\Ingest\KnowledgeIngestService;
use App\Vector\VectorIndexBuilder;
use Doctrine\ORM\EntityManagerInterface;
use Psr\Log\LoggerInterface;
use Symfony\Component\Uid\Uuid;
@@ -23,91 +24,118 @@ final readonly class IngestFlow
private VectorIndexBuilder $vectorBuilder,
private IndexMetaManager $metaManager,
private LoggerInterface $logger,
) {
}
private EntityManagerInterface $em,
) {}
// =========================================================
// DOCUMENT INGEST
// =========================================================
public function ingestDocumentVersion(DocumentVersion $version): void
{
$this->metaManager->validateAgainstCurrent();
$this->chunkManager->compactByDocument($version->getDocument()->getId());
$version->setIngestStatus(DocumentVersion::INGEST_RUNNING);
$this->em->flush();
$existing = $this->chunkManager->countAllChunks();
try {
$recordsIterable = $this->knowledgeIngestService->buildChunkRecords($version);
$records = is_array($recordsIterable)
? $recordsIterable
: iterator_to_array($recordsIterable, false);
// Remove this document's old chunks
$this->chunkManager->compactByDocument($version->getDocument()->getId());
$incoming = count($records);
$total = $existing + $incoming;
$existing = $this->chunkManager->countAllChunks();
if ($total >= self::CHUNK_LIMIT_WARN) {
$this->logger->warning('RAG chunk count approaching limit.', [
'existing' => $existing,
'incoming' => $incoming,
'total' => $total,
'warn_at' => self::CHUNK_LIMIT_WARN,
'hard_cap' => self::CHUNK_LIMIT_HARD,
'document_id' => $version->getDocument()->getId()->toRfc4122(),
'version_id' => $version->getId()->toRfc4122(),
]);
$records = iterator_to_array(
$this->knowledgeIngestService->buildChunkRecords($version),
false
);
$incoming = count($records);
$total = $existing + $incoming;
if ($total >= self::CHUNK_LIMIT_WARN) {
$this->logger->warning('Chunk count approaching limit.', [
'existing' => $existing,
'incoming' => $incoming,
'total' => $total,
]);
}
if ($total > self::CHUNK_LIMIT_HARD) {
throw new \RuntimeException('Chunk limit exceeded.');
}
$this->chunkManager->appendChunks($records);
$this->rebuildIndex(false);
$version->setIngestStatus(DocumentVersion::INGEST_INDEXED);
$this->em->flush();
} catch (\Throwable $e) {
$version->setIngestStatus(DocumentVersion::INGEST_FAILED);
$this->em->flush();
throw $e;
}
if ($total > self::CHUNK_LIMIT_HARD) {
throw new \RuntimeException(sprintf(
'Chunk limit reached: %d existing + %d incoming = %d (hard cap: %d). Reduce knowledge base or move to a scaled vector setup (IVF/HNSW/GPU/sharding).',
$existing,
$incoming,
$total,
self::CHUNK_LIMIT_HARD
));
}
$this->chunkManager->appendChunks($records);
$this->vectorBuilder->rebuildFromNdjson();
$this->updateChuckCount();
}
/**
* HARD DELETE FLOW
*
* Removes all chunks belonging to a document from index.ndjson
* and rebuilds the vector index deterministically.
*/
// =========================================================
// GLOBAL REINDEX
// =========================================================
/**
 * Rewrites the whole chunk store from all active chunk records and then
 * runs the index rebuild in global mode.
 */
public function globalReindex(): void
{
    // Replace the stored chunks with a freshly built set of active records.
    $this->chunkManager->rewriteAll(
        $this->knowledgeIngestService->buildAllActiveChunkRecords(),
    );

    // Global mode: the rebuild is told this is a full reindex.
    $this->rebuildIndex(isGlobal: true);
}
// =========================================================
// DELETE FLOW
// =========================================================
public function deleteDocument(Uuid $documentId): void
{
$this->metaManager->validateAgainstCurrent();
$this->logger->info('Deleting document from RAG index.', [
'document_id' => $documentId->toRfc4122(),
]);
$document = $this->em
->getRepository(\App\Entity\Document::class)
->find($documentId);
// Remove chunks for this document
if (!$document) {
throw new \RuntimeException('Document not found.');
}
// 1) Clean up the NDJSON (drop this document's chunks)
$this->chunkManager->compactByDocument($documentId);
// Rebuild vector index from updated NDJSON
$this->vectorBuilder->rebuildFromNdjson();
// 2) Rebuild the vector index
$this->rebuildIndex(false);
// Update runtime stats
$this->updateChuckCount();
// 3) Delete from the database (after the rebuild)
$this->em->remove($document);
$this->em->flush();
}
public function globalReindex(): void
// =========================================================
// CENTRAL REBUILD
// =========================================================
private function rebuildIndex(bool $isGlobal): void
{
$allRecords = $this->knowledgeIngestService->buildAllActiveChunkRecords();
$this->chunkManager->rewriteAll($allRecords);
$this->vectorBuilder->rebuildFromNdjson();
$this->metaManager->writeMetaForGlobalReindex();
if ($isGlobal) {
$this->metaManager->writeMetaForGlobalReindex();
}
$this->updateChuckCount();
$this->updateChunkCount();
}
private function updateChuckCount(): void
private function updateChunkCount(): void
{
$chunkCount = $this->chunkManager->countAllChunks();
$this->metaManager->updateRuntimeStats($chunkCount);