cleanup and harden document and ingest service
This commit is contained in:
@@ -9,6 +9,7 @@ use App\Index\IndexMetaManager;
|
||||
use App\Knowledge\ChunkManager;
|
||||
use App\Knowledge\Ingest\KnowledgeIngestService;
|
||||
use App\Vector\VectorIndexBuilder;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
@@ -23,91 +24,118 @@ final readonly class IngestFlow
|
||||
private VectorIndexBuilder $vectorBuilder,
|
||||
private IndexMetaManager $metaManager,
|
||||
private LoggerInterface $logger,
|
||||
) {
|
||||
}
|
||||
private EntityManagerInterface $em,
|
||||
) {}
|
||||
|
||||
// =========================================================
|
||||
// DOCUMENT INGEST
|
||||
// =========================================================
|
||||
|
||||
public function ingestDocumentVersion(DocumentVersion $version): void
|
||||
{
|
||||
$this->metaManager->validateAgainstCurrent();
|
||||
|
||||
$this->chunkManager->compactByDocument($version->getDocument()->getId());
|
||||
$version->setIngestStatus(DocumentVersion::INGEST_RUNNING);
|
||||
$this->em->flush();
|
||||
|
||||
$existing = $this->chunkManager->countAllChunks();
|
||||
try {
|
||||
|
||||
$recordsIterable = $this->knowledgeIngestService->buildChunkRecords($version);
|
||||
$records = is_array($recordsIterable)
|
||||
? $recordsIterable
|
||||
: iterator_to_array($recordsIterable, false);
|
||||
// Entfernt alte Chunks dieses Dokuments
|
||||
$this->chunkManager->compactByDocument($version->getDocument()->getId());
|
||||
|
||||
$incoming = count($records);
|
||||
$total = $existing + $incoming;
|
||||
$existing = $this->chunkManager->countAllChunks();
|
||||
|
||||
if ($total >= self::CHUNK_LIMIT_WARN) {
|
||||
$this->logger->warning('RAG chunk count approaching limit.', [
|
||||
'existing' => $existing,
|
||||
'incoming' => $incoming,
|
||||
'total' => $total,
|
||||
'warn_at' => self::CHUNK_LIMIT_WARN,
|
||||
'hard_cap' => self::CHUNK_LIMIT_HARD,
|
||||
'document_id' => $version->getDocument()->getId()->toRfc4122(),
|
||||
'version_id' => $version->getId()->toRfc4122(),
|
||||
]);
|
||||
$records = iterator_to_array(
|
||||
$this->knowledgeIngestService->buildChunkRecords($version),
|
||||
false
|
||||
);
|
||||
|
||||
$incoming = count($records);
|
||||
$total = $existing + $incoming;
|
||||
|
||||
if ($total >= self::CHUNK_LIMIT_WARN) {
|
||||
$this->logger->warning('Chunk count approaching limit.', [
|
||||
'existing' => $existing,
|
||||
'incoming' => $incoming,
|
||||
'total' => $total,
|
||||
]);
|
||||
}
|
||||
|
||||
if ($total > self::CHUNK_LIMIT_HARD) {
|
||||
throw new \RuntimeException('Chunk limit exceeded.');
|
||||
}
|
||||
|
||||
$this->chunkManager->appendChunks($records);
|
||||
|
||||
$this->rebuildIndex(false);
|
||||
|
||||
$version->setIngestStatus(DocumentVersion::INGEST_INDEXED);
|
||||
$this->em->flush();
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
|
||||
$version->setIngestStatus(DocumentVersion::INGEST_FAILED);
|
||||
$this->em->flush();
|
||||
throw $e;
|
||||
}
|
||||
|
||||
if ($total > self::CHUNK_LIMIT_HARD) {
|
||||
throw new \RuntimeException(sprintf(
|
||||
'Chunk limit reached: %d existing + %d incoming = %d (hard cap: %d). Reduce knowledge base or move to a scaled vector setup (IVF/HNSW/GPU/sharding).',
|
||||
$existing,
|
||||
$incoming,
|
||||
$total,
|
||||
self::CHUNK_LIMIT_HARD
|
||||
));
|
||||
}
|
||||
|
||||
$this->chunkManager->appendChunks($records);
|
||||
$this->vectorBuilder->rebuildFromNdjson();
|
||||
|
||||
$this->updateChuckCount();
|
||||
}
|
||||
|
||||
/**
|
||||
* HARD DELETE FLOW
|
||||
*
|
||||
* Removes all chunks belonging to a document from index.ndjson
|
||||
* and rebuilds the vector index deterministically.
|
||||
*/
|
||||
// =========================================================
|
||||
// GLOBAL REINDEX
|
||||
// =========================================================
|
||||
|
||||
/**
 * Runs a full reindex.
 *
 * Passes the records returned by
 * KnowledgeIngestService::buildAllActiveChunkRecords() to
 * ChunkManager::rewriteAll(), then invokes the central rebuild step
 * with the global flag set.
 */
public function globalReindex(): void
{
    $this->chunkManager->rewriteAll(
        $this->knowledgeIngestService->buildAllActiveChunkRecords()
    );

    // true marks this rebuild as a global reindex.
    $this->rebuildIndex(true);
}
|
||||
|
||||
// =========================================================
|
||||
// DELETE FLOW
|
||||
// =========================================================
|
||||
|
||||
public function deleteDocument(Uuid $documentId): void
|
||||
{
|
||||
$this->metaManager->validateAgainstCurrent();
|
||||
|
||||
$this->logger->info('Deleting document from RAG index.', [
|
||||
'document_id' => $documentId->toRfc4122(),
|
||||
]);
|
||||
$document = $this->em
|
||||
->getRepository(\App\Entity\Document::class)
|
||||
->find($documentId);
|
||||
|
||||
// Remove chunks for this document
|
||||
if (!$document) {
|
||||
throw new \RuntimeException('Document not found.');
|
||||
}
|
||||
|
||||
// 1) NDJSON bereinigen
|
||||
$this->chunkManager->compactByDocument($documentId);
|
||||
|
||||
// Rebuild vector index from updated NDJSON
|
||||
$this->vectorBuilder->rebuildFromNdjson();
|
||||
// 2) Vector neu bauen
|
||||
$this->rebuildIndex(false);
|
||||
|
||||
// Update runtime stats
|
||||
$this->updateChuckCount();
|
||||
// 3) DB Delete (nach rebuild)
|
||||
$this->em->remove($document);
|
||||
$this->em->flush();
|
||||
}
|
||||
|
||||
public function globalReindex(): void
|
||||
// =========================================================
|
||||
// CENTRAL REBUILD
|
||||
// =========================================================
|
||||
|
||||
private function rebuildIndex(bool $isGlobal): void
|
||||
{
|
||||
$allRecords = $this->knowledgeIngestService->buildAllActiveChunkRecords();
|
||||
|
||||
$this->chunkManager->rewriteAll($allRecords);
|
||||
|
||||
$this->vectorBuilder->rebuildFromNdjson();
|
||||
|
||||
$this->metaManager->writeMetaForGlobalReindex();
|
||||
if ($isGlobal) {
|
||||
$this->metaManager->writeMetaForGlobalReindex();
|
||||
}
|
||||
|
||||
$this->updateChuckCount();
|
||||
$this->updateChunkCount();
|
||||
}
|
||||
|
||||
private function updateChuckCount(): void
|
||||
private function updateChunkCount(): void
|
||||
{
|
||||
$chunkCount = $this->chunkManager->countAllChunks();
|
||||
$this->metaManager->updateRuntimeStats($chunkCount);
|
||||
|
||||
Reference in New Issue
Block a user