cleanup code

This commit is contained in:
team2
2026-02-27 15:49:01 +01:00
parent a5a6f466f3
commit 44be40a24d
4 changed files with 14 additions and 55 deletions

View File

@@ -14,11 +14,6 @@ final readonly class ChunkWriteService
private ChunkManager $chunkManager, private ChunkManager $chunkManager,
) {} ) {}
/**
 * Returns the index path reported by the underlying ChunkManager.
 *
 * Pure delegation: the value of ChunkManager::getIndexPath() is passed
 * through unchanged.
 */
public function getIndexPath(): string
{
return $this->chunkManager->getIndexPath();
}
public function countAllChunks(): int public function countAllChunks(): int
{ {
return $this->chunkManager->countAllChunks(); return $this->chunkManager->countAllChunks();
@@ -37,29 +32,6 @@ final readonly class ChunkWriteService
$this->chunkManager->appendChunks($chunks); $this->chunkManager->appendChunks($chunks);
} }
/**
 * Local ingest for a single DocumentVersion.
 *
 * Steps:
 *  1. Resolve the ID of the document this version belongs to.
 *  2. Remove that document's existing chunks (ChunkManager::compactByDocument()).
 *  3. Append the new chunks (ChunkManager::appendChunks()).
 *
 * @param iterable<array<string,mixed>> $chunks
 *
 * @throws \RuntimeException when the document ID is not a Uuid instance
 */
public function writeForDocumentVersion(DocumentVersion $version, iterable $chunks): void
{
    $document = $version->getDocument();
    $id = $document->getId();

    if ($id instanceof Uuid) {
        $manager = $this->chunkManager;

        // Order matters: old chunks are dropped before the new ones are appended.
        $manager->compactByDocument($id);
        $manager->appendChunks($chunks);

        return;
    }

    throw new \RuntimeException('Document ID must be a Uuid instance');
}
/** /**
* Vollständiger Rewrite des NDJSON-Index (Global Reindex). * Vollständiger Rewrite des NDJSON-Index (Global Reindex).
* *

View File

@@ -17,11 +17,6 @@ final class ChunkManager
$this->indexPath = rtrim($projectDir, '/') . $relativeIndexPath; $this->indexPath = rtrim($projectDir, '/') . $relativeIndexPath;
} }
/**
 * Returns the value of the indexPath property.
 *
 * NOTE(review): the assignment visible just above this method builds the
 * value as rtrim($projectDir, '/') . $relativeIndexPath — presumably in the
 * constructor; confirm against the full class.
 */
public function getIndexPath(): string
{
return $this->indexPath;
}
// ============================================================ // ============================================================
// COUNT (Streaming, robust) // COUNT (Streaming, robust)
// ============================================================ // ============================================================

View File

@@ -6,15 +6,16 @@ declare(strict_types=1);
namespace App\Knowledge\Ingest; namespace App\Knowledge\Ingest;
use App\Index\IndexConfigurationProvider; use App\Index\IndexConfigurationProvider;
use App\Knowledge\Text\TextNormalizer;
final class SimpleChunker final readonly class SimpleChunker
{ {
private IndexConfigurationProvider $configurationProvider;
public function __construct( public function __construct(
IndexConfigurationProvider $configurationProvider private IndexConfigurationProvider $configurationProvider,
) { private TextNormalizer $textNormalizer
$this->configurationProvider = $configurationProvider; )
{
} }
/** @return string[] */ /** @return string[] */
@@ -25,7 +26,7 @@ final class SimpleChunker
$maxWords = $config->getChunkSize(); $maxWords = $config->getChunkSize();
$overlapWords = $config->getChunkOverlap(); $overlapWords = $config->getChunkOverlap();
$text = $this->normalize($text); $text = $this->textNormalizer->normalize($text);
if ($text === '') { if ($text === '') {
return []; return [];
} }
@@ -60,7 +61,7 @@ final class SimpleChunker
$wordEnd = min($wordPos + $maxWords, $totalWords); $wordEnd = min($wordPos + $maxWords, $totalWords);
$tokenStart = $wordTokenIndexes[$wordPos]; $tokenStart = $wordTokenIndexes[$wordPos];
$tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1; $tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
$tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd); $tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);
@@ -84,15 +85,6 @@ final class SimpleChunker
return $this->dedupe($chunks); return $this->dedupe($chunks);
} }
/**
 * Normalizes whitespace in a text before chunking.
 *
 * - Converts CRLF and lone CR line endings to LF.
 * - Collapses every run of spaces/tabs into a single space.
 * - Collapses three or more consecutive newlines into exactly two.
 * - Trims leading and trailing whitespace.
 *
 * @throws \RuntimeException when a regex step fails (e.g. the input is not
 *                           valid UTF-8, which the "u" modifier rejects)
 */
private function normalize(string $text): string
{
    $text = str_replace(["\r\n", "\r"], "\n", $text);

    // Applied in order: pattern => replacement.
    $steps = [
        '/[ \t]+/u' => ' ',
        '/\n{3,}/u' => "\n\n",
    ];

    foreach ($steps as $pattern => $replacement) {
        $result = preg_replace($pattern, $replacement, $text);

        // preg_replace() returns null on PCRE errors; passing that null on
        // would raise a TypeError under strict_types or silently yield ''.
        if ($result === null) {
            throw new \RuntimeException(
                'Text normalization failed: ' . preg_last_error_msg()
            );
        }

        $text = $result;
    }

    return trim($text);
}
private function adjustCutToBoundary(array $tokens, int $start, int $end): int private function adjustCutToBoundary(array $tokens, int $start, int $end): int
{ {
$startToken = $tokens[$start] ?? ''; $startToken = $tokens[$start] ?? '';
@@ -122,7 +114,7 @@ final class SimpleChunker
private function dedupe(array $chunks): array private function dedupe(array $chunks): array
{ {
$seen = []; $seen = [];
$out = []; $out = [];
foreach ($chunks as $chunk) { foreach ($chunks as $chunk) {
$key = mb_strtolower( $key = mb_strtolower(

View File

@@ -9,13 +9,13 @@ use App\Ingest\IngestFlow;
use Doctrine\ORM\EntityManagerInterface; use Doctrine\ORM\EntityManagerInterface;
use Symfony\Component\Uid\Uuid; use Symfony\Component\Uid\Uuid;
final class IngestOrchestrator final readonly class IngestOrchestrator
{ {
public function __construct( public function __construct(
private readonly LockService $lockService, private LockService $lockService,
private readonly IngestJobService $jobService, private IngestJobService $jobService,
private readonly EntityManagerInterface $em, private EntityManagerInterface $em,
private readonly IngestFlow $ingestFlow, private IngestFlow $ingestFlow,
) )
{ {
} }