cleanup code
This commit is contained in:
@@ -14,11 +14,6 @@ final readonly class ChunkWriteService
|
|||||||
private ChunkManager $chunkManager,
|
private ChunkManager $chunkManager,
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
public function getIndexPath(): string
|
|
||||||
{
|
|
||||||
return $this->chunkManager->getIndexPath();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function countAllChunks(): int
|
public function countAllChunks(): int
|
||||||
{
|
{
|
||||||
return $this->chunkManager->countAllChunks();
|
return $this->chunkManager->countAllChunks();
|
||||||
@@ -37,29 +32,6 @@ final readonly class ChunkWriteService
|
|||||||
$this->chunkManager->appendChunks($chunks);
|
$this->chunkManager->appendChunks($chunks);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Lokaler Ingest für eine einzelne DocumentVersion.
|
|
||||||
*
|
|
||||||
* Ablauf:
|
|
||||||
* 1. Entfernt bestehende Chunks dieses Dokuments
|
|
||||||
* 2. Appendet neue Chunks
|
|
||||||
*
|
|
||||||
* @param iterable<array<string,mixed>> $chunks
|
|
||||||
*/
|
|
||||||
public function writeForDocumentVersion(
|
|
||||||
DocumentVersion $version,
|
|
||||||
iterable $chunks
|
|
||||||
): void {
|
|
||||||
$documentId = $version->getDocument()->getId();
|
|
||||||
|
|
||||||
if (!$documentId instanceof Uuid) {
|
|
||||||
throw new \RuntimeException('Document ID must be a Uuid instance');
|
|
||||||
}
|
|
||||||
|
|
||||||
$this->chunkManager->compactByDocument($documentId);
|
|
||||||
$this->chunkManager->appendChunks($chunks);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Vollständiger Rewrite des NDJSON-Index (Global Reindex).
|
* Vollständiger Rewrite des NDJSON-Index (Global Reindex).
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -17,11 +17,6 @@ final class ChunkManager
|
|||||||
$this->indexPath = rtrim($projectDir, '/') . $relativeIndexPath;
|
$this->indexPath = rtrim($projectDir, '/') . $relativeIndexPath;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getIndexPath(): string
|
|
||||||
{
|
|
||||||
return $this->indexPath;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// COUNT (Streaming, robust)
|
// COUNT (Streaming, robust)
|
||||||
// ============================================================
|
// ============================================================
|
||||||
|
|||||||
@@ -6,15 +6,16 @@ declare(strict_types=1);
|
|||||||
namespace App\Knowledge\Ingest;
|
namespace App\Knowledge\Ingest;
|
||||||
|
|
||||||
use App\Index\IndexConfigurationProvider;
|
use App\Index\IndexConfigurationProvider;
|
||||||
|
use App\Knowledge\Text\TextNormalizer;
|
||||||
|
|
||||||
final class SimpleChunker
|
final readonly class SimpleChunker
|
||||||
{
|
{
|
||||||
private IndexConfigurationProvider $configurationProvider;
|
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
IndexConfigurationProvider $configurationProvider
|
private IndexConfigurationProvider $configurationProvider,
|
||||||
) {
|
private TextNormalizer $textNormalizer
|
||||||
$this->configurationProvider = $configurationProvider;
|
)
|
||||||
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @return string[] */
|
/** @return string[] */
|
||||||
@@ -25,7 +26,7 @@ final class SimpleChunker
|
|||||||
$maxWords = $config->getChunkSize();
|
$maxWords = $config->getChunkSize();
|
||||||
$overlapWords = $config->getChunkOverlap();
|
$overlapWords = $config->getChunkOverlap();
|
||||||
|
|
||||||
$text = $this->normalize($text);
|
$text = $this->textNormalizer->normalize($text);
|
||||||
if ($text === '') {
|
if ($text === '') {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
@@ -84,15 +85,6 @@ final class SimpleChunker
|
|||||||
return $this->dedupe($chunks);
|
return $this->dedupe($chunks);
|
||||||
}
|
}
|
||||||
|
|
||||||
private function normalize(string $text): string
|
|
||||||
{
|
|
||||||
$text = str_replace(["\r\n", "\r"], "\n", $text);
|
|
||||||
$text = preg_replace("/[ \t]+/u", " ", $text);
|
|
||||||
$text = preg_replace("/\n{3,}/u", "\n\n", $text);
|
|
||||||
|
|
||||||
return trim((string) $text);
|
|
||||||
}
|
|
||||||
|
|
||||||
private function adjustCutToBoundary(array $tokens, int $start, int $end): int
|
private function adjustCutToBoundary(array $tokens, int $start, int $end): int
|
||||||
{
|
{
|
||||||
$startToken = $tokens[$start] ?? '';
|
$startToken = $tokens[$start] ?? '';
|
||||||
|
|||||||
@@ -9,13 +9,13 @@ use App\Ingest\IngestFlow;
|
|||||||
use Doctrine\ORM\EntityManagerInterface;
|
use Doctrine\ORM\EntityManagerInterface;
|
||||||
use Symfony\Component\Uid\Uuid;
|
use Symfony\Component\Uid\Uuid;
|
||||||
|
|
||||||
final class IngestOrchestrator
|
final readonly class IngestOrchestrator
|
||||||
{
|
{
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly LockService $lockService,
|
private LockService $lockService,
|
||||||
private readonly IngestJobService $jobService,
|
private IngestJobService $jobService,
|
||||||
private readonly EntityManagerInterface $em,
|
private EntityManagerInterface $em,
|
||||||
private readonly IngestFlow $ingestFlow,
|
private IngestFlow $ingestFlow,
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user