cleanup code
This commit is contained in:
@@ -17,11 +17,6 @@ final class ChunkManager
|
||||
$this->indexPath = rtrim($projectDir, '/') . $relativeIndexPath;
|
||||
}
|
||||
|
||||
public function getIndexPath(): string
|
||||
{
|
||||
return $this->indexPath;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// COUNT (Streaming, robust)
|
||||
// ============================================================
|
||||
|
||||
@@ -6,15 +6,16 @@ declare(strict_types=1);
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
use App\Index\IndexConfigurationProvider;
|
||||
use App\Knowledge\Text\TextNormalizer;
|
||||
|
||||
final class SimpleChunker
|
||||
final readonly class SimpleChunker
|
||||
{
|
||||
private IndexConfigurationProvider $configurationProvider;
|
||||
|
||||
public function __construct(
|
||||
IndexConfigurationProvider $configurationProvider
|
||||
) {
|
||||
$this->configurationProvider = $configurationProvider;
|
||||
private IndexConfigurationProvider $configurationProvider,
|
||||
private TextNormalizer $textNormalizer
|
||||
)
|
||||
{
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
@@ -25,7 +26,7 @@ final class SimpleChunker
|
||||
$maxWords = $config->getChunkSize();
|
||||
$overlapWords = $config->getChunkOverlap();
|
||||
|
||||
$text = $this->normalize($text);
|
||||
$text = $this->textNormalizer->normalize($text);
|
||||
if ($text === '') {
|
||||
return [];
|
||||
}
|
||||
@@ -60,7 +61,7 @@ final class SimpleChunker
|
||||
$wordEnd = min($wordPos + $maxWords, $totalWords);
|
||||
|
||||
$tokenStart = $wordTokenIndexes[$wordPos];
|
||||
$tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
|
||||
$tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
|
||||
|
||||
$tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);
|
||||
|
||||
@@ -84,15 +85,6 @@ final class SimpleChunker
|
||||
return $this->dedupe($chunks);
|
||||
}
|
||||
|
||||
private function normalize(string $text): string
|
||||
{
|
||||
$text = str_replace(["\r\n", "\r"], "\n", $text);
|
||||
$text = preg_replace("/[ \t]+/u", " ", $text);
|
||||
$text = preg_replace("/\n{3,}/u", "\n\n", $text);
|
||||
|
||||
return trim((string) $text);
|
||||
}
|
||||
|
||||
private function adjustCutToBoundary(array $tokens, int $start, int $end): int
|
||||
{
|
||||
$startToken = $tokens[$start] ?? '';
|
||||
@@ -122,7 +114,7 @@ final class SimpleChunker
|
||||
private function dedupe(array $chunks): array
|
||||
{
|
||||
$seen = [];
|
||||
$out = [];
|
||||
$out = [];
|
||||
|
||||
foreach ($chunks as $chunk) {
|
||||
$key = mb_strtolower(
|
||||
|
||||
Reference in New Issue
Block a user