new ingest and profile settings
This commit is contained in:
@@ -5,22 +5,31 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
use App\Index\IndexConfigurationProvider;
|
||||
|
||||
final class SimpleChunker
|
||||
{
|
||||
private IndexConfigurationProvider $configurationProvider;
|
||||
|
||||
public function __construct(
|
||||
private int $maxWords = 180,
|
||||
private int $overlapWords = 30
|
||||
) {}
|
||||
IndexConfigurationProvider $configurationProvider
|
||||
) {
|
||||
$this->configurationProvider = $configurationProvider;
|
||||
}
|
||||
|
||||
/** @return string[] */
|
||||
public function chunk(string $text): array
|
||||
{
|
||||
$config = $this->configurationProvider->getConfiguration();
|
||||
|
||||
$maxWords = $config->getChunkSize();
|
||||
$overlapWords = $config->getChunkOverlap();
|
||||
|
||||
$text = $this->normalize($text);
|
||||
if ($text === '') {
|
||||
return [];
|
||||
}
|
||||
|
||||
// Split into tokens: words + whitespace preserved
|
||||
$tokens = preg_split(
|
||||
'/(\s+)/u',
|
||||
$text,
|
||||
@@ -32,7 +41,6 @@ final class SimpleChunker
|
||||
return [];
|
||||
}
|
||||
|
||||
// Build word index → token index mapping
|
||||
$wordTokenIndexes = [];
|
||||
foreach ($tokens as $i => $token) {
|
||||
if (!preg_match('/^\s+$/u', $token)) {
|
||||
@@ -49,12 +57,11 @@ final class SimpleChunker
|
||||
$wordPos = 0;
|
||||
|
||||
while ($wordPos < $totalWords) {
|
||||
$wordEnd = min($wordPos + $this->maxWords, $totalWords);
|
||||
$wordEnd = min($wordPos + $maxWords, $totalWords);
|
||||
|
||||
$tokenStart = $wordTokenIndexes[$wordPos];
|
||||
$tokenEnd = $wordTokenIndexes[$wordEnd - 1] + 1;
|
||||
|
||||
// Intelligent cut (sentence / paragraph aware)
|
||||
$tokenEnd = $this->adjustCutToBoundary($tokens, $tokenStart, $tokenEnd);
|
||||
|
||||
$chunk = trim(implode('', array_slice(
|
||||
@@ -71,7 +78,7 @@ final class SimpleChunker
|
||||
break;
|
||||
}
|
||||
|
||||
$wordPos = max(0, $wordEnd - $this->overlapWords);
|
||||
$wordPos = max(0, $wordEnd - $overlapWords);
|
||||
}
|
||||
|
||||
return $this->dedupe($chunks);
|
||||
@@ -86,30 +93,19 @@ final class SimpleChunker
|
||||
return trim((string) $text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Move cut backwards to a natural boundary if possible.
|
||||
* Rules:
|
||||
* - Never cut inside markdown list items
|
||||
* - Sentence end only if followed by a line break
|
||||
* - Paragraph breaks always allowed
|
||||
*/
|
||||
private function adjustCutToBoundary(array $tokens, int $start, int $end): int
|
||||
{
|
||||
// Detect markdown list context (e.g. "- Foo: Bar")
|
||||
$startToken = $tokens[$start] ?? '';
|
||||
if (preg_match('/^- /u', ltrim($startToken))) {
|
||||
// Keep list blocks intact
|
||||
return $end;
|
||||
}
|
||||
|
||||
for ($i = $end - 1; $i > $start; $i--) {
|
||||
|
||||
// Paragraph boundary
|
||||
if ($tokens[$i] === "\n\n") {
|
||||
return $i + 1;
|
||||
}
|
||||
|
||||
// Sentence boundary only if followed by newline
|
||||
if (
|
||||
preg_match('/[.!?]\s*$/u', $tokens[$i]) &&
|
||||
isset($tokens[$i + 1]) &&
|
||||
|
||||
Reference in New Issue
Block a user