optimize chunk text normalizer

This commit is contained in:
team2
2026-02-27 15:37:05 +01:00
parent 4761648836
commit a5a6f466f3
3 changed files with 52 additions and 30 deletions

View File

@@ -5,14 +5,16 @@ declare(strict_types=1);
namespace App\Knowledge\Ingest;
use App\Entity\DocumentVersion;
use App\Knowledge\Text\TextNormalizer;
use App\Repository\DocumentVersionRepository;
final class KnowledgeIngestService
final readonly class KnowledgeIngestService
{
/**
 * Receives the service's collaborators as promoted private properties.
 *
 * @param DocumentLoader            $loader         Loads the text of a document version from its file path.
 * @param SimpleChunker             $chunker        Splits the loaded text into chunks.
 * @param DocumentVersionRepository $versionRepo    Repository for document versions.
 * @param TextNormalizer            $textNormalizer Normalizes chunk text before the chunk ID is hashed.
 */
public function __construct(
    private DocumentLoader $loader,
    private SimpleChunker $chunker,
    private DocumentVersionRepository $versionRepo,
    private TextNormalizer $textNormalizer,
) {
}
@@ -25,12 +27,10 @@ final class KnowledgeIngestService
public function buildChunkRecords(DocumentVersion $version): iterable
{
$text = $this->loader->load($version->getFilePath());
$text = $this->optimizeText($text);
$chunks = $this->chunker->chunk($text);
$doc = $version->getDocument();
$documentId = $doc->getId()->toRfc4122();
$versionId = $version->getId()->toRfc4122();
@@ -41,13 +41,13 @@ final class KnowledgeIngestService
foreach ($chunks as $chunkText) {
if ($title !== '' && !str_starts_with($chunkText, $title)) {
$chunkText = "# Produkt Titel: " . $title . "\n\n --- " . $chunkText;
$chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
}
$chunkText = trim($chunkText);
// 🔥 deterministische Chunk-ID
$normalizedForId = $this->normalizeForId($chunkText);
$normalizedForId = $this->textNormalizer->normalize($chunkText);
$chunkId = sha1(
$documentId . '|' .
@@ -77,24 +77,6 @@ final class KnowledgeIngestService
}
}
/**
 * Tidies raw document text before it is chunked.
 *
 * Runs of three or more line feeds are collapsed to exactly two, trailing
 * spaces/tabs are removed from every line, and the result is trimmed.
 * Only "\n" is counted as a line break by the patterns below.
 *
 * @param string $text Raw text to clean up.
 *
 * @return string The cleaned text; if a regex step fails, that step is skipped.
 */
private function optimizeText(string $text): string
{
    // preg_replace() yields null on a PCRE error. Under strict_types that null
    // would blow up in the next call, so keep the previous text instead.
    $collapsed = preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;

    // The /m modifier lets "$" match at the end of each line, not only at the
    // end of the whole string.
    $stripped = preg_replace("/[ \t]+$/m", '', $collapsed) ?? $collapsed;

    return trim($stripped);
}
/**
 * Normalization for stable ID computation.
 *
 * Important: the ID must not vary because of minimal whitespace differences.
 * The text is lowercased, every run of whitespace is collapsed to a single
 * space, and the result is trimmed.
 *
 * @param string $text Chunk text the ID is derived from.
 *
 * @return string The normalized text.
 */
private function normalizeForId(string $text): string
{
    $lowered = mb_strtolower($text);

    // preg_replace() returns null on a PCRE error (the /u modifier rejects
    // malformed UTF-8, for example). Passing that null to trim() would raise a
    // TypeError under strict_types, so retry byte-wise and, as a last resort,
    // keep the lowercased text unchanged.
    $collapsed = preg_replace('/\s+/u', ' ', $lowered)
        ?? preg_replace('/\s+/', ' ', $lowered)
        ?? $lowered;

    return trim($collapsed);
}
/**
* @return array<string,mixed>
*/