optimize chunk text normalizer
This commit is contained in:
@@ -5,14 +5,16 @@ declare(strict_types=1);
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
use App\Entity\DocumentVersion;
|
||||
use App\Knowledge\Text\TextNormalizer;
|
||||
use App\Repository\DocumentVersionRepository;
|
||||
|
||||
final class KnowledgeIngestService
|
||||
final readonly class KnowledgeIngestService
|
||||
{
|
||||
/**
 * Receives the service's collaborators and stores each one as a
 * promoted private property: the document loader, the chunker, the
 * document-version repository and the text normalizer.
 */
public function __construct(
    private DocumentLoader $loader,
    private SimpleChunker $chunker,
    private DocumentVersionRepository $versionRepo,
    private TextNormalizer $textNormalizer,
) {
}
|
||||
@@ -25,12 +27,10 @@ final class KnowledgeIngestService
|
||||
public function buildChunkRecords(DocumentVersion $version): iterable
|
||||
{
|
||||
$text = $this->loader->load($version->getFilePath());
|
||||
$text = $this->optimizeText($text);
|
||||
|
||||
$chunks = $this->chunker->chunk($text);
|
||||
|
||||
$doc = $version->getDocument();
|
||||
|
||||
$documentId = $doc->getId()->toRfc4122();
|
||||
$versionId = $version->getId()->toRfc4122();
|
||||
|
||||
@@ -41,13 +41,13 @@ final class KnowledgeIngestService
|
||||
foreach ($chunks as $chunkText) {
|
||||
|
||||
if ($title !== '' && !str_starts_with($chunkText, $title)) {
|
||||
$chunkText = "# Produkt Titel: " . $title . "\n\n --- " . $chunkText;
|
||||
$chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
|
||||
}
|
||||
|
||||
$chunkText = trim($chunkText);
|
||||
|
||||
// 🔥 deterministische Chunk-ID
|
||||
$normalizedForId = $this->normalizeForId($chunkText);
|
||||
$normalizedForId = $this->textNormalizer->normalize($chunkText);
|
||||
|
||||
$chunkId = sha1(
|
||||
$documentId . '|' .
|
||||
@@ -77,24 +77,6 @@ final class KnowledgeIngestService
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Tidies raw document text before it is handed to the chunker.
 *
 * Runs of three or more newlines are collapsed into a single blank line,
 * trailing spaces and tabs are stripped from every line, and the result
 * is trimmed at both ends.
 *
 * @param string $text Raw text as returned by the loader.
 *
 * @return string The cleaned-up text.
 *
 * @throws \RuntimeException If PCRE fails (e.g. backtrack limit exceeded).
 */
private function optimizeText(string $text): string
{
    // preg_replace() yields null on a PCRE error; fail loudly with the engine's
    // message instead of letting null reach trim() as a TypeError.
    $collapsed = preg_replace("/\n{3,}/", "\n\n", $text)
        ?? throw new \RuntimeException('Collapsing blank lines failed: ' . preg_last_error_msg());

    // The "m" modifier makes "$" match at every line end, not only at the end of the text.
    $stripped = preg_replace("/[ \t]+$/m", "", $collapsed)
        ?? throw new \RuntimeException('Stripping trailing whitespace failed: ' . preg_last_error_msg());

    return trim($stripped);
}
|
||||
|
||||
/**
 * Normalizes text for a stable ID calculation.
 *
 * Important: the ID must not vary because of minimal whitespace
 * differences. The text is therefore lower-cased, every run of
 * whitespace is replaced by a single space, and the result is trimmed.
 *
 * @param string $text Chunk text to normalize.
 *
 * @return string The normalized text.
 *
 * @throws \RuntimeException If PCRE fails while collapsing whitespace.
 */
private function normalizeForId(string $text): string
{
    // Pin the encoding: the "u" regex below already requires UTF-8, and the
    // result must not depend on the ambient mb_internal_encoding() setting.
    $lowered = mb_strtolower($text, 'UTF-8');

    // preg_replace() yields null on a PCRE error; surface that explicitly
    // rather than passing null on to trim().
    $collapsed = preg_replace('/\s+/u', ' ', $lowered)
        ?? throw new \RuntimeException('Whitespace normalization failed: ' . preg_last_error_msg());

    return trim($collapsed);
}
|
||||
|
||||
/**
|
||||
* @return array<string,mixed>
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user