optimize chunk text normalizer

This commit is contained in:
team2
2026-02-27 15:37:05 +01:00
parent 4761648836
commit a5a6f466f3
3 changed files with 52 additions and 30 deletions

View File

@@ -5,14 +5,16 @@ declare(strict_types=1);
namespace App\Knowledge\Ingest; namespace App\Knowledge\Ingest;
use App\Entity\DocumentVersion; use App\Entity\DocumentVersion;
use App\Knowledge\Text\TextNormalizer;
use App\Repository\DocumentVersionRepository; use App\Repository\DocumentVersionRepository;
final class KnowledgeIngestService final readonly class KnowledgeIngestService
{ {
public function __construct( public function __construct(
private DocumentLoader $loader, private DocumentLoader $loader,
private SimpleChunker $chunker, private SimpleChunker $chunker,
private DocumentVersionRepository $versionRepo, private DocumentVersionRepository $versionRepo,
private TextNormalizer $textNormalizer
) )
{ {
} }
@@ -25,12 +27,10 @@ final class KnowledgeIngestService
public function buildChunkRecords(DocumentVersion $version): iterable public function buildChunkRecords(DocumentVersion $version): iterable
{ {
$text = $this->loader->load($version->getFilePath()); $text = $this->loader->load($version->getFilePath());
$text = $this->optimizeText($text);
$chunks = $this->chunker->chunk($text); $chunks = $this->chunker->chunk($text);
$doc = $version->getDocument(); $doc = $version->getDocument();
$documentId = $doc->getId()->toRfc4122(); $documentId = $doc->getId()->toRfc4122();
$versionId = $version->getId()->toRfc4122(); $versionId = $version->getId()->toRfc4122();
@@ -41,13 +41,13 @@ final class KnowledgeIngestService
foreach ($chunks as $chunkText) { foreach ($chunks as $chunkText) {
if ($title !== '' && !str_starts_with($chunkText, $title)) { if ($title !== '' && !str_starts_with($chunkText, $title)) {
$chunkText = "# Produkt Titel: " . $title . "\n\n --- " . $chunkText; $chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
} }
$chunkText = trim($chunkText); $chunkText = trim($chunkText);
// 🔥 deterministische Chunk-ID // 🔥 deterministische Chunk-ID
$normalizedForId = $this->normalizeForId($chunkText); $normalizedForId = $this->textNormalizer->normalize($chunkText);
$chunkId = sha1( $chunkId = sha1(
$documentId . '|' . $documentId . '|' .
@@ -77,24 +77,6 @@ final class KnowledgeIngestService
} }
} }
private function optimizeText(string $text): string
{
$text = preg_replace("/\n{3,}/", "\n\n", $text);
$text = preg_replace("/[ \t]+$/m", "", $text);
return trim($text);
}
/**
* Normalisierung für stabile ID-Berechnung.
* Wichtig: ID darf nicht durch Whitespace minimal variieren.
*/
private function normalizeForId(string $text): string
{
$text = mb_strtolower($text);
$text = preg_replace('/\s+/u', ' ', $text);
return trim($text);
}
/** /**
* @return array<string,mixed> * @return array<string,mixed>
*/ */

View File

@@ -23,7 +23,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
* Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter). * Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter).
* Enterprise Default: klein halten, sonst dominieren Tags wieder. * Enterprise Default: klein halten, sonst dominieren Tags wieder.
*/ */
private const TAG_SCORE_BONUS = 0.25; private const TAG_SCORE_BONUS = 0.5;
public function __construct( public function __construct(
private readonly NdjsonChunkLookup $lookup, private readonly NdjsonChunkLookup $lookup,

View File

@@ -8,17 +8,57 @@ final class TextNormalizer
{ {
public function normalize(string $text): string public function normalize(string $text): string
{ {
// Silbentrennungen entfernen if ($text === '') {
$text = preg_replace('/-\n/', '', $text); return '';
}
// -------------------------------------------------
// 1. Encoding-Artefakte & Sonderzeichen
// -------------------------------------------------
// Word/PDF Bullet-Artefakte (häufiges Problemzeichen)
$text = str_replace('', '-', $text);
// Unicode Bullets vereinheitlichen → "-"
$text = preg_replace(
'/[\x{2022}\x{25CF}\x{2219}\x{2023}\x{2043}]/u',
'-',
$text
);
$text = preg_replace('/[\x{E000}-\x{F8FF}]/u', '', $text);
// Non-breaking space → normales Leerzeichen
$text = str_replace("\xC2\xA0", ' ', $text);
// Zero-width characters entfernen
$text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text);
// -------------------------------------------------
// 2. Zeilenumbrüche vereinheitlichen
// -------------------------------------------------
// Windows-Zeilenumbrüche vereinheitlichen
$text = str_replace("\r\n", "\n", $text); $text = str_replace("\r\n", "\n", $text);
$text = str_replace("\r", "\n", $text);
// -------------------------------------------------
// 3. Silbentrennung über Zeilen entfernen
// Beispiel:
// Testo-
// mat → Testomat
// -------------------------------------------------
$text = preg_replace('/-\n(\p{L})/u', '$1', $text);
// -------------------------------------------------
// 4. Whitespace normalisieren
// -------------------------------------------------
// Mehrfache Leerzeichen reduzieren // Mehrfache Leerzeichen reduzieren
$text = preg_replace('/[ \t]+/', ' ', $text); $text = preg_replace('/[ \t]+/u', ' ', $text);
// Mehrfache Leerzeilen reduzieren // Mehrfache Leerzeilen reduzieren
$text = preg_replace('/\n{3,}/', "\n\n", $text); $text = preg_replace('/\n{3,}/u', "\n\n", $text);
return trim($text); return trim($text);
} }