optimize chunk text normalizer
This commit is contained in:
@@ -5,14 +5,16 @@ declare(strict_types=1);
|
|||||||
namespace App\Knowledge\Ingest;
|
namespace App\Knowledge\Ingest;
|
||||||
|
|
||||||
use App\Entity\DocumentVersion;
|
use App\Entity\DocumentVersion;
|
||||||
|
use App\Knowledge\Text\TextNormalizer;
|
||||||
use App\Repository\DocumentVersionRepository;
|
use App\Repository\DocumentVersionRepository;
|
||||||
|
|
||||||
final class KnowledgeIngestService
|
final readonly class KnowledgeIngestService
|
||||||
{
|
{
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private DocumentLoader $loader,
|
private DocumentLoader $loader,
|
||||||
private SimpleChunker $chunker,
|
private SimpleChunker $chunker,
|
||||||
private DocumentVersionRepository $versionRepo,
|
private DocumentVersionRepository $versionRepo,
|
||||||
|
private TextNormalizer $textNormalizer
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@@ -25,12 +27,10 @@ final class KnowledgeIngestService
|
|||||||
public function buildChunkRecords(DocumentVersion $version): iterable
|
public function buildChunkRecords(DocumentVersion $version): iterable
|
||||||
{
|
{
|
||||||
$text = $this->loader->load($version->getFilePath());
|
$text = $this->loader->load($version->getFilePath());
|
||||||
$text = $this->optimizeText($text);
|
|
||||||
|
|
||||||
$chunks = $this->chunker->chunk($text);
|
$chunks = $this->chunker->chunk($text);
|
||||||
|
|
||||||
$doc = $version->getDocument();
|
$doc = $version->getDocument();
|
||||||
|
|
||||||
$documentId = $doc->getId()->toRfc4122();
|
$documentId = $doc->getId()->toRfc4122();
|
||||||
$versionId = $version->getId()->toRfc4122();
|
$versionId = $version->getId()->toRfc4122();
|
||||||
|
|
||||||
@@ -41,13 +41,13 @@ final class KnowledgeIngestService
|
|||||||
foreach ($chunks as $chunkText) {
|
foreach ($chunks as $chunkText) {
|
||||||
|
|
||||||
if ($title !== '' && !str_starts_with($chunkText, $title)) {
|
if ($title !== '' && !str_starts_with($chunkText, $title)) {
|
||||||
$chunkText = "# Produkt Titel: " . $title . "\n\n --- " . $chunkText;
|
$chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
|
||||||
}
|
}
|
||||||
|
|
||||||
$chunkText = trim($chunkText);
|
$chunkText = trim($chunkText);
|
||||||
|
|
||||||
// 🔥 deterministische Chunk-ID
|
// 🔥 deterministische Chunk-ID
|
||||||
$normalizedForId = $this->normalizeForId($chunkText);
|
$normalizedForId = $this->textNormalizer->normalize($chunkText);
|
||||||
|
|
||||||
$chunkId = sha1(
|
$chunkId = sha1(
|
||||||
$documentId . '|' .
|
$documentId . '|' .
|
||||||
@@ -77,24 +77,6 @@ final class KnowledgeIngestService
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private function optimizeText(string $text): string
|
|
||||||
{
|
|
||||||
$text = preg_replace("/\n{3,}/", "\n\n", $text);
|
|
||||||
$text = preg_replace("/[ \t]+$/m", "", $text);
|
|
||||||
return trim($text);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Normalisierung für stabile ID-Berechnung.
|
|
||||||
* Wichtig: ID darf nicht durch Whitespace minimal variieren.
|
|
||||||
*/
|
|
||||||
private function normalizeForId(string $text): string
|
|
||||||
{
|
|
||||||
$text = mb_strtolower($text);
|
|
||||||
$text = preg_replace('/\s+/u', ' ', $text);
|
|
||||||
return trim($text);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return array<string,mixed>
|
* @return array<string,mixed>
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
* Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter).
|
* Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter).
|
||||||
* Enterprise Default: klein halten, sonst dominieren Tags wieder.
|
* Enterprise Default: klein halten, sonst dominieren Tags wieder.
|
||||||
*/
|
*/
|
||||||
private const TAG_SCORE_BONUS = 0.25;
|
private const TAG_SCORE_BONUS = 0.5;
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly NdjsonChunkLookup $lookup,
|
private readonly NdjsonChunkLookup $lookup,
|
||||||
|
|||||||
@@ -8,17 +8,57 @@ final class TextNormalizer
|
|||||||
{
|
{
|
||||||
public function normalize(string $text): string
|
public function normalize(string $text): string
|
||||||
{
|
{
|
||||||
// Silbentrennungen entfernen
|
if ($text === '') {
|
||||||
$text = preg_replace('/-\n/', '', $text);
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------
|
||||||
|
// 1. Encoding-Artefakte & Sonderzeichen
|
||||||
|
// -------------------------------------------------
|
||||||
|
|
||||||
|
// Word/PDF Bullet-Artefakte (häufiges Problemzeichen)
|
||||||
|
$text = str_replace("\u{F0B7}", '-', $text);
|
||||||
|
|
||||||
|
// Unicode Bullets vereinheitlichen → "-"
|
||||||
|
$text = preg_replace(
|
||||||
|
'/[\x{2022}\x{25CF}\x{2219}\x{2023}\x{2043}]/u',
|
||||||
|
'-',
|
||||||
|
$text
|
||||||
|
);
|
||||||
|
|
||||||
|
$text = preg_replace('/[\x{E000}-\x{F8FF}]/u', '', $text);
|
||||||
|
|
||||||
|
// Non-breaking space → normales Leerzeichen
|
||||||
|
$text = str_replace("\xC2\xA0", ' ', $text);
|
||||||
|
|
||||||
|
// Zero-width characters entfernen
|
||||||
|
$text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text);
|
||||||
|
|
||||||
|
// -------------------------------------------------
|
||||||
|
// 2. Zeilenumbrüche vereinheitlichen
|
||||||
|
// -------------------------------------------------
|
||||||
|
|
||||||
// Windows-Zeilenumbrüche vereinheitlichen
|
|
||||||
$text = str_replace("\r\n", "\n", $text);
|
$text = str_replace("\r\n", "\n", $text);
|
||||||
|
$text = str_replace("\r", "\n", $text);
|
||||||
|
|
||||||
|
// -------------------------------------------------
|
||||||
|
// 3. Silbentrennung über Zeilen entfernen
|
||||||
|
// Beispiel:
|
||||||
|
// Testo-
|
||||||
|
// mat → Testomat
|
||||||
|
// -------------------------------------------------
|
||||||
|
|
||||||
|
$text = preg_replace('/-\n(\p{L})/u', '$1', $text);
|
||||||
|
|
||||||
|
// -------------------------------------------------
|
||||||
|
// 4. Whitespace normalisieren
|
||||||
|
// -------------------------------------------------
|
||||||
|
|
||||||
// Mehrfache Leerzeichen reduzieren
|
// Mehrfache Leerzeichen reduzieren
|
||||||
$text = preg_replace('/[ \t]+/', ' ', $text);
|
$text = preg_replace('/[ \t]+/u', ' ', $text);
|
||||||
|
|
||||||
// Mehrfache Leerzeilen reduzieren
|
// Mehrfache Leerzeilen reduzieren
|
||||||
$text = preg_replace('/\n{3,}/', "\n\n", $text);
|
$text = preg_replace('/\n{3,}/u', "\n\n", $text);
|
||||||
|
|
||||||
return trim($text);
|
return trim($text);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user