optimize system and cleanup
This commit is contained in:
@@ -27,11 +27,12 @@ final class DocumentLoader
|
||||
private function loadText(string $path): string
|
||||
{
|
||||
$content = file_get_contents($path);
|
||||
|
||||
if ($content === false) {
|
||||
throw new \RuntimeException("Could not read file: {$path}");
|
||||
}
|
||||
|
||||
return $this->normalize($content);
|
||||
return $this->normalizeLineEndings($content);
|
||||
}
|
||||
|
||||
private function loadPdf(string $path): string
|
||||
@@ -49,120 +50,31 @@ final class DocumentLoader
|
||||
);
|
||||
}
|
||||
|
||||
return $this->normalize($text);
|
||||
return $this->normalizeLineEndings($text);
|
||||
}
|
||||
|
||||
private function normalize(string $text): string
|
||||
/**
|
||||
* Loader ist bewusst minimal.
|
||||
*
|
||||
* KEINE:
|
||||
* - Silbentrennung
|
||||
* - Listen-Reparatur
|
||||
* - Struktur-Merges
|
||||
* - Regex-Orgie
|
||||
*
|
||||
* Nur:
|
||||
* - Zeilenumbrüche vereinheitlichen
|
||||
* - trim
|
||||
*/
|
||||
private function normalizeLineEndings(string $text): string
|
||||
{
|
||||
if ($text === '') {
|
||||
return '';
|
||||
}
|
||||
|
||||
// 1. Silbentrennung entfernen
|
||||
$text = preg_replace('/-\n/', '', $text);
|
||||
|
||||
// 2. Einheitliche Zeilenumbrüche
|
||||
// Einheitliche Zeilenumbrüche
|
||||
$text = str_replace(["\r\n", "\r"], "\n", $text);
|
||||
|
||||
// 3. Symbolmüll entfernen
|
||||
$text = $this->removeUnwantedSymbols($text);
|
||||
|
||||
// 4. Struktur-Reparatur
|
||||
$text = $this->repairStructure($text);
|
||||
|
||||
// 5. Inline-Listen stabilisieren
|
||||
$text = preg_replace('/\s-\s/', "\n- ", $text);
|
||||
|
||||
// 6. Whitespace normalisieren
|
||||
$text = preg_replace('/[ \t]+/', ' ', $text);
|
||||
$text = preg_replace('/\n{3,}/', "\n\n", $text);
|
||||
|
||||
return trim($text);
|
||||
}
|
||||
|
||||
/**
 * Strips characters that carry no meaning for downstream text processing.
 *
 * Three passes are applied in order:
 *  1. the glyphs ©, ®, ™ and ℠ are removed;
 *  2. the zero-width characters U+200B–U+200D and U+FEFF are removed;
 *  3. every remaining character of Unicode category C ("Other": control,
 *     format, unassigned, private use) is removed, except the line feed.
 *     Tabs and carriage returns belong to that category and are removed too.
 *
 * Both regex passes use the /u modifier, so preg_replace() returns null when
 * the input is not valid UTF-8. In that case the affected pass is skipped and
 * the text is carried forward unchanged, instead of handing null to the next
 * call and violating the declared string return type.
 *
 * @param string $text Raw text, expected to be UTF-8.
 *
 * @return string The text without the unwanted symbols.
 */
private function removeUnwantedSymbols(string $text): string
{
    $text = str_replace(['©', '®', '™', '℠'], '', $text);

    // Zero-width space / non-joiner / joiner and the BOM.
    $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text) ?? $text;

    // "[^\P{C}\n]" reads as: category C, but not the line feed.
    $text = preg_replace('/[^\P{C}\n]+/u', '', $text) ?? $text;

    return $text;
}
|
||||
|
||||
/**
 * Consolidated structure repair for line-oriented text.
 *
 * The text is split on "\n" and every line is trimmed. Empty lines are kept
 * as empty strings. For each non-empty line the following rules are tried in
 * order; the first one that applies decides the output for that line:
 *
 *  - Continuation merge: the line is not a "- " list item, does not end in
 *    ".", ":", "?" or "!", and the following line is not a list item and
 *    starts with a digit or with one of a-z/ä/ö/ü. Both lines are joined
 *    with a single space and the following line is consumed.
 *  - Broken list continuation: the line is a list item without closing
 *    punctuation and the following line is a list item starting with a
 *    lowercase letter. The follower is appended without its "- " marker and
 *    consumed.
 *  - Pseudo list item such as "- 808 festlegen" (digits followed by at most
 *    25 ASCII letters/spaces): the "- " marker is dropped.
 *  - Pseudo list item starting with a lowercase letter whose preceding
 *    source line is not a list item (or which is the first line): the "- "
 *    marker is dropped.
 *
 * Any other line is emitted trimmed and otherwise unchanged.
 *
 * @param string $text Text with "\n" line separators.
 *
 * @return string The repaired lines joined with "\n".
 */
private function repairStructure(string $text): string
{
    $isListItem = static fn (string $line): bool => preg_match('/^- /', $line) === 1;
    $endsSentence = static fn (string $line): bool => preg_match('/[\.:\?!]$/', $line) === 1;
    $startsWithDigit = static fn (string $line): bool => preg_match('/^\d+/', $line) === 1;
    $startsLowercase = static fn (string $line): bool => preg_match('/^[a-zäöü]/u', $line) === 1;
    $isLowercaseItem = static fn (string $line): bool => preg_match('/^- [a-zäöü]/u', $line) === 1;

    $source = explode("\n", $text);
    $result = [];
    $skipNext = false;

    foreach ($source as $idx => $raw) {
        // The previous line swallowed this one during a merge.
        if ($skipNext) {
            $skipNext = false;
            continue;
        }

        $line = trim($raw);

        if ($line === '') {
            $result[] = '';
            continue;
        }

        if (array_key_exists($idx + 1, $source)) {
            $follower = trim($source[$idx + 1]);
            $lineIsItem = $isListItem($line);
            $lineIsOpen = !$endsSentence($line);

            // Model numbers / numeric continuation, or a sentence continued
            // on a line that starts in lowercase.
            if (
                !$lineIsItem
                && $lineIsOpen
                && !$isListItem($follower)
                && ($startsWithDigit($follower) || $startsLowercase($follower))
            ) {
                $result[] = $line . ' ' . $follower;
                $skipNext = true;
                continue;
            }

            // A list item wrongly split into two list items.
            if ($lineIsItem && $lineIsOpen && $isLowercaseItem($follower)) {
                $result[] = rtrim($line) . ' ' . ltrim(substr($follower, 2));
                $skipNext = true;
                continue;
            }
        }

        // Pseudo list item like "- 808 festlegen".
        if (preg_match('/^- \d+[A-Za-z ]{0,25}$/', $line) === 1) {
            $result[] = substr($line, 2);
            continue;
        }

        // Pseudo list item like "- im eingeschalteten Zustand ..." that does
        // not follow another list item in the source.
        $followsListItem = $idx !== 0 && $isListItem(trim($source[$idx - 1]));
        if ($isLowercaseItem($line) && !$followsListItem) {
            $result[] = substr($line, 2);
            continue;
        }

        $result[] = $line;
    }

    return implode("\n", $result);
}
|
||||
}
|
||||
@@ -18,10 +18,8 @@ final readonly class KnowledgeIngestService
|
||||
private DocumentVersionRepository $versionRepo,
|
||||
private TextNormalizer $textNormalizer,
|
||||
private DocumentSanitizer $documentSanitizer,
|
||||
private StructureEnhancer $structureEnhancer, // ✅ NEU
|
||||
)
|
||||
{
|
||||
}
|
||||
private StructureEnhancer $structureEnhancer,
|
||||
) {}
|
||||
|
||||
/**
|
||||
* Lokaler Ingest: erzeugt deterministische NDJSON-Records.
|
||||
@@ -34,16 +32,13 @@ final readonly class KnowledgeIngestService
|
||||
$text = $this->loader->load($version->getFilePath());
|
||||
$extension = $version->getFileExtension() ?? 'txt';
|
||||
|
||||
// 2️⃣ Deterministische Textbereinigung
|
||||
$text = $this->documentSanitizer->sanitize(
|
||||
$text,
|
||||
$extension
|
||||
);
|
||||
// 2️⃣ Artefakt-Sanitizing
|
||||
$text = $this->documentSanitizer->sanitize($text, $extension);
|
||||
|
||||
// 3️⃣ 🔥 Deterministische Struktur-Anreicherung (NEU)
|
||||
// 3️⃣ Struktur-Hints (deterministisch, minimal)
|
||||
$text = $this->structureEnhancer->enhance($text);
|
||||
|
||||
// 4️⃣ Chunking
|
||||
// 4️⃣ Chunking (inkl. TextNormalizer)
|
||||
$chunks = $this->chunker->chunk($text);
|
||||
|
||||
$doc = $version->getDocument();
|
||||
@@ -56,13 +51,15 @@ final readonly class KnowledgeIngestService
|
||||
|
||||
foreach ($chunks as $chunkText) {
|
||||
|
||||
if ($title !== '' && !str_starts_with($chunkText, $title)) {
|
||||
// 🔥 Titel nur im ersten Chunk einfügen
|
||||
if ($index === 0 && $title !== '') {
|
||||
$chunkText = "# Produkt Titel: `" . $title . "`\n\n" . $chunkText;
|
||||
}
|
||||
|
||||
$chunkText = trim($chunkText);
|
||||
|
||||
// 🔥 deterministische Chunk-ID
|
||||
// 🔥 Deterministische Chunk-ID
|
||||
// Wichtig: Normalisierung NUR für ID-Bildung
|
||||
$normalizedForId = $this->textNormalizer->normalize($chunkText);
|
||||
|
||||
$chunkId = sha1(
|
||||
@@ -75,11 +72,13 @@ final readonly class KnowledgeIngestService
|
||||
'chunk_id' => $chunkId,
|
||||
'document_id' => $documentId,
|
||||
'version_id' => $versionId,
|
||||
'chunk_index' => $index++,
|
||||
'chunk_index' => $index,
|
||||
'text' => $chunkText,
|
||||
'checksum' => sha1($chunkText),
|
||||
'metadata' => $this->buildMetadata($version),
|
||||
];
|
||||
|
||||
$index++;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,6 +100,7 @@ final readonly class KnowledgeIngestService
|
||||
$doc = $version->getDocument();
|
||||
|
||||
$title = null;
|
||||
|
||||
if (method_exists($doc, 'getTitle')) {
|
||||
$title = $doc->getTitle();
|
||||
} elseif (method_exists($doc, 'getName')) {
|
||||
|
||||
@@ -13,27 +13,22 @@ final readonly class SimpleChunker
|
||||
public function __construct(
|
||||
private IndexConfigurationProvider $configurationProvider,
|
||||
private TextNormalizer $textNormalizer
|
||||
)
|
||||
{
|
||||
}
|
||||
) {}
|
||||
|
||||
/** @return string[] */
|
||||
public function chunk(string $text): array
|
||||
{
|
||||
$config = $this->configurationProvider->getConfiguration();
|
||||
|
||||
$maxWords = $config->getChunkSize();
|
||||
$overlapWords = $config->getChunkOverlap();
|
||||
$maxWords = max(1, $config->getChunkSize());
|
||||
$overlapWords = max(0, $config->getChunkOverlap());
|
||||
|
||||
$text = $this->textNormalizer->normalize($text);
|
||||
if ($text === '') {
|
||||
return [];
|
||||
}
|
||||
|
||||
// ======================================================
|
||||
// HYBRID: Erst Absatzbasiert sammeln
|
||||
// ======================================================
|
||||
|
||||
// Absatzbasierte Vorstruktur
|
||||
$paragraphs = preg_split('/\n{2,}/u', $text);
|
||||
if (!$paragraphs) {
|
||||
return [];
|
||||
@@ -52,7 +47,7 @@ final readonly class SimpleChunker
|
||||
|
||||
$paragraphWordCount = $this->countWords($paragraph);
|
||||
|
||||
// Falls einzelner Absatz größer als maxWords → Fallback
|
||||
// Absatz größer als maxWords → Wort-Fallback
|
||||
if ($paragraphWordCount > $maxWords) {
|
||||
|
||||
if ($currentChunk !== '') {
|
||||
@@ -68,14 +63,14 @@ final readonly class SimpleChunker
|
||||
continue;
|
||||
}
|
||||
|
||||
// Absatz passt noch in aktuellen Chunk
|
||||
// Absatz passt in aktuellen Chunk
|
||||
if ($currentWordCount + $paragraphWordCount <= $maxWords) {
|
||||
$currentChunk .= ($currentChunk === '' ? '' : "\n\n") . $paragraph;
|
||||
$currentWordCount += $paragraphWordCount;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Flush aktueller Chunk
|
||||
// Flush
|
||||
if ($currentChunk !== '') {
|
||||
$chunks[] = trim($currentChunk);
|
||||
}
|
||||
@@ -92,7 +87,7 @@ final readonly class SimpleChunker
|
||||
}
|
||||
|
||||
// ======================================================
|
||||
// Wortbasierter Fallback (Original-Logik beibehalten)
|
||||
// Wortbasierter Fallback
|
||||
// ======================================================
|
||||
|
||||
/** @return string[] */
|
||||
@@ -125,6 +120,7 @@ final readonly class SimpleChunker
|
||||
$wordPos = 0;
|
||||
|
||||
while ($wordPos < $totalWords) {
|
||||
|
||||
$wordEnd = min($wordPos + $maxWords, $totalWords);
|
||||
|
||||
$tokenStart = $wordTokenIndexes[$wordPos];
|
||||
@@ -154,11 +150,13 @@ final readonly class SimpleChunker
|
||||
|
||||
private function adjustCutToBoundary(array $tokens, int $start, int $end): int
|
||||
{
|
||||
// Schutz für Listenanfänge
|
||||
$startToken = $tokens[$start] ?? '';
|
||||
if (preg_match('/^- /u', ltrim($startToken))) {
|
||||
if (preg_match('/^\s*-\s+/u', $startToken)) {
|
||||
return $end;
|
||||
}
|
||||
|
||||
// Rückwärts prüfen auf Absatz- oder Satzende
|
||||
for ($i = $end - 1; $i > $start; $i--) {
|
||||
|
||||
if ($tokens[$i] === "\n\n") {
|
||||
@@ -190,9 +188,13 @@ final readonly class SimpleChunker
|
||||
$out = [];
|
||||
|
||||
foreach ($chunks as $chunk) {
|
||||
$key = mb_strtolower(
|
||||
preg_replace('/\s+/u', ' ', trim($chunk))
|
||||
);
|
||||
|
||||
$normalized = preg_replace('/\s+/u', ' ', trim($chunk));
|
||||
if ($normalized === null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$key = mb_strtolower($normalized);
|
||||
|
||||
if (isset($seen[$key])) {
|
||||
continue;
|
||||
|
||||
@@ -1,48 +0,0 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Retrieval;
|
||||
|
||||
use Psr\Cache\CacheItemPoolInterface;
|
||||
use Psr\Cache\InvalidArgumentException;
|
||||
|
||||
/**
 * Caching decorator around another retriever.
 *
 * Results of the wrapped retriever are stored in a PSR-6 cache pool under a
 * key derived from the normalised prompt and the limit, and are reused for
 * $ttlSeconds seconds.
 */
final readonly class CachedRetriever implements RetrieverInterface
{
    /**
     * @param RetrieverInterface     $inner      Retriever whose results are cached.
     * @param CacheItemPoolInterface $cache      PSR-6 pool that stores the results.
     * @param int                    $ttlSeconds Lifetime of a cache entry in seconds.
     */
    public function __construct(
        private RetrieverInterface $inner,
        private CacheItemPoolInterface $cache,
        private int $ttlSeconds
    ) {
    }

    /**
     * Returns the cached result for the prompt/limit pair, or asks the inner
     * retriever and stores its answer.
     *
     * @param string $prompt User prompt to retrieve context for.
     * @param int    $limit  Limit passed through to the inner retriever.
     *
     * @return array Whatever the inner retriever returned for this request.
     *
     * @throws InvalidArgumentException
     */
    public function retrieve(string $prompt, int $limit = 10): array
    {
        $item = $this->cache->getItem($this->buildCacheKey($prompt, $limit));

        if ($item->isHit()) {
            $cached = $item->get();

            // CacheItemInterface::get() is untyped. Anything that is not an
            // array is treated as a miss and overwritten below, rather than
            // being returned from an array-typed method.
            if (is_array($cached)) {
                return $cached;
            }
        }

        $result = $this->inner->retrieve($prompt, $limit);

        $item->set($result);
        $item->expiresAfter($this->ttlSeconds);
        $this->cache->save($item);

        return $result;
    }

    /**
     * Builds the cache key from the lower-cased, whitespace-collapsed prompt
     * and the limit.
     */
    private function buildCacheKey(string $prompt, int $limit): string
    {
        $normalized = mb_strtolower(trim($prompt));

        // With /u, preg_replace() returns null for malformed UTF-8. Without the
        // fallback every such prompt would hash to the same key per limit and
        // unrelated prompts would share one cache entry.
        $normalized = preg_replace('/\s+/u', ' ', $normalized) ?? $normalized;

        return 'rag_retrieval_' . sha1($normalized . '|' . $limit);
    }
}
|
||||
@@ -57,6 +57,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return [$result['catalogBlock']];
|
||||
}
|
||||
|
||||
if ($result['selectedChunkIds'] === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return $this->collectTextsFromIds(
|
||||
$result['selectedChunkIds'],
|
||||
$result['rows']
|
||||
@@ -84,10 +88,15 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
]];
|
||||
}
|
||||
|
||||
if ($result['selectedChunkIds'] === []) {
|
||||
return [];
|
||||
}
|
||||
|
||||
$out = [];
|
||||
$rank = 0;
|
||||
|
||||
foreach ($result['selectedChunkIds'] as $chunkId) {
|
||||
|
||||
if (!isset($result['rows'][$chunkId])) {
|
||||
continue;
|
||||
}
|
||||
@@ -127,6 +136,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$route = $this->routeResolver->resolve($salesIntent, $entityLabel);
|
||||
|
||||
if ($route === IntentRouteResolver::ROUTE_CATALOG_LIST && $entityLabel !== null) {
|
||||
|
||||
$catalogBlock = $this->entityCatalogService->listByTerm($entityLabel);
|
||||
|
||||
if ($catalogBlock !== null) {
|
||||
@@ -147,6 +157,21 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$core = $this->runCore($prompt, $config, $withScores, $salesIntent);
|
||||
|
||||
if ($core['ranked_chunk_ids'] === [] || $core['rows'] === []) {
|
||||
return [
|
||||
'route' => $route,
|
||||
'entityLabel' => $entityLabel,
|
||||
'intent' => $salesIntent,
|
||||
'isListQuery' => $core['is_list_query'],
|
||||
'selectedChunkIds' => [],
|
||||
'rows' => [],
|
||||
'rrfScores' => [],
|
||||
'rawScores' => [],
|
||||
'threshold' => $core['threshold'],
|
||||
'catalogBlock' => null,
|
||||
];
|
||||
}
|
||||
|
||||
$selectedChunkIds = $core['is_list_query']
|
||||
? $this->selectListChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit'])
|
||||
: $this->selectSalesChunkIds($core['ranked_chunk_ids'], $core['rows'], $core['limit']);
|
||||
@@ -182,8 +207,17 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$isListQuery = $this->intentLite->isListQuery($prompt);
|
||||
|
||||
$cleanQuery = $this->queryCleaner->clean($prompt);
|
||||
|
||||
if ($cleanQuery === '') {
|
||||
$cleanQuery = $prompt;
|
||||
return [
|
||||
'limit' => $limit,
|
||||
'is_list_query' => $isListQuery,
|
||||
'threshold' => self::VECTOR_SCORE_THRESHOLD,
|
||||
'ranked_chunk_ids' => [],
|
||||
'rows' => [],
|
||||
'rrf_scores' => [],
|
||||
'raw_scores' => [],
|
||||
];
|
||||
}
|
||||
|
||||
[$threshold, $topK] = $this->computeThresholdAndTopK(
|
||||
@@ -200,10 +234,22 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$globalHits = $this->vectorClient->search($cleanQuery, $topK);
|
||||
|
||||
$scopedHits = [];
|
||||
if (!empty($candidateDocIds)) {
|
||||
if ($candidateDocIds !== []) {
|
||||
$scopedHits = $this->vectorClient->searchScoped($cleanQuery, $topK, $candidateDocIds);
|
||||
}
|
||||
|
||||
if ($globalHits === [] && $scopedHits === []) {
|
||||
return [
|
||||
'limit' => $limit,
|
||||
'is_list_query' => $isListQuery,
|
||||
'threshold' => $threshold,
|
||||
'ranked_chunk_ids' => [],
|
||||
'rows' => [],
|
||||
'rrf_scores' => [],
|
||||
'raw_scores' => [],
|
||||
];
|
||||
}
|
||||
|
||||
$fused = $this->fuseHits(
|
||||
$globalHits,
|
||||
$scopedHits,
|
||||
@@ -216,11 +262,25 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$rawScores = $fused['raw_scores'];
|
||||
|
||||
if ($rrfScores === [] && $globalHits !== []) {
|
||||
$rrfScores = $this->fallbackRrfFromHits($globalHits, self::EMPTY_RRF_FALLBACK_TOPN);
|
||||
$rrfScores = $this->fallbackRrfFromHits(
|
||||
$globalHits,
|
||||
self::EMPTY_RRF_FALLBACK_TOPN
|
||||
);
|
||||
}
|
||||
|
||||
if ($rrfScores === []) {
|
||||
return [
|
||||
'limit' => $limit,
|
||||
'is_list_query' => $isListQuery,
|
||||
'threshold' => $threshold,
|
||||
'ranked_chunk_ids' => [],
|
||||
'rows' => [],
|
||||
'rrf_scores' => [],
|
||||
'raw_scores' => $rawScores,
|
||||
];
|
||||
}
|
||||
|
||||
arsort($rrfScores);
|
||||
|
||||
$rankedChunkIds = array_keys($rrfScores);
|
||||
$rows = $this->lookup->findByChunkIds($rankedChunkIds);
|
||||
|
||||
@@ -254,13 +314,19 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return (string)($data['intent'] ?? SalesIntentLite::DISCOVERY);
|
||||
}
|
||||
|
||||
private function computeThresholdAndTopK(string $salesIntent, bool $isListQuery, int $vectorTopKBase): array
|
||||
{
|
||||
private function computeThresholdAndTopK(
|
||||
string $salesIntent,
|
||||
bool $isListQuery,
|
||||
int $vectorTopKBase
|
||||
): array {
|
||||
|
||||
$threshold = self::VECTOR_SCORE_THRESHOLD;
|
||||
$topK = $vectorTopKBase;
|
||||
|
||||
if ($salesIntent === SalesIntentLite::OBJECTION ||
|
||||
$salesIntent === SalesIntentLite::PRICING) {
|
||||
if (
|
||||
$salesIntent === SalesIntentLite::OBJECTION ||
|
||||
$salesIntent === SalesIntentLite::PRICING
|
||||
) {
|
||||
$threshold += 0.02;
|
||||
}
|
||||
|
||||
@@ -333,6 +399,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$rank = 0;
|
||||
|
||||
foreach ($hits as $hit) {
|
||||
|
||||
if (!isset($hit['chunk_id'])) {
|
||||
continue;
|
||||
}
|
||||
@@ -354,6 +421,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$out = [];
|
||||
|
||||
foreach ($chunkIds as $id) {
|
||||
|
||||
if (!isset($rows[$id]['text'])) {
|
||||
continue;
|
||||
}
|
||||
@@ -433,11 +501,13 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$out = [];
|
||||
|
||||
foreach ($chunkIds as $id) {
|
||||
|
||||
if (!isset($rows[$id]['text'])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$text = trim((string)$rows[$id]['text']);
|
||||
|
||||
if ($text !== '') {
|
||||
$out[] = $text;
|
||||
}
|
||||
|
||||
@@ -13,7 +13,15 @@ final class TextNormalizer
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// 1. Encoding-Artefakte & Sonderzeichen
|
||||
// 1. Unicode-Normalisierung (wichtig für Stabilität)
|
||||
// -------------------------------------------------
|
||||
|
||||
if (class_exists(\Normalizer::class)) {
|
||||
$text = \Normalizer::normalize($text, \Normalizer::FORM_C) ?? $text;
|
||||
}
|
||||
|
||||
// -------------------------------------------------
|
||||
// 2. Encoding-Artefakte & Sonderzeichen
|
||||
// -------------------------------------------------
|
||||
|
||||
// Word/PDF Bullet-Artefakte (häufiges Problemzeichen)
|
||||
@@ -26,38 +34,49 @@ final class TextNormalizer
|
||||
$text
|
||||
);
|
||||
|
||||
// Private-Use-Area entfernen
|
||||
$text = preg_replace('/[\x{E000}-\x{F8FF}]/u', '', $text);
|
||||
|
||||
// Non-breaking space → normales Leerzeichen
|
||||
$text = str_replace("\xC2\xA0", ' ', $text);
|
||||
|
||||
// Zero-width characters entfernen
|
||||
$text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}]/u', '', $text);
|
||||
|
||||
// -------------------------------------------------
|
||||
// 2. Zeilenumbrüche vereinheitlichen
|
||||
// -------------------------------------------------
|
||||
|
||||
$text = str_replace("\r\n", "\n", $text);
|
||||
$text = str_replace("\r", "\n", $text);
|
||||
// Geschützte Leerzeichen & ähnliche Varianten vereinheitlichen
|
||||
$text = str_replace(
|
||||
[
|
||||
"\xC2\xA0", // NBSP
|
||||
"\xE2\x80\xAF", // Narrow NBSP
|
||||
"\xE2\x80\x89", // Thin space
|
||||
],
|
||||
' ',
|
||||
$text
|
||||
);
|
||||
|
||||
// -------------------------------------------------
|
||||
// 3. Silbentrennung über Zeilen entfernen
|
||||
// 3. Zeilenumbrüche vereinheitlichen
|
||||
// -------------------------------------------------
|
||||
|
||||
$text = str_replace(["\r\n", "\r"], "\n", $text);
|
||||
|
||||
// -------------------------------------------------
|
||||
// 4. Silbentrennung über Zeilen entfernen
|
||||
//
|
||||
// Beispiel:
|
||||
// Testo-
|
||||
// mat → Testomat
|
||||
//
|
||||
// Nur wenn direkt Buchstabe folgt
|
||||
// -------------------------------------------------
|
||||
|
||||
$text = preg_replace('/-\n(\p{L})/u', '$1', $text);
|
||||
|
||||
// -------------------------------------------------
|
||||
// 4. Whitespace normalisieren
|
||||
// 5. Whitespace normalisieren
|
||||
// -------------------------------------------------
|
||||
|
||||
// Mehrfache Leerzeichen reduzieren
|
||||
$text = preg_replace('/[ \t]+/u', ' ', $text);
|
||||
|
||||
// Mehrfache Leerzeilen reduzieren
|
||||
// Mehr als 2 Leerzeilen reduzieren
|
||||
$text = preg_replace('/\n{3,}/u', "\n\n", $text);
|
||||
|
||||
return trim($text);
|
||||
|
||||
Reference in New Issue
Block a user