optimize system and cleanup

This commit is contained in:
team2
2026-03-02 21:27:20 +01:00
parent 6b8d1b1936
commit e7047cd885
10 changed files with 459 additions and 346 deletions

View File

@@ -4,8 +4,27 @@ declare(strict_types=1);
namespace App\Ingest;
/**
* StructureEnhancer
*
* Minimal, deterministic structure hints BEFORE chunking.
*
* Adds:
* - Heading markers ("## ") for isolated short title lines
* - Bullet markers ("- ") for obvious list runs
*
* Non-goals:
* - No semantic rewriting
* - No sentence merging
* - No aggressive list guessing
*/
final class StructureEnhancer
{
private const MAX_HEADING_LEN = 80;
private const MAX_LIST_ITEM_LEN = 140;
private const MIN_LIST_RUN = 2;
public function enhance(string $text): string
{
if ($text === '') {
@@ -13,6 +32,8 @@ final class StructureEnhancer
}
$text = $this->normalizeLineEndings($text);
// Reihenfolge: erst Headings, dann Listen (stabiler fürs Chunking)
$text = $this->detectHeadings($text);
$text = $this->detectSimpleLists($text);
@@ -24,6 +45,10 @@ final class StructureEnhancer
return str_replace(["\r\n", "\r"], "\n", $text);
}
// =========================================================
// HEADINGS
// =========================================================
private function detectHeadings(string $text): string
{
$lines = explode("\n", $text);
@@ -52,22 +77,31 @@ final class StructureEnhancer
return false;
}
if (strlen($line) > 80) {
// Schon Markdown-Heading? Dann nicht anfassen.
if (preg_match('/^#{1,6}\s+/u', $line)) {
return false;
}
if (str_ends_with($line, '.')) {
if (mb_strlen($line) > self::MAX_HEADING_LEN) {
return false;
}
// Heading soll kein "Satz" sein
if (preg_match('/[.!?]\s*$/u', $line)) {
return false;
}
// Keine typischen Satz-Kommas (zu risky)
if (str_contains($line, ',')) {
return false;
}
if (preg_match('/\d+\.\d+/', $line)) {
// Nummerierte Kapitel "1.2" / "2.3.4" nicht zwangs-heading-en
if (preg_match('/\b\d+\.\d+(\.\d+)*\b/u', $line)) {
return false;
}
// Muss "isoliert" stehen (leerzeile davor und danach)
$prev = $lines[$index - 1] ?? '';
$next = $lines[$index + 1] ?? '';
@@ -75,48 +109,81 @@ final class StructureEnhancer
return false;
}
// Guardrail: mindestens ein Buchstabe
if (!preg_match('/\p{L}/u', $line)) {
return false;
}
// Klassiker: UPPERCASE oder Title Case
$uppercaseRatio = $this->uppercaseRatio($line);
if ($uppercaseRatio > 0.6) {
if ($uppercaseRatio >= 0.65) {
return true;
}
if ($this->isTitleCase($line)) {
return true;
}
return false;
return $this->isTitleCase($line);
}
/**
 * Compute the share of uppercase letters among all letters of a line.
 *
 * Non-letter characters (digits, punctuation, whitespace) are ignored.
 * Letters are matched with Unicode properties, so accented and non-Latin
 * letters are counted as well.
 *
 * @param string $line The line to inspect.
 *
 * @return float Ratio between 0.0 and 1.0; 0.0 when the line contains no
 *               letters or when a regex call fails.
 */
private function uppercaseRatio(string $line): float
{
    // Keep only letters (\p{L}); preg_replace() returns null on a PCRE
    // error such as invalid UTF-8 input.
    $letters = preg_replace('/[^\p{L}]/u', '', $line);
    if ($letters === null || $letters === '') {
        return 0.0;
    }

    // From those letters keep only the uppercase ones (\p{Lu}).
    $upper = preg_replace('/[^\p{Lu}]/u', '', $letters);
    if ($upper === null) {
        return 0.0;
    }

    // Defensive guard against a division by zero.
    $lettersLen = mb_strlen($letters);
    if ($lettersLen === 0) {
        return 0.0;
    }

    return mb_strlen($upper) / $lettersLen;
}
/**
 * Decide whether a line looks like Title Case.
 *
 * The line is split on whitespace. Tokens without any letter (numbers,
 * dashes, symbols) are ignored. A token counts as capitalized when its
 * first character is unchanged by mb_strtoupper(); note that this also
 * holds for first characters that have no case mapping (digits,
 * punctuation, caseless scripts).
 *
 * @param string $line The line to inspect.
 *
 * @return bool True when at least half of the letter-bearing words
 *              (rounded up, minimum one) are capitalized; false when the
 *              line has no such words.
 */
private function isTitleCase(string $line): bool
{
    // preg_split() returns false on a PCRE error; treat that like "no words".
    $words = preg_split('/\s+/u', trim($line));
    if (!$words) {
        return false;
    }

    $wordCount = 0;
    $capCount = 0;

    foreach ($words as $word) {
        $word = trim($word);
        if ($word === '') {
            continue;
        }

        // Ignore tokens that contain no letter at all.
        if (!preg_match('/\p{L}/u', $word)) {
            continue;
        }

        $wordCount++;

        $first = mb_substr($word, 0, 1);
        if ($first !== '' && mb_strtoupper($first) === $first) {
            $capCount++;
        }
    }

    if ($wordCount === 0) {
        return false;
    }

    // At least half of the words (rounded up) must start capitalized.
    return $capCount >= max(1, intdiv($wordCount + 1, 2));
}
// =========================================================
// LISTS
// =========================================================
private function detectSimpleLists(string $text): string
{
$lines = explode("\n", $text);
@@ -127,36 +194,45 @@ final class StructureEnhancer
foreach ($lines as $line) {
$trim = trim($line);
// Bereits echte Liste? → nicht anfassen
if (preg_match('/^-\s+/u', $trim) || preg_match('/^\d+\.\s+/u', $trim)) {
$this->flushListBuffer($buffer, $out);
$out[] = $line;
continue;
}
if ($this->isListCandidate($trim)) {
$buffer[] = $trim;
continue;
}
if (count($buffer) >= 2) {
foreach ($buffer as $item) {
$out[] = '- ' . $item;
}
} else {
foreach ($buffer as $item) {
$out[] = $item;
}
}
$buffer = [];
$this->flushListBuffer($buffer, $out);
$out[] = $line;
}
if (count($buffer) >= 2) {
$this->flushListBuffer($buffer, $out);
return implode("\n", $out);
}
/**
 * Move buffered list candidates into the output and reset the buffer.
 *
 * A run of at least self::MIN_LIST_RUN candidates is emitted as bullet
 * items ("- " prefix). A shorter run is emitted unchanged, so a single
 * line is never guessed to be a list.
 *
 * @param array $buffer Pending candidate lines; emptied by this call.
 * @param array $out    Output lines; the buffered lines are appended.
 */
private function flushListBuffer(array &$buffer, array &$out): void
{
    if ($buffer === []) {
        return;
    }

    if (count($buffer) >= self::MIN_LIST_RUN) {
        foreach ($buffer as $item) {
            $out[] = '- ' . $item;
        }
    } else {
        // Too short to be a list run: keep the lines as they are.
        foreach ($buffer as $item) {
            $out[] = $item;
        }
    }

    $buffer = [];
}
private function isListCandidate(string $line): bool
@@ -165,18 +241,32 @@ final class StructureEnhancer
return false;
}
if (strlen($line) > 120) {
// zu lang = ziemlich sicher Absatz/Satz
if (mb_strlen($line) > self::MAX_LIST_ITEM_LEN) {
return false;
}
if (str_ends_with($line, '.')) {
// wenn es wie ein Satz endet, nicht als Liste
if (preg_match('/[.!?]\s*$/u', $line)) {
return false;
}
// "Key: Value" ist typischerweise keine Liste
if (str_contains($line, ':')) {
return false;
}
// Wenn es ein kompletter Satz sein könnte (Verb/Artikel), nicht raten:
// -> minimaler Guardrail: beginnt mit Großbuchstabe UND enthält mindestens 5 Wörter => eher Satz/Absatz
$words = preg_split('/\s+/u', trim($line));
if ($words && count($words) >= 5) {
$first = mb_substr($line, 0, 1);
if ($first !== '' && mb_strtoupper($first) === $first) {
return false;
}
}
// nur "kurze, stichpunktartige" Zeilen als Kandidat akzeptieren
return true;
}
}