272 lines
6.9 KiB
PHP
272 lines
6.9 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Ingest;
|
|
|
|
/**
 * StructureEnhancer
 *
 * Adds minimal, deterministic structure hints to plain text BEFORE chunking.
 *
 * Adds:
 * - Heading markers ("## ") for isolated short title lines
 * - Bullet markers ("- ") for obvious list runs
 *
 * Leaves alone:
 * - Lines that already start with a Markdown heading marker ("#" .. "######"),
 *   a bullet marker ("-", "*", "+") or an ordered-list marker ("1.", "1)")
 * - Every line that is not rewritten (its original whitespace is preserved)
 *
 * Non-goals:
 * - No semantic rewriting
 * - No sentence merging
 * - No aggressive list guessing
 */
final class StructureEnhancer
{
    private const MAX_HEADING_LEN = 80;

    private const MAX_LIST_ITEM_LEN = 140;
    private const MIN_LIST_RUN = 2;

    /** Share of uppercase letters from which a line counts as an UPPERCASE heading. */
    private const UPPERCASE_HEADING_RATIO = 0.65;

    /** Word count from which a line is treated as a sentence rather than a list item. */
    private const SENTENCE_MIN_WORDS = 5;

    /** Matches a (trimmed) line that already starts with a Markdown heading marker. */
    private const HEADING_MARKER = '/^#{1,6}\s+/u';

    /** Matches a (trimmed) line that already starts with a Markdown bullet marker. */
    private const BULLET_MARKER = '/^[-*+]\s+/u';

    /** Matches a (trimmed) line that already starts with an ordered-list marker. */
    private const ORDERED_MARKER = '/^\d+[.)]\s+/u';

    /**
     * Adds heading and bullet markers to the given text.
     *
     * Line endings are normalized to "\n" first. An empty string is returned
     * unchanged.
     *
     * @param string $text Raw text to annotate.
     *
     * @return string The text with "## " and "- " markers inserted where detected.
     */
    public function enhance(string $text): string
    {
        if ($text === '') {
            return '';
        }

        $text = $this->normalizeLineEndings($text);

        // Order: headings first, then lists (more stable for chunking).
        $text = $this->detectHeadings($text);

        return $this->detectSimpleLists($text);
    }

    /**
     * Converts "\r\n" and lone "\r" line endings to "\n".
     */
    private function normalizeLineEndings(string $text): string
    {
        return str_replace(["\r\n", "\r"], "\n", $text);
    }

    // =========================================================
    // HEADINGS
    // =========================================================

    /**
     * Prefixes every heading candidate with "## "; all other lines are kept as-is.
     *
     * Heading lines are emitted trimmed, non-heading lines keep their whitespace.
     */
    private function detectHeadings(string $text): string
    {
        $lines = explode("\n", $text);
        $out = [];

        foreach ($lines as $index => $line) {
            $trimmed = trim($line);

            // Neighbours are looked up in the ORIGINAL lines, so a heading
            // inserted earlier in this pass never influences later decisions.
            $out[] = $this->isHeadingCandidate($trimmed, $lines, $index)
                ? '## ' . $trimmed
                : $line;
        }

        return implode("\n", $out);
    }

    /**
     * Decides whether a trimmed line should be marked as a heading.
     *
     * @param string       $line  The trimmed line under test.
     * @param list<string> $lines All lines of the text (untrimmed).
     * @param int          $index Position of $line within $lines.
     */
    private function isHeadingCandidate(string $line, array $lines, int $index): bool
    {
        if ($line === '') {
            return false;
        }

        // Already a Markdown heading or a bullet item? Then leave it alone.
        // Without the bullet check, items of a loose list (separated by blank
        // lines) would be turned into "## - item".
        // Ordered markers are not excluded, so an isolated "1. Introduction"
        // can still become a heading.
        if (preg_match(self::HEADING_MARKER, $line) || preg_match(self::BULLET_MARKER, $line)) {
            return false;
        }

        if (mb_strlen($line) > self::MAX_HEADING_LEN) {
            return false;
        }

        // A heading should not be a "sentence".
        if (preg_match('/[.!?]\s*$/u', $line)) {
            return false;
        }

        // No typical sentence commas (too risky).
        if (str_contains($line, ',')) {
            return false;
        }

        // Do not force numbered chapters such as "1.2" / "2.3.4" into headings.
        if (preg_match('/\b\d+\.\d+(\.\d+)*\b/u', $line)) {
            return false;
        }

        // Must stand "isolated": blank (or missing) line before and after.
        $prev = $lines[$index - 1] ?? '';
        $next = $lines[$index + 1] ?? '';

        if (trim($prev) !== '' || trim($next) !== '') {
            return false;
        }

        // Guardrail: at least one letter.
        if (!preg_match('/\p{L}/u', $line)) {
            return false;
        }

        // The classics: UPPERCASE or Title Case.
        if ($this->uppercaseRatio($line) >= self::UPPERCASE_HEADING_RATIO) {
            return true;
        }

        return $this->isTitleCase($line);
    }

    /**
     * Returns the share of uppercase letters among all letters of the line.
     *
     * @return float A value in [0.0, 1.0]; 0.0 when the line has no letters or
     *               the regex engine reports an error.
     */
    private function uppercaseRatio(string $line): float
    {
        $letterCount = preg_match_all('/\p{L}/u', $line);
        if ($letterCount === false || $letterCount === 0) {
            return 0.0;
        }

        $upperCount = preg_match_all('/\p{Lu}/u', $line);
        if ($upperCount === false) {
            return 0.0;
        }

        return $upperCount / $letterCount;
    }

    /**
     * Returns true when at least half (rounded up) of the words start capitalized.
     *
     * Only words containing a letter are counted. A word is judged by its first
     * LETTER, so leading punctuation or digits ("(note)", "3rd") are not mistaken
     * for a capital. Letters without case compare equal to their uppercase form
     * and therefore count as capitalized.
     */
    private function isTitleCase(string $line): bool
    {
        $words = preg_split('/\s+/u', trim($line));
        if (!$words) {
            return false;
        }

        $wordCount = 0;
        $capCount = 0;

        foreach ($words as $word) {
            // Ignore empty fragments and words without letters.
            if (preg_match('/\p{L}/u', $word, $match) !== 1) {
                continue;
            }

            $wordCount++;

            $firstLetter = $match[0];
            if (mb_strtoupper($firstLetter) === $firstLetter) {
                $capCount++;
            }
        }

        if ($wordCount === 0) {
            return false;
        }

        // At least half of the words start with a capital.
        return $capCount >= max(1, intdiv($wordCount + 1, 2));
    }

    // =========================================================
    // LISTS
    // =========================================================

    /**
     * Prefixes runs of at least MIN_LIST_RUN consecutive list candidates with "- ".
     *
     * Lines that already carry a heading, bullet or ordered-list marker end the
     * current run and are emitted untouched.
     */
    private function detectSimpleLists(string $text): string
    {
        $out = [];
        $buffer = [];

        foreach (explode("\n", $text) as $line) {
            $trimmed = trim($line);

            if (!$this->isAlreadyStructured($trimmed) && $this->isListCandidate($trimmed)) {
                // Keep the ORIGINAL line: if the run turns out too short, it
                // must be written back unchanged, including its indentation.
                $buffer[] = $line;
                continue;
            }

            $this->flushListBuffer($buffer, $out);
            $out[] = $line;
        }

        $this->flushListBuffer($buffer, $out);

        return implode("\n", $out);
    }

    /**
     * Tells whether a trimmed line already starts with a heading, bullet or
     * ordered-list marker and must therefore not receive another "- " prefix.
     */
    private function isAlreadyStructured(string $line): bool
    {
        return preg_match(self::HEADING_MARKER, $line) === 1
            || preg_match(self::BULLET_MARKER, $line) === 1
            || preg_match(self::ORDERED_MARKER, $line) === 1;
    }

    /**
     * Appends the buffered run to $out and empties the buffer.
     *
     * A run of at least MIN_LIST_RUN lines is emitted as trimmed "- " items;
     * a shorter run is emitted exactly as it was read (no guessing).
     *
     * @param list<string> $buffer Buffered original lines; reset to [] on return.
     * @param list<string> $out    Output lines collected so far.
     */
    private function flushListBuffer(array &$buffer, array &$out): void
    {
        if ($buffer === []) {
            return;
        }

        $isListRun = count($buffer) >= self::MIN_LIST_RUN;

        foreach ($buffer as $line) {
            $out[] = $isListRun ? '- ' . trim($line) : $line;
        }

        $buffer = [];
    }

    /**
     * Decides whether a trimmed line looks like a short, keyword-style list item.
     */
    private function isListCandidate(string $line): bool
    {
        if ($line === '') {
            return false;
        }

        // Too long = almost certainly a paragraph/sentence.
        if (mb_strlen($line) > self::MAX_LIST_ITEM_LEN) {
            return false;
        }

        // If it ends like a sentence, it is not a list item.
        if (preg_match('/[.!?]\s*$/u', $line)) {
            return false;
        }

        // "Key: Value" is typically not a list.
        if (str_contains($line, ':')) {
            return false;
        }

        // Minimal guardrail against full sentences: at least five words AND a
        // first character that equals its own uppercase form => rather a
        // sentence/paragraph. Note that digits and punctuation also satisfy
        // this comparison, which keeps the check on the conservative side.
        $words = preg_split('/\s+/u', trim($line));
        if ($words && count($words) >= self::SENTENCE_MIN_WORDS) {
            $first = mb_substr($line, 0, 1);
            if ($first !== '' && mb_strtoupper($first) === $first) {
                return false;
            }
        }

        // Only short, keyword-like lines are accepted as candidates.
        return true;
    }
}