optimize ingesting documents

This commit is contained in:
team2
2026-02-28 23:19:47 +01:00
parent 509ba83ac0
commit e7a315c147
3 changed files with 223 additions and 19 deletions

View File

@@ -0,0 +1,182 @@
<?php
declare(strict_types=1);
namespace App\Ingest;
/**
 * Adds lightweight Markdown structure to plain text using line-based heuristics.
 *
 * Two passes run after line endings are normalised to "\n":
 *  1. Short, isolated lines that look like titles are prefixed with "## ".
 *  2. Runs of two or more consecutive short lines become "- " bullet items.
 *
 * Lines that already carry a Markdown heading or list marker are left alone so
 * the enhancer never produces output such as "- - item" or "## - Item".
 */
final class StructureEnhancer
{
    /** Longest line, in characters, that may still be treated as a heading. */
    private const MAX_HEADING_LENGTH = 80;

    /** Longest line, in characters, that may still be treated as a list item. */
    private const MAX_LIST_ITEM_LENGTH = 120;

    /** A line whose share of uppercase letters exceeds this value is a heading. */
    private const UPPERCASE_HEADING_RATIO = 0.6;

    /** Minimum number of consecutive candidate lines that form a list. */
    private const MIN_LIST_ITEMS = 2;

    /** Matches a line that already starts with a Markdown heading or bullet marker. */
    private const HEADING_OR_BULLET_PATTERN = '/^(?:#{1,6}\s|[-*+•]\s)/u';

    /** Matches a line that already starts with a heading, bullet or numbered-list marker. */
    private const STRUCTURED_LINE_PATTERN = '/^(?:#{1,6}\s|[-*+•]\s|\d+[.)]\s)/u';

    /**
     * Runs the full enhancement pipeline on the given text.
     *
     * @param string $text Raw text; any mix of "\r\n", "\r" and "\n" line endings is accepted.
     *
     * @return string The text with "\n" line endings and heading/list markers added.
     */
    public function enhance(string $text): string
    {
        if ($text === '') {
            return '';
        }

        $text = $this->normalizeLineEndings($text);
        $text = $this->detectHeadings($text);

        return $this->detectSimpleLists($text);
    }

    /**
     * Converts Windows ("\r\n") and classic Mac ("\r") line endings to "\n".
     */
    private function normalizeLineEndings(string $text): string
    {
        // "\r\n" must be replaced before the lone "\r" so it does not become two newlines.
        return str_replace(["\r\n", "\r"], "\n", $text);
    }

    /**
     * Prefixes every heading-like line with "## "; all other lines pass through unchanged.
     */
    private function detectHeadings(string $text): string
    {
        $lines = explode("\n", $text);
        $out = [];

        foreach ($lines as $index => $line) {
            $trimmed = trim($line);
            $out[] = $this->isHeadingCandidate($trimmed, $lines, $index)
                ? '## ' . $trimmed
                : $line;
        }

        return implode("\n", $out);
    }

    /**
     * Decides whether a trimmed line looks like a standalone heading.
     *
     * @param string       $line  The trimmed line under test.
     * @param list<string> $lines All lines of the document, untrimmed.
     * @param int          $index Position of $line within $lines.
     */
    private function isHeadingCandidate(string $line, array $lines, int $index): bool
    {
        if ($line === '') {
            return false;
        }

        // Already a heading or a bullet: marking it again would corrupt the Markdown.
        if (preg_match(self::HEADING_OR_BULLET_PATTERN, $line) === 1) {
            return false;
        }

        // Count characters, not bytes, so multi-byte text gets the same limit as ASCII.
        if (mb_strlen($line, 'UTF-8') > self::MAX_HEADING_LENGTH) {
            return false;
        }

        // Sentence-like punctuation suggests running prose rather than a title.
        if (str_ends_with($line, '.') || str_contains($line, ',')) {
            return false;
        }

        // Decimal numbers / version-like tokens ("3.14", "1.2") disqualify the line.
        if (preg_match('/\d+\.\d+/', $line) === 1) {
            return false;
        }

        // A heading must be isolated: blank (or missing) lines directly above and below.
        $prev = $lines[$index - 1] ?? '';
        $next = $lines[$index + 1] ?? '';
        if (trim($prev) !== '' || trim($next) !== '') {
            return false;
        }

        return $this->uppercaseRatio($line) > self::UPPERCASE_HEADING_RATIO
            || $this->isTitleCase($line);
    }

    /**
     * Returns the share of uppercase letters among all letters of the line.
     *
     * @return float A value in [0.0, 1.0]; 0.0 when the line has no letters or is not valid UTF-8.
     */
    private function uppercaseRatio(string $line): float
    {
        // preg_match_all() yields false on invalid UTF-8; treat that like "no letters".
        $letters = preg_match_all('/\p{L}/u', $line);
        if ($letters === false || $letters === 0) {
            return 0.0;
        }

        $upper = preg_match_all('/\p{Lu}/u', $line);
        if ($upper === false) {
            return 0.0;
        }

        return $upper / $letters;
    }

    /**
     * Tells whether at least half of the words (rounded down, minimum one)
     * start with an uppercase letter.
     *
     * Words starting with a digit or punctuation do not count as capitalised.
     */
    private function isTitleCase(string $line): bool
    {
        // Repeated spaces produce empty fragments; they are not words.
        $words = array_filter(
            explode(' ', $line),
            static fn (string $word): bool => $word !== '',
        );

        $capitalised = 0;
        foreach ($words as $word) {
            if (preg_match('/^\p{Lu}/u', $word) === 1) {
                $capitalised++;
            }
        }

        return $capitalised >= max(1, intdiv(count($words), 2));
    }

    /**
     * Turns every run of at least two consecutive list-candidate lines into
     * "- " bullet items; everything else is emitted exactly as it came in.
     */
    private function detectSimpleLists(string $text): string
    {
        $out = [];
        $run = []; // pending candidates as [original line, trimmed line] pairs

        foreach (explode("\n", $text) as $line) {
            $trimmed = trim($line);

            if ($this->isListCandidate($trimmed)) {
                $run[] = [$line, $trimmed];
                continue;
            }

            // A non-candidate line ends the current run.
            array_push($out, ...$this->renderRun($run));
            $run = [];
            $out[] = $line;
        }

        // Flush a run that reaches the end of the text.
        array_push($out, ...$this->renderRun($run));

        return implode("\n", $out);
    }

    /**
     * Renders a finished run of candidate lines.
     *
     * @param list<array{0: string, 1: string}> $run Pairs of [original line, trimmed line].
     *
     * @return list<string> Bullet items when the run is long enough to be a list,
     *                      otherwise the original lines with their whitespace intact.
     */
    private function renderRun(array $run): array
    {
        $isList = count($run) >= self::MIN_LIST_ITEMS;

        return array_map(
            static fn (array $entry): string => $isList ? '- ' . $entry[1] : $entry[0],
            $run,
        );
    }

    /**
     * Decides whether a trimmed line may be part of a simple list.
     */
    private function isListCandidate(string $line): bool
    {
        if ($line === '') {
            return false;
        }

        // Headings and lines that already are list items must not get another bullet.
        if (preg_match(self::STRUCTURED_LINE_PATTERN, $line) === 1) {
            return false;
        }

        // Count characters, not bytes, so multi-byte text gets the same limit as ASCII.
        if (mb_strlen($line, 'UTF-8') > self::MAX_LIST_ITEM_LENGTH) {
            return false;
        }

        // Full sentences and "key: value" style lines are not list items.
        return !str_ends_with($line, '.') && !str_contains($line, ':');
    }
}