Add PDF reader module

This commit is contained in:
team2
2026-02-12 20:57:54 +01:00
parent 14d7f3b2b9
commit a625468a9a
9 changed files with 229 additions and 6 deletions

View File

@@ -0,0 +1,12 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Extractor;
/**
 * Contract for components that turn a document file into plain text.
 *
 * ExtractorResolver queries implementations through supports() and calls
 * extract() on the first one that answers true.
 */
interface DocumentExtractorInterface
{
/**
 * Reports whether this extractor handles files with the given extension.
 *
 * @param string $extension File extension; ExtractorResolver passes the value of
 *                          pathinfo($path, PATHINFO_EXTENSION) without changing its case.
 */
public function supports(string $extension): bool;
/**
 * Returns the text extracted from the document at the given path.
 *
 * @param string $path Path of the document to read.
 */
public function extract(string $path): string;
}

View File

@@ -0,0 +1,32 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Extractor;
/**
 * Dispatches text extraction to the first registered extractor that accepts
 * a file's extension.
 */
final class ExtractorResolver
{
    /**
     * @param iterable<DocumentExtractorInterface> $extractors Candidates, consulted in iteration order.
     */
    public function __construct(private iterable $extractors)
    {
    }

    /**
     * Extracts the text of the file at $path with the first matching extractor.
     *
     * The extension is read with pathinfo() and handed to each extractor unchanged.
     *
     * @param string $path Path of the document to extract.
     *
     * @return string Whatever the selected extractor returns.
     *
     * @throws \RuntimeException When no extractor reports support for the extension.
     */
    public function extract(string $path): string
    {
        $fileExtension = pathinfo($path, PATHINFO_EXTENSION);

        foreach ($this->extractors as $candidate) {
            // Skip extractors that do not claim this extension; the first claimant wins.
            if (!$candidate->supports($fileExtension)) {
                continue;
            }

            return $candidate->extract($path);
        }

        $message = sprintf('No extractor available for extension "%s".', $fileExtension);

        throw new \RuntimeException($message);
    }
}

View File

@@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Extractor;
use Smalot\PdfParser\Parser;
/**
 * Extracts plain text from PDF files using the smalot/pdfparser Parser.
 */
final class PdfExtractor implements DocumentExtractorInterface
{
    /**
     * Accepts the "pdf" extension, compared case-insensitively.
     *
     * @param string $extension File extension to test.
     */
    public function supports(string $extension): bool
    {
        return strtolower($extension) === 'pdf';
    }

    /**
     * Parses the PDF at $path and returns the text reported by the parser.
     *
     * @param string $path Path of the PDF file.
     *
     * @return string Text of the parsed document, as returned by getText().
     *
     * @throws \RuntimeException When parsing or text extraction fails; the
     *                           original failure is attached as the previous exception.
     */
    public function extract(string $path): string
    {
        $parser = new Parser();

        try {
            return $parser->parseFile($path)->getText();
        } catch (\Throwable $e) {
            // Report parser failures the same way DocumentLoader::loadPdf does:
            // one exception type that names the offending file and keeps the cause.
            throw new \RuntimeException("Failed to parse PDF: {$path}", 0, $e);
        }
    }
}

View File

@@ -5,6 +5,8 @@ declare(strict_types=1);
namespace App\Knowledge\Ingest;
use Smalot\PdfParser\Parser;
final class DocumentLoader
{
public function load(string $path): string
@@ -17,8 +19,10 @@ final class DocumentLoader
return match ($ext) {
'txt', 'md' => $this->loadText($path),
// 'pdf' => $this->loadPdf($path),
// 'docx' => $this->loadDocx($path),
'pdf' => $this->loadPdf($path),
// vorbereitet für später:
// 'docx' => $this->loadDocx($path),
default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
};
@@ -30,7 +34,41 @@ final class DocumentLoader
if ($content === false) {
throw new \RuntimeException("Could not read file: {$path}");
}
return $content;
return $this->normalize($content);
}
/**
 * Reads a PDF through the smalot/pdfparser Parser and returns its normalized text.
 *
 * @param string $path Path of the PDF file.
 *
 * @throws \RuntimeException If parsing or text extraction throws; the cause
 *                           is kept as the previous exception.
 */
private function loadPdf(string $path): string
{
    $pdfParser = new Parser();

    try {
        $rawText = $pdfParser->parseFile($path)->getText();
    } catch (\Throwable $failure) {
        throw new \RuntimeException("Failed to parse PDF: {$path}", previous: $failure);
    }

    // Normalization stays outside the try block so its failures are not reported as parse errors.
    return $this->normalize($rawText);
}
/**
 * Central text normalizer used by the document loaders of this class.
 *
 * Steps, in order: unify CRLF line endings to LF, re-join words hyphenated
 * across a line break, collapse runs of spaces/tabs into one space, reduce
 * three or more consecutive newlines to two, and trim the result.
 *
 * @param string $text Raw document text.
 *
 * @return string Normalized text.
 *
 * @throws \RuntimeException If one of the regex replacements fails.
 */
private function normalize(string $text): string
{
    // Unify Windows line endings first; otherwise "-\r\n" slips past the hyphenation rule below.
    $text = str_replace("\r\n", "\n", $text);

    $rules = [
        '/-\n/' => '',        // re-join words hyphenated across a line break
        '/[ \t]+/' => ' ',    // collapse runs of spaces and tabs
        '/\n{3,}/' => "\n\n", // collapse runs of blank lines
    ];

    foreach ($rules as $pattern => $replacement) {
        $replaced = preg_replace($pattern, $replacement, $text);

        if ($replaced === null) {
            // preg_replace() signals failure with null, which must not reach the next step.
            throw new \RuntimeException(sprintf(
                'Text normalization failed for pattern %s (PCRE error code %d).',
                $pattern,
                preg_last_error()
            ));
        }

        $text = $replaced;
    }

    return trim($text);
}
}

View File

@@ -0,0 +1,25 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Text;
/**
 * Cleans up raw document text so that later processing sees uniform whitespace.
 */
final class TextNormalizer
{
    /**
     * Normalizes line endings, hyphenation and whitespace of the given text.
     *
     * Steps, in order: unify CRLF line endings to LF, re-join words hyphenated
     * across a line break, collapse runs of spaces/tabs into one space, reduce
     * three or more consecutive newlines to two, and trim the result.
     *
     * @param string $text Raw document text.
     *
     * @return string Normalized text.
     *
     * @throws \RuntimeException If one of the regex replacements fails.
     */
    public function normalize(string $text): string
    {
        // Unify Windows line endings first; otherwise "-\r\n" slips past the hyphenation rule below.
        $text = str_replace("\r\n", "\n", $text);

        $rules = [
            '/-\n/' => '',        // re-join words hyphenated across a line break
            '/[ \t]+/' => ' ',    // collapse runs of spaces and tabs
            '/\n{3,}/' => "\n\n", // collapse runs of blank lines
        ];

        foreach ($rules as $pattern => $replacement) {
            $replaced = preg_replace($pattern, $replacement, $text);

            if ($replaced === null) {
                // preg_replace() signals failure with null, which must not reach the next step.
                throw new \RuntimeException(sprintf(
                    'Text normalization failed for pattern %s (PCRE error code %d).',
                    $pattern,
                    preg_last_error()
                ));
            }

            $text = $replaced;
        }

        return trim($text);
    }
}