Add PDF reader module
This commit is contained in:
12
src/Knowledge/Extractor/DocumentExtractorInterface.php
Normal file
12
src/Knowledge/Extractor/DocumentExtractorInterface.php
Normal file
@@ -0,0 +1,12 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Extractor;
|
||||
|
||||
/**
 * Contract for components that turn a document file into plain text.
 *
 * Implementations announce which file extensions they handle via supports()
 * and perform the actual conversion in extract().
 */
interface DocumentExtractorInterface
{
    /**
     * Tells whether this extractor is responsible for the given file extension.
     *
     * @param string $extension File extension to test (ExtractorResolver passes
     *                          the value of pathinfo(..., PATHINFO_EXTENSION)).
     *
     * @return bool True when extract() may be called for files of that type.
     */
    public function supports(string $extension): bool;

    /**
     * Reads the document located at the given path and returns its text.
     *
     * @param string $path Location of the document to read.
     *
     * @return string The extracted text.
     */
    public function extract(string $path): string;
}
|
||||
32
src/Knowledge/Extractor/ExtractorResolver.php
Normal file
32
src/Knowledge/Extractor/ExtractorResolver.php
Normal file
@@ -0,0 +1,32 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Extractor;
|
||||
|
||||
/**
 * Picks the first registered extractor that supports a file's extension and
 * delegates the text extraction to it.
 */
final class ExtractorResolver
{
    /**
     * @param iterable<DocumentExtractorInterface> $extractors Candidates, consulted in iteration order.
     */
    public function __construct(
        private iterable $extractors,
    ) {
    }

    /**
     * Extracts the text of the file at $path using the first matching extractor.
     *
     * @param string $path Path of the document; its extension selects the extractor.
     *
     * @return string Whatever the selected extractor returns.
     *
     * @throws \RuntimeException When no registered extractor supports the extension.
     */
    public function extract(string $path): string
    {
        $fileExtension = pathinfo($path, PATHINFO_EXTENSION);

        foreach ($this->extractors as $candidate) {
            if (!$candidate->supports($fileExtension)) {
                continue;
            }

            // First match wins; later candidates are never consulted.
            return $candidate->extract($path);
        }

        $message = sprintf('No extractor available for extension "%s".', $fileExtension);

        throw new \RuntimeException($message);
    }
}
|
||||
23
src/Knowledge/Extractor/PdfExtractor.php
Normal file
23
src/Knowledge/Extractor/PdfExtractor.php
Normal file
@@ -0,0 +1,23 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Extractor;
|
||||
|
||||
use Smalot\PdfParser\Parser;
|
||||
|
||||
/**
 * Extracts the text of PDF files via the Smalot PDF parser.
 */
final class PdfExtractor implements DocumentExtractorInterface
{
    /**
     * Accepts the "pdf" extension, compared case-insensitively.
     *
     * @param string $extension File extension to test.
     *
     * @return bool True only for "pdf" in any letter case.
     */
    public function supports(string $extension): bool
    {
        return strtolower($extension) === 'pdf';
    }

    /**
     * Parses the PDF at $path and returns its text.
     *
     * @param string $path Path of the PDF file handed to the parser.
     *
     * @return string The text reported by the parsed document.
     *
     * @throws \RuntimeException When the parser fails; the original failure is
     *                           attached as the previous exception.
     */
    public function extract(string $path): string
    {
        $parser = new Parser();

        try {
            return $parser->parseFile($path)->getText();
        } catch (\Throwable $e) {
            // Same wrapping and message as DocumentLoader::loadPdf, so callers see
            // one consistent exception type that names the offending file.
            throw new \RuntimeException("Failed to parse PDF: {$path}", 0, $e);
        }
    }
}
|
||||
@@ -5,6 +5,8 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
use Smalot\PdfParser\Parser;
|
||||
|
||||
final class DocumentLoader
|
||||
{
|
||||
public function load(string $path): string
|
||||
@@ -17,8 +19,10 @@ final class DocumentLoader
|
||||
|
||||
return match ($ext) {
|
||||
'txt', 'md' => $this->loadText($path),
|
||||
// 'pdf' => $this->loadPdf($path),
|
||||
// 'docx' => $this->loadDocx($path),
|
||||
'pdf' => $this->loadPdf($path),
|
||||
|
||||
// vorbereitet für später:
|
||||
// 'docx' => $this->loadDocx($path),
|
||||
|
||||
default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
|
||||
};
|
||||
@@ -30,7 +34,41 @@ final class DocumentLoader
|
||||
if ($content === false) {
|
||||
throw new \RuntimeException("Could not read file: {$path}");
|
||||
}
|
||||
return $content;
|
||||
|
||||
return $this->normalize($content);
|
||||
}
|
||||
|
||||
/**
 * Loads a PDF file and returns its normalized text.
 *
 * @param string $path Path of the PDF file handed to the parser.
 *
 * @return string Text of the document after passing through normalize().
 *
 * @throws \RuntimeException When parsing or text retrieval fails; the original
 *                           failure is attached as the previous exception.
 */
private function loadPdf(string $path): string
{
    // Constructed outside the try block: only parsing failures get wrapped.
    $pdfParser = new Parser();

    try {
        $rawText = $pdfParser->parseFile($path)->getText();
    } catch (\Throwable $parseFailure) {
        throw new \RuntimeException("Failed to parse PDF: {$path}", 0, $parseFailure);
    }

    return $this->normalize($rawText);
}
|
||||
|
||||
/**
 * Central text normalizer shared by all document types.
 *
 * Steps, in order: unify line endings to "\n", join words hyphenated across a
 * line break, collapse runs of spaces/tabs into one space, reduce three or
 * more consecutive newlines to a single blank line, and trim the result.
 *
 * NOTE(review): this duplicates App\Knowledge\Text\TextNormalizer::normalize;
 * consider delegating to that class to keep a single implementation.
 *
 * @param string $text Raw text as read from a document.
 *
 * @return string The cleaned-up text.
 *
 * @throws \RuntimeException When the regex engine reports an error.
 */
private function normalize(string $text): string
{
    // Line endings must be unified first; otherwise a hyphen followed by
    // "\r\n" would not match the hyphenation pattern below.
    $text = str_replace(["\r\n", "\r"], "\n", $text);

    // Patterns are applied sequentially in array order. The first one joins
    // every "-\n", which also affects genuine hyphens at a line end.
    $normalized = preg_replace(
        ['/-\n/', '/[ \t]+/', '/\n{3,}/'],
        ['', ' ', "\n\n"],
        $text,
    );

    // preg_replace() signals engine failures with null rather than throwing.
    if ($normalized === null) {
        throw new \RuntimeException('Text normalization failed: ' . preg_last_error_msg());
    }

    return trim($normalized);
}
|
||||
}
|
||||
|
||||
25
src/Knowledge/Text/TextNormalizer.php
Normal file
25
src/Knowledge/Text/TextNormalizer.php
Normal file
@@ -0,0 +1,25 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Text;
|
||||
|
||||
/**
 * Cleans up raw document text so later processing sees a uniform format.
 */
final class TextNormalizer
{
    /**
     * Normalizes line endings, hyphenation and whitespace of the given text.
     *
     * Steps, in order: unify line endings to "\n", join words hyphenated across
     * a line break, collapse runs of spaces/tabs into one space, reduce three or
     * more consecutive newlines to a single blank line, and trim the result.
     *
     * @param string $text Raw text to clean up.
     *
     * @return string The normalized text.
     *
     * @throws \RuntimeException When the regex engine reports an error.
     */
    public function normalize(string $text): string
    {
        // Line endings must be unified first; otherwise a hyphen followed by
        // "\r\n" would not match the hyphenation pattern below.
        $text = str_replace(["\r\n", "\r"], "\n", $text);

        // Patterns are applied sequentially in array order. The first one joins
        // every "-\n", which also affects genuine hyphens at a line end.
        $normalized = preg_replace(
            ['/-\n/', '/[ \t]+/', '/\n{3,}/'],
            ['', ' ', "\n\n"],
            $text,
        );

        // preg_replace() signals engine failures with null rather than throwing.
        if ($normalized === null) {
            throw new \RuntimeException('Text normalization failed: ' . preg_last_error_msg());
        }

        return trim($normalized);
    }
}
|
||||
Reference in New Issue
Block a user