diff --git a/composer.json b/composer.json index 6776b6c..34f5c3d 100644 --- a/composer.json +++ b/composer.json @@ -11,6 +11,7 @@ "doctrine/doctrine-bundle": "^2.18", "doctrine/doctrine-migrations-bundle": "^3.7", "doctrine/orm": "^3.6", + "smalot/pdfparser": "^2.12", "symfony/console": "^7.4", "symfony/dotenv": "^7.4", "symfony/flex": "^2", diff --git a/composer.lock b/composer.lock index 1db89f5..9f3384a 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "6eb5735f5c0cf7cb184127323d55e384", + "content-hash": "668be7cf3e1cc193e0f6a09448e72cf5", "packages": [ { "name": "doctrine/collections", @@ -1472,6 +1472,57 @@ }, "time": "2024-09-11T13:17:53+00:00" }, + { + "name": "smalot/pdfparser", + "version": "v2.12.3", + "source": { + "type": "git", + "url": "https://github.com/smalot/pdfparser.git", + "reference": "61c9bcafcb92899b76d8ebda6508267bae77e264" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/smalot/pdfparser/zipball/61c9bcafcb92899b76d8ebda6508267bae77e264", + "reference": "61c9bcafcb92899b76d8ebda6508267bae77e264", + "shasum": "" + }, + "require": { + "ext-iconv": "*", + "ext-zlib": "*", + "php": ">=7.1", + "symfony/polyfill-mbstring": "^1.18" + }, + "type": "library", + "autoload": { + "psr-0": { + "Smalot\\PdfParser\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "LGPL-3.0" + ], + "authors": [ + { + "name": "Sebastien MALOT", + "email": "sebastien@malot.fr" + } + ], + "description": "Pdf parser library. 
Can read and extract information from pdf file.",
+            "homepage": "https://www.pdfparser.org",
+            "keywords": [
+                "extract",
+                "parse",
+                "parser",
+                "pdf",
+                "text"
+            ],
+            "support": {
+                "issues": "https://github.com/smalot/pdfparser/issues",
+                "source": "https://github.com/smalot/pdfparser/tree/v2.12.3"
+            },
+            "time": "2026-01-08T08:04:04+00:00"
+        },
         {
             "name": "symfony/cache",
             "version": "v7.4.3",
@@ -5917,5 +5968,5 @@
         "ext-iconv": "*"
     },
     "platform-dev": {},
-    "plugin-api-version": "2.9.0"
+    "plugin-api-version": "2.6.0"
 }
diff --git a/src/Entity/DocumentVersion.php b/src/Entity/DocumentVersion.php
index fa3a132..880e039 100644
--- a/src/Entity/DocumentVersion.php
+++ b/src/Entity/DocumentVersion.php
@@ -181,4 +181,37 @@ class DocumentVersion
     {
         return $this->isActive;
     }
+
+    //#########################################################
+    // Helper
+    //#########################################################
+    /**
+     * Returns the lower-cased extension of the file path, or '' when the path is empty.
+     */
+    public function getFileExtension(): string
+    {
+        if (!$this->filePath) {
+            return '';
+        }
+
+        return mb_strtolower(pathinfo($this->filePath, PATHINFO_EXTENSION));
+    }
+
+    /**
+     * Returns a display label for the file type.
+     *
+     * Known extensions get a fixed label; any other extension is shown upper-cased
+     * (an empty extension yields an empty label).
+     */
+    public function getFileTypeLabel(): string
+    {
+        $extension = $this->getFileExtension();
+
+        return match ($extension) {
+            'pdf' => 'PDF',
+            'txt' => 'Text',
+            'md' => 'Markdown',
+            default => mb_strtoupper($extension),
+        };
+    }
 }
diff --git a/src/Knowledge/Extractor/DocumentExtractorInterface.php b/src/Knowledge/Extractor/DocumentExtractorInterface.php
new file mode 100644
index 0000000..2b4450f
--- /dev/null
+++ b/src/Knowledge/Extractor/DocumentExtractorInterface.php
@@ -0,0 +1,12 @@
+ $extractors
+     */
+    public function __construct(
+        private iterable $extractors
+    ) {
+    }
+
+    /**
+     * Extracts the text of $path with the first extractor that supports its extension.
+     *
+     * @throws \RuntimeException when no registered extractor supports the extension
+     */
+    public function extract(string $path): string
+    {
+        // Lower-case the extension so "REPORT.PDF" resolves like "report.pdf".
+        // NOTE(review): assumes extractors match on lower-case extensions — confirm.
+        $extension = mb_strtolower(pathinfo($path, PATHINFO_EXTENSION));
+
+        foreach ($this->extractors as $extractor) {
+            if ($extractor->supports($extension)) {
+                return $extractor->extract($path);
+            }
+        }
+
+        throw new \RuntimeException(sprintf(
+            'No extractor available for extension "%s".',
+            $extension
+        ));
+    }
+}
diff --git a/src/Knowledge/Extractor/PdfExtractor.php b/src/Knowledge/Extractor/PdfExtractor.php
new file mode 100644
index 0000000..d3f435e
--- /dev/null
+++ b/src/Knowledge/Extractor/PdfExtractor.php
@@ -0,0 +1,23 @@
+parseFile($path);
+
+        return $pdf->getText();
+    }
+}
diff --git a/src/Knowledge/Ingest/DocumentLoader.php b/src/Knowledge/Ingest/DocumentLoader.php
index de8391b..0f61e42 100644
--- a/src/Knowledge/Ingest/DocumentLoader.php
+++ b/src/Knowledge/Ingest/DocumentLoader.php
@@ -5,6 +5,8 @@ declare(strict_types=1);
 
 namespace App\Knowledge\Ingest;
 
+use Smalot\PdfParser\Parser;
+
 final class DocumentLoader
 {
     public function load(string $path): string
@@ -17,8 +19,10 @@ final class DocumentLoader
 
         return match ($ext) {
             'txt', 'md' => $this->loadText($path),
-            // 'pdf' => $this->loadPdf($path),
-            // 'docx' => $this->loadDocx($path),
+            'pdf' => $this->loadPdf($path),
+
+            // prepared for later:
+            // 'docx' => $this->loadDocx($path),
 
             default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
         };
@@ -30,7 +34,59 @@ final class DocumentLoader
         if ($content === false) {
             throw new \RuntimeException("Could not read file: {$path}");
         }
-        return $content;
+
+        return $this->normalize($content);
     }
 
+    /**
+     * Parses a PDF file and returns its normalized text.
+     *
+     * @throws \RuntimeException when parsing fails (the original error is attached as previous)
+     */
+    private function loadPdf(string $path): string
+    {
+        $parser = new Parser();
+
+        try {
+            $pdf = $parser->parseFile($path);
+            $text = $pdf->getText();
+        } catch (\Throwable $e) {
+            throw new \RuntimeException("Failed to parse PDF: {$path}", 0, $e);
+        }
+
+        return $this->normalize($text);
+    }
+
+    /**
+     * Central text normalizer for all document types.
+     *
+     * Unifies line endings, collapses runs of spaces/tabs, drops spaces before a
+     * newline, re-joins words hyphenated across a line break and limits consecutive
+     * blank lines to one. A regex step that fails (preg_replace() returning null)
+     * leaves the text as it was, so normalization never aborts loading.
+     *
+     * NOTE(review): the same change adds Knowledge/Text/TextNormalizer.php and
+     * Knowledge/Extractor/PdfExtractor.php — presumably overlapping; confirm and delegate.
+     */
+    private function normalize(string $text): string
+    {
+        // Line endings first: every newline-based rule below must also see CRLF/CR input.
+        $text = str_replace(["\r\n", "\r"], "\n", $text);
+
+        // Collapse runs of spaces and tabs into a single space.
+        $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
+
+        // Drop the (at most one) space left before a newline so whitespace-only
+        // lines count as blank and "exam- \nple" can still be re-joined.
+        $text = str_replace(" \n", "\n", $text);
+
+        // Remove end-of-line hyphenation ("exam-\nple" => "example"). The lookbehind
+        // skips hyphen runs such as a Markdown "---" rule or front-matter fence.
+        $text = preg_replace('/(?<!-)-\n/', '', $text) ?? $text;
+
+        // Collapse three or more newlines into exactly one blank line.
+        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;
+
+        return trim($text);
+    }
 }
diff --git
a/src/Knowledge/Text/TextNormalizer.php b/src/Knowledge/Text/TextNormalizer.php new file mode 100644 index 0000000..66289dd --- /dev/null +++ b/src/Knowledge/Text/TextNormalizer.php @@ -0,0 +1,25 @@ +