Add PDF reader module

This commit is contained in:
team2
2026-02-12 20:57:54 +01:00
parent 14d7f3b2b9
commit a625468a9a
9 changed files with 229 additions and 6 deletions

View File

@@ -11,6 +11,7 @@
"doctrine/doctrine-bundle": "^2.18", "doctrine/doctrine-bundle": "^2.18",
"doctrine/doctrine-migrations-bundle": "^3.7", "doctrine/doctrine-migrations-bundle": "^3.7",
"doctrine/orm": "^3.6", "doctrine/orm": "^3.6",
"smalot/pdfparser": "^2.12",
"symfony/console": "^7.4", "symfony/console": "^7.4",
"symfony/dotenv": "^7.4", "symfony/dotenv": "^7.4",
"symfony/flex": "^2", "symfony/flex": "^2",

55
composer.lock generated
View File

@@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "6eb5735f5c0cf7cb184127323d55e384", "content-hash": "668be7cf3e1cc193e0f6a09448e72cf5",
"packages": [ "packages": [
{ {
"name": "doctrine/collections", "name": "doctrine/collections",
@@ -1472,6 +1472,57 @@
}, },
"time": "2024-09-11T13:17:53+00:00" "time": "2024-09-11T13:17:53+00:00"
}, },
{
"name": "smalot/pdfparser",
"version": "v2.12.3",
"source": {
"type": "git",
"url": "https://github.com/smalot/pdfparser.git",
"reference": "61c9bcafcb92899b76d8ebda6508267bae77e264"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/smalot/pdfparser/zipball/61c9bcafcb92899b76d8ebda6508267bae77e264",
"reference": "61c9bcafcb92899b76d8ebda6508267bae77e264",
"shasum": ""
},
"require": {
"ext-iconv": "*",
"ext-zlib": "*",
"php": ">=7.1",
"symfony/polyfill-mbstring": "^1.18"
},
"type": "library",
"autoload": {
"psr-0": {
"Smalot\\PdfParser\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"LGPL-3.0"
],
"authors": [
{
"name": "Sebastien MALOT",
"email": "sebastien@malot.fr"
}
],
"description": "Pdf parser library. Can read and extract information from pdf file.",
"homepage": "https://www.pdfparser.org",
"keywords": [
"extract",
"parse",
"parser",
"pdf",
"text"
],
"support": {
"issues": "https://github.com/smalot/pdfparser/issues",
"source": "https://github.com/smalot/pdfparser/tree/v2.12.3"
},
"time": "2026-01-08T08:04:04+00:00"
},
{ {
"name": "symfony/cache", "name": "symfony/cache",
"version": "v7.4.3", "version": "v7.4.3",
@@ -5917,5 +5968,5 @@
"ext-iconv": "*" "ext-iconv": "*"
}, },
"platform-dev": {}, "platform-dev": {},
"plugin-api-version": "2.9.0" "plugin-api-version": "2.6.0"
} }

View File

@@ -181,4 +181,26 @@ class DocumentVersion
{ {
return $this->isActive; return $this->isActive;
} }
//#########################################################
// Helper
//#########################################################
/**
 * Returns the lower-cased extension of the stored file path.
 *
 * @return string Extension without the leading dot, or '' when the file
 *                path is falsy or has no extension.
 */
public function getFileExtension(): string
{
    $storedPath = $this->filePath;

    // Nothing to inspect when no path has been set.
    if (!$storedPath) {
        return '';
    }

    $extension = pathinfo($storedPath, PATHINFO_EXTENSION);

    return mb_strtolower($extension);
}
/**
 * Returns a display label for the file type.
 *
 * Known extensions get a fixed label; any other extension is returned
 * upper-cased (which yields '' when there is no extension).
 */
public function getFileTypeLabel(): string
{
    $extension = $this->getFileExtension();

    // Fixed labels for the extensions that have a friendly name.
    $knownLabels = [
        'pdf' => 'PDF',
        'txt' => 'Text',
        'md' => 'Markdown',
    ];

    return $knownLabels[$extension] ?? strtoupper($extension);
}
} }

View File

@@ -0,0 +1,12 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Extractor;
/**
 * Contract for components that turn a document file into plain text.
 */
interface DocumentExtractorInterface
{
/**
 * Reports whether this extractor handles files with the given extension.
 *
 * @param string $extension File extension without the leading dot. Implementations
 *                          should compare case-insensitively (PdfExtractor lower-cases
 *                          the value before comparing).
 */
public function supports(string $extension): bool;
/**
 * Extracts the text content of the file at the given path.
 *
 * @param string $path Path of the file to read.
 *
 * @return string The extracted text.
 */
public function extract(string $path): string;
}

View File

@@ -0,0 +1,32 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Extractor;
/**
 * Picks the first registered extractor that supports a file's extension and
 * delegates the text extraction to it.
 */
final class ExtractorResolver
{
    /**
     * @param iterable<DocumentExtractorInterface> $extractors Candidates, asked in iteration order.
     */
    public function __construct(
        private readonly iterable $extractors,
    ) {
    }

    /**
     * Extracts the text of the file at $path using the first matching extractor.
     *
     * @param string $path Path of the file to extract text from.
     *
     * @return string Whatever the matching extractor returns for the file.
     *
     * @throws \RuntimeException When no extractor supports the file's extension.
     */
    public function extract(string $path): string
    {
        $rawExtension = pathinfo($path, PATHINFO_EXTENSION);

        // File extensions are case-insensitive in practice ("REPORT.PDF").
        // Lower-casing once here means a match no longer depends on every
        // extractor remembering to normalise the value itself.
        $extension = strtolower($rawExtension);

        foreach ($this->extractors as $extractor) {
            if ($extractor->supports($extension)) {
                return $extractor->extract($path);
            }
        }

        // Report the extension exactly as it appears in the path.
        throw new \RuntimeException(sprintf(
            'No extractor available for extension "%s".',
            $rawExtension,
        ));
    }
}

View File

@@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Extractor;
use Smalot\PdfParser\Parser;
/**
 * Extractor for PDF files, backed by the smalot/pdfparser library.
 */
final class PdfExtractor implements DocumentExtractorInterface
{
    /**
     * Accepts the "pdf" extension in any letter case.
     */
    public function supports(string $extension): bool
    {
        return 'pdf' === strtolower($extension);
    }

    /**
     * Parses the PDF at $path and returns its text.
     *
     * Errors raised by the parser are not caught here.
     */
    public function extract(string $path): string
    {
        return (new Parser())
            ->parseFile($path)
            ->getText();
    }
}

View File

@@ -5,6 +5,8 @@ declare(strict_types=1);
namespace App\Knowledge\Ingest; namespace App\Knowledge\Ingest;
use Smalot\PdfParser\Parser;
final class DocumentLoader final class DocumentLoader
{ {
public function load(string $path): string public function load(string $path): string
@@ -17,7 +19,9 @@ final class DocumentLoader
return match ($ext) { return match ($ext) {
'txt', 'md' => $this->loadText($path), 'txt', 'md' => $this->loadText($path),
// 'pdf' => $this->loadPdf($path), 'pdf' => $this->loadPdf($path),
// vorbereitet für später:
// 'docx' => $this->loadDocx($path), // 'docx' => $this->loadDocx($path),
default => throw new \RuntimeException("Unsupported file type: .{$ext}"), default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
@@ -30,7 +34,41 @@ final class DocumentLoader
if ($content === false) { if ($content === false) {
throw new \RuntimeException("Could not read file: {$path}"); throw new \RuntimeException("Could not read file: {$path}");
} }
return $content;
return $this->normalize($content);
} }
/**
 * Reads the text of a PDF file and normalises it.
 *
 * @throws \RuntimeException When parsing or text extraction fails; the
 *                           original error is attached as the previous exception.
 */
private function loadPdf(string $path): string
{
    $pdfParser = new Parser();

    try {
        $rawText = $pdfParser->parseFile($path)->getText();
    } catch (\Throwable $failure) {
        throw new \RuntimeException("Failed to parse PDF: {$path}", previous: $failure);
    }

    // Normalisation stays outside the try block so its errors are not
    // reported as PDF parse failures.
    return $this->normalize($rawText);
}
/**
 * Central text normaliser for all document types.
 *
 * Steps, in order: unify line endings, re-join words hyphenated across a
 * line break, collapse runs of spaces/tabs, collapse runs of blank lines,
 * then trim the result.
 *
 * NOTE(review): App\Knowledge\Text\TextNormalizer::normalize() implements the
 * same pipeline; consider delegating to it so the two cannot drift apart.
 *
 * @throws \RuntimeException When PCRE fails to process the text.
 */
private function normalize(string $text): string
{
    // Applied sequentially, each step working on the previous step's output.
    $steps = [
        // Line endings first (Windows "\r\n" and lone "\r"), so the
        // hyphenation step below also matches "-\r\n".
        '/\r\n?/' => "\n",
        // Remove hyphenation at line breaks.
        '/-\n/' => '',
        // Collapse runs of spaces and tabs into a single space.
        '/[ \t]+/' => ' ',
        // Collapse three or more newlines into one blank line.
        '/\n{3,}/' => "\n\n",
    ];

    $normalized = preg_replace(array_keys($steps), array_values($steps), $text);

    // preg_replace() returns null on a PCRE error; fail explicitly rather
    // than passing null on to trim().
    if ($normalized === null) {
        throw new \RuntimeException('Text normalization failed: ' . preg_last_error_msg());
    }

    return trim($normalized);
}
} }

View File

@@ -0,0 +1,25 @@
<?php
declare(strict_types=1);
namespace App\Knowledge\Text;
/**
 * Cleans up extracted document text: unified line endings, no line-break
 * hyphenation, single spaces and at most one blank line in a row.
 */
final class TextNormalizer
{
    /**
     * Normalises the given text.
     *
     * Steps, in order: unify line endings, re-join words hyphenated across a
     * line break, collapse runs of spaces/tabs, collapse runs of blank lines,
     * then trim the result.
     *
     * @throws \RuntimeException When PCRE fails to process the text.
     */
    public function normalize(string $text): string
    {
        // Applied sequentially, each step working on the previous step's output.
        $steps = [
            // Line endings first (Windows "\r\n" and lone "\r"), so the
            // hyphenation step below also matches "-\r\n".
            '/\r\n?/' => "\n",
            // Remove hyphenation at line breaks.
            '/-\n/' => '',
            // Collapse runs of spaces and tabs into a single space.
            '/[ \t]+/' => ' ',
            // Collapse three or more newlines into one blank line.
            '/\n{3,}/' => "\n\n",
        ];

        $normalized = preg_replace(array_keys($steps), array_values($steps), $text);

        // preg_replace() returns null on a PCRE error; fail explicitly rather
        // than passing null on to trim().
        if ($normalized === null) {
            throw new \RuntimeException('Text normalization failed: ' . preg_last_error_msg());
        }

        return trim($normalized);
    }
}

View File

@@ -21,20 +21,33 @@
<thead> <thead>
<tr> <tr>
<th>Titel</th> <th>Titel</th>
<th>Typ</th>
<th>Status</th> <th>Status</th>
<th>Versionen</th> <th>Versionen</th>
<th>Aktive Version</th> <th>Aktive Version</th>
<th>Erstellt am</th> <th>Erstellt am</th>
<th>Aktionen</th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
{% for document in documents %} {% for document in documents %}
<tr> <tr>
<td> <td>
<a href="{{ path('admin_document_show', {id: document.id}) }}" class="text-decoration-none text-light"> <a href="{{ path('admin_document_show', {id: document.id}) }}"
class="text-decoration-none text-light">
{{ document.title }} {{ document.title }}
</a> </a>
</td> </td>
<td>
{% if document.currentVersion %}
<span class="badge bg-secondary">
{{ document.currentVersion.fileTypeLabel }}
</span>
{% else %}
<span class="badge bg-dark">-</span>
{% endif %}
</td>
<td> <td>
{% if document.status == 'ACTIVE' %} {% if document.status == 'ACTIVE' %}
<span class="badge bg-success">Aktiv</span> <span class="badge bg-success">Aktiv</span>
@@ -51,6 +64,12 @@
{% endif %} {% endif %}
</td> </td>
<td>{{ document.createdAt|date('d.m.Y H:i') }}</td> <td>{{ document.createdAt|date('d.m.Y H:i') }}</td>
<td>
    {# One class attribute only: HTML parsers ignore a repeated attribute, so the button classes are the ones that apply. #}
    <a class="btn btn-sm btn-outline-light"
       href="{{ path('admin_document_show', {id: document.id}) }}">
        Ansehen
    </a>
</td>
</tr> </tr>
{% endfor %} {% endfor %}
</tbody> </tbody>