add pdf reader modul
This commit is contained in:
@@ -11,6 +11,7 @@
|
|||||||
"doctrine/doctrine-bundle": "^2.18",
|
"doctrine/doctrine-bundle": "^2.18",
|
||||||
"doctrine/doctrine-migrations-bundle": "^3.7",
|
"doctrine/doctrine-migrations-bundle": "^3.7",
|
||||||
"doctrine/orm": "^3.6",
|
"doctrine/orm": "^3.6",
|
||||||
|
"smalot/pdfparser": "^2.12",
|
||||||
"symfony/console": "^7.4",
|
"symfony/console": "^7.4",
|
||||||
"symfony/dotenv": "^7.4",
|
"symfony/dotenv": "^7.4",
|
||||||
"symfony/flex": "^2",
|
"symfony/flex": "^2",
|
||||||
|
|||||||
55
composer.lock
generated
55
composer.lock
generated
@@ -4,7 +4,7 @@
|
|||||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||||
"This file is @generated automatically"
|
"This file is @generated automatically"
|
||||||
],
|
],
|
||||||
"content-hash": "6eb5735f5c0cf7cb184127323d55e384",
|
"content-hash": "668be7cf3e1cc193e0f6a09448e72cf5",
|
||||||
"packages": [
|
"packages": [
|
||||||
{
|
{
|
||||||
"name": "doctrine/collections",
|
"name": "doctrine/collections",
|
||||||
@@ -1472,6 +1472,57 @@
|
|||||||
},
|
},
|
||||||
"time": "2024-09-11T13:17:53+00:00"
|
"time": "2024-09-11T13:17:53+00:00"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "smalot/pdfparser",
|
||||||
|
"version": "v2.12.3",
|
||||||
|
"source": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/smalot/pdfparser.git",
|
||||||
|
"reference": "61c9bcafcb92899b76d8ebda6508267bae77e264"
|
||||||
|
},
|
||||||
|
"dist": {
|
||||||
|
"type": "zip",
|
||||||
|
"url": "https://api.github.com/repos/smalot/pdfparser/zipball/61c9bcafcb92899b76d8ebda6508267bae77e264",
|
||||||
|
"reference": "61c9bcafcb92899b76d8ebda6508267bae77e264",
|
||||||
|
"shasum": ""
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"ext-iconv": "*",
|
||||||
|
"ext-zlib": "*",
|
||||||
|
"php": ">=7.1",
|
||||||
|
"symfony/polyfill-mbstring": "^1.18"
|
||||||
|
},
|
||||||
|
"type": "library",
|
||||||
|
"autoload": {
|
||||||
|
"psr-0": {
|
||||||
|
"Smalot\\PdfParser\\": "src/"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
|
"license": [
|
||||||
|
"LGPL-3.0"
|
||||||
|
],
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "Sebastien MALOT",
|
||||||
|
"email": "sebastien@malot.fr"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "Pdf parser library. Can read and extract information from pdf file.",
|
||||||
|
"homepage": "https://www.pdfparser.org",
|
||||||
|
"keywords": [
|
||||||
|
"extract",
|
||||||
|
"parse",
|
||||||
|
"parser",
|
||||||
|
"pdf",
|
||||||
|
"text"
|
||||||
|
],
|
||||||
|
"support": {
|
||||||
|
"issues": "https://github.com/smalot/pdfparser/issues",
|
||||||
|
"source": "https://github.com/smalot/pdfparser/tree/v2.12.3"
|
||||||
|
},
|
||||||
|
"time": "2026-01-08T08:04:04+00:00"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/cache",
|
"name": "symfony/cache",
|
||||||
"version": "v7.4.3",
|
"version": "v7.4.3",
|
||||||
@@ -5917,5 +5968,5 @@
|
|||||||
"ext-iconv": "*"
|
"ext-iconv": "*"
|
||||||
},
|
},
|
||||||
"platform-dev": {},
|
"platform-dev": {},
|
||||||
"plugin-api-version": "2.9.0"
|
"plugin-api-version": "2.6.0"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -181,4 +181,26 @@ class DocumentVersion
|
|||||||
{
|
{
|
||||||
return $this->isActive;
|
return $this->isActive;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//#########################################################
|
||||||
|
// Helper
|
||||||
|
//#########################################################
|
||||||
|
/**
 * Returns the lower-cased extension of this version's file path.
 *
 * @return string The extension as reported by pathinfo() (no leading dot),
 *                or an empty string when no file path is set.
 */
public function getFileExtension(): string
{
    $path = $this->filePath;

    // A missing/empty path has no extension to report.
    return $path
        ? mb_strtolower(pathinfo($path, PATHINFO_EXTENSION))
        : '';
}
|
||||||
|
|
||||||
|
/**
 * Returns a human-readable label for this version's file type.
 *
 * Known extensions map to fixed labels; any other extension is returned
 * upper-cased (so an empty extension yields an empty label).
 */
public function getFileTypeLabel(): string
{
    // Resolve the extension once instead of re-deriving it in the default arm.
    $extension = $this->getFileExtension();

    return match ($extension) {
        'pdf' => 'PDF',
        'txt' => 'Text',
        'md' => 'Markdown',
        // Multibyte-aware, matching the mb_strtolower() that produced the extension.
        default => mb_strtoupper($extension),
    };
}
|
||||||
}
|
}
|
||||||
|
|||||||
12
src/Knowledge/Extractor/DocumentExtractorInterface.php
Normal file
12
src/Knowledge/Extractor/DocumentExtractorInterface.php
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Knowledge\Extractor;
|
||||||
|
|
||||||
|
/**
 * Contract for components that turn a document file into plain text.
 */
interface DocumentExtractorInterface
{
    /**
     * Tells whether this extractor handles files with the given extension.
     *
     * @param string $extension File extension to check.
     */
    public function supports(string $extension): bool;

    /**
     * Extracts the text of the file at the given path.
     *
     * @param string $path Path of the file to read.
     *
     * @return string The extracted text.
     */
    public function extract(string $path): string;
}
|
||||||
32
src/Knowledge/Extractor/ExtractorResolver.php
Normal file
32
src/Knowledge/Extractor/ExtractorResolver.php
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Knowledge\Extractor;
|
||||||
|
|
||||||
|
/**
 * Selects the first registered extractor that supports a file's extension
 * and delegates text extraction to it.
 */
final class ExtractorResolver
{
    /**
     * @param iterable<DocumentExtractorInterface> $extractors Candidates, tried in iteration order.
     */
    public function __construct(
        private iterable $extractors
    ) {
    }

    /**
     * Extracts the text of the file at $path using the first matching extractor.
     *
     * @throws \RuntimeException When no extractor supports the file's extension.
     */
    public function extract(string $path): string
    {
        // The extension is handed to the extractors exactly as it appears in the path.
        $fileExtension = pathinfo($path, PATHINFO_EXTENSION);

        foreach ($this->extractors as $candidate) {
            if (!$candidate->supports($fileExtension)) {
                continue;
            }

            return $candidate->extract($path);
        }

        $message = sprintf('No extractor available for extension "%s".', $fileExtension);

        throw new \RuntimeException($message);
    }
}
|
||||||
23
src/Knowledge/Extractor/PdfExtractor.php
Normal file
23
src/Knowledge/Extractor/PdfExtractor.php
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Knowledge\Extractor;
|
||||||
|
|
||||||
|
use Smalot\PdfParser\Parser;
|
||||||
|
|
||||||
|
/**
 * Extractor for PDF files, backed by the Smalot PDF parser.
 */
final class PdfExtractor implements DocumentExtractorInterface
{
    /**
     * Accepts the "pdf" extension, compared case-insensitively.
     */
    public function supports(string $extension): bool
    {
        return 'pdf' === strtolower($extension);
    }

    /**
     * Parses the PDF at $path and returns the text reported by the parser.
     *
     * Parser errors are not caught here and propagate to the caller.
     */
    public function extract(string $path): string
    {
        return (new Parser())
            ->parseFile($path)
            ->getText();
    }
}
|
||||||
@@ -5,6 +5,8 @@ declare(strict_types=1);
|
|||||||
|
|
||||||
namespace App\Knowledge\Ingest;
|
namespace App\Knowledge\Ingest;
|
||||||
|
|
||||||
|
use Smalot\PdfParser\Parser;
|
||||||
|
|
||||||
final class DocumentLoader
|
final class DocumentLoader
|
||||||
{
|
{
|
||||||
public function load(string $path): string
|
public function load(string $path): string
|
||||||
@@ -17,7 +19,9 @@ final class DocumentLoader
|
|||||||
|
|
||||||
return match ($ext) {
|
return match ($ext) {
|
||||||
'txt', 'md' => $this->loadText($path),
|
'txt', 'md' => $this->loadText($path),
|
||||||
// 'pdf' => $this->loadPdf($path),
|
'pdf' => $this->loadPdf($path),
|
||||||
|
|
||||||
|
// vorbereitet für später:
|
||||||
// 'docx' => $this->loadDocx($path),
|
// 'docx' => $this->loadDocx($path),
|
||||||
|
|
||||||
default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
|
default => throw new \RuntimeException("Unsupported file type: .{$ext}"),
|
||||||
@@ -30,7 +34,41 @@ final class DocumentLoader
|
|||||||
if ($content === false) {
|
if ($content === false) {
|
||||||
throw new \RuntimeException("Could not read file: {$path}");
|
throw new \RuntimeException("Could not read file: {$path}");
|
||||||
}
|
}
|
||||||
return $content;
|
|
||||||
|
return $this->normalize($content);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reads the PDF at $path and returns its normalized text.
 *
 * @throws \RuntimeException When parsing fails; the original error is kept as the previous exception.
 */
private function loadPdf(string $path): string
{
    $pdfParser = new Parser();

    try {
        $rawText = $pdfParser->parseFile($path)->getText();
    } catch (\Throwable $error) {
        // Wrap any parser failure so the message always names the offending file.
        throw new \RuntimeException("Failed to parse PDF: {$path}", previous: $error);
    }

    return $this->normalize($rawText);
}
|
||||||
|
|
||||||
|
/**
 * Central text normalizer applied to every document type.
 *
 * Steps, in order: unify line endings to "\n", join words hyphenated across
 * a line break, collapse runs of spaces/tabs into one space, reduce three or
 * more consecutive newlines to a single blank line, and trim the result.
 *
 * @throws \RuntimeException When a regular-expression step fails.
 */
private function normalize(string $text): string
{
    // Line endings first: otherwise "-\r\n" would never match the hyphenation pattern below.
    $text = str_replace(["\r\n", "\r"], "\n", $text);

    // Patterns are applied one after another, each on the previous result.
    $normalized = preg_replace(
        [
            '/-\n/',     // hyphenation at a line break
            '/[ \t]+/',  // runs of spaces/tabs
            '/\n{3,}/',  // more than one blank line
        ],
        ['', ' ', "\n\n"],
        $text,
    );

    // preg_replace() returns null on a PCRE error; fail loudly instead of passing null on.
    if ($normalized === null) {
        throw new \RuntimeException('Text normalization failed: ' . preg_last_error_msg());
    }

    return trim($normalized);
}
|
||||||
}
|
}
|
||||||
|
|||||||
25
src/Knowledge/Text/TextNormalizer.php
Normal file
25
src/Knowledge/Text/TextNormalizer.php
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Knowledge\Text;
|
||||||
|
|
||||||
|
/**
 * Cleans up extracted document text so downstream processing sees
 * consistent line endings and whitespace.
 */
final class TextNormalizer
{
    /**
     * Normalizes raw text.
     *
     * Steps, in order: unify line endings to "\n", join words hyphenated
     * across a line break, collapse runs of spaces/tabs into one space,
     * reduce three or more consecutive newlines to a single blank line, and
     * trim the result.
     *
     * @throws \RuntimeException When a regular-expression step fails.
     */
    public function normalize(string $text): string
    {
        // Line endings first: otherwise "-\r\n" would never match the hyphenation pattern below.
        $text = str_replace(["\r\n", "\r"], "\n", $text);

        // Patterns are applied one after another, each on the previous result.
        $normalized = preg_replace(
            [
                '/-\n/',     // hyphenation at a line break
                '/[ \t]+/',  // runs of spaces/tabs
                '/\n{3,}/',  // more than one blank line
            ],
            ['', ' ', "\n\n"],
            $text,
        );

        // preg_replace() returns null on a PCRE error; fail loudly instead of passing null on.
        if ($normalized === null) {
            throw new \RuntimeException('Text normalization failed: ' . preg_last_error_msg());
        }

        return trim($normalized);
    }
}
|
||||||
@@ -21,20 +21,33 @@
|
|||||||
<thead>
|
<thead>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Titel</th>
|
<th>Titel</th>
|
||||||
|
<th>Typ</th>
|
||||||
<th>Status</th>
|
<th>Status</th>
|
||||||
<th>Versionen</th>
|
<th>Versionen</th>
|
||||||
<th>Aktive Version</th>
|
<th>Aktive Version</th>
|
||||||
<th>Erstellt am</th>
|
<th>Erstellt am</th>
|
||||||
|
<th>Aktionen</th>
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
{% for document in documents %}
|
{% for document in documents %}
|
||||||
<tr>
|
<tr>
|
||||||
|
|
||||||
<td>
|
<td>
|
||||||
<a href="{{ path('admin_document_show', {id: document.id}) }}" class="text-decoration-none text-light">
|
<a href="{{ path('admin_document_show', {id: document.id}) }}"
|
||||||
|
class="text-decoration-none text-light">
|
||||||
{{ document.title }}
|
{{ document.title }}
|
||||||
</a>
|
</a>
|
||||||
</td>
|
</td>
|
||||||
|
<td>
|
||||||
|
{% if document.currentVersion %}
|
||||||
|
<span class="badge bg-secondary">
|
||||||
|
{{ document.currentVersion.fileTypeLabel }}
|
||||||
|
</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="badge bg-dark">-</span>
|
||||||
|
{% endif %}
|
||||||
|
</td>
|
||||||
<td>
|
<td>
|
||||||
{% if document.status == 'ACTIVE' %}
|
{% if document.status == 'ACTIVE' %}
|
||||||
<span class="badge bg-success">Aktiv</span>
|
<span class="badge bg-success">Aktiv</span>
|
||||||
@@ -51,6 +64,12 @@
|
|||||||
{% endif %}
|
{% endif %}
|
||||||
</td>
|
</td>
|
||||||
<td>{{ document.createdAt|date('d.m.Y H:i') }}</td>
|
<td>{{ document.createdAt|date('d.m.Y H:i') }}</td>
|
||||||
|
<td>
|
||||||
|
<a class="btn btn-sm btn-outline-light" href="{{ path('admin_document_show', {id: document.id}) }}"
|
||||||
|
>
|
||||||
|
Ansehen
|
||||||
|
</a>
|
||||||
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</tbody>
|
</tbody>
|
||||||
|
|||||||
Reference in New Issue
Block a user