optimize code and ingest docs
This commit is contained in:
@@ -13,6 +13,7 @@ use Doctrine\ORM\EntityManagerInterface;
|
||||
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
|
||||
use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
|
||||
use Symfony\Component\HttpFoundation\File\Exception\FileException;
|
||||
use Symfony\Component\HttpFoundation\File\UploadedFile;
|
||||
use Symfony\Component\HttpFoundation\RedirectResponse;
|
||||
use Symfony\Component\HttpFoundation\Request;
|
||||
use Symfony\Component\HttpFoundation\Response;
|
||||
@@ -29,7 +30,14 @@ class DocumentController extends AbstractController
|
||||
public function index(EntityManagerInterface $em): Response
|
||||
{
|
||||
$documents = $em->getRepository(Document::class)
|
||||
->findBy([], ['createdAt' => 'DESC']);
|
||||
->createQueryBuilder('d')
|
||||
->leftJoin('d.versions', 'v')
|
||||
->addSelect('v')
|
||||
->leftJoin('d.currentVersion', 'cv')
|
||||
->addSelect('cv')
|
||||
->orderBy('d.createdAt', 'DESC')
|
||||
->getQuery()
|
||||
->getResult();
|
||||
|
||||
return $this->render('admin/document/index.html.twig', [
|
||||
'documents' => $documents
|
||||
@@ -71,12 +79,22 @@ class DocumentController extends AbstractController
|
||||
{
|
||||
if ($request->isMethod('POST')) {
|
||||
|
||||
/** @var UploadedFile|null $file */
|
||||
$file = $request->files->get('file');
|
||||
$title = $request->request->get('title') ?: $file->getClientOriginalName();
|
||||
$title = $formatText->slugify($title);
|
||||
|
||||
if (!$file || !$title) {
|
||||
$this->addFlash('error', 'Titel und Datei sind erforderlich.');
|
||||
if (!$file instanceof UploadedFile) {
|
||||
throw new \InvalidArgumentException('No valid file uploaded.');
|
||||
}
|
||||
|
||||
$rawTitle = $request->request->get('title');
|
||||
|
||||
$title = is_string($rawTitle) && $rawTitle !== ''
|
||||
? $rawTitle
|
||||
: $formatText->slugify($file->getClientOriginalName());
|
||||
|
||||
|
||||
if (!$title) {
|
||||
$this->addFlash('error', 'Titel ist erforderlich.');
|
||||
return $this->redirectToRoute('admin_document_new');
|
||||
}
|
||||
|
||||
|
||||
@@ -99,4 +99,13 @@ class IngestProfileController extends AbstractController
|
||||
|
||||
return $this->redirectToRoute('admin_ingest_profile_list');
|
||||
}
|
||||
|
||||
/**
 * Deletes the ingest profile identified by the route parameter and sends
 * the user back to the profile list.
 *
 * NOTE(review): the route declares no `methods` restriction, so this
 * state-changing action appears to be reachable via GET and no CSRF check is
 * visible here — consider limiting it to POST/DELETE with a token.
 *
 * @param IngestProfileRepository $repo repository performing the deletion
 * @param string                  $id   identifier taken from the URL
 */
#[Route('/remove/{id}', name: 'admin_ingest_profile_remove')]
public function remove(IngestProfileRepository $repo, string $id): Response
{
    $repo->remove($id);

    return $this->redirectToRoute('admin_ingest_profile_list');
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Entity;
|
||||
|
||||
use Doctrine\ORM\Mapping as ORM;
|
||||
@@ -25,11 +27,11 @@ class DocumentVersion
|
||||
#[ORM\Column(type: 'uuid', unique: true)]
|
||||
private Uuid $id;
|
||||
|
||||
#[ORM\ManyToOne(inversedBy: 'versions')]
|
||||
#[ORM\JoinColumn(nullable: false)]
|
||||
#[ORM\ManyToOne(targetEntity: Document::class, inversedBy: 'versions')]
|
||||
#[ORM\JoinColumn(nullable: false, onDelete: 'CASCADE')]
|
||||
private Document $document;
|
||||
|
||||
#[ORM\Column]
|
||||
#[ORM\Column(type: 'integer')]
|
||||
private int $versionNumber;
|
||||
|
||||
#[ORM\Column(length: 255)]
|
||||
@@ -41,14 +43,14 @@ class DocumentVersion
|
||||
#[ORM\Column(length: 20)]
|
||||
private string $ingestStatus = self::INGEST_PENDING;
|
||||
|
||||
#[ORM\ManyToOne]
|
||||
#[ORM\ManyToOne(targetEntity: User::class)]
|
||||
#[ORM\JoinColumn(nullable: false)]
|
||||
private User $createdBy;
|
||||
|
||||
#[ORM\Column]
|
||||
#[ORM\Column(type: 'datetime_immutable')]
|
||||
private \DateTimeImmutable $createdAt;
|
||||
|
||||
#[ORM\Column]
|
||||
#[ORM\Column(type: 'boolean')]
|
||||
private bool $isActive = false;
|
||||
|
||||
public function __construct()
|
||||
@@ -57,22 +59,18 @@ class DocumentVersion
|
||||
$this->createdAt = new \DateTimeImmutable();
|
||||
}
|
||||
|
||||
// =========================
|
||||
// ID
|
||||
// =========================
|
||||
|
||||
public function getId(): Uuid
|
||||
{
|
||||
return $this->id;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Document Relation
|
||||
// =========================
|
||||
|
||||
/**
 * Assigns the owning document and makes sure the document's version
 * collection also contains this version.
 */
public function setDocument(Document $document): void
{
    $this->document = $document;

    // Nothing more to do when the document already lists this version.
    if ($document->getVersions()->contains($this)) {
        return;
    }

    $document->addVersion($this);
}
|
||||
|
||||
public function getDocument(): Document
|
||||
@@ -80,10 +78,6 @@ class DocumentVersion
|
||||
return $this->document;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Version Number
|
||||
// =========================
|
||||
|
||||
public function getVersionNumber(): int
|
||||
{
|
||||
return $this->versionNumber;
|
||||
@@ -94,10 +88,6 @@ class DocumentVersion
|
||||
$this->versionNumber = $number;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// File Path
|
||||
// =========================
|
||||
|
||||
public function setFilePath(string $path): void
|
||||
{
|
||||
$this->filePath = $path;
|
||||
@@ -108,10 +98,6 @@ class DocumentVersion
|
||||
return $this->filePath;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Checksum
|
||||
// =========================
|
||||
|
||||
public function setChecksum(string $checksum): void
|
||||
{
|
||||
$this->checksum = $checksum;
|
||||
@@ -122,10 +108,6 @@ class DocumentVersion
|
||||
return $this->checksum;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Ingest Status
|
||||
// =========================
|
||||
|
||||
public function setIngestStatus(string $status): void
|
||||
{
|
||||
if (!in_array($status, self::INGEST_STATUSES, true)) {
|
||||
@@ -145,10 +127,6 @@ class DocumentVersion
|
||||
return $this->ingestStatus === self::INGEST_INDEXED;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Created By
|
||||
// =========================
|
||||
|
||||
public function setCreatedBy(User $user): void
|
||||
{
|
||||
$this->createdBy = $user;
|
||||
@@ -159,19 +137,11 @@ class DocumentVersion
|
||||
return $this->createdBy;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Created At
|
||||
// =========================
|
||||
|
||||
public function getCreatedAt(): \DateTimeImmutable
|
||||
{
|
||||
return $this->createdAt;
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Active Flag
|
||||
// =========================
|
||||
|
||||
public function setActive(bool $active): void
|
||||
{
|
||||
$this->isActive = $active;
|
||||
@@ -182,15 +152,8 @@ class DocumentVersion
|
||||
return $this->isActive;
|
||||
}
|
||||
|
||||
//#########################################################
|
||||
// Helper
|
||||
//#########################################################
|
||||
/**
 * Returns the lower-cased extension of the stored file path.
 *
 * @return string the extension without a leading dot, or an empty string
 *                when the file path is empty
 */
public function getFileExtension(): string
{
    $path = $this->filePath;

    return $path
        ? mb_strtolower(pathinfo($path, PATHINFO_EXTENSION))
        : '';
}
|
||||
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/ChunkIndexWriter.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
/**
 * Maintains a JSON index file containing one entry per written chunk.
 *
 * The complete index is loaded, modified and rewritten on every add(), so
 * the file always holds a single JSON array of entries.
 */
final class ChunkIndexWriter
{
    /**
     * @param string $indexPath path of the JSON index file
     */
    public function __construct(
        private string $indexPath
    ) {}

    /**
     * Appends one entry to the index and persists the result.
     *
     * @param array<string, mixed> $entry entry to append
     *
     * @throws \JsonException    when the index cannot be encoded as JSON
     * @throws \RuntimeException when the directory or the file cannot be written
     */
    public function add(array $entry): void
    {
        $index = $this->load();
        $index[] = $entry;
        $this->save($index);
    }

    /**
     * Tells whether an entry with the given source and source hash exists.
     *
     * @param string $source value compared against each entry's "source" key
     * @param string $hash   value compared against each entry's "sourceHash" key
     */
    public function hasSourceHash(string $source, string $hash): bool
    {
        foreach ($this->load() as $entry) {
            if (
                ($entry['source'] ?? null) === $source &&
                ($entry['sourceHash'] ?? null) === $hash
            ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Reads the index file.
     *
     * A missing, unreadable, empty or non-array file yields an empty index.
     *
     * @return array<int|string, mixed>
     */
    private function load(): array
    {
        if (!is_file($this->indexPath)) {
            return [];
        }

        $json = file_get_contents($this->indexPath);
        if ($json === false || $json === '') {
            return [];
        }

        $data = json_decode($json, true);

        return is_array($data) ? $data : [];
    }

    /**
     * Writes the given index to disk, creating the directory when needed.
     *
     * @param array<int|string, mixed> $index
     *
     * @throws \JsonException    when the index cannot be encoded as JSON
     * @throws \RuntimeException when the directory or the file cannot be written
     */
    private function save(array $index): void
    {
        // Encode before touching the file: a failed encode used to yield
        // `false`, which was then written as an empty file and wiped the index.
        $json = json_encode(
            $index,
            JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
        );

        $dir = dirname($this->indexPath);
        // The second is_dir() covers a concurrent process creating the directory.
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException(sprintf('Directory "%s" could not be created.', $dir));
        }

        // LOCK_EX prevents interleaved writes from concurrent writers.
        if (file_put_contents($this->indexPath, $json, LOCK_EX) === false) {
            throw new \RuntimeException(sprintf('Index file "%s" could not be written.', $this->indexPath));
        }
    }
}
|
||||
@@ -1,149 +0,0 @@
|
||||
<?php
|
||||
// src/Knowledge/Ingest/ChunkWriter.php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Knowledge\Ingest;
|
||||
|
||||
|
||||
use App\Knowledge\StopWords;
|
||||
|
||||
/**
 * Writes the text chunks of a source to individual files, records them in a
 * JSON manifest and registers each chunk (with extracted keywords) in the
 * chunk index.
 */
final class ChunkWriter
{
    /** Maximum number of distinct keywords stored per chunk. */
    private const MAX_KEYWORDS = 25;

    /** Words with fewer characters than this are not used as keywords. */
    private const MIN_KEYWORD_LENGTH = 4;

    /**
     * @param string           $chunksDir    directory receiving the chunk files
     * @param string           $manifestPath path of the JSON manifest file
     * @param ChunkIndexWriter $indexWriter  index that receives one entry per chunk
     * @param StopWords        $stopWords    provider of words excluded from keywords
     */
    public function __construct(
        private string $chunksDir,
        private string $manifestPath,
        private ChunkIndexWriter $indexWriter,
        private StopWords $stopWords,
    ) {
    }

    /**
     * Writes every chunk to its own file and updates manifest and index.
     *
     * @param string   $sourceName name of the source the chunks belong to
     * @param string[] $chunks     chunk texts in order
     * @param string   $sourceHash hash of the source, stored with every index entry
     *
     * @return string[] written filenames
     *
     * @throws \JsonException    when the manifest cannot be encoded as JSON
     * @throws \RuntimeException when a directory or file cannot be written
     */
    public function write(string $sourceName, array $chunks, string $sourceHash): array
    {
        $this->ensureDirectory($this->chunksDir);

        $manifest = $this->loadManifest();
        $written = [];

        $base = $this->safeBase($sourceName);
        $ts = date('Ymd_His');

        // array_values() guarantees integer positions even when the caller
        // passes an array with string or sparse keys.
        foreach (array_values($chunks) as $i => $chunk) {
            $filename = "{$base}__{$ts}__" . str_pad((string) $i, 4, '0', STR_PAD_LEFT) . '.txt';
            $path = rtrim($this->chunksDir, '/') . '/' . $filename;

            $header = $this->buildHeader(
                source: $sourceName,
                index: $i
            );

            if (file_put_contents($path, $header . "\n\n" . $chunk) === false) {
                throw new \RuntimeException(sprintf('Chunk file "%s" could not be written.', $path));
            }

            $written[] = $filename;
            $chars = mb_strlen($chunk);

            $manifest[] = [
                'file' => $filename,
                'source' => $sourceName,
                'index' => $i,
                'chars' => $chars,
                'createdAt' => date('c'),
            ];

            $this->indexWriter->add([
                'file' => $filename,
                'source' => $sourceName,
                'sourceHash' => $sourceHash,
                'keywords' => $this->extractKeywords($chunk),
                'chars' => $chars,
            ]);
        }

        $this->saveManifest($manifest);

        return $written;
    }

    /**
     * Derives a filesystem-safe base name from the source name: the file name
     * without extension, lower-cased, with every run of characters outside
     * a-z, 0-9, "-" and "_" replaced by a single dash.
     */
    private function safeBase(string $name): string
    {
        $name = pathinfo($name, PATHINFO_FILENAME);
        $name = mb_strtolower($name);
        $name = preg_replace('/[^a-z0-9\-_]+/u', '-', $name);

        return trim((string) $name, '-');
    }

    /**
     * Reads the manifest; a missing, unreadable, empty or non-array file
     * yields an empty manifest.
     *
     * @return array<int|string, mixed>
     */
    private function loadManifest(): array
    {
        if (!is_file($this->manifestPath)) {
            return [];
        }

        $json = file_get_contents($this->manifestPath);
        if ($json === false || $json === '') {
            return [];
        }

        $data = json_decode($json, true);

        return is_array($data) ? $data : [];
    }

    /**
     * Persists the manifest, creating its directory when needed.
     *
     * @param array<int|string, mixed> $manifest
     *
     * @throws \JsonException    when the manifest cannot be encoded as JSON
     * @throws \RuntimeException when the directory or the file cannot be written
     */
    private function saveManifest(array $manifest): void
    {
        // Encode first so that a failed encode never truncates the existing file.
        $json = json_encode(
            $manifest,
            JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
        );

        $this->ensureDirectory(dirname($this->manifestPath));

        if (file_put_contents($this->manifestPath, $json, LOCK_EX) === false) {
            throw new \RuntimeException(sprintf('Manifest "%s" could not be written.', $this->manifestPath));
        }
    }

    /**
     * Creates the directory (recursively) unless it already exists.
     *
     * @throws \RuntimeException when the directory cannot be created
     */
    private function ensureDirectory(string $dir): void
    {
        // The second is_dir() covers a concurrent process creating the directory.
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException(sprintf('Directory "%s" could not be created.', $dir));
        }
    }

    /**
     * Builds the header line placed at the top of every chunk file.
     *
     * @param int $index zero-based chunk position; rendered one-based
     */
    private function buildHeader(string $source, int $index): string
    {
        return sprintf(
            '[Quelle: %s | Abschnitt: Chunk %d]',
            $source,
            $index + 1
        );
    }

    /**
     * Extracts up to MAX_KEYWORDS distinct keywords from the chunk text, in
     * order of first appearance.
     *
     * @return string[]
     */
    private function extractKeywords(string $text): array
    {
        // 1) Lower-case.
        $text = mb_strtolower($text);

        // 2) Strip URLs so that their fragments do not become keywords.
        $text = $this->replacePattern('#https?://\S+#u', ' ', $text);

        // 3) Newlines and tabs become spaces.
        $text = str_replace(["\r", "\n", "\t"], ' ', $text);

        // 4) Separators become spaces (not removed, so adjacent words stay apart).
        $text = $this->replacePattern('/[\/\.\,\:\;\-\_\(\)\[\]\{\}]/u', ' ', $text);

        // 5) Drop everything that is not a letter, digit or whitespace.
        $text = $this->replacePattern('/[^\p{L}\p{N}\s]/u', '', $text);

        // 6) Normalise whitespace.
        $text = trim($this->replacePattern('/\s+/u', ' ', $text));

        // Fetch the stop-word list once instead of once per word.
        $stopWords = $this->stopWords->getStopWords() ?? [];

        // 7) Filter and de-duplicate BEFORE applying the limit, so repeated
        //    words do not consume keyword slots.
        $keywords = [];
        $seen = [];

        foreach (explode(' ', $text) as $word) {
            if (mb_strlen($word) < self::MIN_KEYWORD_LENGTH || isset($seen[$word])) {
                continue;
            }
            if (in_array($word, $stopWords, true)) {
                continue;
            }

            $seen[$word] = true;
            $keywords[] = $word;

            if (count($keywords) >= self::MAX_KEYWORDS) {
                break;
            }
        }

        return $keywords;
    }

    /**
     * preg_replace() wrapper that never returns null: on a PCRE failure the
     * result is an empty string, so the chunk is still written, just without
     * keywords, instead of triggering a TypeError further down.
     */
    private function replacePattern(string $pattern, string $replacement, string $text): string
    {
        return preg_replace($pattern, $replacement, $text) ?? '';
    }
}
|
||||
@@ -29,18 +29,30 @@ final class KnowledgeIngestService
|
||||
|
||||
$chunks = $this->chunker->chunk($text);
|
||||
|
||||
$documentId = $version->getDocument()->getId()->toRfc4122();
|
||||
$doc = $version->getDocument();
|
||||
|
||||
$documentId = $doc->getId()->toRfc4122();
|
||||
$versionId = $version->getId()->toRfc4122();
|
||||
|
||||
// ✅ Regel: Wenn title gefüllt ist, kommt er in jeden Chunk
|
||||
$title = trim((string) $doc->getTitle());
|
||||
|
||||
$index = 0;
|
||||
|
||||
foreach ($chunks as $chunkText) {
|
||||
|
||||
// ✅ Prefix nur wenn title vorhanden; keine Flags, keine Meta-Schalter
|
||||
if ($title !== '' && !str_starts_with($chunkText, $title)) {
|
||||
$chunkText = $title . "\n\n" . $chunkText;
|
||||
}
|
||||
|
||||
yield [
|
||||
'chunk_id' => Uuid::v4()->toRfc4122(),
|
||||
'document_id' => $documentId,
|
||||
'version_id' => $versionId,
|
||||
'chunk_index' => $index++,
|
||||
'text' => $chunkText,
|
||||
// ✅ checksum muss den finalen Text abbilden (inkl. Titel)
|
||||
'checksum' => sha1($chunkText),
|
||||
'metadata' => $this->buildMetadata($version),
|
||||
];
|
||||
@@ -56,7 +68,6 @@ final class KnowledgeIngestService
|
||||
/**
 * Yields the chunk records of every version returned by the version
 * repository's iterateActiveVersions().
 *
 * @return iterable chunk records as produced by buildChunkRecords()
 */
public function buildAllActiveChunkRecords(): iterable
{
    foreach ($this->versionRepo->iterateActiveVersions() as $activeVersion) {
        // Delegating with `yield from` keeps the whole pipeline streamable
        // (a cascade of generators).
        yield from $this->buildChunkRecords($activeVersion);
    }
}
|
||||
|
||||
@@ -7,6 +7,7 @@ namespace App\Repository;
|
||||
use App\Entity\IngestProfile;
|
||||
use Doctrine\Bundle\DoctrineBundle\Repository\ServiceEntityRepository;
|
||||
use Doctrine\Persistence\ManagerRegistry;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
class IngestProfileRepository extends ServiceEntityRepository
|
||||
{
|
||||
@@ -28,4 +29,17 @@ class IngestProfileRepository extends ServiceEntityRepository
|
||||
{
|
||||
return $this->findOneBy(['active' => true]);
|
||||
}
|
||||
|
||||
/**
 * Deletes the ingest profile with the given identifier and flushes the
 * change. Does nothing when no matching IngestProfile is found.
 *
 * @param string $id identifier passed to find()
 */
public function remove(string $id): void
{
    $profile = $this->find($id);

    if ($profile instanceof IngestProfile) {
        $manager = $this->getEntityManager();
        $manager->remove($profile);
        $manager->flush();
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user