optimize code and ingest docs

This commit is contained in:
team 1
2026-02-17 14:12:24 +01:00
parent f528a1c395
commit 812f2bf265
11 changed files with 156 additions and 282 deletions

View File

@@ -13,6 +13,7 @@ use Doctrine\ORM\EntityManagerInterface;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController; use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface; use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
use Symfony\Component\HttpFoundation\File\Exception\FileException; use Symfony\Component\HttpFoundation\File\Exception\FileException;
use Symfony\Component\HttpFoundation\File\UploadedFile;
use Symfony\Component\HttpFoundation\RedirectResponse; use Symfony\Component\HttpFoundation\RedirectResponse;
use Symfony\Component\HttpFoundation\Request; use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response; use Symfony\Component\HttpFoundation\Response;
@@ -29,7 +30,14 @@ class DocumentController extends AbstractController
public function index(EntityManagerInterface $em): Response public function index(EntityManagerInterface $em): Response
{ {
$documents = $em->getRepository(Document::class) $documents = $em->getRepository(Document::class)
->findBy([], ['createdAt' => 'DESC']); ->createQueryBuilder('d')
->leftJoin('d.versions', 'v')
->addSelect('v')
->leftJoin('d.currentVersion', 'cv')
->addSelect('cv')
->orderBy('d.createdAt', 'DESC')
->getQuery()
->getResult();
return $this->render('admin/document/index.html.twig', [ return $this->render('admin/document/index.html.twig', [
'documents' => $documents 'documents' => $documents
@@ -71,12 +79,22 @@ class DocumentController extends AbstractController
{ {
if ($request->isMethod('POST')) { if ($request->isMethod('POST')) {
/** @var UploadedFile|null $file */
$file = $request->files->get('file'); $file = $request->files->get('file');
$title = $request->request->get('title') ?: $file->getClientOriginalName();
$title = $formatText->slugify($title);
if (!$file || !$title) { if (!$file instanceof UploadedFile) {
$this->addFlash('error', 'Titel und Datei sind erforderlich.'); throw new \InvalidArgumentException('No valid file uploaded.');
}
$rawTitle = $request->request->get('title');
$title = is_string($rawTitle) && $rawTitle !== ''
? $rawTitle
: $formatText->slugify($file->getClientOriginalName());
if (!$title) {
$this->addFlash('error', 'Titel ist erforderlich.');
return $this->redirectToRoute('admin_document_new'); return $this->redirectToRoute('admin_document_new');
} }

View File

@@ -99,4 +99,13 @@ class IngestProfileController extends AbstractController
return $this->redirectToRoute('admin_ingest_profile_list'); return $this->redirectToRoute('admin_ingest_profile_list');
} }
/**
 * Deletes the ingest profile identified by the `{id}` route placeholder and
 * redirects back to the ingest profile list.
 *
 * The id is handed to the repository unchanged; no validation happens here.
 *
 * NOTE(review): the route declares no `methods` restriction and the list
 * template links to it with a plain anchor, so the deletion is triggered by
 * a GET request without a CSRF token. Consider restricting the route to
 * POST/DELETE and submitting it from a form with a token.
 *
 * @param IngestProfileRepository $repo Repository performing the deletion.
 * @param string                  $id   Raw value of the `{id}` route placeholder.
 *
 * @return Response Redirect to the `admin_ingest_profile_list` route.
 */
#[Route('/remove/{id}', name: 'admin_ingest_profile_remove')]
public function remove(
IngestProfileRepository $repo,
string $id
): Response {
$repo->remove($id);
return $this->redirectToRoute('admin_ingest_profile_list');
}
} }

View File

@@ -1,5 +1,7 @@
<?php <?php
declare(strict_types=1);
namespace App\Entity; namespace App\Entity;
use Doctrine\ORM\Mapping as ORM; use Doctrine\ORM\Mapping as ORM;
@@ -25,11 +27,11 @@ class DocumentVersion
#[ORM\Column(type: 'uuid', unique: true)] #[ORM\Column(type: 'uuid', unique: true)]
private Uuid $id; private Uuid $id;
#[ORM\ManyToOne(inversedBy: 'versions')] #[ORM\ManyToOne(targetEntity: Document::class, inversedBy: 'versions')]
#[ORM\JoinColumn(nullable: false)] #[ORM\JoinColumn(nullable: false, onDelete: 'CASCADE')]
private Document $document; private Document $document;
#[ORM\Column] #[ORM\Column(type: 'integer')]
private int $versionNumber; private int $versionNumber;
#[ORM\Column(length: 255)] #[ORM\Column(length: 255)]
@@ -41,14 +43,14 @@ class DocumentVersion
#[ORM\Column(length: 20)] #[ORM\Column(length: 20)]
private string $ingestStatus = self::INGEST_PENDING; private string $ingestStatus = self::INGEST_PENDING;
#[ORM\ManyToOne] #[ORM\ManyToOne(targetEntity: User::class)]
#[ORM\JoinColumn(nullable: false)] #[ORM\JoinColumn(nullable: false)]
private User $createdBy; private User $createdBy;
#[ORM\Column] #[ORM\Column(type: 'datetime_immutable')]
private \DateTimeImmutable $createdAt; private \DateTimeImmutable $createdAt;
#[ORM\Column] #[ORM\Column(type: 'boolean')]
private bool $isActive = false; private bool $isActive = false;
public function __construct() public function __construct()
@@ -57,22 +59,18 @@ class DocumentVersion
$this->createdAt = new \DateTimeImmutable(); $this->createdAt = new \DateTimeImmutable();
} }
// =========================
// ID
// =========================
public function getId(): Uuid public function getId(): Uuid
{ {
return $this->id; return $this->id;
} }
// =========================
// Document Relation
// =========================
public function setDocument(Document $document): void public function setDocument(Document $document): void
{ {
$this->document = $document; $this->document = $document;
if (!$document->getVersions()->contains($this)) {
$document->addVersion($this);
}
} }
public function getDocument(): Document public function getDocument(): Document
@@ -80,10 +78,6 @@ class DocumentVersion
return $this->document; return $this->document;
} }
// =========================
// Version Number
// =========================
public function getVersionNumber(): int public function getVersionNumber(): int
{ {
return $this->versionNumber; return $this->versionNumber;
@@ -94,10 +88,6 @@ class DocumentVersion
$this->versionNumber = $number; $this->versionNumber = $number;
} }
// =========================
// File Path
// =========================
public function setFilePath(string $path): void public function setFilePath(string $path): void
{ {
$this->filePath = $path; $this->filePath = $path;
@@ -108,10 +98,6 @@ class DocumentVersion
return $this->filePath; return $this->filePath;
} }
// =========================
// Checksum
// =========================
public function setChecksum(string $checksum): void public function setChecksum(string $checksum): void
{ {
$this->checksum = $checksum; $this->checksum = $checksum;
@@ -122,10 +108,6 @@ class DocumentVersion
return $this->checksum; return $this->checksum;
} }
// =========================
// Ingest Status
// =========================
public function setIngestStatus(string $status): void public function setIngestStatus(string $status): void
{ {
if (!in_array($status, self::INGEST_STATUSES, true)) { if (!in_array($status, self::INGEST_STATUSES, true)) {
@@ -145,10 +127,6 @@ class DocumentVersion
return $this->ingestStatus === self::INGEST_INDEXED; return $this->ingestStatus === self::INGEST_INDEXED;
} }
// =========================
// Created By
// =========================
public function setCreatedBy(User $user): void public function setCreatedBy(User $user): void
{ {
$this->createdBy = $user; $this->createdBy = $user;
@@ -159,19 +137,11 @@ class DocumentVersion
return $this->createdBy; return $this->createdBy;
} }
// =========================
// Created At
// =========================
public function getCreatedAt(): \DateTimeImmutable public function getCreatedAt(): \DateTimeImmutable
{ {
return $this->createdAt; return $this->createdAt;
} }
// =========================
// Active Flag
// =========================
public function setActive(bool $active): void public function setActive(bool $active): void
{ {
$this->isActive = $active; $this->isActive = $active;
@@ -182,15 +152,8 @@ class DocumentVersion
return $this->isActive; return $this->isActive;
} }
//#########################################################
// Helper
//#########################################################
public function getFileExtension(): string public function getFileExtension(): string
{ {
if (!$this->filePath) {
return '';
}
return mb_strtolower(pathinfo($this->filePath, PATHINFO_EXTENSION)); return mb_strtolower(pathinfo($this->filePath, PATHINFO_EXTENSION));
} }

View File

@@ -1,58 +0,0 @@
<?php
// src/Knowledge/Ingest/ChunkIndexWriter.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
/**
 * Maintains a JSON index file that holds one entry per written chunk.
 *
 * The index is a flat JSON list. Every mutation re-reads the file, appends
 * to the decoded list and rewrites the file as a whole.
 */
final class ChunkIndexWriter
{
    /**
     * @param string $indexPath Path of the JSON index file. The file and its
     *                          parent directory are created on first write.
     */
    public function __construct(
        private string $indexPath
    ) {}

    /**
     * Appends one entry to the index and persists the complete index.
     *
     * @param array $entry Entry to append; must be JSON-serialisable.
     *
     * @throws \JsonException    If the index cannot be encoded as JSON.
     * @throws \RuntimeException If the index directory or file cannot be written.
     */
    public function add(array $entry): void
    {
        $index = $this->load();
        $index[] = $entry;
        $this->save($index);
    }

    /**
     * Tells whether an entry with the given source and source hash is indexed.
     *
     * @param string $source Value compared against the entry's `source` key.
     * @param string $hash   Value compared against the entry's `sourceHash` key.
     *
     * @return bool True if at least one entry matches both values exactly.
     */
    public function hasSourceHash(string $source, string $hash): bool
    {
        foreach ($this->load() as $entry) {
            if (
                ($entry['source'] ?? null) === $source &&
                ($entry['sourceHash'] ?? null) === $hash
            ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Reads the index from disk.
     *
     * @return array Decoded index; empty when the file is missing, empty,
     *               unreadable or does not contain a JSON array/object.
     */
    private function load(): array
    {
        if (!is_file($this->indexPath)) {
            return [];
        }

        $json = file_get_contents($this->indexPath);
        $data = $json ? json_decode($json, true) : null;

        return is_array($data) ? $data : [];
    }

    /**
     * Writes the complete index to disk.
     *
     * @param array $index Full list of entries to persist.
     *
     * @throws \JsonException    If the index cannot be encoded as JSON.
     * @throws \RuntimeException If the index directory or file cannot be written.
     */
    private function save(array $index): void
    {
        // Encode before touching the file: a failed json_encode() used to yield
        // `false`, which file_put_contents() wrote as an empty string and thereby
        // wiped the whole index. Malformed UTF-8 is substituted instead of
        // failing; every other encoding error aborts the write.
        $json = json_encode(
            $index,
            JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE | JSON_THROW_ON_ERROR
        );

        $dir = dirname($this->indexPath);
        // The trailing is_dir() covers the case that another process created
        // the directory between the first check and mkdir().
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException(sprintf('Unable to create index directory "%s".', $dir));
        }

        // LOCK_EX keeps concurrent writers from interleaving their output.
        if (file_put_contents($this->indexPath, $json, LOCK_EX) === false) {
            throw new \RuntimeException(sprintf('Unable to write index file "%s".', $this->indexPath));
        }
    }
}

View File

@@ -1,149 +0,0 @@
<?php
// src/Knowledge/Ingest/ChunkWriter.php
declare(strict_types=1);
namespace App\Knowledge\Ingest;
use App\Knowledge\StopWords;
/**
 * Writes the text chunks of a source document to individual files and
 * records each chunk in a JSON manifest and in the keyword index.
 */
final class ChunkWriter
{
    /** Maximum number of keywords stored per chunk. */
    private const MAX_KEYWORDS = 25;

    /** Words with fewer characters than this are never used as keywords. */
    private const MIN_KEYWORD_LENGTH = 4;

    /**
     * @param string           $chunksDir    Directory receiving the chunk files; created on demand.
     * @param string           $manifestPath Path of the JSON manifest file; created on demand.
     * @param ChunkIndexWriter $indexWriter  Index that receives one entry per chunk.
     * @param StopWords        $stopWords    Provider of the words excluded from keyword extraction.
     */
    public function __construct(
        private string $chunksDir,
        private string $manifestPath,
        private ChunkIndexWriter $indexWriter,
        private StopWords $stopWords,
    ) {
    }

    /**
     * Writes every chunk to its own file and registers it in manifest and index.
     *
     * The manifest is saved even when writing aborts half-way, so chunk files
     * that already exist on disk stay listed.
     *
     * @param string   $sourceName Name of the source; used for the file name base and the chunk header.
     * @param string[] $chunks     Chunk texts; the array keys are used as chunk indexes.
     * @param string   $sourceHash Hash of the source, stored with every index entry.
     *
     * @return string[] written filenames
     *
     * @throws \JsonException    If the manifest cannot be encoded as JSON.
     * @throws \RuntimeException If a directory, a chunk file or the manifest cannot be written.
     */
    public function write(string $sourceName, array $chunks, string $sourceHash): array
    {
        $this->ensureDirectory($this->chunksDir);

        $manifest = $this->loadManifest();
        $written = [];

        $base = $this->safeBase($sourceName);
        $ts = date('Ymd_His');
        $targetDir = rtrim($this->chunksDir, '/');

        try {
            foreach ($chunks as $i => $chunk) {
                $filename = "{$base}__{$ts}__" . str_pad((string) $i, 4, '0', STR_PAD_LEFT) . ".txt";
                $path = $targetDir . '/' . $filename;

                $header = $this->buildHeader(
                    source: $sourceName,
                    index: $i
                );

                // A chunk that never reached the disk must not be listed in
                // the manifest or the index.
                if (file_put_contents($path, $header . "\n\n" . $chunk) === false) {
                    throw new \RuntimeException(sprintf('Unable to write chunk file "%s".', $path));
                }

                $chars = mb_strlen($chunk);
                $written[] = $filename;

                $manifest[] = [
                    'file' => $filename,
                    'source' => $sourceName,
                    'index' => $i,
                    'chars' => $chars,
                    'createdAt' => date('c'),
                ];

                $this->indexWriter->add([
                    'file' => $filename,
                    'source' => $sourceName,
                    'sourceHash' => $sourceHash,
                    'keywords' => $this->extractKeywords($chunk),
                    'chars' => $chars,
                ]);
            }
        } finally {
            $this->saveManifest($manifest);
        }

        return $written;
    }

    /**
     * Creates the directory (recursively) unless it already exists.
     *
     * @throws \RuntimeException If the directory cannot be created.
     */
    private function ensureDirectory(string $dir): void
    {
        // The trailing is_dir() covers the case that another process created
        // the directory between the first check and mkdir().
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            throw new \RuntimeException(sprintf('Unable to create directory "%s".', $dir));
        }
    }

    /**
     * Derives a file-name-safe base from the source name: extension removed,
     * lower-cased, every run of characters outside `a-z0-9-_` replaced by a
     * single dash, surrounding dashes trimmed.
     */
    private function safeBase(string $name): string
    {
        $name = pathinfo($name, PATHINFO_FILENAME);
        $name = mb_strtolower($name);
        $name = preg_replace('/[^a-z0-9\-_]+/u', '-', $name);

        return trim((string) $name, '-');
    }

    /**
     * Reads the manifest from disk.
     *
     * @return array Decoded manifest; empty when the file is missing, empty,
     *               unreadable or does not contain a JSON array/object.
     */
    private function loadManifest(): array
    {
        if (!is_file($this->manifestPath)) {
            return [];
        }

        $json = file_get_contents($this->manifestPath);
        $data = $json ? json_decode($json, true) : null;

        return is_array($data) ? $data : [];
    }

    /**
     * Writes the complete manifest to disk.
     *
     * @throws \JsonException    If the manifest cannot be encoded as JSON.
     * @throws \RuntimeException If the manifest directory or file cannot be written.
     */
    private function saveManifest(array $manifest): void
    {
        // Encode before touching the file: a failed json_encode() used to yield
        // `false`, which file_put_contents() wrote as an empty string and thereby
        // wiped the manifest. Malformed UTF-8 (e.g. in a source name) is
        // substituted; every other encoding error aborts the write.
        $json = json_encode(
            $manifest,
            JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE | JSON_THROW_ON_ERROR
        );

        $this->ensureDirectory(dirname($this->manifestPath));

        if (file_put_contents($this->manifestPath, $json, LOCK_EX) === false) {
            throw new \RuntimeException(sprintf('Unable to write manifest file "%s".', $this->manifestPath));
        }
    }

    /**
     * Builds the header line that is prepended to every chunk file.
     *
     * @param string $source Source name shown in the header.
     * @param int    $index  Zero-based chunk index; rendered one-based.
     */
    private function buildHeader(string $source, int $index): string
    {
        return sprintf(
            '[Quelle: %s | Abschnitt: Chunk %d]',
            $source,
            $index + 1
        );
    }

    /**
     * Extracts up to MAX_KEYWORDS distinct keywords from the chunk text, in
     * order of first appearance.
     *
     * Words shorter than MIN_KEYWORD_LENGTH characters and stop words are
     * skipped. Duplicates are dropped before the limit is applied, so repeated
     * words no longer use up the keyword budget.
     *
     * @return string[]
     */
    private function extractKeywords(string $text): array
    {
        $text = mb_strtolower($text);

        // Drop URLs before splitting on separators; they would otherwise be
        // broken into meaningless tokens.
        $text = (string) preg_replace('#https?://\S+#u', ' ', $text);

        $text = str_replace(["\r", "\n", "\t"], ' ', $text);

        // Separators become spaces (they are NOT deleted) so that the words
        // on either side are not glued together.
        $text = (string) preg_replace('/[\/\.\,\:\;\-\_\(\)\[\]\{\}]/u', ' ', $text);

        // Remove everything that is neither letter, digit nor whitespace.
        $text = (string) preg_replace('/[^\p{L}\p{N}\s]/u', '', $text);

        $text = trim((string) preg_replace('/\s+/u', ' ', $text));

        // Fetched once; the list does not change while iterating the words.
        $stopWords = $this->stopWords->getStopWords() ?? [];

        $keywords = [];
        $seen = [];

        foreach (explode(' ', $text) as $word) {
            if (mb_strlen($word) < self::MIN_KEYWORD_LENGTH || isset($seen[$word])) {
                continue;
            }

            if (in_array($word, $stopWords, true)) {
                continue;
            }

            $seen[$word] = true;
            $keywords[] = $word;

            if (count($keywords) === self::MAX_KEYWORDS) {
                break;
            }
        }

        return $keywords;
    }
}

View File

@@ -29,18 +29,30 @@ final class KnowledgeIngestService
$chunks = $this->chunker->chunk($text); $chunks = $this->chunker->chunk($text);
$documentId = $version->getDocument()->getId()->toRfc4122(); $doc = $version->getDocument();
$documentId = $doc->getId()->toRfc4122();
$versionId = $version->getId()->toRfc4122(); $versionId = $version->getId()->toRfc4122();
// ✅ Regel: Wenn title gefüllt ist, kommt er in jeden Chunk
$title = trim((string) $doc->getTitle());
$index = 0; $index = 0;
foreach ($chunks as $chunkText) { foreach ($chunks as $chunkText) {
// ✅ Prefix nur wenn title vorhanden; keine Flags, keine Meta-Schalter
if ($title !== '' && !str_starts_with($chunkText, $title)) {
$chunkText = $title . "\n\n" . $chunkText;
}
yield [ yield [
'chunk_id' => Uuid::v4()->toRfc4122(), 'chunk_id' => Uuid::v4()->toRfc4122(),
'document_id' => $documentId, 'document_id' => $documentId,
'version_id' => $versionId, 'version_id' => $versionId,
'chunk_index' => $index++, 'chunk_index' => $index++,
'text' => $chunkText, 'text' => $chunkText,
// ✅ checksum muss den finalen Text abbilden (inkl. Titel)
'checksum' => sha1($chunkText), 'checksum' => sha1($chunkText),
'metadata' => $this->buildMetadata($version), 'metadata' => $this->buildMetadata($version),
]; ];
@@ -56,7 +68,6 @@ final class KnowledgeIngestService
public function buildAllActiveChunkRecords(): iterable public function buildAllActiveChunkRecords(): iterable
{ {
foreach ($this->versionRepo->iterateActiveVersions() as $version) { foreach ($this->versionRepo->iterateActiveVersions() as $version) {
// yield from hält das Ganze streamingfähig (Generator-Kaskade)
yield from $this->buildChunkRecords($version); yield from $this->buildChunkRecords($version);
} }
} }

View File

@@ -7,6 +7,7 @@ namespace App\Repository;
use App\Entity\IngestProfile; use App\Entity\IngestProfile;
use Doctrine\Bundle\DoctrineBundle\Repository\ServiceEntityRepository; use Doctrine\Bundle\DoctrineBundle\Repository\ServiceEntityRepository;
use Doctrine\Persistence\ManagerRegistry; use Doctrine\Persistence\ManagerRegistry;
use Symfony\Component\Uid\Uuid;
class IngestProfileRepository extends ServiceEntityRepository class IngestProfileRepository extends ServiceEntityRepository
{ {
@@ -28,4 +29,17 @@ class IngestProfileRepository extends ServiceEntityRepository
{ {
return $this->findOneBy(['active' => true]); return $this->findOneBy(['active' => true]);
} }
/**
 * Deletes the ingest profile with the given identifier, if one is found.
 *
 * An identifier that does not resolve to an IngestProfile is ignored
 * silently. The removal is flushed immediately.
 *
 * @param string $id Identifier passed to the repository lookup.
 */
public function remove(string $id): void
{
    $profile = $this->find($id);

    if ($profile instanceof IngestProfile) {
        $manager = $this->getEntityManager();
        $manager->remove($profile);
        $manager->flush();
    }
}
} }

View File

@@ -24,6 +24,7 @@
<th>ID</th> <th>ID</th>
<th>Typ</th> <th>Typ</th>
<th>Status</th> <th>Status</th>
<th>Indexiert</th>
<th>Versionen</th> <th>Versionen</th>
<th>Aktive Version</th> <th>Aktive Version</th>
<th>Erstellt am</th> <th>Erstellt am</th>
@@ -58,6 +59,13 @@
<span class="badge bg-secondary">Archiviert</span> <span class="badge bg-secondary">Archiviert</span>
{% endif %} {% endif %}
</td> </td>
<td>
    {# A document may have no current version; accessing ingestStatus on null fails when strict variables are enabled. #}
    {% if document.currentVersion is null %}
        <span class="badge bg-secondary">–</span>
    {% elseif document.currentVersion.ingestStatus == 'INDEXED' %}
        <span class="badge bg-success">{{ document.currentVersion.ingestStatus }}</span>
    {% else %}
        <span class="badge bg-danger">{{ document.currentVersion.ingestStatus }}</span>
    {% endif %}
</td>
<td>{{ document.versions|length }}</td> <td>{{ document.versions|length }}</td>
<td> <td>
{% if document.currentVersion %} {% if document.currentVersion %}

View File

@@ -8,12 +8,15 @@
<form method="post" enctype="multipart/form-data"> <form method="post" enctype="multipart/form-data">
<div class="mb-3"> <div class="mb-3">
<label class="form-label">Titel</label> <label class="form-label">Titel:</label>
<div class="mb-2"><b>Bitte geben Sie einen aussagekräftigen Titel ein.</b><br>
Der Titel ist entscheidend, damit in jedem Chunk ein sinnvoller thematischer Bezug hergestellt und eine saubere semantische Zuordnung ermöglicht werden kann.<br>
Wenn kein Titel angegeben wird, wird automatisch der Dateiname als Titel verwendet (nicht empfohlen).</div>
<input class="form-control" name="title"> <input class="form-control" name="title">
</div> </div>
<div class="mb-3"> <div class="mb-3">
<label class="form-label">Datei</label> <label class="form-label">Datei:</label>
<input type="file" class="form-control" name="file" required> <input type="file" class="form-control" name="file" required>
</div> </div>

View File

@@ -6,21 +6,71 @@
<h1>Create Ingest Profile</h1> <h1>Create Ingest Profile</h1>
<form method="post"> <form method="post">
<label>Chunk Size:</label> <table class="table table-sm table-dark align-middle">
<input type="number" name="chunk_size" required><br> <tbody>
<tr>
<label>Chunk Overlap:</label> <th scope="row" class="w-25">Chunk Size (500-2500)</th>
<input type="number" name="chunk_overlap" required><br> <td>
<label>
<label>Embedding Model:</label> <select name="chunk_size" class="form-select">
<input type="text" name="embedding_model" required><br> {% for i in range(250, 2500, 50) %}
<option value="{{ i }}" {{ selectedValue is defined and selectedValue == i ? 'selected' : '' }}>
<label>Embedding Dimension:</label> {{ i }}
<input type="number" name="embedding_dimension" required><br> </option>
{% endfor %}
<label>Scoring Version:</label> </select>
<input type="number" name="scoring_version" required><br> </label>
</td>
<button type="submit">Create</button> </tr>
<tr>
<th scope="row">Chunk Overlap (50-150)</th>
<td>
<label>
<select name="chunk_overlap" class="form-select">
{% for i in range(50, 150, 25) %}
<option value="{{ i }}" {{ selectedValue is defined and selectedValue == i ? 'selected' : '' }}>
{{ i }}
</option>
{% endfor %}
</select>
</label>
</td>
</tr>
<tr>
<th scope="row">Embedding Model (default)</th>
<td>
<label>
<select name="embedding_model" class="form-control" required>
<option value="all-MiniLM-L6-v2">all-MiniLM-L6-v2</option>
</select>
</label>
</td>
</tr>
<tr>
    <th scope="row">Embedding Dimension (default)</th>
    <td>
        <label>
            {# all-MiniLM-L6-v2, the only model offered in this form, produces 384-dimensional vectors. NOTE(review): confirm against the embedding backend before deploying. #}
            <select name="embedding_dimension" class="form-control" required>
                <option value="384">384</option>
            </select>
        </label>
    </td>
</tr>
<tr>
<th scope="row">Scoring Version (default)</th>
<td>
<label>
<input type="number" name="scoring_version" class="form-control" value="1" placeholder="1" readonly required>
</label>
</td>
</tr>
<tr>
<td colspan="2" class="text-start">
<button type="submit" class="btn btn-primary">Create</button>
</td>
</tr>
</tbody>
</table>
</form> </form>
{% endblock %} {% endblock %}

View File

@@ -44,10 +44,15 @@
<td>{{ p.reindexRequired ? 'Yes' : 'No' }}</td> <td>{{ p.reindexRequired ? 'Yes' : 'No' }}</td>
<td> <td>
{% if not p.active %} {% if not p.active %}
<a class="btn btn-outline-info btn-sm" href="{{ path('admin_ingest_profile_activate', {id: p.id}) }}"> <a class="btn btn-outline-success btn-sm" href="{{ path('admin_ingest_profile_activate', {id: p.id}) }}">
Aktivieren Aktivieren
</a> </a>
{% endif %} {% endif %}
{% if not p.active %}
<a class="btn btn-outline-danger btn-sm" href="{{ path('admin_ingest_profile_remove', {id: p.id}) }}">
Löschen
</a>
{% endif %}
</td> </td>
</tr> </tr>
{% endfor %} {% endfor %}