new system rebuild command
harden IngestService
This commit is contained in:
@@ -6,7 +6,6 @@ namespace App\Knowledge\Ingest;
|
||||
|
||||
use App\Entity\DocumentVersion;
|
||||
use App\Repository\DocumentVersionRepository;
|
||||
use Symfony\Component\Uid\Uuid;
|
||||
|
||||
final class KnowledgeIngestService
|
||||
{
|
||||
@@ -18,7 +17,7 @@ final class KnowledgeIngestService
|
||||
}
|
||||
|
||||
/**
|
||||
* Lokaler Ingest: erzeugt NDJSON-Records für genau diese Version.
|
||||
* Lokaler Ingest: erzeugt deterministische NDJSON-Records.
|
||||
*
|
||||
* @return iterable<array<string,mixed>>
|
||||
*/
|
||||
@@ -34,25 +33,34 @@ final class KnowledgeIngestService
|
||||
$documentId = $doc->getId()->toRfc4122();
|
||||
$versionId = $version->getId()->toRfc4122();
|
||||
|
||||
// ✅ Regel: Wenn title gefüllt ist, kommt er in jeden Chunk
|
||||
$title = trim((string) $doc->getTitle());
|
||||
|
||||
$index = 0;
|
||||
|
||||
foreach ($chunks as $chunkText) {
|
||||
|
||||
// ✅ Prefix nur wenn title vorhanden; keine Flags, keine Meta-Schalter
|
||||
// Titel optional weiterhin prefixen (wenn du das behalten willst)
|
||||
if ($title !== '' && !str_starts_with($chunkText, $title)) {
|
||||
$chunkText = $title . "\n\n" . $chunkText;
|
||||
}
|
||||
|
||||
$chunkText = trim($chunkText);
|
||||
|
||||
// 🔥 deterministische Chunk-ID
|
||||
$normalizedForId = $this->normalizeForId($chunkText);
|
||||
|
||||
$chunkId = sha1(
|
||||
$documentId . '|' .
|
||||
$versionId . '|' .
|
||||
$normalizedForId
|
||||
);
|
||||
|
||||
yield [
|
||||
'chunk_id' => Uuid::v4()->toRfc4122(),
|
||||
'chunk_id' => $chunkId,
|
||||
'document_id' => $documentId,
|
||||
'version_id' => $versionId,
|
||||
'chunk_index' => $index++,
|
||||
'text' => $chunkText,
|
||||
// ✅ checksum muss den finalen Text abbilden (inkl. Titel)
|
||||
'checksum' => sha1($chunkText),
|
||||
'metadata' => $this->buildMetadata($version),
|
||||
];
|
||||
@@ -60,10 +68,7 @@ final class KnowledgeIngestService
|
||||
}
|
||||
|
||||
/**
|
||||
* Global Reindex: iteriert streamingfähig über alle aktiven Versionen.
|
||||
* Keine RAM-Explosion, da alles generatorbasiert bleibt.
|
||||
*
|
||||
* @return iterable<array<string,mixed>>
|
||||
* Global Reindex
|
||||
*/
|
||||
public function buildAllActiveChunkRecords(): iterable
|
||||
{
|
||||
@@ -76,8 +81,18 @@ final class KnowledgeIngestService
|
||||
{
|
||||
$text = preg_replace("/\n{3,}/", "\n\n", $text);
|
||||
$text = preg_replace("/[ \t]+$/m", "", $text);
|
||||
return trim($text);
|
||||
}
|
||||
|
||||
return $text;
|
||||
/**
 * Normalizes text for stable chunk-ID computation.
 *
 * The result is lower-cased, has every run of whitespace collapsed to a
 * single space, and is trimmed, so that minimal whitespace or casing
 * differences do not change the derived ID.
 *
 * @param string $text Chunk text to normalize.
 *
 * @return string Normalized text used only as hash input, never for display.
 */
private function normalizeForId(string $text): string
{
    // Pin the encoding so the result does not depend on the configured
    // mbstring internal encoding of the running PHP process.
    $lowered = mb_strtolower($text, 'UTF-8');

    $collapsed = preg_replace('/\s+/u', ' ', $lowered);

    if ($collapsed === null) {
        // preg_replace() returns null on PCRE errors (e.g. malformed UTF-8
        // with the /u modifier). Fall back to byte-wise matching rather than
        // passing null to trim(), which would map every such chunk to the
        // same empty string and therefore to colliding IDs.
        $collapsed = preg_replace('/\s+/', ' ', $lowered) ?? $lowered;
    }

    return trim($collapsed);
}
|
||||
|
||||
/**
|
||||
@@ -87,7 +102,6 @@ final class KnowledgeIngestService
|
||||
{
|
||||
$doc = $version->getDocument();
|
||||
|
||||
// Optional: Titel/Name, falls vorhanden
|
||||
$title = null;
|
||||
if (method_exists($doc, 'getTitle')) {
|
||||
$title = $doc->getTitle();
|
||||
@@ -97,8 +111,10 @@ final class KnowledgeIngestService
|
||||
|
||||
return array_filter([
|
||||
'document_title' => $title,
|
||||
'version_number' => method_exists($version, 'getVersionNumber') ? $version->getVersionNumber() : null,
|
||||
'version_number' => method_exists($version, 'getVersionNumber')
|
||||
? $version->getVersionNumber()
|
||||
: null,
|
||||
'file_path' => $version->getFilePath(),
|
||||
], static fn($v) => $v !== null && $v !== '');
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user