new system rebuild command
harden IngestService
This commit is contained in:
@@ -10,10 +10,11 @@ use App\Repository\DocumentVersionRepository;
|
|||||||
final class KnowledgeIngestService
|
final class KnowledgeIngestService
|
||||||
{
|
{
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private DocumentLoader $loader,
|
private DocumentLoader $loader,
|
||||||
private SimpleChunker $chunker,
|
private SimpleChunker $chunker,
|
||||||
private DocumentVersionRepository $versionRepo,
|
private DocumentVersionRepository $versionRepo,
|
||||||
) {
|
)
|
||||||
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -31,17 +32,16 @@ final class KnowledgeIngestService
|
|||||||
$doc = $version->getDocument();
|
$doc = $version->getDocument();
|
||||||
|
|
||||||
$documentId = $doc->getId()->toRfc4122();
|
$documentId = $doc->getId()->toRfc4122();
|
||||||
$versionId = $version->getId()->toRfc4122();
|
$versionId = $version->getId()->toRfc4122();
|
||||||
|
|
||||||
$title = trim((string) $doc->getTitle());
|
$title = trim((string)$doc->getTitle());
|
||||||
|
|
||||||
$index = 0;
|
$index = 0;
|
||||||
|
|
||||||
foreach ($chunks as $chunkText) {
|
foreach ($chunks as $chunkText) {
|
||||||
|
|
||||||
// Titel optional weiterhin prefixen (wenn du das behalten willst)
|
|
||||||
if ($title !== '' && !str_starts_with($chunkText, $title)) {
|
if ($title !== '' && !str_starts_with($chunkText, $title)) {
|
||||||
$chunkText = $title . "\n\n" . $chunkText;
|
$chunkText = "# Produkt Titel: " . $title . "\n\n --- " . $chunkText;
|
||||||
}
|
}
|
||||||
|
|
||||||
$chunkText = trim($chunkText);
|
$chunkText = trim($chunkText);
|
||||||
@@ -56,13 +56,13 @@ final class KnowledgeIngestService
|
|||||||
);
|
);
|
||||||
|
|
||||||
yield [
|
yield [
|
||||||
'chunk_id' => $chunkId,
|
'chunk_id' => $chunkId,
|
||||||
'document_id' => $documentId,
|
'document_id' => $documentId,
|
||||||
'version_id' => $versionId,
|
'version_id' => $versionId,
|
||||||
'chunk_index' => $index++,
|
'chunk_index' => $index++,
|
||||||
'text' => $chunkText,
|
'text' => $chunkText,
|
||||||
'checksum' => sha1($chunkText),
|
'checksum' => sha1($chunkText),
|
||||||
'metadata' => $this->buildMetadata($version),
|
'metadata' => $this->buildMetadata($version),
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -114,7 +114,7 @@ final class KnowledgeIngestService
|
|||||||
'version_number' => method_exists($version, 'getVersionNumber')
|
'version_number' => method_exists($version, 'getVersionNumber')
|
||||||
? $version->getVersionNumber()
|
? $version->getVersionNumber()
|
||||||
: null,
|
: null,
|
||||||
'file_path' => $version->getFilePath(),
|
'file_path' => $version->getFilePath(),
|
||||||
], static fn($v) => $v !== null && $v !== '');
|
], static fn($v) => $v !== null && $v !== '');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -14,7 +14,7 @@ use App\Vector\VectorSearchClient;
|
|||||||
|
|
||||||
final class NdjsonHybridRetriever implements RetrieverInterface
|
final class NdjsonHybridRetriever implements RetrieverInterface
|
||||||
{
|
{
|
||||||
private const VECTOR_SCORE_THRESHOLD = 0.4;
|
private const VECTOR_SCORE_THRESHOLD = 0.75;
|
||||||
|
|
||||||
private const HARD_MAX_CHUNKS = 200;
|
private const HARD_MAX_CHUNKS = 200;
|
||||||
private const HARD_MAX_VECTORK = 200;
|
private const HARD_MAX_VECTORK = 200;
|
||||||
@@ -23,7 +23,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
* Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter).
|
* Tags dürfen nur ein kleiner Bonus sein (kein Gate/Filter).
|
||||||
* Enterprise Default: klein halten, sonst dominieren Tags wieder.
|
* Enterprise Default: klein halten, sonst dominieren Tags wieder.
|
||||||
*/
|
*/
|
||||||
private const TAG_SCORE_BONUS = 0.08;
|
private const TAG_SCORE_BONUS = 0.25;
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly NdjsonChunkLookup $lookup,
|
private readonly NdjsonChunkLookup $lookup,
|
||||||
|
|||||||
Reference in New Issue
Block a user