Files
MtoRagSystem/src/Tag/TagRoutingService.php
2026-04-24 13:13:56 +02:00

225 lines
6.0 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Tag;
use App\Entity\Document;
use Doctrine\DBAL\ArrayParameterType;
use Doctrine\DBAL\Exception;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Component\Uid\Uuid;
/**
 * Routes a free-text query to a ranked list of active document ids by way of
 * tag similarity.
 *
 * The vector service proposes tags for the query. The hits are filtered by
 * score and tag type, mapped to the active documents carrying those tags, and
 * the documents are ranked by the weighted sum of their matched tag scores
 * plus a capped bonus for matching more than one tag.
 */
final class TagRoutingService
{
    /**
     * @param TagVectorSearchClient  $tagSearch                Client queried for tag hits.
     * @param EntityManagerInterface $em                       Used for its DBAL connection only.
     * @param int                    $defaultTopK              Number of raw tag hits requested from the vector service.
     * @param float                  $minBestScore             Score the best hit must reach for routing to happen;
     *                                                         also the absolute floor for every accepted hit.
     * @param float                  $maxScoreDropFromBest     Largest allowed gap between the best hit's score and
     *                                                         the score of any other accepted hit.
     * @param int                    $maxRoutingTags           Maximum number of tag hits kept for routing.
     * @param int                    $maxCandidateDocs         Maximum number of document ids returned.
     * @param float                  $multiTagBonusPerExtraTag Bonus added per matched tag beyond the first.
     * @param float                  $maxMultiTagBonus         Upper bound of the multi-tag bonus.
     */
    public function __construct(
        private readonly TagVectorSearchClient $tagSearch,
        private readonly EntityManagerInterface $em,
        private readonly int $defaultTopK = 8,
        private readonly float $minBestScore = 0.72,
        private readonly float $maxScoreDropFromBest = 0.08,
        private readonly int $maxRoutingTags = 5,
        private readonly int $maxCandidateDocs = 80,
        private readonly float $multiTagBonusPerExtraTag = 0.05,
        private readonly float $maxMultiTagBonus = 0.15,
    ) {
    }

    /**
     * Returns ordered active document ids for tag-scoped retrieval.
     *
     * The method intentionally returns only document ids so the current
     * retriever pipeline can stay unchanged.
     *
     * Ids are ordered by descending routing score; documents with equal
     * scores are ordered by id so the result (and the cut at
     * `$maxCandidateDocs`) is reproducible between calls.
     *
     * @return list<string>|null Document ids in RFC 4122 form, or null when the
     *                           query is blank, no tag hit qualifies, or no
     *                           active document carries a qualifying tag.
     *
     * @throws Exception When the database query fails.
     */
    public function route(string $query): ?array
    {
        $query = trim($query);
        if ($query === '') {
            return null;
        }

        $hits = $this->filterRoutingHits(
            $this->tagSearch->search($query, $this->defaultTopK)
        );
        if ($hits === []) {
            return null;
        }

        $tagBinaryIds = [];
        $tagMetaById = [];
        foreach ($hits as $hit) {
            $rawTagId = (string) ($hit['tag_id'] ?? '');
            if ($rawTagId === '') {
                continue;
            }

            try {
                $tagUuid = Uuid::fromString($rawTagId);
            } catch (\InvalidArgumentException) {
                // Not a parseable UUID: this hit cannot be matched against the database.
                continue;
            }

            // Key by the canonical representation. The database rows below are
            // converted with the same method, so a hit delivered in another
            // accepted notation (upper-case, base32, base58) still matches.
            $tagId = $tagUuid->toRfc4122();
            if (isset($tagMetaById[$tagId])) {
                // Hits arrive best-first, so the first occurrence of a tag
                // already carries its highest score.
                continue;
            }

            $tagBinaryIds[] = $tagUuid->toBinary();
            $tagMetaById[$tagId] = [
                'score' => (float) $hit['score'],
                'weight' => $this->resolveTypeWeight((string) $hit['tag_type']),
            ];
        }
        if ($tagBinaryIds === []) {
            return null;
        }

        $sql = <<<'SQL'
            SELECT dt.document_id, dt.tag_id
            FROM document_tag dt
            INNER JOIN document d ON d.id = dt.document_id
            WHERE dt.tag_id IN (:tagIds)
            AND d.status = :status
            SQL;

        $rows = $this->em->getConnection()->executeQuery(
            $sql,
            [
                'tagIds' => $tagBinaryIds,
                'status' => Document::STATUS_ACTIVE,
            ],
            [
                'tagIds' => ArrayParameterType::BINARY,
            ]
        )->fetchAllAssociative();
        if ($rows === []) {
            return null;
        }

        $documentScores = [];
        $documentMatchedTags = [];
        foreach ($rows as $row) {
            if (!isset($row['document_id'], $row['tag_id'])) {
                continue;
            }

            try {
                $documentId = Uuid::fromBinary($row['document_id'])->toRfc4122();
                $tagId = Uuid::fromBinary($row['tag_id'])->toRfc4122();
            } catch (\InvalidArgumentException | \TypeError) {
                // Column value is not a 16-byte binary UUID string: skip the row.
                continue;
            }

            if (!isset($tagMetaById[$tagId])) {
                continue;
            }

            $documentScores[$documentId] = ($documentScores[$documentId] ?? 0.0)
                + ($tagMetaById[$tagId]['score'] * $tagMetaById[$tagId]['weight']);
            $documentMatchedTags[$documentId][$tagId] = true;
        }
        if ($documentScores === []) {
            return null;
        }

        // Reward documents matched by several distinct tags, up to a fixed cap.
        foreach ($documentMatchedTags as $documentId => $matchedTags) {
            $extraTagCount = count($matchedTags) - 1;
            if ($extraTagCount > 0) {
                $documentScores[$documentId] += min(
                    $this->maxMultiTagBonus,
                    $extraTagCount * $this->multiTagBonusPerExtraTag
                );
            }
        }

        // The query has no ORDER BY, so row order is unspecified. Break score
        // ties on the document id to keep the ranking deterministic.
        $documentIds = array_keys($documentScores);
        usort(
            $documentIds,
            static function (string $left, string $right) use ($documentScores): int {
                $byScore = $documentScores[$right] <=> $documentScores[$left];

                return $byScore !== 0 ? $byScore : strcmp($left, $right);
            }
        );

        return array_slice($documentIds, 0, $this->maxCandidateDocs);
    }

    /**
     * Reduces raw vector-search hits to the tags that may drive routing.
     *
     * Hits are ordered by descending score first, so the threshold logic does
     * not depend on the order in which the client returned them. Routing is
     * abandoned when the best hit is below `$minBestScore`. Otherwise a hit is
     * kept when it has a non-empty tag id, scores at least
     * `max($minBestScore, best - $maxScoreDropFromBest)` and is not a sales
     * signal. At most `$maxRoutingTags` hits are returned, best first.
     *
     * @param array<int, array{
     *     tag_id:string,
     *     score:float,
     *     label?:string,
     *     tag_type?:string
     * }> $hits
     *
     * @return list<array{
     *     tag_id:string,
     *     score:float,
     *     tag_type:string
     * }>
     */
    private function filterRoutingHits(array $hits): array
    {
        if ($hits === []) {
            return [];
        }

        // Stable sort (PHP >= 8.0): equal scores keep the client's relative order.
        usort(
            $hits,
            static fn (array $left, array $right): int
                => (float) ($right['score'] ?? 0.0) <=> (float) ($left['score'] ?? 0.0)
        );

        $bestScore = (float) ($hits[0]['score'] ?? 0.0);
        if ($bestScore < $this->minBestScore) {
            return [];
        }

        $minimumAcceptedScore = max(
            $this->minBestScore,
            $bestScore - $this->maxScoreDropFromBest
        );

        $filtered = [];
        foreach ($hits as $hit) {
            $tagId = (string) ($hit['tag_id'] ?? '');
            $score = (float) ($hit['score'] ?? 0.0);
            if ($tagId === '' || $score < $minimumAcceptedScore) {
                continue;
            }

            // A hit without a tag type is treated as generic.
            $tagType = TagTypes::normalize(
                (string) ($hit['tag_type'] ?? TagTypes::GENERIC)
            );

            // Sales signals may still be useful elsewhere, but they should not
            // expand the document scope for semantic retrieval.
            if ($tagType === TagTypes::SALES_SIGNAL) {
                continue;
            }

            $filtered[] = [
                'tag_id' => $tagId,
                'score' => $score,
                'tag_type' => $tagType,
            ];
            if (count($filtered) >= $this->maxRoutingTags) {
                break;
            }
        }

        return $filtered;
    }

    /**
     * Returns the multiplier applied to a tag's score when it contributes to
     * a document's routing score. Unknown tag types weigh the same as generic
     * ones.
     */
    private function resolveTypeWeight(string $tagType): float
    {
        return match (TagTypes::normalize($tagType)) {
            TagTypes::CATALOG_ENTITY => 1.20,
            TagTypes::GENERIC => 1.00,
            // Sales signals are filtered out before scoring; the zero weight
            // keeps them inert should one ever reach this point.
            TagTypes::SALES_SIGNAL => 0.00,
            default => 1.00,
        };
    }
}