225 lines
6.0 KiB
PHP
225 lines
6.0 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Tag;
|
|
|
|
use App\Entity\Document;
|
|
use Doctrine\DBAL\ArrayParameterType;
|
|
use Doctrine\DBAL\Exception;
|
|
use Doctrine\ORM\EntityManagerInterface;
|
|
use Symfony\Component\Uid\Uuid;
|
|
|
|
/**
 * Maps a free-text query to a ranked list of active document ids by way of tags.
 *
 * The query is sent to the tag vector search client, the returned hits are
 * filtered by score and tag type, and the documents linked to the surviving
 * tags are scored and ordered.
 */
final class TagRoutingService
{
    /**
     * @param TagVectorSearchClient  $tagSearch                Client queried for tag hits matching the query.
     * @param EntityManagerInterface $em                       Provides the DBAL connection used to resolve tags to documents.
     * @param int                    $defaultTopK              Number of raw tag hits requested from the vector service.
     * @param float                  $minBestScore             Score the first hit must reach for routing to happen at all;
     *                                                         also the absolute floor for every accepted hit.
     * @param float                  $maxScoreDropFromBest     Largest allowed gap between the first hit's score and an accepted hit's score.
     * @param int                    $maxRoutingTags           Maximum number of tag hits kept after filtering.
     * @param int                    $maxCandidateDocs         Maximum number of document ids returned by route().
     * @param float                  $multiTagBonusPerExtraTag Score bonus added for each matched tag beyond the first.
     * @param float                  $maxMultiTagBonus         Upper bound of the total multi-tag bonus per document.
     */
    public function __construct(
        private readonly TagVectorSearchClient $tagSearch,
        private readonly EntityManagerInterface $em,
        private readonly int $defaultTopK = 8,
        private readonly float $minBestScore = 0.72,
        private readonly float $maxScoreDropFromBest = 0.08,
        private readonly int $maxRoutingTags = 5,
        private readonly int $maxCandidateDocs = 80,
        private readonly float $multiTagBonusPerExtraTag = 0.05,
        private readonly float $maxMultiTagBonus = 0.15,
    ) {
    }

    /**
     * Returns ordered active document ids for tag-scoped retrieval.
     *
     * The method intentionally returns only document ids so the current
     * retriever pipeline can stay unchanged. Ids are ordered by descending
     * aggregated score and truncated to $maxCandidateDocs entries.
     *
     * Null is returned when the trimmed query is empty, when no hit survives
     * filtering, when no hit carries a valid UUID, or when no active document
     * is linked to the selected tags.
     *
     * @return list<string>|null
     *
     * @throws Exception
     */
    public function route(string $query): ?array
    {
        $query = trim($query);

        if ($query === '') {
            return null;
        }

        $hits = $this->filterRoutingHits(
            $this->tagSearch->search($query, $this->defaultTopK)
        );

        if ($hits === []) {
            return null;
        }

        $tagBinaryIds = [];
        $tagMetaById = [];

        foreach ($hits as $hit) {
            $rawTagId = (string) ($hit['tag_id'] ?? '');

            if ($rawTagId === '') {
                continue;
            }

            try {
                $tagUuid = Uuid::fromString($rawTagId);
                $tagBinaryId = $tagUuid->toBinary();
            } catch (\Throwable) {
                // Best effort: a hit whose id is not a valid UUID cannot be
                // matched against the database, so it is skipped.
                continue;
            }

            // Key by the canonical RFC 4122 form. Rows coming back from the
            // database are converted with the same method, so the lookup in
            // scoreDocuments() matches even when the search service returned
            // the id in upper case or in another encoding accepted by
            // Uuid::fromString().
            $tagId = $tagUuid->toRfc4122();

            // Keyed by tag id so the same tag is sent to the query only once.
            $tagBinaryIds[$tagId] = $tagBinaryId;
            $tagMetaById[$tagId] = [
                'score' => (float) $hit['score'],
                'weight' => $this->resolveTypeWeight((string) $hit['tag_type']),
            ];
        }

        if ($tagBinaryIds === []) {
            return null;
        }

        $rows = $this->fetchActiveDocumentTagRows(array_values($tagBinaryIds));

        if ($rows === []) {
            return null;
        }

        $documentScores = $this->scoreDocuments($rows, $tagMetaById);

        if ($documentScores === []) {
            return null;
        }

        arsort($documentScores, SORT_NUMERIC);

        return array_slice(
            array_keys($documentScores),
            0,
            $this->maxCandidateDocs
        );
    }

    /**
     * Loads the (document_id, tag_id) pairs of active documents carrying any of the given tags.
     *
     * @param list<string> $tagBinaryIds Tag ids in 16-byte binary form.
     *
     * @return list<array<string, mixed>>
     *
     * @throws Exception
     */
    private function fetchActiveDocumentTagRows(array $tagBinaryIds): array
    {
        return $this->em->getConnection()->executeQuery(
            'SELECT dt.document_id, dt.tag_id
            FROM document_tag dt
            INNER JOIN document d ON d.id = dt.document_id
            WHERE dt.tag_id IN (:tagIds)
            AND d.status = :status',
            [
                'tagIds' => $tagBinaryIds,
                'status' => Document::STATUS_ACTIVE,
            ],
            [
                'tagIds' => ArrayParameterType::BINARY,
            ]
        )->fetchAllAssociative();
    }

    /**
     * Aggregates a score per document from its matched tags.
     *
     * Each distinct matched tag contributes score * weight. Documents matched
     * by more than one tag receive an additional capped bonus.
     *
     * @param list<array<string, mixed>>                        $rows        Rows with binary document_id and tag_id columns.
     * @param array<string, array{score: float, weight: float}> $tagMetaById Tag metadata keyed by RFC 4122 tag id.
     *
     * @return array<string, float> Scores keyed by RFC 4122 document id, in first-seen order.
     */
    private function scoreDocuments(array $rows, array $tagMetaById): array
    {
        $documentScores = [];
        $documentMatchedTags = [];

        foreach ($rows as $row) {
            if (!isset($row['document_id'], $row['tag_id'])) {
                continue;
            }

            try {
                $documentId = Uuid::fromBinary($row['document_id'])->toRfc4122();
                $tagId = Uuid::fromBinary($row['tag_id'])->toRfc4122();
            } catch (\Throwable) {
                // Best effort: rows whose ids cannot be decoded are ignored.
                continue;
            }

            if (!isset($tagMetaById[$tagId])) {
                continue;
            }

            // Count every (document, tag) pair once so that a repeated row
            // cannot inflate the score; the bonus below is set-based as well.
            if (isset($documentMatchedTags[$documentId][$tagId])) {
                continue;
            }

            $documentMatchedTags[$documentId][$tagId] = true;
            $documentScores[$documentId] = ($documentScores[$documentId] ?? 0.0)
                + ($tagMetaById[$tagId]['score'] * $tagMetaById[$tagId]['weight']);
        }

        foreach ($documentMatchedTags as $documentId => $matchedTags) {
            $extraTagCount = count($matchedTags) - 1;

            if ($extraTagCount > 0) {
                $documentScores[$documentId] += min(
                    $this->maxMultiTagBonus,
                    $extraTagCount * $this->multiTagBonusPerExtraTag
                );
            }
        }

        return $documentScores;
    }

    /**
     * Keeps the tag hits that are allowed to define the document scope.
     *
     * The score of the first hit is treated as the best score, so the hits are
     * expected to arrive ordered by descending score. Nothing is kept when that
     * score is below $minBestScore. Otherwise a hit is accepted when it has a
     * non-empty id, its score is at least
     * max($minBestScore, best - $maxScoreDropFromBest), and it is not a sales
     * signal. At most $maxRoutingTags hits are returned, in input order.
     *
     * @param array<int, array{
     *     tag_id:string,
     *     score:float,
     *     label?:string,
     *     tag_type?:string
     * }> $hits
     *
     * @return list<array{
     *     tag_id:string,
     *     score:float,
     *     tag_type:string
     * }>
     */
    private function filterRoutingHits(array $hits): array
    {
        if ($hits === []) {
            return [];
        }

        $bestScore = (float) ($hits[0]['score'] ?? 0.0);

        if ($bestScore < $this->minBestScore) {
            return [];
        }

        $minimumAcceptedScore = max(
            $this->minBestScore,
            $bestScore - $this->maxScoreDropFromBest
        );

        $filtered = [];

        foreach ($hits as $hit) {
            $tagId = (string) ($hit['tag_id'] ?? '');
            $score = (float) ($hit['score'] ?? 0.0);

            if ($tagId === '' || $score < $minimumAcceptedScore) {
                continue;
            }

            // A missing tag type is treated as generic.
            $tagType = TagTypes::normalize(
                (string) ($hit['tag_type'] ?? TagTypes::GENERIC)
            );

            // Sales signals may still be useful elsewhere, but they should not
            // expand the document scope for semantic retrieval.
            if ($tagType === TagTypes::SALES_SIGNAL) {
                continue;
            }

            $filtered[] = [
                'tag_id' => $tagId,
                'score' => $score,
                'tag_type' => $tagType,
            ];

            if (count($filtered) >= $this->maxRoutingTags) {
                break;
            }
        }

        return $filtered;
    }

    /**
     * Returns the multiplier applied to a hit's score for the given tag type.
     *
     * Catalog entities are boosted, sales signals contribute nothing, and
     * generic or unrecognised types are neutral.
     */
    private function resolveTypeWeight(string $tagType): float
    {
        return match (TagTypes::normalize($tagType)) {
            TagTypes::CATALOG_ENTITY => 1.20,
            TagTypes::GENERIC => 1.00,
            TagTypes::SALES_SIGNAL => 0.00,
            default => 1.00,
        };
    }
}