first commit
This commit is contained in:
@@ -4,6 +4,7 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Tag;
|
||||
|
||||
use App\Entity\Document;
|
||||
use Doctrine\DBAL\ArrayParameterType;
|
||||
use Doctrine\DBAL\Exception;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
@@ -11,91 +12,239 @@ use Symfony\Component\Uid\Uuid;
|
||||
|
||||
final class TagRoutingService
|
||||
{
|
||||
/**
 * Number of raw tag hits requested from the vector service.
 */
private const DEFAULT_TOPK = 8;

/**
 * Hard minimum confidence required to activate tag-based document routing.
 *
 * This intentionally aligns with the tag vector client gate to avoid
 * misleading secondary thresholds in this class.
 */
private const MIN_BEST_SCORE = 0.72;

/**
 * Only keep tag hits that stay reasonably close to the best hit.
 * This reduces semantic spillover into weakly related document spaces.
 */
private const MAX_SCORE_DROP_FROM_BEST = 0.08;

/**
 * Maximum number of tag hits that may influence routing.
 */
private const MAX_ROUTING_TAGS = 5;

/**
 * Maximum number of candidate documents passed into scoped chunk search.
 */
private const MAX_CANDIDATE_DOCS = 80;

/**
 * Small bonus for documents matched by multiple routed tags.
 * Each tag beyond the first adds this amount to the document score.
 */
private const MULTI_TAG_BONUS_PER_EXTRA_TAG = 0.05;

/**
 * Upper bound on the accumulated multi-tag bonus for a single document.
 */
private const MAX_MULTI_TAG_BONUS = 0.15;
|
||||
|
||||
/**
 * @param TagVectorSearchClient  $tagSearch Client queried for the tag hits of a search query.
 * @param EntityManagerInterface $em        Provides the DBAL connection used for the document/tag lookup.
 */
public function __construct(
    private readonly TagVectorSearchClient $tagSearch,
    private readonly EntityManagerInterface $em,
) {
}
|
||||
|
||||
/**
 * Returns ordered active document ids for tag-scoped retrieval.
 *
 * The method intentionally returns only document ids so the current
 * retriever pipeline can stay unchanged.
 *
 * Steps:
 *  1. Ask the tag vector service for hits and keep only confident ones.
 *  2. Resolve the routed tags to active documents via `document_tag`.
 *  3. Score each document as the sum of (tag score * tag type weight) plus
 *     a capped bonus for every additional matching tag.
 *  4. Return the best-scoring ids, capped at MAX_CANDIDATE_DOCS.
 *
 * @param string $query Raw search query; surrounding whitespace is ignored.
 *
 * @return list<string>|null Document ids, highest score first, or null when
 *                           tag routing does not apply (blank query, no
 *                           confident tag hit, no matching active document).
 *
 * @throws Exception When the document lookup query fails.
 */
public function route(string $query): ?array
{
    $query = trim($query);

    if ($query === '') {
        return null;
    }

    $hits = $this->filterRoutingHits(
        $this->tagSearch->search($query, self::DEFAULT_TOPK)
    );

    if ($hits === []) {
        return null;
    }

    // Both maps are keyed by the canonical RFC 4122 form of the tag id.
    // The database rows below are converted with Uuid::fromBinary(), which
    // always yields that canonical form, so keying by the raw hit id would
    // silently drop every row whenever the vector service reports the id in
    // another casing or encoding.
    // NOTE(review): confirm which id formats the vector service emits.
    $tagBinaryIds = [];
    $tagMetaById = [];

    foreach ($hits as $hit) {
        $rawTagId = (string) ($hit['tag_id'] ?? '');

        if ($rawTagId === '') {
            continue;
        }

        try {
            $tagUuid = Uuid::fromString($rawTagId);
        } catch (\Throwable) {
            // Best effort: a malformed id must not break routing for the
            // remaining hits.
            continue;
        }

        $tagId = $tagUuid->toRfc4122();

        // Keyed assignment also collapses duplicate tag ids, so the IN (...)
        // list below never contains the same tag twice.
        $tagBinaryIds[$tagId] = $tagUuid->toBinary();
        $tagMetaById[$tagId] = [
            'score' => (float) $hit['score'],
            'weight' => $this->resolveTypeWeight((string) $hit['tag_type']),
        ];
    }

    if ($tagBinaryIds === []) {
        return null;
    }

    // Direct DBAL query (binary-safe); only active documents are routed.
    $rows = $this->em->getConnection()->executeQuery(
        'SELECT dt.document_id, dt.tag_id
         FROM document_tag dt
         INNER JOIN document d ON d.id = dt.document_id
         WHERE dt.tag_id IN (:tagIds)
           AND d.status = :status',
        [
            'tagIds' => array_values($tagBinaryIds),
            'status' => Document::STATUS_ACTIVE,
        ],
        [
            'tagIds' => ArrayParameterType::BINARY,
        ]
    )->fetchAllAssociative();

    if ($rows === []) {
        return null;
    }

    $documentScores = [];
    $documentMatchedTags = [];

    foreach ($rows as $row) {
        if (!isset($row['document_id'], $row['tag_id'])) {
            continue;
        }

        try {
            $documentId = Uuid::fromBinary($row['document_id'])->toRfc4122();
            $tagId = Uuid::fromBinary($row['tag_id'])->toRfc4122();
        } catch (\Throwable) {
            // Best effort: skip rows whose ids cannot be decoded.
            continue;
        }

        if (!isset($tagMetaById[$tagId])) {
            continue;
        }

        $documentScores[$documentId] = ($documentScores[$documentId] ?? 0.0)
            + ($tagMetaById[$tagId]['score'] * $tagMetaById[$tagId]['weight']);

        $documentMatchedTags[$documentId][$tagId] = true;
    }

    if ($documentScores === []) {
        return null;
    }

    foreach ($documentMatchedTags as $documentId => $matchedTags) {
        $matchedTagCount = count($matchedTags);

        if ($matchedTagCount > 1) {
            $documentScores[$documentId] += min(
                self::MAX_MULTI_TAG_BONUS,
                ($matchedTagCount - 1) * self::MULTI_TAG_BONUS_PER_EXTRA_TAG
            );
        }
    }

    // Highest score first. The query has no ORDER BY, so equal scores are
    // broken by document id to keep the cut-off below reproducible.
    uksort(
        $documentScores,
        static fn (string $left, string $right): int
            => ($documentScores[$right] <=> $documentScores[$left]) ?: strcmp($left, $right)
    );

    return array_slice(
        array_keys($documentScores),
        0,
        self::MAX_CANDIDATE_DOCS
    );
}
|
||||
|
||||
/**
 * Reduces raw tag hits to the ones allowed to influence document routing.
 *
 * A hit is kept when it has a non-empty tag id, its score is at least
 * max(MIN_BEST_SCORE, best score - MAX_SCORE_DROP_FROM_BEST), and its
 * normalized type is not a sales signal. At most MAX_ROUTING_TAGS hits are
 * returned, in input order. An empty list is returned when there are no
 * hits or the best hit is below MIN_BEST_SCORE.
 *
 * @param array<int, array{
 *     tag_id:string,
 *     score:float,
 *     label?:string,
 *     tag_type?:string
 * }> $hits
 *
 * @return list<array{
 *     tag_id:string,
 *     score:float,
 *     tag_type:string
 * }>
 */
private function filterRoutingHits(array $hits): array
{
    if ($hits === []) {
        return [];
    }

    // NOTE(review): index 0 is treated as the best hit, which assumes the
    // vector client returns hits best-first — confirm against the client.
    $bestScore = (float) ($hits[0]['score'] ?? 0.0);

    if ($bestScore < self::MIN_BEST_SCORE) {
        return [];
    }

    $minimumAcceptedScore = max(
        self::MIN_BEST_SCORE,
        $bestScore - self::MAX_SCORE_DROP_FROM_BEST
    );

    $filtered = [];

    foreach ($hits as $hit) {
        $tagId = (string) ($hit['tag_id'] ?? '');
        $score = (float) ($hit['score'] ?? 0.0);
        $tagType = TagTypes::normalize(
            (string) ($hit['tag_type'] ?? TagTypes::GENERIC)
        );

        if ($tagId === '' || $score < $minimumAcceptedScore) {
            continue;
        }

        // Sales signals may still be useful elsewhere, but they should not
        // expand the document scope for semantic retrieval.
        if ($tagType === TagTypes::SALES_SIGNAL) {
            continue;
        }

        $filtered[] = [
            'tag_id' => $tagId,
            'score' => $score,
            'tag_type' => $tagType,
        ];

        if (count($filtered) >= self::MAX_ROUTING_TAGS) {
            break;
        }
    }

    return $filtered;
}
|
||||
|
||||
/**
 * Maps a tag type to the multiplier applied to that tag's score.
 *
 * The type is normalized first; catalog entities are boosted, sales
 * signals contribute nothing, and every other type is neutral.
 */
private function resolveTypeWeight(string $tagType): float
{
    $normalizedType = TagTypes::normalize($tagType);

    if ($normalizedType === TagTypes::CATALOG_ENTITY) {
        return 1.20;
    }

    // Listed explicitly so the neutral baseline stays visible.
    if ($normalizedType === TagTypes::GENERIC) {
        return 1.00;
    }

    if ($normalizedType === TagTypes::SALES_SIGNAL) {
        return 0.00;
    }

    // Any type without a dedicated weight is treated as neutral.
    return 1.00;
}
|
||||
}
|
||||
Reference in New Issue
Block a user