lexical logic
This commit is contained in:
298
src/Command/TestHybridRetrievalCommand.php
Normal file
298
src/Command/TestHybridRetrievalCommand.php
Normal file
@@ -0,0 +1,298 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use App\Knowledge\Retrieval\NdjsonHybridRetriever;
|
||||
use Symfony\Component\Console\Attribute\AsCommand;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Input\InputArgument;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use Symfony\Component\Console\Style\SymfonyStyle;
|
||||
|
||||
#[AsCommand(
|
||||
name: 'mto:agent:retrieval:test',
|
||||
description: 'Test the real hybrid retrieval path with debug output'
|
||||
)]
|
||||
final class TestHybridRetrievalCommand extends Command
|
||||
{
|
||||
/**
 * @param NdjsonHybridRetriever $retriever Retriever whose retrieveDebug() is invoked by execute().
 */
public function __construct(
    private readonly NdjsonHybridRetriever $retriever,
) {
    // No explicit name is passed; the command name comes from the #[AsCommand] attribute.
    parent::__construct();
}
|
||||
|
||||
/**
 * Declares the command's input: one required "prompt" argument and the
 * value-less "--json" and "--show-text" flags.
 */
protected function configure(): void
{
    // The text that is handed to the retriever.
    $this->addArgument(
        'prompt',
        InputArgument::REQUIRED,
        'Prompt to test against the real hybrid retrieval pipeline'
    );

    // Switches the output from the sectioned report to a JSON document.
    $this->addOption(
        'json',
        null,
        InputOption::VALUE_NONE,
        'Return the raw retrieval debug result as JSON'
    );

    // Disables the preview shortening of chunk text in the report.
    $this->addOption(
        'show-text',
        null,
        InputOption::VALUE_NONE,
        'Show full chunk text instead of a shortened preview'
    );
}
|
||||
|
||||
/**
 * Runs the prompt through the retriever's debug path and prints the outcome.
 *
 * With --json the prompt, timing, result count and raw rows are emitted as one
 * pretty-printed JSON document. Otherwise a sectioned report is rendered: the
 * summary sections read their values from the first result row, and every row
 * is then listed with its scores and its (optionally shortened) text.
 *
 * @return int Command::FAILURE when the prompt is empty, the retriever throws
 *             or the JSON payload cannot be encoded; Command::SUCCESS otherwise,
 *             including when no results come back.
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
    $io = new SymfonyStyle($input, $output);

    $prompt = trim((string) $input->getArgument('prompt'));
    $asJson = (bool) $input->getOption('json');
    $showText = (bool) $input->getOption('show-text');

    if ($prompt === '') {
        $io->error('Prompt must not be empty.');

        return Command::FAILURE;
    }

    $start = microtime(true);

    try {
        $results = $this->retriever->retrieveDebug($prompt);
    } catch (\Throwable $e) {
        // Outermost boundary of a diagnostic command: report the failure and exit non-zero.
        $io->error($e->getMessage());

        return Command::FAILURE;
    }

    $durationMs = round((microtime(true) - $start) * 1000, 2);

    if ($asJson) {
        $json = json_encode(
            [
                'prompt' => $prompt,
                'duration_ms' => $durationMs,
                'result_count' => count($results),
                'results' => $results,
            ],
            JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
        );

        if (!is_string($json)) {
            $io->error(sprintf('json_encode failed: %s', json_last_error_msg()));

            return Command::FAILURE;
        }

        // Raw write: the payload may contain retrieved text with console style
        // tags (e.g. "<info>"), which the output formatter would otherwise consume.
        $output->writeln($json, OutputInterface::OUTPUT_RAW);

        return Command::SUCCESS;
    }

    $io->title('Hybrid Retrieval Test');
    $io->definitionList(
        ['prompt' => $prompt],
        ['duration_ms' => (string) $durationMs],
        ['result_count' => (string) count($results)]
    );

    if ($results === []) {
        $io->warning('No retrieval results returned.');

        return Command::SUCCESS;
    }

    // The summary sections below are read from the first row only.
    // array_key_first() avoids assuming the result array is keyed from 0.
    // NOTE(review): assumes every result is an array<string, mixed> row — confirm against retrieveDebug().
    $first = $results[array_key_first($results)];

    $io->section('Pipeline Summary');
    $io->definitionList(
        ...$this->definitionRows(
            $first,
            ['scope_mode', 'selection_mode', 'intent', 'route', 'entity_label'],
            $this->stringValue(...),
        ),
        ...$this->definitionRows($first, ['is_list_query'], $this->boolishValue(...)),
        ...$this->definitionRows(
            $first,
            ['clean_query', 'semantic_query', 'secondary_vector_query', 'lexical_query'],
            $this->stringValue(...),
        ),
        ...$this->definitionRows($first, ['threshold', 'lexical_threshold'], $this->scalarValue(...)),
    );

    $io->section('Scope Candidates');
    $io->definitionList(
        ...$this->definitionRows(
            $first,
            ['tag_candidate_doc_ids', 'soft_document_candidate_doc_ids', 'pseudo_scope_doc_ids'],
            $this->jsonValue(...),
        ),
        ...$this->definitionRows($first, ['title_metadata_doc_boosts'], $this->jsonObjectValue(...)),
    );

    $io->section('Hit Counts');
    $io->definitionList(
        ...$this->definitionRows(
            $first,
            [
                'global_hit_count',
                'scoped_hit_count',
                'global_vector_hit_count',
                'global_primary_vector_hit_count',
                'global_secondary_vector_hit_count',
                'global_keyword_hit_count',
                'scoped_vector_hit_count',
                'scoped_primary_vector_hit_count',
                'scoped_secondary_vector_hit_count',
                'scoped_keyword_hit_count',
            ],
            $this->scalarValue(...),
        ),
    );

    $io->section('Boosts');
    $io->definitionList(
        ...$this->definitionRows(
            $first,
            [
                'scoped_boost_factor',
                'scoped_vector_boost_factor',
                'secondary_scoped_vector_boost_factor',
                'scoped_keyword_boost_factor',
            ],
            $this->scalarValue(...),
        ),
    );

    $io->section('Selected Chunks');

    // Empty score/id fields are shown as "-" so the header line keeps its shape.
    $orDash = static fn (string $value): string => $value !== '' ? $value : '-';

    foreach ($results as $row) {
        // Only stringable values are cast; an array or plain object under "text"
        // would otherwise raise a conversion warning or an Error.
        $rawText = $row['text'] ?? '';
        $text = (is_scalar($rawText) || $rawText instanceof \Stringable) ? (string) $rawText : '';

        if (!$showText) {
            $text = $this->shortenText($text, 500);
        }

        $io->writeln(sprintf(
            '<info>#%s</info> chunk=%s doc=%s idx=%s rrf=%s vector=%s keyword=%s title_meta=%s',
            $this->scalarValue($row, 'rank'),
            $this->stringValue($row, 'chunk_id'),
            $orDash($this->stringValue($row, 'document_id')),
            $orDash($this->scalarValue($row, 'chunk_index')),
            $orDash($this->scalarValue($row, 'rrf_score')),
            $orDash($this->scalarValue($row, 'raw_vector_score')),
            $orDash($this->scalarValue($row, 'raw_keyword_score')),
            $orDash($this->scalarValue($row, 'title_metadata_boost'))
        ));

        // Chunk text is arbitrary content: write it raw so style tags inside it
        // are printed literally instead of being interpreted by the formatter.
        $io->writeln($text, OutputInterface::OUTPUT_RAW);
        $io->writeln('');
    }

    return Command::SUCCESS;
}

/**
 * Builds the single-pair rows expected by SymfonyStyle::definitionList().
 *
 * @param array<string, mixed> $row    Row the values are read from.
 * @param list<string>         $keys   Keys to render, in display order.
 * @param \Closure(array<string, mixed>, string): string $format Formatter applied to each key.
 *
 * @return list<array<string, string>> One [key => formatted value] entry per key.
 */
private function definitionRows(array $row, array $keys, \Closure $format): array
{
    return array_map(
        static fn (string $key): array => [$key => $format($row, $key)],
        $keys,
    );
}
|
||||
|
||||
/**
 * Reads $row[$key] and returns it as a trimmed string.
 *
 * Missing or null entries yield ''. Values that cannot be cast to a string
 * safely (arrays, objects without __toString) also yield '' instead of
 * triggering an "Array to string conversion" warning or a conversion Error.
 *
 * @param array<string, mixed> $row
 */
private function stringValue(array $row, string $key): string
{
    $value = $row[$key] ?? null;

    if (is_scalar($value) || $value instanceof \Stringable) {
        return trim((string) $value);
    }

    return '';
}
|
||||
|
||||
/**
 * Renders $row[$key] as display text.
 *
 * Booleans become the words 'true' / 'false', any other scalar is cast to
 * string, and everything else (missing, null, arrays, objects) becomes ''.
 *
 * @param array<string, mixed> $row
 */
private function scalarValue(array $row, string $key): string
{
    $candidate = $row[$key] ?? null;

    return match (true) {
        // Checked before is_scalar() so booleans are spelled out rather than cast to '1' / ''.
        is_bool($candidate) => $candidate ? 'true' : 'false',
        is_scalar($candidate) => (string) $candidate,
        default => '',
    };
}
|
||||
|
||||
/**
 * Renders a flag-like entry of $row as text.
 *
 * Real booleans are printed as 'true' / 'false'; other scalars are cast to
 * string unchanged; missing, null and non-scalar entries give ''.
 *
 * @param array<string, mixed> $row
 */
private function boolishValue(array $row, string $key): string
{
    $flag = $row[$key] ?? null;

    // null, arrays and objects are all non-scalar and have no textual form here.
    if (!is_scalar($flag)) {
        return '';
    }

    return is_bool($flag)
        ? ($flag ? 'true' : 'false')
        : (string) $flag;
}
|
||||
|
||||
/**
 * Encodes the array stored under $key as a compact JSON array.
 *
 * Original keys are discarded (the values are re-indexed before encoding).
 * A missing, null or non-array entry, or an encoding failure, yields '[]'.
 *
 * @param array<string, mixed> $row
 */
private function jsonValue(array $row, string $key): string
{
    $items = $row[$key] ?? null;

    if (!is_array($items)) {
        return '[]';
    }

    $encoded = json_encode(
        array_values($items),
        JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
    );

    // json_encode() returns false when the data cannot be encoded.
    return $encoded === false ? '[]' : $encoded;
}
|
||||
|
||||
/**
 * Encodes the array stored under $key as a compact JSON object.
 *
 * The top level is always rendered as an object: an empty array gives '{}'
 * (the same as a missing entry) and a list-shaped array gives
 * '{"0":…,"1":…}' rather than switching to JSON array notation. Nested
 * arrays keep json_encode()'s normal list/object detection.
 * A missing, null or non-array entry, or an encoding failure, yields '{}'.
 *
 * @param array<string, mixed> $row
 */
private function jsonObjectValue(array $row, string $key): string
{
    $value = $row[$key] ?? null;

    if (!is_array($value)) {
        return '{}';
    }

    // Casting to object forces object notation for the top level only.
    $json = json_encode(
        (object) $value,
        JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
    );

    return is_string($json) ? $json : '{}';
}
|
||||
|
||||
/**
 * Produces a single-line preview of $text.
 *
 * Every run of whitespace is collapsed to one space and the result is
 * trimmed. Text longer than $maxLength characters is cut to $maxLength
 * characters and suffixed with ' …'.
 *
 * @param string $text      Text to preview.
 * @param int    $maxLength Maximum number of characters kept before the suffix.
 */
private function shortenText(string $text, int $maxLength): string
{
    // preg_replace() returns null on failure; fall back to the untouched input then.
    $collapsed = preg_replace('/\s+/u', ' ', $text);
    $preview = trim($collapsed ?? $text);

    return mb_strlen($preview, 'UTF-8') > $maxLength
        ? mb_substr($preview, 0, $maxLength, 'UTF-8') . ' …'
        : $preview;
}
|
||||
}
|
||||
Reference in New Issue
Block a user