Remove direct chunk search; use only vector search
This commit is contained in:
39
src/Knowledge/QueryCleaner.php
Normal file
39
src/Knowledge/QueryCleaner.php
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Knowledge;
|
||||||
|
|
||||||
|
/**
 * Normalises a free-text query into a space-separated list of significant tokens.
 *
 * The query is lower-cased, stripped of everything that is not a Unicode letter,
 * digit or whitespace, and then reduced to the tokens that are longer than two
 * characters and are not contained in the injected stop-word list.
 */
final readonly class QueryCleaner
{
    public function __construct(
        private StopWords $stopWords
    ) {
    }

    /**
     * Cleans a raw query string.
     *
     * @param string $query Raw user query.
     *
     * @return string Remaining tokens joined by a single space, or '' when nothing is left.
     *
     * @throws \RuntimeException When PCRE fails while normalising the query.
     */
    public function clean(string $query): string
    {
        $query = mb_strtolower($query);

        // Replace punctuation/symbols with spaces, then collapse whitespace runs.
        $query = $this->replace('/[^\p{L}\p{N}\s]/u', ' ', $query);
        $query = $this->replace('/\s+/u', ' ', $query);
        $query = trim($query);

        if ($query === '') {
            return '';
        }

        $tokens = explode(' ', $query);

        // The stop-word list comes from the injected StopWords service.
        $stopWords = $this->stopWords->getStopWords();

        $filtered = array_filter(
            $tokens,
            // Tokens of one or two characters are dropped along with stop words;
            // the length check also rules out empty tokens.
            static function (string $word) use ($stopWords): bool {
                return mb_strlen($word) > 2
                    && !in_array($word, $stopWords, true);
            }
        );

        return implode(' ', $filtered);
    }

    /**
     * Runs preg_replace() and turns its null-on-error result into an exception,
     * so a PCRE failure is reported as such instead of as a later TypeError.
     */
    private function replace(string $pattern, string $replacement, string $subject): string
    {
        $result = preg_replace($pattern, $replacement, $subject);

        if ($result === null) {
            throw new \RuntimeException('Query normalisation failed: ' . preg_last_error_msg());
        }

        return $result;
    }
}
|
||||||
@@ -4,6 +4,7 @@ declare(strict_types=1);
|
|||||||
|
|
||||||
namespace App\Knowledge\Retrieval;
|
namespace App\Knowledge\Retrieval;
|
||||||
|
|
||||||
|
use App\Knowledge\QueryCleaner;
|
||||||
use App\Vector\VectorSearchClient;
|
use App\Vector\VectorSearchClient;
|
||||||
|
|
||||||
final class NdjsonHybridRetriever implements RetrieverInterface
|
final class NdjsonHybridRetriever implements RetrieverInterface
|
||||||
@@ -11,10 +12,11 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
private const VECTOR_SCORE_THRESHOLD = 0.25;
|
private const VECTOR_SCORE_THRESHOLD = 0.25;
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private readonly NdjsonChunkLookup $lookup,
|
private readonly NdjsonChunkLookup $lookup,
|
||||||
private readonly VectorSearchClient $vectorClient,
|
private readonly VectorSearchClient $vectorClient,
|
||||||
private readonly int $maxChunks = 10,
|
private readonly QueryCleaner $queryCleaner,
|
||||||
private readonly int $vectorTopK = 10,
|
private readonly int $maxChunks = 25,
|
||||||
|
private readonly int $vectorTopK = 10,
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@@ -23,9 +25,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
{
|
{
|
||||||
$limit = $this->maxChunks;
|
$limit = $this->maxChunks;
|
||||||
$keywordChunks = [];
|
$keywordChunks = [];
|
||||||
|
$query = $this->queryCleaner->clean($prompt);
|
||||||
|
|
||||||
// Vector / enrichment
|
// Vector / enrichment
|
||||||
$hits = $this->vectorClient->search($prompt, $this->vectorTopK);
|
$hits = $this->vectorClient->search($query, $this->vectorTopK);
|
||||||
if ($hits === []) {
|
if ($hits === []) {
|
||||||
return $this->diversifyByDevice($keywordChunks, $limit, 1);
|
return $this->diversifyByDevice($keywordChunks, $limit, 1);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user