diff --git a/src/Knowledge/QueryCleaner.php b/src/Knowledge/QueryCleaner.php new file mode 100644 index 0000000..bf38e31 --- /dev/null +++ b/src/Knowledge/QueryCleaner.php @@ -0,0 +1,39 @@ +stopWords->getStopWords(); // <-- important: uses your class + + $filtered = array_filter( + $tokens, + function (string $word) use ($stopWords): bool { + return $word !== '' + && mb_strlen($word) > 2 + && !in_array($word, $stopWords, true); + } + ); + + return implode(' ', $filtered); + } +} diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index 6ebc492..409f419 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -4,6 +4,7 @@ declare(strict_types=1); namespace App\Knowledge\Retrieval; +use App\Knowledge\QueryCleaner; use App\Vector\VectorSearchClient; final class NdjsonHybridRetriever implements RetrieverInterface @@ -11,10 +12,11 @@ final class NdjsonHybridRetriever implements RetrieverInterface private const VECTOR_SCORE_THRESHOLD = 0.25; public function __construct( - private readonly NdjsonChunkLookup $lookup, - private readonly VectorSearchClient $vectorClient, - private readonly int $maxChunks = 10, - private readonly int $vectorTopK = 10, + private readonly NdjsonChunkLookup $lookup, + private readonly VectorSearchClient $vectorClient, + private readonly QueryCleaner $queryCleaner, + private readonly int $maxChunks = 25, + private readonly int $vectorTopK = 10, ) { } @@ -23,9 +25,10 @@ final class NdjsonHybridRetriever implements RetrieverInterface { $limit = $this->maxChunks; $keywordChunks = []; + $query = $this->queryCleaner->clean($prompt); // Vector / enrichment - $hits = $this->vectorClient->search($prompt, $this->vectorTopK); + $hits = $this->vectorClient->search($query, $this->vectorTopK); if ($hits === []) { return $this->diversifyByDevice($keywordChunks, $limit, 1); }