optimize py autoload
This commit is contained in:
@@ -4,17 +4,41 @@ declare(strict_types=1);
|
|||||||
|
|
||||||
namespace App\Knowledge;
|
namespace App\Knowledge;
|
||||||
|
|
||||||
final readonly class QueryCleaner
|
final class QueryCleaner
|
||||||
{
|
{
|
||||||
public function __construct(
|
/**
|
||||||
private StopWords $stopWords
|
* Bereinigt eine Query ausschließlich für Retrieval-Zwecke.
|
||||||
) {
|
*
|
||||||
}
|
* Wichtig:
|
||||||
|
* - Unicode-sicher
|
||||||
|
* - Zahlen bleiben erhalten
|
||||||
|
* - Negationen bleiben erhalten
|
||||||
|
* - Keine aggressive Token-Längen-Filterung
|
||||||
|
* - StopWords werden entfernt
|
||||||
|
*/
|
||||||
public function clean(string $query): string
|
public function clean(string $query): string
|
||||||
{
|
{
|
||||||
$query = mb_strtolower($query);
|
if ($query === '') {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
|
// 1. Unicode-sicher lowercase
|
||||||
|
$query = mb_strtolower($query, 'UTF-8');
|
||||||
|
|
||||||
|
// 2. Bindestriche & Slashes als Worttrenner behandeln
|
||||||
|
$query = str_replace(['-', '/'], ' ', $query);
|
||||||
|
|
||||||
|
// 3. Sonderzeichen entfernen, aber:
|
||||||
|
// - Buchstaben behalten
|
||||||
|
// - Zahlen behalten
|
||||||
|
// - Umlaute behalten
|
||||||
$query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query);
|
$query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query);
|
||||||
|
|
||||||
|
if ($query === null) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4. Mehrfache Whitespaces normalisieren
|
||||||
$query = preg_replace('/\s+/u', ' ', $query);
|
$query = preg_replace('/\s+/u', ' ', $query);
|
||||||
$query = trim($query);
|
$query = trim($query);
|
||||||
|
|
||||||
@@ -22,18 +46,35 @@ final readonly class QueryCleaner
|
|||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
$tokens = explode(' ', $query);
|
// 5. Tokenisierung
|
||||||
$stopWords = $this->stopWords->getStopWords(); // <-- wichtig: nutzt deine Klasse
|
$tokens = preg_split('/\s+/u', $query);
|
||||||
|
|
||||||
$filtered = array_filter(
|
if ($tokens === false) {
|
||||||
$tokens,
|
return '';
|
||||||
function (string $word) use ($stopWords): bool {
|
|
||||||
return $word !== ''
|
|
||||||
&& mb_strlen($word) > 2
|
|
||||||
&& !in_array($word, $stopWords, true);
|
|
||||||
}
|
}
|
||||||
);
|
|
||||||
|
|
||||||
return implode(' ', $filtered);
|
$cleanTokens = [];
|
||||||
|
|
||||||
|
foreach ($tokens as $token) {
|
||||||
|
|
||||||
|
$token = trim($token);
|
||||||
|
|
||||||
|
if ($token === '') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// StopWords entfernen
|
||||||
|
if (StopWords::isStopWord($token)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
$cleanTokens[] = $token;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($cleanTokens === []) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
|
return implode(' ', $cleanTokens);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -6,6 +6,7 @@ namespace App\Knowledge\Retrieval;
|
|||||||
|
|
||||||
use App\Entity\ModelGenerationConfig;
|
use App\Entity\ModelGenerationConfig;
|
||||||
use App\Knowledge\ChunkManager;
|
use App\Knowledge\ChunkManager;
|
||||||
|
use App\Knowledge\QueryCleaner;
|
||||||
use App\Repository\ModelGenerationConfigRepository;
|
use App\Repository\ModelGenerationConfigRepository;
|
||||||
use App\Tag\TagRoutingService;
|
use App\Tag\TagRoutingService;
|
||||||
use App\Vector\VectorSearchClient;
|
use App\Vector\VectorSearchClient;
|
||||||
@@ -24,6 +25,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
private readonly VectorSearchClient $vectorClient,
|
private readonly VectorSearchClient $vectorClient,
|
||||||
private readonly TagRoutingService $tagRouting,
|
private readonly TagRoutingService $tagRouting,
|
||||||
private readonly ModelGenerationConfigRepository $configRepository,
|
private readonly ModelGenerationConfigRepository $configRepository,
|
||||||
|
private readonly QueryCleaner $queryCleaner,
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
public function retrieve(string $prompt): array
|
public function retrieve(string $prompt): array
|
||||||
@@ -47,12 +49,21 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
|
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
|
||||||
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
|
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
|
||||||
|
|
||||||
|
// Wichtig: List-Detection bleibt auf Originalprompt (sonst entfernst du "zeige/liste" etc.)
|
||||||
$isListQuery = $this->isListQuery($prompt);
|
$isListQuery = $this->isListQuery($prompt);
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// 1) Tag Routing
|
// CLEAN QUERY (nur für Retrieval: Tags + Vector)
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
$candidateDocIds = $this->tagRouting->route($prompt);
|
$cleanQuery = $this->queryCleaner->clean($prompt);
|
||||||
|
if ($cleanQuery === '') {
|
||||||
|
$cleanQuery = $prompt;
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------
|
||||||
|
// 1) Tag Routing (bereinigte Query)
|
||||||
|
// -------------------------------------------------
|
||||||
|
$candidateDocIds = $this->tagRouting->route($cleanQuery);
|
||||||
$candidateSet = null;
|
$candidateSet = null;
|
||||||
|
|
||||||
if (is_array($candidateDocIds) && $candidateDocIds !== []) {
|
if (is_array($candidateDocIds) && $candidateDocIds !== []) {
|
||||||
@@ -76,22 +87,22 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// 3) Vector Search (Scoped wenn möglich)
|
// 3) Vector Search (bereinigte Query; scoped wenn möglich)
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
if ($candidateSet !== null) {
|
if ($candidateSet !== null) {
|
||||||
$hits = $this->vectorClient->searchScoped(
|
$hits = $this->vectorClient->searchScoped(
|
||||||
$prompt,
|
$cleanQuery,
|
||||||
$topK,
|
$topK,
|
||||||
array_keys($candidateSet)
|
array_keys($candidateSet)
|
||||||
);
|
);
|
||||||
|
|
||||||
// Wenn scoped nichts liefert → global fallback
|
// Wenn scoped nichts liefert → global fallback
|
||||||
if ($hits === []) {
|
if ($hits === []) {
|
||||||
$hits = $this->vectorClient->search($prompt, $vectorTopKBase);
|
$hits = $this->vectorClient->search($cleanQuery, $vectorTopKBase);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
$hits = $this->vectorClient->search($prompt, $topK);
|
$hits = $this->vectorClient->search($cleanQuery, $topK);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($hits === []) {
|
if ($hits === []) {
|
||||||
@@ -294,7 +305,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
}
|
}
|
||||||
|
|
||||||
$chunk = trim($rows[$id]['text']);
|
$chunk = trim($rows[$id]['text']);
|
||||||
|
|
||||||
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
|
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
|
||||||
|
|
||||||
if (isset($seen[$key])) {
|
if (isset($seen[$key])) {
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user