optimize py autoload

This commit is contained in:
team2
2026-02-26 17:43:22 +01:00
parent deba7cd06f
commit 5b4e0ad4c3
3 changed files with 131 additions and 1879 deletions

View File

@@ -4,17 +4,41 @@ declare(strict_types=1);
namespace App\Knowledge; namespace App\Knowledge;
final readonly class QueryCleaner final class QueryCleaner
{ {
public function __construct( /**
private StopWords $stopWords * Bereinigt eine Query ausschließlich für Retrieval-Zwecke.
) { *
} * Wichtig:
* - Unicode-sicher
* - Zahlen bleiben erhalten
* - Negationen bleiben erhalten
* - Keine aggressive Token-Längen-Filterung
* - StopWords werden entfernt
*/
public function clean(string $query): string public function clean(string $query): string
{ {
$query = mb_strtolower($query); if ($query === '') {
return '';
}
// 1. Unicode-sicher lowercase
$query = mb_strtolower($query, 'UTF-8');
// 2. Bindestriche & Slashes als Worttrenner behandeln
$query = str_replace(['-', '/'], ' ', $query);
// 3. Sonderzeichen entfernen, aber:
// - Buchstaben behalten
// - Zahlen behalten
// - Umlaute behalten
$query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query); $query = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $query);
if ($query === null) {
return '';
}
// 4. Mehrfache Whitespaces normalisieren
$query = preg_replace('/\s+/u', ' ', $query); $query = preg_replace('/\s+/u', ' ', $query);
$query = trim($query); $query = trim($query);
@@ -22,18 +46,35 @@ final readonly class QueryCleaner
return ''; return '';
} }
$tokens = explode(' ', $query); // 5. Tokenisierung
$stopWords = $this->stopWords->getStopWords(); // <-- wichtig: nutzt deine Klasse $tokens = preg_split('/\s+/u', $query);
$filtered = array_filter( if ($tokens === false) {
$tokens, return '';
function (string $word) use ($stopWords): bool { }
return $word !== ''
&& mb_strlen($word) > 2 $cleanTokens = [];
&& !in_array($word, $stopWords, true);
foreach ($tokens as $token) {
$token = trim($token);
if ($token === '') {
continue;
} }
);
return implode(' ', $filtered); // StopWords entfernen
if (StopWords::isStopWord($token)) {
continue;
}
$cleanTokens[] = $token;
}
if ($cleanTokens === []) {
return '';
}
return implode(' ', $cleanTokens);
} }
} }

View File

@@ -6,6 +6,7 @@ namespace App\Knowledge\Retrieval;
use App\Entity\ModelGenerationConfig; use App\Entity\ModelGenerationConfig;
use App\Knowledge\ChunkManager; use App\Knowledge\ChunkManager;
use App\Knowledge\QueryCleaner;
use App\Repository\ModelGenerationConfigRepository; use App\Repository\ModelGenerationConfigRepository;
use App\Tag\TagRoutingService; use App\Tag\TagRoutingService;
use App\Vector\VectorSearchClient; use App\Vector\VectorSearchClient;
@@ -24,6 +25,7 @@ final class NdjsonHybridRetriever implements RetrieverInterface
private readonly VectorSearchClient $vectorClient, private readonly VectorSearchClient $vectorClient,
private readonly TagRoutingService $tagRouting, private readonly TagRoutingService $tagRouting,
private readonly ModelGenerationConfigRepository $configRepository, private readonly ModelGenerationConfigRepository $configRepository,
private readonly QueryCleaner $queryCleaner,
) {} ) {}
public function retrieve(string $prompt): array public function retrieve(string $prompt): array
@@ -47,12 +49,21 @@ final class NdjsonHybridRetriever implements RetrieverInterface
$limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS)); $limit = max(1, min($config->getRetrievalMaxChunks(), self::HARD_MAX_CHUNKS));
$vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK)); $vectorTopKBase = max(1, min($config->getRetrievalVectorTopK(), self::HARD_MAX_VECTORK));
// Wichtig: List-Detection bleibt auf Originalprompt (sonst entfernst du "zeige/liste" etc.)
$isListQuery = $this->isListQuery($prompt); $isListQuery = $this->isListQuery($prompt);
// ------------------------------------------------- // -------------------------------------------------
// 1) Tag Routing // CLEAN QUERY (nur für Retrieval: Tags + Vector)
// ------------------------------------------------- // -------------------------------------------------
$candidateDocIds = $this->tagRouting->route($prompt); $cleanQuery = $this->queryCleaner->clean($prompt);
if ($cleanQuery === '') {
$cleanQuery = $prompt;
}
// -------------------------------------------------
// 1) Tag Routing (bereinigte Query)
// -------------------------------------------------
$candidateDocIds = $this->tagRouting->route($cleanQuery);
$candidateSet = null; $candidateSet = null;
if (is_array($candidateDocIds) && $candidateDocIds !== []) { if (is_array($candidateDocIds) && $candidateDocIds !== []) {
@@ -76,22 +87,22 @@ final class NdjsonHybridRetriever implements RetrieverInterface
} }
// ------------------------------------------------- // -------------------------------------------------
// 3) Vector Search (Scoped wenn möglich) // 3) Vector Search (bereinigte Query; scoped wenn möglich)
// ------------------------------------------------- // -------------------------------------------------
if ($candidateSet !== null) { if ($candidateSet !== null) {
$hits = $this->vectorClient->searchScoped( $hits = $this->vectorClient->searchScoped(
$prompt, $cleanQuery,
$topK, $topK,
array_keys($candidateSet) array_keys($candidateSet)
); );
// Wenn scoped nichts liefert → global fallback // Wenn scoped nichts liefert → global fallback
if ($hits === []) { if ($hits === []) {
$hits = $this->vectorClient->search($prompt, $vectorTopKBase); $hits = $this->vectorClient->search($cleanQuery, $vectorTopKBase);
} }
} else { } else {
$hits = $this->vectorClient->search($prompt, $topK); $hits = $this->vectorClient->search($cleanQuery, $topK);
} }
if ($hits === []) { if ($hits === []) {
@@ -294,7 +305,6 @@ final class NdjsonHybridRetriever implements RetrieverInterface
} }
$chunk = trim($rows[$id]['text']); $chunk = trim($rows[$id]['text']);
$key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk)); $key = mb_strtolower((string)preg_replace('/\s+/u', ' ', $chunk));
if (isset($seen[$key])) { if (isset($seen[$key])) {

File diff suppressed because it is too large Load Diff