Clean up code: move retriever constants and keyword lists into NdjsonHybridRetrieverConfig

This commit is contained in:
team 1
2026-04-21 18:22:57 +02:00
parent ce859b9662
commit 86caae5552
2 changed files with 84 additions and 65 deletions

View File

@@ -99,4 +99,69 @@ final class NdjsonHybridRetrieverConfig
* - still allow relevant continuation when needed * - still allow relevant continuation when needed
*/ */
public const MIN_CHUNK_DISTANCE = 2; public const MIN_CHUNK_DISTANCE = 2;
/**
 * Thresholds that let the retriever concentrate on a single document.
 *
 * When one document clearly dominates the top-ranked window,
 * temporarily switch from "spread" mode to "dominant document" mode.
 */
// How many of the top-ranked chunk ids NdjsonHybridRetriever slices off
// when looking for a dominant document.
public const DOMINANT_DOC_WINDOW = 6;
// A document id is returned as dominant once its count reaches this value
// (presumably counted inside the window above — confirm against the retriever).
public const DOMINANT_DOC_MIN_HITS = 3;
// Upper bound, combined via min() with the caller's limit, on chunks taken
// from the dominant document.
public const DOMINANT_DOC_MAX_CHUNKS = 4;
// Upper bound, combined via min() with the caller's limit, on chunks selected
// in the retriever's exact-document path.
public const EXACT_DOCUMENT_MAX_CHUNKS = 6;
// How many of the top-ranked chunk ids are examined when collecting
// focused-product candidates.
public const FOCUSED_PRODUCT_WINDOW = 8;
// The best candidate is rejected (null is returned) when its score is below this value...
public const FOCUSED_PRODUCT_MIN_SCORE = 10.0;
// ...or when its lead over the runner-up score is smaller than this gap.
public const FOCUSED_PRODUCT_MIN_GAP = 4.0;
// Upper bound, combined via min() with the caller's limit, on chunks taken
// for a focused product document.
public const FOCUSED_PRODUCT_MAX_CHUNKS = 4;
/**
 * Tokens treated as generic by NdjsonHybridRetriever::isGenericProductToken().
 *
 * The retriever turns this list into a key set (array_fill_keys) and tests
 * membership with isset(), so each token only needs to appear once. The
 * entries are German articles, prepositions and broad product/measurement
 * words, in both "ae/ue" transliterated and umlaut spellings.
 */
public const GENERIC_PRODUCT_TOKEN = [
    'der', 'die', 'das', 'ein', 'eine', 'einen', 'einem', 'und', 'oder', 'mit',
    'fuer', 'für', 'von', 'im', 'in', 'am', 'an', 'auf', 'zu', 'zum', 'zur',
    'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät',
    'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte',
    'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung',
    'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend',
    'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
    'welches', 'brauche', 'suche', 'bitte', 'gegen',
];
/**
 * Short tokens that NdjsonHybridRetriever::isImportantShortModelToken()
 * accepts via a strict in_array() lookup.
 */
public const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];
/**
 * Tokens that NdjsonHybridRetriever::isFamilyDescriptorToken() accepts via a
 * strict in_array() lookup (that method also accepts the short model tokens).
 */
public const FAMILY_DESCRIPTOR_TOKEN = [
    'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
    'inline', 'compact', 'panel', 'sc',
];
/**
 * Needles searched with str_contains() in a haystack string by the retriever.
 * They are matched as substrings, not as whole tokens, which is why a
 * multi-word entry such as 'service set' works here.
 */
public const LOOKS_LIKE_REAGENT_TOKENS = [
    'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
    'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
    'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
    'kerzenfilter', 'druckregler',
];
/**
 * Needles searched with str_contains() in a haystack string by the retriever;
 * matched as substrings, like the reagent needles.
 */
public const LOOKS_LIKE_SAFETY_DOCS = [
    'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
    'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
    'kennzeichnung', 'h290', 'pbt', 'vpvb',
];
// Prompt tokens checked by the retriever with containsAnyToken() to set its
// $asksReagent flag.
public const LOOKS_LIKE_REAGENT_WORDS = [
'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb',
'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde',
];
// Prompt tokens checked by the retriever with containsAnyToken() to set its
// $asksDocument flag.
public const LOOKS_LIKE_DOCUMENT_WORDS = [
'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung',
'sdb', 'sicherheitsdatenblatt', 'msds',
];
// Loaded by the retriever into $safetyWords next to the lists above;
// presumably matched against prompt tokens the same way — TODO confirm.
public const LOOKS_LIKE_SAFETY_WORDS = [
'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung',
'transport', 'lagerung', 'piktogramm',
];
// Loaded by the retriever into $deviceWords next to the lists above;
// presumably matched against prompt tokens the same way — TODO confirm.
public const LOOKS_LIKE_DEVICE_WORDS = [
'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat',
'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
];
} }

View File

@@ -31,18 +31,6 @@ use RuntimeException;
*/ */
final readonly class NdjsonHybridRetriever implements RetrieverInterface final readonly class NdjsonHybridRetriever implements RetrieverInterface
{ {
/**
* When one document clearly dominates the top-ranked window,
* temporarily switch from "spread" mode to "dominant document" mode.
*/
private const DOMINANT_DOC_WINDOW = 6;
private const DOMINANT_DOC_MIN_HITS = 3;
private const DOMINANT_DOC_MAX_CHUNKS = 4;
private const EXACT_DOCUMENT_MAX_CHUNKS = 6;
private const FOCUSED_PRODUCT_WINDOW = 8;
private const FOCUSED_PRODUCT_MIN_SCORE = 10.0;
private const FOCUSED_PRODUCT_MIN_GAP = 4.0;
private const FOCUSED_PRODUCT_MAX_CHUNKS = 4;
public function __construct( public function __construct(
private NdjsonChunkLookup $lookup, private NdjsonChunkLookup $lookup,
@@ -588,7 +576,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
}); });
$selected = []; $selected = [];
$max = min($limit, self::EXACT_DOCUMENT_MAX_CHUNKS); $max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
foreach ($rows as $row) { foreach ($rows as $row) {
$chunkId = $row['chunk_id'] ?? null; $chunkId = $row['chunk_id'] ?? null;
@@ -751,7 +739,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$candidates = []; $candidates = [];
$seenDocs = []; $seenDocs = [];
foreach (array_slice($chunkIds, 0, self::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) { foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) {
$row = $rows[$chunkId] ?? null; $row = $rows[$chunkId] ?? null;
if (!is_array($row)) { if (!is_array($row)) {
continue; continue;
@@ -797,7 +785,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$bestScore = (float)$best['score']; $bestScore = (float)$best['score'];
$gap = $bestScore - $runnerUpScore; $gap = $bestScore - $runnerUpScore;
if ($bestScore < self::FOCUSED_PRODUCT_MIN_SCORE || $gap < self::FOCUSED_PRODUCT_MIN_GAP) { if ($bestScore < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_SCORE || $gap < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_GAP) {
return null; return null;
} }
@@ -825,22 +813,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$normalized = $this->normalizeText($prompt); $normalized = $this->normalizeText($prompt);
$tokens = $this->tokenizeText($normalized); $tokens = $this->tokenizeText($normalized);
$reagentWords = [ $reagentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_WORDS;
'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb', $documentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DOCUMENT_WORDS;
'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde', $safetyWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_WORDS;
]; $deviceWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DEVICE_WORDS;
$documentWords = [
'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung',
'sdb', 'sicherheitsdatenblatt', 'msds',
];
$safetyWords = [
'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung',
'transport', 'lagerung', 'piktogramm',
];
$deviceWords = [
'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat',
'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
];
$asksReagent = $this->containsAnyToken($tokens, $reagentWords); $asksReagent = $this->containsAnyToken($tokens, $reagentWords);
$asksDocument = $this->containsAnyToken($tokens, $documentWords); $asksDocument = $this->containsAnyToken($tokens, $documentWords);
@@ -981,7 +957,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$documentId, $documentId,
$chunkIds, $chunkIds,
$rows, $rows,
min($limit, self::FOCUSED_PRODUCT_MAX_CHUNKS) min($limit, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MAX_CHUNKS)
); );
} }
@@ -996,7 +972,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
{ {
$docWindow = []; $docWindow = [];
foreach (array_slice($chunkIds, 0, self::DOMINANT_DOC_WINDOW) as $chunkId) { foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::DOMINANT_DOC_WINDOW) as $chunkId) {
if (!isset($rows[$chunkId]['text'])) { if (!isset($rows[$chunkId]['text'])) {
continue; continue;
} }
@@ -1026,7 +1002,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
$dominantCount = (int)($counts[$dominantDocId] ?? 0); $dominantCount = (int)($counts[$dominantDocId] ?? 0);
if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) { if ($dominantCount >= NdjsonHybridRetrieverConfig::DOMINANT_DOC_MIN_HITS) {
return $dominantDocId; return $dominantDocId;
} }
@@ -1088,7 +1064,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return []; return [];
} }
$maxFromDoc = min($limit, self::DOMINANT_DOC_MAX_CHUNKS); $maxFromDoc = min($limit, NdjsonHybridRetrieverConfig::DOMINANT_DOC_MAX_CHUNKS);
if ($anchorChunkIndex !== null) { if ($anchorChunkIndex !== null) {
usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int { usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int {
@@ -1353,17 +1329,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
*/ */
private function isGenericProductToken(string $token): bool private function isGenericProductToken(string $token): bool
{ {
static $generic = [ static $generic = NdjsonHybridRetrieverConfig::GENERIC_PRODUCT_TOKEN;
'der', 'die', 'das', 'ein', 'eine', 'einen', 'einem', 'und', 'oder', 'mit',
'fuer', 'für', 'von', 'im', 'in', 'am', 'an', 'auf', 'zu', 'zum', 'zur',
'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät',
'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte',
'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung',
'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend',
'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
'welches', 'brauche', 'suche', 'bitte', 'fuer', 'gegen', 'und', 'oder',
];
return isset(array_fill_keys($generic, true)[$token]); return isset(array_fill_keys($generic, true)[$token]);
} }
@@ -1372,7 +1338,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
*/ */
private function isImportantShortModelToken(string $token): bool private function isImportantShortModelToken(string $token): bool
{ {
static $allowed = ['th', 'tc', 'tp', 'tm', 'ph', 'rx']; static $allowed = NdjsonHybridRetrieverConfig::IMPORTANT_SHORT_MODEL_TOKEN;
return in_array($token, $allowed, true); return in_array($token, $allowed, true);
} }
@@ -1382,10 +1348,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
*/ */
private function isFamilyDescriptorToken(string $token): bool private function isFamilyDescriptorToken(string $token): bool
{ {
static $familyDescriptors = [ static $familyDescriptors = NdjsonHybridRetrieverConfig::FAMILY_DESCRIPTOR_TOKEN;
'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
'inline', 'compact', 'panel', 'sc',
];
return in_array($token, $familyDescriptors, true) return in_array($token, $familyDescriptors, true)
|| $this->isImportantShortModelToken($token) || $this->isImportantShortModelToken($token)
@@ -1403,12 +1366,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return false; return false;
} }
$needles = [ $needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_TOKENS;
'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
'kerzenfilter', 'druckregler',
];
foreach ($needles as $needle) { foreach ($needles as $needle) {
if (str_contains($haystack, $needle)) { if (str_contains($haystack, $needle)) {
@@ -1430,11 +1388,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
return false; return false;
} }
$needles = [ $needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_DOCS;
'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
'kennzeichnung', 'h290', 'pbt', 'vpvb',
];
foreach ($needles as $needle) { foreach ($needles as $needle) {
if (str_contains($haystack, $needle)) { if (str_contains($haystack, $needle)) {