cleanup code
This commit is contained in:
@@ -99,4 +99,69 @@ final class NdjsonHybridRetrieverConfig
|
||||
* - still allow relevant continuation when needed
|
||||
*/
|
||||
public const MIN_CHUNK_DISTANCE = 2;
|
||||
|
||||
/**
|
||||
* When one document clearly dominates the top-ranked window,
|
||||
* temporarily switch from "spread" mode to "dominant document" mode.
|
||||
*/
|
||||
public const DOMINANT_DOC_WINDOW = 6;
|
||||
public const DOMINANT_DOC_MIN_HITS = 3;
|
||||
public const DOMINANT_DOC_MAX_CHUNKS = 4;
|
||||
public const EXACT_DOCUMENT_MAX_CHUNKS = 6;
|
||||
public const FOCUSED_PRODUCT_WINDOW = 8;
|
||||
public const FOCUSED_PRODUCT_MIN_SCORE = 10.0;
|
||||
public const FOCUSED_PRODUCT_MIN_GAP = 4.0;
|
||||
public const FOCUSED_PRODUCT_MAX_CHUNKS = 4;
|
||||
|
||||
/**
 * Tokens that NdjsonHybridRetriever::isGenericProductToken() treats as generic.
 *
 * The list is only used for membership checks, so every entry appears
 * exactly once (earlier revisions repeated 'fuer', 'und' and 'oder').
 */
public const GENERIC_PRODUCT_TOKEN = [
    'der', 'die', 'das', 'ein', 'eine', 'einen', 'einem', 'und', 'oder', 'mit',
    'fuer', 'für', 'von', 'im', 'in', 'am', 'an', 'auf', 'zu', 'zum', 'zur',
    'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät',
    'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte',
    'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung',
    'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend',
    'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
    'welches', 'brauche', 'suche', 'bitte', 'gegen',
];
|
||||
|
||||
/**
 * Short tokens that NdjsonHybridRetriever::isImportantShortModelToken()
 * accepts via a strict in_array() check.
 */
public const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];
|
||||
|
||||
/**
 * Tokens that NdjsonHybridRetriever::isFamilyDescriptorToken() matches
 * with a strict in_array() check.
 */
public const FAMILY_DESCRIPTOR_TOKEN = [
    'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
    'inline', 'compact', 'panel', 'sc',
];
|
||||
|
||||
/**
 * Substring needles: the retriever tests each entry against a haystack
 * with str_contains(), so multi-word entries such as 'service set' are valid.
 */
public const LOOKS_LIKE_REAGENT_TOKENS = [
    'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
    'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
    'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
    'kerzenfilter', 'druckregler',
];
|
||||
|
||||
/**
 * Substring needles: the retriever tests each entry against a haystack
 * with str_contains().
 */
public const LOOKS_LIKE_SAFETY_DOCS = [
    'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
    'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
    'kennzeichnung', 'h290', 'pbt', 'vpvb',
];
|
||||
|
||||
public const LOOKS_LIKE_REAGENT_WORDS = [
|
||||
'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb',
|
||||
'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde',
|
||||
];
|
||||
|
||||
public const LOOKS_LIKE_DOCUMENT_WORDS = [
|
||||
'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung',
|
||||
'sdb', 'sicherheitsdatenblatt', 'msds',
|
||||
];
|
||||
|
||||
public const LOOKS_LIKE_SAFETY_WORDS = [
|
||||
'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung',
|
||||
'transport', 'lagerung', 'piktogramm',
|
||||
];
|
||||
|
||||
public const LOOKS_LIKE_DEVICE_WORDS = [
|
||||
'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat',
|
||||
'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
|
||||
];
|
||||
|
||||
}
|
||||
@@ -31,18 +31,6 @@ use RuntimeException;
|
||||
*/
|
||||
final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
/**
|
||||
* When one document clearly dominates the top-ranked window,
|
||||
* temporarily switch from "spread" mode to "dominant document" mode.
|
||||
*/
|
||||
private const DOMINANT_DOC_WINDOW = 6;
|
||||
private const DOMINANT_DOC_MIN_HITS = 3;
|
||||
private const DOMINANT_DOC_MAX_CHUNKS = 4;
|
||||
private const EXACT_DOCUMENT_MAX_CHUNKS = 6;
|
||||
private const FOCUSED_PRODUCT_WINDOW = 8;
|
||||
private const FOCUSED_PRODUCT_MIN_SCORE = 10.0;
|
||||
private const FOCUSED_PRODUCT_MIN_GAP = 4.0;
|
||||
private const FOCUSED_PRODUCT_MAX_CHUNKS = 4;
|
||||
|
||||
public function __construct(
|
||||
private NdjsonChunkLookup $lookup,
|
||||
@@ -588,7 +576,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
});
|
||||
|
||||
$selected = [];
|
||||
$max = min($limit, self::EXACT_DOCUMENT_MAX_CHUNKS);
|
||||
$max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
|
||||
|
||||
foreach ($rows as $row) {
|
||||
$chunkId = $row['chunk_id'] ?? null;
|
||||
@@ -751,7 +739,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$candidates = [];
|
||||
$seenDocs = [];
|
||||
|
||||
foreach (array_slice($chunkIds, 0, self::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) {
|
||||
foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) {
|
||||
$row = $rows[$chunkId] ?? null;
|
||||
if (!is_array($row)) {
|
||||
continue;
|
||||
@@ -797,7 +785,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$bestScore = (float)$best['score'];
|
||||
$gap = $bestScore - $runnerUpScore;
|
||||
|
||||
if ($bestScore < self::FOCUSED_PRODUCT_MIN_SCORE || $gap < self::FOCUSED_PRODUCT_MIN_GAP) {
|
||||
if ($bestScore < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_SCORE || $gap < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_GAP) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -825,22 +813,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$normalized = $this->normalizeText($prompt);
|
||||
$tokens = $this->tokenizeText($normalized);
|
||||
|
||||
$reagentWords = [
|
||||
'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb',
|
||||
'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde',
|
||||
];
|
||||
$documentWords = [
|
||||
'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung',
|
||||
'sdb', 'sicherheitsdatenblatt', 'msds',
|
||||
];
|
||||
$safetyWords = [
|
||||
'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung',
|
||||
'transport', 'lagerung', 'piktogramm',
|
||||
];
|
||||
$deviceWords = [
|
||||
'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat',
|
||||
'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
|
||||
];
|
||||
$reagentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_WORDS;
|
||||
$documentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DOCUMENT_WORDS;
|
||||
$safetyWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_WORDS;
|
||||
$deviceWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DEVICE_WORDS;
|
||||
|
||||
$asksReagent = $this->containsAnyToken($tokens, $reagentWords);
|
||||
$asksDocument = $this->containsAnyToken($tokens, $documentWords);
|
||||
@@ -981,7 +957,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
$documentId,
|
||||
$chunkIds,
|
||||
$rows,
|
||||
min($limit, self::FOCUSED_PRODUCT_MAX_CHUNKS)
|
||||
min($limit, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MAX_CHUNKS)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -996,7 +972,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
{
|
||||
$docWindow = [];
|
||||
|
||||
foreach (array_slice($chunkIds, 0, self::DOMINANT_DOC_WINDOW) as $chunkId) {
|
||||
foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::DOMINANT_DOC_WINDOW) as $chunkId) {
|
||||
if (!isset($rows[$chunkId]['text'])) {
|
||||
continue;
|
||||
}
|
||||
@@ -1026,7 +1002,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
|
||||
$dominantCount = (int)($counts[$dominantDocId] ?? 0);
|
||||
|
||||
if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) {
|
||||
if ($dominantCount >= NdjsonHybridRetrieverConfig::DOMINANT_DOC_MIN_HITS) {
|
||||
return $dominantDocId;
|
||||
}
|
||||
|
||||
@@ -1088,7 +1064,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return [];
|
||||
}
|
||||
|
||||
$maxFromDoc = min($limit, self::DOMINANT_DOC_MAX_CHUNKS);
|
||||
$maxFromDoc = min($limit, NdjsonHybridRetrieverConfig::DOMINANT_DOC_MAX_CHUNKS);
|
||||
|
||||
if ($anchorChunkIndex !== null) {
|
||||
usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int {
|
||||
@@ -1353,17 +1329,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
*/
|
||||
private function isGenericProductToken(string $token): bool
{
    // The token list lives in NdjsonHybridRetrieverConfig; it is declared once
    // here (a second `static $generic` declaration is a compile error on PHP 8.3+).
    // The keyed lookup map is built on first use and reused on later calls
    // instead of being rebuilt by array_fill_keys() for every token.
    static $lookup = null;
    $lookup ??= array_fill_keys(NdjsonHybridRetrieverConfig::GENERIC_PRODUCT_TOKEN, true);

    return isset($lookup[$token]);
}
|
||||
|
||||
@@ -1372,7 +1338,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
*/
|
||||
private function isImportantShortModelToken(string $token): bool
{
    // Strict comparison against the shared config list; no local static copy
    // is needed because class constants are already resolved once.
    return in_array($token, NdjsonHybridRetrieverConfig::IMPORTANT_SHORT_MODEL_TOKEN, true);
}
|
||||
@@ -1382,10 +1348,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
*/
|
||||
private function isFamilyDescriptorToken(string $token): bool
|
||||
{
|
||||
static $familyDescriptors = [
|
||||
'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
|
||||
'inline', 'compact', 'panel', 'sc',
|
||||
];
|
||||
static $familyDescriptors = NdjsonHybridRetrieverConfig::FAMILY_DESCRIPTOR_TOKEN;
|
||||
|
||||
return in_array($token, $familyDescriptors, true)
|
||||
|| $this->isImportantShortModelToken($token)
|
||||
@@ -1403,12 +1366,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return false;
|
||||
}
|
||||
|
||||
$needles = [
|
||||
'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
|
||||
'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
|
||||
'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
|
||||
'kerzenfilter', 'druckregler',
|
||||
];
|
||||
$needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_TOKENS;
|
||||
|
||||
foreach ($needles as $needle) {
|
||||
if (str_contains($haystack, $needle)) {
|
||||
@@ -1430,11 +1388,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||
return false;
|
||||
}
|
||||
|
||||
$needles = [
|
||||
'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
|
||||
'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
|
||||
'kennzeichnung', 'h290', 'pbt', 'vpvb',
|
||||
];
|
||||
$needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_DOCS;
|
||||
|
||||
foreach ($needles as $needle) {
|
||||
if (str_contains($haystack, $needle)) {
|
||||
|
||||
Reference in New Issue
Block a user