cleanup code
This commit is contained in:
@@ -99,4 +99,69 @@ final class NdjsonHybridRetrieverConfig
|
|||||||
* - still allow relevant continuation when needed
|
* - still allow relevant continuation when needed
|
||||||
*/
|
*/
|
||||||
public const MIN_CHUNK_DISTANCE = 2;
|
public const MIN_CHUNK_DISTANCE = 2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When one document clearly dominates the top-ranked window,
|
||||||
|
* temporarily switch from "spread" mode to "dominant document" mode.
|
||||||
|
*/
|
||||||
|
public const DOMINANT_DOC_WINDOW = 6;
|
||||||
|
public const DOMINANT_DOC_MIN_HITS = 3;
|
||||||
|
public const DOMINANT_DOC_MAX_CHUNKS = 4;
|
||||||
|
public const EXACT_DOCUMENT_MAX_CHUNKS = 6;
|
||||||
|
public const FOCUSED_PRODUCT_WINDOW = 8;
|
||||||
|
public const FOCUSED_PRODUCT_MIN_SCORE = 10.0;
|
||||||
|
public const FOCUSED_PRODUCT_MIN_GAP = 4.0;
|
||||||
|
public const FOCUSED_PRODUCT_MAX_CHUNKS = 4;
|
||||||
|
|
||||||
|
public const GENERIC_PRODUCT_TOKEN = [
|
||||||
|
'der', 'die', 'das', 'ein', 'eine', 'einen', 'einem', 'und', 'oder', 'mit',
|
||||||
|
'fuer', 'für', 'von', 'im', 'in', 'am', 'an', 'auf', 'zu', 'zum', 'zur',
|
||||||
|
'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät',
|
||||||
|
'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte',
|
||||||
|
'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung',
|
||||||
|
'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend',
|
||||||
|
'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
|
||||||
|
'welches', 'brauche', 'suche', 'bitte', 'fuer', 'gegen', 'und', 'oder',
|
||||||
|
];
|
||||||
|
|
||||||
|
const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];
|
||||||
|
|
||||||
|
const FAMILY_DESCRIPTOR_TOKEN = [
|
||||||
|
'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
|
||||||
|
'inline', 'compact', 'panel', 'sc',
|
||||||
|
];
|
||||||
|
|
||||||
|
const LOOKS_LIKE_REAGENT_TOKENS = [
|
||||||
|
'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
|
||||||
|
'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
|
||||||
|
'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
|
||||||
|
'kerzenfilter', 'druckregler',
|
||||||
|
];
|
||||||
|
|
||||||
|
const LOOKS_LIKE_SAFETY_DOCS = [
|
||||||
|
'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
|
||||||
|
'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
|
||||||
|
'kennzeichnung', 'h290', 'pbt', 'vpvb',
|
||||||
|
];
|
||||||
|
|
||||||
|
public const LOOKS_LIKE_REAGENT_WORDS = [
|
||||||
|
'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb',
|
||||||
|
'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde',
|
||||||
|
];
|
||||||
|
|
||||||
|
public const LOOKS_LIKE_DOCUMENT_WORDS = [
|
||||||
|
'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung',
|
||||||
|
'sdb', 'sicherheitsdatenblatt', 'msds',
|
||||||
|
];
|
||||||
|
|
||||||
|
public const LOOKS_LIKE_SAFETY_WORDS = [
|
||||||
|
'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung',
|
||||||
|
'transport', 'lagerung', 'piktogramm',
|
||||||
|
];
|
||||||
|
|
||||||
|
public const LOOKS_LIKE_DEVICE_WORDS = [
|
||||||
|
'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat',
|
||||||
|
'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
|
||||||
|
];
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -31,18 +31,6 @@ use RuntimeException;
|
|||||||
*/
|
*/
|
||||||
final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
||||||
{
|
{
|
||||||
/**
|
|
||||||
* When one document clearly dominates the top-ranked window,
|
|
||||||
* temporarily switch from "spread" mode to "dominant document" mode.
|
|
||||||
*/
|
|
||||||
private const DOMINANT_DOC_WINDOW = 6;
|
|
||||||
private const DOMINANT_DOC_MIN_HITS = 3;
|
|
||||||
private const DOMINANT_DOC_MAX_CHUNKS = 4;
|
|
||||||
private const EXACT_DOCUMENT_MAX_CHUNKS = 6;
|
|
||||||
private const FOCUSED_PRODUCT_WINDOW = 8;
|
|
||||||
private const FOCUSED_PRODUCT_MIN_SCORE = 10.0;
|
|
||||||
private const FOCUSED_PRODUCT_MIN_GAP = 4.0;
|
|
||||||
private const FOCUSED_PRODUCT_MAX_CHUNKS = 4;
|
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private NdjsonChunkLookup $lookup,
|
private NdjsonChunkLookup $lookup,
|
||||||
@@ -588,7 +576,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
});
|
});
|
||||||
|
|
||||||
$selected = [];
|
$selected = [];
|
||||||
$max = min($limit, self::EXACT_DOCUMENT_MAX_CHUNKS);
|
$max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS);
|
||||||
|
|
||||||
foreach ($rows as $row) {
|
foreach ($rows as $row) {
|
||||||
$chunkId = $row['chunk_id'] ?? null;
|
$chunkId = $row['chunk_id'] ?? null;
|
||||||
@@ -751,7 +739,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
$candidates = [];
|
$candidates = [];
|
||||||
$seenDocs = [];
|
$seenDocs = [];
|
||||||
|
|
||||||
foreach (array_slice($chunkIds, 0, self::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) {
|
foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) {
|
||||||
$row = $rows[$chunkId] ?? null;
|
$row = $rows[$chunkId] ?? null;
|
||||||
if (!is_array($row)) {
|
if (!is_array($row)) {
|
||||||
continue;
|
continue;
|
||||||
@@ -797,7 +785,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
$bestScore = (float)$best['score'];
|
$bestScore = (float)$best['score'];
|
||||||
$gap = $bestScore - $runnerUpScore;
|
$gap = $bestScore - $runnerUpScore;
|
||||||
|
|
||||||
if ($bestScore < self::FOCUSED_PRODUCT_MIN_SCORE || $gap < self::FOCUSED_PRODUCT_MIN_GAP) {
|
if ($bestScore < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_SCORE || $gap < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_GAP) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -825,22 +813,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
$normalized = $this->normalizeText($prompt);
|
$normalized = $this->normalizeText($prompt);
|
||||||
$tokens = $this->tokenizeText($normalized);
|
$tokens = $this->tokenizeText($normalized);
|
||||||
|
|
||||||
$reagentWords = [
|
$reagentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_WORDS;
|
||||||
'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb',
|
$documentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DOCUMENT_WORDS;
|
||||||
'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde',
|
$safetyWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_WORDS;
|
||||||
];
|
$deviceWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DEVICE_WORDS;
|
||||||
$documentWords = [
|
|
||||||
'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung',
|
|
||||||
'sdb', 'sicherheitsdatenblatt', 'msds',
|
|
||||||
];
|
|
||||||
$safetyWords = [
|
|
||||||
'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung',
|
|
||||||
'transport', 'lagerung', 'piktogramm',
|
|
||||||
];
|
|
||||||
$deviceWords = [
|
|
||||||
'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat',
|
|
||||||
'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor',
|
|
||||||
];
|
|
||||||
|
|
||||||
$asksReagent = $this->containsAnyToken($tokens, $reagentWords);
|
$asksReagent = $this->containsAnyToken($tokens, $reagentWords);
|
||||||
$asksDocument = $this->containsAnyToken($tokens, $documentWords);
|
$asksDocument = $this->containsAnyToken($tokens, $documentWords);
|
||||||
@@ -972,16 +948,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
*/
|
*/
|
||||||
private function selectFocusedProductChunkIds(
|
private function selectFocusedProductChunkIds(
|
||||||
string $documentId,
|
string $documentId,
|
||||||
array $chunkIds,
|
array $chunkIds,
|
||||||
array $rows,
|
array $rows,
|
||||||
int $limit
|
int $limit
|
||||||
): array
|
): array
|
||||||
{
|
{
|
||||||
return $this->selectDominantDocumentChunkIds(
|
return $this->selectDominantDocumentChunkIds(
|
||||||
$documentId,
|
$documentId,
|
||||||
$chunkIds,
|
$chunkIds,
|
||||||
$rows,
|
$rows,
|
||||||
min($limit, self::FOCUSED_PRODUCT_MAX_CHUNKS)
|
min($limit, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MAX_CHUNKS)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -996,7 +972,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
{
|
{
|
||||||
$docWindow = [];
|
$docWindow = [];
|
||||||
|
|
||||||
foreach (array_slice($chunkIds, 0, self::DOMINANT_DOC_WINDOW) as $chunkId) {
|
foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::DOMINANT_DOC_WINDOW) as $chunkId) {
|
||||||
if (!isset($rows[$chunkId]['text'])) {
|
if (!isset($rows[$chunkId]['text'])) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1026,7 +1002,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
|
|
||||||
$dominantCount = (int)($counts[$dominantDocId] ?? 0);
|
$dominantCount = (int)($counts[$dominantDocId] ?? 0);
|
||||||
|
|
||||||
if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) {
|
if ($dominantCount >= NdjsonHybridRetrieverConfig::DOMINANT_DOC_MIN_HITS) {
|
||||||
return $dominantDocId;
|
return $dominantDocId;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1088,7 +1064,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
$maxFromDoc = min($limit, self::DOMINANT_DOC_MAX_CHUNKS);
|
$maxFromDoc = min($limit, NdjsonHybridRetrieverConfig::DOMINANT_DOC_MAX_CHUNKS);
|
||||||
|
|
||||||
if ($anchorChunkIndex !== null) {
|
if ($anchorChunkIndex !== null) {
|
||||||
usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int {
|
usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int {
|
||||||
@@ -1353,17 +1329,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
*/
|
*/
|
||||||
private function isGenericProductToken(string $token): bool
|
private function isGenericProductToken(string $token): bool
|
||||||
{
|
{
|
||||||
static $generic = [
|
static $generic = NdjsonHybridRetrieverConfig::GENERIC_PRODUCT_TOKEN;
|
||||||
'der', 'die', 'das', 'ein', 'eine', 'einen', 'einem', 'und', 'oder', 'mit',
|
|
||||||
'fuer', 'für', 'von', 'im', 'in', 'am', 'an', 'auf', 'zu', 'zum', 'zur',
|
|
||||||
'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät',
|
|
||||||
'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte',
|
|
||||||
'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung',
|
|
||||||
'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend',
|
|
||||||
'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher',
|
|
||||||
'welches', 'brauche', 'suche', 'bitte', 'fuer', 'gegen', 'und', 'oder',
|
|
||||||
];
|
|
||||||
|
|
||||||
return isset(array_fill_keys($generic, true)[$token]);
|
return isset(array_fill_keys($generic, true)[$token]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1372,7 +1338,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
*/
|
*/
|
||||||
private function isImportantShortModelToken(string $token): bool
|
private function isImportantShortModelToken(string $token): bool
|
||||||
{
|
{
|
||||||
static $allowed = ['th', 'tc', 'tp', 'tm', 'ph', 'rx'];
|
static $allowed = NdjsonHybridRetrieverConfig::IMPORTANT_SHORT_MODEL_TOKEN;
|
||||||
|
|
||||||
return in_array($token, $allowed, true);
|
return in_array($token, $allowed, true);
|
||||||
}
|
}
|
||||||
@@ -1382,10 +1348,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
*/
|
*/
|
||||||
private function isFamilyDescriptorToken(string $token): bool
|
private function isFamilyDescriptorToken(string $token): bool
|
||||||
{
|
{
|
||||||
static $familyDescriptors = [
|
static $familyDescriptors = NdjsonHybridRetrieverConfig::FAMILY_DESCRIPTOR_TOKEN;
|
||||||
'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab',
|
|
||||||
'inline', 'compact', 'panel', 'sc',
|
|
||||||
];
|
|
||||||
|
|
||||||
return in_array($token, $familyDescriptors, true)
|
return in_array($token, $familyDescriptors, true)
|
||||||
|| $this->isImportantShortModelToken($token)
|
|| $this->isImportantShortModelToken($token)
|
||||||
@@ -1403,12 +1366,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
$needles = [
|
$needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_TOKENS;
|
||||||
'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie',
|
|
||||||
'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche',
|
|
||||||
'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz',
|
|
||||||
'kerzenfilter', 'druckregler',
|
|
||||||
];
|
|
||||||
|
|
||||||
foreach ($needles as $needle) {
|
foreach ($needles as $needle) {
|
||||||
if (str_contains($haystack, $needle)) {
|
if (str_contains($haystack, $needle)) {
|
||||||
@@ -1430,11 +1388,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
$needles = [
|
$needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_DOCS;
|
||||||
'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung',
|
|
||||||
'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp',
|
|
||||||
'kennzeichnung', 'h290', 'pbt', 'vpvb',
|
|
||||||
];
|
|
||||||
|
|
||||||
foreach ($needles as $needle) {
|
foreach ($needles as $needle) {
|
||||||
if (str_contains($haystack, $needle)) {
|
if (str_contains($haystack, $needle)) {
|
||||||
|
|||||||
Reference in New Issue
Block a user