From 86caae55520c9d107a548577494048a2c4fc4f1d Mon Sep 17 00:00:00 2001 From: team 1 Date: Tue, 21 Apr 2026 18:22:57 +0200 Subject: [PATCH] cleanup code --- src/Config/NdjsonHybridRetrieverConfig.php | 65 ++++++++++++++ .../Retrieval/NdjsonHybridRetriever.php | 84 +++++-------------- 2 files changed, 84 insertions(+), 65 deletions(-) diff --git a/src/Config/NdjsonHybridRetrieverConfig.php b/src/Config/NdjsonHybridRetrieverConfig.php index 2b8d66b..215cd99 100644 --- a/src/Config/NdjsonHybridRetrieverConfig.php +++ b/src/Config/NdjsonHybridRetrieverConfig.php @@ -99,4 +99,69 @@ final class NdjsonHybridRetrieverConfig * - still allow relevant continuation when needed */ public const MIN_CHUNK_DISTANCE = 2; + + /** + * When one document clearly dominates the top-ranked window, + * temporarily switch from "spread" mode to "dominant document" mode. + */ + public const DOMINANT_DOC_WINDOW = 6; + public const DOMINANT_DOC_MIN_HITS = 3; + public const DOMINANT_DOC_MAX_CHUNKS = 4; + public const EXACT_DOCUMENT_MAX_CHUNKS = 6; + public const FOCUSED_PRODUCT_WINDOW = 8; + public const FOCUSED_PRODUCT_MIN_SCORE = 10.0; + public const FOCUSED_PRODUCT_MIN_GAP = 4.0; + public const FOCUSED_PRODUCT_MAX_CHUNKS = 4; + + public const GENERIC_PRODUCT_TOKEN = [ + 'der', 'die', 'das', 'ein', 'eine', 'einen', 'einem', 'und', 'oder', 'mit', + 'fuer', 'für', 'von', 'im', 'in', 'am', 'an', 'auf', 'zu', 'zum', 'zur', + 'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät', + 'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte', + 'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung', + 'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend', + 'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher', + 'welches', 'brauche', 'suche', 'bitte', 'gegen', + ]; + + public const IMPORTANT_SHORT_MODEL_TOKEN = ['th', 'tc', 'tp', 'tm', 'ph', 'rx']; + + public const FAMILY_DESCRIPTOR_TOKEN = [ + 'evo', 'eco', 'self', 'clean', 
'mini', 'pro', 'plus', 'basic', 'lab', + 'inline', 'compact', 'panel', 'sc', + ]; + + public const LOOKS_LIKE_REAGENT_TOKENS = [ + 'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie', + 'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche', + 'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz', + 'kerzenfilter', 'druckregler', + ]; + + public const LOOKS_LIKE_SAFETY_DOCS = [ + 'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung', + 'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp', + 'kennzeichnung', 'h290', 'pbt', 'vpvb', + ]; + + public const LOOKS_LIKE_REAGENT_WORDS = [ + 'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb', + 'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde', + ]; + + public const LOOKS_LIKE_DOCUMENT_WORDS = [ + 'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung', + 'sdb', 'sicherheitsdatenblatt', 'msds', + ]; + + public const LOOKS_LIKE_SAFETY_WORDS = [ + 'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung', + 'transport', 'lagerung', 'piktogramm', + ]; + + public const LOOKS_LIKE_DEVICE_WORDS = [ + 'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat', + 'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor', + ]; + + } \ No newline at end of file diff --git a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php index fa55f3f..bc4ef2c 100644 --- a/src/Knowledge/Retrieval/NdjsonHybridRetriever.php +++ b/src/Knowledge/Retrieval/NdjsonHybridRetriever.php @@ -31,18 +31,6 @@ use RuntimeException; */ final readonly class NdjsonHybridRetriever implements RetrieverInterface { - /** - * When one document clearly dominates the top-ranked window, - * temporarily switch from "spread" mode to "dominant document" mode. 
- */ - private const DOMINANT_DOC_WINDOW = 6; - private const DOMINANT_DOC_MIN_HITS = 3; - private const DOMINANT_DOC_MAX_CHUNKS = 4; - private const EXACT_DOCUMENT_MAX_CHUNKS = 6; - private const FOCUSED_PRODUCT_WINDOW = 8; - private const FOCUSED_PRODUCT_MIN_SCORE = 10.0; - private const FOCUSED_PRODUCT_MIN_GAP = 4.0; - private const FOCUSED_PRODUCT_MAX_CHUNKS = 4; public function __construct( private NdjsonChunkLookup $lookup, @@ -588,7 +576,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface }); $selected = []; - $max = min($limit, self::EXACT_DOCUMENT_MAX_CHUNKS); + $max = min($limit, NdjsonHybridRetrieverConfig::EXACT_DOCUMENT_MAX_CHUNKS); foreach ($rows as $row) { $chunkId = $row['chunk_id'] ?? null; @@ -751,7 +739,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $candidates = []; $seenDocs = []; - foreach (array_slice($chunkIds, 0, self::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) { + foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_WINDOW) as $rank => $chunkId) { $row = $rows[$chunkId] ?? 
null; if (!is_array($row)) { continue; @@ -797,7 +785,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $bestScore = (float)$best['score']; $gap = $bestScore - $runnerUpScore; - if ($bestScore < self::FOCUSED_PRODUCT_MIN_SCORE || $gap < self::FOCUSED_PRODUCT_MIN_GAP) { + if ($bestScore < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_SCORE || $gap < NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MIN_GAP) { return null; } @@ -825,22 +813,10 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $normalized = $this->normalizeText($prompt); $tokens = $this->tokenizeText($normalized); - $reagentWords = [ - 'indikator', 'reagenz', 'reagens', 'chemie', 'chemikalie', 'sdb', - 'sicherheitsdatenblatt', 'msds', 'flasche', 'gebinde', - ]; - $documentWords = [ - 'datenblatt', 'dokument', 'pdf', 'handbuch', 'manual', 'beschreibung', - 'sdb', 'sicherheitsdatenblatt', 'msds', - ]; - $safetyWords = [ - 'gefahr', 'gefahrgut', 'clp', 'h290', 'sicherheit', 'kennzeichnung', - 'transport', 'lagerung', 'piktogramm', - ]; - $deviceWords = [ - 'geraet', 'gerät', 'messgeraet', 'messgerät', 'analysator', 'automat', - 'messung', 'messen', 'ueberwachung', 'überwachung', 'online', 'monitor', - ]; + $reagentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_WORDS; + $documentWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DOCUMENT_WORDS; + $safetyWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_WORDS; + $deviceWords = NdjsonHybridRetrieverConfig::LOOKS_LIKE_DEVICE_WORDS; $asksReagent = $this->containsAnyToken($tokens, $reagentWords); $asksDocument = $this->containsAnyToken($tokens, $documentWords); @@ -972,16 +948,16 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface */ private function selectFocusedProductChunkIds( string $documentId, - array $chunkIds, - array $rows, - int $limit + array $chunkIds, + array $rows, + int $limit ): array { return $this->selectDominantDocumentChunkIds( $documentId, $chunkIds, 
$rows, - min($limit, self::FOCUSED_PRODUCT_MAX_CHUNKS) + min($limit, NdjsonHybridRetrieverConfig::FOCUSED_PRODUCT_MAX_CHUNKS) ); } @@ -996,7 +972,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface { $docWindow = []; - foreach (array_slice($chunkIds, 0, self::DOMINANT_DOC_WINDOW) as $chunkId) { + foreach (array_slice($chunkIds, 0, NdjsonHybridRetrieverConfig::DOMINANT_DOC_WINDOW) as $chunkId) { if (!isset($rows[$chunkId]['text'])) { continue; } @@ -1026,7 +1002,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface $dominantCount = (int)($counts[$dominantDocId] ?? 0); - if ($dominantCount >= self::DOMINANT_DOC_MIN_HITS) { + if ($dominantCount >= NdjsonHybridRetrieverConfig::DOMINANT_DOC_MIN_HITS) { return $dominantDocId; } @@ -1088,7 +1064,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return []; } - $maxFromDoc = min($limit, self::DOMINANT_DOC_MAX_CHUNKS); + $maxFromDoc = min($limit, NdjsonHybridRetrieverConfig::DOMINANT_DOC_MAX_CHUNKS); if ($anchorChunkIndex !== null) { usort($docHits, static function (array $a, array $b) use ($anchorChunkIndex): int { @@ -1353,17 +1329,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface */ private function isGenericProductToken(string $token): bool { - static $generic = [ - 'der', 'die', 'das', 'ein', 'eine', 'einen', 'einem', 'und', 'oder', 'mit', - 'fuer', 'für', 'von', 'im', 'in', 'am', 'an', 'auf', 'zu', 'zum', 'zur', - 'produkt', 'produkte', 'produktkarte', 'titel', 'geraet', 'gerät', - 'messgeraet', 'messgerät', 'wasser', 'haerte', 'härte', 'resthaerte', - 'resthärte', 'analyse', 'analysator', 'automat', 'online', 'messung', - 'messen', 'preis', 'preise', 'kosten', 'info', 'infos', 'passend', - 'richtige', 'richtiges', 'geeignet', 'geeignete', 'welche', 'welcher', - 'welches', 'brauche', 'suche', 'bitte', 'fuer', 'gegen', 'und', 'oder', - ]; - + static $generic = NdjsonHybridRetrieverConfig::GENERIC_PRODUCT_TOKEN; 
return isset(array_fill_keys($generic, true)[$token]); } @@ -1372,7 +1338,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface */ private function isImportantShortModelToken(string $token): bool { - static $allowed = ['th', 'tc', 'tp', 'tm', 'ph', 'rx']; + static $allowed = NdjsonHybridRetrieverConfig::IMPORTANT_SHORT_MODEL_TOKEN; return in_array($token, $allowed, true); } @@ -1382,10 +1348,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface */ private function isFamilyDescriptorToken(string $token): bool { - static $familyDescriptors = [ - 'evo', 'eco', 'self', 'clean', 'mini', 'pro', 'plus', 'basic', 'lab', - 'inline', 'compact', 'panel', 'sc', - ]; + static $familyDescriptors = NdjsonHybridRetrieverConfig::FAMILY_DESCRIPTOR_TOKEN; return in_array($token, $familyDescriptors, true) || $this->isImportantShortModelToken($token) @@ -1403,12 +1366,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return false; } - $needles = [ - 'indikator', 'reagenz', 'reagens', 'laborchemikalie', 'chemikalie', - 'sicherheitsdatenblatt', 'sdb', 'msds', 'ufi', 'gebinde', 'flasche', - 'ersatzteil', 'zubehoer', 'zubehör', 'service set', 'filtereinsatz', - 'kerzenfilter', 'druckregler', - ]; + $needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_REAGENT_TOKENS; foreach ($needles as $needle) { if (str_contains($haystack, $needle)) { @@ -1430,11 +1388,7 @@ final readonly class NdjsonHybridRetriever implements RetrieverInterface return false; } - $needles = [ - 'sicherheitsdatenblatt', 'sdb', 'msds', 'gefahrenbewertung', - 'gefahrenpiktogramm', 'signalwort', 'lagerung', 'transport', 'clp', - 'kennzeichnung', 'h290', 'pbt', 'vpvb', - ]; + $needles = NdjsonHybridRetrieverConfig::LOOKS_LIKE_SAFETY_DOCS; foreach ($needles as $needle) { if (str_contains($haystack, $needle)) {