lexical logic

This commit is contained in:
team2
2026-04-20 21:46:42 +02:00
parent 2587ac8b4b
commit 065f59c090
9 changed files with 2576 additions and 326 deletions

View File

@@ -7,62 +7,96 @@ namespace App\Config;
/**
 * Tuning constants for the NDJSON hybrid retriever
 * (purpose inferred from the class and constant names - confirm against callers).
 *
 * NOTE(review): this listing looks like a commit diff rendered without its
 * added/removed markers. Every constant except RRF_K is declared twice
 * (THRESHOLD_CEIL with the same value both times, the others with different
 * values), and most docblocks carry two descriptions back to back.
 * PHP does not allow a class constant to be redeclared within one class, so
 * this text cannot be loaded as-is; confirm which declaration of each pair is
 * the committed one before relying on any value below.
 */
final class NdjsonHybridRetrieverConfig
{
/**
* Default semantic similarity threshold for vector hits.
* Maximum number of chunks the retriever may finally hand to the model.
*
* Chosen to stay selective enough for product-family-heavy data
* while not cutting off too many useful fallback hits.
* Rationale:
* - enough room for the stronger hybrid pipeline
* - still conservative enough to avoid prompt bloat
*
* NOTE(review): VECTOR_SCORE_THRESHOLD is declared again further down as 0.81
* and HARD_MAX_CHUNKS as 72 - confirm which values are current.
*/
public const VECTOR_SCORE_THRESHOLD = 0.83;
public const HARD_MAX_CHUNKS = 6;
/**
* Absolute safety caps.
* Hard upper bound for vector retrieval candidate size.
*
* These limits protect the retriever from overly large candidate sets
* even if runtime config values are set too high.
* Rationale:
* - the pipeline now combines primary vector, secondary vector,
* lexical, scoped retrieval and re-ranking
* - the old limit would constrain recall too early
* - still capped to keep latency controlled
*
* NOTE(review): HARD_MAX_VECTORK is listed as both 180 and 18. The rationale
* above implies the cap was raised relative to the old limit - confirm which
* value is the committed one. HARD_MAX_CHUNKS = 72 duplicates the earlier
* declaration (6).
*/
public const HARD_MAX_CHUNKS = 72;
public const HARD_MAX_VECTORK = 180;
public const HARD_MAX_VECTORK = 18;
/**
* List-style queries benefit from a slightly wider candidate pool
* before de-duplication and final selection.
* Default semantic score threshold for vector hits.
*
* Rationale:
* - slightly relaxed compared to stricter pure-vector setups
* - the system now has more safeguards:
* lexical cross-signals, scoped retrieval, title/meta boost, selection rules
*
* NOTE(review): LIST_BONUS is declared again further down as 1.35, and
* VECTOR_SCORE_THRESHOLD was already declared above as 0.83.
*/
public const LIST_BONUS = 1.25;
public const VECTOR_SCORE_THRESHOLD = 0.81;
/**
* Selection rules for cross-document semantic retrieval.
* Lower safety boundary for dynamic threshold adjustments.
*
* MAX_CHUNKS_PER_DOC:
* Keeps one document from dominating the final result in normal
* semantic retrieval mode.
*
* MIN_CHUNK_DISTANCE:
* Allows nearby chunks to be selected when they are still meaningfully
* distinct, which is important for compact product sheets.
* Rationale:
* - prevents the system from getting too noisy in fallback cases
* - still allows recovery when exact signals are sparse
*
* NOTE(review): MAX_CHUNKS_PER_DOC (2), MIN_CHUNK_DISTANCE (2) and
* THRESHOLD_FLOOR (0.78) are each declared again further down.
* MIN_CHUNK_DISTANCE is a float literal here (1.0) and an int literal (2)
* in the later declaration - check callers that compare strictly.
*/
public const MAX_CHUNKS_PER_DOC = 3;
public const MIN_CHUNK_DISTANCE = 1.0;
public const THRESHOLD_FLOOR = 0.75;
/**
* Upper safety boundary for dynamic threshold adjustments.
*
* Rationale:
* - protects objection/pricing/list adjustments from becoming too strict
* - keeps retrieval from collapsing into empty result sets too easily
*
* NOTE(review): declared a second time further down with the same value (0.90).
*/
public const THRESHOLD_CEIL = 0.90;
/**
* Additional candidate expansion factor for list-like prompts.
*
* Rationale:
* - list requests benefit from wider candidate recall
* - too high would create noise across multiple retrieval channels
*
* NOTE(review): duplicates the earlier LIST_BONUS declaration (1.25).
*/
public const LIST_BONUS = 1.35;
/**
* Reciprocal Rank Fusion constant.
*
* Slightly lower than classic defaults so top-ranked hits matter more.
* Rationale:
* - keep rank importance meaningful
* - but not so aggressive that one retrieval source dominates too hard
*
* This is the only constant in the listing that is declared exactly once.
*/
public const RRF_K = 50;
/**
* Dynamic threshold clamp boundaries.
* Fallback size when thresholded fusion yields no candidates.
*
* The floor must stay below the default threshold, otherwise the
* configured base threshold becomes ineffective.
* Rationale:
* - slightly larger safety net for the richer hybrid stack
* - helps no-tag and low-signal cases without exploding context
*
* Both listed floors (0.75, 0.78) are below both listed default thresholds
* (0.81, 0.83), so the stated floor-below-threshold rule holds for any pairing.
* NOTE(review): THRESHOLD_FLOOR and THRESHOLD_CEIL repeat earlier declarations;
* EMPTY_RRF_FALLBACK_TOPN is declared again below as 3.
*/
public const THRESHOLD_FLOOR = 0.78;
public const THRESHOLD_CEIL = 0.90;
public const EMPTY_RRF_FALLBACK_TOPN = 5;
/**
* Fallback breadth when strict thresholding removes all fused hits.
* Maximum number of chunks allowed from one document in spread mode.
*
* More than one fallback result makes the retriever less brittle.
* Rationale:
* - preserve diversity across documents
* - still allow coherent multi-chunk retrieval from strong sources
*
* NOTE(review): EMPTY_RRF_FALLBACK_TOPN (5) and MAX_CHUNKS_PER_DOC (3) were
* already declared above with different values.
*/
public const EMPTY_RRF_FALLBACK_TOPN = 3;
public const MAX_CHUNKS_PER_DOC = 2;
/**
* Minimum distance between chunk indices from the same document
* during spread-style selection.
*
* Rationale:
* - reduce near-duplicate neighboring chunks
* - still allow relevant continuation when needed
*
* NOTE(review): duplicates the earlier MIN_CHUNK_DISTANCE declaration (1.0).
*/
public const MIN_CHUNK_DISTANCE = 2;
}