Harden retrieval logic

This commit is contained in:
team 1
2026-04-17 14:52:53 +02:00
parent ae2b52ad18
commit 5c9d81adeb
4 changed files with 838 additions and 141 deletions

View File

@@ -1,21 +1,68 @@
<?php
declare(strict_types=1);
namespace App\Config;
/**
 * Static tuning constants for the NDJSON hybrid retriever.
 *
 * The class only holds constants; it is declared final because it is not
 * meant to be extended. How each value is consumed is defined by the
 * retriever implementation, which lives outside this file.
 *
 * Invariant to keep when editing:
 * THRESHOLD_FLOOR < VECTOR_SCORE_THRESHOLD <= THRESHOLD_CEIL.
 */
final class NdjsonHybridRetrieverConfig
{
    /**
     * Default semantic similarity threshold for vector hits.
     *
     * Chosen to stay selective enough for product-family-heavy data
     * while not cutting off too many useful fallback hits.
     */
    public const VECTOR_SCORE_THRESHOLD = 0.80;

    /**
     * Absolute safety caps.
     *
     * These limits protect the retriever from overly large candidate sets
     * even if runtime config values are set too high.
     */
    public const HARD_MAX_CHUNKS = 72;
    public const HARD_MAX_VECTORK = 180;

    /**
     * List-style queries benefit from a slightly wider candidate pool
     * before de-duplication and final selection.
     */
    public const LIST_BONUS = 1.40;

    /**
     * Selection rules for cross-document semantic retrieval.
     *
     * MAX_CHUNKS_PER_DOC:
     * Keeps one document from dominating the final result in normal
     * semantic retrieval mode.
     *
     * MIN_CHUNK_DISTANCE:
     * Allows nearby chunks to be selected when they are still meaningfully
     * distinct, which is important for compact product sheets.
     */
    public const MAX_CHUNKS_PER_DOC = 3;
    public const MIN_CHUNK_DISTANCE = 1.0;

    /**
     * Reciprocal Rank Fusion constant.
     *
     * Slightly lower than classic defaults so top-ranked hits matter more.
     */
    public const RRF_K = 50;

    /**
     * Dynamic threshold clamp boundaries.
     *
     * The floor must stay below the default threshold, otherwise the
     * configured base threshold becomes ineffective.
     */
    public const THRESHOLD_FLOOR = 0.78;
    public const THRESHOLD_CEIL = 0.90;

    /**
     * Fallback breadth when strict thresholding removes all fused hits.
     *
     * More than one fallback result makes the retriever less brittle.
     */
    public const EMPTY_RRF_FALLBACK_TOPN = 3;
}