lexical logic
This commit is contained in:
@@ -7,62 +7,96 @@ namespace App\Config;
|
||||
/**
 * Static tuning constants for the NDJSON hybrid retriever.
 *
 * Every constant is declared exactly once. The values are hard limits and
 * defaults; the rationale for each is given on the constant itself.
 */
final class NdjsonHybridRetrieverConfig
{
    /**
     * Maximum number of chunks the retriever may finally hand to the model.
     *
     * Rationale:
     * - enough room for the stronger hybrid pipeline
     * - still conservative enough to avoid prompt bloat
     */
    public const HARD_MAX_CHUNKS = 6;

    /**
     * Hard upper bound for vector retrieval candidate size.
     *
     * Rationale:
     * - the pipeline now combines primary vector, secondary vector,
     *   lexical, scoped retrieval and re-ranking
     * - the old limit would constrain recall too early
     * - still capped to keep latency controlled
     */
    public const HARD_MAX_VECTORK = 18;

    /**
     * Default semantic score threshold for vector hits.
     *
     * Rationale:
     * - slightly relaxed compared to stricter pure-vector setups
     * - the system now has more safeguards:
     *   lexical cross-signals, scoped retrieval, title/meta boost, selection rules
     */
    public const VECTOR_SCORE_THRESHOLD = 0.81;

    /**
     * Lower safety boundary for dynamic threshold adjustments.
     *
     * Must stay below VECTOR_SCORE_THRESHOLD, otherwise the configured base
     * threshold becomes ineffective.
     *
     * Rationale:
     * - prevents the system from getting too noisy in fallback cases
     * - still allows recovery when exact signals are sparse
     */
    public const THRESHOLD_FLOOR = 0.75;

    /**
     * Upper safety boundary for dynamic threshold adjustments.
     *
     * Rationale:
     * - protects objection/pricing/list adjustments from becoming too strict
     * - keeps retrieval from collapsing into empty result sets too easily
     */
    public const THRESHOLD_CEIL = 0.90;

    /**
     * Additional candidate expansion factor for list-like prompts.
     *
     * Rationale:
     * - list requests benefit from wider candidate recall
     * - too high would create noise across multiple retrieval channels
     */
    public const LIST_BONUS = 1.35;

    /**
     * Reciprocal Rank Fusion constant.
     *
     * Rationale:
     * - keep rank importance meaningful
     * - but not so aggressive that one retrieval source dominates too hard
     */
    public const RRF_K = 50;

    /**
     * Fallback size when thresholded fusion yields no candidates.
     *
     * Rationale:
     * - slightly larger safety net for the richer hybrid stack
     * - helps no-tag and low-signal cases without exploding context
     */
    public const EMPTY_RRF_FALLBACK_TOPN = 5;

    /**
     * Maximum number of chunks allowed from one document in spread mode.
     *
     * Rationale:
     * - preserve diversity across documents
     * - still allow coherent multi-chunk retrieval from strong sources
     */
    public const MAX_CHUNKS_PER_DOC = 2;

    /**
     * Minimum distance between chunk indices from the same document
     * during spread-style selection.
     *
     * Rationale:
     * - reduce near-duplicate neighboring chunks
     * - still allow relevant continuation when needed
     */
    public const MIN_CHUNK_DISTANCE = 2;
}
|
||||
@@ -1,22 +1,180 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Config;
|
||||
|
||||
/**
 * Holds the query-enrichment vocabulary and exposes it as a flat,
 * normalized term => term mapping.
 */
final readonly class QueryEnricherConfig
{
    /**
     * Keep the enrichment vocabulary in the class for now.
     *
     * Important:
     * - This is intentionally NOT externalized yet.
     * - Add or maintain the current project-specific mappings here.
     * - The later move to external config/files can happen separately.
     *
     * Supported shapes (both may be mixed in the same list):
     *
     * 1) Simple mapping (string key => string value):
     *    [
     *        'water hardness' => 'residual hardness',
     *        'device'         => 'instrument',
     *    ]
     *
     * 2) Small synonym groups (list entry without a string key):
     *    [
     *        ['water hardness', 'residual hardness', 'hardness'],
     *        ['device', 'instrument', 'meter'],
     *    ]
     *
     * The public API stays intentionally simple:
     * - getEnrichQueryList(): array<string, string>
     *
     * Replace the example entries below with your real project mappings.
     *
     * @var array<int|string, string|list<string>>
     */
    private const ENRICH_QUERY_LIST = [
        // -----------------------------------------------------------------
        // Example mappings.
        // Replace / extend these with your current real project mappings.
        // -----------------------------------------------------------------

        'water hardness' => 'residual hardness',
        'device' => 'instrument',
        'gerät' => 'produkt',
        'rebuild' => 'reindex',

        ['measuring device', 'meter', 'instrument'],
    ];

    /**
     * Returns a normalized, deduplicated mapping for the QueryEnricher.
     *
     * Output format:
     * [
     *     'term a' => 'term b',
     *     'term c' => 'term d',
     * ]
     *
     * Rules:
     * - ignore empty / invalid values
     * - trim, lowercase and collapse whitespace
     * - ignore self-mappings
     * - preserve the first valid rule if duplicates normalize to the same key
     *
     * Entries are processed in declaration order, so an earlier entry wins
     * over a later one that normalizes to the same left-hand term.
     *
     * @return array<string, string>
     */
    public function getEnrichQueryList(): array
    {
        $normalized = [];

        foreach (self::ENRICH_QUERY_LIST as $key => $value) {
            // Array values are synonym groups; their (integer) key is irrelevant.
            if (is_array($value)) {
                $this->ingestGroup($normalized, $value);

                continue;
            }

            // A non-string key or value normalizes to '' and is rejected by addRule().
            $this->addRule(
                $normalized,
                $this->normalizePhrase(is_string($key) ? $key : ''),
                $this->normalizePhrase(is_string($value) ? $value : ''),
            );
        }

        return $normalized;
    }

    /**
     * Returns true when at least one valid enrichment rule exists.
     */
    public function hasRules(): bool
    {
        return $this->getEnrichQueryList() !== [];
    }

    /**
     * Expands one synonym group into pair rules and adds them to $normalized.
     *
     * Non-string and empty members are skipped and duplicates (after
     * normalization) are collapsed, keeping first-seen order. The remaining
     * members become a conservative chain:
     *
     *     ['a', 'b', 'c'] => a => b, b => c
     *
     * The output stays intentionally small; QueryEnricher is expected to
     * build a bidirectional lookup from these pairs later.
     * Groups with fewer than two usable members produce no rules.
     *
     * @param array<string, string>    $normalized mapping being built (modified in place)
     * @param array<int|string, mixed> $group      raw synonym group from the vocabulary
     */
    private function ingestGroup(array &$normalized, array $group): void
    {
        $unique = [];

        foreach ($group as $member) {
            if (!is_string($member)) {
                continue;
            }

            $phrase = $this->normalizePhrase($member);

            if ($phrase !== '') {
                // Keyed by phrase to deduplicate while preserving order.
                $unique[$phrase] = $phrase;
            }
        }

        $chain = array_values($unique);

        // Starting at 1 means groups with fewer than two members add nothing.
        for ($i = 1, $count = count($chain); $i < $count; $i++) {
            $this->addRule($normalized, $chain[$i - 1], $chain[$i]);
        }
    }

    /**
     * Adds the rule $left => $right unless the pair is invalid or $left
     * already has a rule (first rule wins).
     *
     * @param array<string, string> $normalized mapping being built (modified in place)
     */
    private function addRule(array &$normalized, string $left, string $right): void
    {
        if (!$this->isValidPair($left, $right) || isset($normalized[$left])) {
            return;
        }

        $normalized[$left] = $right;
    }

    /**
     * A pair is valid when both sides are non-empty and differ from each other.
     */
    private function isValidPair(string $left, string $right): bool
    {
        return $left !== '' && $right !== '' && $left !== $right;
    }

    /**
     * Normalizes a phrase: trim, lowercase (UTF-8), collapse whitespace runs
     * to a single space. Returns '' for blank input.
     */
    private function normalizePhrase(string $value): string
    {
        $value = trim($value);

        if ($value === '') {
            return '';
        }

        $value = mb_strtolower($value, 'UTF-8');

        // preg_replace() returns null on failure (e.g. malformed UTF-8 with
        // the /u modifier); fall back to the un-collapsed value in that case.
        $value = preg_replace('/\s+/u', ' ', $value) ?? $value;

        return trim($value);
    }
}
|
||||
Reference in New Issue
Block a user