Skip to content

Lens Parser

Status & scope

  • Stage: POC — VRS Use Case
  • Module: parallax/ops/fusion/lens_parser.py
  • Milestone: M0 (foundation for everything)

Purpose

Parse a Semantic Lens YAML file into a structured Python object that drives the rest of the fusion pipeline. The parser output feeds two consumers:

  1. Feature Extraction (component.parallax.feature-extraction) — which parallax operations to run
  2. Scoring Engine (component.parallax.scoring-engine) — which metrics, weights, and thresholds to use

Input

YAML file following the 7-component Semantic Lens format. For POC, the VRS vulnerability lens:

# VRS vulnerability lens (POC). Top-level header keys identify the lens;
# each following section maps to one config object in the parsed LensSpec.
lens_id: vrs_vulnerability_v1
version: "1.0.0"
name: VRS Vulnerability Cross-Reference
domain: social_care

scope:
  object_types: [social_care.vulnerable_person]
  time_window: rolling_24h
  federation: all_participants

field_groupings:
  # Transform strings with parameters use the pipe syntax
  # `type | key: value`. They MUST be quoted: an unquoted `: ` inside a
  # plain scalar is read by YAML as a mapping indicator, so the value would
  # either fail to load or load as a mapping instead of a string.
  identity_hints:
    - field: full_name
      transform: jaro_winkler
      label: Full Name
    - field: date_of_birth
      transform: exact
      label: Date of Birth
    - field: postcode
      transform: "geo_prefix | chars: 3"
      label: Postcode Area
    - field: phone
      transform: "hash | algorithm: sha256"
      label: Phone (hashed)
    - field: email
      transform: "hash | algorithm: sha256"
      label: Email (hashed)
  context_fields:
    - field: vulnerability_category
      transform: exact
      label: Vulnerability Type
  suppressed_fields:
    - vulnerability_status
    - case_notes
    - risk_score

identity_fusion:
  # Weights below sum to 1.0 (0.25 + 0.30 + 0.15 + 0.15 + 0.15).
  match_function:
    name_match:
      field: full_name
      weight: 0.25
      metric: jaro_winkler
    dob_match:
      field: date_of_birth
      weight: 0.30
      metric: exact
    postcode_match:
      field: postcode
      weight: 0.15
      metric: geo_prefix
    phone_match:
      field: phone
      weight: 0.15
      metric: exact
    email_match:
      field: email
      weight: 0.15
      metric: exact
  initial_threshold: 0.65
  confirmation_threshold: 0.80
  blocking_strategy:
    method: hash_blocking
    # Quoted for the same reason as the transforms above: unquoted, each
    # item would load as a one-key mapping ({"soundex | field": "full_name"})
    # rather than as a string.
    blocking_keys:
      - "soundex | field: full_name"
      - "year | field: date_of_birth"

evidence_rules:
  min_independent_sources: 2
  max_age: 24h
  provenance_required: true

policy_envelope:
  access: restricted
  field_suppression: [vulnerability_status, case_notes, risk_score]
  audit_level: full

output_semantics:
  correlation_type: vrs_cross_reference
  signal_severity: high
  signal_type: fusion_vulnerability_match
  display_fields: [full_name, date_of_birth, postcode]
  evidence_class: vrs_fusion_correlation

Output

LensSpec dataclass

# NOTE(review): every section type annotated below (ScopeConfig, FieldGroupings, ...)
# is declared AFTER this class in this listing. As written that needs postponed
# annotation evaluation (`from __future__ import annotations`) or a different
# declaration order in lens_types.py — confirm against the real module.
@dataclass(frozen=True)
class LensSpec:
    """Immutable, structured form of one Semantic Lens.

    Holds the lens header fields plus one config object per lens section.
    """
    # Header fields (top-level scalar keys of the lens YAML).
    lens_id: str             # e.g. "vrs_vulnerability_v1"; V-10: non-empty, alphanumeric + underscore
    version: str             # e.g. "1.0.0"
    name: str                # human-readable name
    domain: str              # e.g. "social_care"
    # One object per lens section, in the same order as the YAML.
    scope: ScopeConfig
    field_groupings: FieldGroupings
    identity_fusion: IdentityFusionConfig
    evidence_rules: EvidenceRulesConfig
    policy_envelope: PolicyEnvelope
    output_semantics: OutputSemantics

@dataclass(frozen=True)
class ScopeConfig:
    """Parsed `scope` section of a lens."""
    object_types: list[str]  # e.g. ["social_care.vulnerable_person"]
    time_window: str         # e.g. "rolling_24h"
    federation: str          # e.g. "all_participants"

@dataclass(frozen=True)
class FieldGroupings:
    """Parsed `field_groupings` section of a lens."""
    identity_hints: list[FieldHint]   # fields that match_function entries may reference (V-04)
    context_fields: list[FieldHint]   # same entry shape as identity_hints
    suppressed_fields: list[str]      # plain field names; must never appear in identity_hints (V-05)

@dataclass(frozen=True)
class FieldHint:
    """One entry of `identity_hints` or `context_fields`.

    Keeps the raw transform string alongside its parsed type and parameters.
    """
    field: str               # source field name, e.g. "postcode"
    transform: str           # raw transform string, e.g. "hash | algorithm: sha256"
    label: str               # display label, e.g. "Postcode Area"
    transform_type: str      # parsed: "hash", "jaro_winkler", "exact", "geo_prefix"
    transform_params: dict   # parsed: {"algorithm": "sha256"} or {}

@dataclass(frozen=True)
class MatchField:
    """One weighted comparison from `identity_fusion.match_function`."""
    name: str                # key name, e.g. "name_match"
    field_ref: str           # source field (component.parallax.fusion-binding: may be model-qualified,
                             # e.g. "full_name" or "customer.full_name")
    weight: float            # contribution to the match score; all weights sum to 1.0 ±0.001 (V-01)
    metric: str              # "jaro_winkler", "exact", "geo_prefix"
    metric_params: dict      # e.g. {"max_distance": 3} or {}

# YAML key compatibility: the parser reads `field_ref:` preferentially and
# falls back to `field:` for pre-binding-v2 lens YAML.

@dataclass(frozen=True)
class BlockingKey:
    """One key of a blocking pass: a blocking method applied to a field."""
    method: str              # "soundex", "year", "geo_prefix"
    field: str               # field the method is applied to, e.g. "full_name"
    params: dict             # additional parameters for the method; {} when none

# NOTE(review): BlockingStrategy is declared after this class in this listing, and
# `BlockingStrategy | None` is evaluated when the class body runs unless annotations
# are postponed (`from __future__ import annotations`) — confirm the declaration
# order or the future import in lens_types.py.
@dataclass(frozen=True)
class IdentityFusionConfig:
    """Parsed `identity_fusion` section: match function, thresholds, blocking."""
    match_function: list[MatchField]             # one MatchField per match_function key
    initial_threshold: float                     # must be < confirmation_threshold (V-02), within [0.0, 1.0] (V-03)
    confirmation_threshold: float                # within [0.0, 1.0] (V-03)
    blocking_strategy: BlockingStrategy | None   # optional — a lens may declare no blocking

@dataclass(frozen=True)
class BlockingStrategy:
    """Parsed `blocking_strategy`: the method and one or more blocking passes."""
    method: str              # "hash_blocking"
    blocking_keys: tuple[BlockingKey, ...]              # first pass (back-compat)
    blocking_passes: tuple[tuple[BlockingKey, ...], ...] # N passes; candidates UNIONED

@dataclass(frozen=True)
class EvidenceRulesConfig:
    """Parsed `evidence_rules` section of a lens."""
    min_independent_sources: int   # must be >= 2 (V-06)
    max_age: str                   # kept as a string, e.g. "24h"
    provenance_required: bool

@dataclass(frozen=True)
class PolicyEnvelope:
    """Parsed `policy_envelope` section of a lens."""
    access: str                    # e.g. "restricted"
    field_suppression: list[str]   # field names, e.g. ["vulnerability_status", "case_notes", "risk_score"]
    audit_level: str               # e.g. "full"

@dataclass(frozen=True)
class OutputSemantics:
    """Parsed `output_semantics` section of a lens."""
    correlation_type: str      # e.g. "vrs_cross_reference"
    signal_severity: str       # e.g. "high"
    signal_type: str           # e.g. "fusion_vulnerability_match"
    display_fields: list[str]  # e.g. ["full_name", "date_of_birth", "postcode"]
    evidence_class: str        # e.g. "vrs_fusion_correlation"

Public API

def parse_lens(yaml_path: str) -> LensSpec:
    """Parse a lens YAML file into a LensSpec.

    Args:
        yaml_path: Path to a YAML file in the Semantic Lens format.

    Returns:
        The parsed LensSpec.

    Raises:
        LensValidationError: If YAML is malformed or fails validation.
    """

def parse_lens_from_dict(data: dict, validate: bool = False) -> LensSpec:
    """Parse a lens from a Python dict (e.g., from NO_CODE_MODEL hyperparameters).

    This is the path used when the lens config comes from the UI workflow
    rather than a standalone YAML file.

    Args:
        data: Lens configuration with the same keys as the lens YAML.
        validate: Defaults to False. NOTE(review): presumably runs the
            validation rules and raises on failure when True — the effect of
            this flag is not specified here; confirm and document it.

    Returns:
        The parsed LensSpec.
    """

def validate_lens(spec: LensSpec) -> list[ValidationError]:
    """Validate a parsed LensSpec against semantic rules.

    The rules are V-01..V-10 under "Validation Rules". Problems are returned
    as a list of error objects rather than raised.

    NOTE(review): the return annotation names ValidationError, while the file
    layout lists "LensValidationError and subclasses" in lens_errors.py —
    confirm which name is intended.

    Args:
        spec: A LensSpec produced by parse_lens or parse_lens_from_dict.

    Returns empty list if valid.
    """

Validation Rules

| ID | Rule | Error |
|----|------|-------|
| V-01 | match_function weights sum to 1.0 (±0.001) | WeightSumError |
| V-02 | initial_threshold < confirmation_threshold | ThresholdOrderError |
| V-03 | Both thresholds in range [0.0, 1.0] | ThresholdRangeError |
| V-04 | Every match_function field_ref references a field in identity_hints | FieldReferenceError |
| V-05 | suppressed_fields never appear in identity_hints | SuppressionViolation |
| V-06 | min_independent_sources >= 2 | MinSourcesError |
| V-07 | blocking_keys reference valid fields; method (after shorthand expansion) is in SUPPORTED_BLOCKING_METHODS. Plain or explicit-method keys must reference a field in identity_hints. Shorthand keys (<field>_<suffix> where suffix ∈ BLOCKING_SHORTHAND_SUFFIXES) are exempt — they reference extractor-produced derived columns. An absent (None) or key-less blocking_strategy declares nothing to validate: V-07 yields no errors and V-08..V-10 still run. | BlockingFieldError, UnknownBlockingMethodError |
| V-08 | All metric values are in SUPPORTED_METRICS (see catalog below) | UnknownMetricError |
| V-09 | All transform values are in SUPPORTED_TRANSFORMS (see catalog below) | UnknownTransformError |
| V-10 | lens_id is non-empty, alphanumeric + underscore | InvalidLensIdError |

Supported transforms (SUPPORTED_TRANSFORMS)

exact, hash, sha256, hmac, jaro_winkler, geo_prefix, geo_prefix_keyed, postcode_area, soundex, metaphone, nysiis, alias_expand, bloom_filter, numeric_bucket, temporal_round, year, levenshtein, token_set_ratio, cosine, geohash, set_normalize. (levenshtein, token_set_ratio, cosine — like exact/jaro_winkler — are metric-only: the raw value passes through and the name selects the comparison metric.)

Supported metrics (SUPPORTED_METRICS)

exact, jaccard, cosine, levenshtein, jaro_winkler, token_set_ratio, soundex, metaphone, nysiis, sorensen_dice, geo_prefix, geospatial_distance, temporal_proximity, interval_overlap, time_gap, recency_decay, haversine, space_time_cone, address_similarity, category_agreement, geohash_match, source_complementarity, heading_proximity, speed_proximity, uncertainty_aware_distance, numeric_proximity.

Supported blocking methods (SUPPORTED_BLOCKING_METHODS)

exact, hash, soundex, metaphone, nysiis, year, geo_prefix, geohash_prefix, time_bucket (1h default), time_bucket_5m, time_bucket_10m, time_bucket_30m, token_prefix.

Shorthand suffixes on BlockingKey.field (e.g. surname_soundex, dob_year) resolve to the corresponding method via BLOCKING_SHORTHAND_SUFFIXES in blocker.py. The parser validates the resolved method, not the literal method= value.

Multi-pass blocking syntax

blocking_keys accepts two forms; the parser auto-detects which by inspecting the first element:

  • Single-pass (flat list of strings) — one composite blocking pass:

        blocking_keys: [surname_soundex, dob_year]

  • Multi-pass (list of lists of strings) — N independent passes; candidates from every pass are UNIONED:

        blocking_keys:
          - [surname_metaphone, dob_year]
          - [firstname_metaphone, dob_year]
          - [dob_year, postcode_area]

The parser populates BlockingStrategy.blocking_passes: tuple[tuple[BlockingKey, ...], ...]. BlockingStrategy.blocking_keys stays populated with the first pass for back-compat. Pipeline semantics (partial-pass tolerance, UNION) are specified in component.parallax.blocking-engine §"Multi-pass execution".

Transform Parsing

Transform strings use pipe syntax: transform_type | param1: value1 | param2: value2

def parse_transform(raw: str) -> tuple[str, dict]:
    """Split a pipe-syntax transform string into its type and parameters.

    The format is ``transform_type | param1: value1 | param2: value2``, e.g.
    'hash | algorithm: sha256' -> ('hash', {'algorithm': 'sha256'}).

    Args:
        raw: The raw transform string.

    Returns:
        A ``(transform_type, params)`` tuple. Parameter values are kept as
        stripped strings (``"chars: 3"`` yields ``{"chars": "3"}``). A
        parameter written without a colon maps to an empty string.
    """
    # str.split always yields at least one element, so the head always exists.
    transform_type, *segments = (piece.strip() for piece in raw.split("|"))
    params = {}
    for segment in segments:
        if not segment:
            # A trailing or doubled "|" leaves a blank segment. Skip it
            # rather than recording a bogus {"": ""} parameter.
            continue
        key, _, value = segment.partition(":")
        params[key.strip()] = value.strip()
    return transform_type, params

Examples:

  • "exact" → ("exact", {})
  • "hash | algorithm: sha256" → ("hash", {"algorithm": "sha256"})
  • "geo_prefix | chars: 3" → ("geo_prefix", {"chars": "3"})
  • "jaro_winkler" → ("jaro_winkler", {})

Test Fixtures

FIX-01: Valid VRS lens

def test_parse_valid_vrs_lens():
    """FIX-01: the valid VRS fixture parses into the expected LensSpec values."""
    lens = parse_lens("fixtures/vrs_vulnerability_v1.yaml")

    # Header fields.
    assert lens.lens_id == "vrs_vulnerability_v1"
    assert lens.version == "1.0.0"

    # Identity fusion: five match fields with weights totalling 1.0, plus thresholds.
    fusion = lens.identity_fusion
    assert len(fusion.match_function) == 5
    total_weight = sum(entry.weight for entry in fusion.match_function)
    assert total_weight == pytest.approx(1.0)
    assert fusion.initial_threshold == 0.65
    assert fusion.confirmation_threshold == 0.80

    # Suppressed fields come through intact.
    suppressed = lens.field_groupings.suppressed_fields
    assert len(suppressed) == 3
    assert "case_notes" in suppressed

FIX-02: Weights don't sum to 1.0

def test_bad_weights():
    """FIX-02: weights that no longer sum to 1.0 are reported as WeightSumError."""
    lens_dict = load_yaml("fixtures/vrs_vulnerability_v1.yaml")
    # Raise one weight so the total moves away from 1.0.
    name_match = lens_dict["identity_fusion"]["match_function"]["name_match"]
    name_match["weight"] = 0.50

    spec = parse_lens_from_dict(lens_dict)
    weight_errors = [err for err in validate_lens(spec) if isinstance(err, WeightSumError)]
    assert weight_errors

FIX-03: Suppressed field in identity_hints

def test_suppressed_in_hints():
    """FIX-03: a suppressed field used as an identity hint is reported as SuppressionViolation."""
    lens_dict = load_yaml("fixtures/vrs_vulnerability_v1.yaml")
    # "case_notes" is already listed under suppressed_fields in the fixture.
    leaked_hint = {"field": "case_notes", "transform": "exact", "label": "Notes"}
    lens_dict["field_groupings"]["identity_hints"].append(leaked_hint)

    spec = parse_lens_from_dict(lens_dict)
    violations = [err for err in validate_lens(spec) if isinstance(err, SuppressionViolation)]
    assert violations

FIX-04: Transform parsing

@pytest.mark.parametrize(
    "raw,expected_type,expected_params",
    [
        ("exact", "exact", {}),
        ("hash | algorithm: sha256", "hash", {"algorithm": "sha256"}),
        ("geo_prefix | chars: 3", "geo_prefix", {"chars": "3"}),
        ("jaro_winkler", "jaro_winkler", {}),
        ("soundex | field: full_name", "soundex", {"field": "full_name"}),
    ],
)
def test_parse_transform(raw, expected_type, expected_params):
    """FIX-04: pipe-syntax transform strings split into (type, params)."""
    parsed_type, parsed_params = parse_transform(raw)
    assert parsed_type == expected_type
    assert parsed_params == expected_params

FIX-05: parse_lens_from_dict (UI workflow path)

def test_parse_from_dict():
    """FIX-05: a lens arriving as a dict from the NO_CODE_MODEL hyperparameters parses to a LensSpec."""
    config = dict(
        lens_id="vrs_vulnerability_v1",
        version="1.0.0",
        name="VRS Vulnerability Cross-Reference",
        # ... full dict as would come from UI
    )
    parsed = parse_lens_from_dict(config)
    assert isinstance(parsed, LensSpec)
    assert parsed.lens_id == "vrs_vulnerability_v1"

File Layout

parallax/ops/fusion/
├── lens_parser.py          # parse_lens, parse_lens_from_dict, validate_lens
├── lens_types.py           # All dataclasses (LensSpec, FieldHint, etc.)
├── lens_errors.py          # LensValidationError and subclasses
└── tests/
    ├── fixtures/
    │   └── vrs_vulnerability_v1.yaml
    └── test_lens_parser.py

Integration Points


Realizes: product.lens

Required by: component.parallax.blocking-engine, component.parallax.counter-isr, component.parallax.derived-features, component.parallax.feature-extraction, component.parallax.fusion-binding, component.parallax.fusion-governance-lifecycle, component.parallax.fusionmatch-model, component.parallax.primitives-framework, component.parallax.scoring-engine