Source code for mir.basic.aliases

"""Centralized species/locus alias utilities.

This module provides canonicalization helpers shared across control setup,
parser logic, and OLGA model resolution.
"""

from __future__ import annotations

_SPECIES_ALIASES: dict[str, str] = {
    "human": "human",
    "hsa": "human",
    "homosapiens": "human",
    "homo_sapiens": "human",
    "homo sapiens": "human",
    "mouse": "mouse",
    "mmu": "mouse",
    "musmusculus": "mouse",
    "mus_musculus": "mouse",
    "mus musculus": "mouse",
}

# Canonical IMGT locus aliases used by users/notebooks/tools.
_LOCUS_ALIASES: dict[str, str] = {
    "TRA": "TRA",
    "TALPHA": "TRA",
    "T_ALPHA": "TRA",
    "T-ALPHA": "TRA",
    "ALPHA": "TRA",
    "TRB": "TRB",
    "TBETA": "TRB",
    "T_BETA": "TRB",
    "T-BETA": "TRB",
    "BETA": "TRB",
    "TRG": "TRG",
    "TGAMMA": "TRG",
    "T_GAMMA": "TRG",
    "T-GAMMA": "TRG",
    "GAMMA": "TRG",
    "TRD": "TRD",
    "TDELTA": "TRD",
    "T_DELTA": "TRD",
    "T-DELTA": "TRD",
    "DELTA": "TRD",
    "IGH": "IGH",
    "BHEAVY": "IGH",
    "B_HEAVY": "IGH",
    "B-HEAVY": "IGH",
    "HEAVY": "IGH",
    "IGK": "IGK",
    "BKAPPA": "IGK",
    "B_KAPPA": "IGK",
    "B-KAPPA": "IGK",
    "KAPPA": "IGK",
    "IGL": "IGL",
    "BLAMBDA": "IGL",
    "B_LAMBDA": "IGL",
    "B-LAMBDA": "IGL",
    "LAMBDA": "IGL",
}

AIRR_LOCUS_ALIASES: dict[str, set[str]] = {
    "alpha": {"alpha", "tra"},
    "beta": {"beta", "trb"},
    "gamma": {"gamma", "trg"},
    "delta": {"delta", "trd"},
    "heavy": {"heavy", "igh"},
    "kappa": {"kappa", "igk"},
    "lambda": {"lambda", "igl"},
}

AIRR_ALIAS_TO_IMGT: dict[str, str] = {
    "alpha": "TRA", "tra": "TRA",
    "beta": "TRB", "trb": "TRB",
    "gamma": "TRG", "trg": "TRG",
    "delta": "TRD", "trd": "TRD",
    "heavy": "IGH", "igh": "IGH",
    "kappa": "IGK", "igk": "IGK",
    "lambda": "IGL", "igl": "IGL",
}

LOCUS_TO_OLGA_SUFFIX: dict[str, str] = {
    "TRA": "T_alpha",
    "TRB": "T_beta",
    "TRG": "T_gamma",
    "TRD": "T_delta",
    "IGH": "B_heavy",
    "IGK": "B_kappa",
    "IGL": "B_lambda",
}

OLGA_SUFFIX_TO_LOCUS: dict[str, str] = {
    "T_ALPHA": "TRA",
    "T_BETA": "TRB",
    "T_GAMMA": "TRG",
    "T_DELTA": "TRD",
    "B_HEAVY": "IGH",
    "B_KAPPA": "IGK",
    "B_LAMBDA": "IGL",
}

_LOCUS_SEARCH_TOKENS: dict[str, set[str]] = {
    "TRA": {"tra", "talpha"},
    "TRB": {"trb", "tbeta"},
    "TRG": {"trg", "tgamma"},
    "TRD": {"trd", "tdelta"},
    "IGH": {"igh", "bheavy"},
    "IGK": {"igk", "bkappa"},
    "IGL": {"igl", "blambda"},
}


[docs] def normalize_species_alias(species: str) -> str: key = (species or "").strip().lower().replace("-", "_") if key not in _SPECIES_ALIASES: raise ValueError(f"Unsupported species alias: {species!r}") return _SPECIES_ALIASES[key]
[docs] def normalize_locus_alias(locus: str) -> str: key = (locus or "").strip().upper().replace("-", "_") key = key.replace(" ", "") if key not in _LOCUS_ALIASES: raise ValueError(f"Unsupported locus alias: {locus!r}") return _LOCUS_ALIASES[key]
[docs] def normalize_airr_locus_value(value: str) -> str: raw = (value or "").strip().lower() if not raw: return "" return AIRR_ALIAS_TO_IMGT.get(raw, raw.upper()[:3])
[docs] def airr_aliases_for_locus(locus: str) -> set[str]: locus_norm = (locus or "").strip().lower() if locus_norm in AIRR_LOCUS_ALIASES: return AIRR_LOCUS_ALIASES[locus_norm] for aliases in AIRR_LOCUS_ALIASES.values(): if locus_norm in aliases: return aliases return {locus_norm}
[docs] def locus_search_tokens(locus_imgt: str) -> set[str]: return set(_LOCUS_SEARCH_TOKENS.get(locus_imgt, {locus_imgt.lower()}))