Source code for mir.basic.alphabets

"""Alphabets, constants, and amino-acid → reduced-alphabet translation.

This module holds the lightweight, GC-friendly parts that are faster in
pure Python (``bytes.translate``) than in C.  Heavy-lifting functions
(codon translation, tokenisation, distances) live in the ``mirseq``
C extension.

Types
-----
* ``Seq`` — Union type ``str | bytes | bytearray``.

Helpers
-------
* ``_to_bytes`` — Normalise *Seq* to ``bytes``.

Alphabets
---------
* ``NT_ALPHABET`` / ``AA_ALPHABET`` / ``REDUCED_AA_ALPHABET`` — 256-byte LUTs.
* ``NT_MASK`` / ``AA_MASK`` / ``REDUCED_AA_MASK`` — Mask byte values.

Translation
-----------
* ``aa_to_reduced``     — AA → reduced via ``bytes.translate`` (fastest path).
* ``validate``          — Check every byte belongs to an alphabet.
* ``mask``              — Replace position(s) with a mask character.
* ``matches``           — Wildcard-aware positional comparison.
* ``matches_aa_reduced``— Cross-alphabet wildcard match (AA vs reduced).
"""

from __future__ import annotations

# ---------------------------------------------------------------------------
# Type alias
# ---------------------------------------------------------------------------

Seq = str | bytes | bytearray

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _to_bytes(seq: Seq) -> bytes:
    """Normalise *seq* to ``bytes``.  Strings are ASCII-encoded."""
    return seq.encode("ascii") if isinstance(seq, str) else bytes(seq)


# ---------------------------------------------------------------------------
# Alphabet construction
# ---------------------------------------------------------------------------


[docs]
def make_alphabet(chars: str) -> bytes:
    """Build a 256-byte lookup table where allowed positions are ``1``."""
    lut = bytearray(256)
    for ch in chars:
        lut[ord(ch)] = 1
    return bytes(lut)



# ---------------------------------------------------------------------------
# Pre-built alphabets
# ---------------------------------------------------------------------------

NT_CHARS = "ATGCN"
AA_CHARS = "ACDEFGHIKLMNPQRSTVWY*_X"
AA_STANDARD_CHARS = AA_CHARS[:20]
REDUCED_AA_CHARS = "lbmcshGFPWYX*_"

NT_ALPHABET: bytes = make_alphabet(NT_CHARS)
AA_ALPHABET: bytes = make_alphabet(AA_CHARS)
REDUCED_AA_ALPHABET: bytes = make_alphabet(REDUCED_AA_CHARS)

NT_MASK = ord("N")
AA_MASK = ord("X")
REDUCED_AA_MASK = ord("X")


# ---------------------------------------------------------------------------
# Amino-acid → reduced-alphabet mapping
# ---------------------------------------------------------------------------

AA_TO_REDUCED: dict[str, str] = {
    "A": "l", "R": "b", "N": "m", "D": "c", "C": "s", "Q": "m",
    "E": "c", "G": "G", "H": "b", "I": "l", "L": "l", "K": "b",
    "M": "s", "F": "F", "P": "P", "S": "h", "T": "h", "W": "W",
    "Y": "Y", "V": "l", "X": "X", "*": "*", "_": "_",
}

AA_TO_REDUCED_TABLE: bytes = bytes.maketrans(
    "".join(AA_TO_REDUCED.keys()).encode(),
    "".join(AA_TO_REDUCED.values()).encode(),
)

_AA_TO_REDUCED_LUT: bytes
_lut = bytearray(256)
for _aa, _red in AA_TO_REDUCED.items():
    _lut[ord(_aa)] = ord(_red)
_AA_TO_REDUCED_LUT = bytes(_lut)
del _lut, _aa, _red


# ---------------------------------------------------------------------------
# Translation (aa_to_reduced — fastest in Python via bytes.translate)
# ---------------------------------------------------------------------------


[docs]
def aa_to_reduced(seq: Seq) -> bytes:
    """Convert an amino-acid sequence to the reduced physico-chemical alphabet.

    Uses ``bytes.translate`` with a pre-built table — faster than C for
    this particular operation.
    """
    return _to_bytes(seq).translate(AA_TO_REDUCED_TABLE)



# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------


[docs]
def validate(seq: Seq, alphabet: bytes) -> bytes:
    """Validate every byte of *seq* belongs to *alphabet* (256-byte LUT)."""
    raw = _to_bytes(seq)
    for b in raw:
        if not alphabet[b]:
            raise ValueError(
                f"Sequence contains symbol {chr(b)!r} outside of alphabet"
            )
    return raw



# ---------------------------------------------------------------------------
# Masking
# ---------------------------------------------------------------------------


[docs]
def mask(seq: Seq, position: int | slice | tuple[int, int], mask_byte: int) -> bytes:
    """Return a copy of *seq* with the given position(s) replaced by *mask_byte*."""
    buf = bytearray(_to_bytes(seq))
    if isinstance(position, int):
        n = len(buf)
        if position < 0:
            position += n
        if position < 0 or position >= n:
            raise IndexError("Mask position out of range")
        buf[position] = mask_byte
    elif isinstance(position, slice):
        for i in range(*position.indices(len(buf))):
            buf[i] = mask_byte
    elif isinstance(position, tuple) and len(position) == 2:
        for i in range(position[0], position[1]):
            buf[i] = mask_byte
    else:
        raise TypeError("position must be int, slice, or (start, stop) tuple")
    return bytes(buf)



# ---------------------------------------------------------------------------
# Wildcard matching
# ---------------------------------------------------------------------------


[docs]
def matches(a: Seq, b: Seq, mask_byte: int) -> bool:
    """Wildcard-aware positional comparison.

    Returns ``True`` when *a* and *b* have the same length and at every
    position the bytes are equal **or** at least one side carries
    *mask_byte*.
    """
    ba = _to_bytes(a)
    bb = _to_bytes(b)
    if len(ba) != len(bb):
        return False
    if ba == bb:
        return True
    for x, y in zip(ba, bb):
        if x == y or x == mask_byte or y == mask_byte:
            continue
        return False
    return True



# ---------------------------------------------------------------------------
# Back-translation (amino acid → nucleotide)
# ---------------------------------------------------------------------------

# Most likely human codon per amino acid (Kazusa Homo sapiens codon usage table).
_MOST_LIKELY_CODON: dict[str, str] = {
    "A": "GCC", "R": "AGG", "N": "AAC", "D": "GAC",
    "C": "TGC", "Q": "CAG", "E": "GAG", "G": "GGC",
    "H": "CAC", "I": "ATC", "L": "CTG", "K": "AAG",
    "M": "ATG", "F": "TTC", "P": "CCC", "S": "AGC",
    "T": "ACC", "W": "TGG", "Y": "TAC", "V": "GTG",
}



[docs]
def back_translate(aa_seq: str, unknown_codon: str = "NNN") -> str:
    """Back-translate *aa_seq* to a nucleotide sequence.

    Each residue is mapped to the most frequently used human codon
    (Kazusa Homo sapiens codon usage database).  Non-standard residues
    (``X``, ``*``, ``_``, etc.) produce *unknown_codon* (default ``"NNN"``).

    The returned sequence has length ``len(aa_seq) * 3``.

    Examples
    --------
    >>> back_translate("CA")
    'TGCGCC'
    >>> back_translate("X")
    'NNN'
    """
    return "".join(_MOST_LIKELY_CODON.get(aa, unknown_codon) for aa in aa_seq)




[docs]
def matches_aa_reduced(aa_seq: Seq, reduced_seq: Seq) -> bool:
    """Wildcard-aware match between an amino-acid and a reduced-alphabet sequence."""
    ba = _to_bytes(aa_seq)
    br = _to_bytes(reduced_seq)
    if len(ba) != len(br):
        return False
    if len(ba) == 0:
        return True
    lut = _AA_TO_REDUCED_LUT
    mask_x = AA_MASK
    for a, r in zip(ba, br):
        conv = lut[a]
        if conv == r or a == mask_x or r == mask_x:
            continue
        return False
    return True