"""Alphabets, constants, and amino-acid → reduced-alphabet translation.

This module holds the lightweight, GC-friendly parts that are faster in
pure Python (``bytes.translate``) than in C. Heavy-lifting functions
(codon translation, tokenisation, distances) live in the ``mirseq``
C extension.

Types
-----
* ``Seq`` — Union type ``str | bytes | bytearray``.

Helpers
-------
* ``_to_bytes`` — Normalise *Seq* to ``bytes``.
* ``make_alphabet`` — Build a 256-byte lookup table from a string of characters.

Alphabets
---------
* ``NT_ALPHABET`` / ``AA_ALPHABET`` / ``REDUCED_AA_ALPHABET`` — 256-byte LUTs.
* ``NT_MASK`` / ``AA_MASK`` / ``REDUCED_AA_MASK`` — Mask byte values.

Translation
-----------
* ``aa_to_reduced`` — AA → reduced via ``bytes.translate`` (fastest path).
* ``back_translate`` — AA → nucleotide using one fixed codon per residue.
* ``validate`` — Check every byte belongs to an alphabet.
* ``mask`` — Replace position(s) with a mask character.
* ``matches`` — Wildcard-aware positional comparison.
* ``matches_aa_reduced`` — Cross-alphabet wildcard match (AA vs reduced).
"""
from __future__ import annotations
# ---------------------------------------------------------------------------
# Type alias
# ---------------------------------------------------------------------------
# Any sequence input accepted by the helpers in this module: text or
# (mutable) raw bytes. This is a runtime expression, not an annotation, so
# ``from __future__ import annotations`` does not defer it; the ``|``
# between types is evaluated when the module is imported (PEP 604).
Seq = str | bytes | bytearray
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _to_bytes(seq: Seq) -> bytes:
    """Return *seq* as an immutable ``bytes`` object.

    ``str`` input is encoded with the ASCII codec, so non-ASCII text raises
    ``UnicodeEncodeError``. Any other input is handed to ``bytes()``.
    """
    if isinstance(seq, str):
        return seq.encode("ascii")
    return bytes(seq)
# ---------------------------------------------------------------------------
# Alphabet construction
# ---------------------------------------------------------------------------
def make_alphabet(chars: str) -> bytes:
    """Return a 256-byte membership table for the characters in *chars*.

    The byte at index ``ord(c)`` is ``1`` for every ``c`` in *chars* and
    ``0`` everywhere else. Repeated characters are harmless. A character
    whose code point does not fit in the table raises ``IndexError``.
    """
    table = bytearray(256)  # zero-filled: nothing allowed yet
    for code in map(ord, chars):
        table[code] = 1
    return bytes(table)
# ---------------------------------------------------------------------------
# Pre-built alphabets
# ---------------------------------------------------------------------------
# Symbol sets of each alphabet, as plain strings.
NT_CHARS = "ATGCN"
AA_STANDARD_CHARS = "ACDEFGHIKLMNPQRSTVWY"  # the 20 leading symbols of AA_CHARS
AA_CHARS = AA_STANDARD_CHARS + "*_X"
REDUCED_AA_CHARS = "lbmcshGFPWYX*_"

# 256-byte membership tables (1 = allowed) built from the symbol sets above;
# these are what ``validate`` expects as its *alphabet* argument.
NT_ALPHABET: bytes = make_alphabet(NT_CHARS)
AA_ALPHABET: bytes = make_alphabet(AA_CHARS)
REDUCED_AA_ALPHABET: bytes = make_alphabet(REDUCED_AA_CHARS)

# Byte values of the mask symbol of each alphabet ("N" for nucleotides,
# "X" for both amino-acid alphabets).
NT_MASK = ord("N")
AA_MASK = ord("X")
REDUCED_AA_MASK = ord("X")
# ---------------------------------------------------------------------------
# Amino-acid → reduced-alphabet mapping
# ---------------------------------------------------------------------------
# Amino-acid symbol → reduced-alphabet symbol. Entry order is kept exactly
# as before because dict iteration order is observable by callers.
AA_TO_REDUCED: dict[str, str] = {
    "A": "l",
    "R": "b",
    "N": "m",
    "D": "c",
    "C": "s",
    "Q": "m",
    "E": "c",
    "G": "G",
    "H": "b",
    "I": "l",
    "L": "l",
    "K": "b",
    "M": "s",
    "F": "F",
    "P": "P",
    "S": "h",
    "T": "h",
    "W": "W",
    "Y": "Y",
    "V": "l",
    "X": "X",
    "*": "*",
    "_": "_",
}

# 256-byte table for ``bytes.translate``: bytes that are not keys of
# AA_TO_REDUCED are passed through unchanged.
AA_TO_REDUCED_TABLE: bytes = bytes.maketrans(
    "".join(AA_TO_REDUCED).encode(),
    "".join(AA_TO_REDUCED.values()).encode(),
)

# 256-byte table for direct indexing: bytes that are not keys of
# AA_TO_REDUCED map to 0 (unlike the translate table above).
_AA_TO_REDUCED_LUT: bytes = bytes(
    ord(AA_TO_REDUCED[symbol]) if symbol in AA_TO_REDUCED else 0
    for symbol in map(chr, range(256))
)
# ---------------------------------------------------------------------------
# Translation (aa_to_reduced — fastest in Python via bytes.translate)
# ---------------------------------------------------------------------------
def aa_to_reduced(seq: Seq) -> bytes:
    """Map an amino-acid sequence onto the reduced physico-chemical alphabet.

    *seq* is normalised to ``bytes`` and run through ``AA_TO_REDUCED_TABLE``
    in a single ``bytes.translate`` call. Bytes that have no entry in the
    table are returned unchanged. The result has the same length as the
    input.
    """
    raw = _to_bytes(seq)
    return raw.translate(AA_TO_REDUCED_TABLE)
# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------
def validate(seq: Seq, alphabet: bytes) -> bytes:
    """Check that *seq* only uses symbols allowed by *alphabet*.

    Parameters
    ----------
    seq
        Sequence to check; normalised to ``bytes`` first.
    alphabet
        Lookup table indexed by byte value (256 entries); a zero entry
        means the byte is not allowed.

    Returns
    -------
    bytes
        The normalised sequence, when every byte is allowed.

    Raises
    ------
    ValueError
        On the first byte whose *alphabet* entry is zero.
    """
    raw = _to_bytes(seq)
    # First offending byte value, or None when the whole sequence is valid.
    b = next((code for code in raw if not alphabet[code]), None)
    if b is None:
        return raw
    raise ValueError(
        f"Sequence contains symbol {chr(b)!r} outside of alphabet"
    )
# ---------------------------------------------------------------------------
# Masking
# ---------------------------------------------------------------------------
def mask(seq: Seq, position: int | slice | tuple[int, int], mask_byte: int) -> bytes:
    """Return a copy of *seq* with the selected position(s) set to *mask_byte*.

    Parameters
    ----------
    seq
        Input sequence; it is never modified.
    position
        * ``int`` — a single index; negative values count from the end and
          an out-of-range index raises ``IndexError``.
        * ``slice`` — resolved with ``slice.indices``, so it is clamped to
          the sequence and may carry a step.
        * ``(start, stop)`` tuple — every index in ``range(start, stop)`` is
          written with ordinary ``bytearray`` indexing (negative indices
          count from the end, out-of-range ones raise ``IndexError``).
    mask_byte
        Byte value written at each selected position.

    Raises
    ------
    IndexError
        For an out-of-range ``int`` position or tuple index.
    TypeError
        When *position* is none of the supported kinds.
    """
    out = bytearray(_to_bytes(seq))
    size = len(out)

    # Resolve *position* to the indices to overwrite, then write them in
    # one place below.
    if isinstance(position, int):
        index = position + size if position < 0 else position
        if not 0 <= index < size:
            raise IndexError("Mask position out of range")
        targets = (index,)
    elif isinstance(position, slice):
        targets = range(*position.indices(size))
    elif isinstance(position, tuple) and len(position) == 2:
        targets = range(position[0], position[1])
    else:
        raise TypeError("position must be int, slice, or (start, stop) tuple")

    for i in targets:
        out[i] = mask_byte
    return bytes(out)
# ---------------------------------------------------------------------------
# Wildcard matching
# ---------------------------------------------------------------------------
def matches(a: Seq, b: Seq, mask_byte: int) -> bool:
    """Compare two sequences position by position, honouring a wildcard.

    The result is ``True`` only when both sequences have the same length
    and, at every position, the two bytes are equal or at least one of
    them equals *mask_byte*. Sequences of different length never match.
    """
    left = _to_bytes(a)
    right = _to_bytes(b)
    if len(left) != len(right):
        return False
    # Byte-identical inputs are settled by the fast equality test; only
    # otherwise fall back to the per-position wildcard check.
    return left == right or all(
        p == q or p == mask_byte or q == mask_byte
        for p, q in zip(left, right)
    )
# ---------------------------------------------------------------------------
# Back-translation (amino acid → nucleotide)
# ---------------------------------------------------------------------------
# Most likely human codon per amino acid (Kazusa Homo sapiens codon usage table).
# One fixed codon per standard amino acid; the source attributes these to the
# Kazusa Homo sapiens codon usage table (most frequent codon per residue).
# NOTE(review): for R, AGA and AGG are reportedly very close in human usage
# tables — confirm that AGG is the intended choice.
_MOST_LIKELY_CODON: dict[str, str] = {
    "A": "GCC", "R": "AGG", "N": "AAC", "D": "GAC",
    "C": "TGC", "Q": "CAG", "E": "GAG", "G": "GGC",
    "H": "CAC", "I": "ATC", "L": "CTG", "K": "AAG",
    "M": "ATG", "F": "TTC", "P": "CCC", "S": "AGC",
    "T": "ACC", "W": "TGG", "Y": "TAC", "V": "GTG",
}


def back_translate(aa_seq: Seq, unknown_codon: str = "NNN") -> str:
    """Back-translate *aa_seq* to a nucleotide sequence.

    Each residue is replaced by its entry in ``_MOST_LIKELY_CODON``; any
    symbol without an entry (``X``, ``*``, ``_``, lower-case letters, ...)
    produces *unknown_codon*.

    Parameters
    ----------
    aa_seq
        Amino-acid sequence. ``bytes`` / ``bytearray`` input is accepted,
        like in the other helpers of this module, and is decoded
        byte-for-byte before the lookup.
    unknown_codon
        Replacement for residues that have no codon in the table.

    Returns
    -------
    str
        Concatenated codons. The length is ``len(aa_seq) * 3`` as long as
        *unknown_codon* is three characters long.

    Examples
    --------
    >>> back_translate("CA")
    'TGCGCC'
    >>> back_translate(b"CA")
    'TGCGCC'
    >>> back_translate("X")
    'NNN'
    """
    if isinstance(aa_seq, (bytes, bytearray)):
        # Iterating bytes yields ints, which never match the str keys of the
        # codon table and would silently turn every residue into
        # *unknown_codon*. Latin-1 maps each byte to one character and cannot
        # fail, so unknown bytes still fall through to *unknown_codon*.
        aa_seq = aa_seq.decode("latin-1")
    codon_for = _MOST_LIKELY_CODON.get
    return "".join(codon_for(aa, unknown_codon) for aa in aa_seq)
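# ---------------------------------------------------------------------------
# Cross-alphabet wildcard matching (amino acid vs reduced alphabet)
# ---------------------------------------------------------------------------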
def matches_aa_reduced(aa_seq: Seq, reduced_seq: Seq) -> bool:
    """Wildcard-aware match between an amino-acid and a reduced-alphabet sequence.

    Returns ``True`` when both sequences have the same length and, at every
    position, one of the following holds:

    * the amino-acid byte is ``AA_MASK``,
    * the reduced byte is ``REDUCED_AA_MASK``,
    * the amino-acid byte has a reduced-alphabet mapping and that mapping
      equals the reduced byte.

    An amino-acid byte without a mapping only matches the reduced-side
    wildcard. Two empty sequences match.
    """
    ba = _to_bytes(aa_seq)
    br = _to_bytes(reduced_seq)
    if len(ba) != len(br):
        return False

    # Bind module globals to locals once for the per-byte loop.
    lut = _AA_TO_REDUCED_LUT
    aa_mask = AA_MASK
    reduced_mask = REDUCED_AA_MASK

    for a, r in zip(ba, br):
        if a == aa_mask or r == reduced_mask:
            continue
        conv = lut[a]
        # 0 is the filler for "no mapping" in the LUT, not a real symbol:
        # it must not be allowed to match a NUL byte in *reduced_seq*.
        if conv == 0 or conv != r:
            return False
    return True