[1]:
import importlib.metadata as _meta
import sys as _sys
print(f"Python {_sys.version.split()[0]}")
for _pkg in ["mirpy-lib", "numpy", "pandas", "matplotlib", "scipy", "polars"]:
try:
print(f" {_pkg}: {_meta.version(_pkg)}")
except _meta.PackageNotFoundError:
pass
import io
import sys
import tarfile
from pathlib import Path
import pandas as pd
import polars as pl
# Resolve repo root regardless of whether the notebook is run from
# notebooks/, the repo root, or docs/.
repo_root = (
Path.cwd().resolve().parents[1] if Path.cwd().name == "docs"
else (Path.cwd().resolve().parent if Path.cwd().name == "notebooks"
else Path.cwd().resolve())
)
if str(repo_root) not in sys.path:
sys.path.insert(0, str(repo_root))
from mir.common.clonotype import Clonotype
from mir.common.parser import (
AdaptiveParser,
AIRRParser,
ClonotypeTableParser,
OldMiXCRParser,
OlgaParser,
VDJdbSlimParser,
VDJtoolsParser,
)
from mir.common.repertoire import LocusRepertoire, SampleRepertoire
from mir.common.repertoire_dataset import RepertoireDataset
from mir.utils.notebook_assets import (
ensure_airr_benchmark,
find_airr_benchmark_sra_meta,
find_airr_benchmark_vdjdb_slim,
)
Python 3.12.12
mirpy-lib: 1.1.0
numpy: 1.26.4
pandas: 3.0.3
matplotlib: 3.10.9
scipy: 1.17.1
polars: 1.40.1
/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
Parsing Immune Repertoire Data — mirpy Tutorial#
This notebook shows how to load immune repertoire data in every format that mirpy supports. All parsers live in mir.common.parser; they all return either a LocusRepertoire (single locus) or a SampleRepertoire (multi-locus, keyed by IMGT locus code).
Parser |
Input format |
Returns |
|---|---|---|
|
Any TSV/CSV with |
|
|
VDJtools TSV (gzipped or plain) |
|
|
AIRR TSV with |
|
|
Adaptive immunoSEQ / MLR TSV |
|
|
Legacy MiXCR clone table |
|
|
VDJdb slim export |
|
|
OLGA generative model output |
|
1 — Bootstrap benchmark assets#
ensure_airr_benchmark downloads the AIRR benchmark dataset from Hugging Face Hub into notebooks/assets/large/ on first run and is a no-op on subsequent runs (files are cached locally).
[2]:
# Download SRA AIRR TSVs, VDJtools files, and VDJdb slim — cached locally after first run.
benchmark_root = ensure_airr_benchmark(
repo_root,
allow_patterns=["sra/**", "vdjtools_lite/**", "vdjdb/**"],
)
vdjdb_path = find_airr_benchmark_vdjdb_slim(benchmark_root)
sra_tarball, sra_meta_path = find_airr_benchmark_sra_meta(benchmark_root)
vdjtools_dir = benchmark_root / "vdjtools_lite"
vdjtools_files = sorted(vdjtools_dir.glob("*.txt.gz"))
print(f"benchmark_root : {benchmark_root}")
print(f"vdjdb_path : {vdjdb_path.name}")
print(f"sra_tarball : {sra_tarball.name} ({sra_meta_path.name} alongside)")
print(f"vdjtools files : {len(vdjtools_files)} files in {vdjtools_dir.name}/")
benchmark_root : /Users/mikesh/vcs/mirpy/notebooks/assets/large/airr_benchmark
vdjdb_path : vdjdb.slim.txt.gz
sra_tarball : samples.tar.gz (meta.tsv alongside)
vdjtools files : 45 files in vdjtools_lite/
2 — VDJdb slim format#
VDJdb distributes a “slim” export mapping T-cell receptor junctions to known antigen specificities. VDJdbSlimParser.parse_file returns a SampleRepertoire split by locus (TRA / TRB). Each clonotype carries antigen metadata (epitope, MHC, pathogen) in its clone_metadata dict.
[3]:
# Parse the VDJdb slim file — keep only human (HomoSapiens) entries.
vdjdb_sample = VDJdbSlimParser().parse_file(vdjdb_path, species="HomoSapiens")
print(vdjdb_sample)
print()
# Per-locus clonotype counts.
for locus, lr in vdjdb_sample.loci.items():
print(f" {locus}: {lr.clonotype_count:,} clonotypes")
SampleRepertoire(id='vdjdb.slim', loci={TRA: 43965, TRB: 87611})
TRA: 43,965 clonotypes
TRB: 87,611 clonotypes
[4]:
# Inspect antigen metadata on a few TRB clonotypes.
trb_lr = vdjdb_sample["TRB"]
print("TRB — first 3 clonotypes with epitope metadata:")
for c in trb_lr[:3]:
epitope = c.clone_metadata.get("antigen.epitope", "")
pathogen = c.clone_metadata.get("antigen.species", "")
mhc = c.clone_metadata.get("mhc.a", "")
print(f" {c.junction_aa:22s} V={c.v_gene:15s} epi={epitope!r:14s} pathogen={pathogen!r}")
TRB — first 3 clonotypes with epitope metadata:
ATSIRFTDTQYF V=TRBV7-2*01 epi='PQPELPYPQPQ' pathogen='TriticumAestivum'
CAAADEEIGNQPQHF V=TRBV10-3*01 epi='ATDALMTGY' pathogen='HCV'
CAAAERNTGELFF V=TRBV28*01 epi='YLQPRTFLL' pathogen='SARS-CoV-2'
3 — AIRR format (SRA benchmark bundle)#
The SRA bundle stores mixed-locus AIRR TSVs (TRB + TRA in the same file) without an explicit locus column. The pipeline:
Read one TSV from the tar archive into a pandas DataFrame.
Infer
locusfrom thev_callgene prefix (TRBV...→TRB).Pass the full DataFrame to
SampleRepertoire.from_pandas, which groups rows by thelocuscolumn automatically.
For single-locus use, AIRRParser.parse_inner returns a plain list[Clonotype] that can be wrapped in a LocusRepertoire directly.
[5]:
# Read the first AIRR TSV from the SRA tarball.
_GENE_PREFIX_TO_LOCUS = {
"TRBV": "TRB", "TRAV": "TRA", "IGHV": "IGH",
"IGKV": "IGK", "IGLV": "IGL", "TRDV": "TRD", "TRGV": "TRG",
}
with tarfile.open(sra_tarball, "r:gz") as tar:
members = sorted(name for name in tar.getnames() if name.endswith(".tsv"))
target_member = members[0]
with tar.extractfile(target_member) as raw:
airr_df = pd.read_csv(io.BytesIO(raw.read()), sep="\t")
# Infer locus from v_call prefix; keep the original AIRR column names intact.
airr_df["locus"] = airr_df["v_call"].str[:4].map(_GENE_PREFIX_TO_LOCUS)
# SampleRepertoire.from_pandas groups by locus; rename v_call/j_call for it.
airr_df_for_sample = airr_df.rename(columns={"v_call": "v_gene", "j_call": "j_gene"})
airr_sample = SampleRepertoire.from_pandas(
airr_df_for_sample,
sample_id=target_member.lstrip("./").removesuffix(".tsv"),
)
print(f"Loaded: {target_member}")
print(airr_sample)
for locus, lr in airr_sample.loci.items():
print(f" {locus}: {lr.clonotype_count} clonotypes, {lr.duplicate_count} reads")
Loaded: ./SRR10611239.tsv
SampleRepertoire(id='SRR10611239', loci={IGK: 86, TRD: 1, IGL: 25, TRB: 267, TRA: 143, TRG: 4, IGH: 40})
IGK: 86 clonotypes, 115 reads
TRD: 1 clonotypes, 3 reads
IGL: 25 clonotypes, 33 reads
TRB: 267 clonotypes, 312 reads
TRA: 143 clonotypes, 167 reads
TRG: 4 clonotypes, 4 reads
IGH: 40 clonotypes, 43 reads
[6]:
# Single-locus path: AIRRParser.parse_inner → LocusRepertoire.
# AIRRParser expects v_call / j_call (original AIRR names) in the DataFrame.
trb_airr_df = airr_df[airr_df["locus"] == "TRB"].copy()
trb_clonotypes = AIRRParser(locus="TRB").parse_inner(trb_airr_df)
trb_rep = LocusRepertoire(clonotypes=trb_clonotypes, locus="TRB")
print(trb_rep)
print("Top 3 TRB junctions:")
trb_rep.sort()
for c in trb_rep.top(3):
print(f" count={c.duplicate_count:4d} {c.junction_aa} V={c.v_gene}")
LocusRepertoire(locus='TRB', clonotypes=267, duplicate_count=312)
Top 3 TRB junctions:
count= 4 CASSRSANSPLHF V=TRBV27*01
count= 4 CSVLGQPTYGYTF V=TRBV29-1*01
count= 3 CASERWSSGNTIYF V=TRBV12-3*01
4 — VDJtools format#
VDJtoolsParser reads the VDJtools TSV format (columns #count, freq, cdr3nt, cdr3aa, v, d, j). The parser maps these to AIRR names internally and returns a list[Clonotype].
Below we load several files and merge them into one LocusRepertoire using the + operator.
[7]:
# Load the first VDJtools file and wrap it in a LocusRepertoire.
vdj_parser = VDJtoolsParser()
sample_path = vdjtools_files[0]
vdj_rep = LocusRepertoire(
clonotypes=vdj_parser.parse(str(sample_path)),
locus="TRB",
repertoire_id=sample_path.stem.removesuffix(".txt"),
)
print(vdj_rep)
print(f"\nTop 5 clonotypes by read count (sample: {vdj_rep.repertoire_id}):")
vdj_rep.sort()
for c in vdj_rep.top(5):
print(f" {c.duplicate_count:5d} {c.junction_aa:25s} V={c.v_gene}")
LocusRepertoire(locus='TRB', clonotypes=6525, duplicate_count=14903)
Top 5 clonotypes by read count (sample: 10months):
812 CSVEIWDSSYNEQFF V=TRBV29-1*01
704 CASSLAPGATNEKLFF V=TRBV7-6*01
571 CASSLGENIQYF V=TRBV13*01
408 CSARTTYGTDIISQHF V=TRBV20-1*01
282 CSVGTSEAYEQYF V=TRBV29-1*01
[8]:
# Load up to 3 parseable VDJtools samples and merge them with the + operator.
# Some benchmark files use non-standard column names; skip those gracefully.
vdj_parser = VDJtoolsParser()
merged_rep = LocusRepertoire(clonotypes=[], locus="TRB")
loaded = 0
for fp in vdjtools_files:
if loaded >= 3:
break
try:
lr = LocusRepertoire(
clonotypes=vdj_parser.parse(str(fp)),
locus="TRB",
repertoire_id=fp.stem.removesuffix(".txt"),
)
merged_rep = merged_rep + lr
print(f" + {lr.repertoire_id:20s} → {lr.clonotype_count:5,} clonotypes")
loaded += 1
except (ValueError, KeyError):
pass # Skip files with non-standard column names.
print(f"\nMerged: {merged_rep}")
+ 10months → 6,525 clonotypes
+ 37months → 5,172 clonotypes
+ A2-i129 → 9,282 clonotypes
Merged: LocusRepertoire(locus='TRB', clonotypes=20979, duplicate_count=33993)
5 — ClonotypeTableParser (generic format)#
ClonotypeTableParser is the recommended general-purpose parser when the input format is unknown or mixes VDJtools and AIRR column naming conventions. It normalises column names transparently (#count → duplicate_count, cdr3aa → junction_aa, v → v_gene, etc.) and accepts both file paths and pre-loaded DataFrames.
Use it as a drop-in replacement for VDJtoolsParser whenever column names may vary between batches or pipelines.
[9]:
# ClonotypeTableParser: handles both VDJtools and AIRR column names transparently.
# Works on file paths (str / Path) or pre-loaded pandas DataFrames.
ctparser = ClonotypeTableParser()
# Load a VDJtools-format file (column names: #count, cdr3aa, v, j)
ct_clonotypes = ctparser.parse(str(vdjtools_files[0]))
ct_rep = LocusRepertoire(
clonotypes=ct_clonotypes,
locus="TRB",
repertoire_id=vdjtools_files[0].stem.removesuffix(".txt"),
)
print(f"ClonotypeTableParser → VDJtools file:")
print(f" {ct_rep}")
ct_rep.sort()
print(" Top 3 clonotypes:")
for c in ct_rep.top(3):
print(f" {c.duplicate_count:5d} {c.junction_aa:25s} V={c.v_gene}")
# Also accepts a pre-loaded pandas DataFrame — useful when data is in memory.
df_raw = pd.read_csv(str(vdjtools_files[0]), sep="\t")
ct_from_df = ctparser.parse(df_raw)
print(f"\nClonotypeTableParser ← DataFrame: {len(ct_from_df)} clonotypes")
ClonotypeTableParser → VDJtools file:
LocusRepertoire(locus='TRB', clonotypes=6525, duplicate_count=14903)
Top 3 clonotypes:
812 CSVEIWDSSYNEQFF V=TRBV29-1*01
704 CASSLAPGATNEKLFF V=TRBV7-6*01
571 CASSLGENIQYF V=TRBV13*01
ClonotypeTableParser ← DataFrame: 6525 clonotypes
5 — AIRR multi-sample batch loading#
RepertoireDataset.from_folder_polars loads an entire cohort from a folder that contains a metadata.tsv with sample_id and file_name columns plus the corresponding sample files. It uses multiple threads for parallel I/O and builds lazy LocusRepertoire objects (clonotypes are materialised on first access).
# Typical call pattern (folder with metadata.tsv + AIRR TSV files):
dataset = RepertoireDataset.from_folder_polars(
folder=cohort_dir,
parser=VDJtoolsParser(), # or AIRRParser(), ClonotypeTableParser(), ...
metadata_file="metadata.tsv",
n_workers=6,
progress=True,
)
# Access one sample:
trb = dataset.samples["SRR8363891"]["TRB"]
# Get the metadata table as a Polars DataFrame:
meta_df = dataset.metadata_df
Below we demonstrate the same pattern on the VDJtools benchmark using a synthesised metadata.tsv pointing at the benchmark files.
[10]:
import tempfile, shutil
# Pre-filter vdjtools_files to those that parse successfully.
def _is_parseable(fp, parser):
"""Return True if fp parses without errors."""
try:
parser.parse(str(fp))
return True
except (ValueError, KeyError):
return False
vdj_parser_check = VDJtoolsParser()
good_vdjtools_files = [fp for fp in vdjtools_files if _is_parseable(fp, vdj_parser_check)]
print(f"Parseable VDJtools files: {len(good_vdjtools_files)} / {len(vdjtools_files)}")
# Build a temporary cohort folder with metadata.tsv + sample files.
cohort_dir = Path(tempfile.mkdtemp(prefix="mirpy_vdj_cohort_"))
try:
meta_rows = ["sample_id\tfile_name"]
for fp in good_vdjtools_files[:5]: # Use at most 5 samples for the demo.
sample_id = fp.stem.removesuffix(".txt")
dest = cohort_dir / fp.name
shutil.copy(fp, dest)
meta_rows.append(f"{sample_id}\t{fp.name}")
(cohort_dir / "metadata.tsv").write_text("\n".join(meta_rows))
# Load all samples in parallel with 4 worker threads.
dataset = RepertoireDataset.from_folder_polars(
folder=cohort_dir,
parser=VDJtoolsParser(),
n_workers=4,
progress=False,
)
finally:
shutil.rmtree(cohort_dir, ignore_errors=True)
print(f"Loaded {len(dataset.samples)} samples")
print("Sample IDs:", list(dataset.samples.keys()))
# Access per-locus repertoire for one sample.
first_id = next(iter(dataset.samples))
sample_rep = dataset.samples[first_id]
print(f"\n{sample_rep}")
trb = sample_rep["TRB"]
print(f"TRB clonotypes: {trb.clonotype_count:,}")
Parseable VDJtools files: 44 / 45
Loaded 5 samples
Sample IDs: ['37months', '10months', 'A2-i131', 'A2-i129', 'A2-i132']
SampleRepertoire(id='37months', loci={TRB: 5172})
TRB clonotypes: 5,172
6 — Building a repertoire from custom / synthetic data#
When data arrives from a custom pipeline or in-memory arrays, construct Clonotype objects directly and wrap them in LocusRepertoire or SampleRepertoire. No file I/O required.
[11]:
# Step 1: construct individual Clonotype objects from raw fields.
raw_records = [
# (junction_aa, v_gene, j_gene, locus, duplicate_count)
("CASSLAPGATNEKLFF", "TRBV25-1", "TRBJ1-4", "TRB", 200),
("CASSPGTSGNTIYF", "TRBV20-1", "TRBJ1-3", "TRB", 150),
("CAISEVGGGAQKLVF", "TRAV12-2", "TRAJ54", "TRA", 90),
("CAVSDLNAGNMLTF", "TRAV1-2", "TRAJ39", "TRA", 70),
("CASSIRSSYEQYF", "TRBV12-3", "TRBJ2-7", "TRB", 45),
]
clonotypes = [
Clonotype(
junction_aa=jaa,
v_gene=vg,
j_gene=jg,
locus=loc,
duplicate_count=cnt,
)
for jaa, vg, jg, loc, cnt in raw_records
]
# Step 2: build a SampleRepertoire (groups clonotypes by locus automatically).
synthetic_sample = SampleRepertoire.from_clonotypes(clonotypes, sample_id="synthetic_donor_1")
print(synthetic_sample)
for locus, lr in synthetic_sample.loci.items():
print(f" {locus}: {lr.clonotype_count} clonotypes, {lr.duplicate_count} reads")
SampleRepertoire(id='synthetic_donor_1', loci={TRB: 3, TRA: 2})
TRB: 3 clonotypes, 395 reads
TRA: 2 clonotypes, 160 reads
[12]:
# Step 3: access per-locus repertoire and inspect top clonotypes.
synthetic_trb = synthetic_sample["TRB"]
synthetic_trb.sort()
print("Synthetic TRB — ranked by duplicate_count:")
for c in synthetic_trb:
print(f" {c.duplicate_count:4d} {c.junction_aa:20s} V={c.v_gene}")
# Step 4: serialise to Polars DataFrame (AIRR schema).
df = synthetic_trb.to_polars()
print(f"\nPolars DataFrame shape: {df.shape}")
print(df.select(["locus", "junction_aa", "v_gene", "j_gene", "duplicate_count"]))
Synthetic TRB — ranked by duplicate_count:
200 CASSLAPGATNEKLFF V=TRBV25-1
150 CASSPGTSGNTIYF V=TRBV20-1
45 CASSIRSSYEQYF V=TRBV12-3
Polars DataFrame shape: (3, 14)
shape: (3, 5)
┌───────┬──────────────────┬──────────┬─────────┬─────────────────┐
│ locus ┆ junction_aa ┆ v_gene ┆ j_gene ┆ duplicate_count │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ i64 │
╞═══════╪══════════════════╪══════════╪═════════╪═════════════════╡
│ TRB ┆ CASSLAPGATNEKLFF ┆ TRBV25-1 ┆ TRBJ1-4 ┆ 200 │
│ TRB ┆ CASSPGTSGNTIYF ┆ TRBV20-1 ┆ TRBJ1-3 ┆ 150 │
│ TRB ┆ CASSIRSSYEQYF ┆ TRBV12-3 ┆ TRBJ2-7 ┆ 45 │
└───────┴──────────────────┴──────────┴─────────┴─────────────────┘
7 — OlgaParser and AdaptiveParser (reference)#
Two additional parsers cover less common input formats.
``OlgaParser`` reads tab-delimited OLGA output files (no header row; columns: junction, junction_aa, v_gene, j_gene) and returns a SampleRepertoire. Locus is inferred from the J-gene prefix.
olga_sample = OlgaParser().parse_file("TRB_generated.tsv")
trb = olga_sample["TRB"] # LocusRepertoire with inferred locus
``AdaptiveParser`` reads Adaptive immunoSEQ / MLR format and returns a LocusRepertoire. It normalises Adaptive gene naming conventions (TCRBV01 → TRBV1) automatically.
adaptive_rep = AdaptiveParser(locus="TRB").parse_file("adaptive_sample.tsv")
``OldMiXCRParser`` targets legacy MiXCR v2/v3 clone tables (the pre-AIRR MiXCR export format) and returns a SampleRepertoire.
mixcr_sample = OldMiXCRParser().parse_file("clones.txt.gz")
None of these parsers require assets that differ from the pattern shown above — they all accept a path string and return a standard repertoire object.
Summary — parser quick-reference#
Parser |
Typical file |
Call |
Returns |
|---|---|---|---|
|
Any TSV (VDJtools or AIRR column names) |
|
|
|
|
|
|
|
|
|
|
|
Adaptive TSV |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Batch loading (whole cohort from a folder with metadata.tsv):
dataset = RepertoireDataset.from_folder_polars(
folder=cohort_dir, parser=ClonotypeTableParser(), n_workers=6
)
trb = dataset.samples[sample_id]["TRB"]
Manual construction (from arrays or custom pipeline output):
clonotypes = [Clonotype(junction_aa=..., v_gene=..., j_gene=..., locus=..., duplicate_count=...)]
sample = SampleRepertoire.from_clonotypes(clonotypes, sample_id="donor_1")
trb = sample["TRB"] # LocusRepertoire