T-cell repertoire annotation and motif discovery (TCRNET-style)#

This notebook demonstrates a highly customized MIR implementation of TCRNET-style analysis for antigen-associated TCR motif discovery. The workflow is inspired by ideas described in the TCRNET framework paper and is not a literal line-by-line reimplementation of the original software.

Samples:

(B35+)       HIP02877  A*26 A*33 B*14 B*35  CMV-
(CMV+)       HIP13994  A*02 A*02 B*07 B*44  CMV+

Control repertoire:

control.txt.gz

Inputs:

airr_benchmark/tcrnet/CMV+.txt.gz
airr_benchmark/tcrnet/B35+.txt.gz
airr_benchmark/tcrnet/control.txt.gz
latest airr_benchmark/vdjdb/**/vdjdb.slim.txt.gz

Reference: Pogorelyy MV, Shugay M. A Framework for Annotation of Antigen Specificities in High-Throughput T-Cell Repertoire Sequencing Studies. Front Immunol. 2019;10:2159. doi:10.3389/fimmu.2019.02159. PMID:31616409. PubMed: https://pubmed.ncbi.nlm.nih.gov/31616409/

[ ]:

import importlib.metadata as _meta
import sys as _sys
import warnings

warnings.filterwarnings('ignore', message='IProgress not found.*')

print(f"Python {_sys.version.split()[0]}")
for _pkg in ["mirpy-lib", "numpy", "pandas", "matplotlib", "scipy", "polars"]:
    try:
        print(f"  {_pkg}: {_meta.version(_pkg)}")
    except _meta.PackageNotFoundError:
        pass

import sys
from pathlib import Path

import igraph as ig
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import LogNorm, to_rgba
from scipy.stats import binomtest, mannwhitneyu, spearmanr

repo_root = Path.cwd().resolve().parent if Path.cwd().name == "notebooks" else Path.cwd().resolve()
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

from mir.biomarkers.tcrnet import compute_tcrnet
from mir.common.clonotype import Clonotype
from mir.common.filter import filter_functional
from mir.common.parser import ClonotypeTableParser
from mir.common.repertoire import LocusRepertoire
from mir.graph.edit_distance_graph import build_edit_distance_graph
from mir.utils.notebook_assets import ensure_airr_benchmark, find_airr_benchmark_vdjdb_slim

CORES = 4
BENCHMARK_ROOT = ensure_airr_benchmark(repo_root, allow_patterns=["tcrnet/**", "vdjdb/**"])
TCRNET_DIR = BENCHMARK_ROOT / "tcrnet"
VDJDB_PATH = find_airr_benchmark_vdjdb_slim(BENCHMARK_ROOT)

SAMPLE_HLAS = {
    'B35+': ['HLA-A*26', 'HLA-A*33', 'HLA-B*14', 'HLA-B*35'],
    'CMV+': ['HLA-A*02', 'HLA-B*07', 'HLA-B*44'],
}

parser = ClonotypeTableParser()


def bh_adjust(p_values: pd.Series) -> pd.Series:
    values = pd.Series(p_values, dtype=float)
    if values.empty:
        return values
    order = np.argsort(values.to_numpy())
    ranked = values.to_numpy()[order]
    n = len(ranked)
    adjusted = np.empty(n, dtype=float)
    running = 1.0
    for i in range(n - 1, -1, -1):
        rank = i + 1
        running = min(running, ranked[i] * n / rank)
        adjusted[i] = running
    out = np.empty(n, dtype=float)
    out[order] = np.clip(adjusted, 0.0, 1.0)
    return pd.Series(out, index=values.index)


def load_trb(path: str | Path, sample_id: str) -> LocusRepertoire:
    clones = filter_functional(LocusRepertoire(parser.parse(str(path)), locus='TRB', repertoire_id=sample_id)).clonotypes
    rep = LocusRepertoire(clonotypes=clones, locus='TRB', repertoire_id=sample_id)
    return rep


def repertoire_to_sequence_table(rep: LocusRepertoire, sample_id: str) -> pd.DataFrame:
    rows = []
    total = max(1, rep.duplicate_count)
    for clonotype in rep.clonotypes:
        rows.append(
            {
                'cdr3aa': clonotype.junction_aa,
                'count': clonotype.duplicate_count,
                'v_gene': clonotype.v_gene,
                'j_gene': clonotype.j_gene,
                'sample_id': sample_id,
            }
        )
    df = pd.DataFrame(rows)
    grouped = (
        df.groupby(['cdr3aa', 'sample_id'], as_index=False)
        .agg(count=('count', 'sum'), v_gene=('v_gene', 'first'), j_gene=('j_gene', 'first'))
    )
    grouped['freq'] = grouped['count'] / total
    return grouped


def mhc_a_base(series: pd.Series) -> pd.Series:
    return series.fillna('').astype(str).str.split(':').str[0].str.split(',').str[0]


def load_vdjdb_reference(path: Path) -> tuple[pd.DataFrame, LocusRepertoire]:
    raw = pd.read_csv(path, sep='\t', compression='gzip')
    raw = raw[raw['gene'].eq('TRB')].copy()
    epi_sizes = raw.groupby('antigen.epitope')['cdr3'].nunique()
    raw = raw[raw['antigen.epitope'].map(epi_sizes).ge(30)].reset_index(drop=True)
    raw['mhc.a.base'] = mhc_a_base(raw['mhc.a'])
    raw['epi.name'] = raw['mhc.a.base'].str.replace('HLA-', '', regex=False).str[:6].str.strip() + ' ' + raw['antigen.epitope'].astype(str).str[:3]
    raw['antigen.species'] = raw['antigen.species'].replace({'DENV': 'DengueVirus'})

    clonotypes = []
    for idx, row in raw.iterrows():
        clonotypes.append(
            Clonotype(
                sequence_id=str(idx),
                locus='TRB',
                junction_aa=str(row['cdr3']),
                v_gene=str(row.get('v.segm', '') or ''),
                j_gene=str(row.get('j.segm', '') or ''),
                duplicate_count=1,
                _validate=False,
            )
        )
    return raw, LocusRepertoire(clonotypes=clonotypes, locus='TRB')


def annotate_repertoire_with_vdjdb(rep: LocusRepertoire, sample_id: str, vdjdb_df: pd.DataFrame, vdjdb_rep: LocusRepertoire) -> pd.DataFrame:
    seq_df = repertoire_to_sequence_table(rep, sample_id)
    rows = []
    for rec in seq_df.itertuples(index=False):
        hits = vdjdb_rep.trie.SearchIndices(
            query=rec.cdr3aa,
            maxSubstitution=1,
            maxInsertion=0,
            maxDeletion=0,
            maxEdits=1,
        )
        for idx, distance in hits:
            hit = vdjdb_df.iloc[int(idx)]
            rows.append(
                {
                    'cdr3aa': rec.cdr3aa,
                    'sample_id': sample_id,
                    'count': rec.count,
                    'freq': rec.freq,
                    'distance': int(distance),
                    'antigen.epitope': hit['antigen.epitope'],
                    'antigen.species': hit['antigen.species'],
                    'mhc.a': hit['mhc.a'],
                    'mhc.a.base': hit['mhc.a.base'],
                    'mhc.b': hit['mhc.b'],
                    'mhc.class': hit['mhc.class'],
                    'reference.id': hit['reference.id'],
                    'vdjdb.score': hit['vdjdb.score'],
                    'epi.name': hit['epi.name'],
                }
            )
    if not rows:
        return pd.DataFrame(columns=['cdr3aa', 'sample_id', 'count', 'freq', 'distance', 'antigen.epitope', 'antigen.species', 'mhc.a', 'mhc.a.base', 'mhc.b', 'mhc.class', 'reference.id', 'vdjdb.score', 'epi.name'])
    return pd.DataFrame(rows)


def quality_filter_annotations(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df.copy()
    df = (
        df.groupby(['cdr3aa', 'antigen.epitope', 'antigen.species', 'mhc.a', 'sample_id', 'vdjdb.score', 'reference.id'], as_index=False)
        .agg(count=('count', 'sum'), freq=('freq', 'sum'), mhc_b=('mhc.b', 'first'), mhc_class=('mhc.class', 'first'), mhc_a_base=('mhc.a.base', 'first'), epi_name=('epi.name', 'first'))
        .rename(columns={'mhc_b': 'mhc.b', 'mhc_class': 'mhc.class', 'mhc_a_base': 'mhc.a.base', 'epi_name': 'epi.name'})
    )
    good = df[['cdr3aa', 'antigen.epitope', 'mhc.a', 'vdjdb.score', 'reference.id']].drop_duplicates().copy()
    good['vdjdb.score.max'] = good.groupby('cdr3aa')['vdjdb.score'].transform('max')
    good = good[good['vdjdb.score'].eq(good['vdjdb.score.max'])].copy()
    good['num.pub'] = good['reference.id'].fillna('').str.count('PMID')
    good['num.pub.max'] = good.groupby('cdr3aa')['num.pub'].transform('max')
    good = good[good['num.pub'].eq(good['num.pub.max'])].copy()
    good['num.spec'] = good.groupby('cdr3aa')['antigen.epitope'].transform('nunique')
    good = good[good['num.spec'].eq(1)][['cdr3aa', 'antigen.epitope', 'mhc.a', 'vdjdb.score', 'reference.id']].drop_duplicates()
    return df.merge(good, on=['cdr3aa', 'antigen.epitope', 'mhc.a', 'vdjdb.score', 'reference.id'])


def wilcox_by_epitope(sample_df: pd.DataFrame, control_df: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for (sample_id, epi_name), group in sample_df.groupby(['sample_id', 'epi.name']):
        ctrl = control_df[control_df['sample_id'].eq(sample_id) & control_df['epi.name'].eq(epi_name)]['count']
        if ctrl.empty:
            ctrl = control_df[control_df['epi.name'].eq(epi_name)]['count']
        if group['count'].empty or ctrl.empty:
            continue
        stat = mannwhitneyu(group['count'], ctrl, alternative='two-sided')
        rows.append({'sample_id': sample_id, 'epi.name': epi_name, 'p.value': stat.pvalue})
    out = pd.DataFrame(rows)
    if not out.empty:
        out['p.value.adj'] = bh_adjust(out['p.value'])
    return out.sort_values('p.value') if not out.empty else out


def donor_filter(df: pd.DataFrame) -> pd.DataFrame:
    parts = []
    for sample_id, hlas in SAMPLE_HLAS.items():
        subset = df[df['sample_id'].eq(sample_id)].copy()
        subset = subset[subset['mhc.a.base'].isin(hlas)]
        if sample_id == 'CMV+':
            subset = subset[subset['antigen.species'].eq('CMV')]
        else:
            subset = subset[subset['antigen.species'].ne('CMV')]
        parts.append(subset)
    return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=df.columns)


def donor_control_filter(df_control: pd.DataFrame) -> pd.DataFrame:
    parts = []
    for sample_id, hlas in SAMPLE_HLAS.items():
        subset = df_control.copy()
        subset = subset[subset['mhc.a.base'].isin(hlas)].copy()
        if sample_id == 'CMV+':
            subset = subset[subset['antigen.species'].eq('CMV')]
        else:
            subset = subset[subset['antigen.species'].ne('CMV')]
        subset['sample_id'] = sample_id
        parts.append(subset)
    return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=df_control.columns)


def plot_boxpanels(sample_df: pd.DataFrame, control_df: pd.DataFrame, title: str) -> None:
    sample_ids = list(sample_df['sample_id'].drop_duplicates())
    species = sorted(sample_df['antigen.species'].dropna().unique())
    cmap = plt.get_cmap('Paired', max(1, len(species)))
    species_colors = {sp: cmap(i) for i, sp in enumerate(species)}
    fig, axes = plt.subplots(1, len(sample_ids), figsize=(7 * len(sample_ids), 6), squeeze=False)
    for ax, sample_id in zip(axes[0], sample_ids):
        sample_sub = sample_df[sample_df['sample_id'].eq(sample_id)].copy()
        control_sub = control_df[control_df['sample_id'].eq(sample_id)].copy()
        if control_sub.empty:
            control_sub = control_df.copy()
        order = (
            sample_sub.groupby('epi.name', as_index=False)
            .agg(freq=('freq', 'max'), antigen_species=('antigen.species', 'first'))
            .sort_values(['freq', 'antigen_species'])['epi.name']
            .tolist()
        )
        positions = np.arange(len(order), dtype=float)
        control_values = [control_sub[control_sub['epi.name'].eq(epi)]['count'].to_numpy() for epi in order]
        sample_values = [sample_sub[sample_sub['epi.name'].eq(epi)]['count'].to_numpy() for epi in order]
        bp_control = ax.boxplot(control_values, positions=positions - 0.18, widths=0.32, patch_artist=True, showfliers=False)
        for patch in bp_control['boxes']:
            patch.set(facecolor='lightgrey', edgecolor='grey')
        bp_sample = ax.boxplot(sample_values, positions=positions + 0.18, widths=0.32, patch_artist=True, showfliers=False)
        for patch, epi in zip(bp_sample['boxes'], order):
            species_name = sample_sub[sample_sub['epi.name'].eq(epi)]['antigen.species'].iloc[0]
            patch.set(facecolor=(1, 1, 1, 0), edgecolor=species_colors.get(species_name, 'black'), linewidth=1.5)
        ax.set_xticks(positions)
        ax.set_xticklabels(order, rotation=90)
        ax.set_yscale('log')
        ax.set_ylabel('# reads')
        ax.set_title(sample_id)
        ax.grid(False)
    fig.suptitle(title)
    plt.tight_layout()
    plt.show()


def tcrnet_table_with_counts(rep: LocusRepertoire, sample_id: str, result_table: pd.DataFrame) -> pd.DataFrame:
    counts = repertoire_to_sequence_table(rep, sample_id)[['cdr3aa', 'count', 'freq']].rename(columns={'cdr3aa': 'junction_aa'})
    # result_table is a Polars DataFrame from compute_tcrnet — convert to pandas
    tbl_pd = result_table.to_pandas() if hasattr(result_table, "to_pandas") else result_table
    table = tbl_pd.merge(counts, on='junction_aa', how='left')
    table['sample_id'] = sample_id
    table = table.rename(columns={'junction_aa': 'cdr3aa', 'fold_enrichment': 'fold'})
    table = table[table['count'].fillna(0).gt(1)].copy()
    table['p.adj'] = bh_adjust(table['p_value'])
    return table.sort_values(['p.adj', 'fold'], ascending=[True, False]).reset_index(drop=True)


def sequences_to_clonotypes(df: pd.DataFrame) -> list[Clonotype]:
    clonotypes = []
    for idx, row in enumerate(df.drop_duplicates('cdr3aa').itertuples(index=False)):
        clonotypes.append(
            Clonotype(
                sequence_id=str(idx),
                locus='TRB',
                junction_aa=row.cdr3aa,
                duplicate_count=int(row.count),
                v_gene=str(getattr(row, 'v_gene', '') or ''),
                j_gene=str(getattr(row, 'j_gene', '') or ''),
                _validate=False,
            )
        )
    return clonotypes


def collect_neighbor_sequences(rep: LocusRepertoire, query_sequences: list[str]) -> set[str]:
    found = set()
    for seq in query_sequences:
        hits = rep.trie.SearchIndices(seq, maxSubstitution=1, maxInsertion=0, maxDeletion=0, maxEdits=1)
        for idx, _ in hits:
            found.add(rep.clonotypes[int(idx)].junction_aa)
    return found


def layout_graph_df(node_df: pd.DataFrame, sample_id: str) -> pd.DataFrame:
    if node_df.empty:
        return pd.DataFrame(columns=['cdr3aa', 'x', 'y', 'cid', 'cid2', 'sample_id', 'freq', 'count'])
    graph = build_edit_distance_graph(sequences_to_clonotypes(node_df), metric='hamming', threshold=1, nproc=CORES)
    coords = np.asarray(graph.layout_graphopt(niter=3000, node_charge=0.005)) if graph.vcount() else np.empty((0, 2))
    membership = graph.components().membership if graph.vcount() else []
    layout = pd.DataFrame(
        {
            'cdr3aa': graph.vs['name'],
            'x': coords[:, 0] if len(coords) else [],
            'y': coords[:, 1] if len(coords) else [],
            'cid': membership,
            'sample_id': sample_id,
        }
    )
    layout['cid2'] = layout['sample_id'] + '_C' + layout['cid'].astype(str)
    return layout.merge(node_df[['cdr3aa', 'freq', 'count']].drop_duplicates(), on='cdr3aa', how='left')


def plot_graph_panels(df_layout: pd.DataFrame, color_by: str | None, title: str, annotate_only: bool = False) -> None:
    sample_ids = list(df_layout['sample_id'].drop_duplicates())
    fig, axes = plt.subplots(1, len(sample_ids), figsize=(7 * len(sample_ids), 6), squeeze=False)
    for ax, sample_id in zip(axes[0], sample_ids):
        sub = df_layout[df_layout['sample_id'].eq(sample_id)].copy()
        if sub.empty:
            continue
        sizes = np.sqrt(sub['freq'].fillna(0).to_numpy() + 1e-12) * 200
        if annotate_only:
            ax.scatter(sub['x'], sub['y'], s=np.maximum(sizes, 10), color='grey', alpha=0.35)
            colored = sub[sub[color_by].notna()].copy()
            if not colored.empty:
                for label, grp in colored.groupby(color_by):
                    ax.scatter(grp['x'], grp['y'], s=np.maximum(np.sqrt(grp['freq'].to_numpy() + 1e-12) * 220, 14), label=label, alpha=0.9)
                ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.22), ncol=2, fontsize=8)
        elif color_by is None:
            ax.scatter(sub['x'], sub['y'], s=np.maximum(sizes, 10), color='grey40', alpha=0.9)
        else:
            ax.scatter(sub['x'], sub['y'], s=np.maximum(sizes, 10), c=sub[color_by], cmap='Set1', alpha=0.9)
        ax.set_title(sample_id)
        ax.set_xlabel('MDS1')
        ax.set_ylabel('MDS2')
        ax.set_aspect('equal', adjustable='datalim')
        ax.grid(False)
    fig.suptitle(title)
    plt.tight_layout()
    plt.show()


def cluster_annotation_stats(df_layout_annot: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for sample_id, sample_df in df_layout_annot.groupby('sample_id'):
        total_sample = len(sample_df)
        epi_totals = sample_df.dropna(subset=['epi.name']).groupby('epi.name').size().to_dict()
        for (cid2, epi_name), group in sample_df.dropna(subset=['epi.name']).groupby(['cid2', 'epi.name']):
            total_cluster = int(sample_df[sample_df['cid2'].eq(cid2)].shape[0])
            total_epi = int(epi_totals[epi_name])
            count_matched = int(group.shape[0])
            expected_fraction = total_epi / total_sample if total_sample else 0.0
            test = binomtest(count_matched, total_cluster, expected_fraction)
            rows.append(
                {
                    'sample_id': sample_id,
                    'cid2': cid2,
                    'epi.name': epi_name,
                    'total.sample': total_sample,
                    'total.epi': total_epi,
                    'total.cluster': total_cluster,
                    'count.matched': count_matched,
                    'fraction.matched': count_matched / total_cluster if total_cluster else 0.0,
                    'fraction.matched.e': expected_fraction,
                    'p.value': test.pvalue,
                }
            )
    out = pd.DataFrame(rows)
    if not out.empty:
        out['p.value.adj'] = bh_adjust(out['p.value'])
    return out.sort_values('p.value') if not out.empty else out

# Publication-quality matplotlib style (Nature/Science aesthetics)
plt.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": ["Arial", "Helvetica", "DejaVu Sans"],
    "font.size": 10,
    "axes.titlesize": 11,
    "axes.labelsize": 10,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    "legend.fontsize": 9,
    "figure.dpi": 150,
    "savefig.dpi": 300,
    "axes.linewidth": 0.8,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "xtick.major.size": 3.5,
    "ytick.major.size": 3.5,
    "xtick.direction": "out",
    "ytick.direction": "out",
    "axes.grid": False,
})

Python 3.12.12
  mirpy-lib: 1.1.0
  numpy: 1.26.4
  pandas: 3.0.3
  matplotlib: 3.10.9
  scipy: 1.17.1
  polars: 1.40.1

/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

[2]:

def plot_boxpanels(sample_df: pd.DataFrame, control_df: pd.DataFrame, title: str) -> None:
    sample_ids = list(sample_df['sample_id'].drop_duplicates())
    species = sorted(sample_df['antigen.species'].dropna().unique())
    cmap = plt.get_cmap('Paired', max(1, len(species)))
    species_colors = {sp: cmap(i) for i, sp in enumerate(species)}
    fig, axes = plt.subplots(1, len(sample_ids), figsize=(8 * len(sample_ids), 10), squeeze=False)
    for ax, sample_id in zip(axes[0], sample_ids):
        sample_sub = sample_df[sample_df['sample_id'].eq(sample_id)].copy()
        control_sub = control_df[control_df['sample_id'].eq(sample_id)].copy()
        if control_sub.empty:
            control_sub = control_df.copy()
        order = (
            sample_sub.groupby('epi.name', as_index=False)
            .agg(freq=('freq', 'max'), antigen_species=('antigen.species', 'first'))
            .sort_values(['freq', 'antigen_species'], ascending=[True, True])['epi.name']
            .tolist()
        )
        positions = np.arange(len(order), dtype=float)
        control_values = [control_sub[control_sub['epi.name'].eq(epi)]['count'].to_numpy() for epi in order]
        sample_values = [sample_sub[sample_sub['epi.name'].eq(epi)]['count'].to_numpy() for epi in order]
        bp_control = ax.boxplot(control_values, positions=positions - 0.18, widths=0.32, patch_artist=True, showfliers=False, vert=False)
        for patch in bp_control['boxes']:
            patch.set(facecolor='lightgrey', edgecolor='grey')
        bp_sample = ax.boxplot(sample_values, positions=positions + 0.18, widths=0.32, patch_artist=True, showfliers=False, vert=False)
        for patch, epi in zip(bp_sample['boxes'], order):
            species_name = sample_sub[sample_sub['epi.name'].eq(epi)]['antigen.species'].iloc[0]
            patch.set(facecolor=(1, 1, 1, 0), edgecolor=species_colors.get(species_name, 'black'), linewidth=1.5)
        ax.set_yticks(positions)
        ax.set_yticklabels(order)
        ax.set_xscale('log')
        ax.set_xlabel('# reads')
        ax.set_title(sample_id)
        ax.grid(False)
    fig.suptitle(title)
    plt.tight_layout()
    plt.show()

[3]:

cmv_rep_full = load_trb(TCRNET_DIR / 'CMV+.txt.gz', 'CMV+')
b35_rep_full = load_trb(TCRNET_DIR / 'B35+.txt.gz', 'B35+')
control_rep_full = load_trb(TCRNET_DIR / 'control.txt.gz', 'control')

basic_stats = pd.DataFrame(
    [
        {'sample_id': 'control', 'n_clonotypes': len(control_rep_full.clonotypes), 'n_reads': control_rep_full.duplicate_count},
        {'sample_id': 'CMV+', 'n_clonotypes': len(cmv_rep_full.clonotypes), 'n_reads': cmv_rep_full.duplicate_count},
        {'sample_id': 'B35+', 'n_clonotypes': len(b35_rep_full.clonotypes), 'n_reads': b35_rep_full.duplicate_count},
    ]
)
basic_stats

[3]:

	sample_id	n_clonotypes	n_reads
0	control	873611	10376685
1	CMV+	147536	3094067
2	B35+	61368	865505

Explore VDJdb annotation results#

Load local VDJdb annotations, match repertoires with one substitution allowed, and apply the same ambiguity filtering logic as in the reference workflow.

[4]:

vdjdb_raw, vdjdb_rep = load_vdjdb_reference(VDJDB_PATH)

df_vdjdb = pd.concat(
    [
        annotate_repertoire_with_vdjdb(control_rep_full, 'control', vdjdb_raw, vdjdb_rep),
        annotate_repertoire_with_vdjdb(cmv_rep_full, 'CMV+', vdjdb_raw, vdjdb_rep),
        annotate_repertoire_with_vdjdb(b35_rep_full, 'B35+', vdjdb_raw, vdjdb_rep),
    ],
    ignore_index=True,
)

df_vdjdb = quality_filter_annotations(df_vdjdb)
df_vdjdb_control = df_vdjdb[df_vdjdb['sample_id'].eq('control')].copy()
df_vdjdb = df_vdjdb[df_vdjdb['sample_id'].ne('control')].copy()

df_vdjdb.head()

Skipping too long sequence CASSYSTVLHGCLLSAQKDHTGAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQRLGQGLEFLIYFQGNSAPDKSGLPSDRFSAERTGESVSTLTIQRTQQEDSAVYLCAAPSQGESDEQFF: length = 126

[4]:

	cdr3aa	antigen.epitope	antigen.species	mhc.a	sample_id	vdjdb.score	reference.id	count	freq	mhc.b	mhc.class	mhc.a.base	epi.name
0	CAAADANTGELFF	YLQPRTFLL	SARS-CoV-2	HLA-A*02:01	CMV+	0	PMID:33664060	3	9.695976e-07	B2M	MHCI	HLA-A*02	A*02 YLQ
9	CAAGGNQPQHF	NLVPMVATV	CMV	HLA-A*02:01	CMV+	0	PMID:39746936	34	1.098877e-05	B2M	MHCI	HLA-A*02	A*02 NLV
11	CAAGGSNQPQHF	FLYNLLTRV	HomoSapiens	HLA-A*02:01	B35+	0	PMID:25837513	20	2.310790e-05	B2M	MHCI	HLA-A*02	A*02 FLY
19	CAAGSSGANVLTF	GILGFVFTL	InfluenzaA	HLA-A*02	CMV+	0	PMID:28423320	13	4.201590e-06	B2M	MHCI	HLA-A*02	A*02 GIL
21	CAAGTGDSNQPQHF	EPLPQGQLTAY	EBV	HLA-B35:01,HLA-B35:42:01	B35+	3	PMID:16148129,PMID:17259989,PMID:23028307,PMID...	55	6.354672e-05	B2M	MHCI	HLA-B*35	B*35 EPL

Plot all VDJdb annotations#

Overlay sample-specific annotation count distributions against the control repertoire background.

[5]:

plot_boxpanels(df_vdjdb, df_vdjdb_control, 'Plot all VDJdb annotations')
wilcox_by_epitope(df_vdjdb, df_vdjdb_control)

../_images/notebooks_tcrnet_analysis_7_0.png

[5]:

	sample_id	epi.name	p.value	p.value.adj
182	CMV+	A*02 NLV	9.674646e-59	2.815322e-56
198	CMV+	A*03 KLG	2.029298e-56	2.952629e-54
37	B35+	A*02 NLV	1.564193e-53	1.517267e-51
25	B35+	A*02 GIL	1.283083e-31	9.334429e-30
169	CMV+	A*02 GIL	1.273074e-17	7.409290e-16
...	...	...	...	...
263	CMV+	DRA*01 QAR	9.602773e-01	9.736610e-01
43	B35+	A*02 RTA	9.812679e-01	9.914894e-01
152	CMV+	A*01 VTD	1.000000e+00	1.000000e+00
60	B35+	A*11 QVP	1.000000e+00	1.000000e+00
138	B35+	I-Ab GFS	1.000000e+00	1.000000e+00

291 rows × 4 columns

Zoom in and filter based on donor HLA haplotype knowledge#

For the CMV+ donor keep only CMV annotations compatible with donor HLA. For the B35+ donor keep only non-CMV annotations compatible with donor HLA.

[6]:

df_vdjdb_f = donor_filter(df_vdjdb)
df_vdjdb_c = donor_control_filter(df_vdjdb_control)

plot_boxpanels(df_vdjdb_f, df_vdjdb_c, 'Donor-HLA filtered VDJdb annotations')
wilcox_by_epitope(df_vdjdb_f, df_vdjdb_c)

../_images/notebooks_tcrnet_analysis_9_0.png

[6]:

	sample_id	epi.name	p.value	p.value.adj
3	CMV+	A*02 NLV	9.674646e-59	6.772252e-58
0	B35+	B*35 EPL	1.162456e-04	2.856093e-04
5	CMV+	B*07 TPR	1.224040e-04	2.856093e-04
2	CMV+	A*02 MLN	9.842690e-03	1.722471e-02
6	CMV+	B*44 NEG	2.828507e-02	3.959909e-02
1	B35+	B*35 HPV	3.344690e-01	3.902138e-01
4	CMV+	B*07 RPH	8.518887e-01	8.518887e-01

Search for expanded TCR groups with TCRNET#

Run TCRNET against the full control repertoire, adjust P-values, select enriched clonotypes, and compare fold enrichment with a synthetic control of size \(10^7\).

[7]:

cmv_tcrnet = compute_tcrnet(
    cmv_rep_full,
    control=control_rep_full,
    metric='hamming',
    threshold=1,
    match_mode='vj',
    pvalue_mode='binomial',
    n_jobs=CORES,
)
b35_tcrnet = compute_tcrnet(
    b35_rep_full,
    control=control_rep_full,
    metric='hamming',
    threshold=1,
    match_mode='vj',
    pvalue_mode='binomial',
    n_jobs=CORES,
)

df_tcrnet = pd.concat(
    [
        tcrnet_table_with_counts(cmv_rep_full, 'CMV+', cmv_tcrnet.table),
        tcrnet_table_with_counts(b35_rep_full, 'B35+', b35_tcrnet.table),
    ],
    ignore_index=True,
)
df_tcrnet_e = df_tcrnet[df_tcrnet['p.adj'].lt(0.05)].copy()
df_tcrnet_e.groupby('sample_id').agg(enriched_clonotypes=('cdr3aa', 'count'))

[7]:

	enriched_clonotypes
sample_id
B35+	371
CMV+	126

Synthetic-control comparison#

This is the only analysis block added on top of the reference workflow: compare fold enrichment from the full control repertoire against a synthetic control of size \(10^7\).

[8]:

from mir.common.control import ControlManager

[ ]:

mgr = ControlManager()

synth_specs = {
    'CMV+': {'rep': cmv_rep_full, 'seed': 42},
    'B35+': {'rep': b35_rep_full, 'seed': 43},
}

synth_tables: list[pd.DataFrame] = []
for sample_id, spec in synth_specs.items():
    synth_path = mgr.synthetic_control_path('human', 'TRB', 10_000_000)
    if not synth_path.exists():
        print(f"Synthetic control cache missing for n=10^7 at {synth_path}; skipping synthetic comparison for {sample_id}.")
        continue

    synth_result = compute_tcrnet(
        spec['rep'],
        control_type='synthetic',
        species='human',
        control_manager=mgr,
        control_kwargs={'n': 10_000_000, 'seed': spec['seed'], 'chunk_size': 100_000, 'progress': True},
        normalize_control_vj_usage=True,
        metric='hamming',
        threshold=1,
        match_mode='vj',
        pvalue_mode='binomial',
        n_jobs=CORES,
    )
    synth_tables.append(tcrnet_table_with_counts(spec['rep'], sample_id, synth_result.table))

if synth_tables:
    df_tcrnet_synth = pd.concat(synth_tables, ignore_index=True)

    comparison = (
        df_tcrnet[['sample_id', 'cdr3aa', 'fold', 'p.adj']]
        .rename(columns={'fold': 'fold_real', 'p.adj': 'p_real'})
        .merge(
            df_tcrnet_synth[['sample_id', 'cdr3aa', 'fold', 'p.adj']].rename(columns={'fold': 'fold_synth', 'p.adj': 'p_synth'}),
            on=['sample_id', 'cdr3aa'],
            how='inner',
        )
    )

    ann_cols = ['sample_id', 'cdr3aa', 'antigen.epitope', 'mhc.a', 'mhc.b']
    ann_available = [c for c in ann_cols if c in df_vdjdb_f.columns]
    if ann_available:
        comparison = comparison.merge(df_vdjdb_f[ann_available].drop_duplicates(), on=['sample_id', 'cdr3aa'], how='left')

    epi = comparison.get('antigen.epitope', pd.Series('', index=comparison.index)).fillna('')
    mhc_b = comparison.get('mhc.b', pd.Series('', index=comparison.index)).fillna('')

    comparison['highlight'] = np.select(
        [
            comparison['sample_id'].eq('B35+') & epi.str.contains('EPL', na=False),
            comparison['sample_id'].eq('CMV+') & (epi.str.contains('TPR', na=False) | mhc_b.str.contains(r'B\*07', na=False, regex=True)),
        ],
        ['B35 EBV epitope', 'B07 CMV epitope'],
        default='other',
    )

    fig, axes = plt.subplots(1, 2, figsize=(14, 5), squeeze=False)
    for ax, sid in zip(axes[0], ['CMV+', 'B35+']):
        sub = comparison[comparison['sample_id'].eq(sid)].copy()
        if sub.empty:
            ax.set_title(f"{sid} (no synthetic-control overlap)")
            ax.set_axis_off()
            continue

        base = sub[sub['highlight'].eq('other')]
        if not base.empty:
            ax.scatter(base['fold_real'], base['fold_synth'], s=12, alpha=0.45, color='lightgray', label='other')

        palette = {'B35 EBV epitope': '#d62728', 'B07 CMV epitope': '#1f77b4'}
        for label, color in palette.items():
            h = sub[sub['highlight'].eq(label)]
            if h.empty:
                continue
            ax.scatter(h['fold_real'], h['fold_synth'], s=28, alpha=0.95, color=color, label=label)

        ax.set_xscale('log')
        ax.set_yscale('log')
        ax.set_xlabel('Fold enrichment vs real control')
        ax.set_ylabel('Fold enrichment vs synthetic control (10^7)')
        ax.set_title(sid)
        ax.legend(loc='best', fontsize=8)

    plt.tight_layout()
    plt.show()
else:
    print('Synthetic-control comparison skipped: no cached 10^7 synthetic control available.')

Synthetic control cache missing for n=10^7 at /Users/mikesh/.cache/mirpy/controls/synthetic/human/TRB/olga_n10000000.pkl; skipping synthetic comparison for CMV+.
Synthetic control cache missing for n=10^7 at /Users/mikesh/.cache/mirpy/controls/synthetic/human/TRB/olga_n10000000.pkl; skipping synthetic comparison for B35+.
Synthetic-control comparison skipped: no cached 10^7 synthetic control available.

<>:54: SyntaxWarning: invalid escape sequence '\*'
<>:54: SyntaxWarning: invalid escape sequence '\*'
/var/folders/w1/pqrcnlxn3ss93t6764fdgp1c0000gn/T/ipykernel_61407/2650750415.py:54: SyntaxWarning: invalid escape sequence '\*'
  comparison['sample_id'].eq('CMV+') & (epi.str.contains('TPR', na=False) | mhc_b.str.contains('B\*07', na=False, regex=True)),

Correlation between enrichment fold and clonotype frequency#

Plot enriched clonotypes, coloring by adjusted significance.

[10]:

fig, axes = plt.subplots(1, 2, figsize=(14, 5), squeeze=False)
for ax, sample_id in zip(axes[0], ['CMV+', 'B35+']):
    sub = df_tcrnet_e[df_tcrnet_e['sample_id'].eq(sample_id)].copy()
    if sub.empty:
        continue
    scatter = ax.scatter(
        sub['freq'],
        sub['fold'],
        c=-np.log10(np.maximum(sub['p.adj'], 1e-300)),
        cmap='RdBu_r',
        s=20,
        alpha=0.8,
    )
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Frequency')
    ax.set_ylabel('Enrichment fold')
    ax.set_title(sample_id)
    plt.colorbar(scatter, ax=ax, label='-log10 adjusted P-value')
plt.tight_layout()
plt.show()

df_tcrnet_e.groupby('sample_id').apply(lambda x: pd.Series({'spearman_r': spearmanr(x['fold'], x['freq']).statistic, 'p_value': spearmanr(x['fold'], x['freq']).pvalue}))

../_images/notebooks_tcrnet_analysis_16_0.png

[10]:

	spearman_r	p_value
sample_id
B35+	0.013612	0.793849
CMV+	0.244769	0.005740

Extract enriched groups of homologous TCRs#

Compute Hamming-distance graphs for random top-3000 clonotypes and for the neighborhoods around enriched clonotypes.

[11]:

def layout_graph_df(node_df: pd.DataFrame, sample_id: str) -> pd.DataFrame:
    if node_df.empty:
        return pd.DataFrame(columns=['cdr3aa', 'x', 'y', 'cid', 'cid2', 'sample_id', 'freq', 'count'])
    graph = build_edit_distance_graph(sequences_to_clonotypes(node_df), metric='hamming', threshold=1, nproc=CORES)
    coords = np.asarray(graph.layout_graphopt(niter=3000, node_charge=0.005)) if graph.vcount() else np.empty((0, 2))
    membership = graph.components().membership if graph.vcount() else []
    layout = pd.DataFrame(
        {
            'cdr3aa': graph.vs['name'],
            'x': coords[:, 0] if len(coords) else [],
            'y': coords[:, 1] if len(coords) else [],
            'cid': membership,
            'sample_id': sample_id,
        }
    )
    layout['cid2'] = layout['sample_id'] + '_C' + layout['cid'].astype(str)
    return layout.merge(node_df[['cdr3aa', 'freq', 'count']].drop_duplicates(), on='cdr3aa', how='left')

[12]:

def plot_graph_panels(df_layout: pd.DataFrame, color_by: str | None, title: str, annotate_only: bool = False) -> None:
    sample_ids = list(df_layout['sample_id'].drop_duplicates())
    fig, axes = plt.subplots(1, len(sample_ids), figsize=(7 * len(sample_ids), 6), squeeze=False)
    for ax, sample_id in zip(axes[0], sample_ids):
        sub = df_layout[df_layout['sample_id'].eq(sample_id)].copy()
        if sub.empty:
            continue
        sizes = np.sqrt(sub['freq'].fillna(0).to_numpy() + 1e-12) * 200
        if annotate_only:
            ax.scatter(sub['x'], sub['y'], s=np.maximum(sizes, 10), color='lightgrey', alpha=0.35)
            colored = sub[sub[color_by].notna()].copy()
            if not colored.empty:
                for label, grp in colored.groupby(color_by):
                    ax.scatter(grp['x'], grp['y'], s=np.maximum(np.sqrt(grp['freq'].to_numpy() + 1e-12) * 220, 14), label=label, alpha=0.9)
                ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.22), ncol=2, fontsize=8)
        elif color_by is None:
            ax.scatter(sub['x'], sub['y'], s=np.maximum(sizes, 10), color='dimgray', alpha=0.9)
        else:
            ax.scatter(sub['x'], sub['y'], s=np.maximum(sizes, 10), c=sub[color_by], cmap='Set1', alpha=0.9)
        ax.set_title(sample_id)
        ax.set_xlabel('MDS1')
        ax.set_ylabel('MDS2')
        ax.set_aspect('equal', adjustable='datalim')
        ax.grid(False)
    fig.suptitle(title)
    plt.tight_layout()
    plt.show()

[13]:

sequence_tables = {
    'CMV+': repertoire_to_sequence_table(cmv_rep_full, 'CMV+'),
    'B35+': repertoire_to_sequence_table(b35_rep_full, 'B35+'),
}

random_layouts = []
enriched_layouts = []
for sample_id, rep in [('CMV+', cmv_rep_full), ('B35+', b35_rep_full)]:
    seq_table = sequence_tables[sample_id]
    rnd_nodes = (
        seq_table[~seq_table['cdr3aa'].str.contains(r'[_*]', regex=True, na=False)]
        .sort_values('count', ascending=False)
        .head(3000)
        .copy()
    )
    random_layouts.append(layout_graph_df(rnd_nodes, sample_id))

    enriched_seqs = df_tcrnet_e[df_tcrnet_e['sample_id'].eq(sample_id)]['cdr3aa'].drop_duplicates().tolist()
    neighbor_seqs = collect_neighbor_sequences(rep, enriched_seqs)
    enriched_nodes = seq_table[seq_table['cdr3aa'].isin(neighbor_seqs)].copy()
    enriched_layouts.append(layout_graph_df(enriched_nodes, sample_id))

df_mds_rnd = pd.concat(random_layouts, ignore_index=True)
df_mds = pd.concat(enriched_layouts, ignore_index=True)

df_mds[['sample_id', 'cid2']].drop_duplicates().groupby('sample_id').size().rename('n_components')

[13]:

sample_id
B35+    125
CMV+     43
Name: n_components, dtype: int64

Layout and plot graphs#

Show the random top-3000 graph in grey and the enriched-neighborhood graph colored by connected component.

[14]:

plot_graph_panels(df_mds_rnd, color_by=None, title='Random graph: top 3000 clonotypes')
plot_graph_panels(df_mds, color_by='cid', title='Enriched-clonotype neighborhoods colored by connected component')

../_images/notebooks_tcrnet_analysis_22_0.png

../_images/notebooks_tcrnet_analysis_22_1.png

Combine TCRNET results and VDJdb annotations#

Color enriched-graph nodes by donor-filtered VDJdb annotations and compute annotation enrichment for connected components.

[15]:

annot_cols = df_vdjdb_f[['cdr3aa', 'epi.name', 'sample_id', 'antigen.species']].drop_duplicates().copy()
annot_cols['epi.name'] = annot_cols['antigen.species'].astype(str) + ' ' + annot_cols['epi.name'].astype(str)
annot_cols = annot_cols[['cdr3aa', 'epi.name', 'sample_id']]

df_mds_ag_freq = df_mds.merge(annot_cols, on=['cdr3aa', 'sample_id'], how='left')
plot_graph_panels(df_mds_ag_freq, color_by='epi.name', title='Enriched graphs colored by donor-filtered VDJdb annotation', annotate_only=True)

cluster_annot_stats = cluster_annotation_stats(df_mds_ag_freq)
cluster_annot_stats.head(20)

../_images/notebooks_tcrnet_analysis_24_0.png

[15]:

	sample_id	cid2	epi.name	total.sample	total.epi	total.cluster	count.matched	fraction.matched	fraction.matched.e	p.value	p.value.adj
0	B35+	B35+_C0	EBV B*35 EPL	885	21	40	21	0.525000	0.023729	6.457089e-24	1.614272e-22
22	CMV+	CMV+_C7	CMV B*07 RPH	444	27	47	27	0.574468	0.060811	4.291258e-21	5.364072e-20
1	B35+	B35+_C27	EBV B*35 HPV	885	7	20	6	0.300000	0.007910	8.629575e-09	7.191312e-08
23	CMV+	CMV+_C7	CMV B*07 TPR	444	8	47	8	0.170213	0.018018	1.865743e-06	1.166089e-05
12	CMV+	CMV+_C26	CMV A*02 NLV	444	44	7	5	0.714286	0.099099	1.689672e-04	8.448358e-04
13	CMV+	CMV+_C27	CMV A*02 NLV	444	44	18	6	0.333333	0.099099	6.136393e-03	2.556830e-02
21	CMV+	CMV+_C42	CMV A*02 NLV	444	44	5	3	0.600000	0.099099	8.342831e-03	2.979582e-02
3	CMV+	CMV+_C10	CMV A*02 MLN	444	2	36	2	0.055556	0.004505	1.154791e-02	3.608723e-02
2	B35+	B35+_C92	EBV B*35 HPV	885	7	3	1	0.333333	0.007910	2.354162e-02	6.539340e-02
14	CMV+	CMV+_C29	CMV A*02 NLV	444	44	4	2	0.500000	0.099099	5.142740e-02	1.168804e-01
7	CMV+	CMV+_C2	CMV A*02 NLV	444	44	4	2	0.500000	0.099099	5.142740e-02	1.168804e-01
8	CMV+	CMV+_C20	CMV A*02 NLV	444	44	5	2	0.400000	0.099099	8.015044e-02	1.565211e-01
4	CMV+	CMV+_C11	CMV A*02 NLV	444	44	11	3	0.272727	0.099099	8.765183e-02	1.565211e-01
5	CMV+	CMV+_C15	CMV A*02 NLV	444	44	11	3	0.272727	0.099099	8.765183e-02	1.565211e-01
24	CMV+	CMV+_C8	CMV A*02 NLV	444	44	7	2	0.285714	0.099099	1.474646e-01	2.318540e-01
9	CMV+	CMV+_C21	CMV A*02 NLV	444	44	21	4	0.190476	0.099099	1.483865e-01	2.318540e-01
6	CMV+	CMV+_C17	CMV A*02 NLV	444	44	2	1	0.500000	0.099099	1.883776e-01	2.770258e-01
19	CMV+	CMV+_C36	CMV A*02 NLV	444	44	58	3	0.051724	0.099099	2.775169e-01	3.854402e-01
16	CMV+	CMV+_C30	CMV A*02 NLV	444	44	4	1	0.250000	0.099099	3.412690e-01	4.265863e-01
17	CMV+	CMV+_C32	CMV A*02 NLV	444	44	4	1	0.250000	0.099099	3.412690e-01	4.265863e-01

Interesting annotated clusters#

Inspect components with the strongest excess of donor-compatible VDJdb annotations and show the matched clonotypes.

[16]:

top_clusters = cluster_annot_stats[cluster_annot_stats['p.value.adj'].lt(0.05)].head(10).copy()
top_clusters

if not top_clusters.empty:
    interesting = df_mds_ag_freq[df_mds_ag_freq['cid2'].isin(top_clusters['cid2'])].copy()
    plot_graph_panels(interesting, color_by='epi.name', title='Significant annotated enriched clusters', annotate_only=True)
    display(
        interesting[
            ['sample_id', 'cid2', 'cdr3aa', 'count', 'freq', 'epi.name']
        ].sort_values(['sample_id', 'cid2', 'count'], ascending=[True, True, False]).head(50)
    )

../_images/notebooks_tcrnet_analysis_26_0.png

	sample_id	cid2	cdr3aa	count	freq	epi.name
448	B35+	B35+_C0	CAIGTGDSNQPQHF	13369	0.015446	EBV B*35 EPL
1189	B35+	B35+_C0	CATSTGDSNQPQHF	3165	0.003657	EBV B*35 EPL
1063	B35+	B35+_C0	CASSTGDSNQPQHF	2929	0.003384	EBV B*35 EPL
455	B35+	B35+_C0	CAISTGDSNQPQHF	2113	0.002441	EBV B*35 EPL
1171	B35+	B35+_C0	CATGTGDSNQPQHF	1886	0.002179	EBV B*35 EPL
465	B35+	B35+_C0	CASGTGDSNQPQHF	975	0.001127	EBV B*35 EPL
449	B35+	B35+_C0	CAIGTGDTNQPQHF	533	0.000616	EBV B*35 EPL
1062	B35+	B35+_C0	CASSTGDNNQPQHF	355	0.000410	EBV B*35 EPL
571	B35+	B35+_C0	CASSEPYSNQPQHF	232	0.000268	NaN
476	B35+	B35+_C0	CASNTGDSNQPQHF	206	0.000238	EBV B*35 EPL
1074	B35+	B35+_C0	CASSTRDSNQPQHF	135	0.000156	NaN
444	B35+	B35+_C0	CAAGTGDSNQPQHF	55	0.000064	EBV B*35 EPL
474	B35+	B35+_C0	CASNQPYSNQPQHF	52	0.000060	NaN
984	B35+	B35+_C0	CASSQPYSNQPQHF	41	0.000047	NaN
1192	B35+	B35+_C0	CATWTGDSNQPQHF	28	0.000032	EBV B*35 EPL
463	B35+	B35+_C0	CASGQPYSNQPQHF	24	0.000028	NaN
715	B35+	B35+_C0	CASSLGDSNQPQHF	24	0.000028	EBV B*35 EPL
1070	B35+	B35+_C0	CASSTGVDNQPQHF	24	0.000028	NaN
531	B35+	B35+_C0	CASSAGDSNQPQHF	21	0.000024	EBV B*35 EPL
894	B35+	B35+_C0	CASSPGYSNQPQHF	21	0.000024	NaN
979	B35+	B35+_C0	CASSQGYSNQPQHF	19	0.000022	NaN
1061	B35+	B35+_C0	CASSTGDGNQPQHF	19	0.000022	EBV B*35 EPL
1059	B35+	B35+_C0	CASSTGAGNQPQHF	18	0.000021	NaN
1165	B35+	B35+_C0	CASTQPYSNQPQHF	17	0.000020	NaN
473	B35+	B35+_C0	CASNEGDSNQPQHF	16	0.000018	NaN
530	B35+	B35+_C0	CASSAGDGNQPQHF	16	0.000018	NaN
593	B35+	B35+_C0	CASSFGDSNQPQHF	15	0.000017	EBV B*35 EPL
1068	B35+	B35+_C0	CASSTGTGNQPQHF	12	0.000014	NaN
1190	B35+	B35+_C0	CATSTGDVNQPQHF	12	0.000014	EBV B*35 EPL
446	B35+	B35+_C0	CAGGTGDSNQPQHF	11	0.000013	EBV B*35 EPL
838	B35+	B35+_C0	CASSNPYSNQPQHF	11	0.000013	NaN
1162	B35+	B35+_C0	CASTNPYSNQPQHF	10	0.000012	NaN
445	B35+	B35+_C0	CAGGQPYSNQPQHF	9	0.000010	NaN
1071	B35+	B35+_C0	CASSTGVSNQPQHF	7	0.000008	EBV B*35 EPL
1095	B35+	B35+_C0	CASSVGVSNQPQHF	7	0.000008	NaN
1060	B35+	B35+_C0	CASSTGASNQPQHF	6	0.000007	EBV B*35 EPL
1072	B35+	B35+_C0	CASSTGYSNQPQHF	6	0.000007	EBV B*35 EPL
635	B35+	B35+_C0	CASSHGYSNQPQHF	5	0.000006	NaN
1128	B35+	B35+_C0	CASSYGDNNQPQHF	5	0.000006	NaN
999	B35+	B35+_C0	CASSRGDSNQPQHF	3	0.000003	EBV B*35 EPL
1067	B35+	B35+_C27	CASSTGTGELFF	107	0.000124	NaN
541	B35+	B35+_C27	CASSARTGELFF	67	0.000077	EBV B*35 HPV
1100	B35+	B35+_C27	CASSVRTGELFF	63	0.000073	EBV B*35 HPV
1145	B35+	B35+_C27	CASSYGTGELFF	54	0.000062	NaN
1098	B35+	B35+_C27	CASSVNTGELFF	38	0.000044	NaN
598	B35+	B35+_C27	CASSFGTGELFF	32	0.000037	NaN
1042	B35+	B35+_C27	CASSSNTGELFF	25	0.000029	NaN
761	B35+	B35+_C27	CASSLGTGELFF	22	0.000025	NaN
973	B35+	B35+_C27	CASSQGTGELFF	22	0.000025	NaN
1002	B35+	B35+_C27	CASSRGTGELFF	20	0.000023	NaN