VDJBet YF Analysis (Rmd-aligned)#

Python rewrite of tmp/vdjbet_snippet.Rmd with matching analysis outputs:

  1. YF vs OLGA V-usage and V/J correction factors

  2. LLW reference and adjusted mock generation

  3. LLW Pgen histogram match against mock bins

  4. LLW overlap per sample: matched clonotypes and duplicate_count

  5. Cohen d, z-scores, empirical p-values, FDR

  6. Red line (LLW) + mock boxplots and Cohen d heatmaps

[1]:
# Remove stale control locks left by killed processes before any control builds.
import os, signal
from pathlib import Path

_locks_dir = Path.home() / '.cache' / 'mirpy' / 'controls' / '.locks'
if _locks_dir.exists():
    for _lf in sorted(_locks_dir.glob('*.lock')):
        try:
            _pid = int(_lf.read_text().split('pid=')[1].split()[0])
            try:
                os.kill(_pid, 0)   # process still alive — keep lock
            except (ProcessLookupError, PermissionError):
                _lf.unlink(missing_ok=True)
                print(f'Removed stale lock: {_lf.name} (pid={_pid} dead)')
        except Exception:
            pass
print('Lock check done.')

Lock check done.
[2]:
import importlib.metadata as _meta
import sys as _sys
print(f"Python {_sys.version.split()[0]}")
for _pkg in ["mirpy-lib", "numpy", "pandas", "matplotlib", "scipy", "polars"]:
    try:
        print(f"  {_pkg}: {_meta.version(_pkg)}")
    except _meta.PackageNotFoundError:
        pass

import math
import sys
import warnings
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

repo_root = Path.cwd().resolve().parent if Path.cwd().name == "notebooks" else Path.cwd().resolve()
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

from mir.comparative.vdjbet_workflow import (
    bh_fdr,
    build_real_control_analysis,
    build_synthetic_comparison,
    compute_bin_alignment_diagnostics,
    compute_olga_usage_adjustment,
    load_yfv_trb_samples,
    score_samples_dataframe,
)
from mir.common.parser import load_vdjdb_latest
from mir.utils.notebook_assets import ensure_airr_yfv19, notebook_large_assets_root

SEED = 42
N_MOCKS = 1000
POOL_SIZE = 100_000
OLGA_USAGE_N = 1_000_000
# count_rearrangement (default, unweighted) or count_duplicates (weighted by duplicate_count)
USAGE_COUNT_MODE = "count_rearrangement"
USAGE_PSEUDOCOUNT = 1.0
YFV_CACHE_DIRNAME = "pkl_trb_repertoires"

ASSET_ROOT = notebook_large_assets_root(repo_root)
YFV_DIR = ensure_airr_yfv19(repo_root)

print(f"YFV_DIR = {YFV_DIR}")
print(f"ASSET_ROOT = {ASSET_ROOT}")
print(f"Usage mode = {USAGE_COUNT_MODE}, pseudocount = {USAGE_PSEUDOCOUNT}")
print(f"OLGA usage cache size = {OLGA_USAGE_N:,}")
print(f"N_MOCKS = {N_MOCKS}")

# Publication-quality matplotlib style (Nature/Science aesthetics)
plt.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": ["Arial", "Helvetica", "DejaVu Sans"],
    "font.size": 10,
    "axes.titlesize": 11,
    "axes.labelsize": 10,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    "legend.fontsize": 9,
    "figure.dpi": 150,
    "savefig.dpi": 300,
    "axes.linewidth": 0.8,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "xtick.major.size": 3.5,
    "ytick.major.size": 3.5,
    "xtick.direction": "out",
    "ytick.direction": "out",
    "axes.grid": False,
})
Python 3.12.12
  mirpy-lib: 1.1.0
  numpy: 1.26.4
  pandas: 3.0.3
  matplotlib: 3.10.9
  scipy: 1.17.1
  polars: 1.40.1
/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
YFV_DIR = /Users/mikesh/vcs/mirpy/notebooks/assets/large/airr_yfv19
ASSET_ROOT = /Users/mikesh/vcs/mirpy/notebooks/assets/large
Usage mode = count_rearrangement, pseudocount = 1.0
OLGA usage cache size = 1,000,000
N_MOCKS = 1000

1. Load LLWNGPMAV TRB reference from VDJdb#

[3]:
vdjdb_rep = load_vdjdb_latest(
    epitope="LLWNGPMAV",
    locus="TRB",
    species="HomoSapiens",
    mhc_a_contains="A*02",
)
print(f"Reference clonotypes: {vdjdb_rep.clonotype_count}")
if vdjdb_rep.clonotype_count > 0:
    print(f"Example: {vdjdb_rep.clonotypes[0].junction_aa}  {vdjdb_rep.clonotypes[0].v_gene}  {vdjdb_rep.clonotypes[0].j_gene}")
else:
    print("Warning: no clonotypes returned for this epitope/allele in VDJdb. Downstream cells may fail.")
Downloading: https://github.com/antigenomics/vdjdb-db/releases/download/2026-05-16/vdjdb-2026-05-16.zip
LLWNGPMAV: 409 unique TRB clonotypes
Reference clonotypes: 409
Example: CAIQDAGASYEQYF  TRBV6-2*01  TRBJ2-7*01

2. Load YF samples#

[4]:
# Load YF TRB repertoires and build per-sample records for VDJBet.
warnings.filterwarnings("ignore", category=FutureWarning)

samples, yfv_gu = load_yfv_trb_samples(YFV_DIR)

print(f"Loaded TRB samples: {len(samples)}")
print(f"Total TRB clonotypes: {sum(s['repertoire'].clonotype_count for s in samples):,}")
print(f"Total TRB duplicates: {sum(s['repertoire'].duplicate_count for s in samples):,}")
Loaded TRB samples: 42
Total TRB clonotypes: 29,939,151
Total TRB duplicates: 56,732,204

3. OLGA usage and correction factors#

Compute OLGA and YF usage frequencies, then derive correction factors.

Frequencies are computed in GeneUsage with configurable count mode and Laplace smoothing:

  • mode count_rearrangement: count of unique clonotypes per key / total clonotypes

  • mode count_duplicates: sum duplicate_count per key / total duplicate_count

  • pseudocount 1 is added for both YF and OLGA sides

Formulas:

  • factor_v = P_YF(V) / P_OLGA(V)

  • factor_vj = P_YF(V,J) / P_OLGA(V,J)

These factors are used by PgenGeneUsageAdjustment.

[5]:
usage_result = compute_olga_usage_adjustment(
    yfv_gu,
    seed=SEED,
    n_jobs=8,
    olga_usage_n=OLGA_USAGE_N,
    count_mode=USAGE_COUNT_MODE,
    pseudocount=USAGE_PSEUDOCOUNT,
)

olga_model = usage_result.olga_model
olga_gu = usage_result.olga_usage
v_cmp = usage_result.v_cmp
vj_cmp = usage_result.vj_cmp
v_df = usage_result.v_df
if hasattr(v_df, "to_pandas"): v_df = v_df.to_pandas()
vj_df = usage_result.vj_df
if hasattr(vj_df, "to_pandas"): vj_df = vj_df.to_pandas()

# Keep OLGA-based adjustment for the synthetic-null comparison section.
pgen_adj_olga = usage_result.pgen_adj_olga

print("Top V genes by |log2 factor|:")
display(v_df.assign(abs_log2=lambda d: d["log2_factor_v"].abs()).sort_values("abs_log2", ascending=False).head(15))
print("Top VJ pairs by |log2 factor|:")
display(vj_df.assign(abs_log2=lambda d: d["log2_factor_vj"].abs()).sort_values("abs_log2", ascending=False).head(15))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

x = np.clip(v_df["p_yf"].values, 1e-12, None)
y = np.clip(v_df["p_olga"].values, 1e-12, None)
sc = axes[0].scatter(x, y, c=v_df["log2_factor_v"].values, cmap="RdBu_r", s=40, alpha=0.9)
axes[0].plot([x.min(), x.max()], [x.min(), x.max()], color="#666666", linestyle=":", linewidth=1)
axes[0].set_xscale("log")
axes[0].set_yscale("log")
axes[0].set_xlabel("P_YF(V)")
axes[0].set_ylabel("P_OLGA(V)")
axes[0].set_title(f"V usage frequencies: YF vs OLGA ({USAGE_COUNT_MODE}, n={OLGA_USAGE_N:,})")
axes[0].grid(alpha=0.2)
cb = plt.colorbar(sc, ax=axes[0], shrink=0.9)
cb.set_label("log2 correction factor")

top = v_df.assign(abs_log2=lambda d: d["log2_factor_v"].abs()).sort_values("abs_log2", ascending=False).head(20)
axes[1].barh(top["v_gene"], top["log2_factor_v"], color=["#c0392b" if z > 0 else "#2c3e50" for z in top["log2_factor_v"]])
axes[1].axvline(0, color="black", linewidth=1)
axes[1].set_title("Top V correction factors (log2)")
axes[1].set_xlabel("log2(P_YF(V) / P_OLGA(V))")

plt.tight_layout()
plt.show()

print(f"Zero-frequency OLGA V genes in comparison table: {(v_df['p_olga'] == 0).sum()}")
Top V genes by |log2 factor|:
v_gene p_yf p_olga factor_v log2_factor_v abs_log2
42 TRBV7-4 0.000514 0.012933 0.039769 -4.652226 4.652226
34 TRBV6-3 0.000602 0.011130 0.054079 -4.208799 4.208799
38 TRBV6-8 0.000327 0.002943 0.111047 -3.170758 3.170758
39 TRBV6-9 0.000210 0.001774 0.118152 -3.081285 3.081285
0 TRBV10-1 0.003474 0.027168 0.127878 -2.967164 2.967164
9 TRBV13 0.004602 0.033953 0.135543 -2.883182 2.883182
35 TRBV6-4 0.004885 0.035428 0.137887 -2.858442 2.858442
5 TRBV11-3 0.004155 0.020195 0.205763 -2.280945 2.280945
6 TRBV12-3 0.071966 0.016474 4.368434 2.127116 2.127116
8 TRBV12-5 0.003422 0.012931 0.264602 -1.918104 1.918104
41 TRBV7-3 0.013478 0.034757 0.387772 -1.366720 1.366720
12 TRBV16 0.000900 0.000352 2.557484 1.354725 1.354725
23 TRBV30 0.009252 0.021960 0.421335 -1.246962 1.246962
10 TRBV14 0.009560 0.022397 0.426836 -1.228247 1.228247
17 TRBV24-1 0.018288 0.042331 0.432027 -1.210806 1.210806
Top VJ pairs by |log2 factor|:
v_gene j_gene p_yf p_olga factor_vj log2_factor_vj abs_log2
565 TRBV7-4 TRBJ1-2 0.000024 0.001272 0.018877 -5.727231 5.727231
71 TRBV11-3 TRBJ1-5 0.000054 0.002093 0.025665 -5.284056 5.284056
575 TRBV7-4 TRBJ2-6 0.000006 0.000204 0.027196 -5.200448 5.200448
564 TRBV7-4 TRBJ1-1 0.000040 0.001453 0.027859 -5.165700 5.165700
473 TRBV6-4 TRBJ1-3 0.000030 0.001063 0.028051 -5.155828 5.155828
474 TRBV6-4 TRBJ1-4 0.000049 0.001615 0.030113 -5.053479 5.053479
567 TRBV7-4 TRBJ1-4 0.000019 0.000608 0.030893 -5.016563 5.016563
462 TRBV6-3 TRBJ1-5 0.000039 0.001222 0.032083 -4.962054 4.962054
569 TRBV7-4 TRBJ1-6 0.000015 0.000437 0.033728 -4.889917 4.889917
124 TRBV13 TRBJ1-5 0.000127 0.003639 0.034826 -4.843677 4.843677
568 TRBV7-4 TRBJ1-5 0.000049 0.001394 0.035315 -4.823591 4.823591
566 TRBV7-4 TRBJ1-3 0.000015 0.000423 0.035397 -4.820221 4.820221
483 TRBV6-4 TRBJ2-7 0.000276 0.007522 0.036708 -4.767773 4.767773
4 TRBV10-1 TRBJ1-5 0.000110 0.002953 0.037358 -4.742443 4.742443
467 TRBV6-3 TRBJ2-4 0.000009 0.000243 0.037686 -4.729843 4.729843
../_images/notebooks_vdjbet_yf_9_4.png
Zero-frequency OLGA V genes in comparison table: 0

Why real control?#

Using a real human TRB control repertoire instead of a fully synthetic OLGA null corrects for biases that OLGA does not model: sequencing depth artifacts, PCR amplification skew, and repertoire sampling effects.

For this section we also compute V/J adjustment against the real control distribution (YF / real-control usage), not YF / OLGA. OLGA-based adjustment is kept for the synthetic-null comparison in section 9.

The histogram above should show that the real-control mock bin distribution (blue dashes) is close to the LLW reference (red). A large discrepancy would indicate that Pgen or gene-usage adjustment requires further tuning.

After the mock key sets are built here (lazy, first .score() call), each subsequent .score() call reuses them — only the query-repertoire counting differs per sample.

[6]:
real_result = build_real_control_analysis(
    vdjdb_rep,
    yfv_gu,
    seed=SEED,
    count_mode=USAGE_COUNT_MODE,
    pseudocount=USAGE_PSEUDOCOUNT,
    pool_size=POOL_SIZE,
    n_mocks=N_MOCKS,
    n_jobs=8,
)

real_control_df = real_result.control_df
real_control_gu = real_result.control_usage
pgen_adj_real = real_result.pgen_adj_real
pool = real_result.pool
analysis = real_result.analysis
dt = real_result.elapsed_s
print(f"Pool built: n_generated={pool.n_generated:,}, bins={len(pool.bins)}, floor={pool.floor_bin}, ceil={pool.ceil_bin}, elapsed={dt:.1f}s")

diag = compute_bin_alignment_diagnostics(analysis)
all_bins = diag["all_bins"]
ref_vec = diag["ref_vec"]
mock_mat = diag["mock_mat"]
mock_mean = diag["mock_mean"]
mock_q10 = diag["mock_q10"]
mock_q90 = diag["mock_q90"]
max_abs_diff = diag["max_abs_diff"]
rmsd = diag["rmsd"]

fig, ax = plt.subplots(figsize=(12, 5))
ax.fill_between(all_bins, mock_q10, mock_q90, alpha=0.25, color="#1f77b4", label="Mock 10-90%", zorder=1)
ax.plot(all_bins, ref_vec, color="#d62728", marker="o", linewidth=2, label="LLW reference", zorder=2)
# Draw mock mean last so it remains visible even when overlapping the reference line.
ax.plot(all_bins, mock_mean, color="#1f77b4", linestyle="--", marker="s", markersize=4, linewidth=2.2, label="Mock mean", zorder=3)
ax.set_xlabel("log2 Pgen bin")
ax.set_ylabel("count")
ax.set_title("LLW reference vs real-control mock Pgen histogram")
ax.legend()
ax.grid(alpha=0.2)
plt.tight_layout()
plt.show()

print(f"Mock vs LLW: median max|Delta p(bin)|={np.median(max_abs_diff):.4f}, p95={np.percentile(max_abs_diff,95):.4f}")
print(f"Mock vs LLW: median RMSD={np.median(rmsd):.4f}, p95={np.percentile(rmsd,95):.4f}")
Pool built: n_generated=99,981, bins=49, floor=-66, ceil=-18, elapsed=96.8s
../_images/notebooks_vdjbet_yf_11_1.png
Mock vs LLW: median max|Delta p(bin)|=0.0000, p95=0.0000
Mock vs LLW: median RMSD=0.0000, p95=0.0000

5. LLW overlap per sample: matched clonotypes and duplicate_count#

Compute for each donor/day/replica:

  • real LLW overlap

  • mock distribution summary

  • z-score and empirical p-value

  • Cohen d

  • FDR-adjusted p-values

[7]:
df_res = score_samples_dataframe(analysis, samples, progress_every=10, sample_n_jobs=8)
# Convert Polars → pandas; downstream cells use pandas API (iterrows, pivot_table, etc.)
if hasattr(df_res, "to_pandas"):
    df_res = df_res.to_pandas()

display(df_res[[
    "donor", "replica", "day",
    "matched_n_real", "matched_dc_real", "matched_n_fraction", "matched_dc_fraction",
    "matched_n_mock_mean", "matched_n_z", "matched_n_p_emp", "matched_n_cohen_d",
    "matched_dc_log2_real", "matched_dc_log2_mock_mean", "matched_dc_log2_z", "matched_dc_log2_p_emp", "matched_dc_log2_cohen_d",
]])
Processed 10/42 samples
Processed 20/42 samples
Processed 30/42 samples
Processed 40/42 samples
donor replica day matched_n_real matched_dc_real matched_n_fraction matched_dc_fraction matched_n_mock_mean matched_n_z matched_n_p_emp matched_n_cohen_d matched_dc_log2_real matched_dc_log2_mock_mean matched_dc_log2_z matched_dc_log2_p_emp matched_dc_log2_cohen_d
0 P1 F1 -1 11.0 55.0 0.000017 0.000049 21.733 -2.528267 0.999001 -2.528267 5.807355 5.852050 -0.089607 0.525475 -0.089607
1 P1 F1 0 13.0 48.0 0.000022 0.000057 20.682 -1.913755 0.991009 -1.913755 5.614710 5.436454 0.391387 0.355644 0.391387
2 P1 F1 7 21.0 100.0 0.000025 0.000062 27.058 -1.306924 0.931069 -1.306924 6.658211 6.369355 0.581491 0.265734 0.581491
3 P1 F1 15 28.0 364.0 0.000031 0.000206 27.009 0.216564 0.444555 0.216564 8.511753 6.457772 3.232702 0.020979 3.232702
4 P1 F1 45 18.0 432.0 0.000030 0.000452 21.247 -0.767548 0.820180 -0.767548 8.758223 5.732641 5.786307 0.000999 5.786307
5 P1 F2 0 11.0 45.0 0.000021 0.000058 18.572 -1.881685 0.984016 -1.881685 5.523562 5.249873 0.548540 0.305694 0.548540
6 P2 F1 -1 20.0 61.0 0.000019 0.000028 31.375 -2.276396 0.995005 -2.276396 5.954196 6.817136 -1.359797 0.971029 -1.359797
7 P2 F1 0 16.0 41.0 0.000026 0.000042 21.875 -1.380711 0.945055 -1.380711 5.392317 5.736150 -0.618093 0.781219 -0.618093
8 P2 F1 7 29.0 88.0 0.000026 0.000038 30.789 -0.363601 0.673327 -0.363601 6.475733 6.795364 -0.496604 0.714286 -0.496604
9 P2 F1 15 34.0 278.0 0.000029 0.000128 32.486 0.296556 0.422577 0.296556 8.124121 6.744152 2.392145 0.025974 2.392145
10 P2 F1 45 22.0 95.0 0.000030 0.000074 24.309 -0.526232 0.734266 -0.526232 6.584963 6.083955 0.844796 0.125874 0.844796
11 P2 F2 -1 24.0 65.0 0.000025 0.000034 29.036 -1.046087 0.884116 -1.046087 6.044394 6.637700 -0.914796 0.881119 -0.914796
12 P2 F2 0 17.0 25.0 0.000027 0.000025 21.797 -1.133511 0.895105 -1.133511 4.700440 5.727567 -1.863944 0.993007 -1.863944
13 Q1 F1 -1 13.0 101.0 0.000033 0.000109 16.173 -0.863563 0.839161 -0.863563 6.672425 5.294234 1.811331 0.059940 1.811331
14 Q1 F1 0 10.0 58.0 0.000027 0.000072 15.908 -1.602554 0.972028 -1.602554 5.882643 4.994632 1.359688 0.090909 1.359688
15 Q1 F1 7 13.0 97.0 0.000029 0.000093 17.935 -1.259349 0.927073 -1.259349 6.614710 5.444051 1.619213 0.070929 1.619213
16 Q1 F1 15 23.0 237.0 0.000051 0.000281 18.464 1.144622 0.148851 1.144622 7.894818 5.328034 4.164496 0.000999 4.164496
17 Q1 F1 45 19.0 85.0 0.000043 0.000102 17.543 0.397169 0.393606 0.397169 6.426265 5.340650 1.636933 0.076923 1.636933
18 Q1 F2 0 13.0 60.0 0.000033 0.000065 16.339 -0.938175 0.857143 -0.938175 5.930737 5.106298 1.296140 0.098901 1.296140
19 Q2 F1 -1 16.0 43.0 0.000025 0.000028 24.353 -1.840431 0.985015 -1.840431 5.459432 6.213564 -1.023026 0.917083 -1.023026
20 Q2 F1 0 13.0 32.0 0.000019 0.000023 25.906 -2.846177 0.999001 -2.846177 5.044394 6.189693 -1.561570 0.986014 -1.561570
21 Q2 F1 7 18.0 35.0 0.000030 0.000020 22.224 -0.995003 0.863137 -0.995003 5.169925 6.432626 -1.424500 0.980020 -1.424500
22 Q2 F1 15 32.0 214.0 0.000053 0.000148 23.039 2.115927 0.030969 2.115927 7.748193 6.200783 2.005965 0.034965 2.005965
23 Q2 F1 45 22.0 48.0 0.000039 0.000033 22.172 -0.039240 0.547453 -0.039240 5.614710 6.157814 -0.565967 0.770230 -0.565967
24 Q2 F2 0 14.0 24.0 0.000020 0.000016 25.848 -2.562470 0.999001 -2.562470 4.643856 6.307501 -2.289550 1.000000 -2.289550
25 S1 F1 -1 16.0 47.0 0.000024 0.000042 25.112 -2.064365 0.988012 -2.064365 5.584963 6.137874 -0.956872 0.849151 -0.956872
26 S1 F1 0 20.0 66.0 0.000037 0.000071 22.855 -0.647971 0.772228 -0.647971 6.066089 5.916064 0.252305 0.357642 0.252305
27 S1 F1 7 21.0 32.0 0.000049 0.000049 19.118 0.476927 0.347652 0.476927 5.044394 5.401170 -0.611822 0.773227 -0.611822
28 S1 F1 15 42.0 814.0 0.000063 0.000761 25.978 3.463383 0.000999 3.463383 9.670656 6.058359 6.332003 0.000999 6.332003
29 S1 F1 45 28.0 368.0 0.000049 0.000378 22.422 1.370948 0.109890 1.370948 8.527477 5.825460 4.589465 0.003996 4.589465
30 S1 F2 -1 19.0 51.0 0.000030 0.000048 24.513 -1.238716 0.910090 -1.238716 5.700440 6.058294 -0.610654 0.753247 -0.610654
31 S1 F2 0 15.0 35.0 0.000032 0.000045 20.663 -1.341158 0.937063 -1.341158 5.169925 5.654028 -0.787073 0.816184 -0.787073
32 S2 F1 -1 27.0 77.0 0.000026 0.000036 35.406 -1.603117 0.958042 -1.603117 6.285402 7.130760 -1.703762 0.979021 -1.703762
33 S2 F1 0 21.0 54.0 0.000030 0.000040 27.917 -1.454915 0.953047 -1.454915 5.781360 6.427097 -1.334847 0.929071 -1.334847
34 S2 F1 7 23.0 73.0 0.000034 0.000050 25.855 -0.606896 0.755245 -0.606896 6.209453 6.482263 -0.503692 0.710290 -0.503692
35 S2 F1 15 39.0 471.0 0.000048 0.000292 30.088 1.865387 0.046953 1.865387 8.882643 6.664434 4.116877 0.005994 4.116877
36 S2 F1 45 30.0 67.0 0.000032 0.000039 32.719 -0.548777 0.726274 -0.548777 6.087463 6.771117 -1.319493 0.949051 -1.319493
37 S2 F2 -1 33.0 138.0 0.000029 0.000057 37.320 -0.800567 0.799201 -0.800567 7.118941 7.277645 -0.319066 0.615385 -0.319066
38 S2 F2 0 24.0 60.0 0.000034 0.000046 27.409 -0.700288 0.786214 -0.700288 5.930737 6.376660 -0.874301 0.837163 -0.874301
39 S2 F2 7 20.0 71.0 0.000028 0.000045 27.105 -1.461006 0.958042 -1.461006 6.169925 6.595491 -0.799844 0.808192 -0.799844
40 S2 F2 15 34.0 424.0 0.000043 0.000274 29.750 0.842204 0.218781 0.842204 8.731319 6.590215 3.941494 0.008991 3.941494
41 S2 F2 45 27.0 73.0 0.000031 0.000045 32.283 -1.063006 0.881119 -1.063006 6.209453 6.701391 -0.941058 0.867133 -0.941058

Reading the overlap results#

Four metrics are tracked per sample:

  • matched_n_real — number of unique LLW CDR3+V+J clonotypes found in the sample.

  • matched_dc_real — sum of duplicate_count over those matched clonotypes (read depth of the match).

  • matched_n_fractionmatched_n / n_total (fraction of unique clonotypes in the sample that are LLW).

  • matched_dc_fractionmatched_dc / dc_total (fraction of total read depth contributed by LLW matches).

Z-scores and Cohen d values are computed relative to the real-control mock null; positive values indicate enrichment above expectation.

[8]:
plot_rows = []
for _, r in df_res.iterrows():
    for x in r["mock_n"]:
        plot_rows.append({
            "donor": r["donor"], "replica": r["replica"], "day": int(r["day"]),
            "metric": "matched_n", "kind": "mock", "value": float(x),
        })
    plot_rows.append({
        "donor": r["donor"], "replica": r["replica"], "day": int(r["day"]),
        "metric": "matched_n", "kind": "real", "value": float(r["matched_n_real"]),
    })
    # Sum of duplicate_count for matched clonotypes (raw read depth of LLW signal)
    for x in r["mock_dc_log2"]:
        plot_rows.append({
            "donor": r["donor"], "replica": r["replica"], "day": int(r["day"]),
            "metric": "matched_dc_log2", "kind": "mock", "value": float(x),
        })
    plot_rows.append({
        "donor": r["donor"], "replica": r["replica"], "day": int(r["day"]),
        "metric": "matched_dc_log2", "kind": "real", "value": float(r["matched_dc_log2_real"]),
    })
    # Fraction of unique clonotypes in sample that are LLW-matched
    plot_rows.append({
        "donor": r["donor"], "replica": r["replica"], "day": int(r["day"]),
        "metric": "matched_n_fraction", "kind": "real", "value": float(r["matched_n_fraction"]),
    })
    # Fraction of total read depth contributed by LLW matches
    plot_rows.append({
        "donor": r["donor"], "replica": r["replica"], "day": int(r["day"]),
        "metric": "matched_dc_fraction", "kind": "real", "value": float(r["matched_dc_fraction"]),
    })

plot_df = pd.DataFrame(plot_rows)
days_all = sorted(df_res["day"].unique().tolist())
donors = sorted(df_res["donor"].unique().tolist())
replicas = sorted(df_res["replica"].unique().tolist())


def draw_panel(metric, ylabel, title, with_mock=True):
    """Draw a grid of per-donor/replica line (real) + optional boxplot (mock) subplots."""
    fig, axes = plt.subplots(
        len(replicas), len(donors),
        figsize=(4.0 * len(donors), 3.2 * len(replicas)),
        squeeze=False,
    )
    fig.suptitle(title, fontsize=13, y=1.02)
    for ri, rep in enumerate(replicas):
        for di, donor in enumerate(donors):
            ax = axes[ri, di]
            sub = plot_df[
                (plot_df["metric"] == metric)
                & (plot_df["donor"] == donor)
                & (plot_df["replica"] == rep)
            ]
            if sub.empty:
                ax.set_visible(False)
                continue

            real = sub[sub["kind"] == "real"].sort_values("day")
            if with_mock:
                mock = sub[sub["kind"] == "mock"]
                box_data = [mock[mock["day"] == d]["value"].values for d in days_all]
                width = 2.5
                ax.boxplot(
                    box_data,
                    positions=days_all,
                    widths=width,
                    patch_artist=True,
                    boxprops=dict(facecolor="#d0e4f7", alpha=0.85),
                    medianprops=dict(color="#1f77b4", linewidth=1.6),
                    flierprops=dict(markersize=2),
                    manage_ticks=False,
                )
            ax.plot(real["day"], real["value"], "-o", color="#d62728", linewidth=2, markersize=5, zorder=5)
            ax.set_xticks(days_all)
            ax.set_xlabel("Day")
            ax.set_ylabel(ylabel)
            ax.set_title(f"{donor} {rep}", fontsize=9)
            ax.grid(alpha=0.2)

    plt.tight_layout()
    plt.show()


draw_panel("matched_n", "Matched clonotypes", "LLW matched clonotypes: real (line) vs mock (boxplots)")
draw_panel("matched_dc_log2", "log2(matched duplicate_count + 1)", "LLW matched duplicate_count: real (line) vs mock (boxplots)")
draw_panel("matched_n_fraction", "Fraction of total clonotypes", "LLW matched fraction (clonotypes)", with_mock=False)
draw_panel("matched_dc_fraction", "Fraction of total duplicate_count", "LLW matched fraction (read depth)", with_mock=False)
../_images/notebooks_vdjbet_yf_15_0.png
../_images/notebooks_vdjbet_yf_15_1.png
../_images/notebooks_vdjbet_yf_15_2.png
../_images/notebooks_vdjbet_yf_15_3.png

7. Cohen d heatmaps#

[9]:
def heatmap_cohen(value_col, p_adj_col, title, cbar_label, cmap="RdBu_r", vlim=None):
    work = df_res.copy()
    work["sample"] = work["donor"] + " " + work["replica"]
    pv = work.pivot_table(index="sample", columns="day", values=value_col, aggfunc="first")
    pp = work.pivot_table(index="sample", columns="day", values=p_adj_col, aggfunc="first")

    mat = pv.values.astype(float)
    if vlim is None:
        vmax = max(1.0, np.nanpercentile(np.abs(mat), 95))
        vmin = -vmax
    else:
        vmin, vmax = vlim

    fig, ax = plt.subplots(figsize=(8, max(3.5, 0.6 * pv.shape[0])))
    im = ax.imshow(mat, cmap=cmap, aspect="auto", vmin=vmin, vmax=vmax, interpolation="nearest")
    cb = plt.colorbar(im, ax=ax, shrink=0.85)
    cb.set_label(cbar_label)

    for r in range(pv.shape[0]):
        for c in range(pv.shape[1]):
            p = pp.values[r, c]
            d = pv.values[r, c]
            if pd.notna(p) and pd.notna(d) and (float(p) < 0.10) and (float(d) > 0):
                ax.add_patch(plt.Rectangle((c - 0.5, r - 0.5), 1, 1, fill=False, edgecolor="black", linewidth=2))

    ax.set_xticks(range(len(pv.columns)))
    ax.set_xticklabels([f"Day {d}" for d in pv.columns], rotation=45, ha="right")
    ax.set_yticks(range(len(pv.index)))
    ax.set_yticklabels(pv.index)
    ax.set_title(title)
    plt.tight_layout()
    plt.show()

heatmap_cohen(
    value_col="matched_n_cohen_d",
    p_adj_col="matched_n_p_adj",
    title="Matched clonotypes Cohen d (LLW vs mock)",
    cbar_label="Cohen d (matched clonotypes)",
    cmap="RdBu_r",
)

heatmap_cohen(
    value_col="matched_dc_log2_cohen_d",
    p_adj_col="matched_dc_log2_p_adj",
    title="Matched duplicate_count Cohen d (log2, LLW vs mock)",
    cbar_label="Cohen d (log2 matched duplicate_count)",
    cmap="Reds",
    vlim=(0, max(1.0, np.nanpercentile(df_res["matched_dc_log2_cohen_d"].values, 95))),
)
../_images/notebooks_vdjbet_yf_17_0.png
../_images/notebooks_vdjbet_yf_17_1.png

8. Final summary tables#

[10]:
summary_cols = [
    "donor", "replica", "day",
    "n_total", "dc_total",
    "matched_n_real", "matched_dc_real", "matched_n_fraction", "matched_dc_fraction",
    "matched_n_mock_mean", "matched_n_mock_sd", "matched_n_cohen_d", "matched_n_z", "matched_n_p_emp", "matched_n_p_adj",
    "matched_dc_log2_real", "matched_dc_log2_mock_mean", "matched_dc_log2_mock_sd",
    "matched_dc_log2_cohen_d", "matched_dc_log2_z", "matched_dc_log2_p_emp", "matched_dc_log2_p_adj",
]
summary = df_res[summary_cols].copy()
for col in [
    "matched_n_real", "matched_dc_real",
    "matched_n_mock_mean", "matched_n_mock_sd", "matched_n_cohen_d", "matched_n_z",
    "matched_dc_log2_real", "matched_dc_log2_mock_mean", "matched_dc_log2_mock_sd", "matched_dc_log2_cohen_d", "matched_dc_log2_z",
]:
    summary[col] = summary[col].astype(float).round(3)
for col in ["matched_n_fraction", "matched_dc_fraction"]:
    summary[col] = summary[col].map(lambda x: f"{x:.5f}")
for col in ["matched_n_p_emp", "matched_n_p_adj", "matched_dc_log2_p_emp", "matched_dc_log2_p_adj"]:
    summary[col] = summary[col].map(lambda x: f"{x:.4f}")

display(summary.sort_values(["donor", "replica", "day"]).reset_index(drop=True))

print("Top positive matched clonotype effects by Cohen d:")
display(summary.sort_values("matched_n_cohen_d", ascending=False).head(12))

print("Top positive matched duplicate_count effects by Cohen d:")
display(summary.sort_values("matched_dc_log2_cohen_d", ascending=False).head(12))
donor replica day n_total dc_total matched_n_real matched_dc_real matched_n_fraction matched_dc_fraction matched_n_mock_mean ... matched_n_z matched_n_p_emp matched_n_p_adj matched_dc_log2_real matched_dc_log2_mock_mean matched_dc_log2_mock_sd matched_dc_log2_cohen_d matched_dc_log2_z matched_dc_log2_p_emp matched_dc_log2_p_adj
0 P1 F1 -1 640262 1113509 11.0 55.0 0.00002 0.00005 21.733 ... -2.528 0.9990 0.9990 5.807 5.852 0.499 -0.090 -0.090 0.5255 1.0000
1 P1 F1 0 590253 848323 13.0 48.0 0.00002 0.00006 20.682 ... -1.914 0.9910 0.9990 5.615 5.436 0.455 0.391 0.391 0.3556 0.7906
2 P1 F1 7 852394 1623860 21.0 100.0 0.00002 0.00006 27.058 ... -1.307 0.9311 0.9990 6.658 6.369 0.497 0.581 0.581 0.2657 0.6976
3 P1 F1 15 901914 1764263 28.0 364.0 0.00003 0.00021 27.009 ... 0.217 0.4446 0.9990 8.512 6.458 0.635 3.233 3.233 0.0210 0.1259
4 P1 F1 45 598677 956614 18.0 432.0 0.00003 0.00045 21.247 ... -0.768 0.8202 0.9990 8.758 5.733 0.523 5.786 5.786 0.0010 0.0140
5 P1 F2 0 534632 772165 11.0 45.0 0.00002 0.00006 18.572 ... -1.882 0.9840 0.9990 5.524 5.250 0.499 0.549 0.549 0.3057 0.7552
6 P2 F1 -1 1056617 2177498 20.0 61.0 0.00002 0.00003 31.375 ... -2.276 0.9950 0.9990 5.954 6.817 0.635 -1.360 -1.360 0.9710 1.0000
7 P2 F1 0 618627 974518 16.0 41.0 0.00003 0.00004 21.875 ... -1.381 0.9451 0.9990 5.392 5.736 0.556 -0.618 -0.618 0.7812 1.0000
8 P2 F1 7 1106211 2324263 29.0 88.0 0.00003 0.00004 30.789 ... -0.364 0.6733 0.9990 6.476 6.795 0.644 -0.497 -0.497 0.7143 1.0000
9 P2 F1 15 1175459 2165475 34.0 278.0 0.00003 0.00013 32.486 ... 0.297 0.4226 0.9990 8.124 6.744 0.577 2.392 2.392 0.0260 0.1364
10 P2 F1 45 743145 1284712 22.0 95.0 0.00003 0.00007 24.309 ... -0.526 0.7343 0.9990 6.585 6.084 0.593 0.845 0.845 0.1259 0.3524
11 P2 F2 -1 960830 1926603 24.0 65.0 0.00002 0.00003 29.036 ... -1.046 0.8841 0.9990 6.044 6.638 0.649 -0.915 -0.915 0.8811 1.0000
12 P2 F2 0 627140 1007567 17.0 25.0 0.00003 0.00002 21.797 ... -1.134 0.8951 0.9990 4.700 5.728 0.551 -1.864 -1.864 0.9930 1.0000
13 Q1 F1 -1 393851 930778 13.0 101.0 0.00003 0.00011 16.173 ... -0.864 0.8392 0.9990 6.672 5.294 0.761 1.811 1.811 0.0599 0.2517
14 Q1 F1 0 365046 801690 10.0 58.0 0.00003 0.00007 15.908 ... -1.603 0.9720 0.9990 5.883 4.995 0.653 1.360 1.360 0.0909 0.2937
15 Q1 F1 7 454574 1042775 13.0 97.0 0.00003 0.00009 17.935 ... -1.259 0.9271 0.9990 6.615 5.444 0.723 1.619 1.619 0.0709 0.2692
16 Q1 F1 15 447584 843549 23.0 237.0 0.00005 0.00028 18.464 ... 1.145 0.1489 0.9990 7.895 5.328 0.616 4.164 4.164 0.0010 0.0140
17 Q1 F1 45 442934 833618 19.0 85.0 0.00004 0.00010 17.543 ... 0.397 0.3936 0.9990 6.426 5.341 0.663 1.637 1.637 0.0769 0.2692
18 Q1 F2 0 395028 920754 13.0 60.0 0.00003 0.00007 16.339 ... -0.938 0.8571 0.9990 5.931 5.106 0.636 1.296 1.296 0.0989 0.2967
19 Q2 F1 -1 644960 1551950 16.0 43.0 0.00002 0.00003 24.353 ... -1.840 0.9850 0.9990 5.459 6.214 0.737 -1.023 -1.023 0.9171 1.0000
20 Q2 F1 0 688166 1391788 13.0 32.0 0.00002 0.00002 25.906 ... -2.846 0.9990 0.9990 5.044 6.190 0.733 -1.562 -1.562 0.9860 1.0000
21 Q2 F1 7 593659 1736828 18.0 35.0 0.00003 0.00002 22.224 ... -0.995 0.8631 0.9990 5.170 6.433 0.886 -1.424 -1.424 0.9800 1.0000
22 Q2 F1 15 607413 1449423 32.0 214.0 0.00005 0.00015 23.039 ... 2.116 0.0310 0.6503 7.748 6.201 0.771 2.006 2.006 0.0350 0.1632
23 Q2 F1 45 567178 1464089 22.0 48.0 0.00004 0.00003 22.172 ... -0.039 0.5475 0.9990 5.615 6.158 0.960 -0.566 -0.566 0.7702 1.0000
24 Q2 F2 0 704492 1481580 14.0 24.0 0.00002 0.00002 25.848 ... -2.562 0.9990 0.9990 4.644 6.308 0.727 -2.290 -2.290 1.0000 1.0000
25 S1 F1 -1 665282 1110908 16.0 47.0 0.00002 0.00004 25.112 ... -2.064 0.9880 0.9990 5.585 6.138 0.578 -0.957 -0.957 0.8492 1.0000
26 S1 F1 0 537799 925833 20.0 66.0 0.00004 0.00007 22.855 ... -0.648 0.7722 0.9990 6.066 5.916 0.595 0.252 0.252 0.3576 0.7906
27 S1 F1 7 431736 652745 21.0 32.0 0.00005 0.00005 19.118 ... 0.477 0.3477 0.9990 5.044 5.401 0.583 -0.612 -0.612 0.7732 1.0000
28 S1 F1 15 669972 1069012 42.0 814.0 0.00006 0.00076 25.978 ... 3.463 0.0010 0.0420 9.671 6.058 0.570 6.332 6.332 0.0010 0.0140
29 S1 F1 45 572955 972623 28.0 368.0 0.00005 0.00038 22.422 ... 1.371 0.1099 0.9990 8.527 5.825 0.589 4.589 4.589 0.0040 0.0420
30 S1 F2 -1 637371 1056200 19.0 51.0 0.00003 0.00005 24.513 ... -1.239 0.9101 0.9990 5.700 6.058 0.586 -0.611 -0.611 0.7532 1.0000
31 S1 F2 0 474096 779081 15.0 35.0 0.00003 0.00004 20.663 ... -1.341 0.9371 0.9990 5.170 5.654 0.615 -0.787 -0.787 0.8162 1.0000
32 S2 F1 -1 1051499 2154393 27.0 77.0 0.00003 0.00004 35.406 ... -1.603 0.9580 0.9990 6.285 7.131 0.496 -1.704 -1.704 0.9790 1.0000
33 S2 F1 0 711398 1356139 21.0 54.0 0.00003 0.00004 27.917 ... -1.455 0.9530 0.9990 5.781 6.427 0.484 -1.335 -1.335 0.9291 1.0000
34 S2 F1 7 667613 1458612 23.0 73.0 0.00003 0.00005 25.855 ... -0.607 0.7552 0.9990 6.209 6.482 0.542 -0.504 -0.504 0.7103 1.0000
35 S2 F1 15 821039 1611965 39.0 471.0 0.00005 0.00029 30.088 ... 1.865 0.0470 0.6573 8.883 6.664 0.539 4.117 4.117 0.0060 0.0503
36 S2 F1 45 927910 1731250 30.0 67.0 0.00003 0.00004 32.719 ... -0.549 0.7263 0.9990 6.087 6.771 0.518 -1.319 -1.319 0.9491 1.0000
37 S2 F2 -1 1142739 2426409 33.0 138.0 0.00003 0.00006 37.320 ... -0.801 0.7992 0.9990 7.119 7.278 0.497 -0.319 -0.319 0.6154 1.0000
38 S2 F2 0 706729 1293819 24.0 60.0 0.00003 0.00005 27.409 ... -0.700 0.7862 0.9990 5.931 6.377 0.510 -0.874 -0.874 0.8372 1.0000
39 S2 F2 7 710413 1595370 20.0 71.0 0.00003 0.00004 27.105 ... -1.461 0.9580 0.9990 6.170 6.595 0.532 -0.800 -0.800 0.8082 1.0000
40 S2 F2 15 786155 1544994 34.0 424.0 0.00004 0.00027 29.750 ... 0.842 0.2188 0.9990 8.731 6.590 0.543 3.941 3.941 0.0090 0.0629
41 S2 F2 45 871599 1604659 27.0 73.0 0.00003 0.00005 32.283 ... -1.063 0.8811 0.9990 6.209 6.701 0.523 -0.941 -0.941 0.8671 1.0000

42 rows × 22 columns

Top positive matched clonotype effects by Cohen d:
donor replica day n_total dc_total matched_n_real matched_dc_real matched_n_fraction matched_dc_fraction matched_n_mock_mean ... matched_n_z matched_n_p_emp matched_n_p_adj matched_dc_log2_real matched_dc_log2_mock_mean matched_dc_log2_mock_sd matched_dc_log2_cohen_d matched_dc_log2_z matched_dc_log2_p_emp matched_dc_log2_p_adj
28 S1 F1 15 669972 1069012 42.0 814.0 0.00006 0.00076 25.978 ... 3.463 0.0010 0.0420 9.671 6.058 0.570 6.332 6.332 0.0010 0.0140
22 Q2 F1 15 607413 1449423 32.0 214.0 0.00005 0.00015 23.039 ... 2.116 0.0310 0.6503 7.748 6.201 0.771 2.006 2.006 0.0350 0.1632
35 S2 F1 15 821039 1611965 39.0 471.0 0.00005 0.00029 30.088 ... 1.865 0.0470 0.6573 8.883 6.664 0.539 4.117 4.117 0.0060 0.0503
29 S1 F1 45 572955 972623 28.0 368.0 0.00005 0.00038 22.422 ... 1.371 0.1099 0.9990 8.527 5.825 0.589 4.589 4.589 0.0040 0.0420
16 Q1 F1 15 447584 843549 23.0 237.0 0.00005 0.00028 18.464 ... 1.145 0.1489 0.9990 7.895 5.328 0.616 4.164 4.164 0.0010 0.0140
40 S2 F2 15 786155 1544994 34.0 424.0 0.00004 0.00027 29.750 ... 0.842 0.2188 0.9990 8.731 6.590 0.543 3.941 3.941 0.0090 0.0629
27 S1 F1 7 431736 652745 21.0 32.0 0.00005 0.00005 19.118 ... 0.477 0.3477 0.9990 5.044 5.401 0.583 -0.612 -0.612 0.7732 1.0000
17 Q1 F1 45 442934 833618 19.0 85.0 0.00004 0.00010 17.543 ... 0.397 0.3936 0.9990 6.426 5.341 0.663 1.637 1.637 0.0769 0.2692
9 P2 F1 15 1175459 2165475 34.0 278.0 0.00003 0.00013 32.486 ... 0.297 0.4226 0.9990 8.124 6.744 0.577 2.392 2.392 0.0260 0.1364
3 P1 F1 15 901914 1764263 28.0 364.0 0.00003 0.00021 27.009 ... 0.217 0.4446 0.9990 8.512 6.458 0.635 3.233 3.233 0.0210 0.1259
23 Q2 F1 45 567178 1464089 22.0 48.0 0.00004 0.00003 22.172 ... -0.039 0.5475 0.9990 5.615 6.158 0.960 -0.566 -0.566 0.7702 1.0000
8 P2 F1 7 1106211 2324263 29.0 88.0 0.00003 0.00004 30.789 ... -0.364 0.6733 0.9990 6.476 6.795 0.644 -0.497 -0.497 0.7143 1.0000

12 rows × 22 columns

Top positive matched duplicate_count effects by Cohen d:
donor replica day n_total dc_total matched_n_real matched_dc_real matched_n_fraction matched_dc_fraction matched_n_mock_mean ... matched_n_z matched_n_p_emp matched_n_p_adj matched_dc_log2_real matched_dc_log2_mock_mean matched_dc_log2_mock_sd matched_dc_log2_cohen_d matched_dc_log2_z matched_dc_log2_p_emp matched_dc_log2_p_adj
28 S1 F1 15 669972 1069012 42.0 814.0 0.00006 0.00076 25.978 ... 3.463 0.0010 0.0420 9.671 6.058 0.570 6.332 6.332 0.0010 0.0140
4 P1 F1 45 598677 956614 18.0 432.0 0.00003 0.00045 21.247 ... -0.768 0.8202 0.9990 8.758 5.733 0.523 5.786 5.786 0.0010 0.0140
29 S1 F1 45 572955 972623 28.0 368.0 0.00005 0.00038 22.422 ... 1.371 0.1099 0.9990 8.527 5.825 0.589 4.589 4.589 0.0040 0.0420
16 Q1 F1 15 447584 843549 23.0 237.0 0.00005 0.00028 18.464 ... 1.145 0.1489 0.9990 7.895 5.328 0.616 4.164 4.164 0.0010 0.0140
35 S2 F1 15 821039 1611965 39.0 471.0 0.00005 0.00029 30.088 ... 1.865 0.0470 0.6573 8.883 6.664 0.539 4.117 4.117 0.0060 0.0503
40 S2 F2 15 786155 1544994 34.0 424.0 0.00004 0.00027 29.750 ... 0.842 0.2188 0.9990 8.731 6.590 0.543 3.941 3.941 0.0090 0.0629
3 P1 F1 15 901914 1764263 28.0 364.0 0.00003 0.00021 27.009 ... 0.217 0.4446 0.9990 8.512 6.458 0.635 3.233 3.233 0.0210 0.1259
9 P2 F1 15 1175459 2165475 34.0 278.0 0.00003 0.00013 32.486 ... 0.297 0.4226 0.9990 8.124 6.744 0.577 2.392 2.392 0.0260 0.1364
22 Q2 F1 15 607413 1449423 32.0 214.0 0.00005 0.00015 23.039 ... 2.116 0.0310 0.6503 7.748 6.201 0.771 2.006 2.006 0.0350 0.1632
13 Q1 F1 -1 393851 930778 13.0 101.0 0.00003 0.00011 16.173 ... -0.864 0.8392 0.9990 6.672 5.294 0.761 1.811 1.811 0.0599 0.2517
17 Q1 F1 45 442934 833618 19.0 85.0 0.00004 0.00010 17.543 ... 0.397 0.3936 0.9990 6.426 5.341 0.663 1.637 1.637 0.0769 0.2692
15 Q1 F1 7 454574 1042775 13.0 97.0 0.00003 0.00009 17.935 ... -1.259 0.9271 0.9990 6.615 5.444 0.723 1.619 1.619 0.0709 0.2692

12 rows × 22 columns

Notebook output coverage checklist:

  • V usage YF vs OLGA and correction factors

  • LLW reference vs mock Pgen histogram alignment

  • Matched clonotypes and duplicate_count per sample

  • Cohen d, z-scores, empirical p-values, FDR

  • Line + boxplot dynamics and Cohen d heatmaps

[11]:
print("Done: notebook rewritten to Rmd-aligned workflow.")
print(f"Samples: {len(samples)}, mocks: {N_MOCKS}, pool size: {POOL_SIZE:,}")
print("Use the summary table above for export/reporting.")
Done: notebook rewritten to Rmd-aligned workflow.
Samples: 42, mocks: 1000, pool size: 100,000
Use the summary table above for export/reporting.

9. Synthetic mock comparison and scale-factor calibration#

Sections 1–8 use a real human TRB control pool to drive the mock null distribution. This section builds a complementary synthetic OLGA pool and rescores all samples under that null, then computes a scale factor \(X\) aligning the two mean mock overlaps:

\[X = \frac{\bar{m}_{\text{real}}}{\bar{m}_{\text{synth}}}\]

where \(\bar{m}\) denotes the mean number of mocked matches across all samples. Three mock curves are overlaid per day:

  • mock real — null from the real-control pool (sections 1–8).

  • mock synthetic — null from OLGA synthetic sequences.

  • mock synthetic × X — synthetic rescaled to match the real-pool mean.

The scale factor quantifies how much OLGA under- or over-estimates real-sequence background overlap density. A value \(X > 1\) means the real control finds more background matches than OLGA predicts (typical due to empirical sequence clustering).

[12]:
import importlib
import mir.common.control as _ctrl_mod
import mir.comparative.vdjbet as _vdjbet_mod
importlib.reload(_ctrl_mod)
importlib.reload(_vdjbet_mod)
from mir.comparative.vdjbet import PgenBinPool, VDJBetOverlapAnalysis
print('Reloaded mir.common.control and mir.comparative.vdjbet from source')
Reloaded mir.common.control and mir.comparative.vdjbet from source
[13]:
df_res_real = df_res.copy()  # real-control null from sections 1-8

(
    pool_synth,
    analysis_synth,
    df_res_synth,
    X_scale,
    df_res_synth_scaled,
    ) = build_synthetic_comparison(
        vdjdb_rep,
        samples,
        pgen_adj_olga=pgen_adj_olga,
        pool_size=POOL_SIZE,
        n_mocks=N_MOCKS,
        n_jobs=8,
        seed=SEED,
        sample_n_jobs=8,
        df_res_real=df_res_real,
    )

# Convert Polars → pandas for downstream cells (build_synthetic_comparison returns pl.DataFrame)
if hasattr(df_res_synth, "to_pandas"):
    df_res_synth = df_res_synth.to_pandas()
if hasattr(df_res_synth_scaled, "to_pandas"):
    df_res_synth_scaled = df_res_synth_scaled.to_pandas()

print(f"Synthetic pool records used: {pool_synth.n_generated:,}")
print(f"Synthetic mock mean overlap: {df_res_synth['matched_n_mock_mean'].mean():.4f}")
print(f"Real-control mock mean overlap: {df_res_real['matched_n_mock_mean'].mean():.4f}")
print(f"Scale factor X (real/synthetic): {X_scale:.4f}")
Processed 10/42 samples
Processed 20/42 samples
Processed 30/42 samples
Processed 40/42 samples
Synthetic pool records used: 100,000
Synthetic mock mean overlap: 8.9107
Real-control mock mean overlap: 24.7711
Scale factor X (real/synthetic): 2.7799
[14]:
# Significant hits under real-control null (from sections 1-8).
sig_n_real = df_res_real[
    (df_res_real["matched_n_p_adj"] < 0.10) & (df_res_real["matched_n_cohen_d"] > 0)
]
sig_dc_real = df_res_real[
    (df_res_real["matched_dc_log2_p_adj"] < 0.10) & (df_res_real["matched_dc_log2_cohen_d"] > 0)
]


def _as_triplet_set(df):
    return {(r.donor, r.replica, int(r.day)) for r in df[["donor", "replica", "day"]].itertuples(index=False)}


def _as_label_list(df):
    return [
        f"{r.donor} {r.replica} day {int(r.day)}"
        for r in df[["donor", "replica", "day"]].itertuples(index=False)
    ]


# Expected significant sets based on the YFV19 Rmd analysis (day 15 peak responders).
expected_sig_n = {("S2", "F1", 15), ("S1", "F1", 15), ("Q2", "F1", 15), ("Q1", "F1", 15)}
expected_sig_dc = {
    ("S2", "F1", 15), ("S2", "F2", 15),
    ("S1", "F1", 15), ("S1", "F1", 45),
    ("Q1", "F1", 15),
    ("P2", "F1", 15),
    ("P1", "F1", 15), ("P1", "F1", 45),
}

set_sig_n = _as_triplet_set(sig_n_real)
set_sig_dc = _as_triplet_set(sig_dc_real)

print("Observed significant matched clonotypes (FDR<0.10, d>0):")
print(_as_label_list(sig_n_real))
print("Observed significant matched duplicate_count log2 (FDR<0.10, d>0):")
print(_as_label_list(sig_dc_real))

print("\nExpectation check: matched clonotypes")
print("match:", set_sig_n == expected_sig_n)
print("missing:", sorted(expected_sig_n - set_sig_n))
print("extra:", sorted(set_sig_n - expected_sig_n))

print("\nExpectation check: matched duplicate_count")
print("match:", set_sig_dc == expected_sig_dc)
print("missing:", sorted(expected_sig_dc - set_sig_dc))
print("extra:", sorted(set_sig_dc - expected_sig_dc))

print(f"\nScale factor X (real/synthetic): {X_scale:.3f}")
print("Observation 1: Real-control null captures empirical sequence clustering that OLGA misses.")
print(f"Observation 2: X = {X_scale:.3f} — synthetic mocks undercount overlap by this factor.")
print("Observation 3: Significant calls are exact junction_aa + V + J matching only.")
print("Observation 4: Compare listed observed sets with target day/sample expectations via missing/extra diagnostics.")
Observed significant matched clonotypes (FDR<0.10, d>0):
['S1 F1 day 15']
Observed significant matched duplicate_count log2 (FDR<0.10, d>0):
['P1 F1 day 45', 'Q1 F1 day 15', 'S1 F1 day 15', 'S1 F1 day 45', 'S2 F1 day 15', 'S2 F2 day 15']

Expectation check: matched clonotypes
match: False
missing: [('Q1', 'F1', 15), ('Q2', 'F1', 15), ('S2', 'F1', 15)]
extra: []

Expectation check: matched duplicate_count
match: False
missing: [('P1', 'F1', 15), ('P2', 'F1', 15)]
extra: []

Scale factor X (real/synthetic): 2.780
Observation 1: Real-control null captures empirical sequence clustering that OLGA misses.
Observation 2: X = 2.780 — synthetic mocks undercount overlap by this factor.
Observation 3: Significant calls are exact junction_aa + V + J matching only.
Observation 4: Compare listed observed sets with target day/sample expectations via missing/extra diagnostics.
[15]:
days_plot = sorted(df_res_real["day"].unique().tolist())


def _collect_mock_by_day(df_in, value_col="mock_n", transform=None):
    """Aggregate mock draws per day from a scored DataFrame."""
    out = {d: [] for d in days_plot}
    for _, row in df_in.iterrows():
        d = int(row["day"])
        vals = [float(x) for x in row[value_col]]
        if transform is not None:
            vals = [float(transform(v)) for v in vals]
        out[d].extend(vals)
    return out


real_mock_by_day = _collect_mock_by_day(df_res_real, "mock_n")
syn_mock_by_day = _collect_mock_by_day(df_res_synth, "mock_n")
syn_scaled_mock_by_day = _collect_mock_by_day(df_res_synth_scaled, "mock_n")

# Duplicate-count mock draws are stored as log2(mock_dc + 1). Convert back to raw counts.
_dc_from_log2 = lambda v: (2.0 ** float(v)) - 1.0
real_mock_dc_by_day = _collect_mock_by_day(df_res_real, "mock_dc_log2", transform=_dc_from_log2)
syn_mock_dc_by_day = _collect_mock_by_day(df_res_synth, "mock_dc_log2", transform=_dc_from_log2)
syn_scaled_mock_dc_by_day = _collect_mock_by_day(
    df_res_synth_scaled, "mock_dc_log2", transform=lambda v: _dc_from_log2(v) * X_scale
)

# Real-data mean overlap per day (from sections 1-8, real-control null).
real_mean_by_day = (
    df_res_real.groupby("day", as_index=False)["matched_n_real"].mean().sort_values("day")
)
real_dc_by_day = (
    df_res_real.groupby("day", as_index=False)["matched_dc_real"].mean().sort_values("day")
)
real_frac_n_by_day = (
    df_res_real.groupby("day", as_index=False)["matched_n_fraction"].mean().sort_values("day")
)
real_frac_dc_by_day = (
    df_res_real.groupby("day", as_index=False)["matched_dc_fraction"].mean().sort_values("day")
)

width = 1.5
offsets = [-1.8, 0.0, 1.8]
mock_specs = [
    ("mock real", real_mock_by_day, "#f6b26b", offsets[0]),
    ("mock synthetic", syn_mock_by_day, "#87b4e3", offsets[1]),
    (f"mock synthetic × {X_scale:.2f}", syn_scaled_mock_by_day, "#a4d4ae", offsets[2]),
]
mock_dc_specs = [
    ("mock real", real_mock_dc_by_day, "#f6b26b", offsets[0]),
    ("mock synthetic", syn_mock_dc_by_day, "#87b4e3", offsets[1]),
    (f"mock synthetic × {X_scale:.2f}", syn_scaled_mock_dc_by_day, "#a4d4ae", offsets[2]),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle("LLW overlap: real (line) vs mock null distributions (boxplots)", fontsize=14)

# ── Panel A: matched clonotype count ──────────────────────────────────────────
ax = axes[0, 0]
for label, by_day, color, off in mock_specs:
    data = [by_day[d] if len(by_day[d]) else [np.nan] for d in days_plot]
    pos = [d + off for d in days_plot]
    ax.boxplot(
        data, positions=pos, widths=width, patch_artist=True, manage_ticks=False,
        boxprops=dict(facecolor=color, alpha=0.55),
        medianprops=dict(color="#2f2f2f", linewidth=1.6),
        flierprops=dict(markersize=1.5),
    )
    ax.plot([], [], color=color, linewidth=8, alpha=0.55, label=label)
ax.plot(real_mean_by_day["day"], real_mean_by_day["matched_n_real"],
        "-o", color="#d62728", linewidth=2.2, markersize=5, label="real mean")
ax.set_xticks(days_plot)
ax.set_xlabel("Day")
ax.set_ylabel("Matched clonotypes")
ax.set_title("(A) Matched clonotype count")
ax.grid(alpha=0.2)
ax.legend(loc="upper right", fontsize=8)

# ── Panel B: sum of duplicate_count in matched clonotypes ─────────────────────
ax = axes[0, 1]
for label, by_day, color, off in mock_dc_specs:
    data_raw = [by_day[d] if len(by_day[d]) else [np.nan] for d in days_plot]
    # Use log2(x + 1) scale for duplicate-count panel.
    data = [np.log2(np.asarray(vals, dtype=float) + 1.0) for vals in data_raw]
    pos = [d + off for d in days_plot]
    ax.boxplot(
        data, positions=pos, widths=width, patch_artist=True, manage_ticks=False,
        boxprops=dict(facecolor=color, alpha=0.55),
        medianprops=dict(color="#2f2f2f", linewidth=1.6),
        flierprops=dict(markersize=1.5),
    )
    ax.plot([], [], color=color, linewidth=8, alpha=0.55, label=label)
ax.plot(
    real_dc_by_day["day"],
    np.log2(real_dc_by_day["matched_dc_real"].astype(float).values + 1.0),
    "-o", color="#d62728", linewidth=2.2, markersize=5, label="real mean"
)
ax.set_xticks(days_plot)
ax.set_xlabel("Day")
ax.set_ylabel("log2(sum of duplicate_count (matched) + 1)")
ax.set_title("(B) Sum of duplicate_count in matched clonotypes")
ax.grid(alpha=0.2)
ax.legend(loc="upper right", fontsize=8)

# ── Panel C: fraction of clonotypes (matched / total) ─────────────────────────
ax = axes[1, 0]
ax.plot(real_frac_n_by_day["day"], real_frac_n_by_day["matched_n_fraction"],
        "-o", color="#d62728", linewidth=2.2, markersize=5, label="real mean")
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.4%}"))
ax.set_xticks(days_plot)
ax.set_xlabel("Day")
ax.set_ylabel("Fraction of total clonotypes")
ax.set_title("(C) Matched clonotype fraction (matched / total clonotypes)")
ax.grid(alpha=0.2)
ax.legend()

# ── Panel D: fraction of duplicate_count (matched / total) ────────────────────
ax = axes[1, 1]
ax.plot(real_frac_dc_by_day["day"], real_frac_dc_by_day["matched_dc_fraction"],
        "-o", color="#d62728", linewidth=2.2, markersize=5, label="real mean")
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.4%}"))
ax.set_xticks(days_plot)
ax.set_xlabel("Day")
ax.set_ylabel("Fraction of total duplicate_count")
ax.set_title("(D) Matched read-depth fraction (matched / total duplicate_count)")
ax.grid(alpha=0.2)
ax.legend()

plt.tight_layout()
plt.show()
../_images/notebooks_vdjbet_yf_26_0.png