TCR Repertoire Aging Analysis#

Reproduces key overlap and aging trends using the Hugging Face AIRR benchmark:

  • Data source: isalgo/airr_benchmark (vdjtools/**, refreshed on run)

  • Includes full aging metadata with cord-blood donors (age = 0)

  • Pairwise overlap in exact, Hamming:1, and Hamming:1 modes

  • Diversity analysis from observed repertoires and 250k-read subsamples

  • Similarity-first analysis with distance metrics only where required

  • UMAP and MDS from pairwise distance metrics

Reference context: Britanova et al. 2016 (age-associated repertoire divergence).

[1]:
# Environment versions
import platform, importlib
print('Python:', platform.python_version())
for pkg in ('numpy', 'pandas', 'scipy', 'matplotlib', 'seaborn', 'umap', 'mir'):
    try:
        m = importlib.import_module(pkg if pkg != 'umap' else 'umap')
        ver = getattr(m, '__version__', '?')
        print(f'  {pkg}: {ver}')
    except ImportError:
        print(f'  {pkg}: NOT INSTALLED')
Python: 3.12.12
  numpy: 1.26.4
  pandas: 3.0.3
  scipy: 1.17.1
  matplotlib: 3.10.9
  seaborn: 0.13.2
/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
  umap: 0.5.12
  mir: ?
[2]:
import importlib
import os
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.manifold import MDS
from IPython.display import display

import mir.comparative.overlap as overlap_module
overlap_module = importlib.reload(overlap_module)

from mir.common.parser import VDJtoolsParser
from mir.common.repertoire import LocusRepertoire
from mir.common.sampling import downsample
from mir.comparative import clear_pairwise_target_cache
from mir.comparative.overlap import many_vs_pool_overlap, pairwise_overlap, pairwise_overlap_matrix
from mir.utils.memory_debug import process_memory_snapshot
from mir.utils.notebook_assets import ensure_airr_benchmark, find_repo_root

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

plt.rcParams.update({
    'figure.dpi': 150,
    'font.family': 'STIXGeneral',
    'axes.titlesize': 12,
    'axes.labelsize': 11,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.linewidth': 0.8,
    'grid.linewidth': 0.4,
})
sns.set_theme(style='ticks', context='paper')
PALETTE = {
    'exact': '#1B4332',
    'hamming:1': '#B56576',
}
rng = np.random.default_rng(42)

# Drop any previously retained overlap targets when re-running from a warm kernel.
clear_pairwise_target_cache()

EXECUTION_TIMES: dict[str, float] = {}
STRESS_SNAPSHOTS: list[dict[str, float | str]] = []

def _record_stress(tag: str) -> None:
    snap = process_memory_snapshot(tag)
    try:
        load1, load5, load15 = os.getloadavg()
    except OSError:
        load1, load5, load15 = np.nan, np.nan, np.nan
    STRESS_SNAPSHOTS.append({
        'tag': tag,
        'rss_gb': float(snap['rss_gb']),
        'avail_gb': float(snap['available_gb']),
        'cpu_percent_proc': np.nan,
        'load1': float(load1),
        'load5': float(load5),
        'load15': float(load15),
    })
[3]:
# Resolve repo root and refresh AIRR benchmark assets from Hugging Face
repo_root = find_repo_root(Path.cwd())

t0 = time.perf_counter()
benchmark_root = ensure_airr_benchmark(repo_root, allow_patterns=['vdjtools/**'])
asset_refresh_s = time.perf_counter() - t0

VDJTOOLS_DIR = benchmark_root / 'vdjtools'
META_PATH = VDJTOOLS_DIR / 'metadata_aging.txt'

if not META_PATH.exists():
    raise FileNotFoundError(f'Missing expected metadata file: {META_PATH}')

repertoire_lookup = {p.name: p for p in sorted(VDJTOOLS_DIR.glob('*.txt.gz'))}

EXECUTION_TIMES = {'asset_refresh_s': asset_refresh_s}

print(f'Benchmark root      : {benchmark_root}')
print(f'VDJtools directory  : {VDJTOOLS_DIR}')
print(f'Metadata file       : {META_PATH}')
print(f'Refresh time        : {asset_refresh_s:.2f}s')
print(f'Indexed repertoires : {len(repertoire_lookup):,}')
Benchmark root      : /Users/mikesh/vcs/mirpy/notebooks/assets/large/airr_benchmark
VDJtools directory  : /Users/mikesh/vcs/mirpy/notebooks/assets/large/airr_benchmark/vdjtools
Metadata file       : /Users/mikesh/vcs/mirpy/notebooks/assets/large/airr_benchmark/vdjtools/metadata_aging.txt
Refresh time        : 0.71s
Indexed repertoires : 316

1 · Load metadata and repertoires#

[4]:
# Load metadata (polars-first; pandas only for pretty display)
meta = pl.read_csv(META_PATH, separator='\t')

# Normalize column names from benchmark files (e.g., '#file_name').
rename_map = {}
for c in meta.columns:
    c_norm = c.lstrip('#')
    if c_norm != c:
        rename_map[c] = c_norm
if rename_map:
    meta = meta.rename(rename_map)

required_cols = {'sample_id', 'file_name', 'age'}
missing = required_cols - set(meta.columns)
if missing:
    raise ValueError(f'Metadata missing required columns: {sorted(missing)}; found {meta.columns}')

meta = meta.with_columns([
    pl.col('sample_id').cast(pl.Utf8),
    pl.col('file_name').cast(pl.Utf8),
    pl.col('age').cast(pl.Int64),
])

n_rows = meta.height
n_donors = meta.select(pl.col('sample_id').n_unique()).item()
age_min = meta.select(pl.col('age').min()).item()
age_max = meta.select(pl.col('age').max()).item()
n_cord = meta.filter(pl.col('age') == 0).height

print(f'Metadata rows          : {n_rows}')
print(f'Unique donors          : {n_donors}')
print(f'Age range (years)      : {age_min}{age_max}')
print(f'Cord-blood samples age0: {n_cord}')

meta.head(10).to_pandas()
Metadata rows          : 79
Unique donors          : 79
Age range (years)      : 0–103
Cord-blood samples age0: 8
[4]:
file_name sample_id sex age label
0 A3-i101.txt A3-i101 F 36 p1-F36
1 A3-i102.txt A3-i102 F 43 p2-F43
2 A3-i106.txt A3-i106 F 43 p3-F43
3 A3-i107.txt A3-i107 F 39 p4-F39
4 A3-i110.txt A3-i110 F 34 p5-F34
5 A3-i145.txt A3-i145 F 86 p6-F86
6 A3-i150.txt A3-i150 F 87 p7-F87
7 A3-i151.txt A3-i151 F 87 p7-F87
8 A2-i129.txt A2-i129 M 11 p9-M11
9 A2-i131.txt A2-i131 M 9 p10-M9
[5]:
# Load repertoires from metadata rows (TRB per VDJtools file)
parser = VDJtoolsParser(sep='\t')

reps: list[LocusRepertoire] = []
loaded_rows: list[dict[str, object]] = []
missing_rows: list[dict[str, object]] = []

def _resolve_repertoire_path(file_name: str) -> Path | None:
    candidates = [
        file_name,
        f'{file_name}.gz',
        file_name.replace('.txt', '.txt.gz'),
        file_name.replace('.gz', ''),
    ]
    for cand in candidates:
        p = repertoire_lookup.get(cand)
        if p is not None and p.exists():
            return p
    return None

t0 = time.perf_counter()
for row in meta.iter_rows(named=True):
    fn = str(row['file_name'])
    p = _resolve_repertoire_path(fn)
    if p is None:
        missing_rows.append({'file_name': fn, 'sample_id': row['sample_id'], 'age': row['age']})
        continue

    clones = parser.parse(str(p))
    reps.append(LocusRepertoire(clonotypes=clones, locus='TRB'))
    loaded_rows.append({
        'file_name': fn,
        'sample_id': row['sample_id'],
        'sex': row.get('sex', None),
        'age': int(row['age']),
        'source_path': str(p),
    })

# Downsample each repertoire to 100k duplicate_count for memory-safe overlap computation.
DOWNSAMPLE_TARGET = 100_000
reps = [
    downsample(r, DOWNSAMPLE_TARGET, random_seed=42) if r.clonotype_count > 0 else r
    for r in reps
]

load_repertoires_s = time.perf_counter() - t0
EXECUTION_TIMES['load_repertoires_s'] = load_repertoires_s

meta_valid = pl.DataFrame(loaded_rows)
if meta_valid.height == 0:
    raise RuntimeError('No repertoires were loaded. Check file name normalization against VDJtools paths.')

sample_ids = meta_valid['sample_id'].to_list()
ages = meta_valid['age'].to_list()
sizes = np.array([r.clonotype_count for r in reps], dtype=int)

# Cohort-level diversity: unique clonotypes across all donors in aavj space.
cohort_unique_aavj = len({(c.junction_aa, c.v_gene, c.j_gene) for rep in reps for c in rep.clonotypes if c.junction_aa})

print(f'Loaded repertoires : {len(reps)} in {load_repertoires_s:.2f}s')
print(f'Loaded donors      : {meta_valid.select(pl.col("sample_id").n_unique()).item()}')
print(f'Loaded age range   : {min(ages)}{max(ages)} years')
print(f'Clonotype count    : min={sizes.min():,} median={int(np.median(sizes)):,} max={sizes.max():,}')
print(f'Cohort unique aavj : {cohort_unique_aavj:,}')

if missing_rows:
    miss_df = pl.DataFrame(missing_rows)
    print(f'Missing files for metadata rows: {miss_df.height}')
    display(miss_df.head(10).to_pandas())
else:
    print('All metadata rows resolved to repertoire files.')

_record_stress('after_load_repertoires')
/Users/mikesh/vcs/mirpy/mir/common/sampling.py:69: UserWarning: downsample_count (100000) >= total duplicate count (50047); no downsampling performed
  warnings.warn(
/Users/mikesh/vcs/mirpy/mir/common/sampling.py:69: UserWarning: downsample_count (100000) >= total duplicate count (35571); no downsampling performed
  warnings.warn(
/Users/mikesh/vcs/mirpy/mir/common/sampling.py:69: UserWarning: downsample_count (100000) >= total duplicate count (78463); no downsampling performed
  warnings.warn(
Loaded repertoires : 79 in 96.25s
Loaded donors      : 79
Loaded age range   : 0–103 years
Clonotype count    : min=12,439 median=51,303 max=93,154
Cohort unique aavj : 3,683,755
All metadata rows resolved to repertoire files.

2 · Diversity and 300k-read subsampling#

[6]:
# Compute observed diversity and diversity after 50k-read subsampling (adjusted for 100k rep downsample)
DOWNSAMPLE_READS = 50_000

diversity_rows: list[dict[str, object]] = []
for i, (sid, age, rep) in enumerate(zip(sample_ids, ages, reps)):
    total_reads = int(sum(cl.duplicate_count for cl in rep.clonotypes))
    observed_diversity = int(rep.clonotype_count)

    if total_reads >= DOWNSAMPLE_READS:
        rep_ds = downsample(rep, DOWNSAMPLE_READS, random_seed=42 + i)
        diversity_50k = int(rep_ds.clonotype_count)
    else:
        diversity_50k = None

    diversity_rows.append({
        'sample_id': sid,
        'age': int(age),
        'total_reads': total_reads,
        'observed_diversity': observed_diversity,
        'diversity_50k': diversity_50k,
    })

diversity_df = pl.DataFrame(diversity_rows)
print(f'Samples with >= {DOWNSAMPLE_READS:,} reads: {diversity_df.filter(pl.col("diversity_50k").is_not_null()).height}/{diversity_df.height}')

diversity_df.sort('age').head(12).to_pandas()
Samples with >= 50,000 reads: 78/79
[6]:
sample_id age total_reads observed_diversity diversity_50k
0 A5-S14 0 100000 82370 36065
1 A5-S15 0 100000 66539 32831
2 A5-S16 0 100000 88303 37232
3 A5-S17 0 100000 92095 37961
4 A5-S18 0 100000 93154 38139
5 A5-S11 0 100000 89060 37480
6 A5-S12 0 100000 90139 37524
7 A5-S13 0 100000 91912 37973
8 A2-i132 6 100000 85227 35730
9 A2-i131 9 100000 71013 31271
10 A2-i136 10 100000 65385 28878
11 A2-i129 11 100000 80930 34457
[7]:
# Diversity at 50k reads versus age
plot_df = diversity_df.filter(pl.col('diversity_50k').is_not_null())

fig, ax = plt.subplots(figsize=(7.0, 4.4), constrained_layout=True)
sc = ax.scatter(
    plot_df['age'].to_numpy(),
    plot_df['diversity_50k'].to_numpy(),
    c=plot_df['total_reads'].to_numpy(),
    cmap='viridis',
    s=60,
    alpha=0.85,
    edgecolors='white',
    linewidths=0.25,
)

x = plot_df['age'].to_numpy().astype(float)
y = plot_df['diversity_50k'].to_numpy().astype(float)
mask = np.isfinite(x) & np.isfinite(y)
slope, intercept, r, p, _ = stats.linregress(x[mask], y[mask])
xr = np.linspace(x[mask].min(), x[mask].max(), 200)
ax.plot(xr, slope * xr + intercept, color='#C1121F', lw=2.0)

ax.set_xlabel('Age (years)')
ax.set_ylabel('Unique clonotypes in 50k-read subsample')
ax.set_title('Subsampled Diversity vs Age (50k)')
ax.text(
    0.03,
    0.96,
    f'$r={r:.3f}$\n$p={p:.2e}$',
    transform=ax.transAxes,
    ha='left',
    va='top',
    fontsize=10,
    bbox=dict(facecolor='white', alpha=0.75, edgecolor='none', pad=2.5),
)
plt.colorbar(sc, ax=ax, label='Total reads in original sample')
plt.show()
../_images/notebooks_aging_analysis_9_0.png

3 · Pairwise overlap matrices and metric conversion#

[8]:
# Compute pairwise overlap matrices directly (no pilot estimation), with immediate progress logs.
N_JOBS_MATRIX = 4  # safe: 100k-downsampled reps keep qi_list small enough for 4 spawn workers
OVERLAP_SPACE = 'aavj'
TARGET_CELL_SECONDS = 15 * 60

results: dict[str, pl.DataFrame] = {}
pairwise_timing_rows: list[dict[str, object]] = []

for mode_label, metric, threshold in [
    ('exact', 'exact', 0),
    ('hamming:1', 'hamming', 1),
]:
    print(f"[{mode_label}] starting pairwise matrix...")
    t0 = time.perf_counter()
    df_pd = pairwise_overlap_matrix(
        reps,
        sample_ids=sample_ids,
        metric=metric,
        threshold=threshold,
        overlap_space=OVERLAP_SPACE,
        n_jobs=N_JOBS_MATRIX,
    )
    elapsed = time.perf_counter() - t0

    df = pl.from_pandas(df_pd)
    results[mode_label] = df

    try:
        import psutil
        n_jobs_effective = psutil.cpu_count(logical=False) or os.cpu_count() or 1
    except Exception:
        n_jobs_effective = os.cpu_count() or 1

    pairwise_timing_rows.append({
        'mode': mode_label,
        'n_pairs': df.height,
        'seconds': elapsed,
        'n_jobs_requested': N_JOBS_MATRIX,
        'n_jobs_effective': int(n_jobs_effective),
        'within_target': elapsed <= TARGET_CELL_SECONDS,
    })

    status = 'OK' if elapsed <= TARGET_CELL_SECONDS else 'OVER_TARGET'
    print(
        f"[{mode_label}] done pairs={df.height} n_jobs={int(n_jobs_effective)} "
        f"time={elapsed:.2f}s [{status}]"
    )

pairwise_timing = pl.DataFrame(pairwise_timing_rows)
EXECUTION_TIMES['pairwise_total_s'] = float(pairwise_timing.select(pl.col('seconds').sum()).item())

print('\nPairwise timing summary:')
display(pairwise_timing.to_pandas())
_record_stress('after_pairwise_matrix')
[exact] starting pairwise matrix...
[exact] done pairs=3081 n_jobs=16 time=27.01s [OK]
[hamming:1] starting pairwise matrix...
[hamming:1] done pairs=3081 n_jobs=16 time=536.21s [OK]

Pairwise timing summary:
mode n_pairs seconds n_jobs_requested n_jobs_effective within_target
0 exact 3081 27.010256 4 16 True
1 hamming:1 3081 536.206774 4 16 True

4 · Pool sample and per-sample overlap with pool#

[9]:
# Build pooled repertoire by concatenating clonotypes from all donors.
all_clones = []
for rep in reps:
    all_clones.extend(rep.clonotypes)

pool_rep = LocusRepertoire(clonotypes=all_clones, locus='TRB')
print(f'Pooled repertoire: {pool_rep.clonotype_count:,} clonotypes (sum of all samples)')
Pooled repertoire: 4,145,989 clonotypes (sum of all samples)
[10]:
# Compute donor-vs-pool overlap with the shared-worker many-vs-pool API.
POOL_N_JOBS = 4  # limit to 4 to avoid memory exhaustion
POOL_BATCH_SIZE = max(1, len(reps) // (POOL_N_JOBS * 2))
pool_results: dict[str, pl.DataFrame] = {}
pool_timing_rows: list[dict[str, object]] = []

t0_all = time.perf_counter()
for mode_label, metric, threshold in [
    ('exact', 'exact', 0),
    ('hamming:1', 'hamming', 1),
]:
    print(f"[{mode_label}] donor-vs-pool starting...")
    t0 = time.perf_counter()

    df = many_vs_pool_overlap(
        reps,
        pool_rep,
        sample_ids=sample_ids,
        ages=ages,
        metric=metric,
        threshold=threshold,
        overlap_space=OVERLAP_SPACE,
        n_jobs=POOL_N_JOBS,
        batch_size=POOL_BATCH_SIZE,
    )

    elapsed = time.perf_counter() - t0
    pool_results[mode_label] = df
    pool_timing_rows.append({'mode': mode_label, 'n_samples': len(df), 'seconds': elapsed})
    print(f"[{mode_label}] donor-vs-pool done in {elapsed:.2f}s")

pool_timing = pl.DataFrame(pool_timing_rows)
EXECUTION_TIMES['pool_total_s'] = time.perf_counter() - t0_all

print('\nPool overlap timing summary:')
display(pool_timing.to_pandas())
_record_stress('after_pool_overlap')
[exact] donor-vs-pool starting...
[exact] donor-vs-pool done in 29.15s
[hamming:1] donor-vs-pool starting...
[hamming:1] donor-vs-pool done in 136.79s

Pool overlap timing summary:
mode n_samples seconds
0 exact 79 29.147094
1 hamming:1 79 136.785390

5 · F and D metrics vs absolute age difference#

[11]:
def _attach_pair_ages(df_pairs: pl.DataFrame, meta_df: pl.DataFrame) -> pl.DataFrame:
    """Attach age columns and absolute age difference to pairwise rows."""
    age_map = meta_df.select(['sample_id', 'age']).rename({'sample_id': 'sid'})

    out = (
        df_pairs
        .join(age_map.rename({'sid': 'sample_id_1', 'age': 'age1'}), on='sample_id_1', how='left')
        .join(age_map.rename({'sid': 'sample_id_2', 'age': 'age2'}), on='sample_id_2', how='left')
        .with_columns((pl.col('age1') - pl.col('age2')).abs().alias('age_diff'))
    )
    return out


def _outlier_threshold_iqr(values: np.ndarray, k: float = 3.0) -> float:
    q1 = np.nanpercentile(values, 25)
    q3 = np.nanpercentile(values, 75)
    iqr = q3 - q1
    return q3 + k * iqr
[12]:
# Pairwise similarity versus absolute age difference (exact mode),
# with distance-like metrics computed only for plotting.
df_exact_age = _attach_pair_ages(results['exact'], meta_valid)

fig, axes = plt.subplots(2, 2, figsize=(12, 7.2), sharex='col', constrained_layout=True)
metric_specs = [
    ('f_similarity', 'F similarity', lambda s: 1.0 - s),
    ('d_similarity', 'D similarity', lambda s: np.where(s > 0, 1.0 / s, np.nan)),
]

for col_i, (sim_col, label, to_metric) in enumerate(metric_specs):
    x = df_exact_age['age_diff'].to_numpy().astype(float)
    sim = df_exact_age[sim_col].to_numpy().astype(float)
    y = to_metric(sim)

    mask = np.isfinite(x) & np.isfinite(y)
    x = x[mask]
    y = y[mask]

    top_ax = axes[0, col_i]
    bot_ax = axes[1, col_i]

    out_cut = _outlier_threshold_iqr(y, k=3.0)
    out_mask = y > out_cut

    bot_ax.scatter(x[~out_mask], y[~out_mask], c=x[~out_mask], cmap='viridis', s=22, alpha=0.75, edgecolors='none')
    if out_mask.any():
        top_ax.scatter(x[out_mask], y[out_mask], c=x[out_mask], cmap='viridis', s=30, alpha=0.9, edgecolors='black', linewidths=0.25)
        top_ax.set_ylim(np.nanmin(y[out_mask]) * 0.98, np.nanmax(y[out_mask]) * 1.02)
    else:
        top_ax.scatter(x, y, c=x, cmap='viridis', s=22, alpha=0.75, edgecolors='none')

    bot_ax.set_ylim(np.nanmin(y[~out_mask]) * 0.98, np.nanmax(y[~out_mask]) * 1.02)

    slope, intercept, r, p, _ = stats.linregress(x, y)
    xr = np.linspace(np.nanmin(x), np.nanmax(x), 200)
    bot_ax.plot(xr, slope * xr + intercept, color='#C1121F', lw=2.0)

    top_ax.set_title(f'{label} metric')
    top_ax.text(
        0.03,
        0.92,
        f'outliers: {int(out_mask.sum())}',
        transform=top_ax.transAxes,
        ha='left',
        va='top',
        fontsize=9,
        bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=2),
    )
    bot_ax.text(
        0.03,
        0.96,
        f'$r={r:.3f}$\\n$p={p:.2e}$',
        transform=bot_ax.transAxes,
        ha='left',
        va='top',
        fontsize=9,
        bbox=dict(facecolor='white', alpha=0.75, edgecolor='none', pad=2.5),
    )

    top_ax.spines['bottom'].set_visible(False)
    bot_ax.spines['top'].set_visible(False)
    top_ax.tick_params(labelbottom=False)

    top_ax.set_ylabel('Metric')
    bot_ax.set_ylabel('Metric')
    bot_ax.set_xlabel('Absolute age difference (years)')

fig.suptitle('Pairwise Metrics vs Absolute Age Difference (Exact)', y=1.02, fontsize=13)
plt.show()

# Report strongest D-metric outliers for interpretation.
d_metric_exact = np.where(df_exact_age['d_similarity'].to_numpy().astype(float) > 0, 1.0 / df_exact_age['d_similarity'].to_numpy().astype(float), np.nan)
outlier_report = (
    df_exact_age
    .with_columns(pl.Series('d_metric', d_metric_exact))
    .with_columns((pl.col('d_metric') > _outlier_threshold_iqr(d_metric_exact[np.isfinite(d_metric_exact)], 3.0)).alias('d_outlier'))
    .filter(pl.col('d_outlier'))
    .select(['sample_id_1', 'sample_id_2', 'age1', 'age2', 'age_diff', 'd_similarity', 'f_similarity', 'd_metric'])
    .sort('d_metric', descending=True)
)
print(f'N pairs: {df_exact_age.height}')
if outlier_report.height:
    print('Top D-metric outliers (likely near-identical or technical replicates):')
    display(outlier_report.head(10).to_pandas())
../_images/notebooks_aging_analysis_17_0.png
N pairs: 3081
Top D-metric outliers (likely near-identical or technical replicates):
sample_id_1 sample_id_2 age1 age2 age_diff d_similarity f_similarity d_metric
0 A2-i138 A5-S1 74 103 29 0.001931 0.000694 517.802173
1 A4-i125 A5-S2 64 100 36 0.001938 0.000839 516.100178
2 A3-i151 A5-S1 87 103 16 0.001999 0.000853 500.248067
3 A3-i150 A5-S1 87 103 16 0.002036 0.000854 491.121421
4 A4-i125 A6-I210ob 64 92 28 0.002037 0.009138 491.013978
[13]:
# Donor-vs-pool similarity trends by age.
df_pool_exact = pool_results['exact']

fig, axes = plt.subplots(1, 2, figsize=(12, 4.8), constrained_layout=True)

for ax, (sim_col, label) in zip(axes, [
    ('f_similarity', 'F similarity (vs pool)'),
    ('d_similarity', 'D similarity (vs pool)'),
]):
    x = df_pool_exact['age'].to_numpy().astype(float)
    y = df_pool_exact[sim_col].to_numpy().astype(float)
    mask = np.isfinite(x) & np.isfinite(y)

    sc = ax.scatter(
        x[mask],
        y[mask],
        c=x[mask],
        cmap='viridis',
        s=58,
        alpha=0.85,
        edgecolors='white',
        linewidths=0.25,
        rasterized=True,
    )

    slope, intercept, r, p, _ = stats.linregress(x[mask], y[mask])
    xr = np.linspace(x[mask].min(), x[mask].max(), 200)
    ax.plot(xr, slope * xr + intercept, color='#C1121F', lw=2.0)

    ax.set_xlabel('Age (years)')
    ax.set_ylabel(label)
    ax.set_title(label)
    ax.text(
        0.03,
        0.96,
        f'$r={r:.3f}$\\n$p={p:.2e}$',
        transform=ax.transAxes,
        ha='left',
        va='top',
        fontsize=10,
        bbox=dict(facecolor='white', alpha=0.75, edgecolor='none', pad=2.5),
    )
    plt.colorbar(sc, ax=ax, label='Age')

fig.suptitle('Overlap With Cohort Pool Across Aging', y=1.02, fontsize=13)
plt.show()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:3641, in Index.get_loc(self, key)
   3640 try:
-> 3641     return self._engine.get_loc(casted_key)
   3642 except KeyError as err:

File pandas/_libs/index.pyx:168, in pandas._libs.index.IndexEngine.get_loc()
--> 168 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/index.pyx:197, in pandas._libs.index.IndexEngine.get_loc()
--> 197 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/hashtable_class_helper.pxi:7668, in pandas._libs.hashtable.PyObjectHashTable.get_item()
-> 7668 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/hashtable_class_helper.pxi:7676, in pandas._libs.hashtable.PyObjectHashTable.get_item()
-> 7676 'Could not get source, probably due dynamically evaluated source code.'

KeyError: 'f_similarity'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[13], line 11
      7     ('f_similarity', 'F similarity (vs pool)'),
      8     ('d_similarity', 'D similarity (vs pool)'),
      9 ]):
     10     x = df_pool_exact['age'].to_numpy().astype(float)
---> 11     y = df_pool_exact[sim_col].to_numpy().astype(float)
     12     mask = np.isfinite(x) & np.isfinite(y)
     13
     14     sc = ax.scatter(

File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/frame.py:4378, in DataFrame.__getitem__(self, key)
   4374
   4375         if is_single_key:
   4376             if self.columns.nlevels > 1:
   4377                 return self._getitem_multilevel(key)
-> 4378             indexer = self.columns.get_loc(key)
   4379             if is_integer(indexer):
   4380                 indexer = [indexer]
   4381         else:

File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:3648, in Index.get_loc(self, key)
   3643     if isinstance(casted_key, slice) or (
   3644         isinstance(casted_key, abc.Iterable)
   3645         and any(isinstance(x, slice) for x in casted_key)
   3646     ):
   3647         raise InvalidIndexError(key) from err
-> 3648     raise KeyError(key) from err
   3649 except TypeError:
   3650     # If we have a listlike key, _check_indexing_error will raise
   3651     #  InvalidIndexError. Otherwise we fall through and re-raise
   3652     #  the TypeError.
   3653     self._check_indexing_error(key)

KeyError: 'f_similarity'
../_images/notebooks_aging_analysis_18_1.png

6 · UMAP and MDS from pairwise distance metrics#

[14]:
def _to_symmetric_matrix(df_pairs: pl.DataFrame, col: str, ids: list[str]) -> np.ndarray:
    """Convert long-format pairwise table to symmetric NxN matrix."""
    n = len(ids)
    id_to_idx = {sid: i for i, sid in enumerate(ids)}
    mat = np.zeros((n, n), dtype=float)

    for row in df_pairs.select(['sample_id_1', 'sample_id_2', col]).iter_rows(named=True):
        i = id_to_idx[row['sample_id_1']]
        j = id_to_idx[row['sample_id_2']]
        v = float(row[col])
        mat[i, j] = v
        mat[j, i] = v
    return mat


def _f_metric_from_similarity(f_similarity: np.ndarray) -> np.ndarray:
    return 1.0 - f_similarity


def _d_metric_from_similarity(d_similarity: np.ndarray) -> np.ndarray:
    with np.errstate(divide='ignore', invalid='ignore'):
        d_metric = np.where(d_similarity > 0, 1.0 / d_similarity, np.nan)
    if np.isfinite(d_metric).any():
        max_finite = np.nanmax(d_metric[np.isfinite(d_metric)])
        d_metric = np.nan_to_num(d_metric, nan=max_finite)
    else:
        d_metric = np.zeros_like(d_similarity)
    return d_metric
[15]:
try:
    from umap import UMAP
    HAS_UMAP = True
except ImportError:
    HAS_UMAP = False
    print('umap-learn not installed — skipping UMAP. Install with: pip install umap-learn')
[16]:
if HAS_UMAP:
    # Build exact-mode distance matrices for embedding.
    f_similarity_exact = _to_symmetric_matrix(results['exact'], 'f_similarity', sample_ids)
    d_similarity_exact = _to_symmetric_matrix(results['exact'], 'd_similarity', sample_ids)

    f_metric_exact = _f_metric_from_similarity(f_similarity_exact)
    d_metric_exact = _d_metric_from_similarity(d_similarity_exact)

    np.fill_diagonal(f_metric_exact, 0.0)
    np.fill_diagonal(d_metric_exact, 0.0)

    umap_f = UMAP(
        n_components=2,
        metric='precomputed',
        n_neighbors=min(10, len(reps) - 1),
        min_dist=0.08,
        random_state=42,
    ).fit_transform(f_metric_exact)

    mds_f = MDS(
        n_components=2,
        dissimilarity='precomputed',
        normalized_stress='auto',
        random_state=42,
        n_init=4,
        max_iter=400,
    ).fit_transform(f_metric_exact)

    umap_d = UMAP(
        n_components=2,
        metric='precomputed',
        n_neighbors=min(10, len(reps) - 1),
        min_dist=0.08,
        random_state=42,
    ).fit_transform(d_metric_exact)

    mds_d = MDS(
        n_components=2,
        dissimilarity='precomputed',
        normalized_stress='auto',
        random_state=42,
        n_init=4,
        max_iter=400,
    ).fit_transform(d_metric_exact)

    EMBEDDINGS = {
        'UMAP (F metric)': umap_f,
        'MDS (F metric)': mds_f,
        'UMAP (D metric)': umap_d,
        'MDS (D metric)': mds_d,
    }
    print('Computed embeddings:', ', '.join(EMBEDDINGS.keys()))
/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/umap/umap_.py:1865: UserWarning: using precomputed metric; inverse_transform will be unavailable
  warn("using precomputed metric; inverse_transform will be unavailable")
/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
Computed embeddings: UMAP (F metric), MDS (F metric), UMAP (D metric), MDS (D metric)
/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/umap/umap_.py:1865: UserWarning: using precomputed metric; inverse_transform will be unavailable
  warn("using precomputed metric; inverse_transform will be unavailable")
/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
[17]:
if HAS_UMAP:
    fig, axes = plt.subplots(2, 2, figsize=(13.5, 10.2), constrained_layout=True)

    for ax, (title, emb) in zip(axes.ravel(), EMBEDDINGS.items()):
        sc = ax.scatter(
            emb[:, 0],
            emb[:, 1],
            c=np.array(ages, dtype=float),
            cmap='viridis',
            s=64,
            alpha=0.9,
            linewidths=0.25,
            edgecolors='white',
        )
        ax.set_xlabel('Component 1')
        ax.set_ylabel('Component 2')
        ax.set_title(title)
        plt.colorbar(sc, ax=ax, label='Age (years)')

    fig.suptitle('Aging Structure in Distance Embeddings', y=1.01, fontsize=13)
    plt.show()
../_images/notebooks_aging_analysis_23_0.png

6 · Compare exact vs Hamming:1#

[18]:
# Compare F and D metric distributions across overlap modes with fixed binning.

f_metric_by_mode = {
    mode_label: (1.0 - df['f_similarity']).drop_nulls().to_numpy()
    for mode_label, df in results.items()
}
d_metric_by_mode = {
    mode_label: pl.Series(
        np.where(df['d_similarity'].to_numpy() > 0, 1.0 / df['d_similarity'].to_numpy(), np.nan)
    ).drop_nulls().to_numpy()
    for mode_label, df in results.items()
}

def _fixed_edges(series_by_mode: dict[str, np.ndarray], n_bins: int = 36) -> np.ndarray:
    all_vals = np.concatenate([v for v in series_by_mode.values() if len(v) > 0])
    lo = float(np.nanmin(all_vals))
    hi = float(np.nanmax(all_vals))
    if hi <= lo:
        hi = lo + 1e-6
    return np.linspace(lo, hi, n_bins + 1)

f_edges = _fixed_edges(f_metric_by_mode, n_bins=36)
d_edges = _fixed_edges(d_metric_by_mode, n_bins=36)

fig, axes = plt.subplots(1, 2, figsize=(12.4, 4.4), constrained_layout=True)

for mode_label in ('exact', 'hamming:1'):
    axes[0].hist(
        f_metric_by_mode[mode_label],
        bins=f_edges,
        alpha=0.45,
        label=mode_label,
        color=PALETTE[mode_label],
        density=True,
        edgecolor='white',
        linewidth=0.35,
    )
    axes[1].hist(
        d_metric_by_mode[mode_label],
        bins=d_edges,
        alpha=0.45,
        label=mode_label,
        color=PALETTE[mode_label],
        density=True,
        edgecolor='white',
        linewidth=0.35,
    )

axes[0].set_xlabel('F metric (1 − F similarity)')
axes[0].set_ylabel('Density')
axes[0].set_title('F Metric Distribution')

axes[1].set_xlabel('D metric (1 / D similarity)')
axes[1].set_ylabel('Density')
axes[1].set_title('D Metric Distribution')

axes[0].legend(frameon=False, ncol=1, loc='upper right')
axes[1].legend(frameon=False, ncol=1, loc='upper right')

plt.show()
../_images/notebooks_aging_analysis_25_0.png
[19]:
# Similarity summary table (reported, not plotted): F, D, F2, Jaccard,
# Szymkiewicz-Simpson, Morisita-Horn.
summary_rows = []
for mode_label, df in results.items():
    for metric_col in (
        'f_similarity',
        'd_similarity',
        'f2_similarity',
        'jaccard',
        'szymkiewicz_simpson',
        'morisita_horn',
    ):
        vals = df.select(pl.col(metric_col)).drop_nulls().to_series()
        if vals.len() == 0:
            continue
        summary_rows.append({
            'mode': mode_label,
            'metric': metric_col,
            'mean': float(vals.mean()),
            'median': float(vals.median()),
            'std': float(vals.std()),
            'max': float(vals.max()),
        })

summary_df = pl.DataFrame(summary_rows).sort(['mode', 'metric'])
display(summary_df.to_pandas())
mode metric mean median std max
0 exact d_similarity 0.006742 0.005746 0.007321 0.263944
1 exact f2_similarity 0.008369 0.005099 0.024601 0.740629
2 exact f_similarity 0.016933 0.007574 0.036414 0.756360
3 exact jaccard 0.003259 0.002765 0.004036 0.152034
4 exact morisita_horn 0.002958 0.000028 0.037584 0.998456
5 exact szymkiewicz_simpson 0.008686 0.007572 0.007841 0.265452
6 hamming:1 d_similarity 0.081960 0.078491 0.024006 0.311358
7 hamming:1 f_similarity 0.095677 0.088380 0.047409 0.772454
8 hamming:1 jaccard 0.040991 0.038942 0.013729 0.184380
9 hamming:1 morisita_horn 0.010708 0.000313 0.071508 0.998890
10 hamming:1 szymkiewicz_simpson 0.097405 0.096444 0.024549 0.312361

7 · Age correlation summary#

[20]:
# Spearman correlation of similarity vs age for donor-vs-pool results.
print('\nPool-overlap: Spearman correlation with age')
print(f'{"Mode":<18} {"Similarity":<15} {"rho":>8} {"p-value":>12}')
print('-' * 60)

for mode_label, df in pool_results.items():
    for sim_col in ('f_similarity', 'd_similarity'):
        vals = df[sim_col].to_numpy().astype(float)
        age_vals = df['age'].to_numpy().astype(float)
        mask = np.isfinite(vals) & np.isfinite(age_vals)
        rho, p = stats.spearmanr(age_vals[mask], vals[mask])
        print(f'{mode_label:<18} {sim_col:<15} {rho:>8.3f} {p:>12.3e}')

_record_stress('final')

print('\nExecution timing (seconds):')
display(pd.DataFrame([EXECUTION_TIMES]).T.rename(columns={0: 'seconds'}))

stress_df = pd.DataFrame(STRESS_SNAPSHOTS)
if not stress_df.empty:
    print('\nStress diagnostics:')
    display(stress_df)
    print('Peak RSS (GB):', f"{stress_df['rss_gb'].max():.2f}")
    print('Min available RAM (GB):', f"{stress_df['avail_gb'].min():.2f}")
    print('Max process CPU% sample:', f"{stress_df['cpu_percent_proc'].max():.1f}")

Pool-overlap: Spearman correlation with age
Mode               Similarity           rho      p-value
------------------------------------------------------------
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:3641, in Index.get_loc(self, key)
   3640 try:
-> 3641     return self._engine.get_loc(casted_key)
   3642 except KeyError as err:

File pandas/_libs/index.pyx:168, in pandas._libs.index.IndexEngine.get_loc()
--> 168 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/index.pyx:197, in pandas._libs.index.IndexEngine.get_loc()
--> 197 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/hashtable_class_helper.pxi:7668, in pandas._libs.hashtable.PyObjectHashTable.get_item()
-> 7668 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/hashtable_class_helper.pxi:7676, in pandas._libs.hashtable.PyObjectHashTable.get_item()
-> 7676 'Could not get source, probably due dynamically evaluated source code.'

KeyError: 'f_similarity'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[20], line 8
      4 print('-' * 60)
      5
      6 for mode_label, df in pool_results.items():
      7     for sim_col in ('f_similarity', 'd_similarity'):
----> 8         vals = df[sim_col].to_numpy().astype(float)
      9         age_vals = df['age'].to_numpy().astype(float)
     10         mask = np.isfinite(vals) & np.isfinite(age_vals)
     11         rho, p = stats.spearmanr(age_vals[mask], vals[mask])

File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/frame.py:4378, in DataFrame.__getitem__(self, key)
   4374
   4375         if is_single_key:
   4376             if self.columns.nlevels > 1:
   4377                 return self._getitem_multilevel(key)
-> 4378             indexer = self.columns.get_loc(key)
   4379             if is_integer(indexer):
   4380                 indexer = [indexer]
   4381         else:

File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:3648, in Index.get_loc(self, key)
   3643     if isinstance(casted_key, slice) or (
   3644         isinstance(casted_key, abc.Iterable)
   3645         and any(isinstance(x, slice) for x in casted_key)
   3646     ):
   3647         raise InvalidIndexError(key) from err
-> 3648     raise KeyError(key) from err
   3649 except TypeError:
   3650     # If we have a listlike key, _check_indexing_error will raise
   3651     #  InvalidIndexError. Otherwise we fall through and re-raise
   3652     #  the TypeError.
   3653     self._check_indexing_error(key)

KeyError: 'f_similarity'