TCR Repertoire Aging Analysis#

Reproduces key overlap and aging trends using the Hugging Face AIRR benchmark:

Data source: isalgo/airr_benchmark (vdjtools/**, refreshed on run)
Includes full aging metadata with cord-blood donors (age = 0)
Pairwise overlap in exact, Hamming:1, and Hamming:1 modes
Diversity analysis from observed repertoires and 250k-read subsamples
Similarity-first analysis with distance metrics only where required
UMAP and MDS from pairwise distance metrics

Reference context: Britanova et al. 2016 (age-associated repertoire divergence).

[1]:

# Environment versions
import platform, importlib
print('Python:', platform.python_version())
for pkg in ('numpy', 'pandas', 'scipy', 'matplotlib', 'seaborn', 'umap', 'mir'):
    try:
        m = importlib.import_module(pkg if pkg != 'umap' else 'umap')
        ver = getattr(m, '__version__', '?')
        print(f'  {pkg}: {ver}')
    except ImportError:
        print(f'  {pkg}: NOT INSTALLED')

Python: 3.12.12
  numpy: 1.26.4
  pandas: 3.0.3
  scipy: 1.17.1
  matplotlib: 3.10.9
  seaborn: 0.13.2

/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

  umap: 0.5.12
  mir: ?

[2]:

import importlib
import os
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.manifold import MDS
from IPython.display import display

import mir.comparative.overlap as overlap_module
overlap_module = importlib.reload(overlap_module)

from mir.common.parser import VDJtoolsParser
from mir.common.repertoire import LocusRepertoire
from mir.common.sampling import downsample
from mir.comparative import clear_pairwise_target_cache
from mir.comparative.overlap import many_vs_pool_overlap, pairwise_overlap, pairwise_overlap_matrix
from mir.utils.memory_debug import process_memory_snapshot
from mir.utils.notebook_assets import ensure_airr_benchmark, find_repo_root

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

plt.rcParams.update({
    'figure.dpi': 150,
    'font.family': 'STIXGeneral',
    'axes.titlesize': 12,
    'axes.labelsize': 11,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.linewidth': 0.8,
    'grid.linewidth': 0.4,
})
sns.set_theme(style='ticks', context='paper')
PALETTE = {
    'exact': '#1B4332',
    'hamming:1': '#B56576',
}
rng = np.random.default_rng(42)

# Drop any previously retained overlap targets when re-running from a warm kernel.
clear_pairwise_target_cache()

EXECUTION_TIMES: dict[str, float] = {}
STRESS_SNAPSHOTS: list[dict[str, float | str]] = []

def _record_stress(tag: str) -> None:
    snap = process_memory_snapshot(tag)
    try:
        load1, load5, load15 = os.getloadavg()
    except OSError:
        load1, load5, load15 = np.nan, np.nan, np.nan
    STRESS_SNAPSHOTS.append({
        'tag': tag,
        'rss_gb': float(snap['rss_gb']),
        'avail_gb': float(snap['available_gb']),
        'cpu_percent_proc': np.nan,
        'load1': float(load1),
        'load5': float(load5),
        'load15': float(load15),
    })

[3]:

# Resolve repo root and refresh AIRR benchmark assets from Hugging Face
repo_root = find_repo_root(Path.cwd())

t0 = time.perf_counter()
benchmark_root = ensure_airr_benchmark(repo_root, allow_patterns=['vdjtools/**'])
asset_refresh_s = time.perf_counter() - t0

VDJTOOLS_DIR = benchmark_root / 'vdjtools'
META_PATH = VDJTOOLS_DIR / 'metadata_aging.txt'

if not META_PATH.exists():
    raise FileNotFoundError(f'Missing expected metadata file: {META_PATH}')

repertoire_lookup = {p.name: p for p in sorted(VDJTOOLS_DIR.glob('*.txt.gz'))}

EXECUTION_TIMES = {'asset_refresh_s': asset_refresh_s}

print(f'Benchmark root      : {benchmark_root}')
print(f'VDJtools directory  : {VDJTOOLS_DIR}')
print(f'Metadata file       : {META_PATH}')
print(f'Refresh time        : {asset_refresh_s:.2f}s')
print(f'Indexed repertoires : {len(repertoire_lookup):,}')

Benchmark root      : /Users/mikesh/vcs/mirpy/notebooks/assets/large/airr_benchmark
VDJtools directory  : /Users/mikesh/vcs/mirpy/notebooks/assets/large/airr_benchmark/vdjtools
Metadata file       : /Users/mikesh/vcs/mirpy/notebooks/assets/large/airr_benchmark/vdjtools/metadata_aging.txt
Refresh time        : 0.71s
Indexed repertoires : 316

1 · Load metadata and repertoires#

[4]:

# Load metadata (polars-first; pandas only for pretty display)
meta = pl.read_csv(META_PATH, separator='\t')

# Normalize column names from benchmark files (e.g., '#file_name').
rename_map = {}
for c in meta.columns:
    c_norm = c.lstrip('#')
    if c_norm != c:
        rename_map[c] = c_norm
if rename_map:
    meta = meta.rename(rename_map)

required_cols = {'sample_id', 'file_name', 'age'}
missing = required_cols - set(meta.columns)
if missing:
    raise ValueError(f'Metadata missing required columns: {sorted(missing)}; found {meta.columns}')

meta = meta.with_columns([
    pl.col('sample_id').cast(pl.Utf8),
    pl.col('file_name').cast(pl.Utf8),
    pl.col('age').cast(pl.Int64),
])

n_rows = meta.height
n_donors = meta.select(pl.col('sample_id').n_unique()).item()
age_min = meta.select(pl.col('age').min()).item()
age_max = meta.select(pl.col('age').max()).item()
n_cord = meta.filter(pl.col('age') == 0).height

print(f'Metadata rows          : {n_rows}')
print(f'Unique donors          : {n_donors}')
print(f'Age range (years)      : {age_min}–{age_max}')
print(f'Cord-blood samples age0: {n_cord}')

meta.head(10).to_pandas()

Metadata rows          : 79
Unique donors          : 79
Age range (years)      : 0–103
Cord-blood samples age0: 8

[4]:

	file_name	sample_id	sex	age	label
0	A3-i101.txt	A3-i101	F	36	p1-F36
1	A3-i102.txt	A3-i102	F	43	p2-F43
2	A3-i106.txt	A3-i106	F	43	p3-F43
3	A3-i107.txt	A3-i107	F	39	p4-F39
4	A3-i110.txt	A3-i110	F	34	p5-F34
5	A3-i145.txt	A3-i145	F	86	p6-F86
6	A3-i150.txt	A3-i150	F	87	p7-F87
7	A3-i151.txt	A3-i151	F	87	p7-F87
8	A2-i129.txt	A2-i129	M	11	p9-M11
9	A2-i131.txt	A2-i131	M	9	p10-M9

[5]:

# Load repertoires from metadata rows (TRB per VDJtools file)
parser = VDJtoolsParser(sep='\t')

reps: list[LocusRepertoire] = []
loaded_rows: list[dict[str, object]] = []
missing_rows: list[dict[str, object]] = []

def _resolve_repertoire_path(file_name: str) -> Path | None:
    candidates = [
        file_name,
        f'{file_name}.gz',
        file_name.replace('.txt', '.txt.gz'),
        file_name.replace('.gz', ''),
    ]
    for cand in candidates:
        p = repertoire_lookup.get(cand)
        if p is not None and p.exists():
            return p
    return None

t0 = time.perf_counter()
for row in meta.iter_rows(named=True):
    fn = str(row['file_name'])
    p = _resolve_repertoire_path(fn)
    if p is None:
        missing_rows.append({'file_name': fn, 'sample_id': row['sample_id'], 'age': row['age']})
        continue

    clones = parser.parse(str(p))
    reps.append(LocusRepertoire(clonotypes=clones, locus='TRB'))
    loaded_rows.append({
        'file_name': fn,
        'sample_id': row['sample_id'],
        'sex': row.get('sex', None),
        'age': int(row['age']),
        'source_path': str(p),
    })

# Downsample each repertoire to 100k duplicate_count for memory-safe overlap computation.
DOWNSAMPLE_TARGET = 100_000
reps = [
    downsample(r, DOWNSAMPLE_TARGET, random_seed=42) if r.clonotype_count > 0 else r
    for r in reps
]

load_repertoires_s = time.perf_counter() - t0
EXECUTION_TIMES['load_repertoires_s'] = load_repertoires_s

meta_valid = pl.DataFrame(loaded_rows)
if meta_valid.height == 0:
    raise RuntimeError('No repertoires were loaded. Check file name normalization against VDJtools paths.')

sample_ids = meta_valid['sample_id'].to_list()
ages = meta_valid['age'].to_list()
sizes = np.array([r.clonotype_count for r in reps], dtype=int)

# Cohort-level diversity: unique clonotypes across all donors in aavj space.
cohort_unique_aavj = len({(c.junction_aa, c.v_gene, c.j_gene) for rep in reps for c in rep.clonotypes if c.junction_aa})

print(f'Loaded repertoires : {len(reps)} in {load_repertoires_s:.2f}s')
print(f'Loaded donors      : {meta_valid.select(pl.col("sample_id").n_unique()).item()}')
print(f'Loaded age range   : {min(ages)}–{max(ages)} years')
print(f'Clonotype count    : min={sizes.min():,} median={int(np.median(sizes)):,} max={sizes.max():,}')
print(f'Cohort unique aavj : {cohort_unique_aavj:,}')

if missing_rows:
    miss_df = pl.DataFrame(missing_rows)
    print(f'Missing files for metadata rows: {miss_df.height}')
    display(miss_df.head(10).to_pandas())
else:
    print('All metadata rows resolved to repertoire files.')

_record_stress('after_load_repertoires')

/Users/mikesh/vcs/mirpy/mir/common/sampling.py:69: UserWarning: downsample_count (100000) >= total duplicate count (50047); no downsampling performed
  warnings.warn(
/Users/mikesh/vcs/mirpy/mir/common/sampling.py:69: UserWarning: downsample_count (100000) >= total duplicate count (35571); no downsampling performed
  warnings.warn(
/Users/mikesh/vcs/mirpy/mir/common/sampling.py:69: UserWarning: downsample_count (100000) >= total duplicate count (78463); no downsampling performed
  warnings.warn(

Loaded repertoires : 79 in 96.25s
Loaded donors      : 79
Loaded age range   : 0–103 years
Clonotype count    : min=12,439 median=51,303 max=93,154
Cohort unique aavj : 3,683,755
All metadata rows resolved to repertoire files.

2 · Diversity and 300k-read subsampling#

[6]:

# Compute observed diversity and diversity after 50k-read subsampling (adjusted for 100k rep downsample)
DOWNSAMPLE_READS = 50_000

diversity_rows: list[dict[str, object]] = []
for i, (sid, age, rep) in enumerate(zip(sample_ids, ages, reps)):
    total_reads = int(sum(cl.duplicate_count for cl in rep.clonotypes))
    observed_diversity = int(rep.clonotype_count)

    if total_reads >= DOWNSAMPLE_READS:
        rep_ds = downsample(rep, DOWNSAMPLE_READS, random_seed=42 + i)
        diversity_50k = int(rep_ds.clonotype_count)
    else:
        diversity_50k = None

    diversity_rows.append({
        'sample_id': sid,
        'age': int(age),
        'total_reads': total_reads,
        'observed_diversity': observed_diversity,
        'diversity_50k': diversity_50k,
    })

diversity_df = pl.DataFrame(diversity_rows)
print(f'Samples with >= {DOWNSAMPLE_READS:,} reads: {diversity_df.filter(pl.col("diversity_50k").is_not_null()).height}/{diversity_df.height}')

diversity_df.sort('age').head(12).to_pandas()

Samples with >= 50,000 reads: 78/79

[6]:

	sample_id	age	total_reads	observed_diversity	diversity_50k
0	A5-S14	0	100000	82370	36065
1	A5-S15	0	100000	66539	32831
2	A5-S16	0	100000	88303	37232
3	A5-S17	0	100000	92095	37961
4	A5-S18	0	100000	93154	38139
5	A5-S11	0	100000	89060	37480
6	A5-S12	0	100000	90139	37524
7	A5-S13	0	100000	91912	37973
8	A2-i132	6	100000	85227	35730
9	A2-i131	9	100000	71013	31271
10	A2-i136	10	100000	65385	28878
11	A2-i129	11	100000	80930	34457

[7]:

# Diversity at 50k reads versus age
plot_df = diversity_df.filter(pl.col('diversity_50k').is_not_null())

fig, ax = plt.subplots(figsize=(7.0, 4.4), constrained_layout=True)
sc = ax.scatter(
    plot_df['age'].to_numpy(),
    plot_df['diversity_50k'].to_numpy(),
    c=plot_df['total_reads'].to_numpy(),
    cmap='viridis',
    s=60,
    alpha=0.85,
    edgecolors='white',
    linewidths=0.25,
)

x = plot_df['age'].to_numpy().astype(float)
y = plot_df['diversity_50k'].to_numpy().astype(float)
mask = np.isfinite(x) & np.isfinite(y)
slope, intercept, r, p, _ = stats.linregress(x[mask], y[mask])
xr = np.linspace(x[mask].min(), x[mask].max(), 200)
ax.plot(xr, slope * xr + intercept, color='#C1121F', lw=2.0)

ax.set_xlabel('Age (years)')
ax.set_ylabel('Unique clonotypes in 50k-read subsample')
ax.set_title('Subsampled Diversity vs Age (50k)')
ax.text(
    0.03,
    0.96,
    f'$r={r:.3f}$\n$p={p:.2e}$',
    transform=ax.transAxes,
    ha='left',
    va='top',
    fontsize=10,
    bbox=dict(facecolor='white', alpha=0.75, edgecolor='none', pad=2.5),
)
plt.colorbar(sc, ax=ax, label='Total reads in original sample')
plt.show()

../_images/notebooks_aging_analysis_9_0.png

3 · Pairwise overlap matrices and metric conversion#

[8]:

# Compute pairwise overlap matrices directly (no pilot estimation), with immediate progress logs.
N_JOBS_MATRIX = 4  # safe: 100k-downsampled reps keep qi_list small enough for 4 spawn workers
OVERLAP_SPACE = 'aavj'
TARGET_CELL_SECONDS = 15 * 60

results: dict[str, pl.DataFrame] = {}
pairwise_timing_rows: list[dict[str, object]] = []

for mode_label, metric, threshold in [
    ('exact', 'exact', 0),
    ('hamming:1', 'hamming', 1),
]:
    print(f"[{mode_label}] starting pairwise matrix...")
    t0 = time.perf_counter()
    df_pd = pairwise_overlap_matrix(
        reps,
        sample_ids=sample_ids,
        metric=metric,
        threshold=threshold,
        overlap_space=OVERLAP_SPACE,
        n_jobs=N_JOBS_MATRIX,
    )
    elapsed = time.perf_counter() - t0

    df = pl.from_pandas(df_pd)
    results[mode_label] = df

    try:
        import psutil
        n_jobs_effective = psutil.cpu_count(logical=False) or os.cpu_count() or 1
    except Exception:
        n_jobs_effective = os.cpu_count() or 1

    pairwise_timing_rows.append({
        'mode': mode_label,
        'n_pairs': df.height,
        'seconds': elapsed,
        'n_jobs_requested': N_JOBS_MATRIX,
        'n_jobs_effective': int(n_jobs_effective),
        'within_target': elapsed <= TARGET_CELL_SECONDS,
    })

    status = 'OK' if elapsed <= TARGET_CELL_SECONDS else 'OVER_TARGET'
    print(
        f"[{mode_label}] done pairs={df.height} n_jobs={int(n_jobs_effective)} "
        f"time={elapsed:.2f}s [{status}]"
    )

pairwise_timing = pl.DataFrame(pairwise_timing_rows)
EXECUTION_TIMES['pairwise_total_s'] = float(pairwise_timing.select(pl.col('seconds').sum()).item())

print('\nPairwise timing summary:')
display(pairwise_timing.to_pandas())
_record_stress('after_pairwise_matrix')

[exact] starting pairwise matrix...
[exact] done pairs=3081 n_jobs=16 time=27.01s [OK]
[hamming:1] starting pairwise matrix...
[hamming:1] done pairs=3081 n_jobs=16 time=536.21s [OK]

Pairwise timing summary:

	mode	n_pairs	seconds	n_jobs_requested	n_jobs_effective	within_target
0	exact	3081	27.010256	4	16	True
1	hamming:1	3081	536.206774	4	16	True

4 · Pool sample and per-sample overlap with pool#

[9]:

# Build pooled repertoire by concatenating clonotypes from all donors.
all_clones = []
for rep in reps:
    all_clones.extend(rep.clonotypes)

pool_rep = LocusRepertoire(clonotypes=all_clones, locus='TRB')
print(f'Pooled repertoire: {pool_rep.clonotype_count:,} clonotypes (sum of all samples)')

Pooled repertoire: 4,145,989 clonotypes (sum of all samples)

[10]:

# Compute donor-vs-pool overlap with the shared-worker many-vs-pool API.
POOL_N_JOBS = 4  # limit to 4 to avoid memory exhaustion
POOL_BATCH_SIZE = max(1, len(reps) // (POOL_N_JOBS * 2))
pool_results: dict[str, pl.DataFrame] = {}
pool_timing_rows: list[dict[str, object]] = []

t0_all = time.perf_counter()
for mode_label, metric, threshold in [
    ('exact', 'exact', 0),
    ('hamming:1', 'hamming', 1),
]:
    print(f"[{mode_label}] donor-vs-pool starting...")
    t0 = time.perf_counter()

    df = many_vs_pool_overlap(
        reps,
        pool_rep,
        sample_ids=sample_ids,
        ages=ages,
        metric=metric,
        threshold=threshold,
        overlap_space=OVERLAP_SPACE,
        n_jobs=POOL_N_JOBS,
        batch_size=POOL_BATCH_SIZE,
    )

    elapsed = time.perf_counter() - t0
    pool_results[mode_label] = df
    pool_timing_rows.append({'mode': mode_label, 'n_samples': len(df), 'seconds': elapsed})
    print(f"[{mode_label}] donor-vs-pool done in {elapsed:.2f}s")

pool_timing = pl.DataFrame(pool_timing_rows)
EXECUTION_TIMES['pool_total_s'] = time.perf_counter() - t0_all

print('\nPool overlap timing summary:')
display(pool_timing.to_pandas())
_record_stress('after_pool_overlap')

[exact] donor-vs-pool starting...
[exact] donor-vs-pool done in 29.15s
[hamming:1] donor-vs-pool starting...
[hamming:1] donor-vs-pool done in 136.79s

Pool overlap timing summary:

	mode	n_samples	seconds
0	exact	79	29.147094
1	hamming:1	79	136.785390

5 · F and D metrics vs absolute age difference#

[11]:

def _attach_pair_ages(df_pairs: pl.DataFrame, meta_df: pl.DataFrame) -> pl.DataFrame:
    """Attach age columns and absolute age difference to pairwise rows."""
    age_map = meta_df.select(['sample_id', 'age']).rename({'sample_id': 'sid'})

    out = (
        df_pairs
        .join(age_map.rename({'sid': 'sample_id_1', 'age': 'age1'}), on='sample_id_1', how='left')
        .join(age_map.rename({'sid': 'sample_id_2', 'age': 'age2'}), on='sample_id_2', how='left')
        .with_columns((pl.col('age1') - pl.col('age2')).abs().alias('age_diff'))
    )
    return out


def _outlier_threshold_iqr(values: np.ndarray, k: float = 3.0) -> float:
    q1 = np.nanpercentile(values, 25)
    q3 = np.nanpercentile(values, 75)
    iqr = q3 - q1
    return q3 + k * iqr

[12]:

# Pairwise similarity versus absolute age difference (exact mode),
# with distance-like metrics computed only for plotting.
df_exact_age = _attach_pair_ages(results['exact'], meta_valid)

fig, axes = plt.subplots(2, 2, figsize=(12, 7.2), sharex='col', constrained_layout=True)
metric_specs = [
    ('f_similarity', 'F similarity', lambda s: 1.0 - s),
    ('d_similarity', 'D similarity', lambda s: np.where(s > 0, 1.0 / s, np.nan)),
]

for col_i, (sim_col, label, to_metric) in enumerate(metric_specs):
    x = df_exact_age['age_diff'].to_numpy().astype(float)
    sim = df_exact_age[sim_col].to_numpy().astype(float)
    y = to_metric(sim)

    mask = np.isfinite(x) & np.isfinite(y)
    x = x[mask]
    y = y[mask]

    top_ax = axes[0, col_i]
    bot_ax = axes[1, col_i]

    out_cut = _outlier_threshold_iqr(y, k=3.0)
    out_mask = y > out_cut

    bot_ax.scatter(x[~out_mask], y[~out_mask], c=x[~out_mask], cmap='viridis', s=22, alpha=0.75, edgecolors='none')
    if out_mask.any():
        top_ax.scatter(x[out_mask], y[out_mask], c=x[out_mask], cmap='viridis', s=30, alpha=0.9, edgecolors='black', linewidths=0.25)
        top_ax.set_ylim(np.nanmin(y[out_mask]) * 0.98, np.nanmax(y[out_mask]) * 1.02)
    else:
        top_ax.scatter(x, y, c=x, cmap='viridis', s=22, alpha=0.75, edgecolors='none')

    bot_ax.set_ylim(np.nanmin(y[~out_mask]) * 0.98, np.nanmax(y[~out_mask]) * 1.02)

    slope, intercept, r, p, _ = stats.linregress(x, y)
    xr = np.linspace(np.nanmin(x), np.nanmax(x), 200)
    bot_ax.plot(xr, slope * xr + intercept, color='#C1121F', lw=2.0)

    top_ax.set_title(f'{label} metric')
    top_ax.text(
        0.03,
        0.92,
        f'outliers: {int(out_mask.sum())}',
        transform=top_ax.transAxes,
        ha='left',
        va='top',
        fontsize=9,
        bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=2),
    )
    bot_ax.text(
        0.03,
        0.96,
        f'$r={r:.3f}$\\n$p={p:.2e}$',
        transform=bot_ax.transAxes,
        ha='left',
        va='top',
        fontsize=9,
        bbox=dict(facecolor='white', alpha=0.75, edgecolor='none', pad=2.5),
    )

    top_ax.spines['bottom'].set_visible(False)
    bot_ax.spines['top'].set_visible(False)
    top_ax.tick_params(labelbottom=False)

    top_ax.set_ylabel('Metric')
    bot_ax.set_ylabel('Metric')
    bot_ax.set_xlabel('Absolute age difference (years)')

fig.suptitle('Pairwise Metrics vs Absolute Age Difference (Exact)', y=1.02, fontsize=13)
plt.show()

# Report strongest D-metric outliers for interpretation.
d_metric_exact = np.where(df_exact_age['d_similarity'].to_numpy().astype(float) > 0, 1.0 / df_exact_age['d_similarity'].to_numpy().astype(float), np.nan)
outlier_report = (
    df_exact_age
    .with_columns(pl.Series('d_metric', d_metric_exact))
    .with_columns((pl.col('d_metric') > _outlier_threshold_iqr(d_metric_exact[np.isfinite(d_metric_exact)], 3.0)).alias('d_outlier'))
    .filter(pl.col('d_outlier'))
    .select(['sample_id_1', 'sample_id_2', 'age1', 'age2', 'age_diff', 'd_similarity', 'f_similarity', 'd_metric'])
    .sort('d_metric', descending=True)
)
print(f'N pairs: {df_exact_age.height}')
if outlier_report.height:
    print('Top D-metric outliers (likely near-identical or technical replicates):')
    display(outlier_report.head(10).to_pandas())

../_images/notebooks_aging_analysis_17_0.png

N pairs: 3081
Top D-metric outliers (likely near-identical or technical replicates):

	sample_id_1	sample_id_2	age1	age2	age_diff	d_similarity	f_similarity	d_metric
0	A2-i138	A5-S1	74	103	29	0.001931	0.000694	517.802173
1	A4-i125	A5-S2	64	100	36	0.001938	0.000839	516.100178
2	A3-i151	A5-S1	87	103	16	0.001999	0.000853	500.248067
3	A3-i150	A5-S1	87	103	16	0.002036	0.000854	491.121421
4	A4-i125	A6-I210ob	64	92	28	0.002037	0.009138	491.013978

[13]:

# Donor-vs-pool similarity trends by age.
df_pool_exact = pool_results['exact']

fig, axes = plt.subplots(1, 2, figsize=(12, 4.8), constrained_layout=True)

for ax, (sim_col, label) in zip(axes, [
    ('f_similarity', 'F similarity (vs pool)'),
    ('d_similarity', 'D similarity (vs pool)'),
]):
    x = df_pool_exact['age'].to_numpy().astype(float)
    y = df_pool_exact[sim_col].to_numpy().astype(float)
    mask = np.isfinite(x) & np.isfinite(y)

    sc = ax.scatter(
        x[mask],
        y[mask],
        c=x[mask],
        cmap='viridis',
        s=58,
        alpha=0.85,
        edgecolors='white',
        linewidths=0.25,
        rasterized=True,
    )

    slope, intercept, r, p, _ = stats.linregress(x[mask], y[mask])
    xr = np.linspace(x[mask].min(), x[mask].max(), 200)
    ax.plot(xr, slope * xr + intercept, color='#C1121F', lw=2.0)

    ax.set_xlabel('Age (years)')
    ax.set_ylabel(label)
    ax.set_title(label)
    ax.text(
        0.03,
        0.96,
        f'$r={r:.3f}$\\n$p={p:.2e}$',
        transform=ax.transAxes,
        ha='left',
        va='top',
        fontsize=10,
        bbox=dict(facecolor='white', alpha=0.75, edgecolor='none', pad=2.5),
    )
    plt.colorbar(sc, ax=ax, label='Age')

fig.suptitle('Overlap With Cohort Pool Across Aging', y=1.02, fontsize=13)
plt.show()

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:3641, in Index.get_loc(self, key)
   3640 try:
-> 3641     return self._engine.get_loc(casted_key)
   3642 except KeyError as err:

File pandas/_libs/index.pyx:168, in pandas._libs.index.IndexEngine.get_loc()
--> 168 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/index.pyx:197, in pandas._libs.index.IndexEngine.get_loc()
--> 197 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/hashtable_class_helper.pxi:7668, in pandas._libs.hashtable.PyObjectHashTable.get_item()
-> 7668 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/hashtable_class_helper.pxi:7676, in pandas._libs.hashtable.PyObjectHashTable.get_item()
-> 7676 'Could not get source, probably due dynamically evaluated source code.'

KeyError: 'f_similarity'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[13], line 11
      7     ('f_similarity', 'F similarity (vs pool)'),
      8     ('d_similarity', 'D similarity (vs pool)'),
      9 ]):
     10     x = df_pool_exact['age'].to_numpy().astype(float)
---> 11     y = df_pool_exact[sim_col].to_numpy().astype(float)
     12     mask = np.isfinite(x) & np.isfinite(y)
     13
     14     sc = ax.scatter(

File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/frame.py:4378, in DataFrame.__getitem__(self, key)
   4374
   4375         if is_single_key:
   4376             if self.columns.nlevels > 1:
   4377                 return self._getitem_multilevel(key)
-> 4378             indexer = self.columns.get_loc(key)
   4379             if is_integer(indexer):
   4380                 indexer = [indexer]
   4381         else:

File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:3648, in Index.get_loc(self, key)
   3643     if isinstance(casted_key, slice) or (
   3644         isinstance(casted_key, abc.Iterable)
   3645         and any(isinstance(x, slice) for x in casted_key)
   3646     ):
   3647         raise InvalidIndexError(key) from err
-> 3648     raise KeyError(key) from err
   3649 except TypeError:
   3650     # If we have a listlike key, _check_indexing_error will raise
   3651     #  InvalidIndexError. Otherwise we fall through and re-raise
   3652     #  the TypeError.
   3653     self._check_indexing_error(key)

KeyError: 'f_similarity'

../_images/notebooks_aging_analysis_18_1.png

6 · UMAP and MDS from pairwise distance metrics#

[14]:

def _to_symmetric_matrix(df_pairs: pl.DataFrame, col: str, ids: list[str]) -> np.ndarray:
    """Convert long-format pairwise table to symmetric NxN matrix."""
    n = len(ids)
    id_to_idx = {sid: i for i, sid in enumerate(ids)}
    mat = np.zeros((n, n), dtype=float)

    for row in df_pairs.select(['sample_id_1', 'sample_id_2', col]).iter_rows(named=True):
        i = id_to_idx[row['sample_id_1']]
        j = id_to_idx[row['sample_id_2']]
        v = float(row[col])
        mat[i, j] = v
        mat[j, i] = v
    return mat


def _f_metric_from_similarity(f_similarity: np.ndarray) -> np.ndarray:
    return 1.0 - f_similarity


def _d_metric_from_similarity(d_similarity: np.ndarray) -> np.ndarray:
    with np.errstate(divide='ignore', invalid='ignore'):
        d_metric = np.where(d_similarity > 0, 1.0 / d_similarity, np.nan)
    if np.isfinite(d_metric).any():
        max_finite = np.nanmax(d_metric[np.isfinite(d_metric)])
        d_metric = np.nan_to_num(d_metric, nan=max_finite)
    else:
        d_metric = np.zeros_like(d_similarity)
    return d_metric

[15]:

try:
    from umap import UMAP
    HAS_UMAP = True
except ImportError:
    HAS_UMAP = False
    print('umap-learn not installed — skipping UMAP. Install with: pip install umap-learn')

[16]:

if HAS_UMAP:
    # Build exact-mode distance matrices for embedding.
    f_similarity_exact = _to_symmetric_matrix(results['exact'], 'f_similarity', sample_ids)
    d_similarity_exact = _to_symmetric_matrix(results['exact'], 'd_similarity', sample_ids)

    f_metric_exact = _f_metric_from_similarity(f_similarity_exact)
    d_metric_exact = _d_metric_from_similarity(d_similarity_exact)

    np.fill_diagonal(f_metric_exact, 0.0)
    np.fill_diagonal(d_metric_exact, 0.0)

    umap_f = UMAP(
        n_components=2,
        metric='precomputed',
        n_neighbors=min(10, len(reps) - 1),
        min_dist=0.08,
        random_state=42,
    ).fit_transform(f_metric_exact)

    mds_f = MDS(
        n_components=2,
        dissimilarity='precomputed',
        normalized_stress='auto',
        random_state=42,
        n_init=4,
        max_iter=400,
    ).fit_transform(f_metric_exact)

    umap_d = UMAP(
        n_components=2,
        metric='precomputed',
        n_neighbors=min(10, len(reps) - 1),
        min_dist=0.08,
        random_state=42,
    ).fit_transform(d_metric_exact)

    mds_d = MDS(
        n_components=2,
        dissimilarity='precomputed',
        normalized_stress='auto',
        random_state=42,
        n_init=4,
        max_iter=400,
    ).fit_transform(d_metric_exact)

    EMBEDDINGS = {
        'UMAP (F metric)': umap_f,
        'MDS (F metric)': mds_f,
        'UMAP (D metric)': umap_d,
        'MDS (D metric)': mds_d,
    }
    print('Computed embeddings:', ', '.join(EMBEDDINGS.keys()))

/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/umap/umap_.py:1865: UserWarning: using precomputed metric; inverse_transform will be unavailable
  warn("using precomputed metric; inverse_transform will be unavailable")
/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.

Computed embeddings: UMAP (F metric), MDS (F metric), UMAP (D metric), MDS (D metric)

/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/umap/umap_.py:1865: UserWarning: using precomputed metric; inverse_transform will be unavailable
  warn("using precomputed metric; inverse_transform will be unavailable")
/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(

[17]:

if HAS_UMAP:
    fig, axes = plt.subplots(2, 2, figsize=(13.5, 10.2), constrained_layout=True)

    for ax, (title, emb) in zip(axes.ravel(), EMBEDDINGS.items()):
        sc = ax.scatter(
            emb[:, 0],
            emb[:, 1],
            c=np.array(ages, dtype=float),
            cmap='viridis',
            s=64,
            alpha=0.9,
            linewidths=0.25,
            edgecolors='white',
        )
        ax.set_xlabel('Component 1')
        ax.set_ylabel('Component 2')
        ax.set_title(title)
        plt.colorbar(sc, ax=ax, label='Age (years)')

    fig.suptitle('Aging Structure in Distance Embeddings', y=1.01, fontsize=13)
    plt.show()

../_images/notebooks_aging_analysis_23_0.png

6 · Compare exact vs Hamming:1#

[18]:

# Compare F and D metric distributions across overlap modes with fixed binning.

f_metric_by_mode = {
    mode_label: (1.0 - df['f_similarity']).drop_nulls().to_numpy()
    for mode_label, df in results.items()
}
d_metric_by_mode = {
    mode_label: pl.Series(
        np.where(df['d_similarity'].to_numpy() > 0, 1.0 / df['d_similarity'].to_numpy(), np.nan)
    ).drop_nulls().to_numpy()
    for mode_label, df in results.items()
}

def _fixed_edges(series_by_mode: dict[str, np.ndarray], n_bins: int = 36) -> np.ndarray:
    all_vals = np.concatenate([v for v in series_by_mode.values() if len(v) > 0])
    lo = float(np.nanmin(all_vals))
    hi = float(np.nanmax(all_vals))
    if hi <= lo:
        hi = lo + 1e-6
    return np.linspace(lo, hi, n_bins + 1)

f_edges = _fixed_edges(f_metric_by_mode, n_bins=36)
d_edges = _fixed_edges(d_metric_by_mode, n_bins=36)

fig, axes = plt.subplots(1, 2, figsize=(12.4, 4.4), constrained_layout=True)

for mode_label in ('exact', 'hamming:1'):
    axes[0].hist(
        f_metric_by_mode[mode_label],
        bins=f_edges,
        alpha=0.45,
        label=mode_label,
        color=PALETTE[mode_label],
        density=True,
        edgecolor='white',
        linewidth=0.35,
    )
    axes[1].hist(
        d_metric_by_mode[mode_label],
        bins=d_edges,
        alpha=0.45,
        label=mode_label,
        color=PALETTE[mode_label],
        density=True,
        edgecolor='white',
        linewidth=0.35,
    )

axes[0].set_xlabel('F metric (1 − F similarity)')
axes[0].set_ylabel('Density')
axes[0].set_title('F Metric Distribution')

axes[1].set_xlabel('D metric (1 / D similarity)')
axes[1].set_ylabel('Density')
axes[1].set_title('D Metric Distribution')

axes[0].legend(frameon=False, ncol=1, loc='upper right')
axes[1].legend(frameon=False, ncol=1, loc='upper right')

plt.show()

../_images/notebooks_aging_analysis_25_0.png

[19]:

# Similarity summary table (reported, not plotted): F, D, F2, Jaccard,
# Szymkiewicz-Simpson, Morisita-Horn.
summary_rows = []
for mode_label, df in results.items():
    for metric_col in (
        'f_similarity',
        'd_similarity',
        'f2_similarity',
        'jaccard',
        'szymkiewicz_simpson',
        'morisita_horn',
    ):
        vals = df.select(pl.col(metric_col)).drop_nulls().to_series()
        if vals.len() == 0:
            continue
        summary_rows.append({
            'mode': mode_label,
            'metric': metric_col,
            'mean': float(vals.mean()),
            'median': float(vals.median()),
            'std': float(vals.std()),
            'max': float(vals.max()),
        })

summary_df = pl.DataFrame(summary_rows).sort(['mode', 'metric'])
display(summary_df.to_pandas())

	mode	metric	mean	median	std	max
0	exact	d_similarity	0.006742	0.005746	0.007321	0.263944
1	exact	f2_similarity	0.008369	0.005099	0.024601	0.740629
2	exact	f_similarity	0.016933	0.007574	0.036414	0.756360
3	exact	jaccard	0.003259	0.002765	0.004036	0.152034
4	exact	morisita_horn	0.002958	0.000028	0.037584	0.998456
5	exact	szymkiewicz_simpson	0.008686	0.007572	0.007841	0.265452
6	hamming:1	d_similarity	0.081960	0.078491	0.024006	0.311358
7	hamming:1	f_similarity	0.095677	0.088380	0.047409	0.772454
8	hamming:1	jaccard	0.040991	0.038942	0.013729	0.184380
9	hamming:1	morisita_horn	0.010708	0.000313	0.071508	0.998890
10	hamming:1	szymkiewicz_simpson	0.097405	0.096444	0.024549	0.312361

7 · Age correlation summary#

[20]:

# Spearman correlation of similarity vs age for donor-vs-pool results.
print('\nPool-overlap: Spearman correlation with age')
print(f'{"Mode":<18} {"Similarity":<15} {"rho":>8} {"p-value":>12}')
print('-' * 60)

for mode_label, df in pool_results.items():
    for sim_col in ('f_similarity', 'd_similarity'):
        vals = df[sim_col].to_numpy().astype(float)
        age_vals = df['age'].to_numpy().astype(float)
        mask = np.isfinite(vals) & np.isfinite(age_vals)
        rho, p = stats.spearmanr(age_vals[mask], vals[mask])
        print(f'{mode_label:<18} {sim_col:<15} {rho:>8.3f} {p:>12.3e}')

_record_stress('final')

print('\nExecution timing (seconds):')
display(pd.DataFrame([EXECUTION_TIMES]).T.rename(columns={0: 'seconds'}))

stress_df = pd.DataFrame(STRESS_SNAPSHOTS)
if not stress_df.empty:
    print('\nStress diagnostics:')
    display(stress_df)
    print('Peak RSS (GB):', f"{stress_df['rss_gb'].max():.2f}")
    print('Min available RAM (GB):', f"{stress_df['avail_gb'].min():.2f}")
    print('Max process CPU% sample:', f"{stress_df['cpu_percent_proc'].max():.1f}")


Pool-overlap: Spearman correlation with age
Mode               Similarity           rho      p-value
------------------------------------------------------------

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:3641, in Index.get_loc(self, key)
   3640 try:
-> 3641     return self._engine.get_loc(casted_key)
   3642 except KeyError as err:

File pandas/_libs/index.pyx:168, in pandas._libs.index.IndexEngine.get_loc()
--> 168 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/index.pyx:197, in pandas._libs.index.IndexEngine.get_loc()
--> 197 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/hashtable_class_helper.pxi:7668, in pandas._libs.hashtable.PyObjectHashTable.get_item()
-> 7668 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/hashtable_class_helper.pxi:7676, in pandas._libs.hashtable.PyObjectHashTable.get_item()
-> 7676 'Could not get source, probably due dynamically evaluated source code.'

KeyError: 'f_similarity'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[20], line 8
      4 print('-' * 60)
      5
      6 for mode_label, df in pool_results.items():
      7     for sim_col in ('f_similarity', 'd_similarity'):
----> 8         vals = df[sim_col].to_numpy().astype(float)
      9         age_vals = df['age'].to_numpy().astype(float)
     10         mask = np.isfinite(vals) & np.isfinite(age_vals)
     11         rho, p = stats.spearmanr(age_vals[mask], vals[mask])

File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/frame.py:4378, in DataFrame.__getitem__(self, key)
   4374
   4375         if is_single_key:
   4376             if self.columns.nlevels > 1:
   4377                 return self._getitem_multilevel(key)
-> 4378             indexer = self.columns.get_loc(key)
   4379             if is_integer(indexer):
   4380                 indexer = [indexer]
   4381         else:

File ~/vcs/mirpy/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:3648, in Index.get_loc(self, key)
   3643     if isinstance(casted_key, slice) or (
   3644         isinstance(casted_key, abc.Iterable)
   3645         and any(isinstance(x, slice) for x in casted_key)
   3646     ):
   3647         raise InvalidIndexError(key) from err
-> 3648     raise KeyError(key) from err
   3649 except TypeError:
   3650     # If we have a listlike key, _check_indexing_error will raise
   3651     #  InvalidIndexError. Otherwise we fall through and re-raise
   3652     #  the TypeError.
   3653     self._check_indexing_error(key)

KeyError: 'f_similarity'