Diversity Analysis: MS vs Healthy Cohort#

This notebook demonstrates mirpy diversity analysis workflows with:

VDJtools-style diversity metrics
Hill diversity curves
Rarefaction and sample coverage curves
Cohort comparison (Healthy vs Multiple Sclerosis)

Dataset source: isalgo/airr_benchmark (vdjtools/metadata_ms.txt). Target visuals include a Fig. 2-inspired diversity summary, rarefaction/coverage curves with confidence intervals, and Chao1 boxplot significance annotation.

[1]:

# Set deterministic state and report environment versions for reproducibility
import importlib.metadata as _meta
import random
import sys

import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

print(f'Python {sys.version.split()[0]}')
for _pkg in ['mirpy-lib', 'numpy', 'pandas', 'polars', 'scipy', 'matplotlib', 'seaborn']:
    try:
        print(f'  {_pkg}: {_meta.version(_pkg)}')
    except _meta.PackageNotFoundError:
        pass
print(f'SEED={SEED}')

Python 3.12.12
  mirpy-lib: 1.1.0
  numpy: 1.26.4
  pandas: 3.0.3
  polars: 1.40.1
  scipy: 1.17.1
  matplotlib: 3.10.9
  seaborn: 0.13.2
SEED=42

[2]:

# Resolve repository root and import mirpy notebook asset helpers
from pathlib import Path

repo_root = Path.cwd().resolve().parent if Path.cwd().name == 'notebooks' else Path.cwd().resolve()

import sys as _sys
if str(repo_root) not in _sys.path:
    _sys.path.insert(0, str(repo_root))

from mir.utils.notebook_assets import ensure_airr_benchmark

/Users/mikesh/vcs/mirpy/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

[3]:

# Download the required VDJtools benchmark subset and locate metadata
dataset_root = ensure_airr_benchmark(
    repo_root,
    allow_patterns=[
        'vdjtools/**',
        'vdjtools_lite/**',
    ],
)

meta_candidates = [
    dataset_root / 'vdjtools' / 'metadata_ms.txt',
    dataset_root / 'vdjtools_lite' / 'metadata_ms.txt',
]

metadata_path = next((p for p in meta_candidates if p.exists()), None)
if metadata_path is None:
    raise FileNotFoundError('metadata_ms.txt not found under vdjtools/ or vdjtools_lite/.')

print('dataset_root:', dataset_root)
print('metadata_path:', metadata_path)

dataset_root: /Users/mikesh/vcs/mirpy/notebooks/assets/large/airr_benchmark
metadata_path: /Users/mikesh/vcs/mirpy/notebooks/assets/large/airr_benchmark/vdjtools/metadata_ms.txt

[4]:

# Load cohort metadata and normalize disease labels to Healthy/MS
import pandas as pd

meta = pd.read_csv(metadata_path, sep='\t')

file_col_candidates = ['file_name', 'file', 'filename', 'sample_file']
sample_col_candidates = ['sample_id', 'Run', 'run', 'sample', 'id']
group_col_candidates = [
    c for c in meta.columns if any(x in c.lower() for x in ['status', 'group', 'disease', 'condition', 'state'])
]

file_col = next((c for c in file_col_candidates if c in meta.columns), None)
sample_col = next((c for c in sample_col_candidates if c in meta.columns), None)
group_col = group_col_candidates[0] if group_col_candidates else None

if file_col is None or sample_col is None or group_col is None:
    raise ValueError(f'Could not infer metadata columns. Found columns: {list(meta.columns)}')

keep_cols = [sample_col, file_col, group_col]
for extra_col in ('age', 'sex'):
    if extra_col in meta.columns:
        keep_cols.append(extra_col)

meta = meta[keep_cols].copy()
rename_map = {sample_col: 'sample_id', file_col: 'file_name', group_col: 'raw_group'}
if 'age' in meta.columns:
    rename_map['age'] = 'age'
if 'sex' in meta.columns:
    rename_map['sex'] = 'sex'
meta = meta.rename(columns=rename_map)

# Re-label columns after the copy so the optional demographic fields stay available.
if 'age' in meta.columns:
    meta['age'] = pd.to_numeric(meta['age'], errors='coerce')
if 'sex' in meta.columns:
    meta['sex'] = meta['sex'].astype(str).str.strip()

def _normalize_group(value: str) -> str:
    s = str(value).strip().lower()
    if 'multiple' in s or s == 'ms' or 'ms' in s.replace('_', ' ').split():
        return 'MS'
    return 'Healthy'

meta['group'] = meta['raw_group'].map(_normalize_group)
meta = meta.sort_values(['group', 'sample_id', 'file_name']).reset_index(drop=True)
meta.head()

[4]:

	sample_id	file_name	raw_group	age	sex	group
0	C1	A2-i129.txt.gz	C	11.0	M	Healthy
1	C10	A4-i194.txt.gz	C	20.0	M	Healthy
2	C11	A4-i195.txt.gz	C	21.0	F	Healthy
3	C2	A2-i131.txt.gz	C	9.0	M	Healthy
4	C3	A2-i132.txt.gz	C	6.0	F	Healthy

[5]:

# Parse repertoires and compute per-sample diversity summaries and curves
from typing import Iterable

import polars as pl

from mir.common.parser import VDJtoolsParser
from mir.common.repertoire import LocusRepertoire

parser = VDJtoolsParser(sep='\t')

def _resolve_file(name: str) -> Path:
    p1 = dataset_root / 'vdjtools' / name
    p2 = dataset_root / 'vdjtools_lite' / name
    if p1.exists():
        return p1
    if p2.exists():
        return p2
    return dataset_root / name

def _adaptive_global_steps(min_n: int, max_n: int, n_steps: int = 14) -> list[int]:
    low = max(10, int(min_n))
    high = max(low, int(max_n))
    if low == high:
        return [low]
    raw = np.linspace(low, high, n_steps)
    return sorted({int(round(x)) for x in raw if int(round(x)) > 0})

sample_payloads: list[dict] = []
summary_rows: list[dict] = []
hill_rows: list[pl.DataFrame] = []
rare_rows: list[pl.DataFrame] = []

for row in meta.itertuples(index=False):
    sample_id = str(row.sample_id)
    group = str(row.group)
    age = float(getattr(row, 'age', np.nan)) if 'age' in meta.columns else np.nan
    file_path = _resolve_file(str(row.file_name))
    if not file_path.exists():
        continue

    try:
        clonotypes = parser.parse(str(file_path))
    except Exception:
        continue
    if not clonotypes:
        continue

    locus = str(getattr(clonotypes[0], 'locus', '') or 'TRB')
    rep = LocusRepertoire(clonotypes=clonotypes, locus=locus, repertoire_id=sample_id)

    summary = rep.diversity(count_field='duplicate_count')
    summary_rows.append(
        {
            'sample_id': sample_id,
            'group': group,
            'age': age,
            'locus': locus,
            **summary.to_dict(),
        }
    )

    hill_df = rep.hill_curve(count_field='duplicate_count')
    hill_rows.append(hill_df.with_columns(pl.lit(sample_id).alias('sample_id'), pl.lit(group).alias('group')))

    sample_payloads.append(
        {
            'sample_id': sample_id,
            'group': group,
            'rep': rep,
            'n': int(summary.abundance),
        }
    )

if sample_payloads:
    min_n = min(x['n'] for x in sample_payloads)
    max_n = max(x['n'] for x in sample_payloads)
    global_steps = _adaptive_global_steps(min_n, max_n, n_steps=14)
else:
    min_n, max_n, global_steps = 0, 0, []

for payload in sample_payloads:
    sample_id = payload['sample_id']
    group = payload['group']
    rep = payload['rep']
    n = payload['n']

    # Shared, evenly spaced steps across the cohort + exact sample depth.
    steps = sorted({m for m in global_steps if m > 0} | {n})
    rare_df = rep.rarefaction_curve(
        count_field='duplicate_count',
        m_steps=steps,
        include_exact=True,
    )
    rare_rows.append(rare_df.with_columns(pl.lit(sample_id).alias('sample_id'), pl.lit(group).alias('group')))

summary_df = pd.DataFrame(summary_rows)
if summary_df.empty:
    raise RuntimeError('No samples were parsed. Check metadata file_name values and parser compatibility.')
summary_df = summary_df.sort_values(['group', 'sample_id']).reset_index(drop=True)
hill_df = pl.concat(hill_rows) if hill_rows else pl.DataFrame()
rare_df = pl.concat(rare_rows) if rare_rows else pl.DataFrame()

print('Samples used:', len(summary_df))
print('Group counts:')
print(summary_df[['group']].value_counts())
print(f'Adaptive rarefaction step range: min_n={min_n}, max_n={max_n}, n_steps={len(global_steps)}')
if global_steps:
    print('First 5 global steps:', global_steps[:5])
    print('Last 5 global steps:', global_steps[-5:])
summary_df.head()

Samples used: 25
Group counts:
group
MS         14
Healthy    11
Name: count, dtype: int64
Adaptive rarefaction step range: min_n=156825, max_n=1260200, n_steps=14
First 5 global steps: [156825, 241700, 326575, 411450, 496325]
Last 5 global steps: [920700, 1005575, 1090450, 1175325, 1260200]

[5]:

	sample_id	group	age	locus	abundance	diversity	singletons	doubletons	expanded	hyperexpanded	chao1	gini_simpson	shannon
0	C1	Healthy	11.0	TRB	733696	540240	473315	47939	14	0	2.876771e+06	0.999833	12.623683
1	C10	Healthy	20.0	TRB	183043	125307	113865	7384	27	1	1.003108e+06	0.998167	10.735929
2	C11	Healthy	21.0	TRB	475026	337012	302848	25105	19	2	2.163599e+06	0.998536	11.718054
3	C2	Healthy	9.0	TRB	606357	381345	325027	35918	17	0	1.851907e+06	0.999775	11.972000
4	C3	Healthy	6.0	TRB	977264	763165	699291	42508	2	0	6.514973e+06	0.999983	13.142935

[6]:

# Render a polished donor summary table with per-group mean ± SD
from IPython.display import display

summary_table = summary_df[['sample_id', 'group', 'age', 'abundance', 'diversity', 'chao1', 'gini_simpson', 'shannon']].copy()
summary_table['age'] = summary_table['age'].round(1)
summary_table['abundance'] = summary_table['abundance'].map(lambda x: f'{int(x):,}')
summary_table['diversity'] = summary_table['diversity'].map(lambda x: f'{int(x):,}')
summary_table['chao1'] = summary_table['chao1'].map(lambda x: f'{x:,.0f}')
summary_table['gini_simpson'] = summary_table['gini_simpson'].map(lambda x: f'{x:.4f}')
summary_table['shannon'] = summary_table['shannon'].map(lambda x: f'{x:.2f}')

agg_rows = []
for group, group_df in summary_df.groupby('group', sort=True):
    agg_rows.append(
        {
            'sample_id': f'{group} mean ± sd',
            'group': group,
            'age': f"{group_df['age'].mean():.1f} ± {group_df['age'].std(ddof=1):.1f}",
            'abundance': f"{group_df['abundance'].mean():,.0f} ± {group_df['abundance'].std(ddof=1):,.0f}",
            'diversity': f"{group_df['diversity'].mean():,.0f} ± {group_df['diversity'].std(ddof=1):,.0f}",
            'chao1': f"{group_df['chao1'].mean():,.0f} ± {group_df['chao1'].std(ddof=1):,.0f}",
            'gini_simpson': f"{group_df['gini_simpson'].mean():.4f} ± {group_df['gini_simpson'].std(ddof=1):.4f}",
            'shannon': f"{group_df['shannon'].mean():.2f} ± {group_df['shannon'].std(ddof=1):.2f}",
        }
    )

summary_display = pd.concat([summary_table, pd.DataFrame(agg_rows)], ignore_index=True)
summary_display = summary_display.sort_values(['group', 'sample_id'], kind='stable').reset_index(drop=True)

styler = (
    summary_display.style
    .set_caption('Donor diversity summary: ages, abundance, and diversity with per-group mean ± SD')
    .hide(axis='index')
    .format(na_rep='')
    .set_properties(**{'border-color': '#d9d9d9', 'padding': '0.35rem 0.5rem'})
    .set_table_styles(
        [
            {'selector': 'caption', 'props': [('font-weight', '700'), ('font-size', '1.05rem'), ('text-align', 'left'), ('padding-bottom', '0.35rem')]},
            {'selector': 'th', 'props': [('background-color', '#f6f6f6'), ('font-weight', '700')]},
            {'selector': 'tr:nth-child(even)', 'props': [('background-color', '#fcfcfc')]},
        ]
    )
)

display(styler)
print('Table rows:', len(summary_display))
print('Per-group mean ± SD rows appended at bottom.')

Donor diversity summary: ages, abundance, and diversity with per-group mean ± SD
sample_id	group	age	abundance	diversity	chao1	gini_simpson	shannon
C1	Healthy	11.000000	733,696	540,240	2,876,771	0.9998	12.62
C10	Healthy	20.000000	183,043	125,307	1,003,108	0.9982	10.74
C11	Healthy	21.000000	475,026	337,012	2,163,599	0.9985	11.72
C2	Healthy	9.000000	606,357	381,345	1,851,907	0.9998	11.97
C3	Healthy	6.000000	977,264	763,165	6,514,973	1.0000	13.14
C4	Healthy	16.000000	768,339	564,360	3,386,699	0.9997	12.60
C5	Healthy	16.000000	754,191	553,121	3,821,300	0.9959	12.08
C6	Healthy	10.000000	714,268	385,691	1,391,268	0.9983	11.40
C7	Healthy	25.000000	1,260,200	570,480	1,514,045	0.9969	10.96
C8	Healthy	22.000000	718,357	384,882	1,139,362	0.9985	11.39
C9	Healthy	24.000000	1,252,793	721,262	2,021,072	0.9997	12.52
Healthy mean ± sd	Healthy	16.4 ± 6.6	767,594 ± 313,358	484,260 ± 183,411	2,516,737 ± 1,607,698	0.9987 ± 0.0013	11.92 ± 0.76
MS mean ± sd	MS	12.8 ± 2.9	401,930 ± 145,733	239,199 ± 111,607	1,130,573 ± 740,958	0.9979 ± 0.0040	11.17 ± 1.25
MS1	MS	12.000000	425,677	240,890	1,001,516	0.9971	10.81
MS10	MS	15.000000	672,153	507,713	1,929,221	1.0000	12.88
MS11	MS	14.000000	382,959	298,278	2,295,109	0.9995	12.00
MS12	MS	13.000000	354,380	178,465	364,851	0.9998	11.35
MS13	MS	14.000000	238,554	145,710	394,297	0.9997	11.30
MS2	MS		209,897	130,666	327,689	0.9998	11.25
MS3	MS	8.000000	333,035	247,923	2,160,768	0.9971	11.36
MS4	MS	14.000000	156,825	106,979	386,774	0.9999	11.19
MS5	MS	15.000000	395,758	219,336	780,708	0.9966	10.76
MS6	MS	15.000000	354,984	261,446	1,510,009	0.9999	12.03
MS7	MS		551,228	335,372	1,381,174	0.9991	11.51
MS8	MS	14.000000	508,701	349,422	1,883,991	0.9991	11.92
MS8_post	MS	14.000000	580,622	89,947	215,243	0.9846	7.35
MS9	MS	6.000000	462,240	236,637	1,196,674	0.9988	10.63

Table rows: 27
Per-group mean ± SD rows appended at bottom.

[7]:

# Build a cohort demographics summary with age range and sex counts
from IPython.display import display

if 'sex' not in meta.columns:
    raise RuntimeError("Expected a 'sex' column in the cohort metadata.")

meta_demographics = meta.copy()
meta_demographics['sex'] = meta_demographics['sex'].fillna('').astype(str).str.strip().str.upper()
meta_demographics['sex'] = meta_demographics['sex'].replace({'FEMALE': 'F', 'MALE': 'M'})
meta_demographics['cohort_label'] = meta_demographics['group']
meta_demographics['note'] = ''
meta_demographics.loc[meta_demographics['sample_id'].astype(str).str.upper() == 'M14', 'note'] = 'post-HSCT'

cohort_rows = []
for group, group_df in meta_demographics.groupby('cohort_label', sort=True):
    age_series = pd.to_numeric(group_df['age'], errors='coerce') if 'age' in group_df.columns else pd.Series(dtype=float)
    sex_counts = group_df['sex'].value_counts(dropna=False)
    cohort_rows.append(
        {
            'Cohort': group,
            'Samples': len(group_df),
            'Age range': (
                f"{int(age_series.min())}–{int(age_series.max())}" if age_series.notna().any() else ''
            ),
            'Age mean ± sd': (
                f"{age_series.mean():.1f} ± {age_series.std(ddof=1):.1f}" if age_series.notna().sum() > 1 else ''
            ),
            'Sex F': int(sex_counts.get('F', 0)),
            'Sex M': int(sex_counts.get('M', 0)),
            'Sex other/NA': int(len(group_df) - int(sex_counts.get('F', 0)) - int(sex_counts.get('M', 0))),
            'Post-HSCT note': 'M14' if (group_df['sample_id'].astype(str).str.upper() == 'M14').any() else '',
        }
    )

cohort_df = pd.DataFrame(cohort_rows)
cohort_display = cohort_df.copy()

cohort_styler = (
    cohort_display.style
    .set_caption('Cohort demographics summary: age range and sex counts')
    .hide(axis='index')
    .set_properties(**{'border-color': '#d9d9d9', 'padding': '0.35rem 0.5rem'})
    .set_table_styles(
        [
            {'selector': 'caption', 'props': [('font-weight', '700'), ('font-size', '1.05rem'), ('text-align', 'left'), ('padding-bottom', '0.35rem')]},
            {'selector': 'th', 'props': [('background-color', '#f6f6f6'), ('font-weight', '700')]},
            {'selector': 'tr:nth-child(even)', 'props': [('background-color', '#fcfcfc')]},
        ]
    )
)

display(cohort_styler)
print('M14 is marked as a post-HSCT MS sample in the note column.')

Cohort demographics summary: age range and sex counts
Cohort	Samples	Age range	Age mean ± sd	Sex F	Sex M	Sex other/NA	Post-HSCT note
Healthy	11	6–25	16.4 ± 6.6	4	7	0
MS	14	6–15	12.8 ± 2.9	14	0	0

M14 is marked as a post-HSCT MS sample in the note column.

[8]:

# Build a Fig. 2-inspired multi-panel diversity summary figure
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Publication-style typography and panel aesthetics inspired by journal figure conventions.
mpl.rcParams.update(
    {
        'font.family': 'serif',
        'font.serif': ['STIX Two Text', 'Times New Roman', 'DejaVu Serif'],
        'mathtext.fontset': 'stix',
        'axes.linewidth': 1.0,
        'axes.titlesize': 13,
        'axes.labelsize': 11,
        'xtick.labelsize': 10,
        'ytick.labelsize': 10,
        'legend.fontsize': 10,
        'figure.dpi': 140,
        'savefig.dpi': 300,
    }
)

sns.set_theme(style='ticks', context='paper')
metrics = ['abundance', 'diversity', 'singletons', 'doubletons', 'chao1', 'gini_simpson', 'shannon']
palette = {'Healthy': '#1f9e89', 'MS': '#d95f02'}

fig, axes = plt.subplots(2, 4, figsize=(12.8, 6.8), constrained_layout=True)
axes = axes.ravel()

for i, metric in enumerate(metrics):
    ax = axes[i]
    sns.boxplot(
        data=summary_df,
        x='group',
        y=metric,
        hue='group',
        order=['Healthy', 'MS'],
        hue_order=['Healthy', 'MS'],
        palette=palette,
        ax=ax,
        width=0.54,
        linewidth=1.1,
        fliersize=0,
        legend=False,
    )
    sns.stripplot(
        data=summary_df,
        x='group',
        y=metric,
        order=['Healthy', 'MS'],
        color='black',
        alpha=0.32,
        size=2.8,
        jitter=0.14,
        ax=ax,
    )
    ax.set_title(metric.replace('_', ' ').title(), pad=5)
    ax.set_xlabel('')
    ax.grid(axis='y', color='#d9d9d9', linewidth=0.6, alpha=0.8)
    sns.despine(ax=ax, offset=3, trim=True)

axes[-1].axis('off')
fig.suptitle('Diversity Summary Across Healthy and MS Donors', y=1.02, fontsize=15, fontweight='bold')
plt.show()

../_images/notebooks_diversity_analysis_8_0.png

[9]:

# Plot rarefaction and sample coverage curves with confidence intervals by cohort
from matplotlib.lines import Line2D

if rare_df.height == 0:
    raise RuntimeError('No rarefaction results were generated.')

rare_pd = rare_df.to_pandas().sort_values(['group', 'sample_id', 'm'])
common_m = sorted(set(global_steps)) if 'global_steps' in globals() else sorted(rare_pd['m'].unique())

fig, axes = plt.subplots(1, 2, figsize=(12.8, 4.9), constrained_layout=True)

for group in ['Healthy', 'MS']:
    sub = rare_pd[rare_pd['group'] == group].copy()
    if sub.empty:
        continue
    color = palette[group]

    # Light donor trajectories on common steps only to keep curves smooth.
    for sid, ssub in sub.groupby('sample_id'):
        scommon = ssub[ssub['m'].isin(common_m)].sort_values('m')
        if scommon.empty:
            continue
        axes[0].plot(scommon['m'], scommon['s_est'], color=color, linewidth=0.8, alpha=0.12)
        axes[1].plot(scommon['m'], scommon['coverage'], color=color, linewidth=0.8, alpha=0.12)

        # Mark exact sample depth as a small point.
        sexact = ssub[ssub['m'] == ssub['n']]
        if not sexact.empty:
            axes[0].scatter(sexact['m'], sexact['s_est'], color=color, s=10, alpha=0.3, zorder=3)
            axes[1].scatter(sexact['m'], sexact['coverage'], color=color, s=10, alpha=0.3, zorder=3)

    ssub_common = sub[sub['m'].isin(common_m)]
    agg = ssub_common.groupby('m', as_index=False).agg(
        s_med=('s_est', 'median'),
        s_lwr=('s_est', lambda x: np.quantile(x, 0.1)),
        s_upr=('s_est', lambda x: np.quantile(x, 0.9)),
        coverage_med=('coverage', 'median'),
        coverage_lwr=('coverage', lambda x: np.quantile(x, 0.1)),
        coverage_upr=('coverage', lambda x: np.quantile(x, 0.9)),
    )

    axes[0].plot(agg['m'], agg['s_med'], color=color, linewidth=2.7)
    axes[0].fill_between(agg['m'], agg['s_lwr'], agg['s_upr'], color=color, alpha=0.24)

    axes[1].plot(agg['m'], agg['coverage_med'], color=color, linewidth=2.7)
    axes[1].fill_between(agg['m'], agg['coverage_lwr'], agg['coverage_upr'], color=color, alpha=0.24)

axes[0].set_title('Rarefaction (Expected Richness)', pad=7)
axes[0].set_xlabel('Sampling depth (m)')
axes[0].set_ylabel('Expected richness')
axes[0].grid(axis='both', color='#d9d9d9', linewidth=0.6, alpha=0.8)

axes[1].set_title('Sample Coverage', pad=7)
axes[1].set_xlabel('Sampling depth (m)')
axes[1].set_ylabel('Coverage')
axes[1].set_ylim(0, 1.03)
axes[1].grid(axis='both', color='#d9d9d9', linewidth=0.6, alpha=0.8)

legend = [
    Line2D([0], [0], color=palette['Healthy'], lw=3, label='Healthy'),
    Line2D([0], [0], color=palette['MS'], lw=3, label='MS'),
]
axes[0].legend(handles=legend, frameon=False, loc='best')
axes[1].legend(handles=legend, frameon=False, loc='best')

for ax in axes:
    sns.despine(ax=ax, offset=3, trim=True)

plt.show()

../_images/notebooks_diversity_analysis_9_0.png

[10]:

# Compare Chao1 distributions and annotate significance with p-value stars
from scipy.stats import mannwhitneyu

healthy = summary_df.loc[summary_df['group'] == 'Healthy', 'chao1'].to_numpy()
ms = summary_df.loc[summary_df['group'] == 'MS', 'chao1'].to_numpy()

if len(healthy) > 0 and len(ms) > 0:
    stat, p_value = mannwhitneyu(healthy, ms, alternative='two-sided')
else:
    stat, p_value = np.nan, np.nan

def p_to_star(p: float) -> str:
    if not np.isfinite(p):
        return 'n.s.'
    if p < 1e-4:
        return '****'
    if p < 1e-3:
        return '***'
    if p < 1e-2:
        return '**'
    if p < 0.05:
        return '*'
    return 'n.s.'

star = p_to_star(float(p_value))

fig, ax = plt.subplots(figsize=(7, 6), constrained_layout=True)
sns.boxplot(
    data=summary_df,
    x='group',
    y='chao1',
    hue='group',
    order=['Healthy', 'MS'],
    hue_order=['Healthy', 'MS'],
    palette=palette,
    ax=ax,
    width=0.6,
    legend=False,
)
sns.stripplot(data=summary_df, x='group', y='chao1', order=['Healthy', 'MS'], color='black', alpha=0.35, size=3, ax=ax)

ymax = float(summary_df['chao1'].max()) if len(summary_df) else 1.0
y = ymax * 1.08
ax.plot([0, 0, 1, 1], [y * 0.98, y, y, y * 0.98], color='black', linewidth=1.5)
ax.text(0.5, y * 1.01, f'{star}  (p={p_value:.3g})', ha='center', va='bottom')
ax.set_title('Chao1 Diversity Comparison')
ax.set_xlabel('')
ax.set_ylabel('Chao1')
ax.grid(axis='y', color='#d9d9d9', linewidth=0.6, alpha=0.8)
sns.despine(ax=ax, offset=3, trim=True)
plt.show()

print('Mann-Whitney U statistic:', stat)
print('p-value:', p_value)

../_images/notebooks_diversity_analysis_10_0.png

Mann-Whitney U statistic: 124.0
p-value: 0.01090784295903633

[11]:

# Plot Hill diversity profiles by cohort
if hill_df.height == 0:
    raise RuntimeError('No Hill curve results were generated.')

hill_pd = hill_df.to_pandas().sort_values(['group', 'sample_id', 'q'])

fig, ax = plt.subplots(figsize=(6.8, 5.0), constrained_layout=True)

for group in ['Healthy', 'MS']:
    sub = hill_pd[hill_pd['group'] == group].copy()
    if sub.empty:
        continue
    color = palette[group]

    # Light donor-level trajectories + bold cohort mean with quantile band.
    for sid, ssub in sub.groupby('sample_id'):
        ax.plot(ssub['q'], ssub['hill'], color=color, linewidth=0.8, alpha=0.14)

    agg = sub.groupby('q', as_index=False).agg(
        hill_mean=('hill', 'mean'),
        hill_lwr=('hill', lambda x: np.quantile(x, 0.1)),
        hill_upr=('hill', lambda x: np.quantile(x, 0.9)),
    )
    ax.plot(agg['q'], agg['hill_mean'], color=color, linewidth=2.7, label=group)
    ax.fill_between(agg['q'], agg['hill_lwr'], agg['hill_upr'], color=color, alpha=0.24)

ax.set_title('Hill Diversity Profiles', pad=7)
ax.set_xlabel('Order q')
ax.set_ylabel('Effective number of clonotypes')
ax.grid(axis='both', color='#d9d9d9', linewidth=0.6, alpha=0.8)
ax.legend(frameon=False, loc='best')
sns.despine(ax=ax, offset=3, trim=True)
plt.show()

../_images/notebooks_diversity_analysis_11_0.png

Notes#

The cohort labels are normalized to Healthy and MS from metadata text values.
The donor summary table includes age, abundance, diversity, Chao1, Gini-Simpson, and Shannon, with per-group mean ± SD rows.
A cohort-level demographics table above summarizes age range and sex counts for each cohort.
The source MS metadata includes a post-HSCT sample (M14); treat it as a special case when making cohort-level comparisons.
Diversity summaries use duplicate_count by default in this notebook.
For UMI-based diversity, pass count_field='umi_count' to diversity, hill_curve, and rarefaction_curve.
For paired/single-cell objects, mirpy defaults to count_field='barcode_count'.