Skip to content

Latest commit

 

History

History
1387 lines (1199 loc) · 36.4 KB

005_020_compare-nb-to-normal.md

File metadata and controls

1387 lines (1199 loc) · 36.4 KB

Comparing negative binomial and normal linear regression models

The goal of this notebook is to compare negative binomial and normal models of real CRISPR-screen data. Comparisons will be made on computational efficiency, MCMC diagnostics, model accuracy and fit, and posterior predictive checks.

Setup

%load_ext autoreload
%autoreload 2
import logging
import pprint
import re
import string
import warnings
from pathlib import Path
from time import time

import arviz as az
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotnine as gg
import pymc3 as pm
import seaborn as sns
from theano import tensor as tt
from src.analysis import pymc3_analysis as pmanal
from src.data_processing import achilles as achelp
from src.data_processing import common as dphelp
from src.data_processing import vectors as vhelp
from src.globals import get_pymc3_constants
from src.io import cache_io, data_io
from src.loggers import set_console_handler_level
from src.modeling import pymc3_helpers as pmhelp
from src.modeling import pymc3_sampling_api as pmapi
from src.plot.color_pal import SeabornColor
from src.plot.plotnine_helpers import set_gg_theme
notebook_tic = time()
warnings.simplefilter(action="ignore", category=UserWarning)
set_console_handler_level(logging.WARN)
set_gg_theme()
%config InlineBackend.figure_format = "retina"
PYMC3 = get_pymc3_constants
RANDOM_SEED = 212
np.random.seed(RANDOM_SEED)

Data

# Load the subsampled DepMap CRC Achilles dataset used throughout the notebook.
crc_modeling_data_path = data_io.data_path(data_io.DataFile.DEPMAP_CRC_SUBSAMPLE)
crc_modeling_data = achelp.read_achilles_data(
    crc_modeling_data_path, low_memory=False, set_categorical_cols=True
)
crc_modeling_data.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
sgrna replicate_id lfc p_dna_batch genome_alignment hugo_symbol screen multiple_hits_on_gene sgrna_target_chr sgrna_target_pos ... num_mutations any_deleterious any_tcga_hotspot any_cosmic_hotspot is_mutated copy_number lineage primary_or_metastasis is_male age
0 ATAACACTGCACCTTCCAAC LS513-311Cas9_RepA_p6_batch2 0.179367 2 chr2_157587191_- ACVR1C broad True 2 157587191 ... 0 NaN NaN NaN False 0.964254 colorectal primary True 63.0
1 ATAACACTGCACCTTCCAAC CL-11-311Cas9_RepB_p6_batch3 -0.139505 3 chr2_157587191_- ACVR1C broad False 2 157587191 ... 0 NaN NaN NaN False 1.004888 colorectal primary True NaN
2 ATAACACTGCACCTTCCAAC SW1463-311cas9 Rep A p5_batch2 -0.192216 2 chr2_157587191_- ACVR1C broad True 2 157587191 ... 0 NaN NaN NaN False 0.923384 colorectal primary False 66.0
3 ATAACACTGCACCTTCCAAC HT29-311Cas9_RepA_p6 AVANA_batch3 0.282499 3 chr2_157587191_- ACVR1C broad True 2 157587191 ... 0 NaN NaN NaN False 1.014253 colorectal primary False 44.0
4 ATAACACTGCACCTTCCAAC KM12-311Cas9 Rep A p5_batch3 0.253698 3 chr2_157587191_- ACVR1C broad True 2 157587191 ... 0 NaN NaN NaN False 1.048861 colorectal primary NaN NaN

5 rows × 24 columns

crc_modeling_data.columns.to_list()
['sgrna',
 'replicate_id',
 'lfc',
 'p_dna_batch',
 'genome_alignment',
 'hugo_symbol',
 'screen',
 'multiple_hits_on_gene',
 'sgrna_target_chr',
 'sgrna_target_pos',
 'depmap_id',
 'counts_final',
 'counts_initial',
 'rna_expr',
 'num_mutations',
 'any_deleterious',
 'any_tcga_hotspot',
 'any_cosmic_hotspot',
 'is_mutated',
 'copy_number',
 'lineage',
 'primary_or_metastasis',
 'is_male',
 'age']
# Clean the modeling data: drop rows with missing final counts, cast the count
# columns to int, z-scale copy number within each cell line, and append the
# total / adjusted read-count columns via the project helpers.
data = (
    crc_modeling_data[~crc_modeling_data.counts_final.isna()]
    .reset_index(drop=True)
    .pipe(achelp.set_achilles_categorical_columns)
    .astype({"counts_final": int, "counts_initial": int})
    # NOTE(review): this second reset_index is redundant after the first.
    .reset_index(drop=True)
    # `shuffle` is not a pandas DataFrame method -- presumably registered by a
    # pandas-flavor/project extension. TODO confirm where it is defined.
    .shuffle()
    .pipe(
        achelp.zscale_cna_by_group,
        cn_col="copy_number",
        new_col="copy_number_z",
        groupby_cols=["depmap_id"],
        cn_max=20,
    )
    .pipe(achelp.append_total_read_counts)
    .pipe(achelp.add_useful_read_count_columns)
)
data.shape
(2384, 29)
# Long-format table of raw and z-scaled copy number alongside LFC, for plotting.
# `pivot_longer` is presumably a pyjanitor-registered DataFrame method -- TODO
# confirm it is registered elsewhere in the project.
plot_df = (
    data[["depmap_id", "hugo_symbol", "copy_number", "copy_number_z", "lfc"]]
    .drop_duplicates()
    .pivot_longer(
        index=["depmap_id", "hugo_symbol", "lfc"], names_to="cn", values_to="value"
    )
)
# Histograms of raw vs. z-scaled copy number, colored by cell line.
(
    gg.ggplot(plot_df.drop(columns=["lfc"]).drop_duplicates(), gg.aes(x="value"))
    + gg.facet_wrap("~cn", nrow=1, scales="free")
    + gg.geom_histogram(gg.aes(fill="depmap_id"), position="identity", alpha=0.4)
    + gg.scale_x_continuous(expand=(0, 0))
    + gg.scale_y_continuous(expand=(0, 0, 0.02, 0))
    + gg.scale_fill_brewer(type="qual", palette="Dark2")
    + gg.theme(figure_size=(8, 3), panel_spacing_x=0.4)
    + gg.labs(x="copy number", y="count", fill="cell line")
)

png

<ggplot: (349013863)>
# LFC vs. copy number (raw and z-scaled) with per-cell-line linear fits.
(
    gg.ggplot(plot_df, gg.aes(x="value", y="lfc", color="depmap_id"))
    + gg.facet_wrap("~ cn", nrow=1, scales="free")
    + gg.geom_point(alpha=0.6)
    + gg.geom_smooth(formula="y~x", method="lm", linetype="--", size=0.5, alpha=0.1)
    + gg.scale_x_continuous(expand=(0.02, 0))
    + gg.scale_y_continuous(expand=(0.02, 0))
    + gg.scale_color_brewer(type="qual", palette="Dark2")
    + gg.theme(figure_size=(8, 4))
)

png

<ggplot: (349248925)>
# Final vs. (adjusted) initial read counts on log-log axes.
ax = sns.scatterplot(data=data, x="counts_initial_adj", y="counts_final")
ax.set_yscale("log", base=10)
ax.set_xscale("log", base=10)
plt.show()

png

Modeling

Fit models with hierarchical structure for gene and copy number effect per cell line.

For each model:

$$ \begin{aligned} \mu_{\beta_0} &\sim \text{N}(0, 2.5) \quad \sigma_{\beta_0} \sim \text{HN}(2.5) \\ \mu_{\beta_\text{CNA}} &\sim \text{N}(0, 2.5) \quad \sigma_{\beta_\text{CNA}} \sim \text{HN}(2.5) \\ \beta_0 &\sim_g \text{N}(\mu_{\beta_0}, \sigma_{\beta_0}) \\ \beta_\text{CNA} &\sim_c \text{N}(\mu_{\beta_\text{CNA}}, \sigma_{\beta_\text{CNA}}) \\ \end{aligned} $$

For the negative binomial:

$$ \begin{aligned} \eta &= \beta_0[g] + x_\text{CNA} \beta_\text{CNA}[c] \\ \mu &= \exp(\eta) \\ \alpha &\sim \text{HN}(5) \\ y &\sim \text{NB}(\mu x_\text{initial}, \alpha) \end{aligned} $$

For the normal model:

$$ \begin{aligned} \mu &= \beta_0[g] + x_\text{CNA} \beta_\text{CNA}[c] \\ \sigma &\sim \text{HN}(5) \\ y &\sim \text{N}(\mu, \sigma) \end{aligned} $$

# Integer index arrays mapping each observation to its gene and cell line,
# used for the hierarchical model levels below.
gene_idx, n_genes = dphelp.get_indices_and_count(data, "hugo_symbol")
print(f"number of genes: {n_genes}")

cell_line_idx, n_cells = dphelp.get_indices_and_count(data, "depmap_id")
print(f"number of cell lines: {n_cells}")
number of genes: 114
number of cell lines: 10

Build models

Negative Binomial model

# Standardized per-gene average of z-scaled copy number.
# NOTE(review): not referenced by the model below -- confirm it is still needed.
gene_copynumber_averages = vhelp.careful_zscore(
    data.groupby("hugo_symbol")["copy_number_z"].mean().values
)
# Hierarchical negative binomial model of final read counts.
with pm.Model() as nb_model:
    # Shared data containers (swappable for out-of-sample prediction).
    g = pm.Data("g", gene_idx)  # gene index per observation
    c = pm.Data("c", cell_line_idx)  # cell line index per observation
    x_cna = pm.Data("x_cna", data.copy_number_z.values)  # z-scaled copy number
    ct_i = pm.Data("initial_count", data.counts_initial_adj.values)
    ct_f = pm.Data("final_count", data.counts_final.values)

    # Non-centered (centered=False) hierarchical intercept per gene and
    # CNA slope per cell line, via the project helper.
    β_0 = pmhelp.hierarchical_normal(
        "β_0", shape=n_genes, centered=False, mu_sd=0.1, sigma_sd=0.1
    )
    β_cna = pmhelp.hierarchical_normal(
        "β_cna", shape=n_cells, centered=False, mu_sd=0.1, sigma_sd=0.1
    )
    η = pm.Deterministic("η", β_0[g] + β_cna[c] * x_cna)
    # exp(η) scaled by the initial counts: the initial count acts as an
    # exposure/offset term for the expected final count.
    μ = pm.Deterministic("μ", pm.math.exp(η) * ct_i)
    # NOTE(review): the model description above uses scale 5 for α's
    # half-normal prior but the code uses 50 -- confirm which is intended.
    α = pm.HalfNormal("α", 50)
    y = pm.NegativeBinomial("y", μ, α, observed=ct_f)

pm.model_to_graphviz(nb_model)

svg

Normal linear regression model

# Hierarchical normal linear regression directly on log-fold change.
with pm.Model() as lin_model:
    g = pm.Data("g", gene_idx)  # gene index per observation
    c = pm.Data("c", cell_line_idx)  # cell line index per observation
    x_cna = pm.Data("x_cna", data.copy_number_z.values)  # z-scaled copy number
    lfc = pm.Data("lfc", data.lfc.values)  # observed log-fold change

    # Non-centered hierarchical intercept per gene and CNA slope per cell
    # line; note the hyper-prior scales are wider than in the NB model.
    β_0 = pmhelp.hierarchical_normal(
        "β_0", shape=n_genes, centered=False, mu_sd=1.0, sigma_sd=1.0
    )
    β_cna = pmhelp.hierarchical_normal(
        "β_cna", shape=n_cells, centered=False, mu_sd=0.5, sigma_sd=1.0
    )
    μ = pm.Deterministic("μ", β_0[g] + β_cna[c] * x_cna)

    σ = pm.HalfNormal("σ", 1)

    y = pm.Normal("y", μ, σ, observed=lfc)

pm.model_to_graphviz(lin_model)

svg

Prior predictive checks

# Draw 500 prior-predictive samples from each model with a fixed seed.
prior_pred_kwargs = {"samples": 500, "random_seed": RANDOM_SEED}

with nb_model:
    nb_prior_pred = pm.sample_prior_predictive(**prior_pred_kwargs)


with lin_model:
    lin_prior_pred = pm.sample_prior_predictive(**prior_pred_kwargs)
def summarize_array(a: np.ndarray, name: str) -> pd.DataFrame:
    """Summarize the distribution of an array as a one-row data frame.

    The quartiles and mean are cast to int for compact display alongside
    count data.

    Args:
        a: Array of values to summarize.
        name: Row-index label for the resulting data frame.

    Returns:
        A one-row data frame with min/25%/mean/median/75%/max columns.
        (Fixed: the return annotation previously claimed ``np.ndarray``,
        but the function has always returned a ``pd.DataFrame``.)
    """
    return pd.DataFrame(
        {
            "min": a.min(),
            "25%": np.quantile(a, q=0.25).astype(int),
            "mean": a.mean().astype(int),
            "median": np.median(a),
            "75%": np.quantile(a, q=0.75).astype(int),
            "max": a.max(),
        },
        index=[name],
    )


def get_varnames_to_print(prior_pred: dict[str, np.ndarray]) -> list[str]:
    """Names of the scalar (1-D) prior-predictive variables.

    Auto-generated log-transformed variables (names containing "log__")
    and multi-dimensional variables are excluded.
    """
    return [
        name
        for name, draws in prior_pred.items()
        if draws.ndim == 1 and "log__" not in name
    ]


def plot_prior_pred_against_real(
    prior_pred_i: np.ndarray,
    real_values: pd.Series,
    i: int,
    binwidth: float,
    x_lab: str = "value",
) -> gg.ggplot:
    """Overlay one prior-predictive draw against the observed values.

    Args:
        prior_pred_i: A single prior-predictive draw (one value per
            observation, same length as `real_values`).
        real_values: Observed values for the same observations.
        i: Index of the draw, used only in the plot title.
        binwidth: Histogram bin width.
        x_lab: Label for the x-axis.

    Returns:
        The plotnine plot object (not yet rendered).
    """
    plot = (
        gg.ggplot(
            # `pivot_longer` presumably comes from pyjanitor -- TODO confirm.
            pd.DataFrame({"model": prior_pred_i, "observed": real_values}).pivot_longer(
                names_to="source"
            ),
            gg.aes(x="value"),
        )
        + gg.geom_histogram(
            gg.aes(fill="source"), alpha=0.75, binwidth=binwidth, position="identity"
        )
        + gg.scale_x_continuous(expand=(0, 0))
        + gg.scale_y_continuous(expand=(0, 0, 0.02, 0))
        + gg.scale_fill_brewer(type="qual", palette="Set1")
        + gg.theme(
            figure_size=(8, 4),
            legend_title=gg.element_blank(),
            legend_position=(0.8, 0.3),
            legend_background=gg.element_blank(),
        )
        + gg.labs(
            x=x_lab,
            y="count",
            title=f"Prior predictive distribution (#{i})",
        )
    )
    return plot
def prior_pred_analysis(
    prior_pred: dict[str, np.ndarray],
    real_values: pd.Series,
    x_lab: str,
    num_examples: int = 5,
    hist_binwidth: float = 1.0,
) -> None:
    """Compare prior-predictive draws against the observed data.

    Prints a table comparing the range of predicted vs. real values, the
    fold difference of the maxima, and then plots `num_examples` randomly
    chosen draws together with the scalar parameter values that produced
    them.

    Args:
        prior_pred: Prior-predictive samples keyed by variable name; "y" is
            indexed as (draw, observation) below -- assumes the first axis
            is the draw axis. TODO confirm against the sampler output.
        real_values: Observed values to compare against.
        x_lab: x-axis label for the histograms.
        num_examples: Number of random draws to plot.
        hist_binwidth: Histogram bin width.
    """
    print("Comparison of range of predicted values")
    print(
        pd.concat(
            [
                summarize_array(prior_pred["y"], "model"),
                summarize_array(real_values.values, "real"),
            ]
        )
    )
    print()

    # How much larger is the most extreme prediction than the real maximum?
    fold_diff = prior_pred["y"].max() / real_values.max()
    print(f"Fold difference in maximum values: {fold_diff:0.2f}")

    # Plot a few random draws with the scalar parameters behind each one.
    for i in np.random.choice(
        np.arange(prior_pred["y"].shape[0]), size=num_examples, replace=False
    ):
        print(
            plot_prior_pred_against_real(
                prior_pred["y"][i, :],
                real_values,
                i=i,
                binwidth=hist_binwidth,
                x_lab=x_lab,
            )
        )
        params: dict[str, float] = {}
        for v in get_varnames_to_print(prior_pred):
            params[v] = np.round(prior_pred[v][i], 2)
        pprint.pprint(params)
# Prior predictive check of the NB model against observed final read counts.
prior_pred_analysis(
    nb_prior_pred,
    real_values=data.counts_final,
    x_lab="final read counts",
    hist_binwidth=100,
)
Comparison of range of predicted values
       min  25%  mean  median  75%     max
model    0  226   501   381.0  649  237913
real     0  225   579   417.0  743    9255

Fold difference in maximum values: 25.71

png

{'α': 116.98, 'μ_β_0': 0.17, 'μ_β_cna': -0.01, 'σ_β_0': 0.08, 'σ_β_cna': 0.06}

png

{'α': 20.2, 'μ_β_0': 0.06, 'μ_β_cna': 0.07, 'σ_β_0': 0.11, 'σ_β_cna': 0.12}

png

{'α': 39.92, 'μ_β_0': 0.04, 'μ_β_cna': 0.04, 'σ_β_0': 0.03, 'σ_β_cna': 0.05}

png

{'α': 19.26, 'μ_β_0': -0.05, 'μ_β_cna': -0.16, 'σ_β_0': 0.14, 'σ_β_cna': 0.11}

png

{'α': 14.81, 'μ_β_0': 0.03, 'μ_β_cna': 0.18, 'σ_β_0': 0.08, 'σ_β_cna': 0.2}
# Prior predictive check of the normal model against observed log-fold changes.
prior_pred_analysis(
    lin_prior_pred,
    real_values=data.lfc,
    x_lab="log-fold change",
    hist_binwidth=0.25,
)
Comparison of range of predicted values
             min  25%  mean    median  75%        max
model -80.960324   -1     0 -0.011173    1  55.902164
real   -4.318012    0     0 -0.037561    0   3.011199

Fold difference in maximum values: 18.56

png

{'μ_β_0': -0.6, 'μ_β_cna': 0.27, 'σ': 1.71, 'σ_β_0': 0.05, 'σ_β_cna': 1.3}

png

{'μ_β_0': -0.61, 'μ_β_cna': 0.43, 'σ': 0.77, 'σ_β_0': 0.15, 'σ_β_cna': 0.93}

png

{'μ_β_0': 1.01, 'μ_β_cna': -0.25, 'σ': 0.93, 'σ_β_0': 0.3, 'σ_β_cna': 1.89}

png

{'μ_β_0': -1.48, 'μ_β_cna': 0.13, 'σ': 0.17, 'σ_β_0': 1.85, 'σ_β_cna': 1.71}

png

{'μ_β_0': -0.17, 'μ_β_cna': -1.44, 'σ': 0.3, 'σ_β_0': 1.96, 'σ_β_cna': 1.56}

Sampling from the models

# Shared keyword arguments for `pm.sample` in both models.
pm_sample_kwargs = {
    "draws": 1000,
    "chains": 4,
    "tune": 2000,
    # BUG FIX: this previously read `349 + 1`, which gave every chain the
    # identical seed 350. Use the loop index so each chain is seeded
    # distinctly.
    "random_seed": [349 + i for i in range(4)],
    "target_accept": 0.95,
    "return_inferencedata": True,
}
pm_sample_ppc_kwargs = {"random_seed": 400}

I timed the sampling processes a few times and put together the following table to show how long each model takes to run inference:

model sampling (sec.) post. pred. (sec.) total (min.)
NB 59 64 2.23
normal 38 73 1.98
# Fit the NB model, then draw posterior-predictive samples and convert the
# predicted final counts to log-fold changes for later comparison.
tic = time()

with nb_model:
    nb_trace = pm.sample(**pm_sample_kwargs)  # , start={"α_log__": np.array(5)})
    ppc = pm.sample_posterior_predictive(nb_trace, **pm_sample_ppc_kwargs)
    # +1 pseudocount avoids log2 of zero predicted counts.
    ppc["lfc"] = np.log2((1.0 + ppc["y"]) / data["counts_initial_adj"].values)
    nb_trace.extend(az.from_pymc3(posterior_predictive=ppc))

toc = time()
print(f"sampling required {(toc - tic) / 60:.2f} minutes")
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [α, Δ_β_cna, σ_β_cna, μ_β_cna, Δ_β_0, σ_β_0, μ_β_0]
<style> /*Turns off some styling*/ progress { /*gets rid of default border in Firefox and Opera.*/ border: none; /*Needs to be in here for Safari polyfill so background images work as expected.*/ background-size: auto; } .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar { background: #F44336; } </style> 100.00% [12000/12000 03:23<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 1_000 draw iterations (8_000 + 4_000 draws total) took 222 seconds.
The number of effective samples is smaller than 25% for some parameters.
<style> /*Turns off some styling*/ progress { /*gets rid of default border in Firefox and Opera.*/ border: none; /*Needs to be in here for Safari polyfill so background images work as expected.*/ background-size: auto; } .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar { background: #F44336; } </style> 100.00% [4000/4000 01:17<00:00]
sampling required 5.32 minutes
# Fit the normal model and draw posterior-predictive samples (already on the
# log-fold-change scale, so no transformation is needed).
tic = time()

with lin_model:
    lin_trace = pm.sample(**pm_sample_kwargs)
    ppc = pm.sample_posterior_predictive(lin_trace, **pm_sample_ppc_kwargs)
    lin_trace.extend(az.from_pymc3(posterior_predictive=ppc))

toc = time()
print(f"sampling required {(toc - tic) / 60:.2f} minutes")
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [σ, Δ_β_cna, σ_β_cna, μ_β_cna, Δ_β_0, σ_β_0, μ_β_0]
<style> /*Turns off some styling*/ progress { /*gets rid of default border in Firefox and Opera.*/ border: none; /*Needs to be in here for Safari polyfill so background images work as expected.*/ background-size: auto; } .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar { background: #F44336; } </style> 100.00% [12000/12000 01:37<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 1_000 draw iterations (8_000 + 4_000 draws total) took 119 seconds.
The number of effective samples is smaller than 25% for some parameters.
<style> /*Turns off some styling*/ progress { /*gets rid of default border in Firefox and Opera.*/ border: none; /*Needs to be in here for Safari polyfill so background images work as expected.*/ background-size: auto; } .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar { background: #F44336; } </style> 100.00% [4000/4000 01:19<00:00]
sampling required 3.54 minutes

Model comparison

LOO-CV

We can compare the models using LOO-CV to see which is more robust to changes to individual data points. It looks like the normal model is far superior to the NB model. I am concerned that these are not comparable, though.

# Collect both fitted models for joint LOO-CV comparison.
model_collection: dict[str, az.InferenceData] = {
    "negative binomial": nb_trace,
    "normal": lin_trace,
}
# NOTE(review): the two likelihoods are on different outcome scales (counts
# vs. LFC), so the ELPD comparison may not be apples-to-apples.
model_comparison = az.compare(model_collection)
model_comparison
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
rank loo p_loo d_loo weight se dse warning loo_scale
normal 0 -2249.736256 103.83120 0.000000 1.0 42.739655 0.00000 False log
negative binomial 1 -16280.009812 76.46166 14030.273556 0.0 51.161340 48.77773 True log
az.plot_compare(model_comparison);

png

Also, the LOO probability integral transformation (PIT) predictive checks (see the ArviZ documentation) indicate that the normal model is a better fit. My concern is that this test is only valid for continuous data.

# LOO-PIT checks per model; as noted above, this check assumes a continuous
# outcome, which the NB model's counts are not.
for name, idata in model_collection.items():
    az.plot_loo_pit(idata, y="y")
    plt.title(name)
    plt.show()

png

png

Posterior distributions

# Parameters present in both models, plus the NB-specific dispersion α.
shared_varnames = ["β_0", "μ_β_0", "σ_β_0"]
# `+` on lists already returns a new list, so the previous `.copy()` was
# redundant and has been removed.
nb_varnames = shared_varnames + ["α"]
az.plot_trace(nb_trace, var_names=nb_varnames);

png

# Shared parameters plus the normal model's residual scale σ.
# (`+` already copies the list, so the previous `.copy()` was redundant.)
lin_varnames = shared_varnames + ["σ"]
az.plot_trace(lin_trace, var_names=lin_varnames);

png

def get_beta_0_summary(name: str, idata: az.InferenceData) -> pd.DataFrame:
    """Posterior summary (mean, sd, 89% HDI) of each gene's β_0.

    Reads the module-level `data` for gene names. Assumes `az.summary`
    returns one row per gene in the same order as the `hugo_symbol`
    categorical's categories -- TODO confirm.

    Args:
        name: Model label added as a column.
        idata: Fitted model's inference data.
    """
    return (
        az.summary(idata, var_names="β_0", kind="stats", hdi_prob=0.89)
        .assign(hugo_symbol=data.hugo_symbol.cat.categories, model=name)
        .reset_index(drop=False)
        .rename(columns={"index": "parameter"})
    )


# One row per gene per model.
beta_0_post = pd.concat([get_beta_0_summary(n, d) for n, d in model_collection.items()])
beta_0_post.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
parameter mean sd hdi_5.5% hdi_94.5% hugo_symbol model
0 β_0[0] -0.180 0.116 -0.373 0.002 ACVR1C negative binomial
1 β_0[1] 0.406 0.113 0.219 0.579 ADAMTS2 negative binomial
2 β_0[2] 0.171 0.106 0.014 0.353 ADPRHL1 negative binomial
3 β_0[3] 0.201 0.118 0.011 0.387 ALKBH8 negative binomial
4 β_0[4] -0.161 0.118 -0.364 0.015 APC negative binomial
# Gene-level β_0 posteriors (mean ± 89% CI) from both models, dodged side
# by side per gene.
pos = gg.position_dodge(width=0.5)

(
    gg.ggplot(beta_0_post, gg.aes(x="hugo_symbol", y="mean", color="model"))
    + gg.geom_hline(yintercept=0, linetype="-", size=0.3, color="grey")
    + gg.geom_linerange(
        gg.aes(ymin="hdi_5.5%", ymax="hdi_94.5%"), position=pos, size=0.6, alpha=0.4
    )
    + gg.geom_point(position=pos, size=0.8, alpha=0.75)
    + gg.scale_y_continuous(expand=(0.01, 0))
    + gg.scale_color_brewer(type="qual", palette="Dark2")
    + gg.theme(axis_text_x=gg.element_text(angle=90, size=7), figure_size=(10, 4.5))
    + gg.labs(x="gene", y="β_0 posterior (mean ± 89% CI)")
)

png

<ggplot: (353979935)>
# Hyper-parameter posterior distributions for each model.
for name, idata in model_collection.items():
    print(f"posterior distributions in {name} model")
    az.plot_posterior(idata, var_names=["μ_β_0", "σ_β_0"])
    plt.show()
    print("-" * 80)
posterior distributions in negative binomial model

png

--------------------------------------------------------------------------------
posterior distributions in normal model

png

--------------------------------------------------------------------------------

Posterior predictive checks

# Posterior predictive density overlays; use a log y-axis for the NB model
# since its outcome is on the count scale.
for name, idata in model_collection.items():
    ax = az.plot_ppc(idata, num_pp_samples=100, random_seed=RANDOM_SEED)
    ax.set_title(name)
    if "binomial" in name:
        ax.set_yscale("log", base=2)
    plt.show()

png

png

# Thin and down-sample the NB model's posterior-predictive LFC draws, then
# overlay their densities against the observed LFC distribution (dashed).
nb_lfc_ppc_sample, _ = pmanal.down_sample_ppc(
    pmhelp.thin_posterior(
        nb_trace["posterior_predictive"]["lfc"], thin_to=100
    ).values.squeeze(),
    n=100,
    axis=1,
)

nb_ppc_lfc_sample_df = pd.DataFrame(nb_lfc_ppc_sample.T).pivot_longer(
    names_to="ppc_idx", values_to="draw"
)

(
    gg.ggplot(nb_ppc_lfc_sample_df, gg.aes(x="draw"))
    + gg.geom_density(gg.aes(group="ppc_idx"), weight=0.2, size=0.1, color="C0")
    + gg.geom_density(gg.aes(x="lfc"), data=data, color="k", size=1, linetype="--")
    + gg.scale_x_continuous(expand=(0, 0))
    + gg.scale_y_continuous(expand=(0, 0, 0.02, 0))
    + gg.labs(
        x="log-fold change",
        y="density",
        title="PPC of NB model\ntransformed to log-fold changes",
    )
)

png

<ggplot: (354723762)>
def prep_ppc(name: str, idata: az.InferenceData, observed_y: str) -> pd.DataFrame:
    """Summarize posterior predictions and express them as log-fold changes.

    Args:
        name: Model label added to the output.
        idata: Inference data containing a "y" posterior-predictive variable.
        observed_y: Column of the module-level `data` the predictions target
            ("counts_final" for the NB model, "lfc" for the normal model).

    Returns:
        Per-observation prediction summary with LFC-scale predictions,
        interval bounds, and error columns.
    """
    df = pmanal.summarize_posterior_predictions(
        idata["posterior_predictive"]["y"].values.squeeze(),
        merge_with=data[["hugo_symbol", "lfc", "counts_final", "counts_initial_adj"]],
        calc_error=True,
        observed_y=observed_y,
    ).assign(
        model=name,
        # NOTE(review): divides by the observed value, so this yields
        # inf/NaN wherever the observed count is 0 -- confirm downstream
        # handling.
        percent_error=lambda d: 100 * (d[observed_y] - d.pred_mean) / d[observed_y],
        real_value=lambda d: d[observed_y],
    )

    if observed_y == "counts_final":
        # Convert count-scale predictions to LFC.
        # NOTE(review): no +1 pseudocount here, unlike the earlier NB PPC
        # transformation -- confirm this difference is intentional.
        df["pred_lfc"] = np.log2(df.pred_mean / df.counts_initial_adj)
        df["pred_lfc_low"] = np.log2(df.pred_hdi_low / df.counts_initial_adj)
        df["pred_lfc_high"] = np.log2(df.pred_hdi_high / df.counts_initial_adj)
    else:
        # The normal model already predicts on the LFC scale.
        df["pred_lfc"] = df.pred_mean
        df["pred_lfc_low"] = df.pred_hdi_low
        df["pred_lfc_high"] = df.pred_hdi_high

    df["lfc_error"] = df["lfc"] - df["pred_lfc"]

    return df


# Pair each model with its observed-outcome column; relies on the dict's
# insertion order ("negative binomial" -> counts, "normal" -> lfc).
ppc_df = pd.concat(
    [
        prep_ppc(m[0], m[1], y)
        for m, y in zip(model_collection.items(), ("counts_final", "lfc"))
    ]
)
ppc_df.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
pred_mean pred_hdi_low pred_hdi_high hugo_symbol lfc counts_final counts_initial_adj error model percent_error real_value pred_lfc pred_lfc_low pred_lfc_high lfc_error
0 461.45850 57.0 814.0 SAMD8 -0.807107 220 345.041669 -241.45850 negative binomial -109.753864 220.0 0.419430 -2.597737 1.238258 -1.226538
1 548.05375 62.0 963.0 NKAPL 0.363149 589 383.379632 40.94625 negative binomial 6.951825 589.0 0.515544 -2.628434 1.328762 -0.152395
2 213.53300 23.0 374.0 SLC27A2 -0.118512 221 191.689816 7.46700 negative binomial 3.378733 221.0 0.155685 -3.059068 0.964265 -0.274197
3 226.34275 28.0 403.0 SEC23B -0.643089 128 191.689816 -98.34275 negative binomial -76.830273 128.0 0.239735 -2.775275 1.072006 -0.882824
4 308.46625 39.0 559.0 BRAF -0.033767 355 306.703706 46.53375 negative binomial 13.108099 355.0 0.008267 -2.975300 0.866003 -0.042034
# Distribution of LFC prediction errors for both models.
(
    gg.ggplot(ppc_df, gg.aes(x="lfc_error"))
    + gg.geom_density(gg.aes(color="model", fill="model"), alpha=0.1)
    + gg.geom_vline(gg.aes(xintercept=0), alpha=0.6, size=0.3)
    + gg.scale_x_continuous(expand=(0, 0))
    + gg.scale_y_continuous(expand=(0, 0, 0.02, 0))
    + gg.scale_color_brewer(type="qual", palette="Set2")
    + gg.scale_fill_brewer(type="qual", palette="Set2")
    + gg.theme(figure_size=(5, 4))
    + gg.labs(x="log-fold change prediction error (obs. - pred.)")
)

png

<ggplot: (354248081)>
# Observed vs. predicted LFC (mean ± 89% CI), one panel per model, with the
# identity line in red.
(
    gg.ggplot(ppc_df, gg.aes(x="lfc", y="pred_lfc"))
    + gg.facet_wrap("~ model", nrow=1)
    + gg.geom_linerange(
        gg.aes(ymin="pred_lfc_low", ymax="pred_lfc_high"),
        alpha=0.1,
        size=0.5,
        color=SeabornColor.BLUE.value,
    )
    + gg.geom_point(alpha=0.5, color=SeabornColor.BLUE.value)
    + gg.geom_abline(slope=1, intercept=0, color=SeabornColor.RED.value)
    + gg.scale_x_continuous(expand=(0.02, 0, 0.02, 0))
    + gg.scale_y_continuous(expand=(0.02, 0, 0.02, 0))
    + gg.theme(figure_size=(8, 4), panel_spacing_x=0.5)
    + gg.labs(
        x="observed log-fold change", y="predicted log-fold change\n(mean ± 89% CI)"
    )
)

png

<ggplot: (356103437)>
# Prediction error as a function of observed LFC, one panel per model.
(
    gg.ggplot(ppc_df, gg.aes(x="lfc", y="lfc_error"))
    + gg.facet_wrap("~ model", nrow=1)
    + gg.geom_point(alpha=0.5, color=SeabornColor.BLUE.value)
    + gg.geom_hline(yintercept=0, linetype="--")
    + gg.geom_vline(xintercept=0, linetype="--")
    + gg.theme(figure_size=(8, 4), panel_spacing_x=0.5)
    + gg.labs(x="log-fold change", y="prediction error")
)

png

<ggplot: (355967741)>
# Direct comparison of the two models' LFC predictions, colored by gene.
# The predictions are widened to one column per model and merged back onto
# `data` by row index -- assumes row order is preserved; TODO confirm.
(
    gg.ggplot(
        ppc_df[["model", "pred_lfc", "hugo_symbol"]]
        .pivot_wider(names_from="model", values_from="pred_lfc")
        .merge(data, left_index=True, right_index=True),
        gg.aes(x="normal", y="negative binomial"),
    )
    + gg.geom_point(
        gg.aes(color="hugo_symbol"), size=1.2, alpha=0.25, show_legend=False
    )
    + gg.geom_abline(slope=1, intercept=0, linetype="--", alpha=0.5)
    + gg.geom_smooth(
        method="lm",
        formula="y~x",
        alpha=0.6,
        linetype="--",
        size=0.6,
        color=SeabornColor.BLUE.value,
    )
    + gg.geom_hline(yintercept=0, alpha=0.5, color="grey")
    + gg.geom_vline(xintercept=0, alpha=0.5, color="grey")
    + gg.scale_color_hue()
    + gg.labs(
        x="post. pred. by normal model",
        y="post.pred by NB model",
        title="Comparison of posterior predictions",
    )
)

png

<ggplot: (356700865)>
# Pointwise LOO values per observation (sorted by gene), one plot per model.
loo_collection = {n: az.loo(d, pointwise=True) for n, d in model_collection.items()}
for name, loo_res in loo_collection.items():
    d = (
        pd.DataFrame({"hugo_symbol": data["hugo_symbol"], "loo": loo_res.loo_i})
        .sort_values("hugo_symbol")
        .reset_index(drop=False)
    )
    p = (
        gg.ggplot(d, gg.aes(x="index", y="loo"))
        + gg.geom_point(gg.aes(color="hugo_symbol"), show_legend=False)
        + gg.scale_color_hue()
        # NOTE(review): the model `name` is never shown on the plot --
        # consider adding it as a title.
        + gg.labs(x="index", y="LOO")
    )
    print(p)

png

png


# Report total notebook runtime and print the environment watermark.
notebook_toc = time()
print(f"execution time: {(notebook_toc - notebook_tic) / 60:.2f} minutes")
%load_ext watermark
%watermark -d -u -v -iv -b -h -m
The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Last updated: 2021-10-06

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 7.26.0

Compiler    : Clang 11.1.0
OS          : Darwin
Release     : 20.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 4
Architecture: 64bit

Hostname: JHCookMac.local

Git branch: nb-model-2

pandas    : 1.3.2
theano    : 1.0.5
plotnine  : 0.8.0
logging   : 0.5.1.2
numpy     : 1.21.2
seaborn   : 0.11.2
matplotlib: 3.4.3
arviz     : 0.11.2
pymc3     : 3.11.2
re        : 2.2.1