diff --git a/afqinsight/parametric.py b/afqinsight/parametric.py new file mode 100644 index 0000000..92853ec --- /dev/null +++ b/afqinsight/parametric.py @@ -0,0 +1,167 @@ +"""Perform linear modeling at leach node along the tract.""" + +import numpy as np +import pandas as pd +import statsmodels.formula.api as smf + +from sklearn.impute import SimpleImputer +from statsmodels.api import OLS +from statsmodels.stats.multitest import multipletests + + +def node_wise_regression( + afq_dataset, + tract, + metric, + formula, + group="group", + lme=False, + rand_eff="subjectID", +): + """Model group differences using node-wise regression along the length of the tract. + + Returns a list of beta-weights, confidence intervals, p-values, and rejection criteria + based on multiple-comparison correction. + + Based on this example: https://github.com/yeatmanlab/AFQ-Insight/blob/main/examples/plot_als_comparison.py + + Parameters + ---------- + afq_dataset: AFQDataset + Loaded AFQDataset object + tract: str + String specifying the tract to model + + metric: str + String specifying which diffusion metric to use as an outcome + eg. 'fa' + + formula: str + An R-style formula + specifying the regression model to fit at each node. This can take the form + of either a linear mixed-effects model or OLS regression + + lme: Bool, default=False + Boolean specifying whether to fit a linear mixed-effects model + + rand_eff: str, default='subjectID' + String specifying the random effect grouping structure for linear mixed-effects + models. If using anything other than the default value, this column must be present + in the 'target_cols' of the AFQDataset object + + + Returns + ------- + tract_dict: dict + A dictionary with the following key-value pairs: + + {'tract': tract, + 'reference_coefs': coefs_default, + 'group_coefs': coefs_treat, + 'reference_CI': cis_default, + 'group_CI': cis_treat, + 'pvals': pvals, + 'reject_idx': reject_idx, + 'model_fits': fits} + + tract: str + The tract described by this dictionary + + reference_coefs: list of floats + A list of beta-weights representing the average diffusion metric for the + reference group on a diffusion metric at a given location along the tract + + group_coefs: list of floats + A list of beta-weights representing the average group effect metric for the + treatment group on a diffusion metric at a given location along the tract + + reference_CI: np.array of np.array + A numpy array containing a series of numpy arrays indicating the 95% confidence interval + around the estimated beta-weight of the reference category at a given location along the tract + + group_CI: np.array of np.array + A numpy array containing a series of numpy arrays indicating the 95% confidence interval + around the estimated beta-weight of the treatment effect at a given location along the tract + + pvals: list of floats + A list of p-values testing whether or not the beta-weight of the group effect is + different from 0 + + reject_idx: list of Booleans + A list of node indices where the null hypothesis is rejected after multiple-comparison + corrections + + model_fits: list of statsmodels objects + A list of the statsmodels object fit along the length of the nodes + + """ + X = SimpleImputer(strategy="median").fit_transform(afq_dataset.X) + afq_dataset.target_cols[0] = group + + tract_data = ( + pd.DataFrame(columns=afq_dataset.feature_names, data=X) + .filter(like=tract) + .filter(like=metric) + ) + + pvals = np.zeros(tract_data.shape[-1]) + coefs_default = np.zeros(tract_data.shape[-1]) + coefs_treat = np.zeros(tract_data.shape[-1]) + cis_default = np.zeros((tract_data.shape[-1], 2)) + cis_treat = np.zeros((tract_data.shape[-1], 2)) + fits = {} + + # Loop through each node and fit a model + for ii, column in enumerate(tract_data.columns): + + # fit linear mixed-effects model + if lme: + + this = pd.DataFrame(afq_dataset.y, columns=afq_dataset.target_cols) + this[metric] = tract_data[column] + + # if no random effect specified, use subjectID as random effect + if rand_eff == "subjectID": + this["subjectID"] = afq_dataset.subjects + + model = smf.mixedlm(formula, this, groups=rand_eff) + fit = model.fit() + fits[column] = fit + + # fit OLS model + else: + + _, _, _ = column + this = pd.DataFrame(afq_dataset.y, columns=afq_dataset.target_cols) + this[metric] = tract_data[column] + + model = OLS.from_formula(formula, this) + fit = model.fit() + fits[column] = fit + + # pull out coefficients, CIs, and p-values from our model + coefs_default[ii] = fit.params.filter(regex="Intercept", axis=0).iloc[0] + coefs_treat[ii] = fit.params.filter(regex=group, axis=0).iloc[0] + + cis_default[ii] = ( + fit.conf_int(alpha=0.05).filter(regex="Intercept", axis=0).values + ) + cis_treat[ii] = fit.conf_int(alpha=0.05).filter(regex=group, axis=0).values + pvals[ii] = fit.pvalues.filter(regex=group, axis=0).iloc[0] + + # Correct p-values for multiple comparisons + reject, pval_corrected, _, _ = multipletests(pvals, alpha=0.05, method="fdr_bh") + reject_idx = np.where(reject) + + tract_dict = { + "tract": tract, + "reference_coefs": coefs_default, + "group_coefs": coefs_treat, + "reference_CI": cis_default, + "group_CI": cis_treat, + "pvals": pvals, + "reject_idx": reject_idx, + "model_fits": fits, + } + + return tract_dict diff --git a/afqinsight/plot.py b/afqinsight/plot.py index aa746cd..468c85e 100644 --- a/afqinsight/plot.py +++ b/afqinsight/plot.py @@ -331,3 +331,48 @@ def plot_tract_profiles( figs[metric] = fig return figs + + +def plot_regression_profiles(model_dict, ax): + """Plot parametric tract profiles based on node-wise linear models. + + Parameters + ---------- + model_dict: dict + Dictionary returned by parametric.node_wise_regression containing information about + parametric model fits over the length of a tract + + ax: matplotlib.axes + A matplotlib axes for a subplot where the tract profiles will be plotted + + Returns + ------- + ax: matplotlib.axes + A matplotlib axes with plotted parametric tract profiles + + """ + # Add beta-weight for group to intercept (mean control) + combo_coefs = model_dict["reference_coefs"] + model_dict["group_coefs"] + + # Generate 95% CI around group mean by adding to control intercept + combo_conf_int = np.zeros(model_dict["reference_CI"].shape) + combo_conf_int[:, 0] = model_dict["reference_coefs"] + model_dict["group_CI"][:, 0] + combo_conf_int[:, 1] = model_dict["reference_coefs"] + model_dict["group_CI"][:, 1] + + # Plot regression based tract profiles + ax.plot(model_dict["reference_coefs"]) + ax.plot(combo_coefs) + ax.plot(model_dict["reject_idx"], np.zeros(len(model_dict["reject_idx"])), "k*") + + ax.fill_between( + range(100), + model_dict["reference_CI"][:, 0], + model_dict["reference_CI"][:, 1], + alpha=0.5, + ) # Adding fill_between + + ax.fill_between( + range(100), combo_conf_int[:, 0], combo_conf_int[:, 1], alpha=0.5 + ) # Adding fill_between + + return ax diff --git a/examples/plot_als_comparison.py b/examples/plot_als_comparison.py index 0223af7..3bd62b9 100644 --- a/examples/plot_als_comparison.py +++ b/examples/plot_als_comparison.py @@ -19,16 +19,12 @@ DOI: 10.1002/hbm.23412 """ + import matplotlib.pyplot as plt -import numpy as np -import pandas as pd from afqinsight import AFQDataset -from statsmodels.api import OLS -from statsmodels.stats.anova import anova_lm -from statsmodels.stats.multitest import multipletests - -from sklearn.impute import SimpleImputer +from afqinsight.parametric import node_wise_regression +from afqinsight.plot import plot_regression_profiles ############################################################################# # Fetch data from Sarica et al. @@ -42,78 +38,59 @@ afqdata = AFQDataset.from_study("sarica") -# Examine the data -# ---------------- -# ``afqdata`` is an ``AFQDataset`` object, with properties corresponding to the -# tractometry features and phenotypic targets. In this case, y includes only the -# group to which the subjects belong, encoded as 0 (patients) or 1 (controls). - -X = afqdata.X -y = afqdata.y -groups = afqdata.groups -feature_names = afqdata.feature_names -group_names = afqdata.group_names -subjects = afqdata.subjects - -X = SimpleImputer(strategy="median").fit_transform(X) - -# Filtering the data -# ---------------- -# We start by filtering the data down to only the FA estimates in the left -# corticospinal tract. We can do that by creating a Pandas dataframe and then -# using the column labels that `afqdataset` has generated from the data. - -data = pd.DataFrame(columns=afqdata.feature_names, data=X) -lcst_fa = data.filter(like="Left Corticospinal").filter(like="fa") - -pvals = np.zeros(lcst_fa.shape[-1]) -coefs = np.zeros(lcst_fa.shape[-1]) -cis = np.zeros((lcst_fa.shape[-1], 2)) - -# Fit models -# ---------------- -# Next, we fit a model for each node of the CST for the FA as a function of -# group. The initializer `OLS.from_formula` takes -# `R-style formulas `_ -# as its model specification. Here, we are using the `"group"` variable as a -# categorical variable in the model. Much more complex linear models (including -# linear mixed effects model) are possible for datasets with more complex -# phenotypical data. - -for ii, column in enumerate(lcst_fa.columns): - feature, node, tract = column - this = pd.DataFrame({"group": y, feature: lcst_fa[column]}) - model = OLS.from_formula(f"{feature} ~ C(group)", this) - fit = model.fit() - coefs[ii] = fit.params.loc["C(group)[T.1]"] - cis[ii] = fit.conf_int(alpha=0.05).loc["C(group)[T.1]"].values - pvals[ii] = anova_lm(fit, typ=2)["PR(>F)"][0] - -# Correct for multiple comparison -# -------------------------------- +# Specify the tracts of interest +# ------------------------------ +# Many times we have a specific set of tracts we want to analyze. We can specify +# those tracts using a list with each tract of interest + +tracts = ["Left Arcuate", "Right Arcuate", "Left Corticospinal", "Right Corticospinal"] + +# Set up plot, run linear models, and show the results +# ---------------------------------------------------- +# With the next few lines of code, we fit a linear model in order to predict +# group differences in the diffusion properties of our tracts of interest. The +# function `node_wise_regression` does this by looping through each node of a +# tract and predicting our diffusion metric, in this case FA, as a function of +# group. The initializer `OLS.from_formula` takes `R-style formulas +# `_ as its model specification. +# Here, we are using the `"group"` variable as a categorical variable in the model. +# We can also specify linear-mixed effects models for more complex phenotypic data +# by passing in the appropriate model formula and setting `lme=True`. +# # Because we conducted 100 comparisons, we need to correct the p-values that # we obtained for the potential for a false discovery. There are multiple # ways to conduct multuple comparison correction, and we will not get into -# the considerations in selecting a method here. For the example, we chose the -# Benjamini/Hochberg FDR controlling method. This returns a boolean array for +# the considerations in selecting a method here. The function `node_wise_regression` +# uses Benjamini/Hochberg FDR controlling method. This returns a boolean array for # the p-values that are rejected at a specified alpha level (after correction), # as well as an array of the corrected p-values. -reject, pval_corrected, _, _ = multipletests(pvals, alpha=0.05, method="fdr_bh") -reject_idx = np.where(reject) +num_cols = 2 + +# Define the figure and its grid +fig, axes = plt.subplots(nrows=2, ncols=num_cols, figsize=(10, 6)) + + +# Loop through the data and generate plots +for i, tract in enumerate(tracts): + + # fit node-wise regression for each tract based on model formula + tract_dict = node_wise_regression(afqdata, tract, "fa", "fa ~ C(group)") + + row = i // num_cols + col = i % num_cols + + axes[row][col].set_title(tract) -# Visualize -# ---------- -# Using Matplotlib, we can visualize the results. The top figure shows the tract -# profiles of the two groups, with stars indicating the nodes at which the null -# hypothesis is rejected. The bottom figure shows the model coefficients -# together with their estimated 95% confidence interval. + # Visualize + # ---------- + # We can visualize the results with the `plot_regression_profiles` function. + # Each subplot shows the tract profiles of the two groups while controlling for + # any covariates, with stars indicating the nodes at which the null hypothesis is rejected. + plot_regression_profiles(tract_dict, axes[row][col]) -fig, ax = plt.subplots(2, 1) -ax[0].plot(np.mean(lcst_fa.iloc[afqdata.y == 0], 0).values) -ax[0].plot(np.mean(lcst_fa.iloc[afqdata.y == 1], 0).values) -ax[0].plot(reject_idx, np.zeros(len(reject_idx)), "k*") +# Adjust layout to prevent overlap +plt.tight_layout() -ax[1].plot(coefs) -ax[1].fill_between(range(100), cis[:, 0], cis[:, 1], alpha=0.5) -ax[1].plot(range(100), np.zeros(100), "k--") +# Show the plot +plt.show() diff --git a/setup.cfg b/setup.cfg index f70d775..f2a82d6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,7 +34,7 @@ python_requires = >=3.10 install_requires = dipy>=1.0.0 groupyr>=0.3.3 - matplotlib + matplotlib<3.9 numpy==1.23.5 pandas==2.1.4 requests @@ -49,7 +49,7 @@ include_package_data = True packages = find: [options.extras_require] -tables = +tables = tables==3.9.1 torch = torch