yeatmanlab · earoy · Jun 11, 2024 · Jun 11, 2024 · Jun 11, 2024 · Jun 12, 2024
diff --git a/afqinsight/parametric.py b/afqinsight/parametric.py
@@ -0,0 +1,167 @@
+"""Perform linear modeling at leach node along the tract."""
+
+import numpy as np
+import pandas as pd
+import statsmodels.formula.api as smf
+
+from sklearn.impute import SimpleImputer
+from statsmodels.api import OLS
+from statsmodels.stats.multitest import multipletests
+
+
+def node_wise_regression(
+    afq_dataset,
+    tract,
+    metric,
+    formula,
+    group="group",
+    lme=False,
+    rand_eff="subjectID",
+):
+    """Model group differences using node-wise regression along the length of the tract.
+
+       Returns a list of beta-weights, confidence intervals, p-values, and rejection criteria
+       based on multiple-comparison correction.
+
+       Based on this example: https://github.com/yeatmanlab/AFQ-Insight/blob/main/examples/plot_als_comparison.py
+
+    Parameters
+    ----------
+    afq_dataset: AFQDataset
+        Loaded AFQDataset object
+    tract: str
+        String specifying the tract to model
+
+    metric: str
+        String specifying which diffusion metric to use as an outcome
+        eg. 'fa'
+
+    formula: str
+        An R-style formula <https://www.statsmodels.org/dev/example_formulas.html>
+        specifying the regression model to fit at each node. This can take the form
+        of either a linear mixed-effects model or OLS regression
+
+    lme: Bool, default=False
+        Boolean specifying whether to fit a linear mixed-effects model
+
+    rand_eff: str, default='subjectID'
+        String specifying the random effect grouping structure for linear mixed-effects
+        models. If using anything other than the default value, this column must be present
+        in the 'target_cols' of the AFQDataset object
+
+
+    Returns
+    -------
+    tract_dict: dict
+        A dictionary with the following key-value pairs:
+
+        {'tract': tract,
+                  'reference_coefs': coefs_default,
+                  'group_coefs': coefs_treat,
+                  'reference_CI': cis_default,
+                  'group_CI': cis_treat,
+                  'pvals': pvals,
+                  'reject_idx': reject_idx,
+                  'model_fits': fits}
+
+        tract: str
+            The tract described by this dictionary
+
+        reference_coefs: list of floats
+            A list of beta-weights representing the average diffusion metric for the
+            reference group on a diffusion metric at a given location along the tract
+
+        group_coefs: list of floats
+            A list of beta-weights representing the average group effect metric for the
+            treatment group on a diffusion metric at a given location along the tract
+
+        reference_CI: np.array of np.array
+            A numpy array containing a series of numpy arrays indicating the 95% confidence interval
+            around the estimated beta-weight of the reference category at a given location along the tract
+
+        group_CI: np.array of np.array
+            A numpy array containing a series of numpy arrays indicating the 95% confidence interval
+            around the estimated beta-weight of the treatment effect at a given location along the tract
+
+        pvals: list of floats
+            A list of p-values testing whether or not the beta-weight of the group effect is
+            different from 0
+
+        reject_idx: list of Booleans
+            A list of node indices where the null hypothesis is rejected after multiple-comparison
+            corrections
+
+        model_fits: list of statsmodels objects
+            A list of the statsmodels object fit along the length of the nodes
+
+    """
+    X = SimpleImputer(strategy="median").fit_transform(afq_dataset.X)
+    afq_dataset.target_cols[0] = group
+
+    tract_data = (
+        pd.DataFrame(columns=afq_dataset.feature_names, data=X)
+        .filter(like=tract)
+        .filter(like=metric)
+    )
+
+    pvals = np.zeros(tract_data.shape[-1])
+    coefs_default = np.zeros(tract_data.shape[-1])
+    coefs_treat = np.zeros(tract_data.shape[-1])
+    cis_default = np.zeros((tract_data.shape[-1], 2))
+    cis_treat = np.zeros((tract_data.shape[-1], 2))
+    fits = {}
+
+    # Loop through each node and fit a model
+    for ii, column in enumerate(tract_data.columns):
+
+        # fit linear mixed-effects model
+        if lme:
+
+            this = pd.DataFrame(afq_dataset.y, columns=afq_dataset.target_cols)
+            this[metric] = tract_data[column]
+
+            # if no random effect specified, use subjectID as random effect
+            if rand_eff == "subjectID":
+                this["subjectID"] = afq_dataset.subjects
+
+            model = smf.mixedlm(formula, this, groups=rand_eff)
+            fit = model.fit()
+            fits[column] = fit
+
+        # fit OLS model
+        else:
+
+            _, _, _ = column
+            this = pd.DataFrame(afq_dataset.y, columns=afq_dataset.target_cols)
+            this[metric] = tract_data[column]
+
+            model = OLS.from_formula(formula, this)
+            fit = model.fit()
+            fits[column] = fit
+
+        # pull out coefficients, CIs, and p-values from our model
+        coefs_default[ii] = fit.params.filter(regex="Intercept", axis=0).iloc[0]
+        coefs_treat[ii] = fit.params.filter(regex=group, axis=0).iloc[0]
+
+        cis_default[ii] = (
+            fit.conf_int(alpha=0.05).filter(regex="Intercept", axis=0).values
+        )
+        cis_treat[ii] = fit.conf_int(alpha=0.05).filter(regex=group, axis=0).values
+        pvals[ii] = fit.pvalues.filter(regex=group, axis=0).iloc[0]
+
+    # Correct p-values for multiple comparisons
+    reject, pval_corrected, _, _ = multipletests(pvals, alpha=0.05, method="fdr_bh")
+    reject_idx = np.where(reject)
+
+    tract_dict = {
+        "tract": tract,
+        "reference_coefs": coefs_default,
+        "group_coefs": coefs_treat,
+        "reference_CI": cis_default,
+        "group_CI": cis_treat,
+        "pvals": pvals,
+        "reject_idx": reject_idx,
+        "model_fits": fits,
+    }
+
+    return tract_dict
diff --git a/afqinsight/plot.py b/afqinsight/plot.py
@@ -331,3 +331,48 @@ def plot_tract_profiles(
         figs[metric] = fig
 
     return figs
+
+
+def plot_regression_profiles(model_dict, ax):
+    """Plot parametric tract profiles based on node-wise linear models.
+
+    Parameters
+    ----------
+    model_dict: dict
+        Dictionary returned by parametric.node_wise_regression containing information about
+        parametric model fits over the length of a tract
+
+    ax: matplotlib.axes
+        A matplotlib axes for a subplot where the tract profiles will be plotted
+
+    Returns
+    -------
+    ax: matplotlib.axes
+        A matplotlib axes with plotted parametric tract profiles
+
+    """
+    # Add beta-weight for group to intercept (mean control)
+    combo_coefs = model_dict["reference_coefs"] + model_dict["group_coefs"]
+
+    # Generate 95% CI around group mean by adding to control intercept
+    combo_conf_int = np.zeros(model_dict["reference_CI"].shape)
+    combo_conf_int[:, 0] = model_dict["reference_coefs"] + model_dict["group_CI"][:, 0]
+    combo_conf_int[:, 1] = model_dict["reference_coefs"] + model_dict["group_CI"][:, 1]
+
+    # Plot regression based tract profiles
+    ax.plot(model_dict["reference_coefs"])
+    ax.plot(combo_coefs)
+    ax.plot(model_dict["reject_idx"], np.zeros(len(model_dict["reject_idx"])), "k*")
+
+    ax.fill_between(
+        range(100),
+        model_dict["reference_CI"][:, 0],
+        model_dict["reference_CI"][:, 1],
+        alpha=0.5,
+    )  # Adding fill_between
+
+    ax.fill_between(
+        range(100), combo_conf_int[:, 0], combo_conf_int[:, 1], alpha=0.5
+    )  # Adding fill_between
+
+    return ax
diff --git a/examples/plot_als_comparison.py b/examples/plot_als_comparison.py
@@ -19,16 +19,12 @@
    DOI: 10.1002/hbm.23412
 
 """
+
 import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
 
 from afqinsight import AFQDataset
-from statsmodels.api import OLS
-from statsmodels.stats.anova import anova_lm
-from statsmodels.stats.multitest import multipletests
-
-from sklearn.impute import SimpleImputer
+from afqinsight.parametric import node_wise_regression
+from afqinsight.plot import plot_regression_profiles
 
 #############################################################################
 # Fetch data from Sarica et al.
@@ -42,78 +38,59 @@
 
 afqdata = AFQDataset.from_study("sarica")
 
-# Examine the data
-# ----------------
-# ``afqdata`` is an ``AFQDataset`` object, with properties corresponding to the
-# tractometry features and phenotypic targets. In this case, y includes only the
-# group to which the subjects belong, encoded as 0 (patients) or 1 (controls).
-
-X = afqdata.X
-y = afqdata.y
-groups = afqdata.groups
-feature_names = afqdata.feature_names
-group_names = afqdata.group_names
-subjects = afqdata.subjects
-
-X = SimpleImputer(strategy="median").fit_transform(X)
-
-# Filtering the data
-# ----------------
-# We start by filtering the data down to only the FA estimates in the left
-# corticospinal tract. We can do that by creating a Pandas dataframe and then
-# using the column labels that `afqdataset` has generated from the data.
-
-data = pd.DataFrame(columns=afqdata.feature_names, data=X)
-lcst_fa = data.filter(like="Left Corticospinal").filter(like="fa")
-
-pvals = np.zeros(lcst_fa.shape[-1])
-coefs = np.zeros(lcst_fa.shape[-1])
-cis = np.zeros((lcst_fa.shape[-1], 2))
-
-# Fit models
-# ----------------
-# Next, we fit a model for each node of the CST for the FA as a function of
-# group. The initializer `OLS.from_formula` takes
-# `R-style formulas <https://www.statsmodels.org/dev/example_formulas.html>`_
-# as its model specification. Here, we are using the `"group"` variable as a
-# categorical variable in the model. Much more complex linear models (including
-# linear mixed effects model) are possible for datasets with more complex
-# phenotypical data.
-
-for ii, column in enumerate(lcst_fa.columns):
-    feature, node, tract = column
-    this = pd.DataFrame({"group": y, feature: lcst_fa[column]})
-    model = OLS.from_formula(f"{feature} ~ C(group)", this)
-    fit = model.fit()
-    coefs[ii] = fit.params.loc["C(group)[T.1]"]
-    cis[ii] = fit.conf_int(alpha=0.05).loc["C(group)[T.1]"].values
-    pvals[ii] = anova_lm(fit, typ=2)["PR(>F)"][0]
-
-# Correct for multiple comparison
-# --------------------------------
+# Specify the tracts of interest
+# ------------------------------
+# Many times we have a specific set of tracts we want to analyze. We can specify
+# those tracts using a list with each tract of interest
+
+tracts = ["Left Arcuate", "Right Arcuate", "Left Corticospinal", "Right Corticospinal"]
+
+# Set up plot, run linear models, and show the results
+# ----------------------------------------------------
+# With the next few lines of code, we fit a linear model in order to predict
+# group differences in the diffusion properties of our tracts of interest. The
+# function `node_wise_regression` does this by looping through each node of a
+# tract and predicting our diffusion metric, in this case FA, as a function of
+# group. The initializer `OLS.from_formula` takes `R-style formulas
+# <https://www.statsmodels.org/dev/example_formulas.html>`_ as its model specification.
+# Here, we are using the `"group"` variable as a categorical variable in the model.
+# We can also specify linear-mixed effects models for more complex phenotypic data
+# by passing in the appropriate model formula and setting `lme=True`.
+#
 # Because we conducted 100 comparisons, we need to correct the p-values that
 # we obtained for the potential for a false discovery. There are multiple
 # ways to conduct multuple comparison correction, and we will not get into
-# the considerations in selecting a method here. For the example, we chose the
-# Benjamini/Hochberg FDR controlling method. This returns a boolean array for
+# the considerations in selecting a method here. The function `node_wise_regression`
+# uses Benjamini/Hochberg FDR controlling method. This returns a boolean array for
 # the p-values that are rejected at a specified alpha level (after correction),
 # as well as an array of the corrected p-values.
 
-reject, pval_corrected, _, _ = multipletests(pvals, alpha=0.05, method="fdr_bh")
-reject_idx = np.where(reject)
+num_cols = 2
+
+# Define the figure and its grid
+fig, axes = plt.subplots(nrows=2, ncols=num_cols, figsize=(10, 6))
+
+
+# Loop through the data and generate plots
+for i, tract in enumerate(tracts):
+
+    # fit node-wise regression for each tract based on model formula
+    tract_dict = node_wise_regression(afqdata, tract, "fa", "fa ~ C(group)")
+
+    row = i // num_cols
+    col = i % num_cols
+
+    axes[row][col].set_title(tract)
 
-# Visualize
-# ----------
-# Using Matplotlib, we can visualize the results. The top figure shows the tract
-# profiles of the two groups, with stars indicating the nodes at which the null
-# hypothesis is rejected. The bottom figure shows the model coefficients
-# together with their estimated 95% confidence interval.
+    # Visualize
+    # ----------
+    # We can visualize the results with the `plot_regression_profiles` function.
+    # Each subplot shows the tract profiles of the two groups while controlling for
+    # any covariates, with stars indicating the nodes at which the null hypothesis is rejected.
+    plot_regression_profiles(tract_dict, axes[row][col])
 
-fig, ax = plt.subplots(2, 1)
-ax[0].plot(np.mean(lcst_fa.iloc[afqdata.y == 0], 0).values)
-ax[0].plot(np.mean(lcst_fa.iloc[afqdata.y == 1], 0).values)
-ax[0].plot(reject_idx, np.zeros(len(reject_idx)), "k*")
+# Adjust layout to prevent overlap
+plt.tight_layout()
 
-ax[1].plot(coefs)
-ax[1].fill_between(range(100), cis[:, 0], cis[:, 1], alpha=0.5)
-ax[1].plot(range(100), np.zeros(100), "k--")
+# Show the plot
+plt.show()