diff --git a/WeightSearch.csv b/WeightSearch.csv index de567e2ed..4605e1098 100644 --- a/WeightSearch.csv +++ b/WeightSearch.csv @@ -1,2 +1,3 @@ -,Clusters,DDMC,Average,Zero,PCA -0,1.0,,,, +,Weight,DDMC,Average,Zero,PCA +0,0.0,,,, +1,100.0,,,, diff --git a/ddmc/binomial.py b/ddmc/binomial.py index 054c152e6..7857b5e61 100644 --- a/ddmc/binomial.py +++ b/ddmc/binomial.py @@ -54,7 +54,7 @@ def GenerateBinarySeqID(seqs: list[str]) -> np.ndarray: return res -def BackgroundSeqs(forseqs: pd.Series) -> list[str]: +def BackgroundSeqs(forseqs: np.ndarray[str]) -> list[str]: """Build Background data set with the same proportion of pY, pT, and pS motifs as in the foreground set of sequences. Note this PsP data set contains 51976 pY, 226131 pS, 81321 pT Source: https://www.phosphosite.org/staticDownloads.action - @@ -62,7 +62,7 @@ def BackgroundSeqs(forseqs: pd.Series) -> list[str]: Cite: Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926""" # Get porportion of psite types in foreground set - forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs.astype(str).tolist()) + forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs) forw_tot = forw_pYn + forw_pSn + forw_pTn pYf = forw_pYn / forw_tot @@ -132,9 +132,9 @@ def BackgProportions(refseqs: list[str], pYn: int, pSn: int, pTn: int) -> list[s class Binomial: """Definition of the binomial sequence distance distribution.""" - def __init__(self, seq: pd.Series, seqs: list[str]): + def __init__(self, seqs: np.ndarray[str]): # Background sequences - background = position_weight_matrix(BackgroundSeqs(seq)) + background = position_weight_matrix(BackgroundSeqs(seqs)) self.background = np.array([background[AA] for AA in AAlist]) self.foreground: np.ndarray = GenerateBinarySeqID(seqs) @@ -152,7 +152,7 @@ def from_summaries(self, weightsIn: np.ndarray): self.logWeights = np.log(tempp) -def CountPsiteTypes(X: list[str]) -> tuple[int, int, int]: +def CountPsiteTypes(X: np.ndarray[str]) -> tuple[int, int, int]: """Count the number of different phosphorylation types in an MS data set. Args: @@ -161,11 +161,13 @@ def CountPsiteTypes(X: list[str]) -> tuple[int, int, int]: Returns: tuple[int, int, int]: The number of pY, pS, and pT sites. """ + X = np.char.upper(X) + # Find the center amino acid cA = int((len(X[0]) - 1) / 2) - positionSeq = [seq[cA] for seq in X] - pS = positionSeq.count("s") - pT = positionSeq.count("t") - pY = positionSeq.count("y") + phospho_aminos = [seq[cA] for seq in X] + pS = phospho_aminos.count("S") + pT = phospho_aminos.count("T") + pY = phospho_aminos.count("Y") return pY, pS, pT diff --git a/ddmc/clustering.py b/ddmc/clustering.py index edc07dd20..3a45e9540 100644 --- a/ddmc/clustering.py +++ b/ddmc/clustering.py @@ -1,6 +1,6 @@ """ Clustering functions. """ -from typing import Literal +from typing import Literal, List, Dict import warnings from copy import deepcopy import itertools @@ -10,7 +10,7 @@ from sklearn.utils.validation import check_is_fitted from .binomial import Binomial, AAlist, BackgroundSeqs, frequencies from .pam250 import PAM250 -from .motifs import PSPLdict, compute_control_pssm +from .motifs import get_pspls, compute_control_pssm from fancyimpute import SoftImpute @@ -21,7 +21,7 @@ class DDMC(GaussianMixture): def __init__( self, - sequences: pd.Series, + sequences, n_components: int, seq_weight: float, distance_method: Literal["PAM250", "Binomial"], @@ -37,24 +37,25 @@ def __init__( tol=tol, random_state=random_state, ) - - self.sequences = sequences + self.gen_peptide_distances(sequences, distance_method) self.seq_weight = seq_weight - self.distance_method = distance_method - seqs = sequences.str.upper().to_list() - - - - def gen_peptide_distances(self, seqs, distance_method): + def gen_peptide_distances(self, seqs: np.ndarray | pd.DataFrame, distance_method): + # store parameters for sklearn's checks + self.distance_method = distance_method + if not isinstance(seqs, np.ndarray): + seqs = seqs.values + if seqs.dtype != str: + seqs = seqs.astype("str") + seqs = np.char.upper(seqs) + self.sequences = seqs if distance_method == "PAM250": self.seq_dist: PAM250 | Binomial = PAM250(seqs) elif distance_method == "Binomial": - self.seq_dist = Binomial(sequences, seqs) + self.seq_dist = Binomial(seqs) else: raise ValueError("Wrong distance type.") - def _estimate_log_prob(self, X: np.ndarray): """Estimate the log-probability of each point in each cluster.""" logp = super()._estimate_log_prob(X) # Do the regular work @@ -87,9 +88,18 @@ def _m_step(self, X: np.ndarray, log_resp: np.ndarray): # Do sequence m step self.seq_dist.from_summaries(np.exp(log_resp)) - def fit(self, X: np.ndarray | pd.DataFrame, y=None): - """Compute EM clustering""" - d = np.array(X.T) + def fit(self, X: pd.DataFrame): + """ + Compute EM clustering + + Args: + X: dataframe consisting of a "Sequence" column, and sample + columns. Every column that is not named "Sequence" will be treated + as a sample. + """ + # TODO: assert that the peptides passed in match the length of X + # TODO: probably just pass in sequences here + d = np.array(X) if np.any(np.isnan(d)): self._missing = True @@ -151,12 +161,12 @@ def impute(self, X: np.ndarray) -> np.ndarray: assert np.all(np.isfinite(X)) return X - def pssms(self, PsP_background=False): + def get_pssms(self, PsP_background=False, clusters: List=None): """Compute position-specific scoring matrix of each cluster. Note, to normalize by amino acid frequency this uses either all the sequences in the data set or a collection of random MS phosphosites in PhosphoSitePlus. """ - pssms, cl_num = [], [] + pssms, pssm_names = [], [] if PsP_background: bg_seqs = BackgroundSeqs(self.sequences) back_pssm = compute_control_pssm(bg_seqs) @@ -200,7 +210,7 @@ def pssms(self, PsP_background=False): pssm.index = AAlist # Normalize phosphoacceptor position to frequency - df = pd.DataFrame(self.sequences.str.upper()) + df = pd.DataFrame({"Sequence": self.sequences}) df["Cluster"] = self.labels() clSeq = df[df["Cluster"] == ii]["Sequence"] clSeq = pd.DataFrame(frequencies(clSeq)).T @@ -209,34 +219,33 @@ def pssms(self, PsP_background=False): pssm.loc[p_site, 5] = np.log2(clSeq.loc[p_site, 5] / tm) pssms.append(np.clip(pssm, a_min=0, a_max=3)) - cl_num.append(ii) + pssm_names.append(ii) + + pssm_names, pssms = np.array(pssm_names), np.array(pssms) + + if clusters is not None: + return pssms[[np.where(pssm_names == cluster)[0][0] for cluster in clusters]] - return pssms, cl_num + return pssm_names, pssms - def predict_UpstreamKinases( - self, additional_pssms=False, add_labels=False, PsP_background=True + def predict_upstream_kinases( + self, + PsP_background=True, ): - """Compute matrix-matrix similarity between kinase specificity profiles and cluster PSSMs to identify upstream kinases regulating clusters.""" - PSPLs = PSPLdict() - PSSMs, cl_num = self.pssms(PsP_background=PsP_background) - - # Optionally add external pssms - if not isinstance(additional_pssms, bool): - PSSMs += additional_pssms - cl_num += add_labels - PSSMs = [ - np.delete(np.array(list(np.array(mat))), [5, 10], axis=1) for mat in PSSMs - ] # Remove P0 and P+5 from pssms - - a = np.zeros((len(PSPLs), len(PSSMs))) - for ii, spec_profile in enumerate(PSPLs.values()): - for jj, pssm in enumerate(PSSMs): - a[ii, jj] = np.linalg.norm(pssm - spec_profile) - - table = pd.DataFrame(a) - table.columns = cl_num - table.insert(0, "Kinase", list(PSPLdict().keys())) - return table + """Compute matrix-matrix similarity between kinase specificity profiles + and cluster PSSMs to identify upstream kinases regulating clusters.""" + kinases, pspls = get_pspls() + clusters, pssms = self.get_pssms(PsP_background=PsP_background) + + distances = get_pspl_pssm_distances( + pspls, + pssms, + as_df=True, + pssm_names=clusters, + kinases=kinases, + ) + + return distances def predict(self) -> np.ndarray: """Provided the current model parameters, predict the cluster each peptide belongs to.""" @@ -252,3 +261,19 @@ def score(self) -> float: check_is_fitted(self, ["lower_bound_"]) return self.lower_bound_ + +def get_pspl_pssm_distances( + pspls: np.ndarray, pssms: np.ndarray, as_df=False, pssm_names=None, kinases=None +) -> np.ndarray | pd.DataFrame: + """ + Args: + pspls: kinase specificity profiles of shape (n_kinase, 20, 9) + pssms: position-specific scoring matrices of shape (n_peptides, 20, 11) + """ + assert pssms.shape[1:3] == (20, 11) + assert pspls.shape[1:3] == (20, 9) + pssms = np.delete(pssms, [5, 10], axis=2) + dists = np.linalg.norm(pspls[:, None, :, :] - pssms[None, :, :, :], axis=(2, 3)) + if as_df: + dists = pd.DataFrame(dists, index=kinases, columns=pssm_names) + return dists diff --git a/ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv b/ddmc/data/MS/CPTAC/CPTAC-preprocessedMotifs.csv similarity index 100% rename from ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv rename to ddmc/data/MS/CPTAC/CPTAC-preprocessedMotifs.csv diff --git a/ddmc/figures/common.py b/ddmc/figures/common.py index 0ecd89335..18afd97af 100644 --- a/ddmc/figures/common.py +++ b/ddmc/figures/common.py @@ -15,8 +15,8 @@ from ..clustering import DDMC from scipy.stats import mannwhitneyu from statsmodels.stats.multitest import multipletests -from ..pre_processing import MeanCenter from ..motifs import KinToPhosphotypeDict +from ddmc.binomial import AAlist rcParams["font.sans-serif"] = "Arial" @@ -146,7 +146,7 @@ def getDDMC_CPTAC(n_components: int, SeqWeight: float): return model, X -def plotMotifs(pssm, ax: axes.Axes, titles=False, yaxis=False): +def plot_motifs(pssm, ax: axes.Axes, titles=False, yaxis=False): """Generate logo plots of a list of PSSMs""" pssm = pssm.T if pssm.shape[0] == 11: @@ -172,7 +172,52 @@ def plotMotifs(pssm, ax: axes.Axes, titles=False, yaxis=False): logo.ax.set_ylim([yaxis[0], yaxis[1]]) -def plotDistanceToUpstreamKinase( +def plot_cluster_kinase_distances(distances: pd.DataFrame, pssms: np.ndarray, ax, num_hits=1): + pssm_names = distances.columns + + # melt distances + distances = distances.sub(distances.mean(axis=0), axis=1) + distances = pd.melt( + distances.reset_index(names="Kinase"), + id_vars="Kinase", + value_vars=list(distances.columns), + var_name="PSSM name", + value_name="Frobenius Distance", + ) + + sns.stripplot(data=distances, x="PSSM name", y="Frobenius Distance", ax=ax) + + # Annotate upstream kinase predictions + for i, pssm_name in enumerate(pssm_names): + distances_pssm = distances[distances["PSSM name"] == pssm_name] + distances_pssm = distances_pssm.sort_values( + by="Frobenius Distance", ascending=True + ) + distances_pssm = distances_pssm.reset_index(drop=True) + # assert that the kinase phosphoacceptor and most frequent phosphoacceptor in the pssm match + distances_pssm["Phosphoacceptor"] = [ + KinToPhosphotypeDict[kin] for kin in distances_pssm["Kinase"] + ] + try: + most_frequent_phosphoacceptor = AAlist(pssms[i, 5].argmax()) + except: + most_frequent_phosphoacceptor = "S/T" + if most_frequent_phosphoacceptor == "S" or most_frequent_phosphoacceptor == "T": + most_frequent_phosphoacceptor = "S/T" + distances_pssm = distances_pssm[ + distances_pssm["Phosphoacceptor"] == most_frequent_phosphoacceptor + ] + for jj in range(num_hits): + ax.annotate( + distances_pssm["Kinase"].iloc[jj], + (i, distances_pssm["Frobenius Distance"].iloc[jj] - 0.01), + fontsize=8, + ) + ax.legend().remove() + ax.set_title("Kinase vs Cluster Motif") + + +def plot_distance_to_upstream_kinase( model: DDMC, clusters: list[int], ax, @@ -183,7 +228,7 @@ def plotDistanceToUpstreamKinase( PsP_background=True, ): """Plot Frobenius norm between kinase PSPL and cluster PSSMs""" - ukin = model.predict_UpstreamKinases( + ukin = model.predict_upstream_kinases( additional_pssms=additional_pssms, add_labels=add_labels, PsP_background=PsP_background, @@ -206,8 +251,7 @@ def plotDistanceToUpstreamKinase( data["Cluster"] = data["Cluster"].astype(str) d1 = data[~data["Cluster"].str.contains("_S")] sns.stripplot(data=d1, x="Cluster", y="Frobenius Distance", ax=ax[0]) - print(cOG) - AnnotateUpstreamKinases(model, list(cOG) + ["ERK2+"], ax[0], d1, 1) + annotate_upstream_kinases(model, list(cOG) + ["ERK2+"], ax[0], d1, 1) # Shuffled d2 = data[data["Kinase"] == "ERK2"] @@ -223,64 +267,14 @@ def plotDistanceToUpstreamKinase( ) ax[1].set_title("ERK2 Shuffled Positions") ax[1].legend(prop={"size": 10}, loc="lower left") - DrawArrows(ax[1], d2) + draw_arrows(ax[1], d2) else: sns.stripplot(data=data, x="Cluster", y="Frobenius Distance", ax=ax) - AnnotateUpstreamKinases(model, clusters, ax, data, num_hits) + annotate_upstream_kinases(model, clusters, ax, data, num_hits) if title: ax.set_title(title) -def AnnotateUpstreamKinases(model: DDMC, clusters, ax, data, num_hits: int = 1): - """Annotate upstream kinase predictions""" - data.iloc[:, 1] = data.iloc[:, 1].astype(str) - pssms, _ = model.pssms() - for ii, c in enumerate(clusters, start=1): - cluster = data[data.iloc[:, 1] == str(c)] - hits = cluster.sort_values(by="Frobenius Distance", ascending=True) - hits.index = np.arange(hits.shape[0]) - hits["Phosphoacceptor"] = [KinToPhosphotypeDict[kin] for kin in hits["Kinase"]] - try: - cCP = pssms[c - 1].iloc[:, 5].idxmax() - except BaseException: - cCP == "S/T" - if cCP == "S" or cCP == "T": - cCP = "S/T" - hits = hits[hits["Phosphoacceptor"] == cCP] - for jj in range(num_hits): - ax.annotate( - hits["Kinase"].iloc[jj], - (ii - 1, hits["Frobenius Distance"].iloc[jj] - 0.01), - fontsize=8, - ) - ax.legend().remove() - ax.set_title("Kinase vs Cluster Motif") - - -def DrawArrows(ax, d2): - data_shuff = d2[d2["Shuffled"]] - actual_erks = d2[d2["Shuffled"] == False] - arrow_lengths = ( - np.add( - data_shuff["Frobenius Distance"].values, - abs(actual_erks["Frobenius Distance"].values), - ) - * -1 - ) - for dp in range(data_shuff.shape[0]): - ax.arrow( - dp, - data_shuff["Frobenius Distance"].iloc[dp] - 0.1, - 0, - arrow_lengths[dp] + 0.3, - head_width=0.25, - head_length=0.15, - width=0.025, - fc="black", - ec="black", - ) - - def plot_clusters_binaryfeatures(centers, id_var, ax, pvals=False, loc="best"): """Plot p-signal of binary features (tumor vs NAT or mutational status) per cluster""" data = pd.melt( diff --git a/ddmc/figures/figureM2.py b/ddmc/figures/figureM2.py index 41f2aea46..ee8a7902a 100644 --- a/ddmc/figures/figureM2.py +++ b/ddmc/figures/figureM2.py @@ -114,7 +114,7 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt= """Calculate missingness error across different numbers of clusters and/or weights.""" assert len(weights) == len(n_clusters) X_raw = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], + pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotifs.csv").iloc[:, 1:], tmt=tmt, ) # reset index @@ -122,7 +122,7 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt= info_cols = ["Sequence", "Protein", "Gene", "Position"] sample_cols = [col for col in X_raw.columns if col not in info_cols] - info = X_raw[info_cols].copy() + sequences = X_raw["Sequence"].copy() X = X_raw[sample_cols].copy() # the condition in which each sample was collected @@ -165,7 +165,7 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt= cluster, weights[jj], imputation_error( - X, impute_ddmc(X, info, cluster, weights[jj], distance_method) + X, impute_ddmc(X, sequences, cluster, weights[jj], distance_method) ), *baseline_errs, ] @@ -211,8 +211,8 @@ def impute_pca(X, rank): return IterativeSVD(rank=rank, verbose=False).fit_transform(X) -def impute_ddmc(X, info, n_clusters, weight, distance_method): - return DDMC(info, n_clusters, weight, distance_method, max_iter=1, tol=0.1).fit(X.T).impute(X) +def impute_ddmc(X, sequences, n_clusters, weight, distance_method): + return DDMC(sequences, n_clusters, weight, distance_method, max_iter=1, tol=0.1).fit(X).impute(X) makeFigure() diff --git a/ddmc/figures/figureM3.py b/ddmc/figures/figureM3.py index 8cd611180..d41f92884 100644 --- a/ddmc/figures/figureM3.py +++ b/ddmc/figures/figureM3.py @@ -10,132 +10,126 @@ from ..binomial import AAlist from .common import getSetup from ..pca import plotPCA -from .common import plotDistanceToUpstreamKinase, plotMotifs -from ..clustering import compute_control_pssm +from .common import ( + plot_distance_to_upstream_kinase, + plot_motifs, + plot_cluster_kinase_distances, +) +from ..clustering import compute_control_pssm, get_pspl_pssm_distances from ..binomial import AAlist -from ..motifs import DictProteomeNameToSeq -from ..pre_processing import filter_NaNpeptides +from ..motifs import DictProteomeNameToSeq, get_pspls +from ..pre_processing import filter_NaNpeptides, separate_sequence_and_abundance +from sklearn.decomposition import PCA def makeFigure(): """Get a list of the axis objects and create a figure""" # Get list of axis objects - ax, f = getSetup((12, 12), (3, 3), multz={3: 1}) + axes, f = getSetup((12, 12), (3, 3), multz={3: 1}) - # Import signaling data - x = preprocess_ebdt_mcf7() - d = x.select_dtypes(include=[float]).T - i = x["Sequence"] - - # Fit DDMC and find centers - model = DDMC( - i, n_components=20, seq_weight=5, distance_method="Binomial", random_state=10 - ).fit(d) - centers = pd.DataFrame(model.transform()) - centers.columns = np.arange(model.n_components) + 1 - centers.insert(0, "Inhibitor", x.columns[3:]) - centers["Inhibitor"] = [s.split(".")[1].split(".")[0] for s in centers["Inhibitor"]] - - # PCA AKT - AKTi = [ - "GSK690693", - "Torin1", - "HS173", - "GDC0941", - "Ku0063794", - "AZ20", - "MK2206", - "AZD5363", - "GDC0068", - "AZD6738", - "AT13148", - "Edelfosine", - "GF109203X", - "AZD8055", - ] - centers["AKTi"] = [drug in AKTi for drug in centers["Inhibitor"]] - plotPCA(ax[:2], centers, 2, ["Inhibitor", "AKTi"], "Cluster", hue_scores="AKTi") - ax[0].legend(loc="lower left", prop={"size": 9}, title="AKTi", fontsize=9) - - # Upstream Kinases AKT EBDT - plotDistanceToUpstreamKinase(model, [16], ax=ax[2], num_hits=1) - - # first plot heatmap of clusters - ax[3].axis("off") - - # AKT substrates bar plot - plot_NetPhoresScoreByKinGroup( - "ddmc/data/cluster_analysis/MCF7_NKIN_CL16.csv", - ax[4], - title="Cluster 16—Kinase Predictions", - n=40, + plot_fig_3abd(axes[0], axes[1], axes[2]) + + # 3c cannot be automatically placed on figure because of seaborn limitation + axes[3].axis("off") + + plot_fig_3e( + axes[4], ) - # # ERK2 White lab motif - erk2 = pd.read_csv("ddmc/data/Validations/Computational/ERK2_substrates.csv") - erk2 = compute_control_pssm([s.upper() for s in erk2["Peptide"]]) - erk2 = pd.DataFrame(np.clip(erk2, a_min=0, a_max=3)) - erk2.index = AAlist - plotMotifs(erk2, ax=ax[5], titles="ERK2") + plot_fig_3fgh(*axes[5:8]) - # ERK2 prediction + return f + + +def plot_fig_3abd(ax_a, ax_b, ax_d): # Import signaling data - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, - ) - d = X.select_dtypes(include=[float]).T - i = X["Sequence"] + x = preprocess_ebdt_mcf7() + seqs, abund = separate_sequence_and_abundance(x) # Fit DDMC - model_cptac = DDMC( - i, n_components=30, seq_weight=100, distance_method="Binomial", random_state=5 - ).fit(d) - - s_pssms = ShuffleClusters([3, 7, 21], model_cptac, additional=erk2) - plotDistanceToUpstreamKinase( - model_cptac, - [3, 7, 21], - additional_pssms=s_pssms + [erk2], - add_labels=["3_S", "7_S", "21_S", "ERK2+_S", "ERK2+"], - ax=ax[-2:], - num_hits=1, - ) + ddmc = DDMC( + seqs, + n_components=20, + seq_weight=5, + distance_method="Binomial", + random_state=10, + max_iter=1, + ).fit(abund) - return f + # get cluster centers + centers = pd.DataFrame(ddmc.transform()) + centers.columns = np.arange(ddmc.n_components) + 1 + # parse inhibitor names from sample names + inhibitors = [s.split(".")[1].split(".")[0] for s in abund.columns] -def ShuffleClusters(shuffle, model, additional=False): - """Returns PSSMs with shuffled positions""" - ClustersToShuffle = np.array(shuffle) - pssms, _ = model.pssms(PsP_background=False) - s_pssms = [] - for s in ClustersToShuffle: - mat = ShufflePositions(pssms[s]) - s_pssms.append(mat) + # create new bool array specifying whether each inhibitor is an AKT inhibitor + is_AKTi = [ + drug + in [ + "GSK690693", + "Torin1", + "HS173", + "GDC0941", + "Ku0063794", + "AZ20", + "MK2206", + "AZD5363", + "GDC0068", + "AZD6738", + "AT13148", + "Edelfosine", + "GF109203X", + "AZD8055", + ] + for drug in inhibitors + ] - if not isinstance(additional, bool): - mat = ShufflePositions(additional) - s_pssms.append(mat) + # run PCA on cluster centers + pca = PCA(n_components=2) + scores = pca.fit_transform(centers) # sample by PCA component + loadings = pca.components_ # PCA component by cluster + variance_explained = np.round(pca.explained_variance_ratio_, 2) - return s_pssms + # plot scores + sns.scatterplot( + x=scores[:, 0], + y=scores[:, 1], + hue=is_AKTi, + ax=ax_a, + **{"linewidth": 0.5, "edgecolor": "k"}, + ) + ax_a.legend(loc="lower left", prop={"size": 9}, title="AKTi", fontsize=9) + ax_a.set_title("PCA Scores") + ax_a.set_xlabel("PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10) + ax_a.set_ylabel("PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10) + ax_a.legend(prop={"size": 8}) + # plot loadings + sns.scatterplot( + x=loadings[0], y=loadings[1], ax=ax_b, **{"linewidth": 0.5, "edgecolor": "k"} + ) + ax_b.set_title("PCA Loadings") + ax_b.set_xlabel("PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10) + ax_b.set_ylabel("PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10) + ax_b.legend(prop={"size": 8}) + for j, txt in enumerate(centers.columns): + ax_b.annotate( + txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10 + ) -def ShufflePositions(pssm): - """Shuffles the positions of input PSSMs""" - pssm = np.array(pssm) - mat = pssm[:, np.random.permutation([0, 1, 2, 3, 4, 6, 7, 8, 9])] - mat = np.insert(mat, 5, pssm[:, 5], axis=1) - mat = np.insert(mat, 1, pssm[:, -1], axis=1) - mat = pd.DataFrame(mat) - mat.index = AAlist - return mat + # Plot kinase predictions for cluster 16 + plot_cluster_kinase_distances( + ddmc.predict_upstream_kinases()[[16]], + ddmc.get_pssms(PsP_background=True, clusters=[16])[0], + ax=ax_d, + ) -def plot_NetPhoresScoreByKinGroup(PathToFile, ax, n=5, title=False, color="royalblue"): +def plot_fig_3e(ax): """Plot top scoring kinase groups""" NPtoCumScore = {} - X = pd.read_csv(PathToFile) + X = pd.read_csv("ddmc/data/cluster_analysis/MCF7_NKIN_CL16.csv") for ii in range(X.shape[0]): curr_NPgroup = X["netphorest_group"][ii] if curr_NPgroup == "any_group": @@ -147,25 +141,120 @@ def plot_NetPhoresScoreByKinGroup(PathToFile, ax, n=5, title=False, color="royal X = pd.DataFrame.from_dict(NPtoCumScore, orient="index").reset_index() X.columns = ["KIN Group", "NetPhorest Score"] X["KIN Group"] = [s.split("_")[0] for s in X["KIN Group"]] - X = X.sort_values(by="NetPhorest Score", ascending=False).iloc[:n, :] + X = X.sort_values(by="NetPhorest Score", ascending=False).iloc[:40, :] sns.stripplot( data=X, y="KIN Group", x="NetPhorest Score", ax=ax, orient="h", - color=color, + color="royalblue", size=5, **{"linewidth": 1}, **{"edgecolor": "black"}, ) - if title: - ax.set_title(title) - else: - ax.set_title("Kinase Predictions") + ax.set_title("Cluster 16—Kinase Predictions") -def plotMCF7AKTclustermap(model, cluster): +def plot_fig_3fgh(ax_f, ax_g, ax_h): + # plot erk2+ pssm + # ERK2 White lab motif + erk2_pssm = pd.read_csv("ddmc/data/Validations/Computational/ERK2_substrates.csv") + erk2_pssm = compute_control_pssm([s.upper() for s in erk2_pssm["Peptide"]]) + erk2_pssm = pd.DataFrame(np.clip(erk2_pssm, a_min=0, a_max=3)) + erk2_pssm.index = AAlist + plot_motifs(erk2_pssm, ax=ax_f, titles="ERK2") + + # ERK2 prediction + # Import signaling data + seqs, abund = separate_sequence_and_abundance( + filter_NaNpeptides( + pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotifs.csv").iloc[:, 1:], + tmt=2, + ) + ) + + # Fit DDMC + ddmc_cptac = DDMC( + seqs, + n_components=30, + seq_weight=100, + distance_method="Binomial", + random_state=5, + max_iter=1, + ).fit(abund) + + clusters = [3, 7, 21] + # get pssms from ddmc clusters + pssms = ddmc_cptac.get_pssms(PsP_background=True, clusters=clusters) + + # append erk2+ pssm + pssm_names = clusters + ["ERK2+"] + pssms = np.append(pssms, erk2_pssm.values[None, :, :], axis=0) + + # get kinase-pssm specificities + kinases, pspls = get_pspls() + distances = get_pspl_pssm_distances( + pspls, pssms, as_df=True, pssm_names=pssm_names, kinases=kinases + ) + + # plot the specificities + plot_cluster_kinase_distances(distances, pssms, ax_g) + + # plot change in specificity to ERK2 due to shuffling + shuffled_pssms = np.array([shuffle_pssm(pssm) for pssm in pssms]) + shuffled_distances = get_pspl_pssm_distances( + pspls, shuffled_pssms, as_df=True, pssm_names=pssm_names, kinases=kinases + ) + + # reformat data for plotting + melt_distances = lambda ds: ds.reset_index(names="Kinase").melt( + id_vars="Kinase", var_name="pssm_name" + ) + distances_melt = pd.concat( + [ + melt_distances(distances).assign(Shuffled=False), + melt_distances(shuffled_distances).assign(Shuffled=True), + ] + ) + sns.stripplot( + data=distances_melt[distances_melt["Kinase"] == "ERK2"], + x="pssm_name", + y="value", + hue="Shuffled", + ax=ax_h, + size=8, + ) + + ax_h.set_xlabel("Cluster") + ax_h.set_ylabel("Frobenius Distance") + ax_h.set_title("ERK2 Shuffled Positions") + ax_h.legend(prop={"size": 10}, loc="lower left") + + # add arrows from original to shuffled + for i, pssm_name in enumerate(pssm_names): + ax_h.arrow( + i, + distances.loc["ERK2", pssm_name] - 0.1, + 0, + shuffled_distances.loc["ERK2", pssm_name] + - distances.loc["ERK2", pssm_name] + + 0.3, + head_width=0.25, + head_length=0.15, + width=0.025, + fc="black", + ec="black", + ) + + +def shuffle_pssm(pssm): + shuffled_pssm = pssm[:, np.random.permutation([0, 1, 2, 3, 4, 6, 7, 8, 9])] + shuffled_pssm = np.insert(shuffled_pssm, 5, pssm[:, 5], axis=1) + return np.insert(shuffled_pssm, 1, pssm[:, -1], axis=1) + + +def plot_fig_3c(model, cluster): """Code to create hierarchical clustering of cluster 1 across treatments""" c1 = pd.DataFrame(model.transform()[:, cluster - 1]) X = pd.read_csv("ddmc/data/Validations/Computational/ebdt_mcf7.csv") @@ -234,3 +323,6 @@ def pos_to_motif(genes, pos, motif_size=5): motif[motif_size] = motif[motif_size].lower() motifs.append("".join(motif)) return motifs, del_GeneToPos + + +makeFigure() diff --git a/ddmc/figures/figureM4.py b/ddmc/figures/figureM4.py index c431159e8..1adba5bc9 100644 --- a/ddmc/figures/figureM4.py +++ b/ddmc/figures/figureM4.py @@ -25,8 +25,6 @@ def makeFigure(): ) d = X.select_dtypes(include=[float]).T - return f # TODO: This code is broken. - # Plot mean AUCs per model p = pd.read_csv("ddmc/data/Performance/preds_phenotypes_rs_15cl.csv").iloc[:, 1:] p = p.melt( diff --git a/ddmc/figures/figureM5.py b/ddmc/figures/figureM5.py index 2b8c637fe..b5e85f069 100644 --- a/ddmc/figures/figureM5.py +++ b/ddmc/figures/figureM5.py @@ -14,7 +14,7 @@ from statsmodels.stats.multitest import multipletests from .common import getSetup, getDDMC_CPTAC from ..logistic_regression import plotClusterCoefficients, plotROC -from .common import plotDistanceToUpstreamKinase, TumorType +from .common import plot_distance_to_upstream_kinase, TumorType from ..pca import plotPCA from ..pre_processing import filter_NaNpeptides @@ -79,7 +79,7 @@ def makeFigure(): ax[5].set_xticklabels(centers.columns[:-1]) # Upstream Kinases - plotDistanceToUpstreamKinase(model, [6, 15, 20], ax[6], num_hits=2) + plot_distance_to_upstream_kinase(model, [6, 15, 20], ax[6], num_hits=2) return f diff --git a/ddmc/figures/figureM6.py b/ddmc/figures/figureM6.py index 0862c6d2d..c761e23f4 100644 --- a/ddmc/figures/figureM6.py +++ b/ddmc/figures/figureM6.py @@ -10,7 +10,7 @@ from statsmodels.stats.multitest import multipletests from bioinfokit import visuz from ..pre_processing import filter_NaNpeptides -from .common import plotDistanceToUpstreamKinase +from .common import plot_distance_to_upstream_kinase from .figureM4 import find_patients_with_NATandTumor from .figureM5 import ( plot_clusters_binaryfeatures, @@ -93,7 +93,7 @@ def makeFigure(): ax[2].legend(loc="lower left", prop={"size": 10}) # plot Upstream Kinases - plotDistanceToUpstreamKinase( + plot_distance_to_upstream_kinase( model, [5, 16, 27], ax[3], num_hits=3, PsP_background=False ) diff --git a/ddmc/figures/figureM7.py b/ddmc/figures/figureM7.py index 8c679fdd8..5826973cc 100644 --- a/ddmc/figures/figureM7.py +++ b/ddmc/figures/figureM7.py @@ -8,7 +8,7 @@ import textwrap from sklearn.linear_model import LogisticRegressionCV from sklearn.preprocessing import StandardScaler -from .common import getSetup, plotDistanceToUpstreamKinase +from .common import getSetup, plot_distance_to_upstream_kinase from .figureM5 import ( build_pval_matrix, calculate_mannW_pvals, @@ -78,7 +78,7 @@ def makeFigure(): ) # plot Upstream Kinases - plotDistanceToUpstreamKinase(model, [17, 20, 21], ax[3], num_hits=3) + plot_distance_to_upstream_kinase(model, [17, 20, 21], ax[3], num_hits=3) return f diff --git a/ddmc/figures/figureMS2.py b/ddmc/figures/figureMS2.py index 42e4c2610..a84b50c87 100644 --- a/ddmc/figures/figureMS2.py +++ b/ddmc/figures/figureMS2.py @@ -4,7 +4,7 @@ import numpy as np from .common import getSetup, getDDMC_CPTAC -from .common import plotMotifs +from .common import plot_motifs def makeFigure(): @@ -15,12 +15,12 @@ def makeFigure(): # Fit DDMC model, _ = getDDMC_CPTAC(n_components=30, SeqWeight=100.0) - pssms, cl_num = model.pssms(PsP_background=False) + pssms, cl_num = model.get_pssms(PsP_background=False) ylabels = np.arange(0, 21, 5) xlabels = [20, 21, 22, 23, 24, 25] for ii, cc in enumerate(cl_num): cluster = "Cluster " + str(cc) - plotMotifs(pssms[ii], ax=ax[ii], titles=cluster, yaxis=[0, 10]) + plot_motifs(pssms[ii], ax=ax[ii], titles=cluster, yaxis=[0, 10]) if ii not in ylabels: ax[ii].set_ylabel("") ax[ii].get_yaxis().set_visible(False) diff --git a/ddmc/figures/figureMS6.py b/ddmc/figures/figureMS6.py index 23deb716a..8ee2b6af2 100644 --- a/ddmc/figures/figureMS6.py +++ b/ddmc/figures/figureMS6.py @@ -5,7 +5,7 @@ import pandas as pd from sklearn.linear_model import LogisticRegressionCV from sklearn.preprocessing import StandardScaler -from .common import plotDistanceToUpstreamKinase, getDDMC_CPTAC +from .common import plot_distance_to_upstream_kinase, getDDMC_CPTAC from .figureM4 import find_patients_with_NATandTumor from .figureM5 import ( plot_clusters_binaryfeatures, @@ -93,7 +93,7 @@ def makeFigure(): ax[2].legend(loc="lower left", prop={"size": 10}) # plot Upstream Kinases - plotDistanceToUpstreamKinase(model, [9, 11, 16, 18], ax[3], num_hits=1) + plot_distance_to_upstream_kinase(model, [9, 11, 16, 18], ax[3], num_hits=1) ax[-1].axis("off") diff --git a/ddmc/motifs.py b/ddmc/motifs.py index eaf1bb2e5..f0b531b7a 100644 --- a/ddmc/motifs.py +++ b/ddmc/motifs.py @@ -235,9 +235,10 @@ def ForegroundSeqs(sequences): return seqs -def PSPLdict(): +def get_pspls() -> tuple[np.ndarray, np.ndarray]: """Generate dictionary with kinase name-specificity profile pairs""" - pspl_dict = {} + pspls_arr = [] + kinases = [] # individual files PSPLs = glob.glob("./ddmc/data/PSPL/*.csv") for sp in PSPLs: @@ -249,7 +250,8 @@ def PSPLdict(): if np.all(sp_mat >= 0): sp_mat = np.log2(sp_mat) - pspl_dict[sp.split("PSPL/")[1].split(".csv")[0]] = sp_mat + kinases.append(sp.split("PSPL/")[1].split(".csv")[0]) + pspls_arr.append(sp_mat.values) # NetPhores PSPL results f = pd.read_csv("ddmc/data/PSPL/pssm_data.csv", header=None) @@ -262,9 +264,10 @@ def PSPLdict(): mat = np.ma.log2(mat) mat = mat.filled(0) mat = np.clip(mat, a_min=0, a_max=3) - pspl_dict[kin] = mat + kinases.append(kin) + pspls_arr.append(mat) - return pspl_dict + return np.array(kinases), np.array(pspls_arr) def compute_control_pssm(bg_sequences) -> np.ndarray: diff --git a/ddmc/pca.py b/ddmc/pca.py index 282fd4cf1..9a72bcde7 100644 --- a/ddmc/pca.py +++ b/ddmc/pca.py @@ -1,4 +1,5 @@ """ PCA functions """ +from typing import List import pandas as pd import numpy as np @@ -17,14 +18,15 @@ def pca_dfs(scores, loadings, df, n_components, sIDX, lIDX): for j in sIDX: dScor[j] = list(df[j]) + # populate the "Cluster" col with the names of the clusters from df dLoad[lIDX] = df.select_dtypes(include=["float64"]).columns return dScor, dLoad def plotPCA( - ax, - d, - n_components, + axes: List, + cluster_centers: pd.DataFrame, + n_components: int, scores_ind, loadings_ind, hue_scores=None, @@ -35,11 +37,11 @@ def plotPCA( quadrants=True, ): """Plot PCA scores and loadings.""" - pp = PCA(n_components=n_components) - dScor_ = pp.fit_transform(d.select_dtypes(include=["float64"])) - dLoad_ = pp.components_ - dScor_, dLoad_ = pca_dfs(dScor_, dLoad_, d, n_components, scores_ind, loadings_ind) - varExp = np.round(pp.explained_variance_ratio_, 2) + pca = PCA(n_components=n_components) + dScor_ = pca.fit_transform(cluster_centers.select_dtypes(include=["float64"])) + dLoad_ = pca.components_ + dScor_, dLoad_ = pca_dfs(dScor_, dLoad_, cluster_centers, n_components, scores_ind, loadings_ind) + varExp = np.round(pca.explained_variance_ratio_, 2) # Scores sns.scatterplot( @@ -48,15 +50,15 @@ def plotPCA( data=dScor_, hue=hue_scores, style=style_scores, - ax=ax[0], + ax=axes[0], **{"linewidth": 0.5, "edgecolor": "k"} ) - ax[0].set_title("PCA Scores") - ax[0].set_xlabel("PC1 (" + str(int(varExp[0] * 100)) + "%)", fontsize=10) - ax[0].set_ylabel("PC2 (" + str(int(varExp[1] * 100)) + "%)", fontsize=10) - ax[0].legend(prop={"size": 8}) + axes[0].set_title("PCA Scores") + axes[0].set_xlabel("PC1 (" + str(int(varExp[0] * 100)) + "%)", fontsize=10) + axes[0].set_ylabel("PC2 (" + str(int(varExp[1] * 100)) + "%)", fontsize=10) + axes[0].legend(prop={"size": 8}) if legendOut: - ax[0].legend( + axes[0].legend( bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0, @@ -73,7 +75,7 @@ def plotPCA( data=dLoad_, hue="p-value", style=style_load, - ax=ax[1], + ax=axes[1], **{"linewidth": 0.5, "edgecolor": "k"} ) else: @@ -82,21 +84,21 @@ def plotPCA( y="PC2", data=dLoad_, style=style_load, - ax=ax[1], + ax=axes[1], **{"linewidth": 0.5, "edgecolor": "k"} ) - ax[1].set_title("PCA Loadings") - ax[1].set_xlabel("PC1 (" + str(int(varExp[0] * 100)) + "%)", fontsize=10) - ax[1].set_ylabel("PC2 (" + str(int(varExp[1] * 100)) + "%)", fontsize=10) - ax[1].legend(prop={"size": 8}) + axes[1].set_title("PCA Loadings") + axes[1].set_xlabel("PC1 (" + str(int(varExp[0] * 100)) + "%)", fontsize=10) + axes[1].set_ylabel("PC2 (" + str(int(varExp[1] * 100)) + "%)", fontsize=10) + axes[1].legend(prop={"size": 8}) for j, txt in enumerate(dLoad_[loadings_ind]): - ax[1].annotate( + axes[1].annotate( txt, (dLoad_["PC1"][j] + 0.001, dLoad_["PC2"][j] + 0.001), fontsize=10 ) if quadrants: - ax[0].axhline(0, ls="--", color="lightgrey") - ax[0].axvline(0, ls="--", color="lightgrey") - ax[1].axhline(0, ls="--", color="lightgrey") - ax[1].axvline(0, ls="--", color="lightgrey") + axes[0].axhline(0, ls="--", color="lightgrey") + axes[0].axvline(0, ls="--", color="lightgrey") + axes[1].axhline(0, ls="--", color="lightgrey") + axes[1].axvline(0, ls="--", color="lightgrey") diff --git a/ddmc/pre_processing.py b/ddmc/pre_processing.py index 8d54e9bb6..631ea2cb6 100644 --- a/ddmc/pre_processing.py +++ b/ddmc/pre_processing.py @@ -120,13 +120,14 @@ def FoldChangeFilterBasedOnMaxFC(X, data_headers, cutoff=0.5): return X.iloc[Xidx, :] -###------------ Filter by variance (stdev or range/pearson's) ------------------### +def separate_sequence_and_abundance(ms_df: pd.DataFrame): + # by default, we assume that ms_df is composed of "Gene", "Sequence", + # "Position", and sample column + sample_cols = [ + col + for col in ms_df.columns + if col not in ("Gene", "Sequence", "Position", "Protein") + ] + return ms_df["Sequence"].copy(), ms_df[sample_cols].copy() + -def MeanCenter(X, mc_row, mc_col): - """Mean centers each row of values. logT also optionally log2-transforms.""" - data_headers = X.select_dtypes(include=["float64"]).columns - if mc_row: - X[data_headers] = X[data_headers].sub(X[data_headers].mean(axis=1), axis=0) - if mc_col: - X[data_headers] = X[data_headers].sub(X[data_headers].mean(axis=0), axis=1) - return X