From 123b3921ac7fd0bd8e14a15960ced2fb27f03004 Mon Sep 17 00:00:00 2001 From: Aaron Meyer <2065146+aarmey@users.noreply.github.com> Date: Sat, 10 Feb 2024 06:25:33 -0800 Subject: [PATCH] Cleanup to get figures building again (#533) * Initial pass * Some fixes * Lots more cleanup * More cleanup * Fix MS7 * clean figureM2 * rename for consistency * stage * class changes * figure 2 and 3 * revert figure 4 * remake figure 4 * remove figure 4 copy * remake figure 5 * remake figure 6 * remake figure 7 * refactor * figure S2 * figure S3 * refactor pca functions * move ebdt to datasets * figure S4, renaming * figure MS5 * figure s7 * delete many things * fix tests * delete unused function, can always restore * add fit() documentation * update README and fix validation * Update build.yml * make figures more like before * small fix * small cv fold fixes * update random states --------- Co-authored-by: Armaan Abraham --- .github/workflows/autopep8.yml | 25 - .github/workflows/build.yml | 2 +- README.md | 46 +- WeightSearch.csv | 3 + ddmc/binomial.py | 152 ++-- ddmc/clustering.py | 309 ++++---- ...otfis.csv => CPTAC-preprocessedMotifs.csv} | 0 ddmc/datasets.py | 188 +++++ ddmc/figures/common.py | 665 +++++------------- ddmc/figures/figureM2.py | 212 +++--- ddmc/figures/figureM3.py | 366 +++++----- ddmc/figures/figureM4.py | 404 ++++------- ddmc/figures/figureM5.py | 297 ++------ ddmc/figures/figureM6.py | 139 +--- ddmc/figures/figureM7.py | 133 +--- ddmc/figures/figureMS2.py | 63 +- ddmc/figures/figureMS3.py | 116 ++- ddmc/figures/figureMS4.py | 164 ++--- ddmc/figures/figureMS5.py | 60 ++ ddmc/figures/figureMS6.py | 127 ---- ddmc/figures/figureMS7.py | 183 +---- ddmc/figures/figureMS8.py | 105 --- ddmc/logistic_regression.py | 71 +- ddmc/motifs.py | 108 +-- ddmc/pam250.py | 34 +- ddmc/pca.py | 511 -------------- ddmc/pre_processing.py | 346 --------- ddmc/tests/test_CoClustering.py | 38 - ddmc/tests/test_DifClusters.py | 30 - ddmc/tests/test_cluster.py | 64 ++ makefile | 4 + poetry.lock | 218 ++++-- 32 files changed, 1687 insertions(+), 3496 deletions(-) delete mode 100644 .github/workflows/autopep8.yml create mode 100644 WeightSearch.csv rename ddmc/data/MS/CPTAC/{CPTAC-preprocessedMotfis.csv => CPTAC-preprocessedMotifs.csv} (100%) create mode 100644 ddmc/datasets.py create mode 100644 ddmc/figures/figureMS5.py delete mode 100644 ddmc/figures/figureMS6.py delete mode 100644 ddmc/figures/figureMS8.py delete mode 100644 ddmc/pca.py delete mode 100644 ddmc/pre_processing.py delete mode 100644 ddmc/tests/test_CoClustering.py delete mode 100644 ddmc/tests/test_DifClusters.py create mode 100644 ddmc/tests/test_cluster.py diff --git a/.github/workflows/autopep8.yml b/.github/workflows/autopep8.yml deleted file mode 100644 index 31b6db25e..000000000 --- a/.github/workflows/autopep8.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: autopep8 - -on: - schedule: - - cron: '0 18 * * *' -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: autopep8 - id: autopep8 - uses: peter-evans/autopep8@v1 - with: - args: --exit-code -raai --max-line-length 200 . - - name: Create Pull Request - if: steps.autopep8.outputs.exit-code == 2 - uses: peter-evans/create-pull-request@v3 - with: - token: ${{ secrets.GITHUB_TOKEN }} - commit-message: autopep8 action fixes - title: Fixes by autopep8 action - body: This is an auto-generated PR with fixes by autopep8. - labels: autopep8, automated pr - branch: create-pull-request/autopep8 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5f09a3898..521f6b6ec 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,7 +10,7 @@ jobs: - name: Install dependencies run: poetry install - name: Build figures - run: make -j 3 all + run: make all - name: Upload files uses: actions/upload-artifact@v4 with: diff --git a/README.md b/README.md index 872a03b7d..4e28805df 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,48 @@ ![Build](https://github.com/meyer-lab/resistance-MS/workflows/Build/badge.svg) ![Test](https://github.com/meyer-lab/resistance-MS/workflows/Test/badge.svg) -Predicting cell phenotype with mass spec data. +Clusters peptides based on both sequence similarity and phosphorylation signal across samples. + + +## Usage + +``` +>>> from ddmc.clustering import DDMC + +>>> # load dataset as p_signal... + +>>> p_signal + Sample 1 Sample 2 Sample 3 Sample 4 Sample 5 +Sequence +AAAAAsQQGSA -3.583614 NaN -0.662659 -1.320029 -0.730832 +AAAAGsASPRS -0.174779 -1.796899 0.891798 -3.092941 2.394315 +AAAAGsGPSPP -1.951552 -2.937095 2.692876 -2.344894 0.556615 +AAAAGsGPsPP 3.666782 NaN -2.081231 0.989394 NaN +AAAAPsPGSAR 1.753855 -2.135835 0.896778 3.369230 2.020967 +... ... ... ... ... ... +YYSPYsVSGSG -3.502871 2.831169 3.383486 2.589559 3.624968 +YYSSRsQSGGY -0.870365 0.887317 2.600291 -0.374107 3.285459 +YYTAGyNSPVK 0.249539 2.047050 -0.286033 0.042650 2.863317 +YYTSAsGDEMV 0.662787 0.135326 -1.004350 0.879398 -1.609894 +YYYSSsEDEDS NaN -1.101679 -3.273987 -0.872370 -1.735891 + +>>> p_signal.index # p_signal.index contains the peptide sequences +Index(['AAAAAsQQGSA', 'AAAAGsASPRS', 'AAAAGsGPSPP', 'AAAAGsGPsPP', + 'AAAAPsPGSAR', 'AAAAPsPGsAR', 'AAAARsLLNHT', 'AAAARsPDRNL', + 'AAAARtQAPPT', 'AAADFsDEDED', + ... + 'YYDRMySYPAR', 'YYEDDsEGEDI', 'YYGGGsEGGRA', 'YYRNNsFTAPS', + 'YYSPDyGLPSP', 'YYSPYsVSGSG', 'YYSSRsQSGGY', 'YYTAGyNSPVK', + 'YYTSAsGDEMV', 'YYYSSsEDEDS'], + dtype='object', name='Sequence', length=30561) + +>>> model = DDMC(n_components=2, seq_weight=100).fit(p_signal) # fit model + +>>> model.transform(as_df=True) # get cluster centers + 0 1 +Sample 1 0.017644 0.370375 +Sample 2 -0.003625 -0.914869 +Sample 3 -0.087624 -0.682140 +Sample 4 0.014644 -0.658907 +Sample 5 0.023885 0.196063 +``` \ No newline at end of file diff --git a/WeightSearch.csv b/WeightSearch.csv new file mode 100644 index 000000000..4605e1098 --- /dev/null +++ b/WeightSearch.csv @@ -0,0 +1,3 @@ +,Weight,DDMC,Average,Zero,PCA +0,0.0,,,, +1,100.0,,,, diff --git a/ddmc/binomial.py b/ddmc/binomial.py index 6a00546c0..7857b5e61 100644 --- a/ddmc/binomial.py +++ b/ddmc/binomial.py @@ -5,55 +5,34 @@ import pandas as pd import scipy.special as sc from Bio import motifs -from Bio.Seq import Seq +from collections import OrderedDict # Binomial method inspired by Schwartz & Gygi's Nature Biotech 2005: doi:10.1038/nbt1146 # Amino acids frequencies (http://www.tiem.utk.edu/~gross/bioed/webmodules/aminoacid.htm) used for pseudocounts, -AAfreq = { - "A": 0.074, - "R": 0.042, - "N": 0.044, - "D": 0.059, - "C": 0.033, - "Q": 0.058, - "E": 0.037, - "G": 0.074, - "H": 0.029, - "I": 0.038, - "L": 0.076, - "K": 0.072, - "M": 0.018, - "F": 0.04, - "P": 0.05, - "S": 0.081, - "T": 0.062, - "W": 0.013, - "Y": 0.033, - "V": 0.068, -} -AAlist = [ - "A", - "C", - "D", - "E", - "F", - "G", - "H", - "I", - "K", - "L", - "M", - "N", - "P", - "Q", - "R", - "S", - "T", - "V", - "W", - "Y", -] +AAfreq: OrderedDict[str, float] = OrderedDict() +AAfreq["A"] = 0.074 +AAfreq["R"] = 0.042 +AAfreq["N"] = 0.044 +AAfreq["D"] = 0.059 +AAfreq["C"] = 0.033 +AAfreq["Q"] = 0.058 +AAfreq["E"] = 0.037 +AAfreq["G"] = 0.074 +AAfreq["H"] = 0.029 +AAfreq["I"] = 0.038 +AAfreq["L"] = 0.076 +AAfreq["K"] = 0.072 +AAfreq["M"] = 0.018 +AAfreq["F"] = 0.040 +AAfreq["P"] = 0.050 +AAfreq["S"] = 0.081 +AAfreq["T"] = 0.062 +AAfreq["W"] = 0.013 +AAfreq["Y"] = 0.033 +AAfreq["V"] = 0.068 + +AAlist = list(AAfreq.keys()) def position_weight_matrix(seqs, pseudoC=AAfreq): @@ -61,20 +40,12 @@ def position_weight_matrix(seqs, pseudoC=AAfreq): return frequencies(seqs).normalize(pseudocounts=pseudoC) -def frequencies(seqs): +def frequencies(seqs: list[str]): """Build counts matrix of a given set of sequences.""" - return motifs.create(seqs, alphabet=AAlist).counts + return motifs.create(seqs, alphabet="".join(AAlist)).counts -def InformationContent(seqs): - """The mean of the PSSM is particularly important becuase its value is equal to the - Kullback-Leibler divergence or relative entropy, and is a measure for the information content - of the motif compared to the background.""" - pssm = position_weight_matrix(seqs).log_odds(AAfreq) - return pssm.mean(AAfreq) - - -def GenerateBinarySeqID(seqs): +def GenerateBinarySeqID(seqs: list[str]) -> np.ndarray: """Build matrix with 0s and 1s to identify residue/position pairs for every sequence""" res = np.zeros((len(seqs), len(AAlist), 11), dtype=bool) for ii, seq in enumerate(seqs): @@ -83,7 +54,7 @@ def GenerateBinarySeqID(seqs): return res -def BackgroundSeqs(forseqs): +def BackgroundSeqs(forseqs: np.ndarray[str]) -> list[str]: """Build Background data set with the same proportion of pY, pT, and pS motifs as in the foreground set of sequences. Note this PsP data set contains 51976 pY, 226131 pS, 81321 pT Source: https://www.phosphosite.org/staticDownloads.action - @@ -91,7 +62,7 @@ def BackgroundSeqs(forseqs): Cite: Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926""" # Get porportion of psite types in foreground set - forw_pYn, forw_pSn, forw_pTn, _ = CountPsiteTypes(forseqs, 5) + forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs) forw_tot = forw_pYn + forw_pSn + forw_pTn pYf = forw_pYn / forw_tot @@ -106,7 +77,7 @@ def BackgroundSeqs(forseqs): PsP = PsP[~PsP["SITE_+/-7_AA"].str.contains("X")] refseqs = list(PsP["SITE_+/-7_AA"]) len_bg = int(len(refseqs)) - backg_pYn, _, _, _ = CountPsiteTypes(refseqs, 7) + backg_pYn, _, _ = CountPsiteTypes(refseqs) # Make sure there are enough pY peptides to meet proportions if backg_pYn >= len_bg * pYf: @@ -126,9 +97,12 @@ def BackgroundSeqs(forseqs): return bg_seqs -def BackgProportions(refseqs, pYn, pSn, pTn): +def BackgProportions(refseqs: list[str], pYn: int, pSn: int, pTn: int) -> list[str]: """Provided the proportions, add peptides to background set.""" - y_seqs, s_seqs, t_seqs = [], [], [] + y_seqs: list[str] = [] + s_seqs: list[str] = [] + t_seqs: list[str] = [] + pR = ["y", "t", "s"] for seq in refseqs: if seq[7] not in pR: @@ -144,50 +118,56 @@ def BackgProportions(refseqs, pYn, pSn, pTn): ), "Wrong central AA in background set. Sliced: %s, Full: %s" % (motif, seq) if motif[5] == "Y" and len(y_seqs) < pYn: - y_seqs.append(Seq(motif)) + y_seqs.append(motif) if motif[5] == "S" and len(s_seqs) < pSn: - s_seqs.append(Seq(motif)) + s_seqs.append(motif) if motif[5] == "T" and len(t_seqs) < pTn: - t_seqs.append(Seq(motif)) + t_seqs.append(motif) return y_seqs + s_seqs + t_seqs class Binomial: - """Create a binomial distance distribution.""" + """Definition of the binomial sequence distance distribution.""" - def __init__(self, seq, seqs): + def __init__(self, seqs: np.ndarray[str]): # Background sequences - background = position_weight_matrix(BackgroundSeqs(seq)) - self.background = ( - np.array([background[AA] for AA in AAlist]), - GenerateBinarySeqID(seqs), - ) + background = position_weight_matrix(BackgroundSeqs(seqs)) + self.background = np.array([background[AA] for AA in AAlist]) + self.foreground: np.ndarray = GenerateBinarySeqID(seqs) self.logWeights = 0.0 - assert np.all(np.isfinite(self.background[0])) - assert np.all(np.isfinite(self.background[1])) + assert np.all(np.isfinite(self.background)) + assert np.all(np.isfinite(self.foreground)) - def from_summaries(self, weightsIn): + def from_summaries(self, weightsIn: np.ndarray): """Update the underlying distribution.""" - k = np.einsum("kji,kl->lji", self.background[1], weightsIn) + k = np.einsum("kji,kl->lji", self.foreground, weightsIn) betaA = np.sum(weightsIn, axis=0)[:, None, None] - k betaA = np.clip(betaA, 0.001, np.inf) - probmat = sc.betainc(betaA, k + 1, 1 - self.background[0]) - tempp = np.einsum("ijk,ljk->il", self.background[1], probmat) + probmat = sc.betainc(betaA, k + 1, 1 - self.background) + tempp = np.einsum("ijk,ljk->il", self.foreground, probmat) self.logWeights = np.log(tempp) -def CountPsiteTypes(X, cA): - """Count number of different phosphorylation types in a MS data set.""" - positionSeq = [seq[cA] for seq in X] - pS = positionSeq.count("s") - pT = positionSeq.count("t") - pY = positionSeq.count("y") +def CountPsiteTypes(X: np.ndarray[str]) -> tuple[int, int, int]: + """Count the number of different phosphorylation types in an MS data set. + + Args: + X (list[str]): The list of peptide sequences. + + Returns: + tuple[int, int, int]: The number of pY, pS, and pT sites. + """ + X = np.char.upper(X) - countt = [sum(map(str.islower, seq)) for seq in X] - primed = sum(map(lambda i: i > 1, countt)) + # Find the center amino acid + cA = int((len(X[0]) - 1) / 2) - return pY, pS, pT, primed + phospho_aminos = [seq[cA] for seq in X] + pS = phospho_aminos.count("S") + pT = phospho_aminos.count("T") + pY = phospho_aminos.count("Y") + return pY, pS, pT diff --git a/ddmc/clustering.py b/ddmc/clustering.py index 9d78fd971..98eb87dae 100644 --- a/ddmc/clustering.py +++ b/ddmc/clustering.py @@ -1,6 +1,6 @@ """ Clustering functions. """ -from typing import Literal +from typing import Literal, List, Sequence, Tuple import warnings from copy import deepcopy import itertools @@ -10,59 +10,57 @@ from sklearn.utils.validation import check_is_fitted from .binomial import Binomial, AAlist, BackgroundSeqs, frequencies from .pam250 import PAM250 -from .motifs import PSPLdict, compute_control_pssm +from .motifs import get_pspls, compute_control_pssm from fancyimpute import SoftImpute -# pylint: disable=W0201 - - class DDMC(GaussianMixture): - """Cluster peptides by both sequence similarity and data behavior following an - expectation-maximization algorithm. SeqWeight specifies which method's expectation step - should have a larger effect on the peptide assignment.""" + """Cluster peptides by both sequence similarity and condition-wise phosphorylation following an + expectation-maximization algorithm.""" def __init__( self, - info: pd.DataFrame, n_components: int, - SeqWeight: float, - distance_method: Literal["PAM250", "Binomial"], + seq_weight: float, + distance_method: Literal["PAM250", "Binomial"] = "Binomial", random_state=None, + max_iter=200, + tol=1e-4, ): super().__init__( n_components=n_components, covariance_type="diag", n_init=2, - max_iter=200, - tol=1e-4, + max_iter=max_iter, + tol=tol, random_state=random_state, ) - - self.info = info - self.SeqWeight = SeqWeight self.distance_method = distance_method + self.seq_weight = seq_weight - seqs = [s.upper() for s in info["Sequence"]] - + def _gen_peptide_distances(self, sequences: np.ndarray, distance_method): + if sequences.dtype != str: + sequences = sequences.astype("str") + sequences = np.char.upper(sequences) + self.sequences = sequences if distance_method == "PAM250": - self.seqDist: PAM250 | Binomial = PAM250(seqs) + self.seq_dist: PAM250 | Binomial = PAM250(sequences) elif distance_method == "Binomial": - self.seqDist = Binomial(info["Sequence"], seqs) + self.seq_dist = Binomial(sequences) else: raise ValueError("Wrong distance type.") - def _estimate_log_prob(self, X): + def _estimate_log_prob(self, X: np.ndarray): """Estimate the log-probability of each point in each cluster.""" logp = super()._estimate_log_prob(X) # Do the regular work # Add in the sequence effect - self.seq_scores_ = self.SeqWeight * self.seqDist.logWeights + self.seq_scores_ = self.seq_weight * self.seq_dist.logWeights logp += self.seq_scores_ return logp - def _m_step(self, X, log_resp): + def _m_step(self, X: np.ndarray, log_resp: np.ndarray): """M step. Parameters ---------- @@ -73,105 +71,132 @@ def _m_step(self, X, log_resp): """ if self._missing: labels = np.argmax(log_resp, axis=1) - centers = self.means_.T # samples x clusters + centers = np.array(self.means_) # samples x clusters + centers_fill = centers[labels, :] - assert len(labels) == X.shape[0] - for ii in range(X.shape[0]): # X is peptides x samples - X[ii, self.missing_d[ii, :]] = centers[ - self.missing_d[ii, :], labels[ii] - ] + assert centers_fill.shape == X.shape + X[self.missing_d] = centers_fill[self.missing_d] super()._m_step(X, log_resp) # Do the regular m step # Do sequence m step - self.seqDist.from_summaries(np.exp(log_resp)) + self.seq_dist.from_summaries(np.exp(log_resp)) - def fit(self, X, y=None): - """Compute EM clustering""" - d = np.array(X.T) + def fit(self, p_signal: pd.DataFrame): + """ + Compute EM clustering. - if np.any(np.isnan(d)): + Args: + p_signal: Dataframe of shape (number of peptides, number of samples) + containing the phosphorylation signal. `p_signal.index` contains + the length-11 AA sequence of each peptide, containing the + phosphoacceptor in the middle and five AAs flanking it. + """ + assert isinstance( + p_signal, pd.DataFrame + ), "`p_signal` must be a pandas dataframe." + sequences = p_signal.index.values + assert ( + isinstance(sequences[0], str) and len(sequences[0]) == 11 + ), "The index of p_signal must be the peptide sequences of length 11" + assert all( + [token.upper() in AAlist for token in sequences[0]] + ), "Sequence(s) contain invalid characters" + assert ( + p_signal.select_dtypes(include=[np.number]).shape[1] == p_signal.shape[1] + ), "All values in `p_signal` should be numerical" + + self.p_signal = p_signal + self._gen_peptide_distances(sequences, self.distance_method) + + if np.any(np.isnan(p_signal)): self._missing = True - self.missing_d = np.isnan(d) + self.missing_d = np.isnan(p_signal) with warnings.catch_warnings(): warnings.simplefilter("ignore") - d = SoftImpute(verbose=False).fit_transform(d) + p_signal = SoftImpute(verbose=False).fit_transform(p_signal) else: self._missing = False - super().fit(d) - self.scores_ = self.predict_proba(d) + super().fit(p_signal) + self.scores_ = self.predict_proba(p_signal) assert np.all(np.isfinite(self.scores_)) assert np.all(np.isfinite(self.seq_scores_)) return self - def wins(self, X): - """Find similarity of fitted model to data and sequence models""" - check_is_fitted(self, ["scores_", "seq_scores_"]) - - alt_model = deepcopy(self) - alt_model.SeqWeight = 0.0 # No influence - alt_model.fit(X) - data_model = alt_model.scores_ - - alt_model.SeqWeight = 1000.0 # Overwhelming influence - alt_model.fit(X) - seq_model = alt_model.scores_ - - dataDist = np.linalg.norm(self.scores_ - data_model) - seqDist = np.linalg.norm(self.scores_ - seq_model) - - for i in itertools.permutations(np.arange(self.n_components)): - dataDistTemp = np.linalg.norm(self.scores_ - data_model[:, i]) - seqDistTemp = np.linalg.norm(self.scores_ - seq_model[:, i]) - - dataDist = np.minimum(dataDist, dataDistTemp) - seqDist = np.minimum(seqDist, seqDistTemp) + def transform(self, as_df=False) -> np.ndarray | pd.DataFrame: + """ + Return cluster centers. - return (dataDist, seqDist) + Args: + as_df: Whether or not the result should be wrapped in a dataframe with labeled axes. - def transform(self): - """Calculate cluster averages.""" + Returns: + The cluster centers, either a np array or pd df of shape (n_samples, n_components). + """ check_is_fitted(self, ["means_"]) - return self.means_.T - - def impute(self, X): - """Impute a matching dataset.""" - X = X.copy() + centers = self.means_.T + if as_df: + centers = pd.DataFrame( + centers, + index=self.p_signal.columns, + columns=np.arange(self.n_components), + ) + return centers + + def impute(self) -> pd.DataFrame: + """ + Imputes missing values in the dataset passed in fit() and returns the + imputed dataset. + """ + p_signal = self.p_signal.copy() labels = self.labels() # cluster assignments centers = self.transform() # samples x clusters - - assert len(labels) == X.shape[0] - for ii in range(X.shape[0]): # X is peptides x samples - X[ii, np.isnan(X[ii, :])] = centers[np.isnan(X[ii, :]), labels[ii] - 1] - - assert np.all(np.isfinite(X)) - return X - - def pssms(self, PsP_background=False): - """Compute position-specific scoring matrix of each cluster. + for ii in range(p_signal.shape[0]): + p_signal.iloc[ii, np.isnan(p_signal.iloc[ii, :])] = centers[ + np.isnan(p_signal.iloc[ii, :]), labels[ii] - 1 + ] + assert np.all(np.isfinite(p_signal)) + return p_signal + + def get_pssms( + self, PsP_background=False, clusters: List[int] = None + ) -> Tuple[np.ndarray, np.ndarray] | np.ndarray: + """ + Compute position-specific scoring matrix of each cluster. Note, to normalize by amino acid frequency this uses either all the sequences in the data set or a collection of random MS phosphosites in PhosphoSitePlus. + + Args: + PsP_background: Whether or not PhosphoSitePlus should be used for background frequency. + clusters: cluster indices to get pssms for + + Returns: + If the clusters argument is used, an array of shape (len(clusters), 20, 11), + else two arrays, where the first (of shape (n_pssms,)) + contains the clusters of the pssms in the second + (of shape (n_pssms, 20, 11)). """ - pssms, cl_num = [], [] + pssm_names, pssms = [], [] if PsP_background: - bg_seqs = BackgroundSeqs(self.info["Sequence"]) + bg_seqs = BackgroundSeqs(self.sequences) back_pssm = compute_control_pssm(bg_seqs) else: back_pssm = np.zeros((len(AAlist), 11), dtype=float) - for ii in range(1, self.n_components + 1): + + l1 = list(np.arange(self.n_components)) + l2 = list(set(self.labels())) + ec = [i for i in l1 + l2 if i not in l1 or i not in l2] + for ii in range(self.n_components): # Check for empty clusters and ignore them, if there are - l1 = list(np.arange(self.n_components) + 1) - l2 = list(set(self.labels())) - ec = [i for i in l1 + l2 if i not in l1 or i not in l2] if ii in ec: continue # Compute PSSM pssm = np.zeros((len(AAlist), 11), dtype=float) - for jj, seq in enumerate(self.info["Sequence"]): + for jj, seq in enumerate(self.sequences): seq = seq.upper() for kk, aa in enumerate(seq): pssm[AAlist.index(aa), kk] += self.scores_[jj, ii - 1] @@ -187,8 +212,9 @@ def pssms(self, PsP_background=False): back_pssm[:, pos] /= np.mean(back_pssm[:, pos]) # Normalize to background PSSM to account for AA frequencies per position - with np.seterr(divide='ignore', invalid='ignore'): - pssm /= back_pssm.copy() + old_settings = np.seterr(divide="ignore", invalid="ignore") + pssm /= back_pssm.copy() + np.seterr(**old_settings) # Log2 transform pssm = np.ma.log2(pssm) @@ -198,7 +224,7 @@ def pssms(self, PsP_background=False): pssm.index = AAlist # Normalize phosphoacceptor position to frequency - df = pd.DataFrame(self.info["Sequence"].str.upper()) + df = pd.DataFrame({"Sequence": self.sequences}) df["Cluster"] = self.labels() clSeq = df[df["Cluster"] == ii]["Sequence"] clSeq = pd.DataFrame(frequencies(clSeq)).T @@ -207,53 +233,84 @@ def pssms(self, PsP_background=False): pssm.loc[p_site, 5] = np.log2(clSeq.loc[p_site, 5] / tm) pssms.append(np.clip(pssm, a_min=0, a_max=3)) - cl_num.append(ii) + pssm_names.append(ii) - return pssms, cl_num + pssm_names, pssms = np.array(pssm_names), np.array(pssms) - def predict_UpstreamKinases( - self, additional_pssms=False, add_labels=False, PsP_background=True - ): - """Compute matrix-matrix similarity between kinase specificity profiles and cluster PSSMs to identify upstream kinases regulating clusters.""" - PSPLs = PSPLdict() - PSSMs, cl_num = self.pssms(PsP_background=PsP_background) - - # Optionally add external pssms - if not isinstance(additional_pssms, bool): - PSSMs += additional_pssms - cl_num += add_labels - PSSMs = [ - np.delete(np.array(list(np.array(mat))), [5, 10], axis=1) for mat in PSSMs - ] # Remove P0 and P+5 from pssms - - a = np.zeros((len(PSPLs), len(PSSMs))) - for ii, spec_profile in enumerate(PSPLs.values()): - for jj, pssm in enumerate(PSSMs): - a[ii, jj] = np.linalg.norm(pssm - spec_profile) - - table = pd.DataFrame(a) - table.columns = cl_num - table.insert(0, "Kinase", list(PSPLdict().keys())) - return table - - def predict(self): + if clusters is not None: + return pssms[ + [np.where(pssm_names == cluster)[0][0] for cluster in clusters] + ] + + return pssm_names, pssms + + def predict_upstream_kinases( + self, + PsP_background=True, + ) -> np.ndarray: + """Compute matrix-matrix similarity between kinase specificity profiles + and cluster PSSMs to identify upstream kinases regulating clusters.""" + kinases, pspls = get_pspls() + clusters, pssms = self.get_pssms(PsP_background=PsP_background) + distances = get_pspl_pssm_distances( + pspls, + pssms, + as_df=True, + pssm_names=clusters, + kinases=kinases, + ) + return distances + + def get_nonempty_clusters(self) -> np.ndarray[int]: + return np.unique(self.labels()) + + def has_empty_clusters(self) -> bool: + """ + Checks whether the most recent call to fit() resulted in empty clusters. + """ + check_is_fitted(self, ["scores_"]) + return self.get_nonempty_clusters().size != self.n_components + + def predict(self) -> np.ndarray[int]: """Provided the current model parameters, predict the cluster each peptide belongs to.""" check_is_fitted(self, ["scores_"]) return np.argmax(self.scores_, axis=1) - def labels(self): + def labels(self) -> np.ndarray[int]: """Find cluster assignment with highest likelihood for each peptide.""" - return self.predict() + 1 + return self.predict() - def score(self): + def score(self) -> float: """Generate score of the fitting.""" check_is_fitted(self, ["lower_bound_"]) return self.lower_bound_ - def get_params(self, deep=True): - """Returns a dict of the estimator parameters with their values.""" - dictt = super().get_params(deep=deep) - dictt["info"] = self.info - dictt["SeqWeight"] = self.SeqWeight - dictt["distance_method"] = self.distance_method - return dictt + +def get_pspl_pssm_distances( + pspls: np.ndarray, + pssms: np.ndarray, + as_df=False, + pssm_names: Sequence[str] = None, + kinases: Sequence[str] = None, +) -> np.ndarray | pd.DataFrame: + """ + Computes a distance matrix between PSPLs and PSSMs. + + Args: + pspls: kinase specificity profiles of shape (n_kinase, 20, 9) + pssms: position-specific scoring matrices of shape (n_pssms, 20, 11) + as_df: Whether or not the returned matrix should be returned as a + dataframe. Requires pssm_names and kinases. + pssm_names: list of names for the pssms of shape (n_pssms,) + kinases: list of names for the pspls of shape (n_kinase,) + + Returns: + Distance matrix of shape (n_kinase, n_pssms). + """ + assert pssms.shape[1:3] == (20, 11) + assert pspls.shape[1:3] == (20, 9) + pssms = np.delete(pssms, [5, 10], axis=2) + dists = np.linalg.norm(pspls[:, None, :, :] - pssms[None, :, :, :], axis=(2, 3)) + if as_df: + dists = pd.DataFrame(dists, index=kinases, columns=pssm_names) + return dists diff --git a/ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv b/ddmc/data/MS/CPTAC/CPTAC-preprocessedMotifs.csv similarity index 100% rename from ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv rename to ddmc/data/MS/CPTAC/CPTAC-preprocessedMotifs.csv diff --git a/ddmc/datasets.py b/ddmc/datasets.py new file mode 100644 index 000000000..b6c3e2463 --- /dev/null +++ b/ddmc/datasets.py @@ -0,0 +1,188 @@ +import re +from pathlib import Path + +import numpy as np +import pandas as pd +from typing import Sequence + +from ddmc.motifs import get_proteome_name_to_seq + +DATA_DIR = Path(__file__).parent / "data" + + +def filter_incomplete_peptides( + p_signal: pd.DataFrame, + sample_presence_ratio: float = None, + min_experiments: int = None, + sample_to_experiment: np.ndarray = None, +): + """ + Filters out missing values from p-signal array. + + Args: + sample_presence_ratio: the minimum fraction of non-missing values + allowed for a peptide before it is DROPPED. + min_experiments: the minimum number of experiments allowed for a peptide + before it is DROPPED. Must also pass in sample_to_experiment. + sample_to_experiment: array of shape `len(p_signal.columns)` that maps + each sample to an experiment (any identifier). + + Returns: + Filtered data. + """ + # assume that X has sequences as the index and samples as columns + if sample_presence_ratio is not None: + peptide_idx = ( + np.count_nonzero(~np.isnan(p_signal), axis=1) / p_signal.shape[1] + >= sample_presence_ratio + ) + elif min_experiments is not None: + assert min_experiments is not None + assert sample_to_experiment is not None + # this is kind of confusing because of the use of numpy, but we're + # removing rows that have less than the minimum number of experiments + unique_experiments = np.unique(sample_to_experiment) + experiments_grid, s_to_e_grid = np.meshgrid( + unique_experiments, sample_to_experiment, indexing="ij" + ) + bool_matrix = experiments_grid == s_to_e_grid + present = ~np.isnan(p_signal.values) + peptide_idx = (present[None, :, :] & bool_matrix[:, None, :]).any(axis=2).sum( + axis=0 + ) >= min_experiments + else: + raise ValueError( + "Must specify either a sample presence or n_experiments threshold" + ) + return p_signal.iloc[peptide_idx] + + +def select_peptide_subset( + p_signal: pd.DataFrame, keep_ratio: float = None, keep_num: int = None +): + """ + Selects a random subset of peptides from p_signal. + """ + if keep_ratio is not None: + keep_num = int(p_signal.shape[0] * keep_ratio) + return p_signal.iloc[np.random.choice(p_signal.shape[0], keep_num)] + + +class CPTAC: + data_dir = DATA_DIR / "MS" / "CPTAC" + + def get_sample_to_experiment(self, as_df=False): + sample_to_experiment = pd.read_csv(self.data_dir / "IDtoExperiment.csv") + if as_df: + return sample_to_experiment + return sample_to_experiment.iloc[:, 1].values + + def get_p_signal(self, min_experiments=2) -> pd.DataFrame: + p_signal = pd.read_csv(self.data_dir / "CPTAC-preprocessedMotifs.csv").iloc[ + :, 1: + ] + p_signal = p_signal.set_index("Sequence") + p_signal = p_signal.drop(columns=["Protein", "Gene", "Position"]) + return filter_incomplete_peptides( + p_signal, + min_experiments=min_experiments, + sample_to_experiment=self.get_sample_to_experiment(), + ) + + def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]) -> np.ndarray: + """ + Get patients that have both NAT and tumor samples. + """ + samples = samples.astype(str) + samples = samples[np.char.find(samples, "IR") == -1] + tumor_samples = np.sort(samples[~np.char.endswith(samples, ".N")]) + nat_samples = np.sort(samples[np.char.endswith(samples, ".N")]) + tumor_patients = tumor_samples + nat_patients = np.char.replace(nat_samples, ".N", "") + return np.intersect1d(tumor_patients, nat_patients) + + def get_mutations(self, mutation_names: Sequence[str] = None) -> pd.DataFrame: + mutations = pd.read_csv(self.data_dir / "Patient_Mutations.csv") + mutations = mutations.set_index("Sample.ID") + patients = self.get_patients_with_nat_and_tumor(mutations.index.values) + mutations = mutations.loc[patients] + if mutation_names is not None: + mutations = mutations[mutation_names] + return mutations.astype(bool) + + def get_hot_cold_labels(self) -> pd.Series: + hot_cold = ( + pd.read_csv(self.data_dir / "Hot_Cold.csv") + .dropna(axis=1) + .sort_values(by="Sample ID") + .set_index("Sample ID") + )["Group"] + hot_cold = hot_cold[~hot_cold.index.str.endswith(".N")] + hot_cold = hot_cold[hot_cold != "NAT enriched"] + hot_cold = hot_cold.replace("Cold-tumor enriched", 0) + hot_cold = hot_cold.replace("Hot-tumor enriched", 1) + hot_cold = hot_cold.dropna() + return np.squeeze(hot_cold).astype(bool) + + def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]: + """ + Get tumor vs NAT for each of samples. Returned array contains True if + tumor. + """ + return ~np.array([sample.endswith(".N") for sample in samples]) + + +# MCF7 mass spec data set from EBDT (Hijazi et al Nat Biotech 2020) +class EBDT: + def get_p_signal(self) -> pd.DataFrame: + p_signal = ( + pd.read_csv(DATA_DIR / "Validations" / "Computational" / "ebdt_mcf7.csv") + .drop("FDR", axis=1) + .set_index("sh.index.sites") + .drop("ARPC2_HUMAN;") + .reset_index() + ) + p_signal.insert( + 0, "Gene", [s.split("(")[0] for s in p_signal["sh.index.sites"]] + ) + p_signal.insert( + 1, + "Position", + [ + re.search(r"\(([A-Za-z0-9]+)\)", s).group(1) + for s in p_signal["sh.index.sites"] + ], + ) + p_signal = p_signal.drop("sh.index.sites", axis=1) + motifs, del_ids = self.pos_to_motif(p_signal["Gene"], p_signal["Position"]) + p_signal = p_signal.set_index(["Gene", "Position"]).drop(del_ids).reset_index() + p_signal.insert(0, "Sequence", motifs) + p_signal = p_signal.drop(columns=["Gene", "Position"]) + p_signal = p_signal.set_index("Sequence") + return p_signal + + def pos_to_motif(self, genes, pos): + """Map p-site sequence position to uniprot's proteome and extract motifs.""" + proteome = open(DATA_DIR / "Sequence_analysis" / "proteome_uniprot2019.fa", "r") + motif_size = 5 + ProteomeDict = get_proteome_name_to_seq(proteome, n="gene") + motifs = [] + del_GeneToPos = [] + for gene, pos in list(zip(genes, pos)): + try: + UP_seq = ProteomeDict[gene] + except BaseException: + del_GeneToPos.append([gene, pos]) + continue + idx = int(pos[1:]) - 1 + motif = list(UP_seq[max(0, idx - motif_size) : idx + motif_size + 1]) + if ( + len(motif) != motif_size * 2 + 1 + or pos[0] != motif[motif_size] + or pos[0] not in ["S", "T", "Y"] + ): + del_GeneToPos.append([gene, pos]) + continue + motif[motif_size] = motif[motif_size].lower() + motifs.append("".join(motif)) + return motifs, del_GeneToPos diff --git a/ddmc/figures/common.py b/ddmc/figures/common.py index 9a9a60b5a..068bbdd2a 100644 --- a/ddmc/figures/common.py +++ b/ddmc/figures/common.py @@ -1,31 +1,32 @@ """ This file contains functions that are used in multiple figures. """ + import sys import time from string import ascii_uppercase -from matplotlib import gridspec, pyplot as plt +from matplotlib import gridspec, pyplot as plt, axes, rcParams import seaborn as sns -import svgutils.transform as st import numpy as np import pandas as pd -import seaborn as sns -import mygene -from matplotlib import gridspec, pyplot as plt -from string import ascii_uppercase import svgutils.transform as st import logomaker as lm -from sklearn.preprocessing import StandardScaler -from sklearn.cluster import KMeans -from ..clustering import DDMC from scipy.stats import mannwhitneyu from statsmodels.stats.multitest import multipletests -from ..pre_processing import MeanCenter from ..motifs import KinToPhosphotypeDict -from ..binomial import AAlist +from ddmc.binomial import AAlist +from sklearn.decomposition import PCA + + +rcParams["font.sans-serif"] = "Arial" -def getSetup(figsize, gridd, multz=None): +def getSetup( + figsize: tuple[int, int], + gridd: tuple[int, int], + multz: None | dict = None, + labels=True, +) -> tuple: """Establish figure set-up with subplots.""" sns.set( style="whitegrid", @@ -40,7 +41,7 @@ def getSetup(figsize, gridd, multz=None): # Setup plotting space and grid f = plt.figure(figsize=figsize, constrained_layout=True) - gs1 = gridspec.GridSpec(*gridd, figure=f) + gs1 = gridspec.GridSpec(gridd[0], gridd[1], figure=f) # Get list of axis objects x = 0 @@ -53,10 +54,13 @@ def getSetup(figsize, gridd, multz=None): x += multz[x] x += 1 + if labels: + subplotLabel(ax) + return (ax, f) -def subplotLabel(axs): +def subplotLabel(axs: list[axes.Axes]): """Place subplot labels on the list of axes.""" for ii, ax in enumerate(axs): ax.text( @@ -71,7 +75,7 @@ def subplotLabel(axs): def overlayCartoon( - figFile, cartoonFile, x, y, scalee=1, scale_x=1, scale_y=1, rotate=None + figFile: str, cartoonFile: str, x: float, y: float, scalee: float = 1.0 ): """Add cartoon to a figure file.""" @@ -79,9 +83,7 @@ def overlayCartoon( template = st.fromfile(figFile) cartoon = st.fromfile(cartoonFile).getroot() - cartoon.moveto(x, y, scale_x=scalee * scale_x, scale_y=scalee * scale_y) - if rotate: - cartoon.rotate(rotate, x, y) + cartoon.moveto(x, y, scale_x=scalee, scale_y=scalee) # type: ignore template.append(cartoon) template.save(figFile) @@ -123,521 +125,186 @@ def genFigure(): print(f"Figure {sys.argv[1]} is done after {time.time() - start} seconds.\n") -def ComputeCenters(X, d, i, ddmc, ncl): - """Calculate cluster centers of different algorithms.""" - # k-means - labels = KMeans(n_clusters=ncl).fit(d.T).labels_ - x_ = X.copy() - x_["Cluster"] = labels - c_kmeans = x_.groupby("Cluster").mean().T - - # GMM - ddmc_data = DDMC( - i, - ncl=ncl, - SeqWeight=0, - distance_method=ddmc.distance_method, - random_state=ddmc.random_state, - ).fit(d) - c_gmm = ddmc_data.transform() - - # DDMC seq - ddmc_seq = DDMC( - i, - ncl=ncl, - SeqWeight=ddmc.SeqWeight + 20, - distance_method=ddmc.distance_method, - random_state=ddmc.random_state, - ).fit(d) - ddmc_seq_c = ddmc_seq.transform() - - # DDMC mix - ddmc_c = ddmc.transform() - return [c_kmeans, c_gmm, ddmc_seq_c, ddmc_c], [ - "Unclustered", - "k-means", - "GMM", - "DDMC seq", - "DDMC mix", - ] - - -def plotCenters(ax, model, xlabels, yaxis=False, drop=False): - centers = pd.DataFrame(model.transform()).T - centers.columns = xlabels - if drop: - centers = centers.drop(drop) - num_peptides = [ - np.count_nonzero(model.labels() == jj) - for jj in range(1, model.n_components + 1) - ] - for i in range(centers.shape[0]): - cl = pd.DataFrame(centers.iloc[i, :]).T - m = pd.melt( - cl, value_vars=list(cl.columns), value_name="p-signal", var_name="Lines" - ) - m["p-signal"] = m["p-signal"].astype("float64") - sns.lineplot( - x="Lines", y="p-signal", data=m, color="#658cbb", ax=ax[i], linewidth=2 - ) - ax[i].set_xticklabels(xlabels, rotation=45) - ax[i].set_xticks(np.arange(len(xlabels))) - ax[i].set_ylabel("$log_{10}$ p-signal") - ax[i].xaxis.set_tick_params(bottom=True) - ax[i].set_xlabel("") - ax[i].set_title( - "Cluster " - + str(centers.index[i] + 1) - + " Center " - + "(" - + "n=" - + str(num_peptides[i]) - + ")" - ) - if yaxis: - ax[i].set_ylim([yaxis[0], yaxis[1]]) - - -def plotMotifs(pssms, axes, titles=False, yaxis=False): +def plot_motifs(pssm, ax: axes.Axes, titles=False, yaxis=False): """Generate logo plots of a list of PSSMs""" - for i, ax in enumerate(axes): - pssm = pssms[i].T - if pssm.shape[0] == 11: - pssm.index = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5] - elif pssm.shape[0] == 9: - pssm.index = [-5, -4, -3, -2, -1, 1, 2, 3, 4] - logo = lm.Logo( - pssm, - font_name="Arial", - vpad=0.1, - width=0.8, - flip_below=False, - center_values=False, - ax=ax, - ) - logo.ax.set_ylabel("log_{2} (Enrichment Score)") - logo.style_xticks(anchor=1, spacing=1) - if titles: - logo.ax.set_title(titles[i] + " Motif") - else: - logo.ax.set_title("Motif Cluster " + str(i + 1)) - if yaxis: - logo.ax.set_ylim([yaxis[0], yaxis[1]]) - - -def plot_LassoCoef(ax, model, title=False): - """Plot Lasso Coefficients""" - coefs = pd.DataFrame(model.coef_).T - coefs.index += 1 - coefs = coefs.reset_index() - coefs.columns = ["Cluster", "Viability", "Apoptosis", "Migration", "Island"] - m = pd.melt( - coefs, - id_vars="Cluster", - value_vars=list(coefs.columns)[1:], - var_name="Phenotype", - value_name="Coefficient", + pssm = pssm.T + pssm = pd.DataFrame(pssm) + if pssm.shape[0] == 11: + pssm.index = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5] + elif pssm.shape[0] == 9: + pssm.index = [-5, -4, -3, -2, -1, 1, 2, 3, 4] + pssm.columns = AAlist + logo = lm.Logo( + pssm, + font_name="Arial", + vpad=0.1, + width=0.8, + flip_below=False, + center_values=False, + ax=ax, ) - sns.barplot(x="Cluster", y="Coefficient", hue="Phenotype", data=m, ax=ax) - if title: - ax.set_title(title) + logo.ax.set_ylabel("log_{2} (Enrichment Score)") + logo.style_xticks(anchor=1, spacing=1) + if titles: + logo.ax.set_title(titles + " Motif") + else: + logo.ax.set_title("Motif Cluster 1") + if yaxis: + logo.ax.set_ylim([yaxis[0], yaxis[1]]) -def plotDistanceToUpstreamKinase( - model, - clusters, - ax, - kind="strip", - num_hits=5, - additional_pssms=False, - add_labels=False, - title=False, - PsP_background=True, +def plot_cluster_kinase_distances( + distances: pd.DataFrame, pssms: np.ndarray, ax, num_hits=1 ): - """Plot Frobenius norm between kinase PSPL and cluster PSSMs""" - ukin = model.predict_UpstreamKinases( - additional_pssms=additional_pssms, - add_labels=add_labels, - PsP_background=PsP_background, + pssm_names = distances.columns + + # these centering lines make no sense, but they were used in the original + # publication-version of this code + distances = distances.sub(distances.mean(axis=1), axis=0) + distances = distances.sub(distances.mean(axis=0), axis=1) + + # melt distances + distances = pd.melt( + distances.reset_index(names="Kinase"), + id_vars="Kinase", + value_vars=list(distances.columns), + var_name="PSSM name", + value_name="Frobenius Distance", ) - ukin_mc = MeanCenter(ukin, mc_col=True, mc_row=True) - cOG = np.array(clusters).copy() - if isinstance(add_labels, list): - clusters += add_labels - data = ukin_mc.sort_values(by="Kinase").set_index("Kinase")[clusters] - if kind == "heatmap": - sns.heatmap(data.T, ax=ax, xticklabels=data.index) - cbar = ax.collections[0].colorbar - cbar.ax.tick_params(labelsize=7) - ax.set_ylabel("Cluster") - - elif kind == "strip": - data = pd.melt( - data.reset_index(), - id_vars="Kinase", - value_vars=list(data.columns), - var_name="Cluster", - value_name="Frobenius Distance", - ) - if isinstance(add_labels, list): - # Actual ERK predictions - data["Cluster"] = data["Cluster"].astype(str) - d1 = data[~data["Cluster"].str.contains("_S")] - sns.stripplot(data=d1, x="Cluster", y="Frobenius Distance", ax=ax[0]) - print(cOG) - AnnotateUpstreamKinases(model, list(cOG) + ["ERK2+"], ax[0], d1, 1) - - # Shuffled - d2 = data[data["Kinase"] == "ERK2"] - d2["Shuffled"] = ["_S" in s for s in d2["Cluster"]] - d2["Cluster"] = [s.split("_S")[0] for s in d2["Cluster"]] - sns.stripplot( - data=d2, - x="Cluster", - y="Frobenius Distance", - hue="Shuffled", - ax=ax[1], - size=8, - ) - ax[1].set_title("ERK2 Shuffled Positions") - ax[1].legend(prop={"size": 10}, loc="lower left") - DrawArrows(ax[1], d2) - else: - sns.stripplot(data=data, x="Cluster", y="Frobenius Distance", ax=ax) - AnnotateUpstreamKinases(model, clusters, ax, data, num_hits) - if title: - ax.set_title(title) - - -def AnnotateUpstreamKinases(model, clusters, ax, data, num_hits=1): - """Annotate upstream kinase predictions""" - data.iloc[:, 1] = data.iloc[:, 1].astype(str) - pssms, _ = model.pssms() - for ii, c in enumerate(clusters, start=1): - cluster = data[data.iloc[:, 1] == str(c)] - hits = cluster.sort_values(by="Frobenius Distance", ascending=True) - hits.index = np.arange(hits.shape[0]) - hits["Phosphoacceptor"] = [KinToPhosphotypeDict[kin] for kin in hits["Kinase"]] + sns.stripplot(data=distances, x="PSSM name", y="Frobenius Distance", ax=ax) + + # Annotate upstream kinase predictions + for i, pssm_name in enumerate(pssm_names): + distances_pssm = distances[distances["PSSM name"] == pssm_name] + distances_pssm = distances_pssm.sort_values( + by="Frobenius Distance", ascending=True + ) + distances_pssm = distances_pssm.reset_index(drop=True) + # assert that the kinase phosphoacceptor and most frequent phosphoacceptor in the pssm match + distances_pssm["Phosphoacceptor"] = [ + KinToPhosphotypeDict[kin] for kin in distances_pssm["Kinase"] + ] try: - cCP = pssms[c - 1].iloc[:, 5].idxmax() - except BaseException: - cCP == "S/T" - if cCP == "S" or cCP == "T": - cCP = "S/T" - hits = hits[hits["Phosphoacceptor"] == cCP] + most_frequent_phosphoacceptor = AAlist(pssms[i, 5].argmax()) + except: + most_frequent_phosphoacceptor = "S/T" + if most_frequent_phosphoacceptor == "S" or most_frequent_phosphoacceptor == "T": + most_frequent_phosphoacceptor = "S/T" + distances_pssm = distances_pssm[ + distances_pssm["Phosphoacceptor"] == most_frequent_phosphoacceptor + ] for jj in range(num_hits): ax.annotate( - hits["Kinase"].iloc[jj], - (ii - 1, hits["Frobenius Distance"].iloc[jj] - 0.01), + distances_pssm["Kinase"].iloc[jj], + (i, distances_pssm["Frobenius Distance"].iloc[jj] - 0.01), fontsize=8, ) ax.legend().remove() ax.set_title("Kinase vs Cluster Motif") -def DrawArrows(ax, d2): - data_shuff = d2[d2["Shuffled"]] - actual_erks = d2[d2["Shuffled"] == False] - arrow_lengths = ( - np.add( - data_shuff["Frobenius Distance"].values, - abs(actual_erks["Frobenius Distance"].values), - ) - * -1 - ) - for dp in range(data_shuff.shape[0]): - ax.arrow( - dp, - data_shuff["Frobenius Distance"].iloc[dp] - 0.1, - 0, - arrow_lengths[dp] + 0.3, - head_width=0.25, - head_length=0.15, - width=0.025, - fc="black", - ec="black", - ) - - -def ShuffleClusters(shuffle, model, additional=False): - """Returns PSSMs with shuffled positions""" - ClustersToShuffle = np.array(shuffle) - pssms, _ = model.pssms(PsP_background=False) - s_pssms = [] - for s in ClustersToShuffle: - mat = ShufflePositions(pssms[s]) - s_pssms.append(mat) - - if not isinstance(additional, bool): - mat = ShufflePositions(additional) - s_pssms.append(mat) +def get_pvals_across_clusters( + label: pd.Series | np.ndarray[bool], centers: pd.DataFrame | np.ndarray +) -> np.ndarray[float]: + pvals = [] + if isinstance(centers, pd.DataFrame): + centers = centers.values + centers_pos = centers[label] + centers_neg = centers[~label] + for i in range(centers.shape[1]): + pvals.append(mannwhitneyu(centers_pos[:, i], centers_neg[:, i])[1]) + return multipletests(pvals)[1] - return s_pssms - -def ShufflePositions(pssm): - """Shuffles the positions of input PSSMs""" - pssm = np.array(pssm) - mat = pssm[:, np.random.permutation([0, 1, 2, 3, 4, 6, 7, 8, 9])] - mat = np.insert(mat, 5, pssm[:, 5], axis=1) - mat = np.insert(mat, 1, pssm[:, -1], axis=1) - mat = pd.DataFrame(mat) - mat.index = AAlist - return mat - - -def plot_clusters_binaryfeatures(centers, id_var, ax, pvals=False, loc="best"): - """Plot p-signal of binary features (tumor vs NAT or mutational status) per cluster""" - data = pd.melt( - id_vars=id_var, - value_vars=centers.columns[:-1], +def plot_p_signal_across_clusters_and_binary_feature( + feature: pd.Series | np.ndarray[bool], + centers: pd.DataFrame | np.ndarray, + label_name: str, + ax, +) -> None: + centers = centers.copy() + centers_labeled = centers.copy() + centers_labeled[label_name] = feature + df_violin = centers_labeled.reset_index().melt( + id_vars=label_name, + value_vars=centers.columns, value_name="p-signal", var_name="Cluster", - frame=centers, ) sns.violinplot( + data=df_violin, x="Cluster", y="p-signal", - hue=id_var, - data=data, + hue=label_name, dodge=True, ax=ax, linewidth=0.25, - fliersize=2, ) - ax.legend(prop={"size": 8}, loc=loc) - - if not isinstance(pvals, bool): - for ii, s in enumerate(pvals["Significant"]): - y, h, col = data["p-signal"].max(), 0.05, "k" - if s == "NS": - continue - elif s == "<0.05": - mark = "*" - else: - mark = "**" - ax.text(ii, y + h, mark, ha="center", va="bottom", color=col, fontsize=20) - - -def calculate_mannW_pvals(centers, col, feature1, feature2): - """Compute Mann Whitney rank test p-values corrected for multiple tests.""" - pvals, clus = [], [] - for ii in centers.columns[:-1]: - x = centers.loc[:, [ii, col]] - x1 = x[x[col] == feature1].iloc[:, 0] - x2 = x[x[col] == feature2].iloc[:, 0] - pval = mannwhitneyu(x1, x2)[1] - pvals.append(pval) - clus.append(ii) - return dict(zip(clus, multipletests(pvals)[1])) - - -def build_pval_matrix(ncl, pvals): - """Build data frame with pvalues per cluster""" - data = pd.DataFrame() - data["Clusters"] = pvals.keys() - data["p-value"] = pvals.values() - signif = [] - for val in pvals.values(): - if 0.01 < val < 0.05: - signif.append("<0.05") - elif val < 0.01: - signif.append("<0.01") + ax.legend(prop={"size": 8}) + annotation_height = df_violin["p-signal"].max() + 0.02 + for i, pval in enumerate(get_pvals_across_clusters(feature, centers)): + if pval < 0.05: + annotation = "*" + elif pval < 0.01: + annotation = "**" else: - signif.append("NS") - data["Significant"] = signif - return data + continue + ax.text(i, annotation_height, annotation, ha="center", va="bottom", fontsize=10) -def TumorType(X): - """Add NAT vs Tumor column.""" - tumortype = [] - for i in range(X.shape[0]): - if X["Patient_ID"][i].endswith(".N"): - tumortype.append("NAT") - else: - tumortype.append("Tumor") - X["Type"] = tumortype - return X - - -def ExportClusterFile(cluster, cptac=False, mcf7=False): - """Export cluster SVG file for NetPhorest and GO analysis.""" - if cptac: - c = pd.read_csv( - "ddmc/data/cluster_members/CPTAC_DDMC_35CL_W100_MembersCluster" - + str(cluster) - + ".csv" - ) - if mcf7: - c = pd.read_csv( - "ddmc/data/cluster_members/msresist/data/cluster_members/CPTAC_MF7_20CL_W5_MembersCluster" - + str(cluster) - + ".csv" - ) - c["pos"] = [s.split(s[0])[1].split("-")[0] for s in c["Position"]] - c["res"] = [s[0] for s in c["Position"]] - c.insert(4, "Gene_Human", [s + "_HUMAN" for s in c["Gene"]]) - c = c.drop(["Position"], axis=1) - drop_list = [ - "NHSL2", - "MAGI3", - "SYNC", - "LMNB2", - "PLS3", - "PI4KA", - "SYNM", - "MAP2", - "MIA2", - "SPRY4", - "KSR1", - "RUFY2", - "MAP11", - "MGA", - "PRR12", - "PCLO", - "NCOR2", - "BNIP3", - "CENPF", - "OTUD4", - "RPA1", - "CLU", - "CDK18", - "CHD1L", - "DEF6", - "MAST4", - "SSR3", - ] - for gene in drop_list: - c = c[c["Gene"] != gene] - c.to_csv("Cluster_" + str(cluster) + ".csv") - - -def plot_NetPhoresScoreByKinGroup(PathToFile, ax, n=5, title=False, color="royalblue"): - """Plot top scoring kinase groups""" - NPtoCumScore = {} - X = pd.read_csv(PathToFile) - for ii in range(X.shape[0]): - curr_NPgroup = X["netphorest_group"][ii] - if curr_NPgroup == "any_group": - continue - elif curr_NPgroup not in NPtoCumScore.keys(): - NPtoCumScore[curr_NPgroup] = X["netphorest_score"][ii] - else: - NPtoCumScore[curr_NPgroup] += X["netphorest_score"][ii] - X = pd.DataFrame.from_dict(NPtoCumScore, orient="index").reset_index() - X.columns = ["KIN Group", "NetPhorest Score"] - X["KIN Group"] = [s.split("_")[0] for s in X["KIN Group"]] - X = X.sort_values(by="NetPhorest Score", ascending=False).iloc[:n, :] - sns.stripplot( - data=X, - y="KIN Group", - x="NetPhorest Score", - ax=ax, - orient="h", - color=color, - size=5, - **{"linewidth": 1}, - **{"edgecolor": "black"}, +def plot_pca_on_cluster_centers( + centers: pd.DataFrame, + axes, + hue_scores: np.ndarray = None, + hue_scores_title: str = None, + hue_loadings: np.ndarray = None, + hue_loadings_title: str = None, +): + # run PCA on cluster centers + pca = PCA(n_components=2) + scores = pca.fit_transform(centers) # sample by PCA component + loadings = pca.components_ # PCA component by cluster + variance_explained = np.round(pca.explained_variance_ratio_, 2) + + # plot scores + sns.scatterplot( + x=scores[:, 0], + y=scores[:, 1], + hue=hue_scores, + ax=axes[0], + **{"linewidth": 0.5, "edgecolor": "k"}, ) - if title: - ax.set_title(title) - else: - ax.set_title("Kinase Predictions") - - -def make_BPtoGenes_table(X, cluster): - d = X[["Clusters", "Description", "geneID"]] - d = d[d["Clusters"] == cluster] - gAr = d[["geneID"]].values - bpAr = d[["Description"]].values - mg = mygene.MyGeneInfo() - BPtoGenesDict = {} - for ii, arr in enumerate(gAr): - gg = mg.querymany( - list(arr[0].split("/")), - scopes="entrezgene", - fields="symbol", - species="human", - returnall=False, - as_dataframe=True, + if hue_scores_title: + axes[0].legend( + loc="lower left", prop={"size": 9}, title=hue_scores_title, fontsize=9 ) - BPtoGenesDict[bpAr[ii][0]] = list(gg["symbol"]) - return pd.DataFrame(dict([(k, pd.Series(v)) for k, v in BPtoGenesDict.items()])) - - -def merge_binary_vectors(y, mutant1, mutant2): - """Merge binary mutation status vectors to identify all patients having one of the two mutations""" - y1 = y[mutant1] - y2 = y[mutant2] - y_ = np.zeros(y.shape[0]) - for binary in [y1, y2]: - indices = [i for i, x in enumerate(binary) if x == 1] - y_[indices] = 1 - return pd.Series(y_) - - -def find_patients_with_NATandTumor(X, label, conc=False): - """Reshape data to display patients as rows and samples (Tumor and NAT per cluster) as columns. - Note that to do so, samples that don't have their tumor/NAT counterpart are dropped. - """ - xT = X[~X[label].str.endswith(".N")].sort_values(by=label) - xN = X[X[label].str.endswith(".N")].sort_values(by=label) - l1 = list(xT[label]) - l2 = [s.split(".N")[0] for s in xN[label]] - dif = [i for i in l1 + l2 if i not in l1 or i not in l2] - X = xT.set_index(label).drop(dif) - assert all(X.index.values == np.array(l2)), "Samples don't match" - - if conc: - xN = xN.set_index(label) - xN.index = l2 - xN.columns = [str(i) + "_Normal" for i in xN.columns] - X.columns = [str(i) + "_Tumor" for i in X.columns] - X = pd.concat([X, xN], axis=1) - return X - - -def TransformCenters(model, X): - """For a given model, find centers and transform for regression.""" - centers = pd.DataFrame(model.transform()).T - centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform( - centers.iloc[:, :] + axes[0].set_title("PCA Scores") + axes[0].set_xlabel( + "PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10 ) - centers = centers.T - centers.columns = np.arange(model.n_components) + 1 - centers["Patient_ID"] = X.columns[4:] - centers1 = find_patients_with_NATandTumor(centers.copy(), "Patient_ID", conc=True) - centers2 = ( - centers.loc[~centers["Patient_ID"].str.endswith(".N"), :] - .sort_values(by="Patient_ID") - .set_index("Patient_ID") + axes[0].set_ylabel( + "PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10 ) - return centers1, centers2 - -def HotColdBehavior(centers): - # Import Cold-Hot Tumor data - y = ( - pd.read_csv("ddmc/data/CPTAC_LUAD/Hot_Cold.csv") - .dropna(axis=1) - .sort_values(by="Sample ID") + # plot loadings + sns.scatterplot( + x=loadings[0], + y=loadings[1], + ax=axes[1], + hue=hue_loadings, + **{"linewidth": 0.5, "edgecolor": "k"}, + ) + if hue_loadings_title: + axes[1].legend(title="p < 0.01", prop={"size": 8}) + axes[1].set_title("PCA Loadings") + axes[1].set_xlabel( + "PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10 ) - y = y.loc[~y["Sample ID"].str.endswith(".N"), :].set_index("Sample ID") - l1 = list(centers.index) - l2 = list(y.index) - dif = [i for i in l1 + l2 if i not in l1 or i not in l2] - centers = centers.drop(dif) - - # Transform to binary - y = y.replace("Cold-tumor enriched", 0) - y = y.replace("Hot-tumor enriched", 1) - y = np.squeeze(y) - - # Remove NAT-enriched samples - centers = centers.drop(y[y == "NAT enriched"].index) - y = y.drop(y[y == "NAT enriched"].index).astype(int) - assert all(centers.index.values == y.index.values), "Samples don't match" - - return y, centers + axes[1].set_ylabel( + "PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10 + ) + for j, txt in enumerate(centers.columns): + axes[1].annotate( + txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10 + ) diff --git a/ddmc/figures/figureM2.py b/ddmc/figures/figureM2.py index 77fa2b814..5746a5db7 100644 --- a/ddmc/figures/figureM2.py +++ b/ddmc/figures/figureM2.py @@ -1,15 +1,11 @@ -""" -This creates Figure 2: Evaluation of Imputating Missingness -""" -import matplotlib import numpy as np from scipy.stats import gmean import pandas as pd import seaborn as sns -from .common import subplotLabel, getSetup +from .common import getSetup from ..clustering import DDMC -from ..pre_processing import filter_NaNpeptides from fancyimpute import IterativeSVD +from ddmc.datasets import CPTAC def makeFigure(): @@ -17,65 +13,48 @@ def makeFigure(): # Get list of axis objects ax, f = getSetup((10, 10), (3, 3), multz={0: 2}) - # Set plotting format - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="whitegrid", - font_scale=1, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, - ) - - # Add subplot labels - subplotLabel(ax) - # diagram explaining reconstruction process ax[0].axis("off") + n_clusters = np.array([1, 45]) + # Imputation error across Cluster numbers - dataC_W0 = ErrorAcross( - "Binomial", [0] * 12, n_clusters=np.arange(1, 46, 4), n_runs=3 + dataC_W0 = run_repeated_imputation( + "Binomial", [0] * len(n_clusters), n_clusters=n_clusters, n_runs=1 ) - plotErrorAcrossNumberOfClustersOrWeights(ax[1], dataC_W0, "Clusters") - ax[1].set_ylim(10.5, 12) - dataC_W25 = ErrorAcross( - "Binomial", [100] * 12, n_clusters=np.arange(1, 46, 4), n_runs=3 + plot_imputation_errs(ax[1], dataC_W0, "Clusters") + + dataC_W25 = run_repeated_imputation( + "Binomial", [100] * len(n_clusters), n_clusters=n_clusters, n_runs=1 ) - dataC_W0.iloc[:, 2] = 1250 - plotErrorAcrossNumberOfClustersOrWeights(ax[2], dataC_W25, "Clusters") - ax[2].set_ylim(10.5, 12) - dataC_W100 = ErrorAcross( - "Binomial", [1000000] * 12, n_clusters=np.arange(1, 46, 4), n_runs=3 + plot_imputation_errs(ax[2], dataC_W25, "Clusters") + + dataC_W100 = run_repeated_imputation( + "Binomial", [1000000] * len(n_clusters), n_clusters=n_clusters, n_runs=1 ) - plotErrorAcrossNumberOfClustersOrWeights(ax[3], dataC_W100, "Clusters") - ax[3].set_ylim(10.5, 12) + plot_imputation_errs(ax[3], dataC_W100, "Clusters") - # Imputation error across different Weights - weights = [0, 50, 100, 250, 500, 750, 1000, 1000000] - dataW_2C = ErrorAcross( - "Binomial", weights=weights, n_clusters=[2] * len(weights), n_runs=3 + # Imputation error across different weights + weights = [0, 100] + dataW_2C = run_repeated_imputation( + "Binomial", weights=weights, n_clusters=[2] * len(weights), n_runs=1 ) - dataW_2C.replace(1000000, 1250) - plotErrorAcrossNumberOfClustersOrWeights(ax[4], dataW_2C, "Weight", legend=False) - ax[4].set_ylim(10.5, 12) - dataW_20C = ErrorAcross( - "Binomial", weights=weights, n_clusters=[20] * len(weights), n_runs=3 + plot_imputation_errs(ax[4], dataW_2C, "Weight") + + dataW_20C = run_repeated_imputation( + "Binomial", weights=weights, n_clusters=[20] * len(weights), n_runs=1 ) - dataW_20C.replace(1000000, 1250) - plotErrorAcrossNumberOfClustersOrWeights(ax[5], dataW_20C, "Weight", legend=False) - ax[5].set_ylim(10.5, 12) - dataW_40C = ErrorAcross( - "Binomial", weights=weights, n_clusters=[40] * len(weights), n_runs=3 + plot_imputation_errs(ax[5], dataW_20C, "Weight") + + dataW_40C = run_repeated_imputation( + "Binomial", weights=weights, n_clusters=[40] * len(weights), n_runs=1 ) - dataW_40C.replace(1000000, 1250) - plotErrorAcrossNumberOfClustersOrWeights(ax[6], dataW_40C, "Weight", legend=False) - ax[6].set_ylim(10.5, 12) + plot_imputation_errs(ax[6], dataW_40C, "Weight") return f -def plotErrorAcrossNumberOfClustersOrWeights(ax, data, kind, legend=True): +def plot_imputation_errs(ax, data, kind): """Plot artificial missingness error across different number of clusters or weighths.""" if kind == "Weight": title = "Weight Selection" @@ -86,18 +65,16 @@ def plotErrorAcrossNumberOfClustersOrWeights(ax, data, kind, legend=True): gm["DDMC"] = np.log(gm["DDMC"]) gm["Average"] = np.log(data.groupby([kind]).Average.apply(gmean).values) gm["Zero"] = np.log(data.groupby([kind]).Zero.apply(gmean).values) - # gm["Minimum"] = np.log(data.groupby([kind]).Minimum.apply(gmean).values) gm["PCA"] = np.log(data.groupby([kind]).PCA.apply(gmean).values) - gm.to_csv("WeightSearch.csv") - sns.regplot( x=kind, y="DDMC", data=gm, - scatter_kws={"alpha": 0.25}, + scatter_kws={"alpha": 0.5}, color="darkblue", ax=ax, + ci=None, label="DDMC", lowess=True, ) @@ -105,41 +82,44 @@ def plotErrorAcrossNumberOfClustersOrWeights(ax, data, kind, legend=True): x=kind, y="Average", data=gm, + ci=None, color="black", scatter=False, ax=ax, label="Average", ) sns.regplot( - x=kind, y="Zero", data=gm, color="lightblue", scatter=False, ax=ax, label="Zero" + x=kind, + y="Zero", + data=gm, + color="lightblue", + scatter=False, + ax=ax, + label="Zero", + ci=None, ) - # sns.regplot(x=kind, y="Minimum", data=gm, color="green", scatter=False, ax=ax, label="Minimum") sns.regplot( - x=kind, y="PCA", data=gm, color="orange", scatter=False, ax=ax, label="PCA" + x=kind, + y="PCA", + data=gm, + color="orange", + scatter=False, + ax=ax, + label="PCA", + ci=None, ) ax.set_title(title) ax.set_ylabel("log(MSE)—Actual vs Imputed") ax.legend(prop={"size": 10}, loc="upper left") - if not legend: - ax.legend().remove() -def ErrorAcross(distance_method, weights, n_clusters, n_runs=1, tmt=6): - """Calculate missingness error across different number of clusters.""" +def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1): + """Calculate missingness error across different numbers of clusters and/or weights.""" assert len(weights) == len(n_clusters) - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=tmt, - ) - X.index = np.arange(X.shape[0]) - md = X.copy() - info = md.select_dtypes(include=["object"]) - X = X.select_dtypes(include=["float64"]) - StoE = pd.read_csv("ddmc/data/MS/CPTAC/IDtoExperiment.csv") - assert all(StoE.iloc[:, 0] == X.columns), "Sample labels don't match." - X = X.to_numpy() - tmtIDX = StoE["Experiment (TMT10plex)"].to_numpy() - assert X.shape[1] == tmtIDX.size + cptac = CPTAC() + p_signal = cptac.get_p_signal(min_experiments=6) + print(p_signal.shape) + sample_to_experiment = cptac.get_sample_to_experiment() df = pd.DataFrame( columns=[ @@ -155,46 +135,72 @@ def ErrorAcross(distance_method, weights, n_clusters, n_runs=1, tmt=6): ) for ii in range(n_runs): - Xmiss = IncorporateMissingValues(X, tmtIDX) - baseline_errors = ComputeBaselineErrors(X, Xmiss) + X_miss = p_signal.copy() + X_miss.iloc[:, :] = add_missingness(p_signal.values, sample_to_experiment) + baseline_imputations = [ + impute_mean(X_miss.values), + impute_zero(X_miss.values), + impute_min(X_miss.values), + impute_pca(X_miss.values, 5), + ] + baseline_errs = [ + imputation_error(p_signal.values, X_impute) + for X_impute in baseline_imputations + ] for jj, cluster in enumerate(n_clusters): - model = DDMC(info, cluster, weights[jj], distance_method).fit(Xmiss.T) - eDDMC = np.nansum(np.square(X - model.impute(Xmiss))) - dfs = pd.Series( - [ii, cluster, weights[jj], eDDMC, *baseline_errors], index=df.columns - ) - print(df) - print(dfs) - df = pd.concat([df, dfs], ignore_index=True) - + df.loc[len(df)] = [ + ii, + cluster, + weights[jj], + imputation_error( + p_signal.values, + impute_ddmc(X_miss, cluster, weights[jj], distance_method).values, + ), + *baseline_errs, + ] return df -def IncorporateMissingValues(X, tmtIDX): +def add_missingness(p_signal, sample_to_experiment): """Remove a random TMT experiment for each peptide.""" + p_signal = p_signal.copy() + for ii in range(p_signal.shape[0]): + experiments = sample_to_experiment[np.isfinite(p_signal[ii, :])] + p_signal[ + ii, sample_to_experiment == np.random.choice(np.unique(experiments)) + ] = np.nan + return p_signal + + +def imputation_error(X, X_impute): + # returns MSE between X and X_impute + mse = np.nansum(np.square(X - X_impute)) + return mse + + +def impute_zero(X): + X = X.copy() + X[np.isnan(X)] = 0.0 + return X + + +def impute_min(X): X = X.copy() - for ii in range(X.shape[0]): - tmtNum = tmtIDX[np.isfinite(X[ii, :])] - X[ii, tmtIDX == np.random.choice(np.unique(tmtNum))] = np.nan + np.copyto(X, np.nanmin(X, axis=0, keepdims=True), where=np.isnan(X)) return X -def ComputeBaselineErrors(X, d, ncomp=5): - """Compute error between baseline methods (i.e. average signal, minimum signal, zero, and PCA) and real value.""" - dmean = d.copy() - np.copyto(dmean, np.nanmean(d, axis=0, keepdims=True), where=np.isnan(d)) - emean = np.nansum(np.square(X - dmean)) +def impute_mean(X): + X = X.copy() + np.copyto(X, np.nanmean(X, axis=0, keepdims=True), where=np.isnan(X)) + return X - dmin = d.copy() - np.copyto(dmin, np.nanmin(d, axis=0, keepdims=True), where=np.isnan(d)) - emin = np.nansum(np.square(X - dmin)) - dzero = d.copy() - np.copyto(dzero, 0.0, where=np.isnan(d)) - ezero = np.nansum(np.square(X - dzero)) +def impute_pca(X, rank): + X = X.copy() + return IterativeSVD(rank=rank, verbose=False).fit_transform(X) - dpca = IterativeSVD(rank=ncomp, verbose=False).fit_transform(d.copy()) - epca = np.nansum(np.square(X - dpca)) - return emean, ezero, emin, epca +def impute_ddmc(p_signal, n_clusters, weight, distance_method): + return DDMC(n_clusters, weight, distance_method).fit(p_signal).impute() diff --git a/ddmc/figures/figureM3.py b/ddmc/figures/figureM3.py index 5b6234308..97635f530 100644 --- a/ddmc/figures/figureM3.py +++ b/ddmc/figures/figureM3.py @@ -5,150 +5,94 @@ import numpy as np import pandas as pd import seaborn as sns -import matplotlib -from ..clustering import DDMC -from ..binomial import AAlist -from .common import subplotLabel, getSetup -from ..pca import plotPCA -from .common import plotDistanceToUpstreamKinase, plotMotifs -from ..clustering import compute_control_pssm -from ..binomial import AAlist -from ..motifs import DictProteomeNameToSeq -from ..pre_processing import filter_NaNpeptides + +from ddmc.clustering import DDMC, compute_control_pssm, get_pspl_pssm_distances +from ddmc.binomial import AAlist +from ddmc.figures.common import ( + getSetup, + plot_motifs, + plot_cluster_kinase_distances, + plot_pca_on_cluster_centers, +) +from ddmc.datasets import CPTAC, EBDT +from ddmc.motifs import get_pspls def makeFigure(): """Get a list of the axis objects and create a figure""" # Get list of axis objects - ax, f = getSetup((12, 12), (3, 3), multz={3: 1}) - - # Add subplot labels - subplotLabel(ax) - - # Set plotting format - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="whitegrid", - font_scale=1, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, - ) + axes, f = getSetup((12, 12), (3, 3), multz={3: 1}) - # Import signaling data - x = preprocess_ebdt_mcf7() - d = x.select_dtypes(include=[float]).T - i = x.select_dtypes(include=[object]) - - # Fit DDMC and find centers - model = DDMC( - i, n_components=20, SeqWeight=5, distance_method="Binomial", random_state=10 - ).fit(d) - centers = pd.DataFrame(model.transform()) - centers.columns = np.arange(model.n_components) + 1 - centers.insert(0, "Inhibitor", x.columns[3:]) - centers["Inhibitor"] = [s.split(".")[1].split(".")[0] for s in centers["Inhibitor"]] - - # PCA AKT - AKTi = [ - "GSK690693", - "Torin1", - "HS173", - "GDC0941", - "Ku0063794", - "AZ20", - "MK2206", - "AZD5363", - "GDC0068", - "AZD6738", - "AT13148", - "Edelfosine", - "GF109203X", - "AZD8055", - ] - centers["AKTi"] = [drug in AKTi for drug in centers["Inhibitor"]] - plotPCA(ax[:2], centers, 2, ["Inhibitor", "AKTi"], "Cluster", hue_scores="AKTi") - ax[0].legend(loc="lower left", prop={"size": 9}, title="AKTi", fontsize=9) - - # Upstream Kinases AKT EBDT - plotDistanceToUpstreamKinase(model, [16], ax=ax[2], num_hits=1) - - # first plot heatmap of clusters - ax[3].axis("off") - - # AKT substrates bar plot - plot_NetPhoresScoreByKinGroup( - "ddmc/data/cluster_analysis/MCF7_NKIN_CL16.csv", - ax[4], - title="Cluster 16—Kinase Predictions", - n=40, - ) + plot_fig_3abd(axes[0], axes[1], axes[2]) - # # ERK2 White lab motif - erk2 = pd.read_csv("ddmc/data/Validations/Computational/ERK2_substrates.csv") - erk2 = compute_control_pssm([s.upper() for s in erk2["Peptide"]]) - erk2 = pd.DataFrame(np.clip(erk2, a_min=0, a_max=3)) - erk2.index = AAlist - plotMotifs([erk2], axes=[ax[5]], titles=["ERK2"]) + # 3c cannot be automatically placed on figure because of seaborn limitation + axes[3].axis("off") - # ERK2 prediction - # Import signaling data - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, + plot_fig_3e( + axes[4], ) - d = X.select_dtypes(include=[float]).T - i = X.select_dtypes(include=[object]) - # Fit DDMC - model_cptac = DDMC( - i, n_components=30, SeqWeight=100, distance_method="Binomial", random_state=5 - ).fit(d) - - s_pssms = ShuffleClusters([3, 7, 21], model_cptac, additional=erk2) - plotDistanceToUpstreamKinase( - model_cptac, - [3, 7, 21], - additional_pssms=s_pssms + [erk2], - add_labels=["3_S", "7_S", "21_S", "ERK2+_S", "ERK2+"], - ax=ax[-2:], - num_hits=1, - ) + plot_fig_3fgh(*axes[5:8]) return f -def ShuffleClusters(shuffle, model, additional=False): - """Returns PSSMs with shuffled positions""" - ClustersToShuffle = np.array(shuffle) - pssms, _ = model.pssms(PsP_background=False) - s_pssms = [] - for s in ClustersToShuffle: - mat = ShufflePositions(pssms[s]) - s_pssms.append(mat) - - if not isinstance(additional, bool): - mat = ShufflePositions(additional) - s_pssms.append(mat) +def plot_fig_3abd(ax_a, ax_b, ax_d): + # Import signaling data + p_signal = EBDT().get_p_signal() - return s_pssms + # Fit DDMC + model = DDMC( + n_components=20, + seq_weight=5, + distance_method="Binomial", + random_state=10, + ).fit(p_signal) + + # get cluster centers + centers = model.transform(as_df=True) + + # parse inhibitor names from sample names + inhibitors = [s.split(".")[1].split(".")[0] for s in p_signal.columns] + + # create new bool array specifying whether each inhibitor is an AKT inhibitor + is_AKTi = [ + drug + in [ + "GSK690693", + "Torin1", + "HS173", + "GDC0941", + "Ku0063794", + "AZ20", + "MK2206", + "AZD5363", + "GDC0068", + "AZD6738", + "AT13148", + "Edelfosine", + "GF109203X", + "AZD8055", + ] + for drug in inhibitors + ] + plot_pca_on_cluster_centers( + centers, [ax_a, ax_b], hue_scores=is_AKTi, hue_scores_title="AKTi?" + ) -def ShufflePositions(pssm): - """Shuffles the positions of input PSSMs""" - pssm = np.array(pssm) - mat = pssm[:, np.random.permutation([0, 1, 2, 3, 4, 6, 7, 8, 9])] - mat = np.insert(mat, 5, pssm[:, 5], axis=1) - mat = np.insert(mat, 1, pssm[:, -1], axis=1) - mat = pd.DataFrame(mat) - mat.index = AAlist - return mat + # Plot kinase predictions for cluster 16 + plot_cluster_kinase_distances( + model.predict_upstream_kinases()[[16]], + model.get_pssms(PsP_background=True, clusters=[16])[0], + ax=ax_d, + ) -def plot_NetPhoresScoreByKinGroup(PathToFile, ax, n=5, title=False, color="royalblue"): +def plot_fig_3e(ax): """Plot top scoring kinase groups""" NPtoCumScore = {} - X = pd.read_csv(PathToFile) + X = pd.read_csv("ddmc/data/cluster_analysis/MCF7_NKIN_CL16.csv") for ii in range(X.shape[0]): curr_NPgroup = X["netphorest_group"][ii] if curr_NPgroup == "any_group": @@ -160,34 +104,126 @@ def plot_NetPhoresScoreByKinGroup(PathToFile, ax, n=5, title=False, color="royal X = pd.DataFrame.from_dict(NPtoCumScore, orient="index").reset_index() X.columns = ["KIN Group", "NetPhorest Score"] X["KIN Group"] = [s.split("_")[0] for s in X["KIN Group"]] - X = X.sort_values(by="NetPhorest Score", ascending=False).iloc[:n, :] + X = X.sort_values(by="NetPhorest Score", ascending=False).iloc[:40, :] sns.stripplot( data=X, y="KIN Group", x="NetPhorest Score", ax=ax, orient="h", - color=color, + color="royalblue", size=5, **{"linewidth": 1}, **{"edgecolor": "black"}, ) - if title: - ax.set_title(title) - else: - ax.set_title("Kinase Predictions") - - -def plotMCF7AKTclustermap(model, cluster): - """Code to create hierarchical clustering of cluster 1 across treatments""" - c1 = pd.DataFrame(model.transform()[:, cluster - 1]) - X = pd.read_csv("ddmc/data/Validations/Computational/ebdt_mcf7.csv") - index = [col.split("7.")[1].split(".")[0] for col in X.columns[2:]] - c1["Inhibitor"] = index - c1 = c1.set_index("Inhibitor") - lim = np.max(np.abs(c1)) * 0.3 - g = sns.clustermap( - c1, + ax.set_title("Cluster 16—Kinase Predictions") + + +def plot_fig_3fgh(ax_f, ax_g, ax_h): + # plot erk2+ pssm + # ERK2 White lab motif + erk2_pssm = pd.read_csv("ddmc/data/Validations/Computational/ERK2_substrates.csv") + erk2_pssm = compute_control_pssm([s.upper() for s in erk2_pssm["Peptide"]]) + erk2_pssm = pd.DataFrame(np.clip(erk2_pssm, a_min=0, a_max=3)) + erk2_pssm.index = AAlist + plot_motifs(erk2_pssm, ax=ax_f, titles="ERK2") + + p_signal = CPTAC().get_p_signal() + + model = DDMC( + n_components=30, + seq_weight=100, + distance_method="Binomial", + random_state=5, + ).fit(p_signal) + + clusters = [2, 6, 20] + # get pssms from ddmc clusters + pssms = model.get_pssms(PsP_background=True, clusters=clusters) + + # append erk2+ pssm + pssm_names = clusters + ["ERK2+"] + pssms = np.append(pssms, erk2_pssm.values[None, :, :], axis=0) + + # get kinase-pssm specificities + kinases, pspls = get_pspls() + distances = get_pspl_pssm_distances( + pspls, pssms, as_df=True, pssm_names=pssm_names, kinases=kinases + ) + + # plot the specificities + plot_cluster_kinase_distances(distances, pssms, ax_g) + + # plot change in specificity to ERK2 due to shuffling + shuffled_pssms = np.array([shuffle_pssm(pssm) for pssm in pssms]) + shuffled_distances = get_pspl_pssm_distances( + pspls, shuffled_pssms, as_df=True, pssm_names=pssm_names, kinases=kinases + ) + + # reformat data for plotting + melt_distances = lambda ds: ds.reset_index(names="Kinase").melt( + id_vars="Kinase", var_name="pssm_name" + ) + distances_melt = pd.concat( + [ + melt_distances(distances).assign(Shuffled=False), + melt_distances(shuffled_distances).assign(Shuffled=True), + ] + ) + sns.stripplot( + data=distances_melt[distances_melt["Kinase"] == "ERK2"], + x="pssm_name", + y="value", + hue="Shuffled", + ax=ax_h, + size=8, + ) + + ax_h.set_xlabel("Cluster") + ax_h.set_ylabel("Frobenius Distance") + ax_h.set_title("ERK2 Shuffled Positions") + ax_h.legend(prop={"size": 10}, loc="lower left") + + # add arrows from original to shuffled + for i, pssm_name in enumerate(pssm_names): + ax_h.arrow( + i, + distances.loc["ERK2", pssm_name] - 0.1, + 0, + shuffled_distances.loc["ERK2", pssm_name] + - distances.loc["ERK2", pssm_name] + + 0.3, + head_width=0.25, + head_length=0.15, + width=0.025, + fc="black", + ec="black", + ) + + +def shuffle_pssm(pssm): + shuffled_pssm = pssm[:, np.random.permutation([0, 1, 2, 3, 4, 6, 7, 8, 9])] + shuffled_pssm = np.insert(shuffled_pssm, 5, pssm[:, 5], axis=1) + return np.insert(shuffled_pssm, 1, pssm[:, -1], axis=1) + + +def plot_fig_3c(): + """Code to create hierarchical clustering of cluster 0 across treatments""" + p_signal = EBDT().get_p_signal() + model = DDMC( + n_components=20, + seq_weight=5, + distance_method="Binomial", + random_state=10, + ).fit(p_signal) + centers = model.transform(as_df=True) + # the labels are structured as "MCF7..fold" + centers.index = [i[5:-5] for i in centers.index] + # first cluster + center = centers.iloc[:, 0] + lim = np.max(np.abs(center)) * 0.3 + sns.clustermap( + center, method="centroid", cmap="bwr", robust=True, @@ -199,51 +235,3 @@ def plotMCF7AKTclustermap(model, cluster): yticklabels=True, xticklabels=False, ) - - -def preprocess_ebdt_mcf7(): - """Preprocess MCF7 mass spec data set from EBDT (Hijazi et al Nat Biotech 2020)""" - x = ( - pd.read_csv("ddmc/data/Validations/Computational/ebdt_mcf7.csv") - .drop("FDR", axis=1) - .set_index("sh.index.sites") - .drop("ARPC2_HUMAN;") - .reset_index() - ) - x.insert(0, "Gene", [s.split("(")[0] for s in x["sh.index.sites"]]) - x.insert( - 1, - "Position", - [re.search(r"\(([A-Za-z0-9]+)\)", s).group(1) for s in x["sh.index.sites"]], - ) - x = x.drop("sh.index.sites", axis=1) - motifs, del_ids = pos_to_motif(x["Gene"], x["Position"], motif_size=5) - x = x.set_index(["Gene", "Position"]).drop(del_ids).reset_index() - x.insert(0, "Sequence", motifs) - return x - - -def pos_to_motif(genes, pos, motif_size=5): - """Map p-site sequence position to uniprot's proteome and extract motifs.""" - proteome = open("ddmc/data/Sequence_analysis/proteome_uniprot2019.fa", "r") - ProteomeDict = DictProteomeNameToSeq(proteome, n="gene") - motifs = [] - del_GeneToPos = [] - for gene, pos in list(zip(genes, pos)): - try: - UP_seq = ProteomeDict[gene] - except BaseException: - del_GeneToPos.append([gene, pos]) - continue - idx = int(pos[1:]) - 1 - motif = list(UP_seq[max(0, idx - motif_size) : idx + motif_size + 1]) - if ( - len(motif) != motif_size * 2 + 1 - or pos[0] != motif[motif_size] - or pos[0] not in ["S", "T", "Y"] - ): - del_GeneToPos.append([gene, pos]) - continue - motif[motif_size] = motif[motif_size].lower() - motifs.append("".join(motif)) - return motifs, del_GeneToPos diff --git a/ddmc/figures/figureM4.py b/ddmc/figures/figureM4.py index 5f10ceb42..a2575e412 100644 --- a/ddmc/figures/figureM4.py +++ b/ddmc/figures/figureM4.py @@ -1,101 +1,64 @@ -""" -This creates Figure 4: Predictive performance of DDMC clusters using different weights -""" +from typing import List import numpy as np import pandas as pd import seaborn as sns -import matplotlib from sklearn.linear_model import LogisticRegressionCV -from sklearn.preprocessing import StandardScaler -from sklearn.metrics import mean_squared_error -from ..clustering import DDMC -from .common import subplotLabel, getSetup -from ..logistic_regression import plotROC -from ..pre_processing import filter_NaNpeptides +from ddmc.clustering import DDMC +from ddmc.figures.common import getSetup +from ddmc.logistic_regression import plot_roc, normalize_cluster_centers +from ddmc.datasets import CPTAC, select_peptide_subset -def makeFigure(): - """Get a list of the axis objects and create a figure""" - # Get list of axis objects - ax, f = getSetup((8, 4), (1, 3)) - - # Add subplot labels - subplotLabel(ax) - - # Set plotting format - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="whitegrid", - font_scale=0.5, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, - ) - - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, - ) - d = X.select_dtypes(include=[float]).T - i = X.select_dtypes(include=[object]) - # Plot mean AUCs per model - p = pd.read_csv("ddmc/data/Performance/preds_phenotypes_rs_15cl.csv").iloc[:, 1:] - p = p.melt( - id_vars=["Run", "Weight"], - value_vars=d.columns[2:], - value_name="p-signal", - var_name="Phenotype", - ) - out = d[ - (d["Weight"] == 0) - | (d["Phenotype"] == "STK11") & (d["Weight"] == 1000) - | (d["Phenotype"] == "EGFRm/ALKf") & (d["Weight"] == 250) - | (d["Phenotype"] == "Infiltration") & (d["Weight"] == 250) +def makeFigure(): + axes, f = getSetup((5, 5), (2, 2), multz={0: 1}) + + # use small numbers here so it doesn't take forever + regression_results = do_phenotype_regression(n_runs=1, n_cv_folds=2) + plot_phenotype_regression(regression_results, axes[0]) + p_signal = select_peptide_subset(CPTAC().get_p_signal(), keep_num=2000) + models = [ + DDMC( + n_components=30, + seq_weight=0, + distance_method="Binomial", + random_state=5, + ).fit(p_signal), + DDMC( + n_components=30, + seq_weight=250, + distance_method="Binomial", + random_state=5, + ).fit(p_signal), + DDMC( + n_components=30, + seq_weight=1e6, + distance_method="Binomial", + random_state=5, + ).fit(p_signal), ] - out["Model"] = ["DDMC" if s != 0 else "GMM" for s in out["Weight"]] - sns.barplot(data=out, x="Phenotype", y="p-signal", hue="Model", ci=68, ax=ax[0]) - ax[0].legend(prop={"size": 5}, loc=0) - - # Fit Data, Mix, and Seq Models - dataM = DDMC( - i, n_components=30, SeqWeight=0, distance_method="Binomial", random_state=5 - ).fit(d) - mixM = DDMC( - i, n_components=30, SeqWeight=250, distance_method="Binomial", random_state=5 - ).fit(d) - seqM = DDMC( - i, n_components=30, SeqWeight=1e6, distance_method="Binomial", random_state=5 - ).fit(d) - models = [dataM, mixM, seqM] - - # Center to peptide distance - barplot_PeptideToClusterDistances(models, ax[1], n=2000) - - # Position Enrichment - boxplot_TotalPositionEnrichment(models, ax[2]) - + plot_peptide_to_cluster_p_signal_distances(p_signal, models, axes[1]) + plot_total_position_enrichment(models, axes[2]) return f -def calculate_AUCs_phenotypes(ax, X, nRuns=3, n_components=35): +def do_phenotype_regression(n_runs=3, n_components=35, n_cv_folds=3): """Plot mean AUCs per phenotype across weights.""" - # Signaling - d = X.select_dtypes(include=[float]).T - i = X.select_dtypes(include=[object]) + cptac = CPTAC() + p_signal = cptac.get_p_signal() - # Genotype data - mutations = pd.read_csv("ddmc/data/MS/CPTAC/Patient_Mutations.csv") - mOI = mutations[ - ["Sample.ID"] + list(mutations.columns)[45:54] + list(mutations.columns)[61:64] - ] - y = mOI[~mOI["Sample.ID"].str.contains("IR")] - y = find_patients_with_NATandTumor(y.copy(), "Sample.ID", conc=False) + mutations = cptac.get_mutations( + ["STK11.mutation.status", "EGFR.mutation.status", "ALK.fusion"] + ) + stk11 = mutations["STK11.mutation.status"] + egfr = mutations["EGFR.mutation.status"] + alk = mutations["ALK.fusion"] + egfr_or_alk = egfr | alk + hot_cold = cptac.get_hot_cold_labels() - # LASSO lr = LogisticRegressionCV( - cv=3, + cv=n_cv_folds, solver="saga", max_iter=10000, n_jobs=-1, @@ -103,199 +66,114 @@ def calculate_AUCs_phenotypes(ax, X, nRuns=3, n_components=35): class_weight="balanced", ) - weights = [0, 50, 100, 250, 500, 750, 1000, 1000000] - run, ws, stk, ea, hcb = [], [], [], [], [] - for w in weights: - for r in range(nRuns): - run.append(r) - ws.append(w) - model = DDMC( - i, n_components=n_components, SeqWeight=w, distance_method="Binomial" - ).fit(d) - - # Find and scale centers - centers_gen, centers_hcb = TransformCenters(model, X) - - # STK11 - stk.append( - plotROC( - ax, - lr, - centers_gen.values, - y["STK11.mutation.status"], - cv_folds=3, - return_mAUC=True, - kfold="Repeated", - ) + results = pd.DataFrame(columns=["Weight", "STK11", "EGFRm/ALKf", "Infiltration"]) + + for seq_weight in [0, 1e2, 1e6]: + for _ in range(n_runs): + ddmc = DDMC( + n_components=n_components, + seq_weight=seq_weight, + distance_method="Binomial", + ).fit(p_signal) + centers = ddmc.transform(as_df=True) + centers.iloc[:, :] = normalize_cluster_centers(centers.values) + # the available patients vary by label + stk11_auc = plot_roc( + lr, + centers.loc[stk11.index].values, + stk11, + cv_folds=n_cv_folds, + return_mAUC=True, + kfold="Repeated", ) - - # EGFRm/ALKf - y_EA = merge_binary_vectors(y.copy(), "EGFR.mutation.status", "ALK.fusion") - ea.append( - plotROC( - ax, - lr, - centers_gen.values, - y_EA, - cv_folds=3, - return_mAUC=True, - kfold="Repeated", - ) + egfr_or_alk_auc = plot_roc( + lr, + centers.loc[egfr_or_alk.index].values, + egfr_or_alk, + cv_folds=n_cv_folds, + return_mAUC=True, + kfold="Repeated", ) - - # Hot-Cold behavior - y_hcb, centers_hcb = HotColdBehavior(centers_hcb) - hcb.append( - plotROC( - ax, - lr, - centers_hcb.values, - y_hcb, - cv_folds=3, - return_mAUC=True, - kfold="Repeated", - ) + hot_cold_auc = plot_roc( + lr, + centers.loc[hot_cold.index].values, + hot_cold, + cv_folds=n_cv_folds, + return_mAUC=True, + kfold="Repeated", ) + results.loc[len(results)] = [ + seq_weight, + stk11_auc, + egfr_or_alk_auc, + hot_cold_auc, + ] - out = pd.DataFrame() - out["Run"] = run - out["Weight"] = ws - out["STK11"] = stk - out["EGFRm/ALKf"] = ea - out["Infiltration"] = hcb - - return out + return results -def barplot_PeptideToClusterDistances(models, ax, n=3000): - """Compute and plot p-signal-to-center and motif to cluster distance for n peptides across weights.""" - # Import signaling data, select random peptides, and find cluster assignments - X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:] - X = filter_NaNpeptides(X, tmt=2) - random_peptides = np.random.choice( - list(np.arange(len(models[0].labels()))), n, replace=False +def plot_phenotype_regression(results: pd.DataFrame, ax) -> None: + df_melted = results.melt( + id_vars=["Weight"], var_name="Task", value_name="Performance" ) - X["labels0"] = models[0].labels() - X["labels500"] = models[1].labels() - X["labels1M"] = models[2].labels() - X = X.iloc[random_peptides, :] - labels = X.loc[:, ["labels0", "labels500", "labels1M"]].values.T - d = X.select_dtypes(include=[float]).values - - psDist = np.zeros((3, d.shape[0])) - for ii, model in enumerate(models): - for jj in range(d.shape[0]): - # Data distance - center = model.transform()[:, labels[ii, jj] - 1] - idx_values = np.argwhere(~np.isnan(d[jj, :])) - psDist[ii, jj] = mean_squared_error(d[jj, idx_values], center[idx_values]) - - psDist = pd.DataFrame(psDist).T - psDist.columns = ["p-Abundance", "Mix", "Sequence"] - ps_data = pd.melt( - psDist, - value_vars=["p-Abundance", "Mix", "Sequence"], - var_name="Model", + gmm = df_melted[df_melted["Weight"] == 0].copy() + ddmc = df_melted[df_melted["Weight"] != 0].copy() + best_ddmc = ddmc.loc[ddmc.groupby("Task")["Performance"].idxmax()] + gmm["Type"] = "GMM" + best_ddmc["Type"] = "DDMC" + plot_data = pd.concat([gmm, best_ddmc]) + sns.barplot(data=plot_data, x="Task", y="Performance", hue="Type", ax=ax) + ax.set_title("Performance by Task and Weight Type") + ax.set_xlabel("Performance") + ax.set_ylabel("Regression Task") + + +def plot_peptide_to_cluster_p_signal_distances( + p_signal: pd.DataFrame, models: List[DDMC], ax, n_peptides=100 +): + peptide_idx = np.random.choice(len(p_signal), n_peptides) + seq_weights = [model.seq_weight for model in models] + dists = pd.DataFrame( + np.zeros((n_peptides, len(models)), dtype=float), columns=seq_weights + ) + for i, model in enumerate(models): + labels = model.labels() + centers = model.transform().T + dists.iloc[:, i] = np.nanmean( + (p_signal.iloc[peptide_idx] - centers[labels[peptide_idx]]) ** 2, axis=1 + ) + dists_melted = pd.melt( + dists, + value_vars=seq_weights, + var_name="Sequence Weight", value_name="p-signal MSE", ) - sns.barplot(data=ps_data, x="Model", y="p-signal MSE", ci=None, ax=ax) - ax.set_title("Peptide-to-Cluster signal MSE") + sns.barplot(dists_melted, x="Sequence Weight", y="p-signal MSE", ax=ax) + ax.set_title("Peptide-to-cluster p-signal MSE") -def boxplot_TotalPositionEnrichment(models, ax): +def plot_total_position_enrichment(models: List[DDMC], ax): """Position enrichment of cluster PSSMs""" - enr = np.zeros((3, models[0].n_components), dtype=float) - for ii, model in enumerate(models): - pssms = model.pssms()[0] - for jj in range(len(pssms)): - enr[ii, jj] = np.sum(pssms[jj].sum().drop(5)) - - enr = pd.DataFrame(enr) - enr.columns = np.arange(models[0].n_components) + 1 - enr.insert(0, "Model", ["p-Abundance", "Mix", "Sequence"]) - dd = pd.melt( - frame=enr, - id_vars="Model", - value_vars=enr.columns[1:], - var_name="Cluster", - value_name="Total information (bits)", + enrichment = pd.DataFrame( + columns=["Sequence Weight", "Component", "Total information (bits)"] ) - sns.stripplot(data=dd, x="Model", y="Total information (bits)", ax=ax) - sns.boxplot(data=dd, x="Model", y="Total information (bits)", ax=ax, fliersize=0) - ax.set_title("Cumulative PSSM Enrichment") - - -def merge_binary_vectors(y, mutant1, mutant2): - """Merge binary mutation status vectors to identify all patients having one of the two mutations""" - y1 = y[mutant1] - y2 = y[mutant2] - y_ = np.zeros(y.shape[0]) - for binary in [y1, y2]: - indices = [i for i, x in enumerate(binary) if x == 1] - y_[indices] = 1 - return pd.Series(y_) - - -def find_patients_with_NATandTumor(X, label, conc=False): - """Reshape data to display patients as rows and samples (Tumor and NAT per cluster) as columns. - Note that to do so, samples that don't have their tumor/NAT counterpart are dropped. - """ - xT = X[~X[label].str.endswith(".N")].sort_values(by=label) - xN = X[X[label].str.endswith(".N")].sort_values(by=label) - l1 = list(xT[label]) - l2 = [s.split(".N")[0] for s in xN[label]] - dif = [i for i in l1 + l2 if i not in l1 or i not in l2] - X = xT.set_index(label).drop(dif) - assert all(X.index.values == np.array(l2)), "Samples don't match" - - if conc: - xN = xN.set_index(label) - xN.index = l2 - xN.columns = [str(i) + "_Normal" for i in xN.columns] - X.columns = [str(i) + "_Tumor" for i in X.columns] - X = pd.concat([X, xN], axis=1) - return X - -def TransformCenters(model, X): - """For a given model, find centers and transform for regression.""" - centers = pd.DataFrame(model.transform()).T - centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform( - centers.iloc[:, :] - ) - centers = centers.T - centers.columns = np.arange(model.n_components) + 1 - centers["Patient_ID"] = X.columns[4:] - centers1 = find_patients_with_NATandTumor(centers.copy(), "Patient_ID", conc=True) - centers2 = ( - centers.loc[~centers["Patient_ID"].str.endswith(".N"), :] - .sort_values(by="Patient_ID") - .set_index("Patient_ID") - ) - return centers1, centers2 - - -def HotColdBehavior(centers): - # Import Cold-Hot Tumor data - y = ( - pd.read_csv("ddmc/data/MS/CPTAC/Hot_Cold.csv") - .dropna(axis=1) - .sort_values(by="Sample ID") + # loop because it's not guaranteed that each cluster will contain a peptide + for model in models: + pssm_names, pssms = model.get_pssms() + for cluster, pssm in zip(pssm_names, pssms): + enrichment.loc[len(enrichment)] = [ + model.seq_weight, + cluster, + np.sum(np.delete(pssm, 5, axis=1)), + ] + + sns.stripplot(enrichment, x="Sequence Weight", y="Total information (bits)", ax=ax) + sns.boxplot( + enrichment, + x="Sequence Weight", + y="Total information (bits)", + ax=ax, + fliersize=0, ) - y = y.loc[~y["Sample ID"].str.endswith(".N"), :].set_index("Sample ID") - l1 = list(centers.index) - l2 = list(y.index) - dif = [i for i in l1 + l2 if i not in l1 or i not in l2] - centers = centers.drop(dif) - - # Transform to binary - y = y.replace("Cold-tumor enriched", 0) - y = y.replace("Hot-tumor enriched", 1) - y = np.squeeze(y) - - # Remove NAT-enriched samples - centers = centers.drop(y[y == "NAT enriched"].index) - y = y.drop(y[y == "NAT enriched"].index).astype(int) - assert all(centers.index.values == y.index.values), "Samples don't match" - - return y, centers + ax.set_title("Cumulative PSSM Enrichment") diff --git a/ddmc/figures/figureM5.py b/ddmc/figures/figureM5.py index 354a71e4d..7b1620222 100644 --- a/ddmc/figures/figureM5.py +++ b/ddmc/figures/figureM5.py @@ -1,93 +1,53 @@ -""" -This creates Figure 5: Tumor vs NAT analysis -""" - import numpy as np -import pandas as pd -import seaborn as sns -import matplotlib -import textwrap -import mygene -from scipy.stats import mannwhitneyu from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegressionCV from sklearn.preprocessing import StandardScaler -from statsmodels.stats.multitest import multipletests -from ..clustering import DDMC -from .common import subplotLabel, getSetup -from ..logistic_regression import plotClusterCoefficients, plotROC -from .common import plotDistanceToUpstreamKinase -from ..pca import plotPCA -from ..pre_processing import filter_NaNpeptides - - -def makeFigure(): - """Get a list of the axis objects and create a figure""" - # Get list of axis objects - ax, f = getSetup((11, 10), (3, 3), multz={0: 1, 4: 1}) - # Add subplot labels - subplotLabel(ax) +from ddmc.clustering import DDMC +from ddmc.datasets import CPTAC, select_peptide_subset +from ddmc.figures.common import ( + getSetup, + plot_cluster_kinase_distances, + get_pvals_across_clusters, + plot_p_signal_across_clusters_and_binary_feature, + plot_pca_on_cluster_centers, +) +from ddmc.logistic_regression import ( + plot_cluster_regression_coefficients, + plot_roc, + normalize_cluster_centers, + get_highest_weighted_clusters, +) - # Set plotting format - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="whitegrid", - font_scale=1, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, - ) - - # Import signaling data - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, - ) - d = X.select_dtypes(include=[float]).T - i = X.select_dtypes(include=[object]) - # Fit DDMC - model = DDMC( - i, n_components=30, SeqWeight=100, distance_method="Binomial", random_state=5 - ).fit(d) +def makeFigure(): + axes, f = getSetup((11, 10), (3, 3), multz={0: 1, 4: 1}) + cptac = CPTAC() + p_signal = cptac.get_p_signal() + model = DDMC(n_components=30, seq_weight=100).fit(p_signal) - # Normalize - centers = pd.DataFrame(model.transform()).T - centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform( - centers.iloc[:, :] - ) - centers = centers.T - centers.columns = np.arange(model.n_components) + 1 - centers["Patient_ID"] = X.columns[4:] - centers = TumorType(centers).set_index("Patient_ID") - centers = centers.drop([14, 24], axis=1) # Drop cluster 19, contains only 1 peptide + centers = model.transform(as_df=True) + centers.iloc[:, :] = normalize_cluster_centers(centers.values) + is_tumor = cptac.get_tumor_or_nat(centers.index) # first plot heatmap of clusters # lim = 1.5 # sns.clustermap(centers.set_index("Type").T, method="complete", cmap="bwr", vmax=lim, vmin=-lim, figsize=(15, 9)) Run in notebook and save as svg - ax[0].axis("off") - - # PCA and Hypothesis Testing - pvals = calculate_mannW_pvals(centers, "Type", "NAT", "Tumor") - pvals = build_pval_matrix(model.n_components, pvals) - plotPCA( - ax[1:3], - centers.reset_index(), - 2, - ["Patient_ID", "Type"], - "Cluster", - hue_scores="Type", - style_scores="Type", - pvals=pvals.iloc[:, -1].values, + axes[0].axis("off") + + plot_pca_on_cluster_centers( + centers, + axes[1:3], + hue_scores=is_tumor, + hue_scores_title="Tumor?", + hue_loadings=get_pvals_across_clusters(is_tumor, centers) < 0.01, + hue_loadings_title="p < 0.01", ) - plot_clusters_binaryfeatures(centers, "Type", ax[3], pvals=pvals, loc="lower left") - # Transform to Binary - c = centers.select_dtypes(include=["float64"]) - tt = centers.iloc[:, -1] - tt = tt.replace("NAT", 0) - tt = tt.replace("Tumor", 1) + print(is_tumor) + plot_p_signal_across_clusters_and_binary_feature( + is_tumor, centers, "is_tumor", axes[3] + ) # Logistic Regression lr = LogisticRegressionCV( @@ -99,185 +59,16 @@ def makeFigure(): l1_ratios=[0.85], class_weight="balanced", ) - plotROC(ax[4], lr, c.values, tt, cv_folds=4, return_mAUC=False) - plotClusterCoefficients(ax[5], lr) - ax[5].set_xticklabels(centers.columns[:-1]) - - # Upstream Kinases - plotDistanceToUpstreamKinase(model, [6, 15, 20], ax[6], num_hits=2) - - return f + plot_roc(lr, centers.values, is_tumor, cv_folds=4, return_mAUC=False, ax=axes[4]) + plot_cluster_regression_coefficients(axes[5], lr) -def make_BPtoGenes_table(X, cluster): - d = X[["Clusters", "Description", "geneID"]] - d = d[d["Clusters"] == cluster] - gAr = d[["geneID"]].values - bpAr = d[["Description"]].values - mg = mygene.MyGeneInfo() - BPtoGenesDict = {} - for ii, arr in enumerate(gAr): - gg = mg.querymany( - list(arr[0].split("/")), - scopes="entrezgene", - fields="symbol", - species="human", - returnall=False, - as_dataframe=True, - ) - BPtoGenesDict[bpAr[ii][0]] = list(gg["symbol"]) - return pd.DataFrame(dict([(k, pd.Series(v)) for k, v in BPtoGenesDict.items()])) + top_clusters = get_highest_weighted_clusters(model, lr.coef_) + # plot predicted kinases for most weighted clusters + distances = model.predict_upstream_kinases()[top_clusters] -def plot_enriched_processes(ax, X, y, f, cluster, gene_set="WP"): - """"Plot BPs enriched per cluster""" "" - if gene_set == "WP": - gsea = pd.read_csv("ddmc/data/cluster_analysis/CPTAC_GSEA_WP_results.csv").iloc[ - :, 1: - ] - elif gene_set == "onco": - gsea = pd.read_csv( - "ddmc/data/cluster_analysis/CPTAC_GSEA_ONCO_results.csv" - ).iloc[:, 1:] - elif gene_set == "Immuno": - gsea = pd.read_csv("ddmc/data/cluster_analysis/CPTAC_GSEA_WP_results.csv").iloc[ - :, 1: - ] - cc = make_BPtoGenes_table(gsea, cluster) - cl = X[X["Cluster"] == cluster].set_index("Gene") - dfs = [] - for ii in range(cc.shape[1]): - ss = cl.loc[cc.iloc[:, ii].dropna()].reset_index() - ss["Process"] = cc.columns[ii] - dfs.append(ss) - - out = pd.concat(dfs).set_index("Process").select_dtypes(include=[float]).T - out[f[0]] = y - out[f[0]] = out[f[0]].replace(0, f[1]) - out[f[0]] = out[f[0]].replace(1, f[2]) - dm = pd.melt( - out, - id_vars=f[0], - value_vars=out.columns, - var_name="Process", - value_name="mean log(p-signal)", - ) - dm.iloc[:, -1] = dm.iloc[:, -1].astype(float) - sns.boxplot( - data=dm, - x="Process", - y="mean log(p-signal)", - hue=f[0], - showfliers=False, - linewidth=0.5, - ax=ax, + plot_cluster_kinase_distances( + distances, model.get_pssms(clusters=top_clusters), axes[6], num_hits=2 ) - ax.set_xticklabels([textwrap.fill(t, 10) for t in list(cc.columns)], rotation=0) - ax.set_title("Processes Cluster " + str(cluster)) - - -def plot_NetPhoresScoreByKinGroup(PathToFile, ax, n=5, title=False, color="royalblue"): - """Plot top scoring kinase groups""" - NPtoCumScore = {} - X = pd.read_csv(PathToFile) - for ii in range(X.shape[0]): - curr_NPgroup = X["netphorest_group"][ii] - if curr_NPgroup == "any_group": - continue - elif curr_NPgroup not in NPtoCumScore.keys(): - NPtoCumScore[curr_NPgroup] = X["netphorest_score"][ii] - else: - NPtoCumScore[curr_NPgroup] += X["netphorest_score"][ii] - X = pd.DataFrame.from_dict(NPtoCumScore, orient="index").reset_index() - X.columns = ["KIN Group", "NetPhorest Score"] - X["KIN Group"] = [s.split("_")[0] for s in X["KIN Group"]] - X = X.sort_values(by="NetPhorest Score", ascending=False).iloc[:n, :] - sns.stripplot( - data=X, - y="KIN Group", - x="NetPhorest Score", - ax=ax, - orient="h", - color=color, - size=5, - **{"linewidth": 1}, - **{"edgecolor": "black"} - ) - if title: - ax.set_title(title) - else: - ax.set_title("Kinase Predictions") - - -def TumorType(X): - """Add NAT vs Tumor column.""" - tumortype = [] - for i in range(X.shape[0]): - if X["Patient_ID"][i].endswith(".N"): - tumortype.append("NAT") - else: - tumortype.append("Tumor") - X["Type"] = tumortype - return X - - -def plot_clusters_binaryfeatures(centers, id_var, ax, pvals=False, loc="best"): - """Plot p-signal of binary features (tumor vs NAT or mutational status) per cluster""" - data = pd.melt( - id_vars=id_var, - value_vars=centers.columns[:-1], - value_name="p-signal", - var_name="Cluster", - frame=centers, - ) - sns.violinplot( - x="Cluster", - y="p-signal", - hue=id_var, - data=data, - dodge=True, - ax=ax, - linewidth=0.25, - ) - ax.legend(prop={"size": 8}, loc=loc) - - if not isinstance(pvals, bool): - for ii, s in enumerate(pvals["Significant"]): - y, h, col = data["p-signal"].max(), 0.05, "k" - if s == "NS": - continue - elif s == "<0.05": - mark = "*" - else: - mark = "**" - ax.text(ii, y + h, mark, ha="center", va="bottom", color=col, fontsize=20) - - -def calculate_mannW_pvals(centers, col, feature1, feature2): - """Compute Mann Whitney rank test p-values corrected for multiple tests.""" - pvals, clus = [], [] - for ii in centers.columns[:-1]: - x = centers.loc[:, [ii, col]] - x1 = x[x[col] == feature1].iloc[:, 0] - x2 = x[x[col] == feature2].iloc[:, 0] - pval = mannwhitneyu(x1, x2)[1] - pvals.append(pval) - clus.append(ii) - return dict(zip(clus, multipletests(pvals)[1])) - - -def build_pval_matrix(ncl, pvals): - """Build data frame with pvalues per cluster""" - data = pd.DataFrame() - data["Clusters"] = pvals.keys() - data["p-value"] = pvals.values() - signif = [] - for val in pvals.values(): - if 0.01 < val < 0.05: - signif.append("<0.05") - elif val < 0.01: - signif.append("<0.01") - else: - signif.append("NS") - data["Significant"] = signif - return data + return f diff --git a/ddmc/figures/figureM6.py b/ddmc/figures/figureM6.py index 46c14fa97..cb3091ff1 100644 --- a/ddmc/figures/figureM6.py +++ b/ddmc/figures/figureM6.py @@ -1,102 +1,45 @@ -""" -This creates Figure 6: STK11 analysis -""" - import numpy as np import pandas as pd -import seaborn as sns -import matplotlib -from sklearn.linear_model import LogisticRegressionCV -from sklearn.preprocessing import StandardScaler +from bioinfokit import visuz from scipy.stats import mannwhitneyu +from sklearn.linear_model import LogisticRegressionCV from statsmodels.stats.multitest import multipletests -from bioinfokit import visuz -from ..clustering import DDMC -from ..pre_processing import filter_NaNpeptides -from .common import plotDistanceToUpstreamKinase -from .figureM4 import find_patients_with_NATandTumor -from .figureM5 import ( - plot_clusters_binaryfeatures, - build_pval_matrix, - calculate_mannW_pvals, - plot_enriched_processes, + +from ddmc.clustering import DDMC +from ddmc.datasets import CPTAC +from ddmc.figures.common import ( + getSetup, + plot_cluster_kinase_distances, + plot_p_signal_across_clusters_and_binary_feature, +) +from ddmc.logistic_regression import ( + plot_roc, + plot_cluster_regression_coefficients, + normalize_cluster_centers, + get_highest_weighted_clusters, ) -from ..logistic_regression import plotROC, plotClusterCoefficients -from .common import subplotLabel, getSetup def makeFigure(): - """Get a list of the axis objects and create a figure""" - # Get list of axis objects - ax, f = getSetup((11, 7), (2, 3), multz={0: 1}) - - # Add subplot labels - subplotLabel(ax) - - # Set plotting format - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="whitegrid", - font_scale=1, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, - ) - - # Import signaling data - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, - ) - d = X.select_dtypes(include=[float]).T - i = X.select_dtypes(include=[object]) - - # Fit DDMC - model = DDMC( - i, n_components=30, SeqWeight=100, distance_method="Binomial", random_state=5 - ).fit(d) + axes, f = getSetup((11, 7), (2, 3), multz={0: 1}) + cptac = CPTAC() + p_signal = cptac.get_p_signal() + model = DDMC(n_components=30, seq_weight=100).fit(p_signal) # Import Genotype data - mutations = pd.read_csv("ddmc/data/MS/CPTAC/Patient_Mutations.csv") - mOI = mutations[ - ["Sample.ID"] + list(mutations.columns)[45:54] + list(mutations.columns)[61:64] - ] - y = mOI[~mOI["Sample.ID"].str.contains("IR")] + egfrm = cptac.get_mutations(["EGFR.mutation.status"])["EGFR.mutation.status"] # Find centers - centers = pd.DataFrame(model.transform()) - centers.columns = np.arange(model.n_components) + 1 - centers["Patient_ID"] = X.columns[4:] - centers = centers.set_index("Patient_ID") - centers = centers.drop( - [14, 24], axis=1 - ) # Drop clusters 14&24, contain only 1 peptide - - # Hypothesis Testing - assert np.all(y["Sample.ID"] == centers.index) - centers["EGFRm"] = y["EGFR.mutation.status"].values - pvals = calculate_mannW_pvals(centers, "EGFRm", 1, 0) - pvals = build_pval_matrix(model.n_components, pvals) - centers["EGFRm"] = centers["EGFRm"].replace(0, "EGFR WT") - centers["EGFRm"] = centers["EGFRm"].replace(1, "EGFRm") - plot_clusters_binaryfeatures(centers, "EGFRm", ax[0], pvals=pvals) - ax[0].legend(loc="lower left", prop={"size": 10}) - - # Reshape data (Patients vs NAT and tumor sample per cluster) - centers = centers.reset_index().set_index("EGFRm") - centers = find_patients_with_NATandTumor(centers, "Patient_ID", conc=False) - y_ = find_patients_with_NATandTumor(y.copy(), "Sample.ID", conc=False) - assert all(centers.index.values == y_.index.values), "Samples don't match" + centers = model.transform(as_df=True).loc[egfrm.index] - # Normalize - centers = centers.T - centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform( - centers.iloc[:, :] + plot_p_signal_across_clusters_and_binary_feature( + egfrm, centers, "egfr mutation", axes[0] ) - centers = centers.T + + # Normalize + centers.iloc[:, :] = normalize_cluster_centers(centers.values) # Logistic Regression - centers["EGFRm"] = y_["EGFR.mutation.status"].values lr = LogisticRegressionCV( cv=20, solver="saga", @@ -105,31 +48,25 @@ def makeFigure(): penalty="l1", class_weight="balanced", ) - plotROC( - ax[1], - lr, - centers.iloc[:, :-1].values, - centers["EGFRm"], - cv_folds=4, - title="ROC EGFRm", - ) - ax[1].legend(loc="lower right", prop={"size": 8}) - plotClusterCoefficients( - ax[2], lr.fit(centers.iloc[:, :-1].values, centers["EGFRm"]), title="" + plot_roc( + lr, centers.values, egfrm.values, cv_folds=3, title="ROC EGFRm", ax=axes[1] ) - ax[2].legend(loc="lower left", prop={"size": 10}) + axes[1].legend(loc="lower right", prop={"size": 8}) - # plot Upstream Kinases - plotDistanceToUpstreamKinase( - model, [5, 16, 27], ax[3], num_hits=3, PsP_background=False - ) + plot_cluster_regression_coefficients(axes[2], lr, title="") + + top_clusters = get_highest_weighted_clusters(model, lr.coef_) - # volcano protein expression - ax[-1].axis("off") + distances = model.predict_upstream_kinases()[top_clusters] + # plot Upstream Kinases + plot_cluster_kinase_distances( + distances, model.get_pssms(clusters=top_clusters), axes[3], num_hits=2 + ) return f +# THIS FUNCTION IS NOT MAINTAINED def make_EGFRvolcano_plot(centers, y): """Make volcano plot with differential protein expression between EGFRm and WT.""" y = y.drop("C3N.00545") diff --git a/ddmc/figures/figureM7.py b/ddmc/figures/figureM7.py index 109e0127d..cb593a13a 100644 --- a/ddmc/figures/figureM7.py +++ b/ddmc/figures/figureM7.py @@ -1,126 +1,57 @@ -""" -This creates Figure 7: Tumor infiltrating immune cells -""" - import numpy as np import pandas as pd import seaborn as sns -import matplotlib import textwrap from sklearn.linear_model import LogisticRegressionCV -from sklearn.preprocessing import StandardScaler -from .common import subplotLabel, getSetup, plotDistanceToUpstreamKinase -from .figureM5 import ( - build_pval_matrix, - calculate_mannW_pvals, - plot_clusters_binaryfeatures, + +from ddmc.clustering import DDMC +from ddmc.datasets import CPTAC, select_peptide_subset +from ddmc.figures.common import ( + plot_cluster_kinase_distances, + getSetup, + plot_p_signal_across_clusters_and_binary_feature, +) +from ddmc.logistic_regression import ( + plot_roc, + plot_cluster_regression_coefficients, + normalize_cluster_centers, + get_highest_weighted_clusters, ) -from ..clustering import DDMC -from ..logistic_regression import plotROC, plotClusterCoefficients -from ..pre_processing import filter_NaNpeptides def makeFigure(): - """Get a list of the axis objects and create a figure""" - # Get list of axis objects - ax, f = getSetup((11, 7), (2, 3), multz={0: 1}) + axes, f = getSetup((11, 7), (2, 3), multz={0: 1, 4: 1}) + cptac = CPTAC() + is_hot = cptac.get_hot_cold_labels() + p_signal = cptac.get_p_signal() + model = DDMC(n_components=30, seq_weight=100, random_state=5).fit(p_signal) - # Set plotting format - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="whitegrid", - font_scale=1, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, - ) + centers = model.transform(as_df=True).loc[is_hot.index] + centers.iloc[:, :] = normalize_cluster_centers(centers.values) - # Add subplot labels - subplotLabel(ax) + plot_p_signal_across_clusters_and_binary_feature(is_hot, centers, "is_hot", axes[0]) - # Import signaling data - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, + n_cv = 15 + lr = LogisticRegressionCV( + cv=n_cv, solver="saga", n_jobs=1, penalty="l1", max_iter=10000 ) - d = X.select_dtypes(include=[float]).T - i = X.select_dtypes(include=[object]) - - # Fit DDMC - model = DDMC( - i, n_components=30, SeqWeight=100, distance_method="Binomial", random_state=5 - ).fit(d) - - X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:] - X = filter_NaNpeptides(X, tmt=2) - centers = pd.DataFrame(model.transform()) - centers.columns = np.arange(model.n_components) + 1 - centers["Patient_ID"] = X.columns[4:] - centers = ( - centers.loc[~centers["Patient_ID"].str.endswith(".N"), :] - .sort_values(by="Patient_ID") - .set_index("Patient_ID") + plot_roc( + lr, centers.values, is_hot.values, cv_folds=n_cv, title="ROC TI", return_mAUC=True ) - centers = centers.drop( - [14, 24], axis=1 - ) # Drop clusters 14&24, contain only 1 peptide - - # Import Cold-Hot Tumor data - cent1, y = FormatXYmatrices(centers.copy()) + plot_cluster_regression_coefficients(axes[1], lr, title="") - # Normalize - cent1 = cent1.T - cent1.iloc[:, :] = StandardScaler(with_std=False).fit_transform(cent1.iloc[:, :]) - cent1 = cent1.T + top_clusters = get_highest_weighted_clusters(model, lr.coef_) - # Hypothesis Testing - cent1["TI"] = y.values - pvals = calculate_mannW_pvals(cent1, "TI", 1, 0) - pvals = build_pval_matrix(model.n_components, pvals) - cent1["TI"] = cent1["TI"].replace(0, "CTE") - cent1["TI"] = cent1["TI"].replace(1, "HTE") - plot_clusters_binaryfeatures(cent1, "TI", ax[0], pvals=pvals, loc="lower left") - ax[0].legend(loc="lower left", prop={"size": 10}) + distances = model.predict_upstream_kinases()[top_clusters] - # Logistic Regression - lr = LogisticRegressionCV(cv=15, solver="saga", n_jobs=-1, penalty="l1") - plotROC(ax[1], lr, cent1.iloc[:, :-1].values, y, cv_folds=4, title="ROC TI") - plotClusterCoefficients( - ax[2], lr.fit(cent1.iloc[:, :-1], y.values), title="TI weights" + plot_cluster_kinase_distances( + distances, model.get_pssms(clusters=top_clusters), axes[2], num_hits=2 ) - - # plot Upstream Kinases - plotDistanceToUpstreamKinase(model, [17, 20, 21], ax[3], num_hits=3) - return f -def FormatXYmatrices(centers): - """Make sure Y matrix has the same matching samples has the signaling centers""" - y = ( - pd.read_csv("ddmc/data/MS/CPTAC/Hot_Cold.csv") - .dropna(axis=1) - .sort_values(by="Sample ID") - ) - y = y.loc[~y["Sample ID"].str.endswith(".N"), :].set_index("Sample ID") - l1 = list(centers.index) - l2 = list(y.index) - dif = [i for i in l1 + l2 if i not in l1 or i not in l2] - centers = centers.drop(dif) - - # Transform to binary - y = y.replace("Cold-tumor enriched", 0) - y = y.replace("Hot-tumor enriched", 1) - y = np.squeeze(y) - - # Remove NAT-enriched samples - centers = centers.drop(y[y == "NAT enriched"].index) - y = y.drop(y[y == "NAT enriched"].index).astype(int) - assert all(centers.index.values == y.index.values), "Samples don't match" - return centers, y - - def plot_ImmuneGOs(cluster, ax, title=False, max_width=25, n=False, loc="best"): + # THIS FUNCION IS NOT MAINTAINED """Plot immune-related GO""" go = pd.read_csv("ddmc/data/cluster_analysis/CPTAC_GO_C" + str(cluster) + ".csv") im = go[go["GO biological process complete"].str.contains("immune")] diff --git a/ddmc/figures/figureMS2.py b/ddmc/figures/figureMS2.py index 7c7dbb01a..40654f1f9 100644 --- a/ddmc/figures/figureMS2.py +++ b/ddmc/figures/figureMS2.py @@ -2,57 +2,30 @@ This creates Supplemental Figure 2: Cluster motifs """ -import matplotlib -import pandas as pd import numpy as np -import seaborn as sns -from .common import subplotLabel, getSetup -from .common import plotMotifs -from ..pre_processing import filter_NaNpeptides -from ..clustering import DDMC +from ddmc.clustering import DDMC +from ddmc.datasets import CPTAC, select_peptide_subset +from ddmc.figures.common import getSetup, plot_motifs -def makeFigure(): - """Get a list of the axis objects and create a figure""" - # Get list of axis objects - ax, f = getSetup((9, 9), (5, 5)) - - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="white", - font_scale=1.2, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, - ) - - # Import signaling data - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, - ) - d = X.select_dtypes(include=[float]).T - i = X.select_dtypes(include=[object]) - # Fit DDMC - model = DDMC( - i, n_components=30, SeqWeight=100, distance_method="Binomial", random_state=5 - ).fit(d) +def makeFigure(): + # Increase number of peptides and components for actual figure + p_signal = CPTAC().get_p_signal() + model = DDMC(n_components=16, seq_weight=100, random_state=5).fit(p_signal) - pssms, cl_num = model.pssms(PsP_background=False) + ax, f = getSetup((9, 9), (4, 4)) + clusters, pssms = model.get_pssms(PsP_background=False) ylabels = np.arange(0, 21, 5) xlabels = [20, 21, 22, 23, 24, 25] - for ii, cc in enumerate(cl_num): - cluster = "Cluster " + str(cc) - plotMotifs([pssms[ii]], axes=[ax[ii]], titles=[cluster], yaxis=[0, 10]) - if ii not in ylabels: - ax[ii].set_ylabel("") - ax[ii].get_yaxis().set_visible(False) - if ii not in xlabels: - ax[ii].set_xlabel("") - ax[ii].get_xaxis().set_visible(False) - - # Add subplot labels - # subplotLabel(ax) Too many plots to label A-Z + for cluster in clusters: + cluster_label = "Cluster " + str(cluster) + plot_motifs(pssms[cluster], ax=ax[cluster], titles=cluster_label, yaxis=[0, 10]) + if cluster not in ylabels: + ax[cluster].set_ylabel("") + ax[cluster].get_yaxis().set_visible(False) + if cluster not in xlabels: + ax[cluster].set_xlabel("") + ax[cluster].get_xaxis().set_visible(False) return f diff --git a/ddmc/figures/figureMS3.py b/ddmc/figures/figureMS3.py index 73b749f9a..9d4d82c66 100644 --- a/ddmc/figures/figureMS3.py +++ b/ddmc/figures/figureMS3.py @@ -1,51 +1,23 @@ -""" -This creates Supplemental Figure 3: Predictive performance of DDMC clusters using different weights -""" - -import matplotlib -import pandas as pd -import seaborn as sns from sklearn.linear_model import LogisticRegressionCV -from .common import subplotLabel, getSetup -from .figureM4 import TransformCenters, HotColdBehavior, find_patients_with_NATandTumor -from ..pre_processing import filter_NaNpeptides -from ..clustering import DDMC -from ..logistic_regression import plotROC - +from sklearn.preprocessing import StandardScaler -def makeFigure(): - """Get a list of the axis objects and create a figure""" - # Get list of axis objects - ax, f = getSetup((15, 10), (3, 5)) +from ddmc.clustering import DDMC +from ddmc.datasets import CPTAC, select_peptide_subset +from ddmc.figures.common import getSetup +from ddmc.logistic_regression import plot_roc, normalize_cluster_centers - # Add subplot labels - subplotLabel(ax) - # Set plotting format - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="whitegrid", - font_scale=1.2, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, - ) +def makeFigure(): + cptac = CPTAC() - # Signaling - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, + mutations = cptac.get_mutations( + ["STK11.mutation.status", "EGFR.mutation.status", "ALK.fusion"] ) - d = X.select_dtypes(include=[float]).T - i = X.select_dtypes(include=[object]) + stk11 = mutations["STK11.mutation.status"] + egfr = mutations["EGFR.mutation.status"] + hot_cold = cptac.get_hot_cold_labels() - # Genotype data - mutations = pd.read_csv("ddmc/data/MS/CPTAC/Patient_Mutations.csv") - mOI = mutations[ - ["Sample.ID"] + list(mutations.columns)[45:54] + list(mutations.columns)[61:64] - ] - y = mOI[~mOI["Sample.ID"].str.contains("IR")] - y = find_patients_with_NATandTumor(y.copy(), "Sample.ID", conc=False) + p_signal = CPTAC().get_p_signal() # LASSO lr = LogisticRegressionCV( @@ -58,50 +30,48 @@ def makeFigure(): class_weight="balanced", ) - folds = 5 - weights = [0, 100, 500, 1000, 1000000] - for ii, w in enumerate(weights): - model = DDMC(i, n_components=30, SeqWeight=w, distance_method="Binomial").fit(d) + folds = 3 + weights = [0, 500, 1000000] + ax, f = getSetup((15, 10), (3, len(weights))) + for ii, weight in enumerate(weights): + model = DDMC( + n_components=30, + seq_weight=weight, + distance_method="Binomial", + random_state=5, + ).fit(p_signal) # Find and scale centers - centers_gen, centers_hcb = TransformCenters(model, X) - - if w == 0: - prio = " (data only)" - elif w == 50: - prio = " (motif mainly)" - else: - prio = " (mix)" + centers = model.transform(as_df=True) + centers.iloc[:, :] = normalize_cluster_centers(centers.values) # STK11 - plotROC( - ax[ii], + plot_roc( lr, - centers_gen.values, - y["STK11.mutation.status"], + centers.loc[stk11.index].values, + stk11, cv_folds=folds, - title="STK11m " + "w=" + str(model.SeqWeight) + prio, + return_mAUC=False, + ax=ax[ii], + title="STK11m " + "w=" + str(model.seq_weight), ) - - # EGFRm - plotROC( - ax[ii + 5], + plot_roc( lr, - centers_gen.values, - y["EGFR.mutation.status"], + centers.loc[egfr.index].values, + egfr, cv_folds=folds, - title="EGFRm " + "w=" + str(model.SeqWeight) + prio, + return_mAUC=False, + ax=ax[ii + len(weights)], + title="EGFRm " + "w=" + str(model.seq_weight), ) - - # Hot-Cold behavior - y_hcb, centers_hcb = HotColdBehavior(centers_hcb) - plotROC( - ax[ii + 10], + plot_roc( lr, - centers_hcb.values, - y_hcb, + centers.loc[hot_cold.index].values, + hot_cold, cv_folds=folds, - title="Infiltration " + "w=" + str(model.SeqWeight) + prio, + return_mAUC=False, + ax=ax[ii + len(weights) * 2], + title="Infiltration " + "w=" + str(model.seq_weight), ) return f diff --git a/ddmc/figures/figureMS4.py b/ddmc/figures/figureMS4.py index b445b361c..f6240f947 100644 --- a/ddmc/figures/figureMS4.py +++ b/ddmc/figures/figureMS4.py @@ -1,119 +1,79 @@ -""" -This creates Supplemental Figure 4: Predicting sample type with different modeling strategies -""" - -import matplotlib import numpy as np import pandas as pd import seaborn as sns -from sklearn.linear_model import LogisticRegressionCV -from sklearn.cluster import KMeans -from ..clustering import DDMC -from .common import subplotLabel, getSetup -from ..pre_processing import filter_NaNpeptides -from ..logistic_regression import plotROC -from .figureM5 import TumorType +from sklearn.cluster import ( + KMeans, + AffinityPropagation, + Birch, + SpectralClustering, + MeanShift, + AgglomerativeClustering, +) +from ddmc.clustering import DDMC +from ddmc.datasets import CPTAC, filter_incomplete_peptides +from ddmc.figures.common import getSetup +from sklearn.metrics import adjusted_mutual_info_score def makeFigure(): - """Get a list of the axis objects and create a figure""" # Get list of axis objects - ax, f = getSetup((9, 6), (2, 3), multz={1: 1}) + ax, f = getSetup((8, 8), (1, 1), labels=False) - # Set plotting format - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="whitegrid", - font_scale=1, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, + p_signal = filter_incomplete_peptides( + CPTAC().get_p_signal(), sample_presence_ratio=1 ) - # Add subplot labels - subplotLabel(ax) + methods = [ + Birch(n_clusters=15), + MeanShift(), + KMeans(n_clusters=15), + AffinityPropagation(), + SpectralClustering(n_clusters=15, affinity="nearest_neighbors"), + AgglomerativeClustering(n_clusters=15), + AgglomerativeClustering(n_clusters=15, linkage="average"), + AgglomerativeClustering(n_clusters=15, linkage="complete"), + ] - # Import data - X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:] - X = filter_NaNpeptides(X, cut=1) - i = X.select_dtypes(include=["object"]) - X["Gene/Pos"] = X["Gene"] + ": " + X["Position"] - d = X.set_index("Gene/Pos").select_dtypes(include=["float64"]).T.reset_index() - d.rename(columns={"index": "Patient_ID"}, inplace=True) - z = TumorType(d) - z.iloc[:, -1] = z.iloc[:, -1].replace("Normal", "NAT") - d = z.iloc[:, 1:-1] - y = z.iloc[:, -1] - y = y.replace("NAT", 0) - y = y.replace("Tumor", 1) + labels_by_method = np.empty((p_signal.shape[0], len(methods) + 1), dtype=int) - # DDMC ROC - ncl = 30 - model = DDMC( - i, n_components=ncl, SeqWeight=100, distance_method="Binomial", random_state=5 - ).fit(d) - lr = LogisticRegressionCV( - cv=3, - solver="saga", - max_iter=10000, - n_jobs=-1, - penalty="elasticnet", - l1_ratios=[0.85], - class_weight="balanced", - ) - plotROC(ax[2], lr, model.transform(), y, cv_folds=4, return_mAUC=False) - ax[2].set_title("DDMC ROC") + for ii, m in enumerate(methods): + m.fit(p_signal.values) + labels_by_method[:, ii] = m.labels_ - # Tumor vs NAT unclustered - plotROC(ax[0], lr, d.values, y, cv_folds=4, title="ROC unclustered") - ax[0].set_title("Unclustered ROC") - plot_unclustered_LRcoef(ax[1], lr, d, y, z) + labels_by_method[:, -1] = ( + DDMC( + n_components=30, + seq_weight=100, + distance_method="Binomial", + random_state=5, + ) + .fit(p_signal) + .predict() + ) - # k-means - labels = KMeans(n_clusters=ncl).fit(d.T).labels_ - x_ = X.copy() - x_["Cluster"] = labels - c_kmeans = x_.groupby("Cluster").mean().T - c_kmeans.columns = list(np.arange(ncl) + 1) - km_lr = lr.fit(c_kmeans, y) - plotROC(ax[3], km_lr, c_kmeans.values, y, cv_folds=4, title="ROC k-means") - ax[3].set_title("k-means ROC") + mutInfo = np.zeros( + (labels_by_method.shape[1], labels_by_method.shape[1]), dtype=float + ) + for ii in range(mutInfo.shape[0]): + for jj in range(ii + 1): + mutInfo[ii, jj] = adjusted_mutual_info_score( + labels_by_method[:, ii], labels_by_method[:, jj] + ) + mutInfo[jj, ii] = mutInfo[ii, jj] - # GMM - gmm = DDMC(i, n_components=ncl, SeqWeight=0, distance_method="Binomial").fit(d) - x_ = X.copy() - x_["Cluster"] = gmm.labels() - c_gmm = x_.groupby("Cluster").mean().T - gmm_lr = lr.fit(c_gmm, y) - plotROC(ax[4], gmm_lr, c_gmm.values, y, cv_folds=4, title="ROC GMM") - ax[4].set_title("GMM ROC") + sns.heatmap(mutInfo, ax=ax[0]) + labels = [ + "Birch", + "MeanShift", + "k-means", + "Affinity Propagation", + "SpectralClustering", + "Agglomerative Clustering—Ward", + "Agglomerative Clustering—Average", + "Agglomerative Clustering—Commplete", + "DDMC", + ] + ax[0].set_xticklabels(labels, rotation=90) + ax[0].set_yticklabels(labels, rotation=0) return f - - -def plot_unclustered_LRcoef(ax, lr, d, y, z, title=False): - """Plot logistic regression coefficients of unclustered data""" - weights = [] - w = pd.DataFrame() - for _ in range(3): - lr = LogisticRegressionCV( - cv=3, - solver="saga", - max_iter=10000, - n_jobs=-1, - penalty="elasticnet", - l1_ratios=[0.85], - class_weight="balanced", - ) - w["Coefficients"] = lr.fit(d, y).coef_[0] - w["p-sites"] = z.columns[2:] - weights.append(w) - - coefs = pd.concat(weights) - coefs.sort_values(by="Coefficients", ascending=False, inplace=True) - coefs = coefs[(coefs["Coefficients"] > 0.075) | (coefs["Coefficients"] < -0.075)] - sns.barplot(data=coefs, x="p-sites", y="Coefficients", ax=ax, color="darkblue") - ax.set_title("p-sites explaining tumor vs NATs ") - ax.set_xticklabels(list(set(coefs["p-sites"])), rotation=90) - ax.set_xlabel("p-sites") - return coefs diff --git a/ddmc/figures/figureMS5.py b/ddmc/figures/figureMS5.py new file mode 100644 index 000000000..07476cef1 --- /dev/null +++ b/ddmc/figures/figureMS5.py @@ -0,0 +1,60 @@ +import pandas as pd +import seaborn as sns +from sklearn.linear_model import LogisticRegressionCV +from sklearn.cluster import KMeans +from ddmc.clustering import DDMC +from ddmc.figures.common import getSetup +from ddmc.logistic_regression import plot_roc, normalize_cluster_centers +from ddmc.datasets import CPTAC, filter_incomplete_peptides + + +def makeFigure(): + axes, f = getSetup((9, 6), (2, 3), multz={3: 2}) + + # Import data + cptac = CPTAC() + p_signal = filter_incomplete_peptides(cptac.get_p_signal(), sample_presence_ratio=1) + is_tumor = cptac.get_tumor_or_nat(p_signal.columns) + + n_components = 30 + model_gmm = DDMC( + n_components=n_components, + seq_weight=0, + distance_method="Binomial", + random_state=5, + ).fit(p_signal) + + model_kmeans = KMeans(n_clusters=n_components).fit(p_signal.values) + + lr = LogisticRegressionCV( + cv=3, + solver="saga", + max_iter=10000, + n_jobs=-1, + penalty="elasticnet", + l1_ratios=[0.85], + class_weight="balanced", + ) + + plot_roc(lr, p_signal.T.values, is_tumor, ax=axes[0]) + axes[0].set_title("unclustered ROC") + + # plot regression coefficients + coefs = pd.Series(lr.coef_.squeeze(), index=p_signal.index) + coefs.sort_values(ascending=False, inplace=True) + coefs.dropna(inplace=True) + coefs = pd.concat([coefs.iloc[:10], coefs.iloc[-10:]]) + coefs = coefs.to_frame("Coefficient") + coefs.reset_index(names=["p-site"], inplace=True) + sns.barplot(data=coefs, x="p-site", y="Coefficient", color="darkblue", ax=axes[3]) + axes[3].set_xticklabels(axes[3].get_xticklabels(), rotation=45) + + centers = model_gmm.transform() + centers = normalize_cluster_centers(centers) + plot_roc(lr, centers, is_tumor, ax=axes[1]) + axes[1].set_title("GMM ROC") + + plot_roc(lr, model_kmeans.cluster_centers_.T, is_tumor, ax=axes[2]) + axes[2].set_title("kmeans roc") + + return f diff --git a/ddmc/figures/figureMS6.py b/ddmc/figures/figureMS6.py deleted file mode 100644 index ef7ebbe16..000000000 --- a/ddmc/figures/figureMS6.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -This creates Figure 6: STK11 analysis -""" -import matplotlib -import numpy as np -import pandas as pd -import seaborn as sns -from sklearn.linear_model import LogisticRegressionCV -from sklearn.preprocessing import StandardScaler -from ..clustering import DDMC -from ..pre_processing import filter_NaNpeptides -from .common import plotDistanceToUpstreamKinase -from .figureM4 import find_patients_with_NATandTumor -from .figureM5 import ( - plot_clusters_binaryfeatures, - build_pval_matrix, - calculate_mannW_pvals, -) -from ..logistic_regression import plotROC, plotClusterCoefficients -from .common import subplotLabel, getSetup - - -def makeFigure(): - """Get a list of the axis objects and create a figure""" - # Get list of axis objects - ax, f = getSetup((11, 7), (2, 3), multz={0: 1}) - - # Add subplot labels - subplotLabel(ax) - - # Phosphoproteomic aberrations associated with molecular signatures - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="whitegrid", - font_scale=1, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, - ) - - # Import signaling data - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, - ) - d = X.select_dtypes(include=[float]).T - i = X.select_dtypes(include=[object]) - - # Fit DDMC - model = DDMC( - i, n_components=30, SeqWeight=100, distance_method="Binomial", random_state=5 - ).fit(d) - - # Import Genotype data - mutations = pd.read_csv("ddmc/data/MS/CPTAC/Patient_Mutations.csv") - mOI = mutations[ - ["Sample.ID"] + list(mutations.columns)[45:54] + list(mutations.columns)[61:64] - ] - y = mOI[~mOI["Sample.ID"].str.contains("IR")] - - # Find centers - X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:] - centers = pd.DataFrame(model.transform()) - centers.columns = np.arange(model.n_components) + 1 - centers["Patient_ID"] = X.columns[4:] - centers = centers.set_index("Patient_ID") - centers = centers.drop( - [14, 24], axis=1 - ) # Drop clusters 14&24, contain only 1 peptide - - # Hypothesis Testing - assert np.all(y["Sample.ID"] == centers.index) - centers["STK11"] = y["STK11.mutation.status"].values - pvals = calculate_mannW_pvals(centers, "STK11", 1, 0) - pvals = build_pval_matrix(model.n_components, pvals) - centers["STK11"] = centers["STK11"].replace(0, "STK11 WT") - centers["STK11"] = centers["STK11"].replace(1, "STK11m") - plot_clusters_binaryfeatures(centers, "STK11", ax[0], pvals=pvals) - ax[0].legend(loc="lower left", prop={"size": 10}) - - # Reshape data (Patients vs NAT and tumor sample per cluster) - centers = centers.reset_index().set_index("STK11") - centers = find_patients_with_NATandTumor(centers, "Patient_ID", conc=False) - y_ = find_patients_with_NATandTumor(y.copy(), "Sample.ID", conc=False) - assert all(centers.index.values == y_.index.values), "Samples don't match" - - # Normalize - centers = centers.T - centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform( - centers.iloc[:, :] - ) - centers = centers.T - - # Logistic Regression - centers["STK11"] = y_["STK11.mutation.status"].values - lr = LogisticRegressionCV( - cv=5, - solver="saga", - max_iter=10000, - n_jobs=-1, - penalty="l1", - class_weight="balanced", - random_state=10, - ) - plotROC( - ax[1], - lr, - centers.iloc[:, :-1].values, - centers["STK11"], - cv_folds=4, - title="ROC STK11", - ) - ax[1].legend(loc="lower right", prop={"size": 8}) - plotClusterCoefficients( - ax[2], - lr.fit(centers.iloc[:, :-1], centers["STK11"].values), - xlabels=list(centers.columns[:-1]), - title="", - ) - ax[2].legend(loc="lower left", prop={"size": 10}) - - # plot Upstream Kinases - plotDistanceToUpstreamKinase(model, [9, 11, 16, 18], ax[3], num_hits=1) - - ax[-1].axis("off") - - return f diff --git a/ddmc/figures/figureMS7.py b/ddmc/figures/figureMS7.py index c57ff8ec4..29d5e2bae 100644 --- a/ddmc/figures/figureMS7.py +++ b/ddmc/figures/figureMS7.py @@ -1,79 +1,38 @@ -""" -This creates Supplemental Figure 7: Predicting STK11 genotype using different clustering strategies. -""" - -import matplotlib import numpy as np -import pandas as pd -import seaborn as sns from sklearn.linear_model import LogisticRegressionCV -from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler + from ddmc.clustering import DDMC -from .common import subplotLabel, getSetup -from ..pre_processing import filter_NaNpeptides -from ..logistic_regression import plotROC -from .figureM4 import find_patients_with_NATandTumor +from ddmc.datasets import CPTAC, select_peptide_subset +from ddmc.figures.common import ( + plot_cluster_kinase_distances, + plot_p_signal_across_clusters_and_binary_feature, + getSetup, +) +from ddmc.logistic_regression import ( + plot_roc, + plot_cluster_regression_coefficients, + normalize_cluster_centers, + get_highest_weighted_clusters, +) def makeFigure(): - """Get a list of the axis objects and create a figure""" - # Get list of axis objects - ax, f = getSetup((20, 10), (2, 5)) - - # Add subplot labels - subplotLabel(ax) - - # Set plotting format - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="whitegrid", - font_scale=1.2, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, - ) + axes, f = getSetup((11, 7), (2, 3), multz={0: 1}) + cptac = CPTAC() + p_signal = cptac.get_p_signal() + stk11m = cptac.get_mutations(["STK11.mutation.status"])["STK11.mutation.status"] - # Signaling - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - cut=1, + model = DDMC(n_components=30, seq_weight=100, random_state=5).fit(p_signal) + centers = model.transform(as_df=True) + centers = centers.loc[stk11m.index] + plot_p_signal_across_clusters_and_binary_feature( + stk11m.values, centers, "STK11M", axes[0] ) - # Fit DDMC to complete data - d = np.array(X.select_dtypes(include=["float64"]).T) - i = X.select_dtypes(include=["object"]) + centers.iloc[:, :] = normalize_cluster_centers(centers.values) - assert np.all(np.isfinite(d)) - model_min = DDMC(i, n_components=30, SeqWeight=100, distance_method="Binomial").fit( - d - ) - - centers_min = pd.DataFrame(model_min.transform()).T - centers_min.iloc[:, :] = StandardScaler(with_std=False).fit_transform( - centers_min.iloc[:, :] - ) - centers_min = centers_min.T - centers_min.columns = np.arange(model_min.n_components) + 1 - centers_min["Patient_ID"] = X.columns[4:] - centers_min = find_patients_with_NATandTumor( - centers_min.copy(), "Patient_ID", conc=True - ) - - # Fit DDMC - model = DDMC(i, n_components=30, SeqWeight=100, distance_method="Binomial").fit(d) - - # Find and scale centers - centers = pd.DataFrame(model.transform()).T - centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform( - centers.iloc[:, :] - ) - centers = centers.T - centers.columns = np.arange(model.n_components) + 1 - centers["Patient_ID"] = X.columns[4:] - centers = find_patients_with_NATandTumor(centers.copy(), "Patient_ID", conc=True) - - # Predicting STK11 + # Logistic Regression lr = LogisticRegressionCV( cv=5, solver="saga", @@ -83,95 +42,21 @@ def makeFigure(): class_weight="balanced", random_state=10, ) - mutations = pd.read_csv("ddmc/data/MS/CPTAC/Patient_Mutations.csv") - mOI = mutations[ - ["Sample.ID"] + list(mutations.columns)[45:54] + list(mutations.columns)[61:64] - ] - y = mOI[~mOI["Sample.ID"].str.contains("IR")] - y = find_patients_with_NATandTumor(y.copy(), "Sample.ID", conc=False) - assert all(centers.index.values == y.index.values), "Samples don't match" - y_STK = y["STK11.mutation.status"] - plot_ROCs(ax[:5], centers, centers_min, X, i, y_STK, lr, "STK11") - - # Predicting EGFRm - lr = LogisticRegressionCV( - cv=20, - solver="saga", - max_iter=10000, - n_jobs=-1, - penalty="l1", - class_weight="balanced", - ) - y_EA = y["EGFR.mutation.status"] - plot_ROCs(ax[5:], centers, centers_min, X, i, y_EA, lr, "EGFRm") - return f - - -def plot_ROCs(ax, centers, centers_min, X, i, y, lr, gene_label): - """Generate ROC plots using DDMC, unclustered, k-means, and GMM for a particular feature.""" - folds = 7 - - # DDMC full - plotROC( - ax[0], - lr, - centers.values, - y, - cv_folds=folds, - title="DDMC—Full data set" + gene_label, + plot_roc( + lr, centers.values, stk11m.values, cv_folds=4, title="ROC STK11", ax=axes[1] ) - # DDMC minimal - plotROC( - ax[1], + plot_cluster_regression_coefficients( + axes[2], lr, - centers_min.values, - y, - cv_folds=folds, - title="DDMC—Complete portion" + gene_label, ) - # Unclustered - X_f = X.loc[:, centers.index].T - X_f.index = np.arange(X_f.shape[0]) - plotROC(ax[2], lr, X_f.values, y, cv_folds=folds, title="Unclustered " + gene_label) + top_clusters = get_highest_weighted_clusters(model, lr.coef_) - # Run k-means - ncl = 30 - d = X.select_dtypes(include=["float64"]).T.reset_index() - d.rename(columns={"index": "Patient_ID"}, inplace=True) - d = d.iloc[:, 1:] - x_ = X.copy() - x_["Cluster"] = KMeans(n_clusters=ncl).fit(d.T).labels_ - c_kmeans = x_.groupby("Cluster").mean().T - c_kmeans["Patient_ID"] = X.columns[4:] - c_kmeans.columns = list(np.arange(ncl) + 1) + ["Patient_ID"] - c_kmeans.iloc[:, :-1] = StandardScaler(with_std=False).fit_transform( - c_kmeans.iloc[:, :-1] + plot_cluster_kinase_distances( + model.predict_upstream_kinases()[top_clusters], + model.get_pssms(PsP_background=True, clusters=top_clusters)[0], + ax=axes[3], ) - - # Reshape data (Patients vs NAT and tumor sample per cluster) - c_kmeansT = find_patients_with_NATandTumor(c_kmeans.copy(), "Patient_ID", conc=True) - - # Regress k-means clusters against STK11 status - plotROC( - ax[3], lr, c_kmeansT.values, y, cv_folds=folds, title="k-means " + gene_label - ) - - # Run GMM - gmm = DDMC( - i, n_components=30, SeqWeight=0, distance_method="Binomial", random_state=15 - ).fit(d, "NA") - x_["Cluster"] = gmm.labels() - c_gmm = x_.groupby("Cluster").mean().T - c_gmm["Patient_ID"] = X.columns[4:] - c_gmm.iloc[:, :-1] = StandardScaler(with_std=False).fit_transform( - c_gmm.iloc[:, :-1] - ) - - # Reshape data (Patients vs NAT and tumor sample per cluster) - c_gmmT = find_patients_with_NATandTumor(c_gmm.copy(), "Patient_ID", conc=True) - - # Regress GMM clusters against STK11 status - plotROC(ax[4], lr, c_gmmT.values, y, cv_folds=folds, title="GMM " + gene_label) + return f diff --git a/ddmc/figures/figureMS8.py b/ddmc/figures/figureMS8.py deleted file mode 100644 index b0ef7aa1d..000000000 --- a/ddmc/figures/figureMS8.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -This creates Supplemental Figure 7: Adjusted Mutual Information across clustering methods -""" - -import numpy as np -import pandas as pd -import seaborn as sns -import matplotlib -from sklearn.cluster import ( - KMeans, - AffinityPropagation, - Birch, - SpectralClustering, - MeanShift, - AgglomerativeClustering, -) -from ddmc.clustering import DDMC -from .common import subplotLabel, getSetup -from ..pre_processing import filter_NaNpeptides -from sklearn.metrics import adjusted_mutual_info_score - - -def makeFigure(): - """Get a list of the axis objects and create a figure""" - # Get list of axis objects - ax, f = getSetup((8, 8), (1, 1)) - - # Set plotting format - matplotlib.rcParams["font.sans-serif"] = "Arial" - sns.set( - style="whitegrid", - font_scale=1, - color_codes=True, - palette="colorblind", - rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6}, - ) - - # Signaling - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - cut=1, - ) - - # Fit DDMC to complete data - d = np.array(X.select_dtypes(include=["float64"]).T) - i = X.select_dtypes(include=["object"]) - - assert np.all(np.isfinite(d)) - - methods = [ - Birch(n_clusters=15), - MeanShift(), - KMeans(n_clusters=15), - AffinityPropagation(), - SpectralClustering(n_clusters=15, affinity="nearest_neighbors"), - AgglomerativeClustering(n_clusters=15), - AgglomerativeClustering(n_clusters=15, linkage="average"), - AgglomerativeClustering(n_clusters=15, linkage="complete"), - ] - - labelsOut = np.empty((d.shape[1], len(methods) + 1), dtype=int) - - for ii, m in enumerate(methods): - m.fit(d.T) - labelsOut[:, ii] = m.labels_ - - labelsOut[:, -1] = ( - DDMC( - i, - n_components=30, - SeqWeight=100, - distance_method="Binomial", - random_state=5, - ) - .fit(d) - .predict() - ) - - # How many clusters do we get in each instance - print(np.amax(labelsOut, axis=0)) - - mutInfo = np.zeros((labelsOut.shape[1], labelsOut.shape[1]), dtype=float) - for ii in range(mutInfo.shape[0]): - for jj in range(ii + 1): - mutInfo[ii, jj] = adjusted_mutual_info_score( - labelsOut[:, ii], labelsOut[:, jj] - ) - mutInfo[jj, ii] = mutInfo[ii, jj] - - sns.heatmap(mutInfo, ax=ax[0]) - labels = [ - "Birch", - "MeanShift", - "k-means", - "Affinity Propagation", - "SpectralClustering", - "Agglomerative Clustering—Ward", - "Agglomerative Clustering—Average", - "Agglomerative Clustering—Commplete", - "DDMC", - ] - ax[0].set_xticklabels(labels, rotation=90) - ax[0].set_yticklabels(labels, rotation=0) - - return f diff --git a/ddmc/logistic_regression.py b/ddmc/logistic_regression.py index 91e0c5c9a..c70842366 100644 --- a/ddmc/logistic_regression.py +++ b/ddmc/logistic_regression.py @@ -4,14 +4,29 @@ import pandas as pd import seaborn as sns import matplotlib.pyplot as plt +from matplotlib.axes import Axes from scipy.stats import sem -from sklearn.metrics import confusion_matrix from sklearn.metrics import auc from sklearn.metrics import RocCurveDisplay from sklearn.model_selection import StratifiedKFold, RepeatedKFold +from sklearn.preprocessing import StandardScaler +from ddmc.clustering import DDMC -def plotClusterCoefficients(ax, lr, hue=None, xlabels=False, title=False): +def normalize_cluster_centers(centers: np.ndarray): + # normalize centers along along patient dimension + return StandardScaler(with_std=False).fit_transform(centers.T).T + + +def get_highest_weighted_clusters(model: DDMC, coefficients: np.ndarray, n_clusters=3): + top_clusters = np.flip(np.argsort(np.abs(coefficients.squeeze()))) + top_clusters = [ + cluster for cluster in top_clusters if cluster in model.get_nonempty_clusters() + ] + return top_clusters[:n_clusters] + + +def plot_cluster_regression_coefficients(ax: Axes, lr, hue=None, title=False): """Plot LR coeficients of clusters.""" coefs_ = pd.DataFrame(lr.coef_.T, columns=["LR Coefficient"]) if hue: @@ -19,9 +34,7 @@ def plotClusterCoefficients(ax, lr, hue=None, xlabels=False, title=False): coefs_["Sample"] = [l.split("_")[1] for l in hue] hue = "Sample" else: - coefs_["Cluster"] = np.arange(coefs_.shape[0]) + 1 - if xlabels: - coefs_["Cluster"] = xlabels + coefs_["Cluster"] = np.arange(coefs_.shape[0]) p = sns.barplot( ax=ax, x="Cluster", @@ -38,41 +51,17 @@ def plotClusterCoefficients(ax, lr, hue=None, xlabels=False, title=False): ax.set_title(title) -def plotPredictionProbabilities(ax, lr, dd, yy): - """Plot LR predictions and prediction probabilities.""" - res_ = pd.DataFrame() - res_["y, p(x)"] = lr.predict_proba(dd)[:, 1] - z = lr.predict(dd) == yy - res_["Correct_Prediction"] = z.values - res_["Prediction"] = lr.predict(dd).astype("int") - res_["Patients"] = np.arange(res_.shape[0]) + 1 - sns.scatterplot( - ax=ax, x="Patients", y="Prediction", data=res_, hue="Correct_Prediction" - ) - sns.lineplot(ax=ax, x="Patients", y="y, p(x)", data=res_, marker="s", color="gray") - ax.axhline(0.5, ls="--", color="r") - - -def plotConfusionMatrix(ax, lr, dd, yy): - """Actual vs predicted outputs""" - cm = confusion_matrix(yy, lr.predict(dd)) - n = lr.classes_.shape[0] - ax.imshow(cm) - ax.grid(False) - ax.set_xlabel("Predicted outputs", color="black") - ax.set_ylabel("Actual outputs", color="black") - ax.xaxis.set(ticks=range(n)) - ax.yaxis.set(ticks=range(n)) - for i in range(n): - for j in range(n): - ax.text(j, i, cm[i, j], ha="center", va="center", color="white") - - -def plotROC( - ax, classifier, d, y, cv_folds=4, title=False, return_mAUC=False, kfold="Stratified" +def plot_roc( + classifier, + X: np.ndarray, + y: np.ndarray, + cv_folds: int = 4, + title=False, + return_mAUC: bool = False, + kfold="Stratified", + ax: Axes = None, ): """Plot Receiver Operating Characteristc with cross-validation folds of a given classifier model.""" - y = y.values if kfold == "Stratified": cv = StratifiedKFold(n_splits=cv_folds) elif kfold == "Repeated": @@ -81,9 +70,9 @@ def plotROC( aucs = [] mean_fpr = np.linspace(0, 1, 100) - for _, (train, test) in enumerate(cv.split(d, y)): - classifier.fit(d[train], y[train]) - viz = RocCurveDisplay.from_estimator(classifier, d[test], y[test]) + for _, (train, test) in enumerate(cv.split(X, y)): + classifier.fit(X[train], y[train]) + viz = RocCurveDisplay.from_estimator(classifier, X[test], y[test]) plt.close() interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr) interp_tpr[0] = 0.0 diff --git a/ddmc/motifs.py b/ddmc/motifs.py index c83180979..aafc92ee3 100644 --- a/ddmc/motifs.py +++ b/ddmc/motifs.py @@ -9,29 +9,7 @@ from .binomial import AAlist -def MapMotifs(X, names): - """Generate pY motifs for pre-processing.""" - names, seqs, pXpos, Xidx = GeneratingKinaseMotifs(names, FormatSeq(X)) - X = X.iloc[Xidx, :] - X["Gene"] = names - X["Sequence"] = seqs - X.insert(3, "Position", pXpos) - return X[~X["Sequence"].str.contains("-")] - - -def FormatName(X): - """Keep only the general protein name, without any other accession information""" - full = [v.split("OS")[0].strip() for v in X.iloc[:, 0]] - gene = [v.split("GN=")[1].split(" PE")[0].strip() for v in X.iloc[:, 0]] - return full, gene - - -def FormatSeq(X): - """Deleting -1/-2 for mapping to uniprot's proteome""" - return [v.split("-")[0] for v in X["Sequence"]] - - -def DictProteomeNameToSeq(X, n): +def get_proteome_name_to_seq(X, n): """To generate proteom's dictionary""" DictProtToSeq_UP = {} for rec2 in SeqIO.parse(X, "fasta"): @@ -48,17 +26,17 @@ def DictProteomeNameToSeq(X, n): return DictProtToSeq_UP -def getKeysByValue(dictOfElements, valueToFind): +def get_keys_by_value(dictionary, value): """Find the key of a given value within a dictionary.""" listOfKeys = list() - listOfItems = dictOfElements.items() + listOfItems = dictionary.items() for item in listOfItems: - if valueToFind in item[1]: + if value in item[1]: listOfKeys.append(item[0]) return listOfKeys -def MatchProtNames(ProteomeDict, MS_names, MS_seqs): +def match_protein_names(ProteomeDict, MS_names, MS_seqs): """Match protein names of MS and Uniprot's proteome.""" matchedNames, seqs, Xidx = [], [], [] counter = 0 @@ -71,7 +49,7 @@ def MatchProtNames(ProteomeDict, MS_names, MS_seqs): matchedNames.append(MS_name) else: try: - newname = getKeysByValue(ProteomeDict, MS_seqU)[0] + newname = get_keys_by_value(ProteomeDict, MS_seqU)[0] assert MS_seqU in ProteomeDict[newname] Xidx.append(i) seqs.append(MS_seq) @@ -86,7 +64,7 @@ def MatchProtNames(ProteomeDict, MS_names, MS_seqs): return matchedNames, seqs, Xidx -def findmotif(MS_seq, MS_name, ProteomeDict, motif_size): +def find_motif(MS_seq, MS_name, ProteomeDict, motif_size): """For a given MS peptide, finds it in the ProteomeDict, and maps the +/-5 AA from the p-site, accounting for peptides phosphorylated multiple times concurrently.""" MS_seqU = MS_seq.upper() @@ -114,7 +92,7 @@ def findmotif(MS_seq, MS_name, ProteomeDict, motif_size): elif "t" in MS_seq or "s" in MS_seq: DoS_idx = list(re.compile("y|t|s").finditer(MS_seq)) assert len(DoS_idx) != 0 - mappedMotif, pidx = makeMotif( + mappedMotif, pidx = make_motif( UP_seq, MS_seq, motif_size, y_idx, center_idx, DoS_idx ) if len(pidx) == 1: @@ -130,7 +108,7 @@ def findmotif(MS_seq, MS_name, ProteomeDict, motif_size): DoS_idx = None if len(pTS_idx) > 1: DoS_idx = pTS_idx[1:] - mappedMotif, pidx = makeMotif( + mappedMotif, pidx = make_motif( UP_seq, MS_seq, motif_size, ts_idx, center_idx, DoS_idx ) if len(pidx) == 1: @@ -145,14 +123,12 @@ def findmotif(MS_seq, MS_name, ProteomeDict, motif_size): return pos, mappedMotif -def GeneratingKinaseMotifs(names, seqs): +def generate_kinase_motifs(names, seqs): """Main function to generate motifs using 'findmotif'.""" motif_size = 5 - proteome = open( - "./data/Sequence_analysis/proteome_uniprot2019.fa", "r" - ) - ProteomeDict = DictProteomeNameToSeq(proteome, n="gene") - protnames, seqs, Xidx = MatchProtNames(ProteomeDict, names, seqs) + proteome = open("./data/Sequence_analysis/proteome_uniprot2019.fa", "r") + ProteomeDict = get_proteome_name_to_seq(proteome, n="gene") + protnames, seqs, Xidx = match_protein_names(ProteomeDict, names, seqs) ( MS_names, mapped_motifs, @@ -164,7 +140,7 @@ def GeneratingKinaseMotifs(names, seqs): ) for i, MS_seq in enumerate(seqs): - pos, mappedMotif = findmotif(MS_seq, protnames[i], ProteomeDict, motif_size) + pos, mappedMotif = find_motif(MS_seq, protnames[i], ProteomeDict, motif_size) MS_names.append(protnames[i]) mapped_motifs.append(mappedMotif) uni_pos.append(pos) @@ -173,7 +149,7 @@ def GeneratingKinaseMotifs(names, seqs): return MS_names, mapped_motifs, uni_pos, Xidx -def makeMotif(UP_seq, MS_seq, motif_size, ps_protein_idx, center_motif_idx, DoS_idx): +def make_motif(UP_seq, MS_seq, motif_size, ps_protein_idx, center_motif_idx, DoS_idx): """Make a motif out of the matched sequences.""" UP_seq_copy = list( UP_seq[max(0, ps_protein_idx - motif_size) : ps_protein_idx + motif_size + 1] @@ -214,52 +190,23 @@ def makeMotif(UP_seq, MS_seq, motif_size, ps_protein_idx, center_motif_idx, DoS_ return "".join(UP_seq_copy), pidx -def preprocess_seqs(X, pYTS): - """Filter out any sequences with different than the specified central p-residue - and/or any containing gaps.""" - X = X[~X["Sequence"].str.contains("-")] - - Xidx = [] - for seq in X["Sequence"]: - Xidx.append(seq[5] == pYTS.lower()) - return X.iloc[Xidx, :] - - -def ForegroundSeqs(sequences): - """Build Background data set for either "Y", "S", or "T".""" - seqs = [] - yts = ["Y", "T", "S"] - for motif in sequences: - motif = motif.upper() - assert "-" not in motif, "gap in motif" - assert motif[5] in yts, "WRONG CENTRAL AMINO ACID" - seqs.append(Seq(motif, alphabet=AAlist)) - return seqs - - -def PSPLdict(): +def get_pspls() -> tuple[np.ndarray, np.ndarray]: """Generate dictionary with kinase name-specificity profile pairs""" - pspl_dict = {} + pspls_arr = [] + kinases = [] # individual files PSPLs = glob.glob("./ddmc/data/PSPL/*.csv") for sp in PSPLs: if sp == "./ddmc/data/PSPL/pssm_data.csv": continue - sp_mat = pd.read_csv(sp).sort_values(by="Unnamed: 0") - - if ( - sp_mat.shape[0] > 20 - ): # Remove profiling of fixed pY and pT, include only natural AA - assert np.all(sp_mat.iloc[:-2, 0] == AAlist), "aa don't match" - sp_mat = sp_mat.iloc[:-2, 1:].values - else: - assert np.all(sp_mat.iloc[:, 0] == AAlist), "aa don't match" - sp_mat = sp_mat.iloc[:, 1:].values + sp_mat = pd.read_csv(sp, index_col=0) + sp_mat = sp_mat.loc[AAlist] if np.all(sp_mat >= 0): sp_mat = np.log2(sp_mat) - pspl_dict[sp.split("PSPL/")[1].split(".csv")[0]] = sp_mat + kinases.append(sp.split("PSPL/")[1].split(".csv")[0]) + pspls_arr.append(sp_mat.values) # NetPhores PSPL results f = pd.read_csv("ddmc/data/PSPL/pssm_data.csv", header=None) @@ -272,12 +219,13 @@ def PSPLdict(): mat = np.ma.log2(mat) mat = mat.filled(0) mat = np.clip(mat, a_min=0, a_max=3) - pspl_dict[kin] = mat + kinases.append(kin) + pspls_arr.append(mat) - return pspl_dict + return np.array(kinases), np.array(pspls_arr) -def compute_control_pssm(bg_sequences): +def compute_control_pssm(bg_sequences) -> np.ndarray: """Generate PSSM.""" back_pssm = np.zeros((len(AAlist), 11), dtype=float) for _, seq in enumerate(bg_sequences): @@ -285,8 +233,8 @@ def compute_control_pssm(bg_sequences): back_pssm[AAlist.index(aa), kk] += 1.0 for pos in range(back_pssm.shape[1]): back_pssm[:, pos] /= np.mean(back_pssm[:, pos]) - back_pssm = np.ma.log2(back_pssm) - return back_pssm.filled(0) + back_pssm = np.log2(back_pssm) + return np.nan_to_num(back_pssm) KinToPhosphotypeDict = { diff --git a/ddmc/pam250.py b/ddmc/pam250.py index b608beeb9..607172ec6 100644 --- a/ddmc/pam250.py +++ b/ddmc/pam250.py @@ -5,46 +5,32 @@ class PAM250: - def __init__(self, seqs): + def __init__(self, seqs: list[str]): # Compute all pairwise distances - self.background = MotifPam250Scores(seqs) + self.background = get_pam250_scores(seqs) self.logWeights = 0.0 - def from_summaries(self, weightsIn): + def from_summaries(self, weightsIn: np.ndarray): """Update the underlying distribution.""" sums = np.sum(weightsIn, axis=0) sums = np.clip(sums, 0.00001, np.inf) # Avoid empty cluster divide by 0 self.logWeights = (self.background @ weightsIn) / sums -def MotifPam250Scores(seqs): +def get_pam250_scores(seqs: list[str]) -> np.ndarray: """Calculate and store all pairwise pam250 distances before starting.""" pam250 = substitution_matrices.load("PAM250") seqs = np.array( - [[pam250.alphabet.find(aa) for aa in seq] for seq in seqs], dtype=np.int8 + [[pam250.alphabet.find(aa) for aa in seq] for seq in seqs], dtype=np.int8 # type: ignore ) - # Move to a standard Numpy array - pam250m = np.ndarray(pam250.shape, dtype=np.int8) + # convert to np array + pam250m = np.array(pam250.values(), dtype=np.int8).reshape(pam250.shape) - for ii in range(pam250m.shape[0]): - for jj in range(pam250m.shape[1]): - pam250m[ii, jj] = pam250[ii, jj] - - out = distanceCalc(seqs, pam250m) + out = np.zeros((seqs.shape[0], seqs.shape[0]), dtype=np.int8) + i_idx, j_idx = np.tril_indices(seqs.shape[0]) + out[i_idx, j_idx] = np.sum(pam250m[seqs[i_idx], seqs[j_idx]], axis=1) i_upper = np.triu_indices_from(out, k=1) out[i_upper] = out.T[i_upper] # pylint: disable=unsubscriptable-object return out - - -def distanceCalc(seqs, pam250m): - """Perform all the pairwise distances.""" - # WARNING this type can only hold -128 to 127 - out = np.zeros((seqs.shape[0], seqs.shape[0]), dtype=np.int8) - - for ii in range(seqs.shape[0]): # pylint: disable=not-an-iterable - for jj in range(ii + 1): - out[ii, jj] = np.sum(pam250m[seqs[ii, :], seqs[jj, :]]) - - return out diff --git a/ddmc/pca.py b/ddmc/pca.py deleted file mode 100644 index 007b5ea83..000000000 --- a/ddmc/pca.py +++ /dev/null @@ -1,511 +0,0 @@ -""" PCA functions """ - -import pandas as pd -import numpy as np -import seaborn as sns -from ddmc.pre_processing import Linear -from sklearn.decomposition import PCA, NMF -from sklearn.preprocessing import StandardScaler -from sklearn.utils import resample -from sklearn.metrics import r2_score - - -def pca_dfs(scores, loadings, df, n_components, sIDX, lIDX): - """build PCA scores and loadings data frames.""" - dScor = pd.DataFrame() - dLoad = pd.DataFrame() - for i in range(n_components): - cpca = "PC" + str(i + 1) - dScor[cpca] = scores[:, i] - dLoad[cpca] = loadings[i, :] - - for j in sIDX: - dScor[j] = list(df[j]) - dLoad[lIDX] = df.select_dtypes(include=["float64"]).columns - return dScor, dLoad - - -def plotPCA( - ax, - d, - n_components, - scores_ind, - loadings_ind, - hue_scores=None, - style_scores=None, - pvals=None, - style_load=None, - legendOut=False, - quadrants=True, -): - """Plot PCA scores and loadings.""" - pp = PCA(n_components=n_components) - dScor_ = pp.fit_transform(d.select_dtypes(include=["float64"])) - dLoad_ = pp.components_ - dScor_, dLoad_ = pca_dfs(dScor_, dLoad_, d, n_components, scores_ind, loadings_ind) - varExp = np.round(pp.explained_variance_ratio_, 2) - - # Scores - sns.scatterplot( - x="PC1", - y="PC2", - data=dScor_, - hue=hue_scores, - style=style_scores, - ax=ax[0], - **{"linewidth": 0.5, "edgecolor": "k"} - ) - ax[0].set_title("PCA Scores") - ax[0].set_xlabel("PC1 (" + str(int(varExp[0] * 100)) + "%)", fontsize=10) - ax[0].set_ylabel("PC2 (" + str(int(varExp[1] * 100)) + "%)", fontsize=10) - ax[0].legend(prop={"size": 8}) - if legendOut: - ax[0].legend( - bbox_to_anchor=(1.05, 1), - loc=2, - borderaxespad=0, - labelspacing=0.2, - prop={"size": 8}, - ) - - # Loadings - if isinstance(pvals, np.ndarray): - dLoad_["p-value"] = pvals - sns.scatterplot( - x="PC1", - y="PC2", - data=dLoad_, - hue="p-value", - style=style_load, - ax=ax[1], - **{"linewidth": 0.5, "edgecolor": "k"} - ) - else: - sns.scatterplot( - x="PC1", - y="PC2", - data=dLoad_, - style=style_load, - ax=ax[1], - **{"linewidth": 0.5, "edgecolor": "k"} - ) - - ax[1].set_title("PCA Loadings") - ax[1].set_xlabel("PC1 (" + str(int(varExp[0] * 100)) + "%)", fontsize=10) - ax[1].set_ylabel("PC2 (" + str(int(varExp[1] * 100)) + "%)", fontsize=10) - ax[1].legend(prop={"size": 8}) - for j, txt in enumerate(dLoad_[loadings_ind]): - ax[1].annotate( - txt, (dLoad_["PC1"][j] + 0.001, dLoad_["PC2"][j] + 0.001), fontsize=10 - ) - - if quadrants: - ax[0].axhline(0, ls="--", color="lightgrey") - ax[0].axvline(0, ls="--", color="lightgrey") - ax[1].axhline(0, ls="--", color="lightgrey") - ax[1].axvline(0, ls="--", color="lightgrey") - - -def plotPCA_scoresORloadings( - ax, - d, - n_components, - scores_ind, - loadings_ind, - hue_scores=None, - style_scores=None, - pvals=None, - style_load=None, - legendOut=False, - plot="scores", - annotateScores=False, -): - """Plot PCA scores only""" - pp = PCA(n_components=n_components) - dScor_ = pp.fit_transform(d.select_dtypes(include=["float64"]).values) - dLoad_ = pp.components_ - dScor_, dLoad_ = pca_dfs(dScor_, dLoad_, d, n_components, scores_ind, loadings_ind) - varExp = np.round(pp.explained_variance_ratio_, 2) - - # Scores - if plot == "scores": - sns.scatterplot( - x="PC1", - y="PC2", - data=dScor_, - hue=hue_scores, - style=style_scores, - ax=ax, - **{"linewidth": 0.5, "edgecolor": "k"} - ) - ax.set_title("PCA Scores") - ax.set_xlabel("PC1 (" + str(int(varExp[0] * 100)) + "%)", fontsize=10) - ax.set_ylabel("PC2 (" + str(int(varExp[1] * 100)) + "%)", fontsize=10) - if legendOut: - ax.legend( - bbox_to_anchor=(1.05, 1), - loc=2, - borderaxespad=0, - labelspacing=0.2, - prop={"size": 8}, - ) - if annotateScores: - for j, txt in enumerate(d[scores_ind[0]]): - ax.annotate( - txt, - (dScor_["PC1"][j] + 0.001, dScor_["PC2"][j] + 0.001), - fontsize=10, - ) - - # Loadings - elif plot == "loadings": - if isinstance(pvals, np.ndarray): - dLoad_["p-value"] = pvals - sns.scatterplot( - x="PC1", - y="PC2", - data=dLoad_, - hue="p-value", - style=style_load, - ax=ax, - **{"linewidth": 0.5, "edgecolor": "k"} - ) - else: - sns.scatterplot( - x="PC1", - y="PC2", - data=dLoad_, - style=style_load, - ax=ax, - **{"linewidth": 0.5, "edgecolor": "k"} - ) - - ax.set_title("PCA Loadings") - ax.set_xlabel("PC1 (" + str(int(varExp[0] * 100)) + "%)", fontsize=10) - ax.set_ylabel("PC2 (" + str(int(varExp[1] * 100)) + "%)", fontsize=10) - for j, txt in enumerate(dLoad_[loadings_ind]): - ax.annotate( - txt, (dLoad_["PC1"][j] + 0.001, dLoad_["PC2"][j] + 0.001), fontsize=10 - ) - - -def plotpca_explained(ax, data, ncomp): - """Cumulative variance explained for each principal component.""" - explained = PCA(n_components=ncomp).fit(data).explained_variance_ratio_ - acc_expl = [] - - for i, exp in enumerate(explained): - if i > 0: - exp += acc_expl[i - 1] - acc_expl.append(exp) - else: - acc_expl.append(exp) - - ax.bar(range(ncomp), acc_expl, edgecolor="black", linewidth=1) - ax.set_ylabel("% Variance Explained") - ax.set_xlabel("Components") - ax.set_xticks(range(ncomp)) - ax.set_xticklabels([i + 1 for i in range(ncomp)]) - - -def plotpca_ScoresLoadings(ax, data, pn, ps): - fit = PCA(n_components=2).fit(data) - PC1_scores, PC2_scores = fit.transform(data)[:, 0], fit.transform(data)[:, 1] - PC1_loadings, PC2_loadings = fit.components_[0], fit.components_[1] - - # Scores - ax[0].scatter(PC1_scores, PC2_scores, linewidths=0.2) - for j, txt in enumerate(list(data.index)): - ax[0].annotate(txt, (PC1_scores[j], PC2_scores[j])) - ax[0].set_title("PCA Scores") - ax[0].set_xlabel("Principal Component 1") - ax[0].set_ylabel("Principal Component 2") - ax[0].axhline(y=0, color="0.25", linestyle="--") - ax[0].axvline(x=0, color="0.25", linestyle="--") - - spacer = 0.5 - ax[0].set_xlim( - [(-1 * max(np.abs(PC1_scores))) - spacer, max(np.abs(PC1_scores)) + spacer] - ) - ax[0].set_ylim( - [(-1 * max(np.abs(PC2_scores))) - spacer, max(np.abs(PC2_scores)) + spacer] - ) - - # Loadings - poi = [ - "AXL Y702-p", - "AXL Y866-p", - "AXL Y481-p", - "RAB13 Y5-p", - "RAB2B Y3-p", - "CBLB Y889-p", - "SOS1 Y1196-p", - "GAB2 T265-p", - "GAB1 Y406", - "CRKL Y251-p", - "PACSIN2 Y388-p", - "SHC1 S426-p", - "GSK3A Y279-p", - "NME2 Y52-p", - "CDK1 Y15-p", - "MAPK3 T207-p", - "TNS3 Y802-p", - "GIT1 Y383-p", - "KIRREL1 Y622-p", - "BCAR1 Y234-p", - "NEDD9 S182-p", - "RIN1 Y681-p", - "ATP8B1 Y1217", - "MAPK3 Y204-p", - "ATP1A1 Y55-p", - "YES1 Y446", - "EPHB3 Y792-p", - "SLC35E1 Y403-p", - ] - for i, name in enumerate(pn): - p = name + " " + ps[i] - if p in poi: - ax[1].annotate(name, (PC1_loadings[i], PC2_loadings[i])) - ax[1].scatter(PC1_loadings, PC2_loadings, c="darkred", linewidths=0.2, alpha=0.7) - ax[1].set_title("PCA Loadings") - ax[1].set_xlabel("Principal Component 1") - ax[1].set_ylabel("Principal Component 2") - ax[1].axhline(y=0, color="0.25", linestyle="--") - ax[1].axvline(x=0, color="0.25", linestyle="--") - - spacer = 0.05 - ax[1].set_xlim( - [ - (-1 * max(np.abs(PC1_loadings)) - spacer), - (max(np.abs(PC1_loadings)) + spacer), - ] - ) - ax[1].set_ylim( - [ - (-1 * max(np.abs(PC2_loadings)) - spacer), - (max(np.abs(PC2_loadings)) + spacer), - ] - ) - - -def plotpca_ScoresLoadings_plotly(data, title, loc=False): - """Interactive PCA plot. Note that this works best by pre-defining the dataframe's - indices which will serve as labels for each dot in the plot.""" - fit = PCA(n_components=2).fit(data) - - scores = pd.concat( - [ - pd.DataFrame(fit.transform(data)[:, 0]), - pd.DataFrame(fit.transform(data)[:, 1]), - ], - axis=1, - ) - scores.index = data.index - scores.columns = ["PC1", "PC2"] - - loadings = pd.concat( - [pd.DataFrame(fit.components_[0]), pd.DataFrame(fit.components_[1])], axis=1 - ) - loadings.index = data.columns - loadings.columns = ["PC1", "PC2"] - - if loc: - print(loadings.loc[loc]) - - fig = make_subplots(rows=1, cols=2, subplot_titles=("PCA Scores", "PCA Loadings")) - fig.add_trace( - go.Scatter( - mode="markers+text", - x=scores["PC1"], - y=scores["PC2"], - text=scores.index, - textposition="top center", - textfont=dict(size=10, color="black"), - marker=dict(color="blue", size=8, line=dict(color="black", width=1)), - ), - row=1, - col=1, - ) - - fig.add_trace( - go.Scatter( - mode="markers", - x=loadings["PC1"], - y=loadings["PC2"], - opacity=0.7, - text=[ - "Protein: " + loadings.index[i][0] + " Pos: " + loadings.index[i][1] - for i in range(len(loadings.index)) - ], - marker=dict(color="crimson", size=8, line=dict(color="black", width=1)), - ), - row=1, - col=2, - ) - - fig.update_layout( - height=500, - width=1000, - showlegend=False, - xaxis=dict(showgrid=False), - yaxis=dict(showgrid=False), - xaxis2=dict(showgrid=False), - yaxis2=dict(showgrid=False), - title_text=title, - ), - fig.update_xaxes(title_text="Principal Component 1", row=1, col=1) - fig.update_xaxes(title_text="Principal Component 1", row=1, col=2) - fig.update_yaxes(title_text="Principal Component 2", row=1, col=1) - fig.update_yaxes(title_text="Principal Component 2", row=1, col=2) - - fig.show() - - -def preprocess_ID(linear=False, npepts=7, FCcut=10): - bid = pd.read_csv("ddmc/data/BioID/BioID.csv") - genes = [s.split("_")[0] for i, s in enumerate(bid["Gene"])] - bid = bid.drop(["Protein", "Gene"], axis=1) - bid.insert(0, "Gene", genes) - XIDX = np.any(bid.iloc[:, 1:].values > npepts, axis=1) - bid = bid.iloc[XIDX, :] - bid = bid.replace(0, 0.00001) - bid.iloc[:, 1:] = np.log(bid.iloc[:, 1:]) - bid = bid.set_index("Gene").T.reset_index() - bid["index"] = [s.split(".")[0] for s in bid["index"]] - nc = bid.groupby("index").mean().loc["Bio"] - fc = bid.copy() - fc.iloc[:, 1:] += nc.abs() - XIDX = np.any(fc.iloc[:, 1:].values >= FCcut, axis=0) - XIDX = np.insert(XIDX, 0, True) - bid = bid.iloc[:, XIDX] - if linear: - bid = Linear(bid, bid.columns[1:]) - return bid - - -def bootPCA(d, n_components, lIDX, method="PCA", n_boots=100): - """Compute PCA scores and loadings including bootstrap variance of the estimates.""" - bootScor, bootLoad = [], [] - data_headers = list(d.select_dtypes(include=["float64"]).columns) - sIDX = list(d.select_dtypes(include=["object"]).columns) - for _ in range(n_boots): - xIDX = range(d.shape[0]) - resamp = resample(xIDX, replace=True) - bootdf = d.iloc[resamp, :].groupby(sIDX).mean().reset_index() - data = bootdf[data_headers] - - if method == "PCA": - bootdf[data_headers] = StandardScaler().fit_transform(data) - red = PCA(n_components=n_components) - dScor = red.fit_transform(data.values) - varExp = np.round(red.explained_variance_ratio_, 2) - - elif method == "NMF": - varExp = [] - for i in range(1, n_components + 1): - red = NMF( - n_components=i, - max_iter=10000, - solver="mu", - beta_loss="frobenius", - init="nndsvdar", - ).fit(data.values) - dScor = red.transform(data) - varExp.append(r2_score(data, red.inverse_transform(dScor))) - - dScor, dLoad = pca_dfs(dScor, red.components_, bootdf, n_components, sIDX, lIDX) - bootScor.append(dScor) - bootLoad.append(dLoad) - - bootScor = pd.concat(bootScor) - bootScor_m = bootScor.groupby(sIDX).mean().reset_index() - bootScor_sd = bootScor.groupby(sIDX).sem().reset_index() - - bootLoad = pd.concat(bootLoad) - bootLoad_m = bootLoad.groupby(lIDX).mean().reset_index() - bootLoad_sd = bootLoad.groupby(lIDX).sem().reset_index() - - return bootScor_m, bootScor_sd, bootLoad_m, bootLoad_sd, bootScor, varExp - - -def plotBootPCA( - ax, - means, - stds, - varExp, - title=False, - X="PC1", - Y="PC2", - LegOut=False, - annotate=False, - colors=False, -): - """Plot Scores and Loadings.""" - sIDX = list(means.select_dtypes(include=["object"]).columns) - hue = sIDX[0] - style = None - if len(sIDX) == 2: - style = sIDX[1] - - ax.errorbar( - means[X], - means[Y], - xerr=stds[X], - yerr=stds[Y], - linestyle="", - elinewidth=0.2, - capsize=2, - capthick=0.2, - ecolor="k", - ) - - if colors: - pal = sns.xkcd_palette(colors) - p1 = sns.scatterplot( - x=X, - y=Y, - data=means, - hue=hue, - style=style, - ax=ax, - palette=pal, - markers=["o", "X", "d", "*"], - **{"linewidth": 0.5, "edgecolor": "k"}, - s=55 - ) - - if not colors: - p1 = sns.scatterplot( - x=X, - y=Y, - data=means, - hue=hue, - style=style, - ax=ax, - markers=["o", "X", "d", "*"], - **{"linewidth": 0.5, "edgecolor": "k"}, - s=55 - ) - - if LegOut: - ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0) - ax.set_title(title) - - if annotate: - for idx, txt in enumerate(means[sIDX[0]]): - p1.text( - means[X][idx], - means[Y][idx], - txt, - horizontalalignment="left", - color="black", - size="xx-small", - fontweight="light", - ) - - ax.set_xlabel( - str(X) + "(" + str(int(varExp[int(X[-1]) - 1] * 100)) + "%)", fontsize=10 - ) - ax.set_ylabel( - str(Y) + "(" + str(int(varExp[int(Y[-1]) - 1] * 100)) + "%)", fontsize=10 - ) diff --git a/ddmc/pre_processing.py b/ddmc/pre_processing.py deleted file mode 100644 index 5517cc0f1..000000000 --- a/ddmc/pre_processing.py +++ /dev/null @@ -1,346 +0,0 @@ -""" This scripts handles all the pre-processing required to merge and transform the raw mass spec biological replicates. """ - -import os -import numpy as np -import pandas as pd -from scipy import stats -from collections import defaultdict -from .motifs import FormatName, MapMotifs - - -path = os.path.dirname(os.path.abspath(__file__)) - - -###-------------------------- Pre-processing MS data --------------------------### -def preprocessCPTAC(): - """Replace patient identifiers, fill NaNs, and make it compatible with current code.""" - X = pd.read_csv( - os.path.join( - path, - "./data/MS/CPTAC/CPTAC3_Lung_Adeno_Carcinoma_Phosphoproteome.phosphopeptide.tmt10.csv", - ) - ) - d = X.iloc[:, 1:-3] - X = pd.concat( - [X.iloc[:, 0], X.iloc[:, -3:], d.loc[:, d.columns.str.contains("CPT")]], axis=1 - ) - X = filter_NaNpeptides(X, cut=0.2) - - n = pd.read_csv( - os.path.join( - path, - "./data/MS/CPTAC/S046_BI_CPTAC3_LUAD_Discovery_Cohort_Samples_r1_May2019.csv", - ) - ) - bi_id = list(n[~n["Broad Sample.ID"].str.contains("IR")].iloc[:, 1]) - X.columns = ["Sequence"] + list(X.columns[1:4]) + bi_id - - return X.drop("Organism", axis=1) - - -def filter_NaNpeptides(X, cut=False, tmt=False): - """Filter peptides that have a given minimum percentage of completeness or number of TMT experiments.""" - d = X.select_dtypes(include=["float64"]) - if cut: - Xidx = np.count_nonzero(~np.isnan(d), axis=1) / d.shape[1] >= cut - else: - idx_values = FindIdxValues(X) - dict_ = defaultdict(list) - for i in range(idx_values.shape[0]): - dict_[idx_values[i, 0]].append(idx_values[i, -1]) - Xidx = [len(set(dict_[i])) >= tmt for i in range(X.shape[0])] - return X.iloc[Xidx, :] - - -def FindIdxValues(X): - """Find the patient indices corresponding to all non-missing values grouped in TMT experiments.""" - data = X.select_dtypes(include=["float64"]) - idx = np.argwhere(~np.isnan(data.values)) - idx[:, 1] += 4 # add ID variable columns - StoE = pd.read_csv("ddmc/data/MS/CPTAC/IDtoExperiment.csv") - assert all(StoE.iloc[:, 0] == data.columns), "Sample labels don't match." - StoE = StoE.iloc[:, 1].values - tmt = [[StoE[idx[ii][1] - 4]] for ii in range(idx.shape[0])] - return np.append(idx, tmt, axis=1) - - -def MergeDfbyMean(X, values, indices): - """Compute mean across duplicates.""" - return pd.pivot_table(X, values=values, index=indices, aggfunc="mean") - - -def LinearFoldChange(X, data_headers, FCto): - """Convert to linear fold-change from log2 mean-centered.""" - X[data_headers] = pd.DataFrame(np.power(2, X[data_headers])).div( - np.power(2, X[FCto]), axis=0 - ) - return X - - -def Linear(X, data_headers): - """Convert to linear from log2 mean-centered.""" - X[data_headers] = pd.DataFrame(np.power(2, X[data_headers])) - return X - - -def FoldChangeToControl(X, data_headers): - """Convert to fold-change to control.""" - X[data_headers] = X[data_headers].div(X.iloc[:, 3], axis=0) - return X - - -def Log2T(X): - """Convert to log2 scale keeping original sign.""" - data_headers = X.select_dtypes(include=["float64"]).columns - X[data_headers] = np.log2(X[data_headers]) - return X - - -def VarianceFilter(X, data_headers, varCut=0.1): - """Filter rows for those containing more than cutoff variance. Variance across conditions per peptide. - Note this should only be used with log-scaled, mean-centered data.""" - Xidx = np.var(X[data_headers].values, axis=1) > varCut - return X.iloc[Xidx, :] - - -def FoldChangeFilterToControl(X, data_headers, FCto, cutoff=0.4): - """Filter rows for those containing more than a two-fold change. - Note this should only be used with linear-scale data normalized to the control.""" - XX = LinearFoldChange(X.copy(), data_headers, FCto) - Xidx = np.any(XX[data_headers].values <= 1 - cutoff, axis=1) | np.any( - XX[data_headers].values >= 1 + cutoff, axis=1 - ) - return X.iloc[Xidx, :] - - -def FoldChangeFilterBasedOnMaxFC(X, data_headers, cutoff=0.5): - """Filter rows for those containing an cutoff% change of the maximum vs minimum fold-change - across every condition.""" - XX = Linear(X.copy(), data_headers) - X_ToMin = XX[data_headers] / XX[data_headers].min(axis=0) - Xidx = np.any(X_ToMin.values >= X_ToMin.max().values * cutoff, axis=1) - return X.iloc[Xidx, :] - - -###------------ Filter by variance (stdev or range/pearson's) ------------------### - - -def VFilter(ABC, merging_indices, data_headers, corrCut=0.55, stdCut=0.5): - """Filter based on variability across recurrent peptides in MS biological replicates""" - NonRecPeptides, CorrCoefPeptides, StdPeptides = MapOverlappingPeptides(ABC) - - NonRecTable = BuildMatrix(NonRecPeptides, ABC, data_headers) - NonRecTable = NonRecTable.assign(BioReps=list("1" * NonRecTable.shape[0])) - NonRecTable = NonRecTable.assign(r2_Std=list(["N/A"] * NonRecTable.shape[0])) - - CorrCoefPeptides = BuildMatrix(CorrCoefPeptides, ABC, data_headers) - DupsTable = CorrCoefFilter(CorrCoefPeptides, corrCut=corrCut) - DupsTable = MergeDfbyMean( - DupsTable, DupsTable[data_headers], merging_indices + ["r2_Std"] - ) - DupsTable = DupsTable.assign(BioReps=list("2" * DupsTable.shape[0])).reset_index() - - StdPeptides = BuildMatrix(StdPeptides, ABC, data_headers) - TripsTable = TripsMeanAndStd( - StdPeptides, merging_indices + ["BioReps"], data_headers - ) - TripsTable = FilterByStdev(TripsTable, merging_indices + ["BioReps"], stdCut=stdCut) - - merging_indices.insert(4, "BioReps") - merging_indices.insert(5, "r2_Std") - - ABC = pd.concat([NonRecTable, DupsTable, TripsTable]).reset_index()[ - merging_indices[:2] + list(ABC[data_headers].columns) + merging_indices[2:] - ] - - # Including non-overlapping peptides - return ABC - - -def MapOverlappingPeptides(ABC): - """Find recurrent peptides across biological replicates. Grouping those showing up 2 to later calculate - correlation, those showing up >= 3 to take the std. Those showing up 1 can be included or not in the final data set. - Final dfs are formed by 'Name', 'Peptide', '#Recurrences'.""" - dups = pd.pivot_table( - ABC, index=["Protein", "Sequence"], aggfunc="size" - ).sort_values() - dups = pd.DataFrame(dups).reset_index() - dups.columns = [ABC.columns[0], ABC.columns[1], "Recs"] - NonRecPeptides = dups[dups["Recs"] == 1] - RangePeptides = dups[dups["Recs"] == 2] - StdPeptides = dups[dups["Recs"] >= 3] - return NonRecPeptides, RangePeptides, StdPeptides - - -def BuildMatrix(peptides, ABC, data_headers): - """Map identified recurrent peptides to generate complete matrices with values. - If recurrent peptides = 2, the correlation coefficient is included in a new column. - """ - ABC = ABC.reset_index().set_index(["Sequence", "Protein"], drop=False) - ABC = ABC.sort_index() - - corrcoefs, peptideslist, bioReps = [], [], [] - for idx, seq in enumerate(peptides.iloc[:, 1]): - name = peptides.iloc[idx, 0] - - # Skip blank - if name == "(blank)": - continue - - pepts = ABC.loc[seq, name] - pepts = pepts.iloc[:, 1:] - names = pepts.iloc[:, 0] - - if len(pepts) == 1: - peptideslist.append(pepts.iloc[0, :]) - elif len(pepts) == 2 and len(set(names)) == 1: - fc = Linear(pepts[data_headers].copy(), data_headers) - corrcoef, _ = stats.pearsonr(fc.iloc[0, :], fc.iloc[1, :]) - for i in range(len(pepts)): - corrcoefs.append(np.round(corrcoef, decimals=2)) - peptideslist.append(pepts.iloc[i, :]) - elif len(pepts) >= 3 and len(set(names)) == 1: - for i in range(len(pepts)): - peptideslist.append(pepts.iloc[i, :]) - bioReps.append(len(pepts)) - else: - print("check this", pepts) - - if corrcoefs: - matrix = ( - pd.DataFrame(peptideslist).reset_index(drop=True).assign(r2_Std=corrcoefs) - ) - - elif bioReps: - matrix = ( - pd.DataFrame(peptideslist).reset_index(drop=True).assign(BioReps=bioReps) - ) - - else: - matrix = pd.DataFrame(peptideslist).reset_index(drop=True) - - return matrix - - -def CorrCoefFilter(X, corrCut=0.6): - """Filter rows for those containing more than a correlation threshold.""" - Xidx = X.iloc[:, -1].values >= corrCut - return X.iloc[Xidx, :] - - -def TripsMeanAndStd(triplicates, merging_indices, data_headers): - """Merge all triplicates by mean and standard deviation across conditions. Note this builds a multilevel header - meaning we have 2 values for each condition (eg within Erlotinib -> Mean | Std).""" - func_tri = {} - for i in triplicates[data_headers].columns: - func_tri[i] = "mean", "std" - X = pd.pivot_table( - triplicates, - values=triplicates[data_headers].columns, - index=merging_indices, - aggfunc=func_tri, - ) - return X.reset_index() - - -def MeanCenter(X, mc_row, mc_col): - """Mean centers each row of values. logT also optionally log2-transforms.""" - data_headers = X.select_dtypes(include=["float64"]).columns - if mc_row: - X[data_headers] = X[data_headers].sub(X[data_headers].mean(axis=1), axis=0) - if mc_col: - X[data_headers] = X[data_headers].sub(X[data_headers].mean(axis=0), axis=1) - return X - - -def FilterByRange(X, rangeCut=0.4): - """Filter rows for those containing more than a range threshold.""" - Rg = X.iloc[:, X.columns.get_level_values(1) == "ptp"] - Xidx = np.all(Rg.values <= rangeCut, axis=1) - return X.iloc[Xidx, :] - - -def FilterByStdev(X, merging_indices, stdCut=0.4): - """Filter rows for those containing more than a standard deviation threshold.""" - Stds = X.iloc[:, X.columns.get_level_values(1) == "std"] - StdMeans = list(np.round(Stds.mean(axis=1), decimals=2)) - Xidx = np.mean(Stds.values, axis=1) <= stdCut - Means = pd.concat( - [X[merging_indices], X.iloc[:, X.columns.get_level_values(1) == "mean"]], axis=1 - ) - Means = Means.iloc[Xidx, :] - Means.columns = Means.columns.droplevel(1) - StdMeans = pd.DataFrame(StdMeans).iloc[Xidx, :] - return Means.assign(r2_Std=StdMeans) - - -def peptidefinder(X, loc, Protein=False, Gene=False, Sequence=False): - """Search for a specific peptide either by name or sequence.""" - if Protein: - found = X[X["Protein"].str.contains(loc)] - if Gene: - found = X[X["Gene"].str.contains(loc)] - if Sequence: - found = X[X["Sequence"].str.contains(loc)] - return found - - -######----------pre-processing of phenotype data----------###### - - -def MergeTR(data): - """Convenient to merge by mean all TRs of IncuCyte""" - for i in range(1, data.shape[1], 2): - data.iloc[:, i] = data.iloc[:, i : i + 2].mean(axis=1) - - return data.drop( - data.columns[[i + 1 for i in range(1, data.shape[1], 2)]], axis="columns" - ) - - -def y_pre(ds, tr, ftp, phenotype, all_lines, itp=False): - """Preprocesses cell phenotype data sets for analysis.""" - z = [] - for sl in ds: - c = sl.loc[:, sl.columns.str.contains(tr)] - c.insert(0, "Elapsed", ds[0].iloc[:, 0]) - c = c[ - list(c.columns[:3]) + [c.columns[4]] + [c.columns[3]] + list(c.columns[5:]) - ] - if not isinstance(itp, bool): - c = ( - c[c["Elapsed"] == ftp] - .iloc[0, 1:] - .div(c[c["Elapsed"] == itp].iloc[0, 1:]) - ) - else: - c = c[c["Elapsed"] == ftp].iloc[0, 1:] - z.append(c) - - y = pd.DataFrame(pd.concat(z, axis=0)).reset_index() - y.columns = ["Lines", phenotype] - y = y.groupby("Lines").mean().T[c.index].T.reset_index() - - y["Lines"] = [s.split(tr)[0] for s in y.iloc[:, 0]] - y["Treatment"] = tr - - if "-" in y["Lines"][1]: - y["Lines"] = [s.split("-")[0] for s in y.iloc[:, 0]] - - y["Lines"] = all_lines - return y[["Lines", "Treatment", phenotype]] - - -def FixColumnLabels(cv): - """Fix column labels to use pandas locators.""" - l = [] - for label in cv[0].columns: - if "-" not in label and label != "Elapsed": - l.append(label + "-UT") - if "-" in label or label == "Elapsed": - l.append(label) - - for d in cv: - d.columns = l - - return cv diff --git a/ddmc/tests/test_CoClustering.py b/ddmc/tests/test_CoClustering.py deleted file mode 100644 index 2de0ba9af..000000000 --- a/ddmc/tests/test_CoClustering.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Testing file for the clustering methods by data and sequence. -""" - -import pytest -import numpy as np -import pandas as pd -from ..clustering import DDMC -from ..pre_processing import filter_NaNpeptides - - -X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:] -X = filter_NaNpeptides(X, tmt=25) -data = X.select_dtypes(include=["float64"]).T -info = X.select_dtypes(include=["object"]) - - -@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"]) -def test_wins(distance_method): - """Test that EMclustering is working by comparing with GMM clusters.""" - MSC = DDMC(info, 2, SeqWeight=0, distance_method=distance_method).fit(X=data) - distances = MSC.wins(data) - - # assert that the distance to the same sequence weight is less - assert distances[0] < 1.0 - assert distances[0] < distances[1] - - -@pytest.mark.parametrize("w", [0, 0.1, 1.0, 10.0]) -@pytest.mark.parametrize("ncl", [2, 5, 10, 25]) -@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"]) -def test_clusters(w, ncl, distance_method): - """Test that EMclustering is working by comparing with GMM clusters.""" - MSC = DDMC(info, ncl, SeqWeight=w, distance_method=distance_method).fit(X=data) - - # Assert that we got a reasonable result - assert np.all(np.isfinite(MSC.scores_)) - assert np.all(np.isfinite(MSC.seq_scores_)) diff --git a/ddmc/tests/test_DifClusters.py b/ddmc/tests/test_DifClusters.py deleted file mode 100644 index 00db4710b..000000000 --- a/ddmc/tests/test_DifClusters.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -Testing file for minimum variance across clusters using CPTAC data. -""" - -import pytest -import numpy as np -import pandas as pd -from scipy.spatial.distance import cdist -from collections import Counter -from ..clustering import DDMC -from ..pre_processing import filter_NaNpeptides - -X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:] -X = filter_NaNpeptides(X, tmt=25) -d = X.select_dtypes(include=["float64"]).T -i = X.select_dtypes(include=["object"]) - - -@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"]) -def test_ClusterVar(distance_method): - """Test minimum variance of output cluster centers""" - model = DDMC(i, 6, SeqWeight=3, distance_method=distance_method).fit(X=d) - centers = model.transform() - - # Get pairwise cluster distances - dists = cdist(centers.T, centers.T) - np.fill_diagonal(dists, 10.0) - - # Assert that all the clusters are at least euclidean distance 1 away - assert np.all(dists > 1.0) diff --git a/ddmc/tests/test_cluster.py b/ddmc/tests/test_cluster.py new file mode 100644 index 000000000..2f3e9aabe --- /dev/null +++ b/ddmc/tests/test_cluster.py @@ -0,0 +1,64 @@ +""" +Testing file for the clustering methods by data and sequence. +""" + +import pytest +import numpy as np +from scipy.spatial.distance import cdist +from sklearn.metrics.pairwise import cosine_similarity +from sklearn.mixture import GaussianMixture + +from ddmc.clustering import DDMC +from ddmc.datasets import CPTAC, filter_incomplete_peptides + + +def test_wins(): + """Test that EMclustering is working by comparing with GMM clusters.""" + p_signal = filter_incomplete_peptides( + CPTAC().get_p_signal(), sample_presence_ratio=1 + ) + model_ddmc = DDMC(n_components=2, seq_weight=0).fit(p_signal) + model_gmm = GaussianMixture(n_components=2).fit(p_signal.values) + + similarity = cosine_similarity(model_gmm.means_, model_ddmc.transform().T) + + # only works for 2 clusters, check that the two clusters are matched up + # either index-matching or otherwise + diag = np.eye(2, dtype=bool) + offdiag = ~diag + + assert np.all(similarity[diag] > 0.95) or np.all(similarity[offdiag] > 0.95) + + +@pytest.mark.parametrize("w", [0, 0.1, 10.0]) +@pytest.mark.parametrize("ncl", [2, 5, 25]) +@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"]) +def test_clusters(w, ncl, distance_method): + p_signal = filter_incomplete_peptides( + CPTAC().get_p_signal(), sample_presence_ratio=1 + ) + model = DDMC(ncl, seq_weight=w, distance_method=distance_method).fit(p_signal) + + # Assert that we got a reasonable result + assert np.all(np.isfinite(model.scores_)) + assert np.all(np.isfinite(model.seq_scores_)) + + +@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"]) +def test_ClusterVar(distance_method): + """Test minimum variance of output cluster centers""" + p_signal = filter_incomplete_peptides( + CPTAC().get_p_signal(), sample_presence_ratio=1 + ) + + model = DDMC(n_components=6, seq_weight=3, distance_method=distance_method).fit( + p_signal + ) + centers = model.transform() + + # Get pairwise cluster distances + dists = cdist(centers.T, centers.T) + np.fill_diagonal(dists, 10.0) + + # Assert that all the clusters are at least euclidean distance 1 away + assert np.all(dists > 1.0) diff --git a/makefile b/makefile index 010096a2b..e579258f8 100644 --- a/makefile +++ b/makefile @@ -9,6 +9,10 @@ output/figure%.svg: ddmc/figures/figure%.py test: poetry run pytest -s -x -v +testprofile: + poetry run python3 -m cProfile -o profile -m pytest -s -v -x + gprof2dot -f pstats --node-thres=5.0 profile | dot -Tsvg -o profile.svg + coverage.xml: poetry run pytest --cov=ddmc --cov-report=xml diff --git a/poetry.lock b/poetry.lock index 76e9dacfc..63f6b55e9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,9 +1,10 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "adjusttext" version = "1.0.4" description = "Iteratively adjust text position in matplotlib plots to minimize overlaps" +category = "main" optional = false python-versions = "*" files = [ @@ -20,6 +21,7 @@ scipy = "*" name = "bioinfokit" version = "2.1.3" description = "Bioinformatics data analysis and visualization toolkit" +category = "main" optional = false python-versions = "*" files = [ @@ -43,6 +45,7 @@ textwrap3 = "*" name = "biopython" version = "1.83" description = "Freely available tools for computational molecular biology." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -77,6 +80,7 @@ numpy = "*" name = "biothings-client" version = "0.3.1" description = "Python Client for BioThings API services." +category = "main" optional = false python-versions = ">=2.7" files = [ @@ -96,6 +100,7 @@ jsonld = ["PyLD (>=0.7.2)"] name = "certifi" version = "2023.11.17" description = "Python package for providing Mozilla's CA Bundle." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -107,6 +112,7 @@ files = [ name = "charset-normalizer" version = "3.3.2" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -206,6 +212,7 @@ files = [ name = "clarabel" version = "0.6.0" description = "Clarabel Conic Interior Point Solver for Rust / Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -227,6 +234,7 @@ scipy = "*" name = "click" version = "8.1.7" description = "Composable command line interface toolkit" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -241,6 +249,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -252,6 +261,7 @@ files = [ name = "contourpy" version = "1.2.0" description = "Python library for calculating contours of 2D quadrilateral grids" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -315,6 +325,7 @@ test-no-images = ["pytest", "pytest-cov", "pytest-xdist", "wurlitzer"] name = "coverage" version = "7.4.0" description = "Code coverage measurement for Python" +category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -379,6 +390,7 @@ toml = ["tomli"] name = "cvxopt" version = "1.3.2" description = "Convex optimization package" +category = "main" optional = false python-versions = ">=3, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -411,6 +423,7 @@ files = [ name = "cvxpy" version = "1.4.2" description = "A domain-specific language for modeling convex optimization problems in Python." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -469,6 +482,7 @@ xpress = ["xpress"] name = "cycler" version = "0.12.1" description = "Composable style cycles" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -484,6 +498,7 @@ tests = ["pytest", "pytest-cov", "pytest-xdist"] name = "ecos" version = "2.0.12" description = "This is the Python package for ECOS: Embedded Cone Solver. See Github page for more information." +category = "main" optional = false python-versions = "*" files = [ @@ -513,6 +528,7 @@ scipy = ">=0.9" name = "fancyimpute" version = "0.7.0" description = "Matrix completion and feature imputation algorithms" +category = "main" optional = false python-versions = "*" files = [ @@ -531,6 +547,7 @@ scikit-learn = ">=0.24.2" name = "fonttools" version = "4.47.2" description = "Tools to manipulate font files" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -596,6 +613,7 @@ woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] name = "idna" version = "3.6" description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -607,6 +625,7 @@ files = [ name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -618,6 +637,7 @@ files = [ name = "joblib" version = "1.3.2" description = "Lightweight pipelining with Python functions" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -629,6 +649,7 @@ files = [ name = "kiwisolver" version = "1.4.5" description = "A fast implementation of the Cassowary constraint solver" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -742,6 +763,7 @@ files = [ name = "knnimpute" version = "0.1.0" description = "k-Nearest Neighbor imputation" +category = "main" optional = false python-versions = "*" files = [ @@ -756,6 +778,7 @@ six = "*" name = "logomaker" version = "0.8" description = "Package for making Sequence Logos" +category = "main" optional = false python-versions = "*" files = [ @@ -772,6 +795,7 @@ pandas = "*" name = "lxml" version = "5.1.0" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -865,6 +889,7 @@ source = ["Cython (>=3.0.7)"] name = "matplotlib" version = "3.8.2" description = "Python plotting package" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -913,6 +938,7 @@ python-dateutil = ">=2.7" name = "matplotlib-venn" version = "0.11.9" description = "Functions for plotting area-proportional two- and three-way Venn diagrams in matplotlib." +category = "main" optional = false python-versions = "*" files = [ @@ -928,6 +954,7 @@ scipy = "*" name = "mygene" version = "3.2.2" description = "Python Client for MyGene.Info services." +category = "main" optional = false python-versions = "*" files = [ @@ -942,6 +969,7 @@ biothings-client = ">=0.2.6" name = "mypy" version = "1.8.0" description = "Optional static typing for Python" +category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -988,6 +1016,7 @@ reports = ["lxml"] name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." +category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -999,6 +1028,7 @@ files = [ name = "nose" version = "1.3.7" description = "nose extends unittest to make testing easier" +category = "main" optional = false python-versions = "*" files = [ @@ -1011,6 +1041,7 @@ files = [ name = "numpy" version = "1.26.3" description = "Fundamental package for array computing in Python" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -1056,6 +1087,7 @@ files = [ name = "osqp" version = "0.6.3" description = "OSQP: The Operator Splitting QP Solver" +category = "main" optional = false python-versions = "*" files = [ @@ -1095,6 +1127,7 @@ scipy = ">=0.13.2" name = "packaging" version = "23.2" description = "Core utilities for Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1104,36 +1137,41 @@ files = [ [[package]] name = "pandas" -version = "2.1.4" +version = "2.2.0" description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" optional = false python-versions = ">=3.9" files = [ - {file = "pandas-2.1.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bdec823dc6ec53f7a6339a0e34c68b144a7a1fd28d80c260534c39c62c5bf8c9"}, - {file = "pandas-2.1.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:294d96cfaf28d688f30c918a765ea2ae2e0e71d3536754f4b6de0ea4a496d034"}, - {file = "pandas-2.1.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b728fb8deba8905b319f96447a27033969f3ea1fea09d07d296c9030ab2ed1d"}, - {file = "pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00028e6737c594feac3c2df15636d73ace46b8314d236100b57ed7e4b9ebe8d9"}, - {file = "pandas-2.1.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:426dc0f1b187523c4db06f96fb5c8d1a845e259c99bda74f7de97bd8a3bb3139"}, - {file = "pandas-2.1.4-cp310-cp310-win_amd64.whl", hash = "sha256:f237e6ca6421265643608813ce9793610ad09b40154a3344a088159590469e46"}, - {file = "pandas-2.1.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b7d852d16c270e4331f6f59b3e9aa23f935f5c4b0ed2d0bc77637a8890a5d092"}, - {file = "pandas-2.1.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7d5f2f54f78164b3d7a40f33bf79a74cdee72c31affec86bfcabe7e0789821"}, - {file = "pandas-2.1.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0aa6e92e639da0d6e2017d9ccff563222f4eb31e4b2c3cf32a2a392fc3103c0d"}, - {file = "pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d797591b6846b9db79e65dc2d0d48e61f7db8d10b2a9480b4e3faaddc421a171"}, - {file = "pandas-2.1.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d2d3e7b00f703aea3945995ee63375c61b2e6aa5aa7871c5d622870e5e137623"}, - {file = "pandas-2.1.4-cp311-cp311-win_amd64.whl", hash = "sha256:dc9bf7ade01143cddc0074aa6995edd05323974e6e40d9dbde081021ded8510e"}, - {file = "pandas-2.1.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:482d5076e1791777e1571f2e2d789e940dedd927325cc3cb6d0800c6304082f6"}, - {file = "pandas-2.1.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8a706cfe7955c4ca59af8c7a0517370eafbd98593155b48f10f9811da440248b"}, - {file = "pandas-2.1.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0513a132a15977b4a5b89aabd304647919bc2169eac4c8536afb29c07c23540"}, - {file = "pandas-2.1.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9f17f2b6fc076b2a0078862547595d66244db0f41bf79fc5f64a5c4d635bead"}, - {file = "pandas-2.1.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:45d63d2a9b1b37fa6c84a68ba2422dc9ed018bdaa668c7f47566a01188ceeec1"}, - {file = "pandas-2.1.4-cp312-cp312-win_amd64.whl", hash = "sha256:f69b0c9bb174a2342818d3e2778584e18c740d56857fc5cdb944ec8bbe4082cf"}, - {file = "pandas-2.1.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3f06bda01a143020bad20f7a85dd5f4a1600112145f126bc9e3e42077c24ef34"}, - {file = "pandas-2.1.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab5796839eb1fd62a39eec2916d3e979ec3130509930fea17fe6f81e18108f6a"}, - {file = "pandas-2.1.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edbaf9e8d3a63a9276d707b4d25930a262341bca9874fcb22eff5e3da5394732"}, - {file = "pandas-2.1.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ebfd771110b50055712b3b711b51bee5d50135429364d0498e1213a7adc2be8"}, - {file = "pandas-2.1.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8ea107e0be2aba1da619cc6ba3f999b2bfc9669a83554b1904ce3dd9507f0860"}, - {file = "pandas-2.1.4-cp39-cp39-win_amd64.whl", hash = "sha256:d65148b14788b3758daf57bf42725caa536575da2b64df9964c563b015230984"}, - {file = "pandas-2.1.4.tar.gz", hash = "sha256:fcb68203c833cc735321512e13861358079a96c174a61f5116a1de89c58c0ef7"}, + {file = "pandas-2.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8108ee1712bb4fa2c16981fba7e68b3f6ea330277f5ca34fa8d557e986a11670"}, + {file = "pandas-2.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:736da9ad4033aeab51d067fc3bd69a0ba36f5a60f66a527b3d72e2030e63280a"}, + {file = "pandas-2.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38e0b4fc3ddceb56ec8a287313bc22abe17ab0eb184069f08fc6a9352a769b18"}, + {file = "pandas-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20404d2adefe92aed3b38da41d0847a143a09be982a31b85bc7dd565bdba0f4e"}, + {file = "pandas-2.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7ea3ee3f125032bfcade3a4cf85131ed064b4f8dd23e5ce6fa16473e48ebcaf5"}, + {file = "pandas-2.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f9670b3ac00a387620489dfc1bca66db47a787f4e55911f1293063a78b108df1"}, + {file = "pandas-2.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:5a946f210383c7e6d16312d30b238fd508d80d927014f3b33fb5b15c2f895430"}, + {file = "pandas-2.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a1b438fa26b208005c997e78672f1aa8138f67002e833312e6230f3e57fa87d5"}, + {file = "pandas-2.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8ce2fbc8d9bf303ce54a476116165220a1fedf15985b09656b4b4275300e920b"}, + {file = "pandas-2.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2707514a7bec41a4ab81f2ccce8b382961a29fbe9492eab1305bb075b2b1ff4f"}, + {file = "pandas-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85793cbdc2d5bc32620dc8ffa715423f0c680dacacf55056ba13454a5be5de88"}, + {file = "pandas-2.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:cfd6c2491dc821b10c716ad6776e7ab311f7df5d16038d0b7458bc0b67dc10f3"}, + {file = "pandas-2.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a146b9dcacc3123aa2b399df1a284de5f46287a4ab4fbfc237eac98a92ebcb71"}, + {file = "pandas-2.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbc1b53c0e1fdf16388c33c3cca160f798d38aea2978004dd3f4d3dec56454c9"}, + {file = "pandas-2.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a41d06f308a024981dcaa6c41f2f2be46a6b186b902c94c2674e8cb5c42985bc"}, + {file = "pandas-2.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:159205c99d7a5ce89ecfc37cb08ed179de7783737cea403b295b5eda8e9c56d1"}, + {file = "pandas-2.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb1e1f3861ea9132b32f2133788f3b14911b68102d562715d71bd0013bc45440"}, + {file = "pandas-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:761cb99b42a69005dec2b08854fb1d4888fdf7b05db23a8c5a099e4b886a2106"}, + {file = "pandas-2.2.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a20628faaf444da122b2a64b1e5360cde100ee6283ae8effa0d8745153809a2e"}, + {file = "pandas-2.2.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f5be5d03ea2073627e7111f61b9f1f0d9625dc3c4d8dda72cc827b0c58a1d042"}, + {file = "pandas-2.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:a626795722d893ed6aacb64d2401d017ddc8a2341b49e0384ab9bf7112bdec30"}, + {file = "pandas-2.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9f66419d4a41132eb7e9a73dcec9486cf5019f52d90dd35547af11bc58f8637d"}, + {file = "pandas-2.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:57abcaeda83fb80d447f28ab0cc7b32b13978f6f733875ebd1ed14f8fbc0f4ab"}, + {file = "pandas-2.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e60f1f7dba3c2d5ca159e18c46a34e7ca7247a73b5dd1a22b6d59707ed6b899a"}, + {file = "pandas-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb61dc8567b798b969bcc1fc964788f5a68214d333cade8319c7ab33e2b5d88a"}, + {file = "pandas-2.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:52826b5f4ed658fa2b729264d63f6732b8b29949c7fd234510d57c61dbeadfcd"}, + {file = "pandas-2.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bde2bc699dbd80d7bc7f9cab1e23a95c4375de615860ca089f34e7c64f4a8de7"}, + {file = "pandas-2.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:3de918a754bbf2da2381e8a3dcc45eede8cd7775b047b923f9006d5f876802ae"}, + {file = "pandas-2.2.0.tar.gz", hash = "sha256:30b83f7c3eb217fb4d1b494a57a2fda5444f17834f5df2de6b2ffff68dc3c8e2"}, ] [package.dependencies] @@ -1143,36 +1181,37 @@ numpy = [ ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" -tzdata = ">=2022.1" +tzdata = ">=2022.7" [package.extras] -all = ["PyQt5 (>=5.15.6)", "SQLAlchemy (>=1.4.36)", "beautifulsoup4 (>=4.11.1)", "bottleneck (>=1.3.4)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=0.8.1)", "fsspec (>=2022.05.0)", "gcsfs (>=2022.05.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.8.0)", "matplotlib (>=3.6.1)", "numba (>=0.55.2)", "numexpr (>=2.8.0)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pandas-gbq (>=0.17.5)", "psycopg2 (>=2.9.3)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.5)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "pyxlsb (>=1.0.9)", "qtpy (>=2.2.0)", "s3fs (>=2022.05.0)", "scipy (>=1.8.1)", "tables (>=3.7.0)", "tabulate (>=0.8.10)", "xarray (>=2022.03.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)", "zstandard (>=0.17.0)"] -aws = ["s3fs (>=2022.05.0)"] -clipboard = ["PyQt5 (>=5.15.6)", "qtpy (>=2.2.0)"] -compression = ["zstandard (>=0.17.0)"] -computation = ["scipy (>=1.8.1)", "xarray (>=2022.03.0)"] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] consortium-standard = ["dataframe-api-compat (>=0.1.7)"] -excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pyxlsb (>=1.0.9)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)"] -feather = ["pyarrow (>=7.0.0)"] -fss = ["fsspec (>=2022.05.0)"] -gcp = ["gcsfs (>=2022.05.0)", "pandas-gbq (>=0.17.5)"] -hdf5 = ["tables (>=3.7.0)"] -html = ["beautifulsoup4 (>=4.11.1)", "html5lib (>=1.1)", "lxml (>=4.8.0)"] -mysql = ["SQLAlchemy (>=1.4.36)", "pymysql (>=1.0.2)"] -output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.8.10)"] -parquet = ["pyarrow (>=7.0.0)"] -performance = ["bottleneck (>=1.3.4)", "numba (>=0.55.2)", "numexpr (>=2.8.0)"] -plot = ["matplotlib (>=3.6.1)"] -postgresql = ["SQLAlchemy (>=1.4.36)", "psycopg2 (>=2.9.3)"] -spss = ["pyreadstat (>=1.1.5)"] -sql-other = ["SQLAlchemy (>=1.4.36)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] -xml = ["lxml (>=4.8.0)"] +xml = ["lxml (>=4.9.2)"] [[package]] name = "panflute" version = "2.3.0" description = "Pythonic Pandoc filters" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1193,6 +1232,7 @@ pypi = ["Pygments", "docutils", "twine", "wheel"] name = "patsy" version = "0.5.6" description = "A Python package for describing statistical models and for building design matrices." +category = "main" optional = false python-versions = "*" files = [ @@ -1211,6 +1251,7 @@ test = ["pytest", "pytest-cov", "scipy"] name = "pillow" version = "10.2.0" description = "Python Imaging Library (Fork)" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1296,6 +1337,7 @@ xmp = ["defusedxml"] name = "pluggy" version = "1.3.0" description = "plugin and hook calling mechanisms for python" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1311,6 +1353,7 @@ testing = ["pytest", "pytest-benchmark"] name = "pybind11" version = "2.11.1" description = "Seamless operability between C++11 and Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1325,6 +1368,7 @@ global = ["pybind11-global (==2.11.1)"] name = "pyparsing" version = "3.1.1" description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "main" optional = false python-versions = ">=3.6.8" files = [ @@ -1339,6 +1383,7 @@ diagrams = ["jinja2", "railroad-diagrams"] name = "pytest" version = "7.4.4" description = "pytest: simple powerful testing with Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1359,6 +1404,7 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "pytest-cov" version = "4.1.0" description = "Pytest plugin for measuring coverage." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1377,6 +1423,7 @@ testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtuale name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -1391,6 +1438,7 @@ six = ">=1.5" name = "pytz" version = "2023.3.post1" description = "World timezone definitions, modern and historical" +category = "main" optional = false python-versions = "*" files = [ @@ -1402,6 +1450,7 @@ files = [ name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1461,6 +1510,7 @@ files = [ name = "qdldl" version = "0.1.7.post0" description = "QDLDL, a free LDL factorization routine." +category = "main" optional = false python-versions = "*" files = [ @@ -1493,6 +1543,7 @@ scipy = ">=0.13.2" name = "requests" version = "2.31.0" description = "Python HTTP for Humans." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1514,6 +1565,7 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "scikit-learn" version = "1.4.0" description = "A set of python modules for machine learning and data mining" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -1574,50 +1626,52 @@ tests = ["black (>=23.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.3)", "numpydoc ( [[package]] name = "scipy" -version = "1.11.4" +version = "1.12.0" description = "Fundamental algorithms for scientific computing in Python" +category = "main" optional = false python-versions = ">=3.9" files = [ - {file = "scipy-1.11.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bc9a714581f561af0848e6b69947fda0614915f072dfd14142ed1bfe1b806710"}, - {file = "scipy-1.11.4-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:cf00bd2b1b0211888d4dc75656c0412213a8b25e80d73898083f402b50f47e41"}, - {file = "scipy-1.11.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9999c008ccf00e8fbcce1236f85ade5c569d13144f77a1946bef8863e8f6eb4"}, - {file = "scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:933baf588daa8dc9a92c20a0be32f56d43faf3d1a60ab11b3f08c356430f6e56"}, - {file = "scipy-1.11.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8fce70f39076a5aa62e92e69a7f62349f9574d8405c0a5de6ed3ef72de07f446"}, - {file = "scipy-1.11.4-cp310-cp310-win_amd64.whl", hash = "sha256:6550466fbeec7453d7465e74d4f4b19f905642c89a7525571ee91dd7adabb5a3"}, - {file = "scipy-1.11.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be"}, - {file = "scipy-1.11.4-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1b7c3dca977f30a739e0409fb001056484661cb2541a01aba0bb0029f7b68db8"}, - {file = "scipy-1.11.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c"}, - {file = "scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:530f9ad26440e85766509dbf78edcfe13ffd0ab7fec2560ee5c36ff74d6269ff"}, - {file = "scipy-1.11.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5e347b14fe01003d3b78e196e84bd3f48ffe4c8a7b8a1afbcb8f5505cb710993"}, - {file = "scipy-1.11.4-cp311-cp311-win_amd64.whl", hash = "sha256:acf8ed278cc03f5aff035e69cb511741e0418681d25fbbb86ca65429c4f4d9cd"}, - {file = "scipy-1.11.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6"}, - {file = "scipy-1.11.4-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2c6ff6ef9cc27f9b3db93a6f8b38f97387e6e0591600369a297a50a8e96e835d"}, - {file = "scipy-1.11.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b030c6674b9230d37c5c60ab456e2cf12f6784596d15ce8da9365e70896effc4"}, - {file = "scipy-1.11.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad669df80528aeca5f557712102538f4f37e503f0c5b9541655016dd0932ca79"}, - {file = "scipy-1.11.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ce7fff2e23ab2cc81ff452a9444c215c28e6305f396b2ba88343a567feec9660"}, - {file = "scipy-1.11.4-cp312-cp312-win_amd64.whl", hash = "sha256:36750b7733d960d7994888f0d148d31ea3017ac15eef664194b4ef68d36a4a97"}, - {file = "scipy-1.11.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6e619aba2df228a9b34718efb023966da781e89dd3d21637b27f2e54db0410d7"}, - {file = "scipy-1.11.4-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec"}, - {file = "scipy-1.11.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d10e45a6c50211fe256da61a11c34927c68f277e03138777bdebedd933712fea"}, - {file = "scipy-1.11.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91af76a68eeae0064887a48e25c4e616fa519fa0d38602eda7e0f97d65d57937"}, - {file = "scipy-1.11.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6df1468153a31cf55ed5ed39647279beb9cfb5d3f84369453b49e4b8502394fd"}, - {file = "scipy-1.11.4-cp39-cp39-win_amd64.whl", hash = "sha256:ee410e6de8f88fd5cf6eadd73c135020bfbbbdfcd0f6162c36a7638a1ea8cc65"}, - {file = "scipy-1.11.4.tar.gz", hash = "sha256:90a2b78e7f5733b9de748f589f09225013685f9b218275257f8a8168ededaeaa"}, + {file = "scipy-1.12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:78e4402e140879387187f7f25d91cc592b3501a2e51dfb320f48dfb73565f10b"}, + {file = "scipy-1.12.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:f5f00ebaf8de24d14b8449981a2842d404152774c1a1d880c901bf454cb8e2a1"}, + {file = "scipy-1.12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e53958531a7c695ff66c2e7bb7b79560ffdc562e2051644c5576c39ff8efb563"}, + {file = "scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e32847e08da8d895ce09d108a494d9eb78974cf6de23063f93306a3e419960c"}, + {file = "scipy-1.12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4c1020cad92772bf44b8e4cdabc1df5d87376cb219742549ef69fc9fd86282dd"}, + {file = "scipy-1.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:75ea2a144096b5e39402e2ff53a36fecfd3b960d786b7efd3c180e29c39e53f2"}, + {file = "scipy-1.12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:408c68423f9de16cb9e602528be4ce0d6312b05001f3de61fe9ec8b1263cad08"}, + {file = "scipy-1.12.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5adfad5dbf0163397beb4aca679187d24aec085343755fcdbdeb32b3679f254c"}, + {file = "scipy-1.12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3003652496f6e7c387b1cf63f4bb720951cfa18907e998ea551e6de51a04467"}, + {file = "scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b8066bce124ee5531d12a74b617d9ac0ea59245246410e19bca549656d9a40a"}, + {file = "scipy-1.12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8bee4993817e204d761dba10dbab0774ba5a8612e57e81319ea04d84945375ba"}, + {file = "scipy-1.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:a24024d45ce9a675c1fb8494e8e5244efea1c7a09c60beb1eeb80373d0fecc70"}, + {file = "scipy-1.12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e7e76cc48638228212c747ada851ef355c2bb5e7f939e10952bc504c11f4e372"}, + {file = "scipy-1.12.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f7ce148dffcd64ade37b2df9315541f9adad6efcaa86866ee7dd5db0c8f041c3"}, + {file = "scipy-1.12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c39f92041f490422924dfdb782527a4abddf4707616e07b021de33467f917bc"}, + {file = "scipy-1.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7ebda398f86e56178c2fa94cad15bf457a218a54a35c2a7b4490b9f9cb2676c"}, + {file = "scipy-1.12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:95e5c750d55cf518c398a8240571b0e0782c2d5a703250872f36eaf737751338"}, + {file = "scipy-1.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:e646d8571804a304e1da01040d21577685ce8e2db08ac58e543eaca063453e1c"}, + {file = "scipy-1.12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:913d6e7956c3a671de3b05ccb66b11bc293f56bfdef040583a7221d9e22a2e35"}, + {file = "scipy-1.12.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:bba1b0c7256ad75401c73e4b3cf09d1f176e9bd4248f0d3112170fb2ec4db067"}, + {file = "scipy-1.12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:730badef9b827b368f351eacae2e82da414e13cf8bd5051b4bdfd720271a5371"}, + {file = "scipy-1.12.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6546dc2c11a9df6926afcbdd8a3edec28566e4e785b915e849348c6dd9f3f490"}, + {file = "scipy-1.12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:196ebad3a4882081f62a5bf4aeb7326aa34b110e533aab23e4374fcccb0890dc"}, + {file = "scipy-1.12.0-cp39-cp39-win_amd64.whl", hash = "sha256:b360f1b6b2f742781299514e99ff560d1fe9bd1bff2712894b52abe528d1fd1e"}, + {file = "scipy-1.12.0.tar.gz", hash = "sha256:4bf5abab8a36d20193c698b0f1fc282c1d083c94723902c447e5d2f1780936a3"}, ] [package.dependencies] -numpy = ">=1.21.6,<1.28.0" +numpy = ">=1.22.4,<1.29.0" [package.extras] dev = ["click", "cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"] doc = ["jupytext", "matplotlib (>2)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-design (>=0.2.0)"] -test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] +test = ["asv", "gmpy2", "hypothesis", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] [[package]] name = "scs" version = "3.2.4.post1" description = "Splitting conic solver" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1651,6 +1705,7 @@ scipy = "*" name = "seaborn" version = "0.13.1" description = "Statistical data visualization" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1672,6 +1727,7 @@ stats = ["scipy (>=1.7)", "statsmodels (>=0.12)"] name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -1683,6 +1739,7 @@ files = [ name = "statsmodels" version = "0.14.1" description = "Statistical computations and models for Python" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1733,6 +1790,7 @@ docs = ["ipykernel", "jupyter-client", "matplotlib", "nbconvert", "nbformat", "n name = "svgutils" version = "0.3.4" description = "Python SVG editor" +category = "main" optional = false python-versions = "*" files = [ @@ -1747,6 +1805,7 @@ lxml = "*" name = "tabulate" version = "0.9.0" description = "Pretty-print tabular data" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1761,6 +1820,7 @@ widechars = ["wcwidth"] name = "textwrap3" version = "0.9.2" description = "textwrap from Python 3.6 backport (plus a few tweaks)" +category = "main" optional = false python-versions = "*" files = [ @@ -1772,6 +1832,7 @@ files = [ name = "threadpoolctl" version = "3.2.0" description = "threadpoolctl" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1783,6 +1844,7 @@ files = [ name = "typing-extensions" version = "4.9.0" description = "Backported and Experimental Type Hints for Python 3.8+" +category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -1794,6 +1856,7 @@ files = [ name = "tzdata" version = "2023.4" description = "Provider of IANA time zone data" +category = "main" optional = false python-versions = ">=2" files = [ @@ -1805,6 +1868,7 @@ files = [ name = "urllib3" version = "2.1.0" description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1820,4 +1884,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "7f8d71307f1bdb26a92257989fc06f67812c594bfd4f474d93972a04cdccfe67" +content-hash = "2cd3d2a9a31b35229da37f6b4b4fd5eafda1a0a666a7b26da0d3fc17cc8dd6b3"