diff --git a/.github/workflows/autopep8.yml b/.github/workflows/autopep8.yml deleted file mode 100644 index 31b6db25..00000000 --- a/.github/workflows/autopep8.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: autopep8 - -on: - schedule: - - cron: '0 18 * * *' -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: autopep8 - id: autopep8 - uses: peter-evans/autopep8@v1 - with: - args: --exit-code -raai --max-line-length 200 . - - name: Create Pull Request - if: steps.autopep8.outputs.exit-code == 2 - uses: peter-evans/create-pull-request@v3 - with: - token: ${{ secrets.GITHUB_TOKEN }} - commit-message: autopep8 action fixes - title: Fixes by autopep8 action - body: This is an auto-generated PR with fixes by autopep8. - labels: autopep8, automated pr - branch: create-pull-request/autopep8 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1fad3c15..c105f0dd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,7 +10,7 @@ jobs: - name: Install dependencies run: poetry install - name: Build figures - run: make -j 2 all + run: make -i all - name: Upload files uses: actions/upload-artifact@v4 with: diff --git a/ddmc/clustering.py b/ddmc/clustering.py index fff8a5e4..3a4d6eab 100644 --- a/ddmc/clustering.py +++ b/ddmc/clustering.py @@ -70,13 +70,11 @@ def _m_step(self, X: np.ndarray, log_resp: np.ndarray): """ if self._missing: labels = np.argmax(log_resp, axis=1) - centers = self.means_.T # samples x clusters + centers = np.array(self.means_) # samples x clusters + centers_fill = centers[labels, :] - assert len(labels) == X.shape[0] - for ii in range(X.shape[0]): # X is peptides x samples - X[ii, self.missing_d[ii, :]] = centers[ - self.missing_d[ii, :], labels[ii] - ] + assert centers_fill.shape == X.shape + X[self.missing_d] = centers_fill[self.missing_d] super()._m_step(X, log_resp) # Do the regular m step diff --git a/ddmc/figures/common.py b/ddmc/figures/common.py index 70756654..0ecd8933 100644 --- a/ddmc/figures/common.py +++ b/ddmc/figures/common.py @@ -126,7 +126,7 @@ def genFigure(): print(f"Figure {sys.argv[1]} is done after {time.time() - start} seconds.\n") -def getDDMC_CPTAC(n_components: int, SeqWeight: float) -> DDMC: +def getDDMC_CPTAC(n_components: int, SeqWeight: float): # Import signaling data X = filter_NaNpeptides( pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], @@ -143,7 +143,7 @@ def getDDMC_CPTAC(n_components: int, SeqWeight: float) -> DDMC: distance_method="Binomial", random_state=5, ).fit(d) - return model + return model, X def plotMotifs(pssm, ax: axes.Axes, titles=False, yaxis=False): diff --git a/ddmc/figures/figureM4.py b/ddmc/figures/figureM4.py index 493aa5fd..f16de360 100644 --- a/ddmc/figures/figureM4.py +++ b/ddmc/figures/figureM4.py @@ -46,9 +46,9 @@ def makeFigure(): ax[0].legend(prop={"size": 5}, loc=0) # Fit Data, Mix, and Seq Models - dataM = getDDMC_CPTAC(n_components=30, SeqWeight=0.0) - mixM = getDDMC_CPTAC(n_components=30, SeqWeight=250.0) - seqM = getDDMC_CPTAC(n_components=30, SeqWeight=1.0e6) + dataM, _ = getDDMC_CPTAC(n_components=30, SeqWeight=0.0) + mixM, _ = getDDMC_CPTAC(n_components=30, SeqWeight=250.0) + seqM, _ = getDDMC_CPTAC(n_components=30, SeqWeight=1.0e6) models = [dataM, mixM, seqM] # Center to peptide distance diff --git a/ddmc/figures/figureM5.py b/ddmc/figures/figureM5.py index 8f6fb32e..2b8c637f 100644 --- a/ddmc/figures/figureM5.py +++ b/ddmc/figures/figureM5.py @@ -24,14 +24,8 @@ def makeFigure(): # Get list of axis objects ax, f = getSetup((11, 10), (3, 3), multz={0: 1, 4: 1}) - # Import signaling data - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, - ) - # Fit DDMC - model = getDDMC_CPTAC(n_components=30, SeqWeight=100.0) + model, X = getDDMC_CPTAC(n_components=30, SeqWeight=100.0) # Normalize centers = pd.DataFrame(model.transform()).T diff --git a/ddmc/figures/figureM6.py b/ddmc/figures/figureM6.py index 8a7f4a6c..0862c6d2 100644 --- a/ddmc/figures/figureM6.py +++ b/ddmc/figures/figureM6.py @@ -26,14 +26,8 @@ def makeFigure(): # Get list of axis objects ax, f = getSetup((11, 7), (2, 3), multz={0: 1}) - # Import signaling data - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, - ) - # Fit DDMC - model = getDDMC_CPTAC(n_components=30, SeqWeight=100.0) + model, X = getDDMC_CPTAC(n_components=30, SeqWeight=100.0) # Import Genotype data mutations = pd.read_csv("ddmc/data/MS/CPTAC/Patient_Mutations.csv") diff --git a/ddmc/figures/figureMS2.py b/ddmc/figures/figureMS2.py index 55658823..42e4c261 100644 --- a/ddmc/figures/figureMS2.py +++ b/ddmc/figures/figureMS2.py @@ -2,12 +2,9 @@ This creates Supplemental Figure 2: Cluster motifs """ -import pandas as pd import numpy as np -from .common import getSetup +from .common import getSetup, getDDMC_CPTAC from .common import plotMotifs -from ..pre_processing import filter_NaNpeptides -from ..clustering import DDMC def makeFigure(): @@ -15,25 +12,15 @@ def makeFigure(): # Get list of axis objects ax, f = getSetup((9, 9), (5, 5)) - # Import signaling data - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, - ) - d = X.select_dtypes(include=[float]).T - i = X["Sequence"] - # Fit DDMC - model = DDMC( - i, n_components=30, SeqWeight=100, distance_method="Binomial", random_state=5 - ).fit(d) + model, _ = getDDMC_CPTAC(n_components=30, SeqWeight=100.0) pssms, cl_num = model.pssms(PsP_background=False) ylabels = np.arange(0, 21, 5) xlabels = [20, 21, 22, 23, 24, 25] for ii, cc in enumerate(cl_num): cluster = "Cluster " + str(cc) - plotMotifs(pssms[ii], axes=ax[ii], titles=cluster, yaxis=[0, 10]) + plotMotifs(pssms[ii], ax=ax[ii], titles=cluster, yaxis=[0, 10]) if ii not in ylabels: ax[ii].set_ylabel("") ax[ii].get_yaxis().set_visible(False) diff --git a/ddmc/figures/figureMS3.py b/ddmc/figures/figureMS3.py index f6cfd00a..2d54f982 100644 --- a/ddmc/figures/figureMS3.py +++ b/ddmc/figures/figureMS3.py @@ -4,10 +4,8 @@ import pandas as pd from sklearn.linear_model import LogisticRegressionCV -from .common import getSetup +from .common import getSetup, getDDMC_CPTAC from .figureM4 import TransformCenters, HotColdBehavior, find_patients_with_NATandTumor -from ..pre_processing import filter_NaNpeptides -from ..clustering import DDMC from ..logistic_regression import plotROC @@ -16,14 +14,6 @@ def makeFigure(): # Get list of axis objects ax, f = getSetup((15, 10), (3, 5)) - # Signaling - X = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:], - tmt=2, - ) - d = X.select_dtypes(include=[float]).T - i = X["Sequence"] - # Genotype data mutations = pd.read_csv("ddmc/data/MS/CPTAC/Patient_Mutations.csv") mOI = mutations[ @@ -46,7 +36,7 @@ def makeFigure(): folds = 5 weights = [0, 100, 500, 1000, 1000000] for ii, w in enumerate(weights): - model = DDMC(i, n_components=30, SeqWeight=w, distance_method="Binomial").fit(d) + model, X = getDDMC_CPTAC(n_components=30, SeqWeight=w) # Find and scale centers centers_gen, centers_hcb = TransformCenters(model, X) @@ -63,7 +53,7 @@ def makeFigure(): ax[ii], lr, centers_gen.values, - y["STK11.mutation.status"], + y["STK11.mutation.status"].values, # type: ignore cv_folds=folds, title="STK11m " + "w=" + str(model.SeqWeight) + prio, ) @@ -73,7 +63,7 @@ def makeFigure(): ax[ii + 5], lr, centers_gen.values, - y["EGFR.mutation.status"], + y["EGFR.mutation.status"].values, # type: ignore cv_folds=folds, title="EGFRm " + "w=" + str(model.SeqWeight) + prio, ) @@ -84,7 +74,7 @@ def makeFigure(): ax[ii + 10], lr, centers_hcb.values, - y_hcb, + y_hcb.values, cv_folds=folds, title="Infiltration " + "w=" + str(model.SeqWeight) + prio, ) diff --git a/ddmc/figures/figureMS6.py b/ddmc/figures/figureMS6.py index 749e39d0..23deb716 100644 --- a/ddmc/figures/figureMS6.py +++ b/ddmc/figures/figureMS6.py @@ -22,7 +22,7 @@ def makeFigure(): ax, f = getSetup((11, 7), (2, 3), multz={0: 1}) # Fit DDMC - model = getDDMC_CPTAC(n_components=30, SeqWeight=100.0) + model, X = getDDMC_CPTAC(n_components=30, SeqWeight=100.0) # Import Genotype data mutations = pd.read_csv("ddmc/data/MS/CPTAC/Patient_Mutations.csv") diff --git a/ddmc/pre_processing.py b/ddmc/pre_processing.py index 5517cc0f..8d54e9bb 100644 --- a/ddmc/pre_processing.py +++ b/ddmc/pre_processing.py @@ -3,9 +3,7 @@ import os import numpy as np import pandas as pd -from scipy import stats from collections import defaultdict -from .motifs import FormatName, MapMotifs path = os.path.dirname(os.path.abspath(__file__)) @@ -124,125 +122,6 @@ def FoldChangeFilterBasedOnMaxFC(X, data_headers, cutoff=0.5): ###------------ Filter by variance (stdev or range/pearson's) ------------------### - -def VFilter(ABC, merging_indices, data_headers, corrCut=0.55, stdCut=0.5): - """Filter based on variability across recurrent peptides in MS biological replicates""" - NonRecPeptides, CorrCoefPeptides, StdPeptides = MapOverlappingPeptides(ABC) - - NonRecTable = BuildMatrix(NonRecPeptides, ABC, data_headers) - NonRecTable = NonRecTable.assign(BioReps=list("1" * NonRecTable.shape[0])) - NonRecTable = NonRecTable.assign(r2_Std=list(["N/A"] * NonRecTable.shape[0])) - - CorrCoefPeptides = BuildMatrix(CorrCoefPeptides, ABC, data_headers) - DupsTable = CorrCoefFilter(CorrCoefPeptides, corrCut=corrCut) - DupsTable = MergeDfbyMean( - DupsTable, DupsTable[data_headers], merging_indices + ["r2_Std"] - ) - DupsTable = DupsTable.assign(BioReps=list("2" * DupsTable.shape[0])).reset_index() - - StdPeptides = BuildMatrix(StdPeptides, ABC, data_headers) - TripsTable = TripsMeanAndStd( - StdPeptides, merging_indices + ["BioReps"], data_headers - ) - TripsTable = FilterByStdev(TripsTable, merging_indices + ["BioReps"], stdCut=stdCut) - - merging_indices.insert(4, "BioReps") - merging_indices.insert(5, "r2_Std") - - ABC = pd.concat([NonRecTable, DupsTable, TripsTable]).reset_index()[ - merging_indices[:2] + list(ABC[data_headers].columns) + merging_indices[2:] - ] - - # Including non-overlapping peptides - return ABC - - -def MapOverlappingPeptides(ABC): - """Find recurrent peptides across biological replicates. Grouping those showing up 2 to later calculate - correlation, those showing up >= 3 to take the std. Those showing up 1 can be included or not in the final data set. - Final dfs are formed by 'Name', 'Peptide', '#Recurrences'.""" - dups = pd.pivot_table( - ABC, index=["Protein", "Sequence"], aggfunc="size" - ).sort_values() - dups = pd.DataFrame(dups).reset_index() - dups.columns = [ABC.columns[0], ABC.columns[1], "Recs"] - NonRecPeptides = dups[dups["Recs"] == 1] - RangePeptides = dups[dups["Recs"] == 2] - StdPeptides = dups[dups["Recs"] >= 3] - return NonRecPeptides, RangePeptides, StdPeptides - - -def BuildMatrix(peptides, ABC, data_headers): - """Map identified recurrent peptides to generate complete matrices with values. - If recurrent peptides = 2, the correlation coefficient is included in a new column. - """ - ABC = ABC.reset_index().set_index(["Sequence", "Protein"], drop=False) - ABC = ABC.sort_index() - - corrcoefs, peptideslist, bioReps = [], [], [] - for idx, seq in enumerate(peptides.iloc[:, 1]): - name = peptides.iloc[idx, 0] - - # Skip blank - if name == "(blank)": - continue - - pepts = ABC.loc[seq, name] - pepts = pepts.iloc[:, 1:] - names = pepts.iloc[:, 0] - - if len(pepts) == 1: - peptideslist.append(pepts.iloc[0, :]) - elif len(pepts) == 2 and len(set(names)) == 1: - fc = Linear(pepts[data_headers].copy(), data_headers) - corrcoef, _ = stats.pearsonr(fc.iloc[0, :], fc.iloc[1, :]) - for i in range(len(pepts)): - corrcoefs.append(np.round(corrcoef, decimals=2)) - peptideslist.append(pepts.iloc[i, :]) - elif len(pepts) >= 3 and len(set(names)) == 1: - for i in range(len(pepts)): - peptideslist.append(pepts.iloc[i, :]) - bioReps.append(len(pepts)) - else: - print("check this", pepts) - - if corrcoefs: - matrix = ( - pd.DataFrame(peptideslist).reset_index(drop=True).assign(r2_Std=corrcoefs) - ) - - elif bioReps: - matrix = ( - pd.DataFrame(peptideslist).reset_index(drop=True).assign(BioReps=bioReps) - ) - - else: - matrix = pd.DataFrame(peptideslist).reset_index(drop=True) - - return matrix - - -def CorrCoefFilter(X, corrCut=0.6): - """Filter rows for those containing more than a correlation threshold.""" - Xidx = X.iloc[:, -1].values >= corrCut - return X.iloc[Xidx, :] - - -def TripsMeanAndStd(triplicates, merging_indices, data_headers): - """Merge all triplicates by mean and standard deviation across conditions. Note this builds a multilevel header - meaning we have 2 values for each condition (eg within Erlotinib -> Mean | Std).""" - func_tri = {} - for i in triplicates[data_headers].columns: - func_tri[i] = "mean", "std" - X = pd.pivot_table( - triplicates, - values=triplicates[data_headers].columns, - index=merging_indices, - aggfunc=func_tri, - ) - return X.reset_index() - - def MeanCenter(X, mc_row, mc_col): """Mean centers each row of values. logT also optionally log2-transforms.""" data_headers = X.select_dtypes(include=["float64"]).columns @@ -251,96 +130,3 @@ def MeanCenter(X, mc_row, mc_col): if mc_col: X[data_headers] = X[data_headers].sub(X[data_headers].mean(axis=0), axis=1) return X - - -def FilterByRange(X, rangeCut=0.4): - """Filter rows for those containing more than a range threshold.""" - Rg = X.iloc[:, X.columns.get_level_values(1) == "ptp"] - Xidx = np.all(Rg.values <= rangeCut, axis=1) - return X.iloc[Xidx, :] - - -def FilterByStdev(X, merging_indices, stdCut=0.4): - """Filter rows for those containing more than a standard deviation threshold.""" - Stds = X.iloc[:, X.columns.get_level_values(1) == "std"] - StdMeans = list(np.round(Stds.mean(axis=1), decimals=2)) - Xidx = np.mean(Stds.values, axis=1) <= stdCut - Means = pd.concat( - [X[merging_indices], X.iloc[:, X.columns.get_level_values(1) == "mean"]], axis=1 - ) - Means = Means.iloc[Xidx, :] - Means.columns = Means.columns.droplevel(1) - StdMeans = pd.DataFrame(StdMeans).iloc[Xidx, :] - return Means.assign(r2_Std=StdMeans) - - -def peptidefinder(X, loc, Protein=False, Gene=False, Sequence=False): - """Search for a specific peptide either by name or sequence.""" - if Protein: - found = X[X["Protein"].str.contains(loc)] - if Gene: - found = X[X["Gene"].str.contains(loc)] - if Sequence: - found = X[X["Sequence"].str.contains(loc)] - return found - - -######----------pre-processing of phenotype data----------###### - - -def MergeTR(data): - """Convenient to merge by mean all TRs of IncuCyte""" - for i in range(1, data.shape[1], 2): - data.iloc[:, i] = data.iloc[:, i : i + 2].mean(axis=1) - - return data.drop( - data.columns[[i + 1 for i in range(1, data.shape[1], 2)]], axis="columns" - ) - - -def y_pre(ds, tr, ftp, phenotype, all_lines, itp=False): - """Preprocesses cell phenotype data sets for analysis.""" - z = [] - for sl in ds: - c = sl.loc[:, sl.columns.str.contains(tr)] - c.insert(0, "Elapsed", ds[0].iloc[:, 0]) - c = c[ - list(c.columns[:3]) + [c.columns[4]] + [c.columns[3]] + list(c.columns[5:]) - ] - if not isinstance(itp, bool): - c = ( - c[c["Elapsed"] == ftp] - .iloc[0, 1:] - .div(c[c["Elapsed"] == itp].iloc[0, 1:]) - ) - else: - c = c[c["Elapsed"] == ftp].iloc[0, 1:] - z.append(c) - - y = pd.DataFrame(pd.concat(z, axis=0)).reset_index() - y.columns = ["Lines", phenotype] - y = y.groupby("Lines").mean().T[c.index].T.reset_index() - - y["Lines"] = [s.split(tr)[0] for s in y.iloc[:, 0]] - y["Treatment"] = tr - - if "-" in y["Lines"][1]: - y["Lines"] = [s.split("-")[0] for s in y.iloc[:, 0]] - - y["Lines"] = all_lines - return y[["Lines", "Treatment", phenotype]] - - -def FixColumnLabels(cv): - """Fix column labels to use pandas locators.""" - l = [] - for label in cv[0].columns: - if "-" not in label and label != "Elapsed": - l.append(label + "-UT") - if "-" in label or label == "Elapsed": - l.append(label) - - for d in cv: - d.columns = l - - return cv