From cfb91ad84a0fc9d6ac7924ae9a761ec218e7daa0 Mon Sep 17 00:00:00 2001 From: Aaron Meyer Date: Mon, 22 Jan 2024 07:17:59 -0800 Subject: [PATCH] Fix MS7 --- ddmc/figures/figureMS7.py | 69 ++++++++++++++------------------------- 1 file changed, 24 insertions(+), 45 deletions(-) diff --git a/ddmc/figures/figureMS7.py b/ddmc/figures/figureMS7.py index ad67d547..4e59a312 100644 --- a/ddmc/figures/figureMS7.py +++ b/ddmc/figures/figureMS7.py @@ -34,29 +34,12 @@ def makeFigure(): d ) - centers_min = pd.DataFrame(model_min.transform()).T - centers_min.iloc[:, :] = StandardScaler(with_std=False).fit_transform( - centers_min.iloc[:, :] - ) - centers_min = centers_min.T - centers_min.columns = np.arange(model_min.n_components) + 1 - centers_min["Patient_ID"] = X.columns[4:] - centers_min = find_patients_with_NATandTumor( - centers_min.copy(), "Patient_ID", conc=True - ) + centers_min = reshapePatients(model_min.transform(), X.columns[4:]) # Fit DDMC model = DDMC(i, n_components=30, SeqWeight=100, distance_method="Binomial").fit(d) - # Find and scale centers - centers = pd.DataFrame(model.transform()).T - centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform( - centers.iloc[:, :] - ) - centers = centers.T - centers.columns = np.arange(model.n_components) + 1 - centers["Patient_ID"] = X.columns[4:] - centers = find_patients_with_NATandTumor(centers.copy(), "Patient_ID", conc=True) + centers = reshapePatients(model.transform(), X.columns[4:]) # Predicting STK11 lr = LogisticRegressionCV( @@ -93,7 +76,7 @@ def makeFigure(): return f -def plot_ROCs(ax, centers, centers_min, X, i, y, lr, gene_label): +def plot_ROCs(ax, centers, centers_min, X, i, y: pd.Series, lr, gene_label): """Generate ROC plots using DDMC, unclustered, k-means, and GMM for a particular feature.""" folds = 7 @@ -102,7 +85,7 @@ def plot_ROCs(ax, centers, centers_min, X, i, y, lr, gene_label): ax[0], lr, centers.values, - y, + y.values, cv_folds=folds, title="DDMC—Full data set" + gene_label, ) @@ -112,7 +95,7 @@ def plot_ROCs(ax, centers, centers_min, X, i, y, lr, gene_label): ax[1], lr, centers_min.values, - y, + y.values, cv_folds=folds, title="DDMC—Complete portion" + gene_label, ) @@ -120,44 +103,40 @@ def plot_ROCs(ax, centers, centers_min, X, i, y, lr, gene_label): # Unclustered X_f = X.loc[:, centers.index].T X_f.index = np.arange(X_f.shape[0]) - plotROC(ax[2], lr, X_f.values, y, cv_folds=folds, title="Unclustered " + gene_label) + plotROC(ax[2], lr, X_f.values, y.values, cv_folds=folds, title="Unclustered " + gene_label) # Run k-means - ncl = 30 d = X.select_dtypes(include=["float64"]).T.reset_index() d.rename(columns={"index": "Patient_ID"}, inplace=True) d = d.iloc[:, 1:] - x_ = X.copy() - x_["Cluster"] = KMeans(n_clusters=ncl).fit(d.T).labels_ - print(x_) - c_kmeans = x_.groupby("Cluster").mean().T - c_kmeans["Patient_ID"] = X.columns[4:] - c_kmeans.columns = list(np.arange(ncl) + 1) + ["Patient_ID"] - c_kmeans.iloc[:, :-1] = StandardScaler(with_std=False).fit_transform( - c_kmeans.iloc[:, :-1] - ) + + kmeans = KMeans(n_clusters=30).fit(d.T) # Reshape data (Patients vs NAT and tumor sample per cluster) - c_kmeansT = find_patients_with_NATandTumor(c_kmeans.copy(), "Patient_ID", conc=True) + c_kmeansT = reshapePatients(kmeans.cluster_centers_.T, X.columns[4:]) # Regress k-means clusters against STK11 status plotROC( - ax[3], lr, c_kmeansT.values, y, cv_folds=folds, title="k-means " + gene_label + ax[3], lr, c_kmeansT.values, y.values, cv_folds=folds, title="k-means " + gene_label ) # Run GMM gmm = DDMC( i, n_components=30, SeqWeight=0, distance_method="Binomial", random_state=15 - ).fit(d, "NA") - x_["Cluster"] = gmm.labels() - c_gmm = x_.groupby("Cluster").mean().T - c_gmm["Patient_ID"] = X.columns[4:] - c_gmm.iloc[:, :-1] = StandardScaler(with_std=False).fit_transform( - c_gmm.iloc[:, :-1] - ) + ).fit(d) - # Reshape data (Patients vs NAT and tumor sample per cluster) - c_gmmT = find_patients_with_NATandTumor(c_gmm.copy(), "Patient_ID", conc=True) + c_gmmT = reshapePatients(gmm.transform(), X.columns[4:]) # Regress GMM clusters against STK11 status - plotROC(ax[4], lr, c_gmmT.values, y, cv_folds=folds, title="GMM " + gene_label) + plotROC(ax[4], lr, c_gmmT.values, y.values, cv_folds=folds, title="GMM " + gene_label) + + +def reshapePatients(centers, patients): + df = pd.DataFrame(centers) + df["Patient_ID"] = patients + df.iloc[:, :-1] = StandardScaler(with_std=False).fit_transform( + df.iloc[:, :-1] + ) + + # Reshape data (Patients vs NAT and tumor sample per cluster) + return find_patients_with_NATandTumor(df.copy(), "Patient_ID", conc=True) \ No newline at end of file