Skip to content

Commit

Permalink
Fix MS7
Browse files Browse the repository at this point in the history
  • Loading branch information
aarmey committed Jan 22, 2024
1 parent 42958bb commit cfb91ad
Showing 1 changed file with 24 additions and 45 deletions.
69 changes: 24 additions & 45 deletions ddmc/figures/figureMS7.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,29 +34,12 @@ def makeFigure():
d
)

centers_min = pd.DataFrame(model_min.transform()).T
centers_min.iloc[:, :] = StandardScaler(with_std=False).fit_transform(
centers_min.iloc[:, :]
)
centers_min = centers_min.T
centers_min.columns = np.arange(model_min.n_components) + 1
centers_min["Patient_ID"] = X.columns[4:]
centers_min = find_patients_with_NATandTumor(
centers_min.copy(), "Patient_ID", conc=True
)
centers_min = reshapePatients(model_min.transform(), X.columns[4:])

# Fit DDMC
model = DDMC(i, n_components=30, SeqWeight=100, distance_method="Binomial").fit(d)

# Find and scale centers
centers = pd.DataFrame(model.transform()).T
centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform(
centers.iloc[:, :]
)
centers = centers.T
centers.columns = np.arange(model.n_components) + 1
centers["Patient_ID"] = X.columns[4:]
centers = find_patients_with_NATandTumor(centers.copy(), "Patient_ID", conc=True)
centers = reshapePatients(model.transform(), X.columns[4:])

# Predicting STK11
lr = LogisticRegressionCV(
Expand Down Expand Up @@ -93,7 +76,7 @@ def makeFigure():
return f


def plot_ROCs(ax, centers, centers_min, X, i, y, lr, gene_label):
def plot_ROCs(ax, centers, centers_min, X, i, y: pd.Series, lr, gene_label):
"""Generate ROC plots using DDMC, unclustered, k-means, and GMM for a particular feature."""
folds = 7

Expand All @@ -102,7 +85,7 @@ def plot_ROCs(ax, centers, centers_min, X, i, y, lr, gene_label):
ax[0],
lr,
centers.values,
y,
y.values,
cv_folds=folds,
title="DDMC—Full data set" + gene_label,
)
Expand All @@ -112,52 +95,48 @@ def plot_ROCs(ax, centers, centers_min, X, i, y, lr, gene_label):
ax[1],
lr,
centers_min.values,
y,
y.values,
cv_folds=folds,
title="DDMC—Complete portion" + gene_label,
)

# Unclustered
X_f = X.loc[:, centers.index].T
X_f.index = np.arange(X_f.shape[0])
plotROC(ax[2], lr, X_f.values, y, cv_folds=folds, title="Unclustered " + gene_label)
plotROC(ax[2], lr, X_f.values, y.values, cv_folds=folds, title="Unclustered " + gene_label)

# Run k-means
ncl = 30
d = X.select_dtypes(include=["float64"]).T.reset_index()
d.rename(columns={"index": "Patient_ID"}, inplace=True)
d = d.iloc[:, 1:]
x_ = X.copy()
x_["Cluster"] = KMeans(n_clusters=ncl).fit(d.T).labels_
print(x_)
c_kmeans = x_.groupby("Cluster").mean().T
c_kmeans["Patient_ID"] = X.columns[4:]
c_kmeans.columns = list(np.arange(ncl) + 1) + ["Patient_ID"]
c_kmeans.iloc[:, :-1] = StandardScaler(with_std=False).fit_transform(
c_kmeans.iloc[:, :-1]
)

kmeans = KMeans(n_clusters=30).fit(d.T)

# Reshape data (Patients vs NAT and tumor sample per cluster)
c_kmeansT = find_patients_with_NATandTumor(c_kmeans.copy(), "Patient_ID", conc=True)
c_kmeansT = reshapePatients(kmeans.cluster_centers_.T, X.columns[4:])

# Regress k-means clusters against STK11 status
plotROC(
ax[3], lr, c_kmeansT.values, y, cv_folds=folds, title="k-means " + gene_label
ax[3], lr, c_kmeansT.values, y.values, cv_folds=folds, title="k-means " + gene_label
)

# Run GMM
gmm = DDMC(
i, n_components=30, SeqWeight=0, distance_method="Binomial", random_state=15
).fit(d, "NA")
x_["Cluster"] = gmm.labels()
c_gmm = x_.groupby("Cluster").mean().T
c_gmm["Patient_ID"] = X.columns[4:]
c_gmm.iloc[:, :-1] = StandardScaler(with_std=False).fit_transform(
c_gmm.iloc[:, :-1]
)
).fit(d)

# Reshape data (Patients vs NAT and tumor sample per cluster)
c_gmmT = find_patients_with_NATandTumor(c_gmm.copy(), "Patient_ID", conc=True)
c_gmmT = reshapePatients(gmm.transform(), X.columns[4:])

# Regress GMM clusters against STK11 status
plotROC(ax[4], lr, c_gmmT.values, y, cv_folds=folds, title="GMM " + gene_label)
plotROC(ax[4], lr, c_gmmT.values, y.values, cv_folds=folds, title="GMM " + gene_label)


def reshapePatients(centers, patients):
    """Center the cluster columns and reshape them per patient.

    Builds a DataFrame from ``centers``, appends ``patients`` as a trailing
    ``"Patient_ID"`` column, and mean-centers every other column
    (``StandardScaler(with_std=False)``, i.e. no variance scaling).

    Args:
        centers: Anything accepted by ``pd.DataFrame``; presumably one row per
            sample and one column per cluster -- confirm against callers.
        patients: Sample identifiers assigned as the ``"Patient_ID"`` column;
            must be assignable as a column of the frame built from ``centers``.

    Returns:
        Whatever ``find_patients_with_NATandTumor`` returns for a copy of the
        centered frame, called with ``"Patient_ID"`` and ``conc=True``.
    """
    table = pd.DataFrame(centers)
    table["Patient_ID"] = patients

    # Every column except the trailing Patient_ID holds numeric cluster data.
    centered = StandardScaler(with_std=False).fit_transform(table.iloc[:, :-1])
    table.iloc[:, :-1] = centered

    # Reshape data (Patients vs NAT and tumor sample per cluster)
    return find_patients_with_NATandTumor(table.copy(), "Patient_ID", conc=True)

0 comments on commit cfb91ad

Please sign in to comment.