Skip to content

Commit

Permalink
remake figure 7
Browse files Browse the repository at this point in the history
  • Loading branch information
armaan-abraham committed Feb 2, 2024
1 parent 83ee890 commit d0fb555
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 105 deletions.
33 changes: 23 additions & 10 deletions ddmc/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ def fit(self, p_signal: pd.DataFrame):
# TODO: probably just pass in sequences here
assert isinstance(p_signal, pd.DataFrame)
sequences = p_signal.index.values
assert isinstance(sequences[0], str) and len(sequences[0]) == 11, "The index of p_signal must be the peptide sequences of length 11"
assert (
isinstance(sequences[0], str) and len(sequences[0]) == 11
), "The index of p_signal must be the peptide sequences of length 11"
self.p_signal = p_signal
self.gen_peptide_distances(sequences, self.distance_method)

Expand Down Expand Up @@ -156,7 +158,11 @@ def transform(self, as_df=False) -> np.ndarray | pd.DataFrame:
check_is_fitted(self, ["means_"])
centers = self.means_.T
if as_df:
centers = pd.DataFrame(centers, index=self.p_signal.columns, columns=np.arange(self.n_components))
centers = pd.DataFrame(
centers,
index=self.p_signal.columns,
columns=np.arange(self.n_components),
)
return centers

def impute(self, X: np.ndarray) -> np.ndarray:
Expand All @@ -172,7 +178,7 @@ def impute(self, X: np.ndarray) -> np.ndarray:
assert np.all(np.isfinite(X))
return X

def get_pssms(self, PsP_background=False, clusters: List=None):
def get_pssms(self, PsP_background=False, clusters: List = None):
"""Compute position-specific scoring matrix of each cluster.
Note, to normalize by amino acid frequency this uses either
all the sequences in the data set or a collection of random MS phosphosites in PhosphoSitePlus.
Expand All @@ -184,12 +190,13 @@ def get_pssms(self, PsP_background=False, clusters: List=None):
else:
back_pssm = np.zeros((len(AAlist), 11), dtype=float)

l1 = list(np.arange(self.n_components) + 1)
l1 = list(np.arange(self.n_components))
l2 = list(set(self.labels()))
ec = [i for i in l1 + l2 if i not in l1 or i not in l2]
for ii in range(1, self.n_components + 1):
for ii in range(self.n_components):
# Check for empty clusters and ignore them, if there are
if ii in ec: continue
if ii in ec:
continue

# Compute PSSM
pssm = np.zeros((len(AAlist), 11), dtype=float)
Expand Down Expand Up @@ -231,11 +238,13 @@ def get_pssms(self, PsP_background=False, clusters: List=None):

pssms.append(np.clip(pssm, a_min=0, a_max=3))
pssm_names.append(ii)

pssm_names, pssms = np.array(pssm_names), np.array(pssms)

if clusters is not None:
return pssms[[np.where(pssm_names == cluster)[0][0] for cluster in clusters]]
return pssms[
[np.where(pssm_names == cluster)[0][0] for cluster in clusters]
]

return pssm_names, pssms

Expand All @@ -258,6 +267,10 @@ def predict_upstream_kinases(

return distances

def has_empty_clusters(self):
check_is_fitted(self, ["scores_"])
return np.unique(self.labels()).size != self.n_components

def predict(self) -> np.ndarray:
"""Provided the current model parameters, predict the cluster each peptide belongs to."""
check_is_fitted(self, ["scores_"])
Expand All @@ -279,7 +292,7 @@ def get_pspl_pssm_distances(
"""
Args:
pspls: kinase specificity profiles of shape (n_kinase, 20, 9)
pssms: position-specific scoring matrices of shape (n_pssms, 20, 11)
pssms: position-specific scoring matrices of shape (n_pssms, 20, 11)
"""
assert pssms.shape[1:3] == (20, 11)
assert pspls.shape[1:3] == (20, 9)
Expand Down
11 changes: 6 additions & 5 deletions ddmc/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ def filter_incomplete_peptides(
assert min_experiments is not None
assert sample_to_experiment is not None
unique_experiments = np.unique(sample_to_experiment)
experiments_grid, StoE_grid = np.meshgrid(
experiments_grid, s_to_e_grid = np.meshgrid(
unique_experiments, sample_to_experiment, indexing="ij"
)
bool_matrix = experiments_grid == StoE_grid
bool_matrix = experiments_grid == s_to_e_grid
present = ~np.isnan(p_signal.values)
peptide_idx = (present[None, :, :] & bool_matrix[:, None, :]).any(axis=2).sum(
axis=0
) > min_experiments
) >= min_experiments
return p_signal.iloc[peptide_idx]


Expand Down Expand Up @@ -79,7 +79,7 @@ def get_mutations(self, mutation_names: Sequence[str]=None):
mutations = mutations.loc[patients]
if mutation_names is not None:
mutations = mutations[mutation_names]
return mutations
return mutations.astype(bool)

def get_hot_cold_labels(self):
hot_cold = (
Expand All @@ -88,11 +88,12 @@ def get_hot_cold_labels(self):
.sort_values(by="Sample ID")
.set_index("Sample ID")
)["Group"]
hot_cold = hot_cold[~hot_cold.index.str.endswith(".N")]
hot_cold = hot_cold[hot_cold != "NAT enriched"]
hot_cold = hot_cold.replace("Cold-tumor enriched", 0)
hot_cold = hot_cold.replace("Hot-tumor enriched", 1)
hot_cold = hot_cold.dropna()
return np.squeeze(hot_cold)
return np.squeeze(hot_cold).astype(bool)

def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]:
return ~np.array([sample.endswith(".N") for sample in samples])
5 changes: 2 additions & 3 deletions ddmc/figures/figureM6.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
from statsmodels.stats.multitest import multipletests

from ddmc.clustering import DDMC
from ddmc.datasets import CPTAC, select_peptide_subset
from ddmc.datasets import CPTAC
from ddmc.figures.common import getSetup, plot_cluster_kinase_distances
from ddmc.logistic_regression import plotROC, plotClusterCoefficients


def makeFigure():
axes, f = getSetup((11, 7), (2, 3), multz={0: 1})
cptac = CPTAC()
p_signal = select_peptide_subset(cptac.get_p_signal(), keep_ratio=0.01)
p_signal = cptac.get_p_signal()
model = DDMC(n_components=30, seq_weight=100, max_iter=10).fit(p_signal)

# Import Genotype data
Expand Down Expand Up @@ -52,7 +52,6 @@ def makeFigure():
ax=axes[0],
linewidth=0.25,
)
axes[0].legend(prop={"size": 8})

annotation_height = df_violin["p-signal"].max() + 0.02
for i, pval in enumerate(pvals):
Expand Down
144 changes: 57 additions & 87 deletions ddmc/figures/figureM7.py
Original file line number Diff line number Diff line change
@@ -1,114 +1,84 @@
"""
This creates Figure 7: Tumor infiltrating immune cells
"""

import numpy as np
import pandas as pd
import seaborn as sns
import textwrap
from scipy.stats import mannwhitneyu
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from .common import getSetup, plot_distance_to_upstream_kinase
from .figureM5 import (
build_pval_matrix,
calculate_mannW_pvals,
plot_clusters_binaryfeatures,
)
from ..clustering import DDMC
from ..logistic_regression import plotROC, plotClusterCoefficients
from ..pre_processing import filter_NaNpeptides
from statsmodels.stats.multitest import multipletests

from ddmc.clustering import DDMC
from ddmc.datasets import CPTAC
from ddmc.figures.common import plot_cluster_kinase_distances, getSetup
from ddmc.logistic_regression import plotROC, plotClusterCoefficients

def makeFigure():
"""Get a list of the axis objects and create a figure"""
# Get list of axis objects
ax, f = getSetup((11, 7), (2, 3), multz={0: 1})

# Import signaling data
X = filter_NaNpeptides(
pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:],
tmt=2,
axes, f = getSetup((11, 7), (2, 3), multz={0: 1, 4: 1})
cptac = CPTAC()
is_hot = cptac.get_hot_cold_labels()
p_signal = cptac.get_p_signal()
model = DDMC(n_components=30, seq_weight=100, max_iter=10, random_state=5).fit(
p_signal
)
d = X.select_dtypes(include=[float]).T
i = X["Sequence"]

# Fit DDMC
model = DDMC(
i, n_components=30, seq_weight=100, distance_method="Binomial", random_state=5
).fit(d)

X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:]
X = filter_NaNpeptides(X, tmt=2)
centers = pd.DataFrame(model.transform())
centers.columns = np.arange(model.n_components) + 1
centers["Patient_ID"] = X.columns[4:]
centers = (
centers.loc[~centers["Patient_ID"].str.endswith(".N"), :]
.sort_values(by="Patient_ID")
.set_index("Patient_ID")
assert (
not model.has_empty_clusters()
), "This plot assumes that every cluster will have at least one peptide. Please rerun with fewer components are more peptides."

centers = model.transform(as_df=True).loc[is_hot.index]

pvals = []
centers_m = centers[is_hot]
centers_wt = centers[~is_hot]
for col in centers.columns:
pvals.append(mannwhitneyu(centers_m[col], centers_wt[col])[1])
pvals = multipletests(pvals)[1]

# plot tumor vs nat by cluster
df_violin = (
centers.assign(is_hot=is_hot)
.reset_index()
.melt(
id_vars="is_hot",
value_vars=centers.columns,
value_name="p-signal",
var_name="Cluster",
)
)
centers = centers.drop(
[14, 24], axis=1
) # Drop clusters 14&24, contain only 1 peptide

# Import Cold-Hot Tumor data
cent1, y = FormatXYmatrices(centers.copy())

# Normalize
cent1 = cent1.T
cent1.iloc[:, :] = StandardScaler(with_std=False).fit_transform(cent1.iloc[:, :])
cent1 = cent1.T

# Hypothesis Testing
cent1["TI"] = y.values
pvals = calculate_mannW_pvals(cent1, "TI", 1, 0)
pvals = build_pval_matrix(model.n_components, pvals)
cent1["TI"] = cent1["TI"].replace(0, "CTE")
cent1["TI"] = cent1["TI"].replace(1, "HTE")
plot_clusters_binaryfeatures(cent1, "TI", ax[0], pvals=pvals, loc="lower left")
ax[0].legend(loc="lower left", prop={"size": 10})
sns.violinplot(
data=df_violin,
x="Cluster",
y="p-signal",
hue="is_hot",
dodge=True,
ax=axes[0],
linewidth=0.25,
)

# Logistic Regression
centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform(centers)
lr = LogisticRegressionCV(
cv=15, solver="saga", n_jobs=-1, penalty="l1", max_iter=10000
cv=3, solver="saga", n_jobs=1, penalty="l1", max_iter=10000
)
plotROC(ax[1], lr, cent1.iloc[:, :-1].values, y, cv_folds=4, title="ROC TI")
plotClusterCoefficients(
ax[2], lr.fit(cent1.iloc[:, :-1], y.values), title="TI weights"
plotROC(
lr, centers.values, is_hot.values, cv_folds=3, title="ROC TI", return_mAUC=True
)
plotClusterCoefficients(axes[1], lr, title="")

# plot Upstream Kinases
plot_distance_to_upstream_kinase(model, [17, 20, 21], ax[3], num_hits=3)

return f
top_clusters = np.argsort(np.abs(lr.coef_.squeeze()))[-3:]

# plot predicted kinases for most weighted clusters
distances = model.predict_upstream_kinases()[top_clusters]

def FormatXYmatrices(centers):
"""Make sure Y matrix has the same matching samples has the signaling centers"""
y = (
pd.read_csv("ddmc/data/MS/CPTAC/Hot_Cold.csv")
.dropna(axis=1)
.sort_values(by="Sample ID")
# plot upstream Kinases
plot_cluster_kinase_distances(
distances, model.get_pssms(clusters=top_clusters), axes[2], num_hits=2
)
y = y.loc[~y["Sample ID"].str.endswith(".N"), :].set_index("Sample ID")
l1 = list(centers.index)
l2 = list(y.index)
dif = [i for i in l1 + l2 if i not in l1 or i not in l2]
centers = centers.drop(dif)

# Transform to binary
y = y.replace("Cold-tumor enriched", 0)
y = y.replace("Hot-tumor enriched", 1)
y = np.squeeze(y)

# Remove NAT-enriched samples
centers = centers.drop(y[y == "NAT enriched"].index)
y = y.drop(y[y == "NAT enriched"].index).astype(int)
assert all(centers.index.values == y.index.values), "Samples don't match"
return centers, y

return f

def plot_ImmuneGOs(cluster, ax, title=False, max_width=25, n=False, loc="best"):
# THIS FUNCION IS NOT MAINTAINED
"""Plot immune-related GO"""
go = pd.read_csv("ddmc/data/cluster_analysis/CPTAC_GO_C" + str(cluster) + ".csv")
im = go[go["GO biological process complete"].str.contains("immune")]
Expand Down

0 comments on commit d0fb555

Please sign in to comment.