Skip to content

Commit

Permalink
Initial pass
Browse files Browse the repository at this point in the history
  • Loading branch information
aarmey committed Jan 20, 2024
1 parent d3fcbd6 commit 7fb1c7a
Show file tree
Hide file tree
Showing 21 changed files with 155 additions and 739 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- name: Install dependencies
run: poetry install
- name: Build figures
run: make -j 3 all
run: make -i all
- name: Upload files
uses: actions/upload-artifact@v4
with:
Expand Down
144 changes: 61 additions & 83 deletions ddmc/binomial.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,76 +5,47 @@
import pandas as pd
import scipy.special as sc
from Bio import motifs
from Bio.Seq import Seq
from collections import OrderedDict

# Binomial method inspired by Schwartz & Gygi's Nature Biotech 2005: doi:10.1038/nbt1146

# Amino acids frequencies (http://www.tiem.utk.edu/~gross/bioed/webmodules/aminoacid.htm) used for pseudocounts,
AAfreq = {
"A": 0.074,
"R": 0.042,
"N": 0.044,
"D": 0.059,
"C": 0.033,
"Q": 0.058,
"E": 0.037,
"G": 0.074,
"H": 0.029,
"I": 0.038,
"L": 0.076,
"K": 0.072,
"M": 0.018,
"F": 0.04,
"P": 0.05,
"S": 0.081,
"T": 0.062,
"W": 0.013,
"Y": 0.033,
"V": 0.068,
}
AAlist = [
"A",
"C",
"D",
"E",
"F",
"G",
"H",
"I",
"K",
"L",
"M",
"N",
"P",
"Q",
"R",
"S",
"T",
"V",
"W",
"Y",
]
AAfreq: OrderedDict[str, float] = OrderedDict()
AAfreq["A"] = 0.074
AAfreq["R"] = 0.042
AAfreq["N"] = 0.044
AAfreq["D"] = 0.059
AAfreq["C"] = 0.033
AAfreq["Q"] = 0.058
AAfreq["E"] = 0.037
AAfreq["G"] = 0.074
AAfreq["H"] = 0.029
AAfreq["I"] = 0.038
AAfreq["L"] = 0.076
AAfreq["K"] = 0.072
AAfreq["M"] = 0.018
AAfreq["F"] = 0.040
AAfreq["P"] = 0.050
AAfreq["S"] = 0.081
AAfreq["T"] = 0.062
AAfreq["W"] = 0.013
AAfreq["Y"] = 0.033
AAfreq["V"] = 0.068

AAlist = list(AAfreq.keys())


def position_weight_matrix(seqs, pseudoC=AAfreq):
"""Build PWM of a given set of sequences."""
return frequencies(seqs).normalize(pseudocounts=pseudoC)


def frequencies(seqs):
def frequencies(seqs: list[str]):
"""Build counts matrix of a given set of sequences."""
return motifs.create(seqs, alphabet=AAlist).counts
return motifs.create(seqs, alphabet="".join(AAlist)).counts


def InformationContent(seqs):
"""The mean of the PSSM is particularly important becuase its value is equal to the
Kullback-Leibler divergence or relative entropy, and is a measure for the information content
of the motif compared to the background."""
pssm = position_weight_matrix(seqs).log_odds(AAfreq)
return pssm.mean(AAfreq)


def GenerateBinarySeqID(seqs):
def GenerateBinarySeqID(seqs: list[str]) -> np.ndarray:
"""Build matrix with 0s and 1s to identify residue/position pairs for every sequence"""
res = np.zeros((len(seqs), len(AAlist), 11), dtype=bool)
for ii, seq in enumerate(seqs):
Expand All @@ -83,15 +54,15 @@ def GenerateBinarySeqID(seqs):
return res


def BackgroundSeqs(forseqs):
def BackgroundSeqs(forseqs: pd.Series) -> list[str]:
"""Build Background data set with the same proportion of pY, pT, and pS motifs as in the foreground set of sequences.
Note this PsP data set contains 51976 pY, 226131 pS, 81321 pT
Source: https://www.phosphosite.org/staticDownloads.action -
Phosphorylation_site_dataset.gz - Last mod: Wed Dec 04 14:56:35 EST 2019
Cite: Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,
PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926"""
# Get porportion of psite types in foreground set
forw_pYn, forw_pSn, forw_pTn, _ = CountPsiteTypes(forseqs, 5)
forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs.astype(str).tolist())
forw_tot = forw_pYn + forw_pSn + forw_pTn

pYf = forw_pYn / forw_tot
Expand All @@ -106,7 +77,7 @@ def BackgroundSeqs(forseqs):
PsP = PsP[~PsP["SITE_+/-7_AA"].str.contains("X")]
refseqs = list(PsP["SITE_+/-7_AA"])
len_bg = int(len(refseqs))
backg_pYn, _, _, _ = CountPsiteTypes(refseqs, 7)
backg_pYn, _, _ = CountPsiteTypes(refseqs)

# Make sure there are enough pY peptides to meet proportions
if backg_pYn >= len_bg * pYf:
Expand All @@ -126,9 +97,12 @@ def BackgroundSeqs(forseqs):
return bg_seqs


def BackgProportions(refseqs, pYn, pSn, pTn):
def BackgProportions(refseqs: list[str], pYn: int, pSn: int, pTn: int) -> list[str]:
"""Provided the proportions, add peptides to background set."""
y_seqs, s_seqs, t_seqs = [], [], []
y_seqs: list[str] = []
s_seqs: list[str] = []
t_seqs: list[str] = []

pR = ["y", "t", "s"]
for seq in refseqs:
if seq[7] not in pR:
Expand All @@ -144,50 +118,54 @@ def BackgProportions(refseqs, pYn, pSn, pTn):
), "Wrong central AA in background set. Sliced: %s, Full: %s" % (motif, seq)

if motif[5] == "Y" and len(y_seqs) < pYn:
y_seqs.append(Seq(motif))
y_seqs.append(motif)

if motif[5] == "S" and len(s_seqs) < pSn:
s_seqs.append(Seq(motif))
s_seqs.append(motif)

if motif[5] == "T" and len(t_seqs) < pTn:
t_seqs.append(Seq(motif))
t_seqs.append(motif)

return y_seqs + s_seqs + t_seqs


class Binomial:
"""Create a binomial distance distribution."""
"""Definition of the binomial sequence distance distribution."""

def __init__(self, seq, seqs):
def __init__(self, seq: pd.Series, seqs: list[str]):
# Background sequences
background = position_weight_matrix(BackgroundSeqs(seq))
self.background = (
np.array([background[AA] for AA in AAlist]),
GenerateBinarySeqID(seqs),
)
self.background = np.array([background[AA] for AA in AAlist])
self.foreground: np.ndarray = GenerateBinarySeqID(seqs)

self.logWeights = 0.0
assert np.all(np.isfinite(self.background[0]))
assert np.all(np.isfinite(self.background[1]))
assert np.all(np.isfinite(self.background))
assert np.all(np.isfinite(self.foreground))

def from_summaries(self, weightsIn):
def from_summaries(self, weightsIn: np.ndarray):
"""Update the underlying distribution."""
k = np.einsum("kji,kl->lji", self.background[1], weightsIn)
k = np.einsum("kji,kl->lji", self.foreground, weightsIn)
betaA = np.sum(weightsIn, axis=0)[:, None, None] - k
betaA = np.clip(betaA, 0.001, np.inf)
probmat = sc.betainc(betaA, k + 1, 1 - self.background[0])
tempp = np.einsum("ijk,ljk->il", self.background[1], probmat)
probmat = sc.betainc(betaA, k + 1, 1 - self.background)
tempp = np.einsum("ijk,ljk->il", self.foreground, probmat)
self.logWeights = np.log(tempp)


def CountPsiteTypes(X, cA):
"""Count number of different phosphorylation types in a MS data set."""
def CountPsiteTypes(X: list[str]) -> tuple[int, int, int]:
"""Count the number of different phosphorylation types in an MS data set.
Args:
X (list[str]): The list of peptide sequences.
Returns:
tuple[int, int, int]: The number of pY, pS, and pT sites.
"""
# Find the center amino acid
cA = int((len(X[0]) - 1) / 2)

positionSeq = [seq[cA] for seq in X]
pS = positionSeq.count("s")
pT = positionSeq.count("t")
pY = positionSeq.count("y")

countt = [sum(map(str.islower, seq)) for seq in X]
primed = sum(map(lambda i: i > 1, countt))

return pY, pS, pT, primed
return pY, pS, pT
42 changes: 20 additions & 22 deletions ddmc/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,14 @@
from fancyimpute import SoftImpute


# pylint: disable=W0201


class DDMC(GaussianMixture):
"""Cluster peptides by both sequence similarity and data behavior following an
expectation-maximization algorithm. SeqWeight specifies which method's expectation step
should have a larger effect on the peptide assignment."""

def __init__(
self,
info: pd.DataFrame,
sequences: pd.Series,
n_components: int,
SeqWeight: float,
distance_method: Literal["PAM250", "Binomial"],
Expand All @@ -39,20 +36,20 @@ def __init__(
random_state=random_state,
)

self.info = info
self.sequences = sequences
self.SeqWeight = SeqWeight
self.distance_method = distance_method

seqs = [s.upper() for s in info["Sequence"]]
seqs = sequences.str.upper().to_list()

if distance_method == "PAM250":
self.seqDist: PAM250 | Binomial = PAM250(seqs)
elif distance_method == "Binomial":
self.seqDist = Binomial(info["Sequence"], seqs)
self.seqDist = Binomial(sequences, seqs)
else:
raise ValueError("Wrong distance type.")

def _estimate_log_prob(self, X):
def _estimate_log_prob(self, X: np.ndarray):
"""Estimate the log-probability of each point in each cluster."""
logp = super()._estimate_log_prob(X) # Do the regular work

Expand All @@ -62,7 +59,7 @@ def _estimate_log_prob(self, X):

return logp

def _m_step(self, X, log_resp):
def _m_step(self, X: np.ndarray, log_resp: np.ndarray):
"""M step.
Parameters
----------
Expand All @@ -86,7 +83,7 @@ def _m_step(self, X, log_resp):
# Do sequence m step
self.seqDist.from_summaries(np.exp(log_resp))

def fit(self, X, y=None):
def fit(self, X: np.ndarray | pd.DataFrame, y=None):
"""Compute EM clustering"""
d = np.array(X.T)

Expand Down Expand Up @@ -132,12 +129,12 @@ def wins(self, X):

return (dataDist, seqDist)

def transform(self):
def transform(self) -> np.ndarray:
"""Calculate cluster averages."""
check_is_fitted(self, ["means_"])
return self.means_.T

def impute(self, X):
def impute(self, X: np.ndarray) -> np.ndarray:
"""Impute a matching dataset."""
X = X.copy()
labels = self.labels() # cluster assignments
Expand All @@ -157,7 +154,7 @@ def pssms(self, PsP_background=False):
"""
pssms, cl_num = [], []
if PsP_background:
bg_seqs = BackgroundSeqs(self.info["Sequence"])
bg_seqs = BackgroundSeqs(self.sequences)

Check warning on line 157 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L157

Added line #L157 was not covered by tests
back_pssm = compute_control_pssm(bg_seqs)
else:
back_pssm = np.zeros((len(AAlist), 11), dtype=float)
Expand All @@ -171,7 +168,7 @@ def pssms(self, PsP_background=False):

# Compute PSSM
pssm = np.zeros((len(AAlist), 11), dtype=float)
for jj, seq in enumerate(self.info["Sequence"]):
for jj, seq in enumerate(self.sequences):

Check warning on line 171 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L171

Added line #L171 was not covered by tests
seq = seq.upper()
for kk, aa in enumerate(seq):
pssm[AAlist.index(aa), kk] += self.scores_[jj, ii - 1]
Expand All @@ -187,8 +184,9 @@ def pssms(self, PsP_background=False):
back_pssm[:, pos] /= np.mean(back_pssm[:, pos])

# Normalize to background PSSM to account for AA frequencies per position
with np.seterr(divide='ignore', invalid='ignore'):
pssm /= back_pssm.copy()
old_settings = np.seterr(divide="ignore", invalid="ignore")
pssm /= back_pssm.copy()
np.seterr(**old_settings)

Check warning on line 189 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L187-L189

Added lines #L187 - L189 were not covered by tests

# Log2 transform
pssm = np.ma.log2(pssm)
Expand All @@ -198,7 +196,7 @@ def pssms(self, PsP_background=False):
pssm.index = AAlist

# Normalize phosphoacceptor position to frequency
df = pd.DataFrame(self.info["Sequence"].str.upper())
df = pd.DataFrame(self.sequences.str.upper())

Check warning on line 199 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L199

Added line #L199 was not covered by tests
df["Cluster"] = self.labels()
clSeq = df[df["Cluster"] == ii]["Sequence"]
clSeq = pd.DataFrame(frequencies(clSeq)).T
Expand Down Expand Up @@ -236,24 +234,24 @@ def predict_UpstreamKinases(
table.insert(0, "Kinase", list(PSPLdict().keys()))
return table

def predict(self):
def predict(self) -> np.ndarray:
"""Provided the current model parameters, predict the cluster each peptide belongs to."""
check_is_fitted(self, ["scores_"])
return np.argmax(self.scores_, axis=1)

def labels(self):
def labels(self) -> np.ndarray:
"""Find cluster assignment with highest likelihood for each peptide."""
return self.predict() + 1

def score(self):
def score(self) -> float:
"""Generate score of the fitting."""
check_is_fitted(self, ["lower_bound_"])
return self.lower_bound_

def get_params(self, deep=True):
def get_params(self, deep: bool = True):
"""Returns a dict of the estimator parameters with their values."""
dictt = super().get_params(deep=deep)
dictt["info"] = self.info
dictt["sequences"] = self.sequences
dictt["SeqWeight"] = self.SeqWeight
dictt["distance_method"] = self.distance_method
return dictt
Loading

0 comments on commit 7fb1c7a

Please sign in to comment.