Skip to content

Commit

Permalink
Cleanup to get figures building again (#533)
Browse files Browse the repository at this point in the history
* Initial pass

* Some fixes

* Lots more cleanup

* More cleanup

* Fix MS7

* clean figureM2

* rename for consistency

* stage

* class changes

* figure 2 and 3

* revert figure 4

* remake figure 4

* remove figure 4 copy

* remake figure 5

* remake figure 6

* remake figure 7

* refactor

* figure S2

* figure S3

* refactor pca functions

* move ebdt to datasets

* figure S4, renaming

* figure MS5

* figure s7

* delete many things

* fix tests

* delete unused function, can always restore

* add fit() documentation

* update README and fix validation

* Update build.yml

* make figures more like before

* small fix

* small cv fold fixes

* update random states

---------

Co-authored-by: Armaan Abraham <[email protected]>
  • Loading branch information
aarmey and armaan-abraham authored Feb 10, 2024
1 parent d3fcbd6 commit 123b392
Show file tree
Hide file tree
Showing 32 changed files with 1,687 additions and 3,496 deletions.
25 changes: 0 additions & 25 deletions .github/workflows/autopep8.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- name: Install dependencies
run: poetry install
- name: Build figures
run: make -j 3 all
run: make all
- name: Upload files
uses: actions/upload-artifact@v4
with:
Expand Down
46 changes: 45 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,48 @@
![Build](https://github.com/meyer-lab/resistance-MS/workflows/Build/badge.svg)
![Test](https://github.com/meyer-lab/resistance-MS/workflows/Test/badge.svg)

Predicting cell phenotype with mass spec data.
Clusters peptides based on both sequence similarity and phosphorylation signal across samples.


## Usage

```
>>> from ddmc.clustering import DDMC
>>> # load dataset as p_signal...
>>> p_signal
Sample 1 Sample 2 Sample 3 Sample 4 Sample 5
Sequence
AAAAAsQQGSA -3.583614 NaN -0.662659 -1.320029 -0.730832
AAAAGsASPRS -0.174779 -1.796899 0.891798 -3.092941 2.394315
AAAAGsGPSPP -1.951552 -2.937095 2.692876 -2.344894 0.556615
AAAAGsGPsPP 3.666782 NaN -2.081231 0.989394 NaN
AAAAPsPGSAR 1.753855 -2.135835 0.896778 3.369230 2.020967
... ... ... ... ... ...
YYSPYsVSGSG -3.502871 2.831169 3.383486 2.589559 3.624968
YYSSRsQSGGY -0.870365 0.887317 2.600291 -0.374107 3.285459
YYTAGyNSPVK 0.249539 2.047050 -0.286033 0.042650 2.863317
YYTSAsGDEMV 0.662787 0.135326 -1.004350 0.879398 -1.609894
YYYSSsEDEDS NaN -1.101679 -3.273987 -0.872370 -1.735891
>>> p_signal.index # p_signal.index contains the peptide sequences
Index(['AAAAAsQQGSA', 'AAAAGsASPRS', 'AAAAGsGPSPP', 'AAAAGsGPsPP',
'AAAAPsPGSAR', 'AAAAPsPGsAR', 'AAAARsLLNHT', 'AAAARsPDRNL',
'AAAARtQAPPT', 'AAADFsDEDED',
...
'YYDRMySYPAR', 'YYEDDsEGEDI', 'YYGGGsEGGRA', 'YYRNNsFTAPS',
'YYSPDyGLPSP', 'YYSPYsVSGSG', 'YYSSRsQSGGY', 'YYTAGyNSPVK',
'YYTSAsGDEMV', 'YYYSSsEDEDS'],
dtype='object', name='Sequence', length=30561)
>>> model = DDMC(n_components=2, seq_weight=100).fit(p_signal) # fit model
>>> model.transform(as_df=True) # get cluster centers
0 1
Sample 1 0.017644 0.370375
Sample 2 -0.003625 -0.914869
Sample 3 -0.087624 -0.682140
Sample 4 0.014644 -0.658907
Sample 5 0.023885 0.196063
```
3 changes: 3 additions & 0 deletions WeightSearch.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
,Weight,DDMC,Average,Zero,PCA
0,0.0,,,,
1,100.0,,,,
152 changes: 66 additions & 86 deletions ddmc/binomial.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,76 +5,47 @@
import pandas as pd
import scipy.special as sc
from Bio import motifs
from Bio.Seq import Seq
from collections import OrderedDict

# Binomial method inspired by Schwartz & Gygi's Nature Biotech 2005: doi:10.1038/nbt1146

# Amino acids frequencies (http://www.tiem.utk.edu/~gross/bioed/webmodules/aminoacid.htm) used for pseudocounts,
AAfreq = {
"A": 0.074,
"R": 0.042,
"N": 0.044,
"D": 0.059,
"C": 0.033,
"Q": 0.058,
"E": 0.037,
"G": 0.074,
"H": 0.029,
"I": 0.038,
"L": 0.076,
"K": 0.072,
"M": 0.018,
"F": 0.04,
"P": 0.05,
"S": 0.081,
"T": 0.062,
"W": 0.013,
"Y": 0.033,
"V": 0.068,
}
AAlist = [
"A",
"C",
"D",
"E",
"F",
"G",
"H",
"I",
"K",
"L",
"M",
"N",
"P",
"Q",
"R",
"S",
"T",
"V",
"W",
"Y",
]
AAfreq: OrderedDict[str, float] = OrderedDict()
AAfreq["A"] = 0.074
AAfreq["R"] = 0.042
AAfreq["N"] = 0.044
AAfreq["D"] = 0.059
AAfreq["C"] = 0.033
AAfreq["Q"] = 0.058
AAfreq["E"] = 0.037
AAfreq["G"] = 0.074
AAfreq["H"] = 0.029
AAfreq["I"] = 0.038
AAfreq["L"] = 0.076
AAfreq["K"] = 0.072
AAfreq["M"] = 0.018
AAfreq["F"] = 0.040
AAfreq["P"] = 0.050
AAfreq["S"] = 0.081
AAfreq["T"] = 0.062
AAfreq["W"] = 0.013
AAfreq["Y"] = 0.033
AAfreq["V"] = 0.068

AAlist = list(AAfreq.keys())


def position_weight_matrix(seqs, pseudoC=AAfreq):
"""Build PWM of a given set of sequences."""
return frequencies(seqs).normalize(pseudocounts=pseudoC)


def frequencies(seqs):
def frequencies(seqs: list[str]):
"""Build counts matrix of a given set of sequences."""
return motifs.create(seqs, alphabet=AAlist).counts
return motifs.create(seqs, alphabet="".join(AAlist)).counts


def InformationContent(seqs):
"""The mean of the PSSM is particularly important becuase its value is equal to the
Kullback-Leibler divergence or relative entropy, and is a measure for the information content
of the motif compared to the background."""
pssm = position_weight_matrix(seqs).log_odds(AAfreq)
return pssm.mean(AAfreq)


def GenerateBinarySeqID(seqs):
def GenerateBinarySeqID(seqs: list[str]) -> np.ndarray:
"""Build matrix with 0s and 1s to identify residue/position pairs for every sequence"""
res = np.zeros((len(seqs), len(AAlist), 11), dtype=bool)
for ii, seq in enumerate(seqs):
Expand All @@ -83,15 +54,15 @@ def GenerateBinarySeqID(seqs):
return res


def BackgroundSeqs(forseqs):
def BackgroundSeqs(forseqs: np.ndarray[str]) -> list[str]:
"""Build Background data set with the same proportion of pY, pT, and pS motifs as in the foreground set of sequences.
Note this PsP data set contains 51976 pY, 226131 pS, 81321 pT
Source: https://www.phosphosite.org/staticDownloads.action -
Phosphorylation_site_dataset.gz - Last mod: Wed Dec 04 14:56:35 EST 2019
Cite: Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,
PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926"""
# Get porportion of psite types in foreground set
forw_pYn, forw_pSn, forw_pTn, _ = CountPsiteTypes(forseqs, 5)
forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs)
forw_tot = forw_pYn + forw_pSn + forw_pTn

pYf = forw_pYn / forw_tot
Expand All @@ -106,7 +77,7 @@ def BackgroundSeqs(forseqs):
PsP = PsP[~PsP["SITE_+/-7_AA"].str.contains("X")]
refseqs = list(PsP["SITE_+/-7_AA"])
len_bg = int(len(refseqs))
backg_pYn, _, _, _ = CountPsiteTypes(refseqs, 7)
backg_pYn, _, _ = CountPsiteTypes(refseqs)

# Make sure there are enough pY peptides to meet proportions
if backg_pYn >= len_bg * pYf:
Expand All @@ -126,9 +97,12 @@ def BackgroundSeqs(forseqs):
return bg_seqs


def BackgProportions(refseqs, pYn, pSn, pTn):
def BackgProportions(refseqs: list[str], pYn: int, pSn: int, pTn: int) -> list[str]:
"""Provided the proportions, add peptides to background set."""
y_seqs, s_seqs, t_seqs = [], [], []
y_seqs: list[str] = []
s_seqs: list[str] = []
t_seqs: list[str] = []

pR = ["y", "t", "s"]
for seq in refseqs:
if seq[7] not in pR:
Expand All @@ -144,50 +118,56 @@ def BackgProportions(refseqs, pYn, pSn, pTn):
), "Wrong central AA in background set. Sliced: %s, Full: %s" % (motif, seq)

if motif[5] == "Y" and len(y_seqs) < pYn:
y_seqs.append(Seq(motif))
y_seqs.append(motif)

if motif[5] == "S" and len(s_seqs) < pSn:
s_seqs.append(Seq(motif))
s_seqs.append(motif)

if motif[5] == "T" and len(t_seqs) < pTn:
t_seqs.append(Seq(motif))
t_seqs.append(motif)

return y_seqs + s_seqs + t_seqs


class Binomial:
"""Create a binomial distance distribution."""
"""Definition of the binomial sequence distance distribution."""

def __init__(self, seq, seqs):
def __init__(self, seqs: np.ndarray[str]):
# Background sequences
background = position_weight_matrix(BackgroundSeqs(seq))
self.background = (
np.array([background[AA] for AA in AAlist]),
GenerateBinarySeqID(seqs),
)
background = position_weight_matrix(BackgroundSeqs(seqs))
self.background = np.array([background[AA] for AA in AAlist])
self.foreground: np.ndarray = GenerateBinarySeqID(seqs)

self.logWeights = 0.0
assert np.all(np.isfinite(self.background[0]))
assert np.all(np.isfinite(self.background[1]))
assert np.all(np.isfinite(self.background))
assert np.all(np.isfinite(self.foreground))

def from_summaries(self, weightsIn):
def from_summaries(self, weightsIn: np.ndarray):
"""Update the underlying distribution."""
k = np.einsum("kji,kl->lji", self.background[1], weightsIn)
k = np.einsum("kji,kl->lji", self.foreground, weightsIn)
betaA = np.sum(weightsIn, axis=0)[:, None, None] - k
betaA = np.clip(betaA, 0.001, np.inf)
probmat = sc.betainc(betaA, k + 1, 1 - self.background[0])
tempp = np.einsum("ijk,ljk->il", self.background[1], probmat)
probmat = sc.betainc(betaA, k + 1, 1 - self.background)
tempp = np.einsum("ijk,ljk->il", self.foreground, probmat)
self.logWeights = np.log(tempp)


def CountPsiteTypes(X, cA):
"""Count number of different phosphorylation types in a MS data set."""
positionSeq = [seq[cA] for seq in X]
pS = positionSeq.count("s")
pT = positionSeq.count("t")
pY = positionSeq.count("y")
def CountPsiteTypes(X: np.ndarray[str]) -> tuple[int, int, int]:
"""Count the number of different phosphorylation types in an MS data set.
Args:
X (list[str]): The list of peptide sequences.
Returns:
tuple[int, int, int]: The number of pY, pS, and pT sites.
"""
X = np.char.upper(X)

countt = [sum(map(str.islower, seq)) for seq in X]
primed = sum(map(lambda i: i > 1, countt))
# Find the center amino acid
cA = int((len(X[0]) - 1) / 2)

return pY, pS, pT, primed
phospho_aminos = [seq[cA] for seq in X]
pS = phospho_aminos.count("S")
pT = phospho_aminos.count("T")
pY = phospho_aminos.count("Y")
return pY, pS, pT
Loading

0 comments on commit 123b392

Please sign in to comment.