Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor and tidy #534

Closed
wants to merge 11 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 0 additions & 25 deletions .github/workflows/autopep8.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- name: Install dependencies
run: poetry install
- name: Build figures
run: make -j 3 all
run: make -i all
- name: Upload files
uses: actions/upload-artifact@v4
with:
Expand Down
3 changes: 3 additions & 0 deletions WeightSearch.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
,Weight,DDMC,Average,Zero,PCA
0,0.0,,,,
1,100.0,,,,
152 changes: 66 additions & 86 deletions ddmc/binomial.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,76 +5,47 @@
import pandas as pd
import scipy.special as sc
from Bio import motifs
from Bio.Seq import Seq
from collections import OrderedDict

# Binomial method inspired by Schwartz & Gygi's Nature Biotech 2005: doi:10.1038/nbt1146

# Amino acids frequencies (http://www.tiem.utk.edu/~gross/bioed/webmodules/aminoacid.htm) used for pseudocounts,
AAfreq = {
"A": 0.074,
"R": 0.042,
"N": 0.044,
"D": 0.059,
"C": 0.033,
"Q": 0.058,
"E": 0.037,
"G": 0.074,
"H": 0.029,
"I": 0.038,
"L": 0.076,
"K": 0.072,
"M": 0.018,
"F": 0.04,
"P": 0.05,
"S": 0.081,
"T": 0.062,
"W": 0.013,
"Y": 0.033,
"V": 0.068,
}
AAlist = [
"A",
"C",
"D",
"E",
"F",
"G",
"H",
"I",
"K",
"L",
"M",
"N",
"P",
"Q",
"R",
"S",
"T",
"V",
"W",
"Y",
]
AAfreq: OrderedDict[str, float] = OrderedDict()
AAfreq["A"] = 0.074
AAfreq["R"] = 0.042
AAfreq["N"] = 0.044
AAfreq["D"] = 0.059
AAfreq["C"] = 0.033
AAfreq["Q"] = 0.058
AAfreq["E"] = 0.037
AAfreq["G"] = 0.074
AAfreq["H"] = 0.029
AAfreq["I"] = 0.038
AAfreq["L"] = 0.076
AAfreq["K"] = 0.072
AAfreq["M"] = 0.018
AAfreq["F"] = 0.040
AAfreq["P"] = 0.050
AAfreq["S"] = 0.081
AAfreq["T"] = 0.062
AAfreq["W"] = 0.013
AAfreq["Y"] = 0.033
AAfreq["V"] = 0.068

AAlist = list(AAfreq.keys())


def position_weight_matrix(seqs, pseudoC=AAfreq):
"""Build PWM of a given set of sequences."""
return frequencies(seqs).normalize(pseudocounts=pseudoC)


def frequencies(seqs):
def frequencies(seqs: list[str]):
"""Build counts matrix of a given set of sequences."""
return motifs.create(seqs, alphabet=AAlist).counts
return motifs.create(seqs, alphabet="".join(AAlist)).counts


def InformationContent(seqs):
"""The mean of the PSSM is particularly important becuase its value is equal to the
Kullback-Leibler divergence or relative entropy, and is a measure for the information content
of the motif compared to the background."""
pssm = position_weight_matrix(seqs).log_odds(AAfreq)
return pssm.mean(AAfreq)


def GenerateBinarySeqID(seqs):
def GenerateBinarySeqID(seqs: list[str]) -> np.ndarray:
"""Build matrix with 0s and 1s to identify residue/position pairs for every sequence"""
res = np.zeros((len(seqs), len(AAlist), 11), dtype=bool)
for ii, seq in enumerate(seqs):
Expand All @@ -83,15 +54,15 @@ def GenerateBinarySeqID(seqs):
return res


def BackgroundSeqs(forseqs):
def BackgroundSeqs(forseqs: np.ndarray[str]) -> list[str]:
"""Build Background data set with the same proportion of pY, pT, and pS motifs as in the foreground set of sequences.
Note this PsP data set contains 51976 pY, 226131 pS, 81321 pT
Source: https://www.phosphosite.org/staticDownloads.action -
Phosphorylation_site_dataset.gz - Last mod: Wed Dec 04 14:56:35 EST 2019
Cite: Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,
PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926"""
# Get porportion of psite types in foreground set
forw_pYn, forw_pSn, forw_pTn, _ = CountPsiteTypes(forseqs, 5)
forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs)
forw_tot = forw_pYn + forw_pSn + forw_pTn

pYf = forw_pYn / forw_tot
Expand All @@ -106,7 +77,7 @@ def BackgroundSeqs(forseqs):
PsP = PsP[~PsP["SITE_+/-7_AA"].str.contains("X")]
refseqs = list(PsP["SITE_+/-7_AA"])
len_bg = int(len(refseqs))
backg_pYn, _, _, _ = CountPsiteTypes(refseqs, 7)
backg_pYn, _, _ = CountPsiteTypes(refseqs)

# Make sure there are enough pY peptides to meet proportions
if backg_pYn >= len_bg * pYf:
Expand All @@ -126,9 +97,12 @@ def BackgroundSeqs(forseqs):
return bg_seqs


def BackgProportions(refseqs, pYn, pSn, pTn):
def BackgProportions(refseqs: list[str], pYn: int, pSn: int, pTn: int) -> list[str]:
"""Provided the proportions, add peptides to background set."""
y_seqs, s_seqs, t_seqs = [], [], []
y_seqs: list[str] = []
s_seqs: list[str] = []
t_seqs: list[str] = []

pR = ["y", "t", "s"]
for seq in refseqs:
if seq[7] not in pR:
Expand All @@ -144,50 +118,56 @@ def BackgProportions(refseqs, pYn, pSn, pTn):
), "Wrong central AA in background set. Sliced: %s, Full: %s" % (motif, seq)

if motif[5] == "Y" and len(y_seqs) < pYn:
y_seqs.append(Seq(motif))
y_seqs.append(motif)

if motif[5] == "S" and len(s_seqs) < pSn:
s_seqs.append(Seq(motif))
s_seqs.append(motif)

if motif[5] == "T" and len(t_seqs) < pTn:
t_seqs.append(Seq(motif))
t_seqs.append(motif)

return y_seqs + s_seqs + t_seqs


class Binomial:
"""Create a binomial distance distribution."""
"""Definition of the binomial sequence distance distribution."""

def __init__(self, seq, seqs):
def __init__(self, seqs: np.ndarray[str]):
# Background sequences
background = position_weight_matrix(BackgroundSeqs(seq))
self.background = (
np.array([background[AA] for AA in AAlist]),
GenerateBinarySeqID(seqs),
)
background = position_weight_matrix(BackgroundSeqs(seqs))
self.background = np.array([background[AA] for AA in AAlist])
self.foreground: np.ndarray = GenerateBinarySeqID(seqs)

self.logWeights = 0.0
assert np.all(np.isfinite(self.background[0]))
assert np.all(np.isfinite(self.background[1]))
assert np.all(np.isfinite(self.background))
assert np.all(np.isfinite(self.foreground))

def from_summaries(self, weightsIn):
def from_summaries(self, weightsIn: np.ndarray):
"""Update the underlying distribution."""
k = np.einsum("kji,kl->lji", self.background[1], weightsIn)
k = np.einsum("kji,kl->lji", self.foreground, weightsIn)
betaA = np.sum(weightsIn, axis=0)[:, None, None] - k
betaA = np.clip(betaA, 0.001, np.inf)
probmat = sc.betainc(betaA, k + 1, 1 - self.background[0])
tempp = np.einsum("ijk,ljk->il", self.background[1], probmat)
probmat = sc.betainc(betaA, k + 1, 1 - self.background)
tempp = np.einsum("ijk,ljk->il", self.foreground, probmat)
self.logWeights = np.log(tempp)


def CountPsiteTypes(X, cA):
"""Count number of different phosphorylation types in a MS data set."""
positionSeq = [seq[cA] for seq in X]
pS = positionSeq.count("s")
pT = positionSeq.count("t")
pY = positionSeq.count("y")
def CountPsiteTypes(X: np.ndarray[str]) -> tuple[int, int, int]:
"""Count the number of different phosphorylation types in an MS data set.

Args:
X (list[str]): The list of peptide sequences.

Returns:
tuple[int, int, int]: The number of pY, pS, and pT sites.
"""
X = np.char.upper(X)

countt = [sum(map(str.islower, seq)) for seq in X]
primed = sum(map(lambda i: i > 1, countt))
# Find the center amino acid
cA = int((len(X[0]) - 1) / 2)

return pY, pS, pT, primed
phospho_aminos = [seq[cA] for seq in X]
pS = phospho_aminos.count("S")
pT = phospho_aminos.count("T")
pY = phospho_aminos.count("Y")
return pY, pS, pT
Loading
Loading