Skip to content

Commit

Permalink
delete many things
Browse files Browse the repository at this point in the history
  • Loading branch information
armaan-abraham committed Feb 2, 2024
1 parent 54d3779 commit f6a0b34
Show file tree
Hide file tree
Showing 6 changed files with 18 additions and 229 deletions.
3 changes: 1 addition & 2 deletions ddmc/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@

class DDMC(GaussianMixture):
"""Cluster peptides by both sequence similarity and condition-wise phosphorylation following an
expectation-maximization algorithm. SeqWeight specifies which method's expectation step
should have a larger effect on the peptide assignment."""
expectation-maximization algorithm."""

def __init__(
self,
Expand Down
4 changes: 2 additions & 2 deletions ddmc/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd
from typing import Sequence

from ddmc.motifs import DictProteomeNameToSeq
from ddmc.motifs import get_proteome_name_to_seq

DATA_DIR = Path(__file__).parent / "data"

Expand Down Expand Up @@ -136,7 +136,7 @@ def pos_to_motif(self, genes, pos):
"""Map p-site sequence position to uniprot's proteome and extract motifs."""
proteome = open(DATA_DIR / "Sequence_analysis" / "proteome_uniprot2019.fa", "r")
motif_size = 5
ProteomeDict = DictProteomeNameToSeq(proteome, n="gene")
ProteomeDict = get_proteome_name_to_seq(proteome, n="gene")
motifs = []
del_GeneToPos = []
for gene, pos in list(zip(genes, pos)):
Expand Down
33 changes: 1 addition & 32 deletions ddmc/logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import StratifiedKFold, RepeatedKFold


def plot_cluster_regression_coefficients(ax: Axes, lr, hue=None, title=False):
"""Plot LR coeficients of clusters."""
coefs_ = pd.DataFrame(lr.coef_.T, columns=["LR Coefficient"])
Expand All @@ -20,8 +21,6 @@ def plot_cluster_regression_coefficients(ax: Axes, lr, hue=None, title=False):
hue = "Sample"
else:
coefs_["Cluster"] = np.arange(coefs_.shape[0])
if xlabels is not None:
coefs_["Cluster"] = xlabels
p = sns.barplot(
ax=ax,
x="Cluster",
Expand All @@ -38,36 +37,6 @@ def plot_cluster_regression_coefficients(ax: Axes, lr, hue=None, title=False):
ax.set_title(title)


def plotPredictionProbabilities(ax, lr, dd, yy):
    """Plot LR predictions and prediction probabilities.

    Draws, on `ax`, a scatter of the predicted class per sample (coloured by
    whether the prediction equals `yy`), a gray line of the second column of
    `lr.predict_proba(dd)`, and a dashed red horizontal line at 0.5.

    Parameters:
        ax: matplotlib axes to draw on.
        lr: fitted classifier exposing `predict` and `predict_proba`.
        dd: feature matrix handed to the classifier.
        yy: true labels; the comparison result must expose `.values`
            (presumably a pandas Series -- confirm against callers).
    """
    probabilities = lr.predict_proba(dd)[:, 1]
    # Predict once and reuse the result for both the correctness flag and the
    # plotted class (the previous version ran the classifier twice).
    predictions = lr.predict(dd)
    is_correct = predictions == yy

    res_ = pd.DataFrame()
    res_["y, p(x)"] = probabilities
    res_["Correct_Prediction"] = is_correct.values
    res_["Prediction"] = predictions.astype("int")
    # 1-based sample numbering for the x axis.
    res_["Patients"] = np.arange(res_.shape[0]) + 1

    sns.scatterplot(
        ax=ax, x="Patients", y="Prediction", data=res_, hue="Correct_Prediction"
    )
    sns.lineplot(ax=ax, x="Patients", y="y, p(x)", data=res_, marker="s", color="gray")
    ax.axhline(0.5, ls="--", color="r")


def plotConfusionMatrix(ax, lr, dd, yy):
    """Draw the confusion matrix of actual (`yy`) versus predicted (`lr.predict(dd)`) labels.

    The matrix is rendered as an image on `ax`, one tick is placed per class on
    both axes, and every cell is annotated with its count in white text.
    """
    matrix = confusion_matrix(yy, lr.predict(dd))
    n_classes = lr.classes_.shape[0]
    class_ticks = range(n_classes)

    ax.imshow(matrix)
    ax.grid(False)
    ax.set_xlabel("Predicted outputs", color="black")
    ax.set_ylabel("Actual outputs", color="black")
    ax.xaxis.set(ticks=class_ticks)
    ax.yaxis.set(ticks=class_ticks)

    # Row index is the actual class (y position); column index is the predicted class (x position).
    for row, col in np.ndindex(n_classes, n_classes):
        ax.text(col, row, matrix[row, col], ha="center", va="center", color="white")


def plot_roc(
classifier,
X: np.ndarray,
Expand Down
73 changes: 14 additions & 59 deletions ddmc/motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,7 @@
from .binomial import AAlist


def MapMotifs(X, names):
    """Generate pY motifs for pre-processing.

    Passes `names` and the suffix-stripped sequences of `X` (see `FormatSeq`) to
    `GeneratingKinaseMotifs`, keeps the rows of `X` at the returned positional
    indices, writes the returned names and motifs into the "Gene" and "Sequence"
    columns, and inserts the returned positions as a "Position" column at
    column index 3.

    Returns:
        The resulting rows whose "Sequence" does not contain "-".
    """
    names, seqs, pXpos, Xidx = GeneratingKinaseMotifs(names, FormatSeq(X))
    # Work on an explicit copy: assigning columns onto an `.iloc` selection
    # otherwise triggers pandas' chained-assignment warning, and the caller's
    # frame must not be touched.
    X = X.iloc[Xidx, :].copy()
    X["Gene"] = names
    X["Sequence"] = seqs
    X.insert(3, "Position", pXpos)
    return X[~X["Sequence"].str.contains("-")]


def FormatName(X):
    """Split the header strings in the first column of `X` into protein name and gene name.

    For every header, the protein name is the text before the first "OS" and the
    gene name is the text between "GN=" and " PE"; both are whitespace-stripped.

    Returns:
        A `(full, gene)` pair of lists, one entry per row of `X`.
    """
    full, gene = [], []
    for header in X.iloc[:, 0]:
        full.append(header.split("OS")[0].strip())
        after_gene_tag = header.split("GN=")[1]
        gene.append(after_gene_tag.split(" PE")[0].strip())
    return full, gene


def FormatSeq(X):
    """Drop everything from the first "-" onward in each entry of X["Sequence"].

    This removes trailing "-1"/"-2" style suffixes before mapping to uniprot's proteome.

    Returns:
        A list with one trimmed string per sequence.
    """
    trimmed = []
    for raw_seq in X["Sequence"]:
        head, _, _ = raw_seq.partition("-")
        trimmed.append(head)
    return trimmed


def DictProteomeNameToSeq(X, n):
def get_proteome_name_to_seq(X, n):
"""To generate proteom's dictionary"""
DictProtToSeq_UP = {}
for rec2 in SeqIO.parse(X, "fasta"):
Expand All @@ -48,17 +26,17 @@ def DictProteomeNameToSeq(X, n):
return DictProtToSeq_UP


def get_keys_by_value(dictionary, value):
    """Find the keys of a dictionary whose stored entry contains a given value.

    Membership is tested with `in`, so for string entries this is a substring
    match and for list/set entries an element match.

    Parameters:
        dictionary: mapping to search.
        value: item looked for inside each stored entry.

    Returns:
        A list of matching keys in the dictionary's iteration order (empty if
        nothing matches).
    """
    return [key for key, contents in dictionary.items() if value in contents]


# Backward-compatible alias for the pre-rename spelling.
getKeysByValue = get_keys_by_value


def MatchProtNames(ProteomeDict, MS_names, MS_seqs):
def match_protein_names(ProteomeDict, MS_names, MS_seqs):
"""Match protein names of MS and Uniprot's proteome."""
matchedNames, seqs, Xidx = [], [], []
counter = 0
Expand All @@ -71,7 +49,7 @@ def MatchProtNames(ProteomeDict, MS_names, MS_seqs):
matchedNames.append(MS_name)
else:
try:
newname = getKeysByValue(ProteomeDict, MS_seqU)[0]
newname = get_keys_by_value(ProteomeDict, MS_seqU)[0]
assert MS_seqU in ProteomeDict[newname]
Xidx.append(i)
seqs.append(MS_seq)
Expand All @@ -86,7 +64,7 @@ def MatchProtNames(ProteomeDict, MS_names, MS_seqs):
return matchedNames, seqs, Xidx


def findmotif(MS_seq, MS_name, ProteomeDict, motif_size):
def find_motif(MS_seq, MS_name, ProteomeDict, motif_size):
"""For a given MS peptide, finds it in the ProteomeDict, and maps the +/-5 AA from the p-site, accounting
for peptides phosphorylated multiple times concurrently."""
MS_seqU = MS_seq.upper()
Expand Down Expand Up @@ -114,7 +92,7 @@ def findmotif(MS_seq, MS_name, ProteomeDict, motif_size):
elif "t" in MS_seq or "s" in MS_seq:
DoS_idx = list(re.compile("y|t|s").finditer(MS_seq))
assert len(DoS_idx) != 0
mappedMotif, pidx = makeMotif(
mappedMotif, pidx = make_motif(
UP_seq, MS_seq, motif_size, y_idx, center_idx, DoS_idx
)
if len(pidx) == 1:
Expand All @@ -130,7 +108,7 @@ def findmotif(MS_seq, MS_name, ProteomeDict, motif_size):
DoS_idx = None
if len(pTS_idx) > 1:
DoS_idx = pTS_idx[1:]
mappedMotif, pidx = makeMotif(
mappedMotif, pidx = make_motif(
UP_seq, MS_seq, motif_size, ts_idx, center_idx, DoS_idx
)
if len(pidx) == 1:
Expand All @@ -145,12 +123,12 @@ def findmotif(MS_seq, MS_name, ProteomeDict, motif_size):
return pos, mappedMotif


def GeneratingKinaseMotifs(names, seqs):
def generate_kinase_motifs(names, seqs):
"""Main function to generate motifs using 'findmotif'."""
motif_size = 5
proteome = open("./data/Sequence_analysis/proteome_uniprot2019.fa", "r")
ProteomeDict = DictProteomeNameToSeq(proteome, n="gene")
protnames, seqs, Xidx = MatchProtNames(ProteomeDict, names, seqs)
ProteomeDict = get_proteome_name_to_seq(proteome, n="gene")
protnames, seqs, Xidx = match_protein_names(ProteomeDict, names, seqs)
(
MS_names,
mapped_motifs,
Expand All @@ -162,7 +140,7 @@ def GeneratingKinaseMotifs(names, seqs):
)

for i, MS_seq in enumerate(seqs):
pos, mappedMotif = findmotif(MS_seq, protnames[i], ProteomeDict, motif_size)
pos, mappedMotif = find_motif(MS_seq, protnames[i], ProteomeDict, motif_size)
MS_names.append(protnames[i])
mapped_motifs.append(mappedMotif)
uni_pos.append(pos)
Expand All @@ -171,7 +149,7 @@ def GeneratingKinaseMotifs(names, seqs):
return MS_names, mapped_motifs, uni_pos, Xidx


def makeMotif(UP_seq, MS_seq, motif_size, ps_protein_idx, center_motif_idx, DoS_idx):
def make_motif(UP_seq, MS_seq, motif_size, ps_protein_idx, center_motif_idx, DoS_idx):
"""Make a motif out of the matched sequences."""
UP_seq_copy = list(
UP_seq[max(0, ps_protein_idx - motif_size) : ps_protein_idx + motif_size + 1]
Expand Down Expand Up @@ -212,29 +190,6 @@ def makeMotif(UP_seq, MS_seq, motif_size, ps_protein_idx, center_motif_idx, DoS_
return "".join(UP_seq_copy), pidx


def preprocess_seqs(X, pYTS):
    """Keep only gap-free sequences whose residue at index 5 matches `pYTS`.

    Rows whose "Sequence" contains "-" are dropped first; of the remainder, a
    row is kept when the character at position 5 equals the lower-cased `pYTS`.
    """
    has_gap = X["Sequence"].str.contains("-")
    gapless = X[~has_gap]
    center_matches = [motif[5] == pYTS.lower() for motif in gapless["Sequence"]]
    return gapless.iloc[center_matches, :]


def ForegroundSeqs(sequences):
    """Build the foreground data set from motifs centred on "Y", "T", or "S".

    Every motif is upper-cased, checked to contain no gap character and to have
    one of the allowed residues at index 5, then wrapped in a `Seq` using the
    `AAlist` alphabet.

    Raises:
        AssertionError: if a motif contains "-" or has another central residue.
    """
    central_residues = ["Y", "T", "S"]
    foreground = []
    for raw_motif in sequences:
        upper_motif = raw_motif.upper()
        assert "-" not in upper_motif, "gap in motif"
        assert upper_motif[5] in central_residues, "WRONG CENTRAL AMINO ACID"
        foreground.append(Seq(upper_motif, alphabet=AAlist))
    return foreground


def get_pspls() -> tuple[np.ndarray, np.ndarray]:
"""Generate dictionary with kinase name-specificity profile pairs"""
pspls_arr = []
Expand Down
2 changes: 0 additions & 2 deletions ddmc/pam250.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,7 @@ def MotifPam250Scores(seqs: list[str]) -> np.ndarray:
def distanceCalc(seqs, pam250m: np.ndarray):
    """Compute the pairwise score matrix for all sequences.

    For each pair (i, j) with j <= i, the entries of `pam250m` looked up by the
    residue codes of sequence i and sequence j are summed over positions. Only
    the lower triangle (including the diagonal) is filled; the rest stays 0.

    WARNING: the result is stored as int8, which can only hold -128 to 127.
    """
    n_seqs = seqs.shape[0]
    rows, cols = np.tril_indices(n_seqs)
    pair_scores = pam250m[seqs[rows], seqs[cols]].sum(axis=1)

    scores = np.zeros((n_seqs, n_seqs), dtype=np.int8)
    scores[rows, cols] = pair_scores
    return scores
132 changes: 0 additions & 132 deletions ddmc/pre_processing.py

This file was deleted.

0 comments on commit f6a0b34

Please sign in to comment.