Skip to content

Commit

Permalink
Merge branch 'spectrumAI' of https://github.com/DongdongdongW/py-pgatk
Browse files Browse the repository at this point in the history
…into spectrumAI

# Conflicts:
#	requirements.txt
#	setup.py
  • Loading branch information
ypriverol committed Apr 19, 2024
2 parents e46bffe + 9a0a9b4 commit 8fe67f1
Show file tree
Hide file tree
Showing 9 changed files with 295 additions and 123 deletions.
5 changes: 1 addition & 4 deletions pypgatk/commands/blast_get_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@
@click.option('-i', '--input_psm_to_blast', help='The file name of the input PSM table to blast.')
@click.option('-o', '--output_psm', help='The file name of the output PSM table.')
@click.option('-r', '--input_reference_database', help='The file name of the refence protein database to blast. The reference database includes Uniprot Proteomes with isoforms, ENSEMBL, RefSeq, etc.')
@click.option('-p', '--canonical_peptide_prefix', help='The prefix of reference database. Uniprot is prefixed with "sp",ENSEMBL is prefixed with "ENSP", RefSeq is prefixed with "NP". Default is "sp,NP,ENSP".')
@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')

@click.pass_context
def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_reference_database, canonical_peptide_prefix, number_of_processes):
def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_reference_database, number_of_processes):
config_data = None
if config_file is not None:
config_data = read_yaml_from_file(config_file)
Expand All @@ -29,8 +28,6 @@ def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_r

if input_reference_database is not None:
pipeline_arguments[BlastGetPositionService.CONFIG_INPUT_REFERENCE_DATABASE] = input_reference_database
if canonical_peptide_prefix is not None:
pipeline_arguments[BlastGetPositionService.CONFIG_CANONICAL_PEPTIDE_PREFIX] = canonical_peptide_prefix
if number_of_processes is not None:
pipeline_arguments[BlastGetPositionService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes

Expand Down
41 changes: 41 additions & 0 deletions pypgatk/commands/deeplc_predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import logging

import click

from pypgatk.toolbox.general import read_yaml_from_file
from pypgatk.commands.utils import print_help
from pypgatk.proteogenomics.deeplc_predict import DeepLCPredictService

log = logging.getLogger(__name__)

@click.command('deeplc_predict', short_help='Use deeplc prediction to filter.')
@click.option('-c', '--config_file', help='Configuration file for the fdr peptides pipeline.')
@click.option('-i', '--input_to_deeplc', help='The file name of the input PSM table to deeplc.')
@click.option('-o', '--output', help='DeepLC filtered results file.')
@click.option('-tl', '--transfer_learning', help='Use transfer learning as calibration method. 1 indicates enabled, 0 indicates disabled. Default is 0.')
@click.option('-r', '--filtration_ratio', help='Default is 0.01, filtering out 1% of the data.')
@click.option('-d', '--redundancy_removal_strategy', help='The optional parameters include the "median", "mean", "bestRT". Default is median.')
@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
@click.pass_context
def deeplc_predict(ctx, config_file, input_to_deeplc, output, transfer_learning, filtration_ratio, redundancy_removal_strategy, number_of_processes):
    """Collect the CLI options into pipeline arguments and run the DeepLC
    retention-time filter on the given PSM table."""
    config_data = read_yaml_from_file(config_file) if config_file is not None else None

    # An input PSM table and an output path are both mandatory.
    if input_to_deeplc is None or output is None:
        print_help()

    # Map each option that was actually provided onto its service config key;
    # omitted options fall back to the service defaults.
    option_to_key = (
        (transfer_learning, DeepLCPredictService.CONFIG_TRANSFER_LEARNING),
        (redundancy_removal_strategy, DeepLCPredictService.CONFIG_REDUNDANCY_REMOVAL_STRATEGY),
        (number_of_processes, DeepLCPredictService.CONFIG_NUMBER_OF_PROCESSES),
        (filtration_ratio, DeepLCPredictService.CONFIG_FILTRATION_RATIO),
    )
    pipeline_arguments = {key: value for value, key in option_to_key if value is not None}

    deeplc_predict_service = DeepLCPredictService(config_data, pipeline_arguments)
    deeplc_predict_service.predict(input_to_deeplc, output)
7 changes: 7 additions & 0 deletions pypgatk/commands/predictive tool filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import logging

import click

from pypgatk.toolbox.general import read_yaml_from_file
from pypgatk.commands.utils import print_help
from pypgatk.proteogenomics.blast_get_position import BlastGetPositionService
23 changes: 12 additions & 11 deletions pypgatk/commands/validate_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,38 @@
@click.option('-i', '--infile_name', help='Variant peptide PSMs table')
@click.option('-o', '--outfile_name', help='Output file for the results')
@click.option('-ion', '--ions_tolerance', help='MS2 fragment ions mass accuracy')
@click.option('-r', '--relative', help='relative', is_flag=True)
@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
@click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on', is_flag=True)
@click.option('-msgf', '--msgf', help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format', is_flag=True)
# @click.option('-in_psms', '--input_psm_table', help='Input variant peptide PSMs table')
# @click.option('-fa', '--input_fasta', help='Protein sequence used')
# @click.option('-out_psms', '--output_psm_table', help='Output variant peptide PSMs table')

@click.pass_context
# def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance, relative, input_psm_table, input_fasta, output_psm_table ,msgf):
def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance, relative, msgf):

def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance, number_of_processes, relative, msgf):
config_data = None
if config_file is not None:
config_data = read_yaml_from_file(config_file)

validate_flag = bool(infile_name and (mzml_path or mzml_files) and outfile_name)
# position_flag = bool(input_psm_table and input_fasta and output_psm_table)
# if not validate_flag and not position_flag:
if not validate_flag :
print_help()

pipeline_arguments = {}

if mzml_path is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_PATH] = mzml_path
if mzml_files is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_FILES] = mzml_files
if ions_tolerance is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_IONS_TOLERANCE] = ions_tolerance
if number_of_processes is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
if relative is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_RELATIVE] = relative
if msgf is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MSGF] = msgf

validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
if validate_flag:
validate_peptides_service.validate(infile_name, outfile_name, mzml_path , mzml_files)
# elif position_flag:
# validate_peptides_service.get_position(input_psm_table, input_fasta, output_psm_table)
validate_peptides_service.validate(infile_name, outfile_name)


52 changes: 29 additions & 23 deletions pypgatk/proteogenomics/blast_get_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
from Bio import pairwise2,SeqIO
import datetime
from pathos.multiprocessing import ProcessingPool as Pool
from multiprocessing import Manager
from tqdm import tqdm
import ahocorasick

from pypgatk.toolbox.general import ParameterConfiguration

class BlastGetPositionService(ParameterConfiguration):
CONFIG_KEY_BlastGetPosition = 'blast_get_position'
CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
# CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
CONFIG_INPUT_REFERENCE_DATABASE = 'input_reference_database'
CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'

Expand All @@ -19,13 +22,13 @@ def __init__(self, config_data, pipeline_arguments):
"""

super(BlastGetPositionService, self).__init__(self.CONFIG_KEY_BlastGetPosition, config_data, pipeline_arguments)
self._canonical_peptide_prefix = self.get_blast_parameters(variable=self.CONFIG_CANONICAL_PEPTIDE_PREFIX, default_value='sp,NP,ENSP')
self._input_reference_database = self.get_blast_parameters(variable=self.CONFIG_INPUT_REFERENCE_DATABASE, default_value='')
self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES, default_value='40')

self.fa_set =set()
for j in SeqIO.parse(self._input_reference_database, "fasta"):
self.fa_set.add(str(j.seq))
self.blast_dict = Manager().dict()

def get_blast_parameters(self, variable: str, default_value):
value_return = default_value
Expand All @@ -35,15 +38,24 @@ def get_blast_parameters(self, variable: str, default_value):
variable in self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition]:
value_return = self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition][variable]
return value_return

def _blast_canonical(self, df):
seq_set = set(df["sequence"].to_list())

def _is_canonical(self, accession):
prefix_list = self._canonical_peptide_prefix.split(",")
accession_list = accession.split(",")
for i in accession_list:
for j in prefix_list:
if j in i:
return "canonical"
return "waiting for blast"
auto = ahocorasick.Automaton()
seq_dict = dict()
for seq_peptide in seq_set:
auto.add_word(seq_peptide, seq_peptide)
seq_dict[seq_peptide] = "waiting for blast"

auto.make_automaton()

for protein_seq in self.fa_set:
for end_ind, found in auto.iter(protein_seq):
seq_dict[found]= "canonical"

df["position"] = df["sequence"].map(seq_dict)
return df

def _blast_set(self, fasta_set, peptide):
length = len(peptide)
Expand Down Expand Up @@ -96,37 +108,31 @@ def _blast_set(self, fasta_set, peptide):
return "non-canonical"

def _result(self, sequence):
return self._blast_set(self.fa_set ,sequence)
self.blast_dict[sequence] = self._blast_set(self.fa_set ,sequence)

def blast(self, input_psm_to_blast, output_psm):
start_time = datetime.datetime.now()
print("Start time :", start_time)

PSM = pd.read_table(input_psm_to_blast, header=0,sep="\t")
PSM.loc[:,"position"] = PSM.apply(lambda x: self._is_canonical(x["accession"]),axis = 1)
PSM = self._blast_canonical(PSM)

first_filter = PSM[PSM.position=="canonical"]
psm_to_blast = PSM[PSM.position=="waiting for blast"]
psm_to_blast = psm_to_blast.copy()

# seq = psm_to_blast["sequence"].to_list()
# pool = Pool()
# res = pool.map(self._result, seq)
# psm_to_blast["position"] = res
# pool.close()

#Remove duplicate sequences
seq_set = set(psm_to_blast["sequence"].to_list())
seq_list = list(seq_set)

pool = Pool(int(self._number_of_processes))
res = pool.map(self._result, seq_list)
seq_dict = dict()
for i in range(len(seq_list)):
seq_dict[seq_list[i]] = res[i]
psm_to_blast["position"] = PSM.apply(lambda x: seq_dict.get(x["sequence"]),axis=1)
list(tqdm(pool.imap(self._result, seq_list) , total=len(seq_list), desc="Blast", unit="peptide"))

pool.close()
pool.join()

psm_to_blast["position"] = PSM.apply(lambda x: self.blast_dict.get(x["sequence"]),axis=1)

second_filter = psm_to_blast[psm_to_blast.position=="canonical"]
non_filter = psm_to_blast[psm_to_blast.position=="non-canonical"]

Expand Down
144 changes: 144 additions & 0 deletions pypgatk/proteogenomics/deeplc_predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from deeplc import DeepLC
import datetime
# from deeplcretrainer import deeplcretrainer


from pypgatk.toolbox.general import ParameterConfiguration

class DeepLCPredictService(ParameterConfiguration):
    """Filter non-canonical PSMs by retention time.

    DeepLC is calibrated on the canonical PSMs of the input table and used to
    predict retention times for the non-canonical PSMs; the worst
    ``filtration_ratio`` fraction (by absolute RT error) is discarded.
    """

    CONFIG_KEY_DeepLCPredict = 'deeplc_predict'
    CONFIG_TRANSFER_LEARNING = 'transfer_learning'
    CONFIG_REDUNDANCY_REMOVAL_STRATEGY = 'redundancy_removal_strategy'
    CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'
    CONFIG_FILTRATION_RATIO = 'filtration_ratio'

    def __init__(self, config_data, pipeline_arguments):
        """
        Init the class with the specific parameters.
        :param config_data configuration file
        :param pipeline_arguments pipelines arguments
        """

        super(DeepLCPredictService, self).__init__(self.CONFIG_KEY_DeepLCPredict, config_data, pipeline_arguments)
        # CLI values arrive as strings; coerce to numbers here so that the
        # `== 1` check and the `(1 - ratio)` arithmetic in predict() behave
        # identically for CLI and config-file input (number_of_processes was
        # already coerced — transfer_learning/filtration_ratio were not).
        self._transfer_learning = int(self.get_deeplc_parameters(variable=self.CONFIG_TRANSFER_LEARNING, default_value=0))
        self._redundancy_removal_strategy = self.get_deeplc_parameters(variable=self.CONFIG_REDUNDANCY_REMOVAL_STRATEGY, default_value='median')
        self._number_of_processes = int(self.get_deeplc_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES, default_value=40))
        self._filtration_ratio = float(self.get_deeplc_parameters(variable=self.CONFIG_FILTRATION_RATIO, default_value=0.01))

        # UNIMOD accession (as used in the PSM table) -> modification name
        # in the format DeepLC expects.
        self.mod_rep = {"UNIMOD:4": "Carbamidomethyl", "UNIMOD:737": "TMT6plex", "UNIMOD:35": "Oxidation", "UNIMOD:1": "Acetyl"}

    def get_deeplc_parameters(self, variable: str, default_value):
        """Resolve a parameter: pipeline arguments take precedence over the
        config file section; fall back to ``default_value``."""
        value_return = default_value
        if variable in self.get_pipeline_parameters():
            value_return = self.get_pipeline_parameters()[variable]
        elif self.CONFIG_KEY_DeepLCPredict in self.get_default_parameters() and \
                variable in self.get_default_parameters()[self.CONFIG_KEY_DeepLCPredict]:
            value_return = self.get_default_parameters()[self.CONFIG_KEY_DeepLCPredict][variable]
        return value_return

    def _replace_mod(self, x, mod_dict):
        """Convert a 'pos-UNIMOD:n,pos-UNIMOD:m' modification string into
        DeepLC's 'pos|Name|pos|Name' format.

        :param x: comma-separated position-accession pairs (non-empty)
        :param mod_dict: accession -> modification name mapping
        """
        parts = []
        for mod in x.split(','):
            # partition instead of split so a stray '-' in the accession part
            # cannot shift the fields.
            nums, _, accession = mod.partition("-")
            # Fall back to the raw accession for modifications missing from
            # mod_dict instead of crashing on a None concatenation.
            parts.append(nums + "|" + mod_dict.get(accession, accession))
        return "|".join(parts)

    def predict(self, input_to_deeplc, output):
        """Run DeepLC prediction and write the filtered PSM table.

        :param input_to_deeplc: path of the validated PSM table (TSV)
        :param output: path for the filtered PSM table (TSV)
        :raises ValueError: if the redundancy-removal strategy is unknown
        """
        start_time = datetime.datetime.now()
        print("Start time :", start_time)

        validate_out = pd.read_table(input_to_deeplc, header=0, sep="\t", index_col=0)

        # DeepLC requires 'seq', 'modifications' and 'tr' columns.
        validate_out.loc[:, "seq"] = validate_out.apply(lambda x: x["sequence"], axis=1)
        validate_out["modifications"] = validate_out["modifications"].fillna("")
        validate_out.loc[:, "modifications"] = validate_out.apply(lambda x: self._replace_mod(x["modifications"], self.mod_rep) if x["modifications"] != "" else "", axis=1)
        validate_out.loc[:, "tr"] = validate_out.apply(lambda x: x["retention_time"], axis=1)

        # Keep PSMs that have a position call or flanking-ion support.
        filter_out = validate_out[(validate_out["position"] == "non-canonical") | (validate_out["position"] == "canonical") | (validate_out["flanking_ions_support"] == "YES")]
        print("all(no_drop_by_PSMid):" + str(len(filter_out)))
        filter_out = filter_out.drop_duplicates(['PSM_ID'])

        can = filter_out[filter_out["position"] == "canonical"]
        # .copy() so the column assignments below write into an independent
        # frame, not a view of filter_out (SettingWithCopyWarning / lost writes).
        non = filter_out[filter_out["position"] != "canonical"].copy()
        print("can(after_drop_by_PSMid):" + str(len(can)))
        print("non(after_drop_by_PSMid):" + str(len(non)))

        # Collapse duplicate peptidoforms (same sequence + modifications) so
        # DeepLC sees each one once.
        if self._redundancy_removal_strategy == 'median':
            can_after_drop = can.groupby(["seq", "modifications"]).median().reset_index()
            non_after_drop = non.groupby(["seq", "modifications"]).median().reset_index()
        elif self._redundancy_removal_strategy == 'mean':
            can_after_drop = can.groupby(["seq", "modifications"]).mean().reset_index()
            non_after_drop = non.groupby(["seq", "modifications"]).mean().reset_index()
        elif self._redundancy_removal_strategy in ('first', 'bestRT'):
            # 'bestRT' is the name advertised by the CLI help; 'first' is kept
            # for backward compatibility.
            can_after_drop = can.sort_values("tr").drop_duplicates(["seq", "modifications"])
            non_after_drop = non.sort_values("tr").drop_duplicates(["seq", "modifications"])
        else:
            raise ValueError(
                "You need to specify a strategy for removing redundancy.")
        print("can(after_drop_by_same_sequence+modification):" + str(len(can_after_drop)))
        print("non(after_drop_by_same_sequence+modification):" + str(len(non_after_drop)))

        if self._transfer_learning == 1:
            # Transfer learning: retrain the DeepLC model on the calibration set.
            dlc = DeepLC(
                deeplc_retrain=True,
                n_epochs=30,
                n_jobs=self._number_of_processes,
            )
        else:
            dlc = DeepLC(
                pygam_calibration=True,
                n_jobs=self._number_of_processes,
            )

        dlc.calibrate_preds(seq_df=can_after_drop)
        preds_calib = dlc.make_preds(seq_df=non_after_drop)

        non_after_drop["preds_tr"] = preds_calib
        # Key predictions by sequence+modifications so every duplicate PSM of
        # the same peptidoform receives the same predicted RT.
        non_after_drop.index = non_after_drop["seq"] + "+" + non_after_drop["modifications"]
        pred_dict = non_after_drop["preds_tr"].to_dict()

        non.index = non["seq"] + "+" + non["modifications"]
        non["preds_tr"] = [pred_dict[v] for v in non.index]

        non["error"] = non["tr"] - non["preds_tr"]
        non["abserror"] = abs(non["error"])

        plt.hist(non["abserror"], bins=1000)
        y_max = plt.gca().get_ylim()[1]
        plt.vlines(np.percentile(non["abserror"], 95), 0, y_max, color="grey", label="95th percentile")
        plt.vlines(np.percentile(non["abserror"], 99), 0, y_max, color="black", label="99th percentile")
        plt.vlines(np.percentile(non["abserror"], (1 - self._filtration_ratio) * 100), 0, y_max, color="red", label=str((1 - self._filtration_ratio) * 100) + "th percentile")
        plt.legend()
        # Save before show(): show() hands the figure to the GUI backend and
        # can leave it empty, so saving afterwards may write a blank image.
        plt.savefig('percentile.png')
        plt.show()

        non_after_filter = non[non["abserror"] < np.percentile(non["abserror"], (1 - self._filtration_ratio) * 100)]
        print("non(after_filter):" + str(len(non_after_filter)))

        # Canonical PSMs pass through unfiltered. (Renamed from `all`, which
        # shadowed the builtin.)
        combined = pd.concat([can, non_after_filter], axis=0)
        combined.to_csv(output, sep="\t", index=None)

        end_time = datetime.datetime.now()
        print("End time :", end_time)
        time_taken = end_time - start_time
        print("Time consumption :", time_taken)












6 changes: 3 additions & 3 deletions pypgatk/proteogenomics/mztab_class_fdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def _compute_class_fdr(self, df_psms,order):

return df_psms

def form_mzTab_class_fdr(self, input_mztab ,outfile_name):
def form_mzTab_class_fdr(self, input_mztab , outfile_name):
start_time = datetime.datetime.now()
print("Start time :", start_time)

Expand Down Expand Up @@ -135,8 +135,8 @@ def form_mzTab_class_fdr(self, input_mztab ,outfile_name):

PSM = self._compute_global_fdr(PSM, order)
PSM = self._compute_class_fdr(PSM, order)
PSM = PSM[((PSM['q-value'] < self._global_fdr_cutoff) & (
PSM['class-specific-q-value'] < self._class_fdr_cutoff))]
PSM = PSM[((PSM['q-value'] < float(self._global_fdr_cutoff))
& (PSM['class-specific-q-value'] < float(self._class_fdr_cutoff)))]
PSM.reset_index(drop=True, inplace=True)

PSM.to_csv(outfile_name, header=1, sep="\t", index=None)
Expand Down
Loading

0 comments on commit 8fe67f1

Please sign in to comment.