Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update validate #76

Merged
merged 5 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions pypgatk/commands/blast_get_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@
@click.option('-i', '--input_psm_to_blast', help='The file name of the input PSM table to blast.')
@click.option('-o', '--output_psm', help='The file name of the output PSM table.')
@click.option('-r', '--input_reference_database', help='The file name of the refence protein database to blast. The reference database includes Uniprot Proteomes with isoforms, ENSEMBL, RefSeq, etc.')
@click.option('-p', '--canonical_peptide_prefix', help='The prefix of reference database. Uniprot is prefixed with "sp",ENSEMBL is prefixed with "ENSP", RefSeq is prefixed with "NP". Default is "sp,NP,ENSP".')
@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')

@click.pass_context
def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_reference_database, canonical_peptide_prefix, number_of_processes):
def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_reference_database, number_of_processes):
config_data = None
if config_file is not None:
config_data = read_yaml_from_file(config_file)
Expand All @@ -29,8 +28,6 @@ def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_r

if input_reference_database is not None:
pipeline_arguments[BlastGetPositionService.CONFIG_INPUT_REFERENCE_DATABASE] = input_reference_database
if canonical_peptide_prefix is not None:
pipeline_arguments[BlastGetPositionService.CONFIG_CANONICAL_PEPTIDE_PREFIX] = canonical_peptide_prefix
if number_of_processes is not None:
pipeline_arguments[BlastGetPositionService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes

Expand Down
41 changes: 41 additions & 0 deletions pypgatk/commands/deeplc_predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import logging

import click

from pypgatk.toolbox.general import read_yaml_from_file
from pypgatk.commands.utils import print_help
from pypgatk.proteogenomics.deeplc_predict import DeepLCPredictService

log = logging.getLogger(__name__)

@click.command('deeplc_predict', short_help='Use deeplc prediction to filter.')
@click.option('-c', '--config_file', help='Configuration file for the fdr peptides pipeline.')
@click.option('-i', '--input_to_deeplc', help='The file name of the input PSM table to deeplc.')
@click.option('-o', '--output', help='DeepLC filtered results file.')
# type=int so the downstream service's `transfer_learning == 1` comparison works;
# without it click passes the value as a string and the switch can never be enabled.
@click.option('-tl', '--transfer_learning', type=int, help='Use transfer learning as calibration method. 1 indicates enabled, 0 indicates disabled. Default is 0.')
# type=float so the service can compute the (1 - ratio) percentile cut-off directly.
@click.option('-r', '--filtration_ratio', type=float, help='Default is 0.01, filtering out 1% of the data.')
@click.option('-d', '--redundancy_removal_strategy', help='The optional parameters include the "median", "mean", "bestRT". Default is median.')
@click.option('-n', '--number_of_processes', type=int, help='Used to specify the number of processes. Default is 40.')
@click.pass_context
def deeplc_predict(ctx, config_file, input_to_deeplc, output, transfer_learning, filtration_ratio, redundancy_removal_strategy, number_of_processes):
    """Filter PSMs by comparing observed retention times with DeepLC predictions.

    Reads the PSM table *input_to_deeplc*, builds the pipeline arguments from
    the CLI options that were actually supplied (so config-file values are not
    clobbered by unset options) and writes the filtered table to *output*.
    """
    config_data = None
    if config_file is not None:
        config_data = read_yaml_from_file(config_file)

    # Input table and output path are mandatory.
    if input_to_deeplc is None or output is None:
        print_help()

    pipeline_arguments = {}

    if transfer_learning is not None:
        pipeline_arguments[DeepLCPredictService.CONFIG_TRANSFER_LEARNING] = transfer_learning
    if redundancy_removal_strategy is not None:
        pipeline_arguments[DeepLCPredictService.CONFIG_REDUNDANCY_REMOVAL_STRATEGY] = redundancy_removal_strategy
    if number_of_processes is not None:
        pipeline_arguments[DeepLCPredictService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
    if filtration_ratio is not None:
        pipeline_arguments[DeepLCPredictService.CONFIG_FILTRATION_RATIO] = filtration_ratio

    deeplc_predict_service = DeepLCPredictService(config_data, pipeline_arguments)
    deeplc_predict_service.predict(input_to_deeplc, output)
7 changes: 7 additions & 0 deletions pypgatk/commands/predictive tool filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import logging

import click

from pypgatk.toolbox.general import read_yaml_from_file
from pypgatk.commands.utils import print_help
from pypgatk.proteogenomics.blast_get_position import BlastGetPositionService
23 changes: 12 additions & 11 deletions pypgatk/commands/validate_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,38 @@
@click.option('-i', '--infile_name', help='Variant peptide PSMs table')
@click.option('-o', '--outfile_name', help='Output file for the results')
@click.option('-ion', '--ions_tolerance', help='MS2 fragment ions mass accuracy')
@click.option('-r', '--relative', help='relative', is_flag=True)
@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
@click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on', is_flag=True)
@click.option('-msgf', '--msgf', help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format', is_flag=True)
# @click.option('-in_psms', '--input_psm_table', help='Input variant peptide PSMs table')
# @click.option('-fa', '--input_fasta', help='Protein sequence used')
# @click.option('-out_psms', '--output_psm_table', help='Output variant peptide PSMs table')

@click.pass_context
# def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance, relative, input_psm_table, input_fasta, output_psm_table ,msgf):
def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance, relative, msgf):

def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance, number_of_processes, relative, msgf):
config_data = None
if config_file is not None:
config_data = read_yaml_from_file(config_file)

validate_flag = bool(infile_name and (mzml_path or mzml_files) and outfile_name)
# position_flag = bool(input_psm_table and input_fasta and output_psm_table)
# if not validate_flag and not position_flag:
if not validate_flag :
print_help()

pipeline_arguments = {}

if mzml_path is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_PATH] = mzml_path
if mzml_files is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_FILES] = mzml_files
if ions_tolerance is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_IONS_TOLERANCE] = ions_tolerance
if number_of_processes is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
if relative is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_RELATIVE] = relative
if msgf is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MSGF] = msgf

validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
if validate_flag:
validate_peptides_service.validate(infile_name, outfile_name, mzml_path , mzml_files)
# elif position_flag:
# validate_peptides_service.get_position(input_psm_table, input_fasta, output_psm_table)
validate_peptides_service.validate(infile_name, outfile_name)


52 changes: 29 additions & 23 deletions pypgatk/proteogenomics/blast_get_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
from Bio import pairwise2,SeqIO
import datetime
from pathos.multiprocessing import ProcessingPool as Pool
from multiprocessing import Manager
from tqdm import tqdm
import ahocorasick

from pypgatk.toolbox.general import ParameterConfiguration

class BlastGetPositionService(ParameterConfiguration):
CONFIG_KEY_BlastGetPosition = 'blast_get_position'
CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
# CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
CONFIG_INPUT_REFERENCE_DATABASE = 'input_reference_database'
CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'

Expand All @@ -19,13 +22,13 @@ def __init__(self, config_data, pipeline_arguments):
"""

super(BlastGetPositionService, self).__init__(self.CONFIG_KEY_BlastGetPosition, config_data, pipeline_arguments)
self._canonical_peptide_prefix = self.get_blast_parameters(variable=self.CONFIG_CANONICAL_PEPTIDE_PREFIX, default_value='sp,NP,ENSP')
self._input_reference_database = self.get_blast_parameters(variable=self.CONFIG_INPUT_REFERENCE_DATABASE, default_value='')
self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES, default_value='40')

self.fa_set =set()
for j in SeqIO.parse(self._input_reference_database, "fasta"):
self.fa_set.add(str(j.seq))
self.blast_dict = Manager().dict()

def get_blast_parameters(self, variable: str, default_value):
value_return = default_value
Expand All @@ -35,15 +38,24 @@ def get_blast_parameters(self, variable: str, default_value):
variable in self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition]:
value_return = self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition][variable]
return value_return

def _blast_canonical(self, df):
seq_set = set(df["sequence"].to_list())

def _is_canonical(self, accession):
prefix_list = self._canonical_peptide_prefix.split(",")
accession_list = accession.split(",")
for i in accession_list:
for j in prefix_list:
if j in i:
return "canonical"
return "waiting for blast"
auto = ahocorasick.Automaton()
seq_dict = dict()
for seq_peptide in seq_set:
auto.add_word(seq_peptide, seq_peptide)
seq_dict[seq_peptide] = "waiting for blast"

auto.make_automaton()

for protein_seq in self.fa_set:
for end_ind, found in auto.iter(protein_seq):
seq_dict[found]= "canonical"

df["position"] = df["sequence"].map(seq_dict)
return df

def _blast_set(self, fasta_set, peptide):
length = len(peptide)
Expand Down Expand Up @@ -96,37 +108,31 @@ def _blast_set(self, fasta_set, peptide):
return "non-canonical"

def _result(self, sequence):
return self._blast_set(self.fa_set ,sequence)
self.blast_dict[sequence] = self._blast_set(self.fa_set ,sequence)

def blast(self, input_psm_to_blast, output_psm):
start_time = datetime.datetime.now()
print("Start time :", start_time)

PSM = pd.read_table(input_psm_to_blast, header=0,sep="\t")
PSM.loc[:,"position"] = PSM.apply(lambda x: self._is_canonical(x["accession"]),axis = 1)
PSM = self._blast_canonical(PSM)

first_filter = PSM[PSM.position=="canonical"]
psm_to_blast = PSM[PSM.position=="waiting for blast"]
psm_to_blast = psm_to_blast.copy()

# seq = psm_to_blast["sequence"].to_list()
# pool = Pool()
# res = pool.map(self._result, seq)
# psm_to_blast["position"] = res
# pool.close()

#Remove duplicate sequences
seq_set = set(psm_to_blast["sequence"].to_list())
seq_list = list(seq_set)

pool = Pool(int(self._number_of_processes))
res = pool.map(self._result, seq_list)
seq_dict = dict()
for i in range(len(seq_list)):
seq_dict[seq_list[i]] = res[i]
psm_to_blast["position"] = PSM.apply(lambda x: seq_dict.get(x["sequence"]),axis=1)
list(tqdm(pool.imap(self._result, seq_list) , total=len(seq_list), desc="Blast", unit="peptide"))

pool.close()
pool.join()

psm_to_blast["position"] = PSM.apply(lambda x: self.blast_dict.get(x["sequence"]),axis=1)

second_filter = psm_to_blast[psm_to_blast.position=="canonical"]
non_filter = psm_to_blast[psm_to_blast.position=="non-canonical"]

Expand Down
144 changes: 144 additions & 0 deletions pypgatk/proteogenomics/deeplc_predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from deeplc import DeepLC
import datetime
# from deeplcretrainer import deeplcretrainer


from pypgatk.toolbox.general import ParameterConfiguration

class DeepLCPredictService(ParameterConfiguration):
    """Filter non-canonical PSMs by their deviation from DeepLC retention-time predictions.

    DeepLC is calibrated on the canonical PSMs of the input table; non-canonical
    PSMs whose absolute retention-time error falls in the worst
    ``filtration_ratio`` fraction are discarded.
    """

    CONFIG_KEY_DeepLCPredict = 'deeplc_predict'
    CONFIG_TRANSFER_LEARNING = 'transfer_learning'
    CONFIG_REDUNDANCY_REMOVAL_STRATEGY = 'redundancy_removal_strategy'
    CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'
    CONFIG_FILTRATION_RATIO = 'filtration_ratio'

    def __init__(self, config_data, pipeline_arguments):
        """
        Init the class with the specific parameters.
        :param config_data configuration file
        :param pipeline_arguments pipelines arguments
        """

        super(DeepLCPredictService, self).__init__(self.CONFIG_KEY_DeepLCPredict, config_data, pipeline_arguments)
        # Parameters may arrive as strings (CLI/YAML); coerce the numeric ones
        # once here so later comparisons and arithmetic behave correctly.
        self._transfer_learning = int(self.get_deeplc_parameters(variable=self.CONFIG_TRANSFER_LEARNING, default_value=0))
        self._redundancy_removal_strategy = self.get_deeplc_parameters(variable=self.CONFIG_REDUNDANCY_REMOVAL_STRATEGY, default_value='median')
        self._number_of_processes = int(self.get_deeplc_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES, default_value=40))
        self._filtration_ratio = float(self.get_deeplc_parameters(variable=self.CONFIG_FILTRATION_RATIO, default_value=0.01))

        # Mapping of modification accessions to the names DeepLC expects.
        # NOTE(review): presumably UNIMOD accessions from the upstream mzTab —
        # confirm against the validation step's output.
        self.mod_rep = {"UNIMOD:4": "Carbamidomethyl", "UNIMOD:737": "TMT6plex", "UNIMOD:35": "Oxidation", "UNIMOD:1": "Acetyl"}

    def get_deeplc_parameters(self, variable: str, default_value):
        """Return *variable* from the pipeline arguments, else from the config file section, else *default_value*."""
        value_return = default_value
        if variable in self.get_pipeline_parameters():
            value_return = self.get_pipeline_parameters()[variable]
        elif self.CONFIG_KEY_DeepLCPredict in self.get_default_parameters() and \
                variable in self.get_default_parameters()[self.CONFIG_KEY_DeepLCPredict]:
            value_return = self.get_default_parameters()[self.CONFIG_KEY_DeepLCPredict][variable]
        return value_return

    def _replace_mod(self, x, mod_dict):
        """Convert 'pos-ACCESSION,pos-ACCESSION' modification strings to DeepLC's 'pos|Name|pos|Name' form.

        Unknown accessions are passed through unchanged instead of raising
        (previously ``mod_dict.get(m)`` returned None and crashed the string
        concatenation).
        """
        parts = []
        for mod in x.split(','):
            position, _, accession = mod.partition("-")
            parts.append(position + "|" + mod_dict.get(accession, accession))
        return "|".join(parts)

    def predict(self, input_to_deeplc, output):
        """Calibrate DeepLC on canonical PSMs, predict RTs for the rest and write the filtered table.

        :param input_to_deeplc: TSV PSM table produced by the validation step
        :param output: path of the filtered TSV to write
        """
        start_time = datetime.datetime.now()
        print("Start time :", start_time)

        validate_out = pd.read_table(input_to_deeplc, header=0, sep="\t", index_col=0)

        # DeepLC expects the columns 'seq', 'modifications' and 'tr'.
        validate_out.loc[:, "seq"] = validate_out["sequence"]
        validate_out["modifications"] = validate_out["modifications"].fillna("")
        validate_out.loc[:, "modifications"] = validate_out.apply(lambda x: self._replace_mod(x["modifications"], self.mod_rep) if x["modifications"] != "" else "", axis=1)
        validate_out.loc[:, "tr"] = validate_out["retention_time"]

        filter_out = validate_out[(validate_out["position"] == "non-canonical") | (validate_out["position"] == "canonical") | (validate_out["flanking_ions_support"] == "YES")]
        print("all(no_drop_by_PSMid):" + str(len(filter_out)))
        filter_out = filter_out.drop_duplicates(['PSM_ID'])

        # .copy() so the later column writes do not hit chained-assignment warnings.
        can = filter_out[filter_out["position"] == "canonical"].copy()
        non = filter_out[filter_out["position"] != "canonical"].copy()
        print("can(after_drop_by_PSMid):" + str(len(can)))
        print("non(after_drop_by_PSMid):" + str(len(non)))

        # Collapse replicate PSMs of the same (sequence, modifications) pair.
        if self._redundancy_removal_strategy == 'median':
            can_after_drop = can.groupby(["seq", "modifications"]).median().reset_index()
            non_after_drop = non.groupby(["seq", "modifications"]).median().reset_index()
        elif self._redundancy_removal_strategy == 'mean':
            can_after_drop = can.groupby(["seq", "modifications"]).mean().reset_index()
            non_after_drop = non.groupby(["seq", "modifications"]).mean().reset_index()
        elif self._redundancy_removal_strategy in ('first', 'bestRT'):
            # 'bestRT' is the name advertised in the CLI help; it keeps the PSM
            # with the smallest retention time per (seq, modifications), which is
            # what the old 'first' branch did.
            can_after_drop = can.sort_values("tr").drop_duplicates(["seq", "modifications"])
            non_after_drop = non.sort_values("tr").drop_duplicates(["seq", "modifications"])
        else:
            raise ValueError(
                "You need to specify a strategy for removing redundancy.")
        print("can(after_drop_by_same_sequence+modification):" + str(len(can_after_drop)))
        print("non(after_drop_by_same_sequence+modification):" + str(len(non_after_drop)))

        if self._transfer_learning == 1:
            dlc = DeepLC(
                deeplc_retrain=True,
                n_epochs=30,
                n_jobs=self._number_of_processes,
            )
        else:
            dlc = DeepLC(
                pygam_calibration=True,
                n_jobs=self._number_of_processes,
            )

        dlc.calibrate_preds(seq_df=can_after_drop)
        preds_calib = dlc.make_preds(seq_df=non_after_drop)

        non_after_drop["preds_tr"] = preds_calib
        non_after_drop.index = non_after_drop["seq"] + "+" + non_after_drop["modifications"]
        pred_dict = non_after_drop["preds_tr"].to_dict()

        # Propagate the per-(seq, modifications) prediction back to every PSM row.
        non.index = non["seq"] + "+" + non["modifications"]
        non["preds_tr"] = [pred_dict[v] for v in non.index]

        non["error"] = non["tr"] - non["preds_tr"]
        non["abserror"] = abs(non["error"])

        percentile_cut = (1 - self._filtration_ratio) * 100
        plt.hist(non["abserror"], bins=1000)
        y_max = plt.gca().get_ylim()[1]
        plt.vlines(np.percentile(non["abserror"], 95), 0, y_max, color="grey", label="95th percentile")
        plt.vlines(np.percentile(non["abserror"], 99), 0, y_max, color="black", label="99th percentile")
        plt.vlines(np.percentile(non["abserror"], percentile_cut), 0, y_max, color="red", label=str(percentile_cut) + "th percentile")
        plt.legend()
        # Save before show(): show() releases the current figure, so saving
        # afterwards (as the old code did) writes an empty image.
        plt.savefig('percentile.png')
        plt.show()

        non_after_filter = non[non["abserror"] < np.percentile(non["abserror"], percentile_cut)]
        print("non(after_filter):" + str(len(non_after_filter)))

        # Renamed from 'all' to avoid shadowing the builtin.
        combined = pd.concat([can, non_after_filter], axis=0)
        combined.to_csv(output, sep="\t", index=None)

        end_time = datetime.datetime.now()
        print("End time :", end_time)
        time_taken = end_time - start_time
        print("Time consumption :", time_taken)












6 changes: 3 additions & 3 deletions pypgatk/proteogenomics/mztab_class_fdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def _compute_class_fdr(self, df_psms,order):

return df_psms

def form_mzTab_class_fdr(self, input_mztab ,outfile_name):
def form_mzTab_class_fdr(self, input_mztab , outfile_name):
start_time = datetime.datetime.now()
print("Start time :", start_time)

Expand Down Expand Up @@ -135,8 +135,8 @@ def form_mzTab_class_fdr(self, input_mztab ,outfile_name):

PSM = self._compute_global_fdr(PSM, order)
PSM = self._compute_class_fdr(PSM, order)
PSM = PSM[((PSM['q-value'] < self._global_fdr_cutoff) & (
PSM['class-specific-q-value'] < self._class_fdr_cutoff))]
PSM = PSM[((PSM['q-value'] < float(self._global_fdr_cutoff))
& (PSM['class-specific-q-value'] < float(self._class_fdr_cutoff)))]
PSM.reset_index(drop=True, inplace=True)

PSM.to_csv(outfile_name, header=1, sep="\t", index=None)
Expand Down
Loading
Loading