Merge branch 'spectrumAI' of https://github.com/DongdongdongW/py-pgatk …

…into spectrumAI # Conflicts: # requirements.txt # setup.py
bigbio · Apr 19, 2024 · 8fe67f1 · 8fe67f1
2 parents e46bffe + 9a0a9b4
commit 8fe67f1
Show file tree

Hide file tree

Showing 9 changed files with 295 additions and 123 deletions.
diff --git a/pypgatk/commands/blast_get_position.py b/pypgatk/commands/blast_get_position.py
@@ -13,11 +13,10 @@
 @click.option('-i', '--input_psm_to_blast', help='The file name of the input PSM table to blast.')
 @click.option('-o', '--output_psm', help='The file name of the output PSM table.')
 @click.option('-r', '--input_reference_database', help='The file name of the refence protein database to blast. The reference database includes Uniprot Proteomes with isoforms, ENSEMBL, RefSeq, etc.')
-@click.option('-p', '--canonical_peptide_prefix', help='The prefix of reference database. Uniprot is prefixed with "sp",ENSEMBL is prefixed with "ENSP", RefSeq is prefixed with "NP". Default is "sp,NP,ENSP".')
 @click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
 
 @click.pass_context
-def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_reference_database, canonical_peptide_prefix, number_of_processes):  
+def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_reference_database, number_of_processes):  
     config_data = None
     if config_file is not None:
         config_data = read_yaml_from_file(config_file)
@@ -29,8 +28,6 @@ def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_r
 
     if input_reference_database is not None:
         pipeline_arguments[BlastGetPositionService.CONFIG_INPUT_REFERENCE_DATABASE] = input_reference_database
-    if canonical_peptide_prefix is not None:
-        pipeline_arguments[BlastGetPositionService.CONFIG_CANONICAL_PEPTIDE_PREFIX] = canonical_peptide_prefix
     if number_of_processes is not None:
         pipeline_arguments[BlastGetPositionService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
 

diff --git a/pypgatk/commands/deeplc_predict.py b/pypgatk/commands/deeplc_predict.py
@@ -0,0 +1,41 @@
+import logging
+
+import click
+
+from pypgatk.toolbox.general import read_yaml_from_file
+from pypgatk.commands.utils import print_help
+from pypgatk.proteogenomics.deeplc_predict import DeepLCPredictService
+
+log = logging.getLogger(__name__)
+
+@click.command('deeplc_predict', short_help='Use deeplc prediction to filter.')
+@click.option('-c', '--config_file', help='Configuration file for the fdr peptides pipeline.')
+@click.option('-i', '--input_to_deeplc', help='The file name of the input PSM table to deeplc.')
+@click.option('-o', '--output', help='DeepLC filtered results file.')
+@click.option('-tl', '--transfer_learning', help='Use transfer learning as calibration method. 1 indicates enabled, 0 indicates disabled. Default is 0.')
+@click.option('-r', '--filtration_ratio', help='Default is 0.01, filtering out 1% of the data.')
+@click.option('-d', '--redundancy_removal_strategy', help='The optional parameters include the "median", "mean", "bestRT". Default is median.')
+@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
+
+@click.pass_context
+def deeplc_predict(ctx, config_file, input_to_deeplc, output, transfer_learning,filtration_ratio ,redundancy_removal_strategy, number_of_processes):  
+    config_data = None
+    if config_file is not None:
+        config_data = read_yaml_from_file(config_file)
+
+    if input_to_deeplc is None or output is None:
+        print_help()
+
+    pipeline_arguments = {}
+
+    if transfer_learning is not None:
+        pipeline_arguments[DeepLCPredictService.CONFIG_TRANSFER_LEARNING] = transfer_learning
+    if redundancy_removal_strategy is not None:
+        pipeline_arguments[DeepLCPredictService.CONFIG_REDUNDANCY_REMOVAL_STRATEGY] = redundancy_removal_strategy
+    if number_of_processes is not None:
+        pipeline_arguments[DeepLCPredictService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
+    if filtration_ratio is not None:
+        pipeline_arguments[DeepLCPredictService.CONFIG_FILTRATION_RATIO] = filtration_ratio
+
+    deeplc_predict_service = DeepLCPredictService(config_data, pipeline_arguments)
+    deeplc_predict_service.predict(input_to_deeplc, output)
diff --git a/pypgatk/commands/predictive tool filter.py b/pypgatk/commands/predictive tool filter.py
@@ -0,0 +1,7 @@
+import logging
+
+import click
+
+from pypgatk.toolbox.general import read_yaml_from_file
+from pypgatk.commands.utils import print_help
+from pypgatk.proteogenomics.blast_get_position import BlastGetPositionService
diff --git a/pypgatk/commands/validate_peptides.py b/pypgatk/commands/validate_peptides.py
@@ -15,37 +15,38 @@
 @click.option('-i', '--infile_name', help='Variant peptide PSMs table')
 @click.option('-o', '--outfile_name', help='Output file for the results')
 @click.option('-ion', '--ions_tolerance', help='MS2 fragment ions mass accuracy')
-@click.option('-r', '--relative', help='relative', is_flag=True)
+@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
+@click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on', is_flag=True)
 @click.option('-msgf', '--msgf', help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format', is_flag=True)
-# @click.option('-in_psms', '--input_psm_table', help='Input variant peptide PSMs table')
-# @click.option('-fa', '--input_fasta', help='Protein sequence used')
-# @click.option('-out_psms', '--output_psm_table', help='Output variant peptide PSMs table')
+
 @click.pass_context
-# def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance, relative, input_psm_table, input_fasta, output_psm_table ,msgf):
-def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance, relative, msgf):
+
+def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance, number_of_processes, relative, msgf):
     config_data = None
     if config_file is not None:
         config_data = read_yaml_from_file(config_file)
 
     validate_flag = bool(infile_name and (mzml_path or mzml_files) and outfile_name)
-    # position_flag = bool(input_psm_table and input_fasta and output_psm_table)
-    # if not validate_flag and not position_flag:
     if not validate_flag :
         print_help()
 
     pipeline_arguments = {}
 
+    if mzml_path is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_PATH] = mzml_path
+    if mzml_files is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_FILES] = mzml_files
     if ions_tolerance is not None:
         pipeline_arguments[ValidatePeptidesService.CONFIG_IONS_TOLERANCE] = ions_tolerance
+    if number_of_processes is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
     if relative is not None:
         pipeline_arguments[ValidatePeptidesService.CONFIG_RELATIVE] = relative
     if msgf is not None:
         pipeline_arguments[ValidatePeptidesService.CONFIG_MSGF] = msgf
 
     validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
     if validate_flag:
-        validate_peptides_service.validate(infile_name, outfile_name, mzml_path , mzml_files)
-    # elif position_flag:
-    #     validate_peptides_service.get_position(input_psm_table, input_fasta, output_psm_table)
+        validate_peptides_service.validate(infile_name, outfile_name)
 
 
diff --git a/pypgatk/proteogenomics/blast_get_position.py b/pypgatk/proteogenomics/blast_get_position.py
@@ -2,12 +2,15 @@
 from Bio import pairwise2,SeqIO
 import datetime
 from pathos.multiprocessing import ProcessingPool as Pool
+from multiprocessing import Manager
+from tqdm import tqdm
+import ahocorasick
 
 from pypgatk.toolbox.general import ParameterConfiguration
 
 class BlastGetPositionService(ParameterConfiguration):
     CONFIG_KEY_BlastGetPosition = 'blast_get_position'
-    CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
+    # CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
     CONFIG_INPUT_REFERENCE_DATABASE = 'input_reference_database'
     CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'
 
@@ -19,13 +22,13 @@ def __init__(self, config_data, pipeline_arguments):
       """
 
         super(BlastGetPositionService, self).__init__(self.CONFIG_KEY_BlastGetPosition, config_data, pipeline_arguments)
-        self._canonical_peptide_prefix = self.get_blast_parameters(variable=self.CONFIG_CANONICAL_PEPTIDE_PREFIX, default_value='sp,NP,ENSP')
         self._input_reference_database = self.get_blast_parameters(variable=self.CONFIG_INPUT_REFERENCE_DATABASE, default_value='')
         self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES, default_value='40')
 
         self.fa_set =set()
         for j in SeqIO.parse(self._input_reference_database, "fasta"):
             self.fa_set.add(str(j.seq))
+        self.blast_dict = Manager().dict()
 
     def get_blast_parameters(self, variable: str, default_value):
         value_return = default_value
@@ -35,15 +38,24 @@ def get_blast_parameters(self, variable: str, default_value):
                 variable in self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition]:
             value_return = self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition][variable]
         return value_return
+
+    def _blast_canonical(self, df):
+        seq_set = set(df["sequence"].to_list())
 
-    def _is_canonical(self, accession):
-        prefix_list = self._canonical_peptide_prefix.split(",")
-        accession_list = accession.split(",")
-        for i in accession_list:
-            for j in prefix_list:
-                if j in i:
-                    return "canonical"
-        return "waiting for blast"
+        auto = ahocorasick.Automaton()
+        seq_dict = dict()
+        for seq_peptide in seq_set:
+            auto.add_word(seq_peptide, seq_peptide)
+            seq_dict[seq_peptide] = "waiting for blast"
+
+        auto.make_automaton()
+
+        for protein_seq in self.fa_set:
+            for end_ind, found in auto.iter(protein_seq):
+                seq_dict[found]= "canonical"
+
+        df["position"] = df["sequence"].map(seq_dict)
+        return df 
 
     def _blast_set(self, fasta_set, peptide):
         length = len(peptide)
@@ -96,37 +108,31 @@ def _blast_set(self, fasta_set, peptide):
             return "non-canonical"
 
     def _result(self, sequence):
-        return self._blast_set(self.fa_set ,sequence)
+        self.blast_dict[sequence] = self._blast_set(self.fa_set ,sequence)
 
     def blast(self, input_psm_to_blast, output_psm):
         start_time = datetime.datetime.now()
         print("Start time :", start_time)
 
         PSM = pd.read_table(input_psm_to_blast, header=0,sep="\t")
-        PSM.loc[:,"position"] = PSM.apply(lambda x: self._is_canonical(x["accession"]),axis = 1)
+        PSM = self._blast_canonical(PSM)
 
         first_filter = PSM[PSM.position=="canonical"]
         psm_to_blast = PSM[PSM.position=="waiting for blast"]
         psm_to_blast = psm_to_blast.copy()
 
-        # seq = psm_to_blast["sequence"].to_list()
-        # pool = Pool()
-        # res = pool.map(self._result, seq)
-        # psm_to_blast["position"] = res
-        # pool.close()
-
         #Remove duplicate sequences
         seq_set = set(psm_to_blast["sequence"].to_list())
         seq_list = list(seq_set)
 
         pool = Pool(int(self._number_of_processes))
-        res = pool.map(self._result, seq_list)
-        seq_dict = dict()
-        for i in range(len(seq_list)):
-            seq_dict[seq_list[i]] = res[i]
-        psm_to_blast["position"] = PSM.apply(lambda x: seq_dict.get(x["sequence"]),axis=1)
+        list(tqdm(pool.imap(self._result, seq_list) , total=len(seq_list), desc="Blast", unit="peptide"))
+
         pool.close()
+        pool.join()
 
+        psm_to_blast["position"] = PSM.apply(lambda x: self.blast_dict.get(x["sequence"]),axis=1)
+
         second_filter = psm_to_blast[psm_to_blast.position=="canonical"]
         non_filter = psm_to_blast[psm_to_blast.position=="non-canonical"]
 

diff --git a/pypgatk/proteogenomics/deeplc_predict.py b/pypgatk/proteogenomics/deeplc_predict.py
@@ -0,0 +1,144 @@
+import pandas as pd
+import numpy as np
+from matplotlib import pyplot as plt
+from deeplc import DeepLC
+import datetime
+# from deeplcretrainer import deeplcretrainer
+
+
+from pypgatk.toolbox.general import ParameterConfiguration
+
+class DeepLCPredictService(ParameterConfiguration):
+    CONFIG_KEY_DeepLCPredict = 'deeplc_predict'
+    CONFIG_TRANSFER_LEARNING = 'transfer_learning'
+    CONFIG_REDUNDANCY_REMOVAL_STRATEGY = 'redundancy_removal_strategy'
+    CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'
+    CONFIG_FILTRATION_RATIO = 'filtration_ratio'
+
+    def __init__(self, config_data, pipeline_arguments):
+        """
+      Init the class with the specific parameters.
+      :param config_data configuration file
+      :param pipeline_arguments pipelines arguments
+      """
+
+        super(DeepLCPredictService, self).__init__(self.CONFIG_KEY_DeepLCPredict, config_data, pipeline_arguments)
+        self._transfer_learning = self.get_deeplc_parameters(variable=self.CONFIG_TRANSFER_LEARNING,default_value=0)
+        self._redundancy_removal_strategy = self.get_deeplc_parameters(variable=self.CONFIG_REDUNDANCY_REMOVAL_STRATEGY, default_value='median')
+        self._number_of_processes = int(self.get_deeplc_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES, default_value=40))
+        self._filtration_ratio = self.get_deeplc_parameters(variable=self.CONFIG_FILTRATION_RATIO, default_value=0.01)
+
+        self.mod_rep = {"UNIMOD:4": "Carbamidomethyl", "UNIMOD:737": "TMT6plex", "UNIMOD:35": "Oxidation", "UNIMOD:1": "Acetyl"}
+
+    def get_deeplc_parameters(self, variable: str, default_value):
+        value_return = default_value
+        if variable in self.get_pipeline_parameters():
+            value_return = self.get_pipeline_parameters()[variable]
+        elif self.CONFIG_KEY_DeepLCPredict in self.get_default_parameters() and \
+                variable in self.get_default_parameters()[self.CONFIG_KEY_DeepLCPredict]:
+            value_return = self.get_default_parameters()[self.CONFIG_KEY_DeepLCPredict][variable]
+        return value_return
+
+    def _replace_mod(self, x, mod_dict):
+        s = ""
+        for mod in x.split(','):
+            nums = mod.split("-")[0]
+            m = mod.split("-")[1]
+            if s:
+                    s += "|"
+            s += nums + "|" + mod_dict.get(m)
+        return s
+
+    def predict(self, input_to_deeplc, output):
+        start_time = datetime.datetime.now()
+        print("Start time :", start_time)
+
+        validate_out = pd.read_table(input_to_deeplc, header=0, sep="\t", index_col=0)
+
+        validate_out.loc[:, "seq"] = validate_out.apply(lambda x: x["sequence"], axis=1)
+        validate_out["modifications"] = validate_out["modifications"].fillna("")
+        validate_out.loc[:, "modifications"] = validate_out.apply(lambda x: self._replace_mod(x["modifications"], self.mod_rep) if x["modifications"] != "" else "", axis=1)
+        validate_out.loc[:, "tr"] = validate_out.apply(lambda x: x["retention_time"], axis=1)
+
+        filter_out = validate_out[(validate_out["position"] == "non-canonical") | (validate_out["position"] == "canonical") | (validate_out["flanking_ions_support"] == "YES")]
+        print("all(no_drop_by_PSMid):" + str(len(filter_out)))
+        filter_out = filter_out.drop_duplicates(['PSM_ID'])
+
+        can = filter_out[filter_out["position"]=="canonical"]
+        non = filter_out[filter_out["position"]!="canonical"]
+        print("can(after_drop_by_PSMid):" + str(len(can)))
+        print("non(after_drop_by_PSMid):" + str(len(non)))
+
+        can_after_drop = can
+        non_after_drop = non
+        if self._redundancy_removal_strategy == 'median':
+            can_after_drop = can.groupby(["seq","modifications"]).median().reset_index()
+            non_after_drop = non.groupby(["seq","modifications"]).median().reset_index()
+        elif self._redundancy_removal_strategy == 'mean':
+            can_after_drop = can.groupby(["seq","modifications"]).mean().reset_index()
+            non_after_drop = non.groupby(["seq","modifications"]).mean().reset_index()
+        elif self._redundancy_removal_strategy == 'first':
+            can_after_drop = can.sort_values("tr").drop_duplicates(["seq","modifications"])
+            non_after_drop = non.sort_values("tr").drop_duplicates(["seq","modifications"])
+        else:
+            raise ValueError(
+                "You need to specify a strategy for removing redundancy.")    
+        print("can(after_drop_by_same_sequence+modification):" + str(len(can_after_drop)))
+        print("non(after_drop_by_same_sequence+modification):" + str(len(non_after_drop)))
+
+        if self._transfer_learning == 1:
+            dlc = DeepLC(
+                deeplc_retrain=True,
+                n_epochs=30,
+                n_jobs=self._number_of_processes,
+            )
+        else:
+            dlc = DeepLC(
+                pygam_calibration=True,
+                n_jobs=self._number_of_processes,
+            )
+
+        dlc.calibrate_preds(seq_df=can_after_drop)
+        preds_calib = dlc.make_preds(seq_df=non_after_drop)
+
+        non_after_drop["preds_tr"] = preds_calib
+        non_after_drop.index = non_after_drop["seq"]+"+"+non_after_drop["modifications"]
+        pred_dict = non_after_drop["preds_tr"].to_dict()
+
+        non.index = non["seq"]+"+"+non["modifications"]
+        non["preds_tr"] = [pred_dict[v] for v in non.index]
+
+        non["error"] = non["tr"]-non["preds_tr"]
+        non["abserror"] = abs(non["error"])
+
+        plt.hist(non["abserror"], bins=1000)
+        y_max = plt.gca().get_ylim()[1]
+        plt.vlines(np.percentile(non["abserror"], 95), 0, y_max, color="grey", label="95th percentile")
+        plt.vlines(np.percentile(non["abserror"], 99), 0, y_max, color="black", label="99th percentile")
+        plt.vlines(np.percentile(non["abserror"], (1-self._filtration_ratio)*100), 0, y_max, color="red", label= str((1-self._filtration_ratio)*100) + "th percentile")
+        plt.legend()
+        plt.show()
+        plt.savefig('percentile.png')
+
+        non_after_filter = non[non["abserror"] < np.percentile(non["abserror"], (1-self._filtration_ratio)*100)]
+        print("non(after_filter):" + str(len(non_after_filter)))
+
+        all = pd.concat([can, non_after_filter], axis=0)
+        all.to_csv(output,sep="\t",index = None)
+
+        end_time = datetime.datetime.now()
+        print("End time :", end_time)
+        time_taken = end_time - start_time
+        print("Time consumption :", time_taken)
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/pypgatk/proteogenomics/mztab_class_fdr.py b/pypgatk/proteogenomics/mztab_class_fdr.py
@@ -100,7 +100,7 @@ def _compute_class_fdr(self, df_psms,order):
 
         return df_psms
 
-    def form_mzTab_class_fdr(self, input_mztab ,outfile_name):
+    def form_mzTab_class_fdr(self, input_mztab , outfile_name):
         start_time = datetime.datetime.now()
         print("Start time :", start_time)
 
@@ -135,8 +135,8 @@ def form_mzTab_class_fdr(self, input_mztab ,outfile_name):
 
         PSM = self._compute_global_fdr(PSM, order)
         PSM = self._compute_class_fdr(PSM, order)
-        PSM = PSM[((PSM['q-value'] < self._global_fdr_cutoff) & (
-                PSM['class-specific-q-value'] < self._class_fdr_cutoff))]
+        PSM = PSM[((PSM['q-value'] < float(self._global_fdr_cutoff)) 
+                            &  (PSM['class-specific-q-value'] < float(self._class_fdr_cutoff)))]
         PSM.reset_index(drop=True, inplace=True)
 
         PSM.to_csv(outfile_name, header=1, sep="\t", index=None)