Merge pull request #80 from bigbio/spectrumAI

Spectrum ai
bigbio · Apr 24, 2024 · 3939fc2 · 3939fc2
2 parents 6faf53c + a9607af
commit 3939fc2
Show file tree

Hide file tree

Showing 10 changed files with 166 additions and 600 deletions.
diff --git a/pypgatk/commands/validate_peptides.py b/pypgatk/commands/validate_peptides.py
@@ -3,30 +3,29 @@
 import click
 
 from pypgatk.toolbox.general import read_yaml_from_file
-from pypgatk.proteogenomics.validate_peptides import ValidatePeptidesService
+from pypgatk.proteogenomics.spectrumai import SpectrumAIService
 from pypgatk.commands.utils import print_help
 
 log = logging.getLogger(__name__)
 
 
-@click.command('validate_peptides',
+@click.command('spectrumai',
                short_help='Command to inspect MS2 spectra of single-subsititution peptide identifications')
 @click.option('-c', '--config_file', help='Configuration file for the validate peptides pipeline')
 @click.option('-p', '--mzml_path', help='The mzml file path.You only need to use either mzml_path or mzml_files')
-@click.option('-f', '--mzml_files',
-              help='The mzml files.Different files are separated by ",".You only need to use either mzml_path or mzml_files')
+@click.option('-f', '--mzml_files', help='The mzml files. Different files are separated by ","')
 @click.option('-i', '--infile_name', help='Variant peptide PSMs table')
 @click.option('-o', '--outfile_name', help='Output file for the results')
 @click.option('-ion', '--ions_tolerance', help='MS2 fragment ions mass accuracy')
 @click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
 @click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on',
               is_flag=True)
-@click.option('-msgf', '--msgf',
-              help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format',
+@click.option('-m', '--mztab',
+              help='If the tsv was obtained from mzTab, please enable this option. Default to tsv obtained from parquet',
               is_flag=True)
 @click.pass_context
-def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance,
-                      number_of_processes, relative, msgf):
+def spectrumai(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance,
+               number_of_processes, relative, mztab):
     config_data = None
     if config_file is not None:
         config_data = read_yaml_from_file(config_file)
@@ -38,18 +37,18 @@ def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outf
     pipeline_arguments = {}
 
     if mzml_path is not None:
-        pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_PATH] = mzml_path
+        pipeline_arguments[SpectrumAIService.CONFIG_MZML_PATH] = mzml_path
     if mzml_files is not None:
-        pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_FILES] = mzml_files
+        pipeline_arguments[SpectrumAIService.CONFIG_MZML_FILES] = mzml_files
     if ions_tolerance is not None:
-        pipeline_arguments[ValidatePeptidesService.CONFIG_IONS_TOLERANCE] = ions_tolerance
+        pipeline_arguments[SpectrumAIService.CONFIG_IONS_TOLERANCE] = ions_tolerance
     if number_of_processes is not None:
-        pipeline_arguments[ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
+        pipeline_arguments[SpectrumAIService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
     if relative is not None:
-        pipeline_arguments[ValidatePeptidesService.CONFIG_RELATIVE] = relative
-    if msgf is not None:
-        pipeline_arguments[ValidatePeptidesService.CONFIG_MSGF] = msgf
+        pipeline_arguments[SpectrumAIService.CONFIG_RELATIVE] = relative
+    if mztab is not None:
+        pipeline_arguments[SpectrumAIService.CONFIG_MZTAB] = mztab
 
-    validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
+    validate_peptides_service = SpectrumAIService(config_data, pipeline_arguments)
     if validate_flag:
         validate_peptides_service.validate(infile_name, outfile_name)
diff --git a/pypgatk/proteogenomics/blast_get_position.py b/pypgatk/proteogenomics/blast_get_position.py
@@ -8,61 +8,69 @@
 
 from pypgatk.toolbox.general import ParameterConfiguration
 
-
-def _blast_set(fasta_set, peptide):
+def get_details(fasta, peptide):
+    res = []
+    i = 0
+    j = 0
+    for AA1, AA2 in zip(fasta, peptide):
+        i += 1
+        j += 1
+        if AA1 == AA2:
+            continue
+        else:
+            res.append(str(i) + "|" + AA1 + ">" + AA2)
+    return res
+
+def peptide_blast_protein(fasta, peptide):
     length = len(peptide)
-    position_set = set()
-    for fasta in fasta_set:
-        if len(fasta) >= length:
-            alignments_score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0, score_only=True)
-            if alignments_score == length:
-                return "canonical"
-            elif alignments_score == length - 1:
-                alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0)
-                for alignment in alignments_local:
-                    # insertion e.g., ABCDMEFGH<----ABCDEFGH
-                    if alignment.end - alignment.start == length + 1:
-                        s = fasta[alignment.start:alignment.end]
-                        for i in range(length):
-                            if peptide[i] != s[i]:
-                                position_set.add(i + 1)
-                                break
-                    # substitution e.g., ABCDMFGH<----ABCDEFGH
-                    elif alignment.end - alignment.start == length:
-                        s = fasta[alignment.start:alignment.end]
-                        for i in range(length):
-                            if peptide[i] != s[i]:
-                                position_set.add(i + 1)
-                                break
-                    # substitution e.g., ABCDEFGM<----ABCDEFGH
-                    elif alignment.end - alignment.start == length - 1:
-                        s = fasta[alignment.start:alignment.end]
-                        if peptide[0] != s[0]:
-                            position_set.add(1)
-                        elif peptide[-1] != s[-1]:
-                            position_set.add(length)
-            elif alignments_score == length - 2:
-                alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=-1,
-                                                           open=-1, extend=0)
-                for alignment in alignments_local:
-                    # deletion e.g., ABCEFGH<----ABCDEFGH
-                    if alignment.end - alignment.start == length and alignment.score == length - 2:
-                        s = fasta[alignment.start:alignment.end - 1]
-                        if pairwise2.align.localms(sequenceA=s, sequenceB=peptide, match=1, mismatch=0, open=0,
-                                                   extend=0, score_only=True) == length - 1:
-                            for i in range(length - 1):
-                                if peptide[i] != s[i]:
-                                    position_set.add(i + 1)
-                                    break
-    if position_set:
-        return position_set
+    mismatch = []
+    if len(fasta) >= length:
+        score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide,
+                                                       match=1, mismatch=0, open=-2, extend=-2, score_only=True)
+        if score == length-1:
+            alignment = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide,
+                                                       match=1, mismatch=0, open=-2, extend=-2)[0]
+            if alignment.end - alignment.start == length:
+                mismatch = get_details(alignment.seqA[alignment.start:alignment.end], alignment.seqB[alignment.start:alignment.end])
+            elif alignment.end - alignment.start == length-1:
+                res = get_details(alignment.seqA[alignment.start:alignment.end+1], alignment.seqB[alignment.start:alignment.end+1])
+                if len(res) == 1:
+                    if res[0].split(">")[1]!="-":
+                        mismatch = res
+                    else:
+                        mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
+                elif len(res) == 0:
+                    mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
+                else:
+                    print("Number of mismatch Error")
+    return mismatch
+
+def _blast_set(fasta_dict, peptide):
+    positions = dict()
+    for fasta in fasta_dict.keys():
+        mismatch = peptide_blast_protein(fasta, peptide)
+        if len(mismatch) == 1:
+            if positions.get(mismatch[0]):
+                positions[mismatch[0]].update(fasta_dict[fasta])
+            else:
+                positions[mismatch[0]] = fasta_dict[fasta]
+        elif len(mismatch) > 1:
+            print("Number of mismatch > 1")
+            print(peptide)
+            print(fasta)
+            print(mismatch)
+    if positions:
+        res = []
+        for key,value in positions.items():
+            splits = key.split("|")
+            splits.append(",".join(value))
+            res.append(splits)
+        return res
     else:
         return "non-canonical"
 
-
 class BlastGetPositionService(ParameterConfiguration):
     CONFIG_KEY_BlastGetPosition = 'blast_get_position'
-    # CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
     CONFIG_INPUT_REFERENCE_DATABASE = 'input_reference_database'
     CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'
 
@@ -79,9 +87,13 @@ def __init__(self, config_data, pipeline_arguments):
         self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES,
                                                               default_value='40')
 
-        self.fa_set = set()
+
+        self.fasta_dict = dict()
         for j in SeqIO.parse(self._input_reference_database, "fasta"):
-            self.fa_set.add(str(j.seq))
+            if self.fasta_dict.get(str(j.seq)):
+                self.fasta_dict[str(j.seq)].add(j.id)
+            else:
+                self.fasta_dict[str(j.seq)] = {j.id}
         self.blast_dict = Manager().dict()
 
     def get_blast_parameters(self, variable: str, default_value):
@@ -104,7 +116,7 @@ def _blast_canonical(self, df):
 
         auto.make_automaton()
 
-        for protein_seq in self.fa_set:
+        for protein_seq in self.fasta_dict.keys():
             for end_ind, found in auto.iter(protein_seq):
                 seq_dict[found] = "canonical"
                 print("Found", found, "at position", end_ind, "in protein sequence")
@@ -113,13 +125,31 @@ def _blast_canonical(self, df):
         return df
 
     def _result(self, sequence):
-        self.blast_dict[sequence] = _blast_set(self.fa_set, sequence)
+        self.blast_dict[sequence] = _blast_set(self.fasta_dict, sequence)
 
     def blast(self, input_psm_to_blast, output_psm):
+        """
+        Blast peptide and reference protein database to find variation sites.
+        :param input_psm_to_blast: input PSM table to blast
+        :param output_psm: output PSM table
+        :return:
+        """
+
         start_time = datetime.datetime.now()
         print("Start time :", start_time)
 
-        psm = pd.read_table(input_psm_to_blast, header=0, sep="\t")
+        if input_psm_to_blast.endswith(".csv.gz"):
+            psm = pd.read_csv(input_psm_to_blast, header=0, sep=",", compression="gzip")
+        elif input_psm_to_blast.endswith(".csv"):
+            psm = pd.read_csv(input_psm_to_blast, header=0, sep=",")
+        elif input_psm_to_blast.endswith(".tsv.gz"):
+            psm = pd.read_table(input_psm_to_blast, header=0, sep="\t", compression="gzip")
+        elif input_psm_to_blast.endswith(".tsv"):
+            psm = pd.read_table(input_psm_to_blast, header=0, sep="\t")
+        else:
+            raise ValueError("The input file format is not supported.")
+
+
         psm = self._blast_canonical(psm)
 
         first_filter = psm[psm.position == "canonical"]
@@ -145,25 +175,24 @@ def blast(self, input_psm_to_blast, output_psm):
         psm_to_findpos = psm_to_findpos[psm_to_findpos.position != "non-canonical"]
 
         if len(psm_to_findpos) > 0:
-            psm_to_findpos["var_num"] = psm_to_findpos.apply(lambda x: len(x["position"]), axis=1)
-            psm_to_findpos = psm_to_findpos.loc[psm_to_findpos.index.repeat(psm_to_findpos["var_num"])]
-            psm_to_findpos["var_num"].iloc[0] = 0
-            psm_id = psm_to_findpos["PSM_ID"].iloc[0]
-            for i in range(1, psm_to_findpos.shape[0]):
-                if psm_to_findpos["PSM_ID"].iloc[i] == psm_id:
-                    psm_to_findpos["var_num"].iloc[i] = psm_to_findpos["var_num"].iloc[i - 1] + 1
-                else:
-                    psm_id = psm_to_findpos["PSM_ID"].iloc[i]
-                    psm_to_findpos["var_num"].iloc[i] = 0
-            psm_to_findpos["position"] = psm_to_findpos.apply(
-                lambda x: str(x["position"])[1:-1].split(",")[x["var_num"]],
-                axis=1)
-            psm_to_findpos.drop(columns="var_num", axis=1, inplace=True)
-            psm_to_findpos["position"] = psm_to_findpos.apply(lambda x: x["position"].replace(' ', ''), axis=1)
+            psm_to_findpos = psm_to_findpos.explode("position", ignore_index=True)
+            psm_to_findpos["variant"] = psm_to_findpos["position"].apply(lambda x: x[1])
+            psm_to_findpos["protein"] = psm_to_findpos["position"].apply(lambda x: x[2])
+            psm_to_findpos["position"] = psm_to_findpos["position"].apply(lambda x: x[0])
 
         all_psm_out = pd.concat([first_filter, second_filter, non_filter, psm_to_findpos], axis=0, join='outer')
-        all_psm_out = all_psm_out.sort_values("PSM_ID")
-        all_psm_out.to_csv(output_psm, header=1, sep="\t", index=None)
+        all_psm_out = all_psm_out.sort_values("usi")
+
+        if output_psm.endswith(".csv.gz"):
+            all_psm_out.to_csv(output_psm, header=True, sep=",", index=None, compression="gzip")
+        elif output_psm.endswith(".csv"):
+            all_psm_out.to_csv(output_psm, header=True, sep=",", index=None)
+        elif output_psm.endswith(".tsv.gz"):
+            all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None, compression="gzip")
+        elif output_psm.endswith(".tsv"):
+            all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)
+        else:
+            all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)
 
         end_time = datetime.datetime.now()
         print("End time :", end_time)