Skip to content

Commit

Permalink
Merge pull request #80 from bigbio/spectrumAI
Browse files Browse the repository at this point in the history
Spectrum ai
  • Loading branch information
ypriverol authored Apr 24, 2024
2 parents 6faf53c + a9607af commit 3939fc2
Show file tree
Hide file tree
Showing 10 changed files with 166 additions and 600 deletions.
31 changes: 15 additions & 16 deletions pypgatk/commands/validate_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,29 @@
import click

from pypgatk.toolbox.general import read_yaml_from_file
from pypgatk.proteogenomics.validate_peptides import ValidatePeptidesService
from pypgatk.proteogenomics.spectrumai import SpectrumAIService
from pypgatk.commands.utils import print_help

log = logging.getLogger(__name__)


@click.command('validate_peptides',
@click.command('spectrumai',
short_help='Command to inspect MS2 spectra of single-subsititution peptide identifications')
@click.option('-c', '--config_file', help='Configuration file for the validate peptides pipeline')
@click.option('-p', '--mzml_path', help='The mzml file path.You only need to use either mzml_path or mzml_files')
@click.option('-f', '--mzml_files',
help='The mzml files.Different files are separated by ",".You only need to use either mzml_path or mzml_files')
@click.option('-f', '--mzml_files', help='The mzml files. Different files are separated by ","')
@click.option('-i', '--infile_name', help='Variant peptide PSMs table')
@click.option('-o', '--outfile_name', help='Output file for the results')
@click.option('-ion', '--ions_tolerance', help='MS2 fragment ions mass accuracy')
@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
@click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on',
is_flag=True)
@click.option('-msgf', '--msgf',
help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format',
@click.option('-m', '--mztab',
help='If the tsv was obtained from mzTab, please enable this option. Default to tsv obtained from parquet',
is_flag=True)
@click.pass_context
def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance,
number_of_processes, relative, msgf):
def spectrumai(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance,
number_of_processes, relative, mztab):
config_data = None
if config_file is not None:
config_data = read_yaml_from_file(config_file)
Expand All @@ -38,18 +37,18 @@ def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outf
pipeline_arguments = {}

if mzml_path is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_PATH] = mzml_path
pipeline_arguments[SpectrumAIService.CONFIG_MZML_PATH] = mzml_path
if mzml_files is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_FILES] = mzml_files
pipeline_arguments[SpectrumAIService.CONFIG_MZML_FILES] = mzml_files
if ions_tolerance is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_IONS_TOLERANCE] = ions_tolerance
pipeline_arguments[SpectrumAIService.CONFIG_IONS_TOLERANCE] = ions_tolerance
if number_of_processes is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
pipeline_arguments[SpectrumAIService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
if relative is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_RELATIVE] = relative
if msgf is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MSGF] = msgf
pipeline_arguments[SpectrumAIService.CONFIG_RELATIVE] = relative
if mztab is not None:
pipeline_arguments[SpectrumAIService.CONFIG_MZTAB] = mztab

validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
validate_peptides_service = SpectrumAIService(config_data, pipeline_arguments)
if validate_flag:
validate_peptides_service.validate(infile_name, outfile_name)
171 changes: 100 additions & 71 deletions pypgatk/proteogenomics/blast_get_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,61 +8,69 @@

from pypgatk.toolbox.general import ParameterConfiguration


def _blast_set(fasta_set, peptide):
def get_details(fasta, peptide):
res = []
i = 0
j = 0
for AA1, AA2 in zip(fasta, peptide):
i += 1
j += 1
if AA1 == AA2:
continue
else:
res.append(str(i) + "|" + AA1 + ">" + AA2)
return res

def peptide_blast_protein(fasta, peptide):
length = len(peptide)
position_set = set()
for fasta in fasta_set:
if len(fasta) >= length:
alignments_score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0, score_only=True)
if alignments_score == length:
return "canonical"
elif alignments_score == length - 1:
alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0)
for alignment in alignments_local:
# insertion e.g., ABCDMEFGH<----ABCDEFGH
if alignment.end - alignment.start == length + 1:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
# substitution e.g., ABCDMFGH<----ABCDEFGH
elif alignment.end - alignment.start == length:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
# substitution e.g., ABCDEFGM<----ABCDEFGH
elif alignment.end - alignment.start == length - 1:
s = fasta[alignment.start:alignment.end]
if peptide[0] != s[0]:
position_set.add(1)
elif peptide[-1] != s[-1]:
position_set.add(length)
elif alignments_score == length - 2:
alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=-1,
open=-1, extend=0)
for alignment in alignments_local:
# deletion e.g., ABCEFGH<----ABCDEFGH
if alignment.end - alignment.start == length and alignment.score == length - 2:
s = fasta[alignment.start:alignment.end - 1]
if pairwise2.align.localms(sequenceA=s, sequenceB=peptide, match=1, mismatch=0, open=0,
extend=0, score_only=True) == length - 1:
for i in range(length - 1):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
if position_set:
return position_set
mismatch = []
if len(fasta) >= length:
score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide,
match=1, mismatch=0, open=-2, extend=-2, score_only=True)
if score == length-1:
alignment = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide,
match=1, mismatch=0, open=-2, extend=-2)[0]
if alignment.end - alignment.start == length:
mismatch = get_details(alignment.seqA[alignment.start:alignment.end], alignment.seqB[alignment.start:alignment.end])
elif alignment.end - alignment.start == length-1:
res = get_details(alignment.seqA[alignment.start:alignment.end+1], alignment.seqB[alignment.start:alignment.end+1])
if len(res) == 1:
if res[0].split(">")[1]!="-":
mismatch = res
else:
mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
elif len(res) == 0:
mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
else:
print("Number of mismatch Error")
return mismatch

def _blast_set(fasta_dict, peptide):
positions = dict()
for fasta in fasta_dict.keys():
mismatch = peptide_blast_protein(fasta, peptide)
if len(mismatch) == 1:
if positions.get(mismatch[0]):
positions[mismatch[0]].update(fasta_dict[fasta])
else:
positions[mismatch[0]] = fasta_dict[fasta]
elif len(mismatch) > 1:
print("Number of mismatch > 1")
print(peptide)
print(fasta)
print(mismatch)
if positions:
res = []
for key,value in positions.items():
splits = key.split("|")
splits.append(",".join(value))
res.append(splits)
return res
else:
return "non-canonical"


class BlastGetPositionService(ParameterConfiguration):
CONFIG_KEY_BlastGetPosition = 'blast_get_position'
# CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
CONFIG_INPUT_REFERENCE_DATABASE = 'input_reference_database'
CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'

Expand All @@ -79,9 +87,13 @@ def __init__(self, config_data, pipeline_arguments):
self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES,
default_value='40')

self.fa_set = set()

self.fasta_dict = dict()
for j in SeqIO.parse(self._input_reference_database, "fasta"):
self.fa_set.add(str(j.seq))
if self.fasta_dict.get(str(j.seq)):
self.fasta_dict[str(j.seq)].add(j.id)
else:
self.fasta_dict[str(j.seq)] = {j.id}
self.blast_dict = Manager().dict()

def get_blast_parameters(self, variable: str, default_value):
Expand All @@ -104,7 +116,7 @@ def _blast_canonical(self, df):

auto.make_automaton()

for protein_seq in self.fa_set:
for protein_seq in self.fasta_dict.keys():
for end_ind, found in auto.iter(protein_seq):
seq_dict[found] = "canonical"
print("Found", found, "at position", end_ind, "in protein sequence")
Expand All @@ -113,13 +125,31 @@ def _blast_canonical(self, df):
return df

def _result(self, sequence):
self.blast_dict[sequence] = _blast_set(self.fa_set, sequence)
self.blast_dict[sequence] = _blast_set(self.fasta_dict, sequence)

def blast(self, input_psm_to_blast, output_psm):
"""
Blast peptide and reference protein database to find variation sites.
:param input_psm_to_blast: input PSM table to blast
:param output_psm: output PSM table
:return:
"""

start_time = datetime.datetime.now()
print("Start time :", start_time)

psm = pd.read_table(input_psm_to_blast, header=0, sep="\t")
if input_psm_to_blast.endswith(".csv.gz"):
psm = pd.read_csv(input_psm_to_blast, header=0, sep=",", compression="gzip")
elif input_psm_to_blast.endswith(".csv"):
psm = pd.read_csv(input_psm_to_blast, header=0, sep=",")
elif input_psm_to_blast.endswith(".tsv.gz"):
psm = pd.read_table(input_psm_to_blast, header=0, sep="\t", compression="gzip")
elif input_psm_to_blast.endswith(".tsv"):
psm = pd.read_table(input_psm_to_blast, header=0, sep="\t")
else:
raise ValueError("The input file format is not supported.")


psm = self._blast_canonical(psm)

first_filter = psm[psm.position == "canonical"]
Expand All @@ -145,25 +175,24 @@ def blast(self, input_psm_to_blast, output_psm):
psm_to_findpos = psm_to_findpos[psm_to_findpos.position != "non-canonical"]

if len(psm_to_findpos) > 0:
psm_to_findpos["var_num"] = psm_to_findpos.apply(lambda x: len(x["position"]), axis=1)
psm_to_findpos = psm_to_findpos.loc[psm_to_findpos.index.repeat(psm_to_findpos["var_num"])]
psm_to_findpos["var_num"].iloc[0] = 0
psm_id = psm_to_findpos["PSM_ID"].iloc[0]
for i in range(1, psm_to_findpos.shape[0]):
if psm_to_findpos["PSM_ID"].iloc[i] == psm_id:
psm_to_findpos["var_num"].iloc[i] = psm_to_findpos["var_num"].iloc[i - 1] + 1
else:
psm_id = psm_to_findpos["PSM_ID"].iloc[i]
psm_to_findpos["var_num"].iloc[i] = 0
psm_to_findpos["position"] = psm_to_findpos.apply(
lambda x: str(x["position"])[1:-1].split(",")[x["var_num"]],
axis=1)
psm_to_findpos.drop(columns="var_num", axis=1, inplace=True)
psm_to_findpos["position"] = psm_to_findpos.apply(lambda x: x["position"].replace(' ', ''), axis=1)
psm_to_findpos = psm_to_findpos.explode("position", ignore_index=True)
psm_to_findpos["variant"] = psm_to_findpos["position"].apply(lambda x: x[1])
psm_to_findpos["protein"] = psm_to_findpos["position"].apply(lambda x: x[2])
psm_to_findpos["position"] = psm_to_findpos["position"].apply(lambda x: x[0])

all_psm_out = pd.concat([first_filter, second_filter, non_filter, psm_to_findpos], axis=0, join='outer')
all_psm_out = all_psm_out.sort_values("PSM_ID")
all_psm_out.to_csv(output_psm, header=1, sep="\t", index=None)
all_psm_out = all_psm_out.sort_values("usi")

if output_psm.endswith(".csv.gz"):
all_psm_out.to_csv(output_psm, header=True, sep=",", index=None, compression="gzip")
elif output_psm.endswith(".csv"):
all_psm_out.to_csv(output_psm, header=True, sep=",", index=None)
elif output_psm.endswith(".tsv.gz"):
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None, compression="gzip")
elif output_psm.endswith(".tsv"):
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)
else:
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)

end_time = datetime.datetime.now()
print("End time :", end_time)
Expand Down
Loading

0 comments on commit 3939fc2

Please sign in to comment.