Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spectrum ai #80

Merged
merged 9 commits into from
Apr 24, 2024
31 changes: 15 additions & 16 deletions pypgatk/commands/validate_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,29 @@
import click

from pypgatk.toolbox.general import read_yaml_from_file
from pypgatk.proteogenomics.validate_peptides import ValidatePeptidesService
from pypgatk.proteogenomics.spectrumai import SpectrumAIService
from pypgatk.commands.utils import print_help

log = logging.getLogger(__name__)


@click.command('validate_peptides',
@click.command('spectrumai',
short_help='Command to inspect MS2 spectra of single-subsititution peptide identifications')
@click.option('-c', '--config_file', help='Configuration file for the validate peptides pipeline')
@click.option('-p', '--mzml_path', help='The mzml file path.You only need to use either mzml_path or mzml_files')
@click.option('-f', '--mzml_files',
help='The mzml files.Different files are separated by ",".You only need to use either mzml_path or mzml_files')
@click.option('-f', '--mzml_files', help='The mzml files. Different files are separated by ","')
@click.option('-i', '--infile_name', help='Variant peptide PSMs table')
@click.option('-o', '--outfile_name', help='Output file for the results')
@click.option('-ion', '--ions_tolerance', help='MS2 fragment ions mass accuracy')
@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
@click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on',
is_flag=True)
@click.option('-msgf', '--msgf',
help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format',
@click.option('-m', '--mztab',
help='If the tsv was obtained from mzTab, please enable this option. Default to tsv obtained from parquet',
is_flag=True)
@click.pass_context
def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance,
number_of_processes, relative, msgf):
def spectrumai(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance,
number_of_processes, relative, mztab):
config_data = None
if config_file is not None:
config_data = read_yaml_from_file(config_file)
Expand All @@ -38,18 +37,18 @@ def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outf
pipeline_arguments = {}

if mzml_path is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_PATH] = mzml_path
pipeline_arguments[SpectrumAIService.CONFIG_MZML_PATH] = mzml_path
if mzml_files is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_FILES] = mzml_files
pipeline_arguments[SpectrumAIService.CONFIG_MZML_FILES] = mzml_files
if ions_tolerance is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_IONS_TOLERANCE] = ions_tolerance
pipeline_arguments[SpectrumAIService.CONFIG_IONS_TOLERANCE] = ions_tolerance
if number_of_processes is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
pipeline_arguments[SpectrumAIService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
if relative is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_RELATIVE] = relative
if msgf is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MSGF] = msgf
pipeline_arguments[SpectrumAIService.CONFIG_RELATIVE] = relative
if mztab is not None:
pipeline_arguments[SpectrumAIService.CONFIG_MZTAB] = mztab

validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
validate_peptides_service = SpectrumAIService(config_data, pipeline_arguments)
if validate_flag:
validate_peptides_service.validate(infile_name, outfile_name)
171 changes: 100 additions & 71 deletions pypgatk/proteogenomics/blast_get_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,61 +8,69 @@

from pypgatk.toolbox.general import ParameterConfiguration


def _blast_set(fasta_set, peptide):
def get_details(fasta, peptide):
res = []
i = 0
j = 0
for AA1, AA2 in zip(fasta, peptide):
i += 1
j += 1
if AA1 == AA2:
continue
else:
res.append(str(i) + "|" + AA1 + ">" + AA2)
return res

def peptide_blast_protein(fasta, peptide):
length = len(peptide)
position_set = set()
for fasta in fasta_set:
if len(fasta) >= length:
alignments_score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0, score_only=True)
if alignments_score == length:
return "canonical"
elif alignments_score == length - 1:
alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0)
for alignment in alignments_local:
# insertion e.g., ABCDMEFGH<----ABCDEFGH
if alignment.end - alignment.start == length + 1:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
# substitution e.g., ABCDMFGH<----ABCDEFGH
elif alignment.end - alignment.start == length:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
# substitution e.g., ABCDEFGM<----ABCDEFGH
elif alignment.end - alignment.start == length - 1:
s = fasta[alignment.start:alignment.end]
if peptide[0] != s[0]:
position_set.add(1)
elif peptide[-1] != s[-1]:
position_set.add(length)
elif alignments_score == length - 2:
alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=-1,
open=-1, extend=0)
for alignment in alignments_local:
# deletion e.g., ABCEFGH<----ABCDEFGH
if alignment.end - alignment.start == length and alignment.score == length - 2:
s = fasta[alignment.start:alignment.end - 1]
if pairwise2.align.localms(sequenceA=s, sequenceB=peptide, match=1, mismatch=0, open=0,
extend=0, score_only=True) == length - 1:
for i in range(length - 1):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
if position_set:
return position_set
mismatch = []
if len(fasta) >= length:
score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide,
match=1, mismatch=0, open=-2, extend=-2, score_only=True)
if score == length-1:
alignment = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide,
match=1, mismatch=0, open=-2, extend=-2)[0]
if alignment.end - alignment.start == length:
mismatch = get_details(alignment.seqA[alignment.start:alignment.end], alignment.seqB[alignment.start:alignment.end])
elif alignment.end - alignment.start == length-1:
res = get_details(alignment.seqA[alignment.start:alignment.end+1], alignment.seqB[alignment.start:alignment.end+1])
if len(res) == 1:
if res[0].split(">")[1]!="-":
mismatch = res
else:
mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
elif len(res) == 0:
mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
else:
print("Number of mismatch Error")
return mismatch

def _blast_set(fasta_dict, peptide):
positions = dict()
for fasta in fasta_dict.keys():
mismatch = peptide_blast_protein(fasta, peptide)
if len(mismatch) == 1:
if positions.get(mismatch[0]):
positions[mismatch[0]].update(fasta_dict[fasta])
else:
positions[mismatch[0]] = fasta_dict[fasta]
elif len(mismatch) > 1:
print("Number of mismatch > 1")
print(peptide)
print(fasta)
print(mismatch)
if positions:
res = []
for key,value in positions.items():
splits = key.split("|")
splits.append(",".join(value))
res.append(splits)
return res
else:
return "non-canonical"


class BlastGetPositionService(ParameterConfiguration):
CONFIG_KEY_BlastGetPosition = 'blast_get_position'
# CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
CONFIG_INPUT_REFERENCE_DATABASE = 'input_reference_database'
CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'

Expand All @@ -79,9 +87,13 @@ def __init__(self, config_data, pipeline_arguments):
self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES,
default_value='40')

self.fa_set = set()

self.fasta_dict = dict()
for j in SeqIO.parse(self._input_reference_database, "fasta"):
self.fa_set.add(str(j.seq))
if self.fasta_dict.get(str(j.seq)):
self.fasta_dict[str(j.seq)].add(j.id)
else:
self.fasta_dict[str(j.seq)] = {j.id}
self.blast_dict = Manager().dict()

def get_blast_parameters(self, variable: str, default_value):
Expand All @@ -104,7 +116,7 @@ def _blast_canonical(self, df):

auto.make_automaton()

for protein_seq in self.fa_set:
for protein_seq in self.fasta_dict.keys():
for end_ind, found in auto.iter(protein_seq):
seq_dict[found] = "canonical"
print("Found", found, "at position", end_ind, "in protein sequence")
Expand All @@ -113,13 +125,31 @@ def _blast_canonical(self, df):
return df

def _result(self, sequence):
self.blast_dict[sequence] = _blast_set(self.fa_set, sequence)
self.blast_dict[sequence] = _blast_set(self.fasta_dict, sequence)

def blast(self, input_psm_to_blast, output_psm):
"""
Blast peptide and reference protein database to find variation sites.
:param input_psm_to_blast: input PSM table to blast
:param output_psm: output PSM table
:return:
"""

start_time = datetime.datetime.now()
print("Start time :", start_time)

psm = pd.read_table(input_psm_to_blast, header=0, sep="\t")
if input_psm_to_blast.endswith(".csv.gz"):
psm = pd.read_csv(input_psm_to_blast, header=0, sep=",", compression="gzip")
elif input_psm_to_blast.endswith(".csv"):
psm = pd.read_csv(input_psm_to_blast, header=0, sep=",")
elif input_psm_to_blast.endswith(".tsv.gz"):
psm = pd.read_table(input_psm_to_blast, header=0, sep="\t", compression="gzip")
elif input_psm_to_blast.endswith(".tsv"):
psm = pd.read_table(input_psm_to_blast, header=0, sep="\t")
else:
raise ValueError("The input file format is not supported.")


psm = self._blast_canonical(psm)

first_filter = psm[psm.position == "canonical"]
Expand All @@ -145,25 +175,24 @@ def blast(self, input_psm_to_blast, output_psm):
psm_to_findpos = psm_to_findpos[psm_to_findpos.position != "non-canonical"]

if len(psm_to_findpos) > 0:
psm_to_findpos["var_num"] = psm_to_findpos.apply(lambda x: len(x["position"]), axis=1)
psm_to_findpos = psm_to_findpos.loc[psm_to_findpos.index.repeat(psm_to_findpos["var_num"])]
psm_to_findpos["var_num"].iloc[0] = 0
psm_id = psm_to_findpos["PSM_ID"].iloc[0]
for i in range(1, psm_to_findpos.shape[0]):
if psm_to_findpos["PSM_ID"].iloc[i] == psm_id:
psm_to_findpos["var_num"].iloc[i] = psm_to_findpos["var_num"].iloc[i - 1] + 1
else:
psm_id = psm_to_findpos["PSM_ID"].iloc[i]
psm_to_findpos["var_num"].iloc[i] = 0
psm_to_findpos["position"] = psm_to_findpos.apply(
lambda x: str(x["position"])[1:-1].split(",")[x["var_num"]],
axis=1)
psm_to_findpos.drop(columns="var_num", axis=1, inplace=True)
psm_to_findpos["position"] = psm_to_findpos.apply(lambda x: x["position"].replace(' ', ''), axis=1)
psm_to_findpos = psm_to_findpos.explode("position", ignore_index=True)
psm_to_findpos["variant"] = psm_to_findpos["position"].apply(lambda x: x[1])
psm_to_findpos["protein"] = psm_to_findpos["position"].apply(lambda x: x[2])
psm_to_findpos["position"] = psm_to_findpos["position"].apply(lambda x: x[0])

all_psm_out = pd.concat([first_filter, second_filter, non_filter, psm_to_findpos], axis=0, join='outer')
all_psm_out = all_psm_out.sort_values("PSM_ID")
all_psm_out.to_csv(output_psm, header=1, sep="\t", index=None)
all_psm_out = all_psm_out.sort_values("usi")

if output_psm.endswith(".csv.gz"):
all_psm_out.to_csv(output_psm, header=True, sep=",", index=None, compression="gzip")
elif output_psm.endswith(".csv"):
all_psm_out.to_csv(output_psm, header=True, sep=",", index=None)
elif output_psm.endswith(".tsv.gz"):
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None, compression="gzip")
elif output_psm.endswith(".tsv"):
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)
else:
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)

end_time = datetime.datetime.now()
print("End time :", end_time)
Expand Down
Loading
Loading