Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update new blast and spectrumAI #79

Merged
merged 1 commit into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions pypgatk/commands/validate_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
@click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on',
is_flag=True)
@click.option('-msgf', '--msgf',
help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format',
@click.option('-m', '--mztab',
help='If the tsv was obtained from mzTab, please enable this option. Default to tsv obtained from parquet',
is_flag=True)
@click.pass_context
def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance,
number_of_processes, relative, msgf):
number_of_processes, relative, mztab):
config_data = None
if config_file is not None:
config_data = read_yaml_from_file(config_file)
Expand All @@ -47,8 +47,8 @@ def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outf
pipeline_arguments[ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
if relative is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_RELATIVE] = relative
if msgf is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MSGF] = msgf
if mztab is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MZTAB] = mztab

validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
if validate_flag:
Expand Down
152 changes: 82 additions & 70 deletions pypgatk/proteogenomics/blast_get_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,61 +8,69 @@

from pypgatk.toolbox.general import ParameterConfiguration

def get_details(fasta, peptide):
res = []
i = 0
j = 0
for AA1, AA2 in zip(fasta, peptide):
i += 1
j += 1
if AA1 == AA2:
continue
else:
res.append(str(i) + "|" + AA1 + ">" + AA2)
return res

def _blast_set(fasta_set, peptide):
def peptide_blast_protein(fasta, peptide):
length = len(peptide)
position_set = set()
for fasta in fasta_set:
if len(fasta) >= length:
alignments_score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0, score_only=True)
if alignments_score == length:
return "canonical"
elif alignments_score == length - 1:
alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0)
for alignment in alignments_local:
# insertion e.g., ABCDMEFGH<----ABCDEFGH
if alignment.end - alignment.start == length + 1:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
# substitution e.g., ABCDMFGH<----ABCDEFGH
elif alignment.end - alignment.start == length:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
# substitution e.g., ABCDEFGM<----ABCDEFGH
elif alignment.end - alignment.start == length - 1:
s = fasta[alignment.start:alignment.end]
if peptide[0] != s[0]:
position_set.add(1)
elif peptide[-1] != s[-1]:
position_set.add(length)
elif alignments_score == length - 2:
alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=-1,
open=-1, extend=0)
for alignment in alignments_local:
# deletion e.g., ABCEFGH<----ABCDEFGH
if alignment.end - alignment.start == length and alignment.score == length - 2:
s = fasta[alignment.start:alignment.end - 1]
if pairwise2.align.localms(sequenceA=s, sequenceB=peptide, match=1, mismatch=0, open=0,
extend=0, score_only=True) == length - 1:
for i in range(length - 1):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
if position_set:
return position_set
mismatch = []
if len(fasta) >= length:

Check notice on line 27 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L27

Trailing whitespace
score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide,

Check notice on line 28 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L28

Trailing whitespace
match=1, mismatch=0, open=-2, extend=-2, score_only=True)
if score == length-1:
alignment = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide,

Check notice on line 31 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L31

Trailing whitespace
match=1, mismatch=0, open=-2, extend=-2)[0]
if alignment.end - alignment.start == length:
mismatch = get_details(alignment.seqA[alignment.start:alignment.end], alignment.seqB[alignment.start:alignment.end])
elif alignment.end - alignment.start == length-1:
res = get_details(alignment.seqA[alignment.start:alignment.end+1], alignment.seqB[alignment.start:alignment.end+1])
if len(res) == 1:
if res[0].split(">")[1]!="-":
mismatch = res
else:
mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
elif len(res) == 0:
mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
else:
print("Number of mismatch Error")
return mismatch

def _blast_set(fasta_dict, peptide):
positions = dict()
for fasta in fasta_dict.keys():
mismatch = peptide_blast_protein(fasta, peptide)
if len(mismatch) == 1:
if positions.get(mismatch[0]):
positions[mismatch[0]].update(fasta_dict[fasta])
else:
positions[mismatch[0]] = fasta_dict[fasta]
elif len(mismatch) > 1:
print("Number of mismatch > 1")
print(peptide)
print(fasta)
print(mismatch)
if positions:
res = []
for key,value in positions.items():
splits = key.split("|")
splits.append(",".join(value))
res.append(splits)

Check notice on line 67 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L67

Trailing whitespace
return res
else:
return "non-canonical"


class BlastGetPositionService(ParameterConfiguration):
CONFIG_KEY_BlastGetPosition = 'blast_get_position'
# CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
CONFIG_INPUT_REFERENCE_DATABASE = 'input_reference_database'
CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'

Expand All @@ -79,9 +87,14 @@
self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES,
default_value='40')

self.fa_set = set()

self.fasta_dict = dict()
for j in SeqIO.parse(self._input_reference_database, "fasta"):
self.fa_set.add(str(j.seq))
if self.fasta_dict.get(str(j.seq)):
self.fasta_dict[str(j.seq)].add(j.id)
else:
self.fasta_dict[str(j.seq)] = {j.id}

Check notice on line 97 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L97

Trailing whitespace
self.blast_dict = Manager().dict()

def get_blast_parameters(self, variable: str, default_value):
Expand All @@ -92,7 +105,7 @@
variable in self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition]:
value_return = self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition][variable]
return value_return

def _blast_canonical(self, df):
seq_set = set(df["sequence"].to_list())

Expand All @@ -104,7 +117,7 @@

auto.make_automaton()

for protein_seq in self.fa_set:
for protein_seq in self.fasta_dict.keys():
for end_ind, found in auto.iter(protein_seq):
seq_dict[found] = "canonical"
print("Found", found, "at position", end_ind, "in protein sequence")
Expand All @@ -113,7 +126,7 @@
return df

def _result(self, sequence):
self.blast_dict[sequence] = _blast_set(self.fa_set, sequence)
self.blast_dict[sequence] = _blast_set(self.fasta_dict, sequence)

def blast(self, input_psm_to_blast, output_psm):
"""
Expand All @@ -137,7 +150,7 @@
else:
raise ValueError("The input file format is not supported.")

psm = psm.head(4)

psm = self._blast_canonical(psm)

first_filter = psm[psm.position == "canonical"]
Expand All @@ -163,25 +176,24 @@
psm_to_findpos = psm_to_findpos[psm_to_findpos.position != "non-canonical"]

if len(psm_to_findpos) > 0:
psm_to_findpos["var_num"] = psm_to_findpos.apply(lambda x: len(x["position"]), axis=1)
psm_to_findpos = psm_to_findpos.loc[psm_to_findpos.index.repeat(psm_to_findpos["var_num"])]
psm_to_findpos["var_num"].iloc[0] = 0
psm_id = psm_to_findpos["usi"].iloc[0]
for i in range(1, psm_to_findpos.shape[0]):
if psm_to_findpos["usi"].iloc[i] == psm_id:
psm_to_findpos["var_num"].iloc[i] = psm_to_findpos["var_num"].iloc[i - 1] + 1
else:
psm_id = psm_to_findpos["usi"].iloc[i]
psm_to_findpos["var_num"].iloc[i] = 0
psm_to_findpos["position"] = psm_to_findpos.apply(
lambda x: str(x["position"])[1:-1].split(",")[x["var_num"]],
axis=1)
psm_to_findpos.drop(columns="var_num", axis=1, inplace=True)
psm_to_findpos["position"] = psm_to_findpos.apply(lambda x: x["position"].replace(' ', ''), axis=1)
psm_to_findpos = psm_to_findpos.explode("position", ignore_index=True)
psm_to_findpos["variant"] = psm_to_findpos["position"].apply(lambda x : x[1])

Check notice on line 180 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L180

Trailing whitespace
psm_to_findpos["protein"] = psm_to_findpos["position"].apply(lambda x : x[2])

Check notice on line 181 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L181

Trailing whitespace
psm_to_findpos["position"] = psm_to_findpos["position"].apply(lambda x : x[0])

Check notice on line 182 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L182

Trailing whitespace

all_psm_out = pd.concat([first_filter, second_filter, non_filter, psm_to_findpos], axis=0, join='outer')
all_psm_out = all_psm_out.sort_values("usi")
all_psm_out.to_csv(output_psm, header=1, sep=",", index=None)

if output_psm.endswith(".csv.gz"):
all_psm_out.to_csv(output_psm, header=True, sep=",", index=None, compression="gzip")
elif output_psm.endswith(".csv"):
all_psm_out.to_csv(output_psm, header=True, sep=",", index=None)
elif output_psm.endswith(".tsv.gz"):
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None, compression="gzip")
elif output_psm.endswith(".tsv"):
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)
else:
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)

end_time = datetime.datetime.now()
print("End time :", end_time)
Expand Down
Loading
Loading