Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor bug fixed #78

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 24 additions & 6 deletions pypgatk/proteogenomics/blast_get_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,28 @@ def _result(self, sequence):
self.blast_dict[sequence] = _blast_set(self.fa_set, sequence)

def blast(self, input_psm_to_blast, output_psm):
"""
Blast peptide and reference protein database to find variation sites.
:param input_psm_to_blast: input PSM table to blast
:param output_psm: output PSM table
:return:
"""

start_time = datetime.datetime.now()
print("Start time :", start_time)

psm = pd.read_table(input_psm_to_blast, header=0, sep="\t")
if input_psm_to_blast.endswith(".csv.gz"):
psm = pd.read_csv(input_psm_to_blast, header=0, sep=",", compression="gzip")
elif input_psm_to_blast.endswith(".csv"):
psm = pd.read_csv(input_psm_to_blast, header=0, sep=",")
elif input_psm_to_blast.endswith(".tsv.gz"):
psm = pd.read_table(input_psm_to_blast, header=0, sep="\t", compression="gzip")
elif input_psm_to_blast.endswith(".tsv"):
psm = pd.read_table(input_psm_to_blast, header=0, sep="\t")
else:
raise ValueError("The input file format is not supported.")

psm = psm.head(4)
psm = self._blast_canonical(psm)

first_filter = psm[psm.position == "canonical"]
Expand Down Expand Up @@ -148,12 +166,12 @@ def blast(self, input_psm_to_blast, output_psm):
psm_to_findpos["var_num"] = psm_to_findpos.apply(lambda x: len(x["position"]), axis=1)
psm_to_findpos = psm_to_findpos.loc[psm_to_findpos.index.repeat(psm_to_findpos["var_num"])]
psm_to_findpos["var_num"].iloc[0] = 0
psm_id = psm_to_findpos["PSM_ID"].iloc[0]
psm_id = psm_to_findpos["usi"].iloc[0]
for i in range(1, psm_to_findpos.shape[0]):
if psm_to_findpos["PSM_ID"].iloc[i] == psm_id:
if psm_to_findpos["usi"].iloc[i] == psm_id:
psm_to_findpos["var_num"].iloc[i] = psm_to_findpos["var_num"].iloc[i - 1] + 1
else:
psm_id = psm_to_findpos["PSM_ID"].iloc[i]
psm_id = psm_to_findpos["usi"].iloc[i]
psm_to_findpos["var_num"].iloc[i] = 0
psm_to_findpos["position"] = psm_to_findpos.apply(
lambda x: str(x["position"])[1:-1].split(",")[x["var_num"]],
Expand All @@ -162,8 +180,8 @@ def blast(self, input_psm_to_blast, output_psm):
psm_to_findpos["position"] = psm_to_findpos.apply(lambda x: x["position"].replace(' ', ''), axis=1)

all_psm_out = pd.concat([first_filter, second_filter, non_filter, psm_to_findpos], axis=0, join='outer')
all_psm_out = all_psm_out.sort_values("PSM_ID")
all_psm_out.to_csv(output_psm, header=1, sep="\t", index=None)
all_psm_out = all_psm_out.sort_values("usi")
all_psm_out.to_csv(output_psm, header=1, sep=",", index=None)

end_time = datetime.datetime.now()
print("End time :", end_time)
Expand Down
2 changes: 1 addition & 1 deletion pypgatk/testdata/test_blast_psms.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
PSH sequence PSM_ID accession unique database database_version search_engine search_engine_score[1] modifications retention_time charge exp_mass_to_charge calc_mass_to_charge spectra_ref pre post start end opt_global_q-value opt_global_cv_MS:1002217_decoy_peptide opt_global_cv_MS:1000889_peptidoform_sequence SpecFile ScanNum
PSH sequence usi accession unique database database_version search_engine search_engine_score[1] modifications retention_time charge exp_mass_to_charge calc_mass_to_charge spectra_ref pre post start end opt_global_q-value opt_global_cv_MS:1002217_decoy_peptide opt_global_cv_MS:1000889_peptidoform_sequence SpecFile ScanNum
PSM YHTINGHNAEVR 0 "ENSP00000504571.1,ENSP00000503242.1,ENSP00000503961.1,ENSP00000504660.1,ENSP00000497298.1,ENSP00000503452.1,ENSP00000504799.1,ENSP00000503190.1,ENSP00000503898.1,ENSP00000503968.1,ENSP00000503885.1,ENSP00000504049.1,ENSP00000503550.1,ENSP00000503870.1,ENSP00000503521.1,ENSP00000503236.1,ENSP00000503360.1,ENSP00000503021.1,ENSP00000503915.1,ENSP00000503460.1,ENSP00000346694.4,ENSP00000478691.2,ENSP00000504439.1,ENSP00000504329.1,ENSP00000503476.1,ENSP00000504831.1,ENSP00000504023.1,ENSP00000504721.1,ENSP00000503514.1,ENSP00000503375.1,ENSP00000349101.8,ENSP00000503047.1,ENSP00000503833.1,ENSP00000503836.1,ENSP00000503703.1,ENSP00000503429.1,ENSP00000354021.4,ENSP00000504415.1,ENSP00000503060.1,ENSP00000503501.1,altorf_ENST00000679318.1_2,altorf_ENST00000677339.1_2,altorf_ENST00000678501.1_2,altorf_ENST00000676903.1_2,altorf_ENST00000608362.2_2,altorf_ENST00000677631.1_2,altorf_ENST00000676749.1_2,altorf_ENST00000678035.1_1,altorf_ENST00000678075.1_1,altorf_ENST00000678183.1_3,altorf_ENST00000679021.1_3,altorf_ENST00000677321.1_2,altorf_ENST00000677571.1_2,altorf_ENST00000677906.1_2,altorf_ENST00000678277.1_3,altorf_ENST00000678973.1_2,altorf_ENST00000679124.1_2,altorf_ENST00000679123.1_2,altorf_ENST00000677574.1_2,altorf_ENST00000678631.1_2,altorf_ENST00000678998.1_2,altorf_ENST00000354667.8_2,altorf_ENST00000618183.5_2,altorf_ENST00000677839.1_2,altorf_ENST00000676746.1_3,altorf_ENST00000678675.1_3,altorf_ENST00000676524.1_2,altorf_ENST00000678935.1_2,altorf_ENST00000678962.1_2,altorf_ENST00000679001.1_2,altorf_ENST00000678449.1_2,altorf_ENST00000356674.8_3,altorf_ENST00000678697.1_2,altorf_ENST00000678431.1_2,altorf_ENST00000676497.1_2,altorf_ENST00000677396.1_2,altorf_ENST00000678779.1_2,altorf_ENST00000360787.8_2,altorf_ENST00000679243.1_2,altorf_ENST00000677656.1_2,altorf_ENST00000678884.1_2,ncRNA_ENST00000677075.1_1,ncRNA_ENST00000476233.2_2,ncRNA_ENST00000676932.1_2,ncRNA_ENST00000677669.1_3,ncRNA_ENST00000490912.6_3,ncRNA_ENST00000463181.5_2,ncRNA_ENST00000495810.2_2,COSMIC:HNRNPA2B1_ENST00000618183:p.R225S:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.H108P:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.K104N:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.R190G:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.G65V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.M53I:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.L37*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000618183:p.E11G:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.E92Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.E133Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.D87H:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.M1?:,COSMIC:HNRNPA2B1:p.G233A:Substitution-Missense,COSMIC:HNRNPA2B1:p.G280C:Substitution-Missense,COSMIC:HNRNPA2B1:p.G224*:Substitution-Nonsense,COSMIC:HNRNPA2B1:p.N255Y:Substitution-Missense,COSMIC:HNRNPA2B1:p.K104N:Substitution-Missense,COSMIC:HNRNPA2B1:p.G285S:Substitution-Missense,COSMIC:HNRNPA2B1:p.H108P:Substitution-Missense,COSMIC:HNRNPA2B1:p.R190G:Substitution-Missense,COSMIC:HNRNPA2B1:p.G65V:Substitution-Missense,COSMIC:HNRNPA2B1:p.M53I:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G221A:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G268C:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.N243Y:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G212*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.G273S:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.R178G:Substitution-Missense,COSMIC:HNRNPA2B1:p.L37*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.H96P:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.K92N:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G53V:Substitution-Missense,COSMIC:HNRNPA2B1:p.G214V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.M41I:Substitution-Missense,COSMIC:HNRNPA2B1:p.G237V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.L25*:Substitution-Nonsense,COSMIC:HNRNPA2B1:p.R203K:Substitution-Missense,COSMIC:HNRNPA2B1:p.Y336C:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G202V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G225V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.R191K:Substitution-Missense,COSMIC:HNRNPA2B1:p.G332C:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.Y324C:Substitution-Missense,COSMIC:HNRNPA2B1:p.E11G:Substitution-Missense,COSMIC:HNRNPA2B1:p.E133Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G320C:Substitution-Missense,COSMIC:HNRNPA2B1:p.E92Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.E121Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.E80Q:Substitution-Missense,COSMIC:HNRNPA2B1:p.M1?:,COSMIC:HNRNPA2B1:p.D87H:Substitution-Missense,COSMIC:HNRNPA2B1:p.G248*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.D75H:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.M1?:,COSMIC:HNRNPA2B1:p.G217V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G236*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.G205V:Substitution-Missense,cbiomut:ENST00000354667:HNRNPA2B1:p.D76H:Missense_Mutation,cbiomut:ENST00000354667:HNRNPA2B1:p.D76H:Missense_Mutation,cbiomut:ENST00000354667:HNRNPA2B1:p.D76H:Missense_Mutation" 0 PXD014145_decoy null "[, , Percolator, 3.05]" 0.642512 null 436.4756905 3 470.901519 470.9006154 ms_run[8]:controllerType=0 controllerNumber=1 scan=1500 "K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K" "K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K" "162,162,162,162,162,162,162,122,122,174,174,162,162,162,162,162,174,162,162,162,174,162,162,174,174,162,162,162,174,162,174,162,162,162,162,162,174,162,162,162,395,455,395,395,395,395,395,474,474,326,326,395,395,395,382,395,395,439,455,395,395,230,218,395,326,326,395,395,395,439,455,326,395,455,395,395,395,230,455,395,395,566,395,395,431,1358,1118,395,174,174,174,174,174,174,173,174,174,174,174,174,174,174,174,174,174,174,174,174,174,174,162,162,162,162,162,162,173,162,162,162,174,162,174,161,174,174,162,162,162,174,162,174,174,162,174,162,162,174,174,174,162,162,174,162,162,174,174,174" "173,173,173,173,173,173,173,133,133,185,185,173,173,173,173,173,185,173,173,173,185,173,173,185,185,173,173,173,185,173,185,173,173,173,173,173,185,173,173,173,406,466,406,406,406,406,406,485,485,337,337,406,406,406,393,406,406,450,466,406,406,241,229,406,337,337,406,406,406,450,466,337,406,466,406,406,406,241,466,406,406,577,406,406,442,1369,1129,406,185,185,185,185,185,185,184,185,185,185,185,185,185,185,185,185,185,185,185,185,185,185,173,173,173,173,173,173,184,173,173,173,185,173,185,172,185,185,173,173,173,185,173,185,185,173,185,173,173,185,185,185,173,173,185,173,173,185,185,185" 0.0561798 0 YHTINGHNAEVR test_blast_validate.mzML 1500
PSM KMVSLAK 4 pseudo_ENST00000454683.1_2 1 PXD014145_decoy null "[, , Percolator, 3.05]" 0.668987 null 741.1 2 388.740661 388.7385759 ms_run[1]:controllerType=0 controllerNumber=1 scan=3252 R N 443 449 0.0947368 0 KMVSLAK test_blast_validate.mzML 3252
PSM AAMAAWPPAAQAAAAAVAVVGGGGEPGAPR 8 "altorf_ENST00000247706.4_2,altorf_ENST00000593489.1_2" 0 PXD014145_decoy null "[, , Percolator, 3.05]" 0.547212 null 1209.2 5 529.4764486 529.4750268 ms_run[5]:controllerType=0 controllerNumber=1 scan=6341 "R,R" "G,G" "183,147" "212,176" 0.0526316 0 AAMAAWPPAAQAAAAAVAVVGGGGEPGAPR test_blast_validate.mzML 6341
Loading