-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #22 from yhoogstrate/ensembl_bed_conversion
Added utility for GTF to BED conversion
- Loading branch information
Showing
10 changed files
with
227 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
#!/usr/bin/env python | ||
|
||
import fuma,sys | ||
|
||
from fuma.CLI import CLI_ensmble_gtf_to_bed_converter | ||
|
||
## Coordinate systems
##
## GTF
##  - start: 1-based, inclusive
##  - end:   1-based, inclusive
##
## BED
##  - start: 0-based, inclusive
##  - end:   0-based, exclusive (numerically equal to the 1-based inclusive end)
##
## Hence only the start coordinate is shifted (by -1) during the conversion.


def _attribute(attributes, key):
    """Extract one value from the attribute column (9th column) of a GTF record.

    The value is the text between the first occurrence of `key` and the next
    ';', with surrounding spaces and double quotes removed.

    @param attributes: raw content of the attribute column
    @param key: attribute name, e.g. "gene_id"
    @return: the attribute value, or None if `key` does not occur
    """
    parts = attributes.split(key, 1)
    if len(parts) < 2:
        return None
    return parts[1].split(";", 1)[0].strip(" ").strip('"')


def parse_gtf_transcripts(handle):
    """Collapse the records of a GTF file into one span per transcript.

    Empty lines and lines starting with '#' are ignored. Records without a
    transcript_id attribute are skipped, since they cannot be assigned to a
    transcript.

    @param handle: iterable of GTF lines (e.g. an opened file)
    @return: dict { transcript_id: { chromosome:
             [min_pos, max_pos, inversed, strand, gene_id, chromosome] } }
             with 1-based inclusive positions as found in the GTF
    @raise Exception: if a record lacks a gene_id, or if a record that would
             extend the span of a transcript has another strand or another
             start/end orientation than the first record of that transcript
    """
    idx = {}

    for line in handle:
        line = line.strip()
        if not line or line[0] == "#":
            continue

        params = line.split("\t")
        chrom, strand, attributes = params[0], params[6], params[8]

        transcript_id = _attribute(attributes, "transcript_id")
        if transcript_id is None:
            # Nothing to aggregate on; previously this crashed with an IndexError
            continue

        gene_id = _attribute(attributes, "gene_id")
        if gene_id is None:
            raise Exception("Error: record without gene_id:\n" + line)

        start = int(params[3])
        end = int(params[4])
        inversed = (end < start)

        min_pos = min(start, end)
        max_pos = max(start, end)

        per_chrom = idx.setdefault(transcript_id, {})
        if chrom not in per_chrom:
            per_chrom[chrom] = [min_pos, max_pos, inversed, strand, gene_id, chrom]
        record = per_chrom[chrom]

        # The span may only be extended by records that agree with the first
        # record of the transcript on both orientation and strand
        same_direction = (inversed == record[2] and strand == record[3])

        if min_pos < record[0]:
            if not same_direction:
                raise Exception("Error: transcript annotated in different directions:\n" + line)
            record[0] = min_pos

        if max_pos > record[1]:
            if not same_direction:
                raise Exception("Error: transcript annotated in different directions:\n" + line)
            record[1] = max_pos

    return idx


def transcripts_to_bed_lines(idx):
    """Convert the transcript spans into unique, sorted BED lines.

    Each line holds: chromosome, 0-based start, end and gene_id, separated by
    tabs. The gene_id is emitted unchanged, so any version suffix it carries
    is kept. Transcripts that result in identical lines are reported once.

    @param idx: dict as returned by parse_gtf_transcripts()
    @return: list of BED lines (without newline), sorted as strings
    @raise Exception: if a transcript is not annotated on exactly one
             chromosome
    """
    lines = set()

    # Sorted traversal makes the first reported error deterministic
    for t in sorted(idx):
        if len(idx[t]) != 1:
            raise Exception("Error: " + t + " has either no annotated chromosomes or multiple")

        data = next(iter(idx[t].values()))
        lines.add("\t".join([
            data[5],            # chr
            str(data[0] - 1),   # start: 1-based -> 0-based
            str(data[1]),       # end
            data[4],            # gene_id
        ]))

    # String sorting is essential to get reproducible output for functional testing
    return sorted(lines)


def main():
    """Parse the command line, convert the given GTF file and print BED lines to stdout."""
    args = CLI_ensmble_gtf_to_bed_converter()

    with open(args.genecode_gtf_file[0], "r") as fh:
        idx = parse_gtf_transcripts(fh)

    for line in transcripts_to_bed_lines(idx):
        print(line)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
chr1 110952 129173 ENSG00000238009.2 | ||
chr1 129080 133566 ENSG00000238009.2 | ||
chr1 29553 31097 ENSG00000243485.2 | ||
chr1 30266 31109 ENSG00000243485.2 | ||
chr1 30365 30503 ENSG00000243485.2 | ||
chr1 34553 36081 ENSG00000237613.2 | ||
chr1 35244 36073 ENSG00000237613.2 | ||
chr1 69090 70008 ENSG00000186092.4 | ||
chr1 89294 120932 ENSG00000238009.2 | ||
chr1 92229 129217 ENSG00000238009.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
##description: test file | ||
##provider: STAR Fusion wiki | ||
chr1 HAVANA exon 29554 30039 . + . gene_id "ENSG00000243485.2"; transcript_id "ENST00000473358.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-001"; exon_number 1; exon_id "ENSE00001947070.1"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; | ||
chr1 HAVANA exon 30564 30667 . + . gene_id "ENSG00000243485.2"; transcript_id "ENST00000473358.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-001"; exon_number 2; exon_id "ENSE00001922571.1"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; | ||
chr1 HAVANA exon 30976 31097 . + . gene_id "ENSG00000243485.2"; transcript_id "ENST00000473358.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-001"; exon_number 3; exon_id "ENSE00001827679.1"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; | ||
chr1 HAVANA exon 30267 30667 . + . gene_id "ENSG00000243485.2"; transcript_id "ENST00000469289.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-002"; exon_number 1; exon_id "ENSE00001841699.1"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; | ||
chr1 HAVANA exon 30976 31109 . + . gene_id "ENSG00000243485.2"; transcript_id "ENST00000469289.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-002"; exon_number 2; exon_id "ENSE00001890064.1"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; | ||
chr1 ENSEMBL exon 30366 30503 . + . gene_id "ENSG00000243485.2"; transcript_id "ENST00000607096.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "miRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-201"; exon_number 1; exon_id "ENSE00003695741.1"; level 3; tag "basic"; havana_gene "OTTHUMG00000000959.2"; | ||
chr1 HAVANA exon 35721 36081 . - . gene_id "ENSG00000237613.2"; transcript_id "ENST00000417324.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138A"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138A-001"; exon_number 1; exon_id "ENSE00001656588.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002842.1"; | ||
chr1 HAVANA exon 35277 35481 . - . gene_id "ENSG00000237613.2"; transcript_id "ENST00000417324.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138A"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138A-001"; exon_number 2; exon_id "ENSE00001669267.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002842.1"; | ||
chr1 HAVANA exon 34554 35174 . - . gene_id "ENSG00000237613.2"; transcript_id "ENST00000417324.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138A"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138A-001"; exon_number 3; exon_id "ENSE00001727627.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002842.1"; | ||
chr1 HAVANA exon 35721 36073 . - . gene_id "ENSG00000237613.2"; transcript_id "ENST00000461467.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138A"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138A-002"; exon_number 1; exon_id "ENSE00001618781.2"; level 2; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002843.1"; | ||
chr1 HAVANA exon 35245 35481 . - . gene_id "ENSG00000237613.2"; transcript_id "ENST00000461467.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138A"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138A-002"; exon_number 2; exon_id "ENSE00001874421.1"; level 2; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002843.1"; | ||
chr1 HAVANA exon 69091 70008 . + . gene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; exon_number 1; exon_id "ENSE00002319515.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1"; | ||
chr1 HAVANA exon 120775 120932 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000466430.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-001"; exon_number 1; exon_id "ENSE00001606755.2"; level 2; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003225.1"; | ||
chr1 HAVANA exon 112700 112804 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000466430.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-001"; exon_number 2; exon_id "ENSE00001957285.1"; level 2; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003225.1"; | ||
chr1 HAVANA exon 92091 92240 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000466430.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-001"; exon_number 3; exon_id "ENSE00001944529.1"; level 2; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003225.1"; | ||
chr1 HAVANA exon 89295 91629 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000466430.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-001"; exon_number 4; exon_id "ENSE00001846804.1"; level 2; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003225.1"; | ||
chr1 HAVANA exon 129055 129217 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000477740.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-003"; exon_number 1; exon_id "ENSE00001919246.1"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003688.1"; | ||
chr1 HAVANA exon 120721 120932 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000477740.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-003"; exon_number 2; exon_id "ENSE00001171005.3"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003688.1"; | ||
chr1 HAVANA exon 112700 112804 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000477740.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-003"; exon_number 3; exon_id "ENSE00001957285.1"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003688.1"; | ||
chr1 HAVANA exon 92230 92240 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000477740.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-003"; exon_number 4; exon_id "ENSE00001896976.1"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003688.1"; | ||
chr1 HAVANA exon 129055 129173 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000471248.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-002"; exon_number 1; exon_id "ENSE00001934975.1"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003687.1"; | ||
chr1 HAVANA exon 112700 112804 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000471248.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-002"; exon_number 2; exon_id "ENSE00001957285.1"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003687.1"; | ||
chr1 HAVANA exon 110953 111357 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000471248.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-002"; exon_number 3; exon_id "ENSE00001879696.1"; level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003687.1"; | ||
chr1 HAVANA exon 133374 133566 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000453576.2"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-004"; exon_number 1; exon_id "ENSE00001737600.2"; level 2; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003689.1"; | ||
chr1 HAVANA exon 129081 129223 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000453576.2"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-004"; exon_number 2; exon_id "ENSE00001827073.1"; level 2; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003689.1"; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
chr1 29553 31097 ENSG00000243485.2 | ||
chr1 30266 31109 ENSG00000243485.2 | ||
chr1 30365 30503 ENSG00000243485.2 | ||
chr1 34553 36081 ENSG00000237613.2 | ||
chr1 35244 36073 ENSG00000237613.2 | ||
chr1 69090 70008 ENSG00000186092.4 | ||
chr1 89294 120932 ENSG00000238009.2 | ||
chr1 92229 129217 ENSG00000238009.2 | ||
chr1 110952 129173 ENSG00000238009.2 | ||
chr1 129080 133566 ENSG00000238009.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/usr/bin/env python | ||
|
||
"""[License: GNU General Public License v3 (GPLv3)] | ||
This file is part of FuMa. | ||
FuMa is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License as published by | ||
the Free Software Foundation, either version 3 of the License, or | ||
(at your option) any later version. | ||
FuMa is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
You should have received a copy of the GNU General Public License | ||
along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
Documentation as defined by: | ||
<http://epydoc.sourceforge.net/manual-fields.html#fields-synonyms> | ||
""" | ||
|
||
import unittest,logging,sys,os | ||
logging.basicConfig(level=logging.INFO,format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",stream=sys.stdout) | ||
|
||
from fuma.ParseBED import ParseBED | ||
|
||
class TestParseBED(unittest.TestCase):
    """Functional tests for the `bin/fuma-gencode-gtf-to-bed` command line tool.

    Each test runs the converter in a shell on a GTF fixture and compares what
    it writes to stdout with an expected BED file. All paths are relative to
    the current working directory.
    """

    # ensure the fuma lib is accessible for testing (also without installation)
    _PYTHONPATH_PREFIX = "export PYTHONPATH=$PYTHONPATH\":fuma:../fuma\" ;\n\n"

    def _run(self, command):
        """Run `command` in a shell and return its stdout; the pipe is always closed."""
        pipe = os.popen(command)
        try:
            return pipe.read()
        finally:
            pipe.close()

    def _read(self, path):
        """Return the full content of the text file at `path`."""
        with open(path, "r") as fh:
            return fh.read()

    def test_01(self):
        """The raw converter output equals the expected BED file."""
        inputfile = "tests/data/gencode_hg19.subset.gtf"
        outputfile = "tests/data/gencode_hg19.subset.bed"

        command = self._PYTHONPATH_PREFIX
        command += ("bin/fuma-gencode-gtf-to-bed\\\n"
                    " "+inputfile
                    )

        result = self._run(command)
        validation = self._read(outputfile)

        self.assertEqual(result, validation)

    def test_02(self):
        """The converter output piped through `sort` equals the sorted BED file."""
        inputfile = "tests/data/gencode_hg19.subset.gtf"
        outputfile = "tests/data/gencode_hg19.subset.sorted.bed"

        command = self._PYTHONPATH_PREFIX
        command += ("bin/fuma-gencode-gtf-to-bed \\\n"
                    " "+inputfile+" | sort -k1,1V -k2,2g -k3,3g "
                    )

        result = self._run(command)
        validation = self._read(outputfile)

        self.assertEqual(result, validation)
|
||
def main():
    """Hand control to the unittest runner for the test cases of this module."""
    unittest.main()


# Only start the test runner when this file is executed as a script
if __name__ == "__main__":
    main()