-
Notifications
You must be signed in to change notification settings - Fork 0
/
GFF_BED_Counter_Range2-2.py
238 lines (183 loc) · 8.6 KB
/
GFF_BED_Counter_Range2-2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#This script counts reads from a bed file that start within coordinates or set of coordinates determined by a GFF file.
#It also counts the number of bases from each read that overlap with the coordinates or set of coordinates determined by a GFF file
#All bed files in the folder will be processed, one at a time.
#WARNING coverage will be underestimated in adjacent bed features because if a read spans two features, only the left portion is counted.
#Only coordinates on chromosomes 1 through 10 are considered.
#The Bed file and the GFF file must be sorted by chromosome: 1 through 10, followed by non-numeric chromosomes
#This script is derived from GFF_BED_Counter_Range.py.
##########################################################################################################
#USER INPUT
##########################################################################################################
bin_size = 100 #length of bin in bp
bin_start = -1499 #start position of first bin relative to edge position of feature. "-" means upstream. Inclusive.
bin_end = 3000 #end position of last bin relative to edge of feature. "-" means upstream. Inclusive.
#location of input and output files
in_path = "/scratch/gent/regions"
#location and name of GFF file:
GFF_path = '/home/gent/references/W22_synt_genes.gff'
#file endings
ending = "MRs.bed"
#Which end?
upstream_edge = False #Indicates which edge of feature defines bins
##############################################################################################################
#I/O
##############################################################################################################
import glob
import os
os.chdir(in_path)
for filename in glob.glob(in_path + '/*' + ending):
Bed_path = filename
if upstream_edge:
out_path = filename[:-len(ending)] + '_' + GFF_path[GFF_path.rindex('/') + 1:] + '_upstream.txt'
else:
out_path = filename[:-len(ending)] + '_' + GFF_path[GFF_path.rindex('/') + 1:] + '_downstream.txt'
#GFF columns of interest
GFF_start_col = 3
GFF_end_col = 4
GFF_orient_col = 6
################################################################################################################
#Define Bed_Extractor to identify reads in each bin and write them to a new file
################################################################################################################
def Bed_Extractor(i):
#I/O
Bed_file = open(Bed_path) #File with reads
GFF_file = open(GFF_path) #File with coordinates in GFF format
#initialize sense_counter and antisense_counters
sense_counter = 0
antisense_counter = 0
#get first GFF_feature
GFF_cols = GFF_file.readline().strip().split()
try:
GFF_chr = int(GFF_cols[0])
except ValueError:
GFF_chr = 1000000000000000000000000
GFF_orient = GFF_cols[GFF_orient_col]
if upstream_edge:
if GFF_orient == '+':
GFF_start = int(GFF_cols[GFF_start_col]) + (bin_start + i*bin_size)
else:
GFF_start = int(GFF_cols[GFF_end_col]) - (bin_start + (i+1)*bin_size - 1)
else:
if GFF_orient == '+':
GFF_start = int(GFF_cols[GFF_end_col]) + (bin_start + i*bin_size)
else:
GFF_start = int(GFF_cols[GFF_start_col]) - (bin_start + (i+1)*bin_size - 1)
GFF_end = GFF_start + bin_size - 1
#Bring in one line at a time from file with reads and parse out individivual columns
for Bed_line in Bed_file:
read_has_potential = False #assume this read is not from chromosome 1-10
Bed_cols = Bed_line.strip().split('\t')
try:
Bed_chr = int(Bed_cols[0])
read_has_potential = True
except ValueError:
pass
Bed_start = int(Bed_cols[1]) + 1
Bed_end = int(Bed_cols[2])
Bed_orient = Bed_cols[5]
#Determine whether read overlaps with a GFF feature
if read_has_potential:
while read_has_potential:
#check whether read is on the right chromosome
if Bed_chr == GFF_chr:
#check whether read is far enough to the right
if Bed_start >= GFF_start or Bed_end >= GFF_start:
#check if read starts within GFF feature
if Bed_start <= GFF_end:
read_has_potential = False
#CALCULATE OVERLAP
#For sense
if Bed_orient == GFF_orient:
if Bed_start < GFF_start:
if Bed_end > GFF_end:
sense_counter += GFF_end - GFF_start + 1 #read extends past both edge of GFF feature
else:
sense_counter += Bed_end - GFF_start + 1 #read extends past left edge of GFF feature
else:
if Bed_end > GFF_end:
sense_counter += GFF_end - Bed_start + 1 #read extends past right edge of GFF feature
else:
sense_counter += Bed_end - Bed_start + 1 #read is entirely contained within GFF feature
#For antisense
else:
if Bed_start < GFF_start:
if Bed_end > GFF_end:
antisense_counter += GFF_end - GFF_start + 1 #read extends past both edge of GFF feature
else:
antisense_counter += Bed_end - GFF_start + 1 #read extends past left edge of GFF feature
else:
if Bed_end > GFF_end:
antisense_counter += GFF_end - Bed_start + 1 #read extends past right edge of GFF feature
else:
antisense_counter += Bed_end - Bed_start + 1 #read is entirely contained within GFF feature
#if read starts past the end of GFF object, compare with next GFF feature
else:
#get the chromosome and coordinates from next GFF feature
GFF_cols = GFF_file.readline().strip().split()
if len(GFF_cols) > 0:
try:
GFF_chr = int(GFF_cols[0])
GFF_orient = GFF_cols[GFF_orient_col]
if upstream_edge:
if GFF_orient == '+':
GFF_start = int(GFF_cols[GFF_start_col]) + (bin_start + i*bin_size)
else:
GFF_start = int(GFF_cols[GFF_end_col]) - (bin_start + (i+1)*bin_size - 1)
else:
if GFF_orient == '+':
GFF_start = int(GFF_cols[GFF_end_col]) + (bin_start + i*bin_size)
else:
GFF_start = int(GFF_cols[GFF_start_col]) - (bin_start + (i+1)*bin_size - 1)
GFF_end = GFF_start + bin_size - 1
#if chromosome is Unknown, Mitochondria, or Chloroplast,
except ValueError:
GFF_chr = 10000000000000000000000
read_has_potential = False
else:
GFF_chr = 10000000000000000000000
read_has_potential = False
#if read is to the left of entire gff feature, break out of while loop and get next read
else:
read_has_potential = False
elif Bed_chr > GFF_chr:
#get the chromosome and coordinates from next GFF feature
GFF_cols = GFF_file.readline().strip().split()
try:
GFF_chr = int(GFF_cols[0])
GFF_orient = GFF_cols[GFF_orient_col]
if upstream_edge:
if GFF_orient == '+':
GFF_start = int(GFF_cols[GFF_start_col]) + (bin_start + i*bin_size)
else:
GFF_start = int(GFF_cols[GFF_end_col]) - (bin_start + (i+1)*bin_size - 1)
else:
if GFF_orient == '+':
GFF_start = int(GFF_cols[GFF_end_col]) + (bin_start + i*bin_size)
else:
GFF_start = int(GFF_cols[GFF_start_col]) - (bin_start + (i+1)*bin_size - 1)
GFF_end = GFF_start + bin_size - 1
#if chromosome is Unknown, Mitochondria, or Chloroplast,
except ValueError:
GFF_chr = 10000000000000000000000
read_has_potential = False
#if there are no more features on Bed_chr, get new read
else:
read_has_potential = False
#stop when first ten chromosomes are done.
#else:
# break
out_file.write(str(bin_start + i*bin_size) + '\t' + str(sense_counter) + '\t' + str(antisense_counter) + '\t' + str(sense_counter + antisense_counter) + '\n')
#close files
Bed_file.close()
GFF_file.close()
#################################################################################################################
#Run each bin through Bed_Extractor function
################################################################################################################
out_file = open(out_path, 'w') #Write-to file
out_file.write('bin start\tsense overlap\tantisense overlap\ttotal overlap\n')
for i in range((bin_end + 1 - bin_start)//bin_size):
Bed_Extractor(i)
################################################################################################################
#Close I/O
################################################################################################################
out_file.close()