-
Notifications
You must be signed in to change notification settings - Fork 0
/
MR_Finder.py
163 lines (130 loc) · 6.67 KB
/
MR_Finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#This script will divide a CGmapTools CGmap file into bins of a defined size
#and find bins that are methylated in sequence context of interest.
#Only bins passing coverage criteria will be considered.
#Methylation is calculated as average of the methylation values of all cytosines in region, not weighted by number of reads
#All bins with passing coverage will also be output as eligible methylation regions (EMRs)
#Output formats for each region as follows:
#1 Chromosome
#2 Start position (0-based)
#3 End position (exclusive)
#4 Methylation average for region (by informative cytosine in region, not weighted by reads)
#5 Coverage
#6 Number informative cytosines
###############################################################################
#USER INPUT
###############################################################################
#Input file name and path
inPath = '/scratch/gent/merged_maps/w-EN.CGmap.gz'
#prefix for output files
outPrefix = 'w-EN_CHH.2_2X_unsorted'
#Region size in bp. (larger than read length means multiple reads usually required for 1X coverage)
binSize = 200
#Minimum read coverage of informative cytosines in region (cytosines that contribute to methylation values)
minCov = 2
#Minimum number of informative cytosines in region
minCountC = 5
#Minimum methylation to qualify as a MR, exclusive)
minMeth = .2
#Contexts to evaluate. Both can be set to the same one
context1 = 'CHH'
context2 = 'CHH'
##############################################################################
#INITIALIZE STUFF
##############################################################################
import gzip
import math
#Open files
inFile = gzip.open(inPath, 'rb')
summaryFile = open(outPrefix + '_MR_summary.txt', 'w')
MRFile = open(outPrefix + '_MRs.txt', 'w')
#Region type counts (for summary at end)
MRs = 0
#list of meth values and coverage for each cytosine for each pair of samples for each region
regionMeths = []
regionCoverages = []
newRegion = False #Use this later to decide whether to reset above lists
##############################################################################
#GET VALUES FROM FIRST LINE OF FILE
##############################################################################
line = inFile.readline()
columns = line.strip().split()
chr = columns[0].decode("utf-8")
bp = int(columns[2])
regionStart = int(binSize*math.floor(bp/binSize)) + 1 #CGmaps use 1-based annotation
context = columns[3].decode("utf-8")
print(chr)
if context == context1 or context == context2:
regionMeths.append(float(columns[5]))
regionCoverages.append(float(columns[7]))
####################################################################################
#GET VALUES FROM REST OF FILE, LINE-BY-LINE, AND PROCESS EACH REGION EXCEPT LAST ONE
####################################################################################
for line in inFile:
columns = line.strip().split()
bp = int(columns[2]) #Get position of current cytosine
if columns[0].decode("utf-8") == chr: #Check whether current cytosine is on same chromosome as previous one.
if bp < regionStart + binSize: #Append methylation and coverage values if still in same region and if in allowed context
context = columns[3].decode("utf-8")
if context == context1 or context == context2:
regionMeths.append(float(columns[5]))
regionCoverages.append(float(columns[7]))
else:
newRegion = True
else: #New chromosome means new region and need to reset chromosome name
newRegion = True
chr = columns[0].decode("utf-8")
print(chr)
if newRegion: #If new region, process previous one
#Test for eligibility based on coverage and number of informative cytosines
countC = len(regionCoverages)
if countC >= minCountC:
coverageAve = sum(regionCoverages)/countC
if coverageAve >= minCov:
#Get average methylation for region
methAve = sum(regionMeths)/countC
regionLine = chr + '\t' + str(regionStart - 1) + '\t' + str(regionStart + binSize - 1) + '\t' + str(round(methAve,3)) + '\t' + str(round(coverageAve,2)) + '\t' + str(countC) + '\n'
#If methylation low enough, write region to new file.
if methAve > minMeth:
MRFile.write(regionLine) #write region details to hyperDMR file
MRs += 1 #count number of hyperDMRs
#Reset meths and coverages to include only the values from the most recent position
regionMeths = []
regionCoverages = []
regionStart = int(binSize*math.floor(bp/binSize)) + 1 #CGmaps use 1-based annotation
if context == context1 or context == context2:
regionMeths.append(float(columns[5]))
regionCoverages.append(float(columns[7]))
newRegion = False
####################################################################################
#PROCESS LAST REGION IN FILE
####################################################################################
#Test for eligibility based on coverage and number of informative cytosines
countC = len(regionCoverages)
if countC >= minCountC:
coverageAve = sum(regionCoverages)/countC
if coverageAve >= minCov:
#Get average methylation for region
methAve = sum(regionMeths)/countC
regionLine = chr + '\t' + str(regionStart - 1) + '\t' + str(regionStart + binSize - 1) + '\t' + str(round(methAve,3)) + '\t' + str(round(coverageAve,2)) + '\t' + str(countC) + '\n'
#If methylation low enough, write region to new file.
if methAve > minMeth:
MRFile.write(regionLine) #write region details to MR file
MRs += 1 #count number of MRs
####################################################################################
#CREATE SUMMARY FILE
#####################################################################################
summaryFile.write('Input file name: ' + inPath[inPath.rindex('/') + 1:] + '\n')
summaryFile.write('Methylation is calculated as average of the methylation values of all cytosines in region. It is not weighted by number of reads\n')
summaryFile.write('Region size: ' + str(binSize) + '\n')
summaryFile.write('Minimum read coverage of informative cytosines in region: ' + str(minCov) + '\n')
summaryFile.write('Minimum number of informative cytosines in region: ' + str(minCountC) + '\n')
summaryFile.write('Minimum average methylation value for region to qualify as a MR (exclusive) ' + str(minMeth) + '\n')
summaryFile.write('Region file formats\n')
summaryFile.write('Chromosome\tStart position (0-based)\tEnd position (exclusive)\tSample1 methylation\tSample2 methylation\tSample1 coverage\tSample2 coverage\tinformative cytosines\n')
summaryFile.write('MRs\t' + str(MRs) + '\n')
####################################################################################
#CLOSE FILES
#####################################################################################
inFile.close()
summaryFile.close()
MRFile.close()