-
Notifications
You must be signed in to change notification settings - Fork 1
/
calculate_grouped_pearson_correlation.py
87 lines (70 loc) · 2.08 KB
/
calculate_grouped_pearson_correlation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
import re, os, sys, shutil
from math import *
from string import *
from optparse import OptionParser
import operator
import bisect
import stats
## get BED module
import BED
from GenomeData import *
from associate_binary_modification_with_expression import *
#import separate_by_chrom
Dir = os.getcwd();
grep = "/bin/grep";
cat = "/bin/cat";
plus = re.compile("\+");
minus = re.compile("\-");
def correlation1(list1, list2):
assert len(list1) == len(list2)
N = len(list1)
avg1 = average(list1)
avg2 = average(list2)
total = 0.0
for i in range(0,N):
total += (list1[i] - avg1) * (list2[i] - avg2)
total = total / N
return total
def get_data_list(file, colum):
'''colum starts from 0'''
file = open(file, 'r')
List = []
for line in file:
if not re.match("#", line):
line = line.strip()
sline = line.split()
List.append(atof(sline[colum]))
file.close()
return List
def group_list_avg(List, size):
result = []
length = len(List)
i = 0
while i < length:
templist = []
j = 0
while j < size and i < length:
templist.append(List[i])
j += 1
i += 1
result.append(stats.mean(templist))
return result
def main(argv):
parser = OptionParser()
parser.add_option("-i", "--data file", action="store", type="string", dest="datafile", metavar="<file>", help="data file")
parser.add_option("-n", "--groupsize", action="store", type="int", dest="groupsize", metavar="<int>", help="number of elements in each group")
#parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="output file name")
(opt, args) = parser.parse_args(argv)
if len(argv) < 2:
parser.print_help()
sys.exit(1)
rawList1 = get_data_list(opt.datafile, 1)
rawList2 = get_data_list(opt.datafile, 2)
List1 = group_list_avg(rawList1, opt.groupsize)
List2 = group_list_avg(rawList2, opt.groupsize)
#result = correlation1(List1, List2) / sqrt(correlation1(List1, List1)) / sqrt(correlation1(List2, List2))
result = stats.lpearsonr(List1, List2)[0]
print opt.datafile, result
if __name__ == "__main__":
main(sys.argv)