forked from wilfriedguiblet/smilefinder
-
Notifications
You must be signed in to change notification settings - Fork 1
/
count.py
147 lines (113 loc) · 3.96 KB
/
count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/python
# Prepare the data from HGDP to Smilefinder
# Example of use : ./count.py chr1.geno hgdp_popfile.txt chr1.map > output.csv
import sys
import math
# position map
SnpPositionMap = {}
positionFH = open(sys.argv[3], 'r')
for line in positionFH:
(snp, chromo, pos) = line.split('\t')
pos = pos.strip()
SnpPositionMap[snp] = (chromo, pos)
positionFH.close()
# populations map
populationsFH = open(sys.argv[2], 'r')
populationsMap = {}
for line in populationsFH:
(population, individual) = line.split('\t')
population = population.strip()
individual = individual.strip()
if population in populationsMap:
populationsMap[population].append(individual)
else:
populationsMap[population] = [individual]
populationsFH.close()
# extract desired indexes
genoFile = sys.argv[1]
genoFH = open(genoFile, 'r')
line = genoFH.readline()
individuals = line.split('\t')[1:]
individuals = [x.strip() for x in individuals]
indexMap = {}
for population in populationsMap:
indexMap[population] = []
for individual in populationsMap[population]:
# extract index
index = individuals.index(individual)
indexMap[population].append(index)
# indexMap contains the indexes of desired individuals
# in each population. indexMap-> population:index
# count
sys.stdout.write('Name\tChr\tPos');
for population in populationsMap:
sys.stdout.write('\t' + population + ' MA\t'+ ' Frequency\t' + ' n\t' + 'He\t' + 'Ho');
sys.stdout.write('\t'+'Fst'+'\n')
for line in genoFH:
l = line.split('\t')
currentSnp = l[0]
populationDataMap = {}
for population in populationsMap:
alleleCount = {}
Ho = 0
for index in indexMap[population]:
genotype = l[index + 1]
allele = genotype[0]
for x in range(2):
if allele in alleleCount:
alleleCount[allele] +=1
else:
alleleCount[allele] = 1
allele = genotype[1]
if str(genotype[0]) != str(genotype[1]):
Ho = Ho + 1
# mission is now to find max allele
maxAllele = ''
maxValue = 0
for allele in alleleCount:
if(alleleCount[allele] > maxValue):
maxValue = alleleCount[allele]
maxAllele = allele
# max is in maxAllele
n = len(indexMap[population])
maxFrequency = float(maxValue) / n / 2
He = float(maxFrequency) * 2 * ( 1 - float(maxFrequency))
Ho = float(Ho) / n
populationDataMap[population] = (maxFrequency, n, He, Ho, maxAllele)
# add chr:position here
sys.stdout.write(currentSnp + '\t' + SnpPositionMap[currentSnp][0] + '\t' + SnpPositionMap[currentSnp][1])
Alleles = []
MAF = []
n = []
for population in populationDataMap:
sys.stdout.write('\t' + str(populationDataMap[population][4])+'\t' + str(populationDataMap[population][0]) + '\t' + str(populationDataMap[population][1]) + '\t' + str(populationDataMap[population][2]) + '\t' + str(populationDataMap[population][3]))
#Fst calculations
Alleles[1:1] = [str(populationDataMap[population][4])]
MAF[1:1] = [str(populationDataMap[population][0])]
n[1:1] = [str(populationDataMap[population][1])]
if Alleles[0] == Alleles[1]:
MAF1 = float(MAF[0])
MAF2 = float(MAF[1])
else:
MAF1 = float(MAF[0])
MAF2 = 1 - float(MAF[1])
#MSP
n1 = float(n[0])
n2 = float(n[1])
MSP = n1*(MAF1-(MAF1+MAF2)/2)**2 + n2*(MAF2-(MAF1+MAF2)/2)**2
#MSG
MSG = 1/(n1+n2-2)*((n1*MAF1*(1-MAF1))+(n2*MAF2*(1-MAF2)))
#nc
nc = (n1+n2)-(((n1)**2+(n2)**2)/(n1+n2))
#Fst
if MSP == 0:
Fst = 0
elif MSG == 0:
Fst = 0
else:
Fst = (MSP-MSG)/(MSP+(nc-1)*MSG)
if Fst < 0:
Fst = 0.00000001
sys.stdout.write('\t'+str(Fst))
sys.stdout.write('\n')
# ya!