-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbootstrap_dadi.py
385 lines (316 loc) · 11.6 KB
/
bootstrap_dadi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
#!/usr/bin/env python
# encoding: utf-8
"""
File: bootstrap_dadi.py
Author: JF McLaughlin
Version: 1.1
Info: Given a .vcf file, create multiple .fs input files sampling different individuals and run in DaDi.
Requirements: convert_vcf_to_dadi_input.pl by Kun Wang, phyluce 1.5, BioPython, PyVCF. And of course, DaDi.
"""
import os
import sys
import shlex
import argparse
import pylab
import dadi
import Demographics2D
import vcf
import collections
import ConfigParser
from Bio import SeqIO
from random import choice
#from phyluce.helpers import FullPaths, is_file, is_dir
#from phyluce.log import setup_logging
import subprocess
from numpy import array, random
def get_args():
    """Parse and return the command-line arguments for a bootstrap run."""
    parser = argparse.ArgumentParser(
        description="Given an input vcf file, produce datasets for bootstrapping in dadi"
    )
    parser.add_argument(
        '--input_vcf', required=True,
        help='The vcf file to be used for creating fs file')
    parser.add_argument(
        '--bootstrap_reps', required=True, type=int,
        help='How many times to sample data. Greater than 30 recommended.')
    parser.add_argument(
        '--model_conf', required=True,
        help='Config file for population info and model parameters')
    parser.add_argument(
        '--output', required=True, type=str,
        help='Name for output files. Not a full file name, just the base name.')
    parser.add_argument(
        '--indiv_config', required=True,
        help='List of individual names as appearing in vcf header')
    parser.add_argument(
        '--reference', required=True,
        help='fasta file with reference sequence')
    parser.add_argument(
        '--pl_script', required=True,
        help='where the perl script is')
    return parser.parse_args()
def ConfigSectionMap(Config, section):
    """Return every option in *section* of a ConfigParser as a plain dict.

    Values come back as strings (ConfigParser.get never returns -1); an
    option whose value cannot be read is reported and stored as None
    instead of aborting the whole run.
    """
    section_map = {}
    for option in Config.options(section):
        try:
            section_map[option] = Config.get(section, option)
            # NOTE: the original also compared the value to -1 and called an
            # undefined DebugPrint(); .get() returns strings, so that branch
            # was unreachable (and would have crashed) - dropped here.
        except Exception:
            print("exception on %s!" % option)
            section_map[option] = None
    return section_map
def indiv_by_pop(pop_name, indiv_list):
    """Return the individuals whose names contain *pop_name* (substring match)."""
    return [member for member in indiv_list if pop_name in member]
def data_to_lines(data):
    """Split raw file text on newlines.

    Uses split("\n") rather than splitlines() so a trailing newline yields a
    trailing empty string, matching the original behavior.
    """
    separator = "\n"
    return data.split(separator)
def bootstrap_sample(indiv_list, pop_size, i_id):
    """Draw a bootstrap sample of *pop_size* individuals from *indiv_list*.

    numpy.random.choice samples with replacement by default, which is the
    standard bootstrap resampling scheme. Returns a numpy array of names.
    i_id is only used to label the progress message.
    """
    bootstrap_indiv = random.choice(indiv_list, pop_size)
    # Fixed missing space: the original printed e.g. "Bootstrap 0using: ..."
    print("Bootstrap " + str(i_id) + " using: " + str(bootstrap_indiv))
    return bootstrap_indiv
def parse_vcf(vcf_input,output_tag, individuals, i_id):
    """Subset a vcf to the bootstrap-selected individuals and write it out.

    vcf_input: path to the source vcf file.
    output_tag: base name for the replicate-specific output vcf.
    individuals: names selected for this bootstrap replicate.
    i_id: bootstrap replicate id (not used in this body).
    Returns the path of the written vcf (output_tag + '.vcf').
    """
    try:
        # PyVCF reader; the underlying file handle is never explicitly closed
        reader = vcf.Reader(open(vcf_input, 'r'))
        print("Reading vcf....")
    except:
        # NOTE(review): bare except hides the real cause; only the path is reported
        print("Could not open vcf file")
        raise IOError("vcf file "+vcf_input+" could not be opened")
    clean_records =[]
    for record in reader: # for each record
        selected_calls = []
        for call in record.samples:
            for indiv in individuals:
                # go to our selected individuals
                if indiv in call.sample: # if it's a match...
                    # NOTE(review): substring match, appended once per matching
                    # name - duplicates in `individuals` (bootstrap draws with
                    # replacement) duplicate the call. TODO confirm intended.
                    selected_calls.append(call)
                else:
                    pass
        # replace the record's calls with only the selected individuals
        record.samples = selected_calls
        clean_records.append(record)
    use_samples = []
    for sample in reader.samples:
        for indiv in individuals:
            if indiv in sample:
                use_samples.append(sample)
    # the header sample list must mirror the per-record call filtering above
    reader.samples = use_samples
    try:
        output_vcf = output_tag+'.vcf'
        writer = vcf.Writer(open(output_vcf, 'w'), reader)
        print('Output vcf created')
    except:
        # NOTE(review): bare except again masks the underlying error
        print('Could not create output vcf file')
        raise IOError('Could not create output vcf file')
    for record in clean_records:
        writer.write_record(record)
    return output_vcf
def create_fs_list(individuals, out_tag, taxon1, taxon2, pop1, pop2):
    """Write the <individual>\\t<population> list file needed by the
    vcf-to-dadi conversion script.

    Individuals in taxon1 are labelled pop1, those in taxon2 pop2; anyone in
    neither taxon is silently skipped. Returns the (closed) file object so
    callers can keep using its .name attribute, as before.
    """
    list_file = open(out_tag + "list.txt", 'w')
    # 'with' guarantees the handle is closed even if a write fails; the
    # original's bare except converted any error to a causeless IOError
    # and leaked the handle.
    with list_file:
        for indiv in individuals:
            if indiv in taxon1:
                list_file.write(indiv + "\t" + pop1 + "\n")
            elif indiv in taxon2:
                list_file.write(indiv + "\t" + pop2 + "\n")
    return list_file
def create_fs(perl_script, vcf_input, refseq, output_tag, listfile, pop1, pop2, count1, count2):
    """Convert a vcf to dadi input with Kun Wang's perl script, then build a
    folded 2D frequency spectrum and write it to output_tag + '.fs'.

    perl_script: path to convert_vcf_to_dadi_input.pl.
    vcf_input: replicate-specific vcf produced by parse_vcf.
    refseq: fasta reference used by the perl script.
    listfile: (closed) file object from create_fs_list; only .name is used.
    pop1, pop2: population labels; count1, count2: allele counts per pop.
    """
    args_str = "perl " + perl_script + " " + refseq + " " + vcf_input + " " + listfile.name
    print(args_str)
    args = shlex.split(args_str)
    print(args)
    perl_out = subprocess.Popen(args, stdout=subprocess.PIPE)
    perl_out.communicate()
    # The original silently ignored conversion failures (the intended check
    # was commented out); fail loudly so dadi doesn't read a stale .data file.
    if perl_out.returncode != 0:
        raise IOError("Could not create fs-type file")
    print("vcf converted to dadi input successfully")
    # the perl script writes its output next to the list file as <name>.data
    dd = dadi.Misc.make_data_dict(listfile.name + ".data")
    # folded (polarized=False) spectrum: no ancestral-state information assumed
    folded = dadi.Spectrum.from_data_dict(dd, [pop1, pop2], [count1, count2], polarized=False)
    # NOTE(review): Spectrum.to_file returns None, so fs_file is None - the
    # caller only passes it to run_dadi, which re-reads the .fs from disk.
    fs_file = folded.to_file(output_tag + '.fs')
    print("fs file complete!")
    return fs_file
def run_dadi(fs_file, output_tag, i_id, dadi_iter, nu1, nu2, time, mig, pop1, pop2):
    """Fit dadi's 2D split-with-migration model to the spectrum on disk.

    fs_file: unused; the spectrum is re-read from output_tag + '.fs'.
    output_tag: base name for the .fs input and the .txt/.png outputs.
    i_id: bootstrap replicate id, used to label the plot files.
    dadi_iter: number of independent optimization starts.
    nu1, nu2, time, mig: upper bounds for the (nu1, nu2, T, m) parameters.
    pop1, pop2: population labels for the comparison plots.
    Returns the (closed) results file object.
    """
    # TO DO: add functions to allow other model types
    data = dadi.Spectrum.from_file(output_tag + ".fs")
    ns = data.sample_sizes
    data.mask_corners()
    # print number of samples to verify correct load
    print(ns)
    # Grid points for extrapolation, formatted [n, n+10, n+20]; n must exceed
    # the sample size and strongly drives run time.
    pts_l = [100, 110, 120]
    # Model: split into two populations with migration.
    #   nu1, nu2: sizes of populations 1 and 2 after the split
    #   T: time since the split (units of 2*Na generations)
    #   m: migration rate (2*Na*m)
    func = dadi.Demographics2D.split_mig
    # Build the extrapolating version once - it is loop-invariant (the
    # original rebuilt it every iteration).
    func_ex = dadi.Numerics.make_extrap_log_func(func)
    # Bounds keep the optimizer away from wacky values (very long times, tiny
    # population sizes, huge migration) that are slow to evaluate.
    # Suggested: time 0-5, migration 0-10, sizes 1e-2-100.
    upper_bound = [nu1, nu2, time, mig]
    lower_bound = [1e-1, 1e-1, 0, 0]
    output_file = open(output_tag + ".txt", "w")
    try:
        output_file.write("Iteration,MLCL,nu1,nu2,T,m,theta\n")
        for rep in range(dadi_iter):  # 'rep' instead of shadowing builtin iter()
            iter_id = str(rep)
            # Arbitrary initial guess, perturbed up to a factor of two per parameter.
            p0 = dadi.Misc.perturb_params([1, 1, 1, 1], fold=2,
                                          upper_bound=upper_bound,
                                          lower_bound=lower_bound)
            print('Beginning optimization ************************************************')
            popt = dadi.Inference.optimize_log(p0, data, func_ex, pts_l,
                                               lower_bound=lower_bound,
                                               upper_bound=upper_bound,
                                               verbose=1)
            print('Finished optimization **************************************************')
            print('Best-fit parameters: {0}'.format(popt))
            # Best-fit model AFS, its composite likelihood, and optimal theta.
            model = func_ex(popt, ns, pts_l)
            ll_model = dadi.Inference.ll_multinom(model, data)
            print('Maximum log composite likelihood: {0}'.format(ll_model))
            theta = dadi.Inference.optimal_sfs_scaling(model, data)
            print('Optimal value of theta: {0}'.format(theta))
            print(str(pts_l) + ", " + str(upper_bound) + ", " + str(lower_bound))
            # One CSV row per iteration matching the header above. The
            # original concatenated str(popt) (an array repr) with no comma
            # after MLCL, producing rows the header didn't describe.
            popt_csv = ",".join(str(p) for p in popt)
            output_file.write("{0},{1},{2},{3}\n".format(iter_id, ll_model, popt_csv, theta))
            # Plot the data-vs-model comparison and save it per iteration/replicate.
            pylab.figure(1)
            dadi.Plotting.plot_2d_comp_multinom(model, data, vmin=0.1, resid_range=1,
                                                pop_ids=(pop1, pop2))
            pylab.savefig(output_tag + iter_id + "_" + str(i_id) + '.png')
    finally:
        # close even if an optimization raises (original leaked the handle)
        output_file.close()
    return output_file
def main():
    """Run the full bootstrap pipeline.

    Reads population/model settings from the config file, then for each
    bootstrap replicate: resamples individuals (with replacement), subsets
    the vcf, converts it to a dadi frequency spectrum, and fits the
    split-with-migration model.
    """
    args = get_args()
    # setup logging
    #log, my_name = setup_logging(args)
    print("Logging set-up")
    print("Parsing conf file......")
    Config = ConfigParser.ConfigParser()
    Config.read(args.model_conf)
    print("Getting config info for data........")
    pop1 = ConfigSectionMap(Config, "Data")["population1"]
    pop2 = ConfigSectionMap(Config, "Data")["population2"]
    N1 = int(ConfigSectionMap(Config, "Data")["pop1_size"])
    N2 = int(ConfigSectionMap(Config, "Data")["pop2_size"])
    # dadi works in allele counts: two alleles per diploid individual
    n_alleles_1 = N1 * 2
    n_alleles_2 = N2 * 2
    print("Getting config info for model.........")
    # get the info from the conf file. TO DO: separate functions per model type
    Nu1 = int(ConfigSectionMap(Config, "Model")["nu1"])
    Nu2 = int(ConfigSectionMap(Config, "Model")["nu2"])
    m = int(ConfigSectionMap(Config, "Model")["m"])
    T = int(ConfigSectionMap(Config, "Model")["t"])
    dadi_reps = int(ConfigSectionMap(Config, "Model")["dadi_runs"])
    upper_bound = [Nu1, Nu2, T, m]
    print("Using upper-bound values " + str(upper_bound))
    print("Dadi reps requested: " + str(dadi_reps))
    print("Getting list of unique individuals")
    # close the individuals file promptly instead of leaking the handle
    with open(args.indiv_config, 'r') as individual_file:
        individuals_data = individual_file.read()
    individuals = data_to_lines(individuals_data)
    taxon1 = indiv_by_pop(pop1, individuals)
    taxon2 = indiv_by_pop(pop2, individuals)
    out_tag = args.output
    print(out_tag)
    vcf_in = args.input_vcf
    refseq = args.reference
    print(refseq)
    for b in range(args.bootstrap_reps):
        print("Bootstrap run: " + str(b))
        # one output directory per bootstrap replicate
        if not os.path.isdir(out_tag + str(b)):
            os.makedirs(out_tag + str(b))
        out_file = out_tag + str(b) + "/" + out_tag + str(b)
        print(out_file)
        bs_taxon1 = bootstrap_sample(taxon1, N1, b).tolist()
        bs_taxon2 = bootstrap_sample(taxon2, N2, b).tolist()
        bs_indiv = bs_taxon1 + bs_taxon2
        bs_vcf = parse_vcf(vcf_in, out_file, bs_indiv, b)
        bs_fs_list = create_fs_list(bs_indiv, out_file, bs_taxon1, bs_taxon2, pop1, pop2)
        bs_fs = create_fs(args.pl_script, bs_vcf, refseq, out_file, bs_fs_list,
                          pop1, pop2, n_alleles_1, n_alleles_2)
        run_dadi(bs_fs, out_file, b, dadi_reps, Nu1, Nu2, T, m, pop1, pop2)
    print("Done!")
# Script entry point: run the pipeline only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()