-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_SimulateData.R
190 lines (141 loc) · 7.31 KB
/
1_SimulateData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# Parameters used in msprime simulation
# The following parameters have been set in simulation_in_msprime.py script.
# Ne (effective diploid population size) = 3100, European population, Tenesa et al paper
# mutation and recombination rate = 2e-8 and 1e-8 per base per generation, respectively.
# Sequence length = 2 Mbp (2e6 bp)
# Number of sequences = 6200
# demographic_events=[msprime.SimulationModelChange(time=5000, model="hudson")]
# In 5000 generations back in time, we switch from Wright-Fisher model to Hudson CwR model.
# Step 1. Load all the necessary functions for this script
source('1_SimulateData_Utillity_Functions.R')
# Step 2. Simulate the population data
#
# Criteria for the simulated dataset:
#
# 1. The probability of disease in the population should be around prob_disease (as close as possible)
# (In our current simulations it is set to 0.05)
#
# 2. Number of individuals in the population that carry one copy of cSNV should be around N1.
# (In our current simulations this value is 155)
# (Refer to N0_N1_Calcl_Formula.pdf file for more details)
#
# 3. Aim for 15 equifrequent causal SNVs. These cSNVs need to have clade sizes of around 10 (= 155 / 15) in the population.
#
# 4. If needed, randomly choose more causal SNVs to gaurantee that the disease probability in population is close enough to 5%.
# (0.047 - 0.053)
#
pop_data = Simulate_Population_Data(prob_disease = 0.05,
causal_region = c(900000,1100000),
N1 = 155,
equifrequent_cSNV = 15,
Beta0 = -10,
Beta = 16
)
# Step 3. Randomly sample 50 case and 50 control individuals from the population.
#
# We use the Sample_from_population() function. Parameters of this function:
#
# 1. pop_data is a list resulting from Simulate_Population_Data() function in the previous step.
#
# 2. nCase is the desired number of case individuals we want to sample from this population
#
# 3. nCon is the desired number of control individuals we want to sample from this population.
#
sample_data = Sample_from_population(pop_data, nCase = 50, nCon = 50)
# The sample_data is a list with the following objects
#
# Elements of this list:
# 1. Haps: This is a list of two recording all the necessary information for sequences/haplotypes:
# 1.1: sample_haps: The case/control haplotype matrix. A matrix of 0/1 where 0 = Ancestral allele and 1 = Derrived allele.
# 1.2: ccStatus: A numeric vector indicating the case/control status of the sequences. 1 = Case and 0 = Control
#
# 2. Genos: This is a list of two recording all the necessary information for individuals:
# 2.1. sample_geno: Sample genotype matrix. A matrix with values 0/1/2 which counts the number of copies of the minor allele
# in each individual.
# 2.2. ccStatus: A numeric vector indicating the case/control status of each individual. 1 = Case and 0 = Control
#
# 3. Posn: A data.frame with two columns recording the label of SNVs and also their position on the genome.
#
# 4. poly_cSNV: A character vector recording the name of the polymorphic causal SNV in the sample.
#
# 5. CaseIND: A character vector recording the label of each case individual in the sample.
#
# 6. ControlIND: A character vector recording the label of each control individual in the sample.
#
# 7. CaseHapID: A character vector recording the label of case sequences in the sample.
#
# 8. ControlHapID: A character vector recording the label of control sequences in the sample.
#
# We can extract each element of this list by $ sign.
# For example, sample_data$genos extracts the sample genotype matrix.
#
# Step4. Create the permutation indices for all methods:
#
# permute_indx is a matrix of indices for each permutation.
#
nperm = 1000
ccLabel = sample_data$Genos$ccStatus
permute_indx = matrix(NA, nrow = nperm, ncol = length(ccLabel) )
for(i in 1:nperm){
permute_indx[i,] = sample( x = 1:length(ccLabel), replace = FALSE)
}
# Step 5. Save the population data as pop_data.RData.
# Save the sample data as sample_data.RData.
# Save the permutation indices.
# We need this data in the next scripts where we do IBD and none-IBD based methods.
#
#
# Important note:
# Make sure that all the files and scripts for each simulated dataset are at the same directory.
#
save(pop_data, file = "pop_data.RData")
save(sample_data, file = "sample_data.RData")
save(permute_indx, file = 'permute_indx.RData')
# Optional step
#
# We only run this stage as it saves us time later when we pick a dataset as example dataset and we want
# to present it. So this stage is totally optional.
#
# We can save two tables here:
#
# Table1. Case/Control and cSNV information in the sample
# Table2. Case/Control and cSNV information in the population
#
#
###########################################################################################################################
# Table1. Case/Control and cSNV information
Variants = pop_data$Variants
cSNV = pop_data$cSNV
D_case_haplotypes = Find_Sequences_of_Individuals(sample_data$CaseIND, PM = pop_data$Population.Mapping)
D_control_haplotypes = Find_Sequences_of_Individuals(sample_data$ControlIND, PM = pop_data$Population.Mapping)
derived.allele.frequency = rowSums( Variants ) / ncol(Variants)
derived.allele.percentage = derived.allele.frequency * 100
cSNV_HapMat = Variants[cSNV, c(D_case_haplotypes , D_control_haplotypes) ]
dim(cSNV_HapMat)
t = data.frame(
cSNV = cSNV,
daf = derived.allele.percentage[cSNV],
Carrier_Case_Haplo = rowSums(cSNV_HapMat[,1:100]),
Carrier_Control_Haplo = rowSums(cSNV_HapMat[,101:200])
)
rownames(t) = NULL
t = t[order(t$daf , decreasing = TRUE),]
write.table(t, file = "Table1_cSNV_sample_info.txt", sep = ",", quote = FALSE, row.names = F)
rm(t)
##############################################################################################################################
##############################################################################################################################
# Table2. Case/Control and cSNV information in the population
pop_D_case_haplotypes = Find_Sequences_of_Individuals(ind_list = pop_data$DISCRETE[[1]], PM = pop_data$Population.Mapping)
pop_D_control_haplotypes = Find_Sequences_of_Individuals(ind_list = pop_data$DISCRETE[[2]], PM = pop_data$Population.Mapping)
pop_cSNV_HapMat = Variants[cSNV, c(pop_D_case_haplotypes , pop_D_control_haplotypes) ]
t = data.frame(
cSNV = cSNV,
daf = derived.allele.percentage[cSNV],
Carrier_Case_Haplo = rowSums(pop_cSNV_HapMat[,1:length(pop_D_case_haplotypes) ]),
Carrier_Control_Haplo = rowSums(pop_cSNV_HapMat[,(length(pop_D_case_haplotypes)+1):ncol(pop_cSNV_HapMat) ])
)
rownames(t) = NULL
t = t[order(t$daf , decreasing = TRUE),]
write.table(t, file = "Table2_cSNV_population_info.txt", sep = ",", quote = FALSE, row.names = F)
rm(t)
##############################################################################################################################