-
Notifications
You must be signed in to change notification settings - Fork 0
/
2 - MCMC.R
282 lines (228 loc) · 11.2 KB
/
2 - MCMC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
################################################################################
#
# PROJECT: Estimating the cumulative incidence of SARS-CoV-2 infection and
# the infection fatality ratio in light of waning antibodies
#
# DATE: October 2020
# CODED BY: Kayoko Shioda
# E-MAIL: [email protected]; [email protected]
#
################################################################################
#------------------------------------------------------------------------------#
# Set up
#------------------------------------------------------------------------------#
# Which site are you analyzing?
select.site <- "CT" # "NYC" or "CT"
# Which data do you want to use to inform the delay between symptom onset and death?
time_onset_death <- "GA" # "GA" (GA data) or "ODriscoll" (Wuhan data)
# Do you want to save MCMC outputs?
save.results <- "yes" # "yes" or "no"
# If yes, specify a folder path where you want to save your RData
res.save.folder <- "./Results//"
#------------------------------------------------------------------------------#
# Create a function to estimate time between seroconversion and seroreversion
#------------------------------------------------------------------------------#
FQ_dist <- function (t_lag_all, shape1, shape2, scale1, scale2) {
pr <- {}
for (i in 1:length(t_lag_all)) {
pr[i] <- pweibull(t_lag_all[i], shape1, scale1) - integrate(f = function (y) {
dweibull(y, shape1, scale1) * (pweibull(t_lag_all[i]-y, shape2, scale2))},
lower=0, upper=t_lag_all[i])$value
}
return(pr)
}
#------------------------------------------------------------------------------#
# Load and reformat the COVID-19 mortality data and CDC serosurvey data
#------------------------------------------------------------------------------#
# Select the type of death counts
if (select.site=="NYC") {
# "Combined", "Probable", or "Confirmed"
# "Combined" was used in the main analysis in the paper for NYC
select.death.outcome <- "Combined"
} else if (select.site=="CT") {
# "Daily.deaths" or "Deaths.7dAvg"
# "Deaths.7dAvg" was used in the main analysis in the paper for CT
select.death.outcome <- "Deaths.7dAvg"
}
# Load the COVID-19 mortality data and CDC serosurvey data, and reformat them
if (select.site=="NYC") {
source("0 - data formatting NYC.R")
} else if (select.site=="CT") {
source("0 - data formatting CT.R")
}
# Estimate the timing of symptom onset for fatal cases
if (select.site=="NYC") {
source("1 - redistributed mortality NYC.R")
} else if (select.site=="CT") {
source("1 - redistributed mortality CT.R")
}
if (time_onset_death == "GA") {
timing_onset <- timing_onset.ga
} else if (time_onset_death == "ODriscoll") {
timing_onset <- timing_onset.ODriscoll
}
# Load the reported number of COVID-19 associated deaths
rep_counts <- d1[,select.death.outcome]
# Specify the population size in each site
if (select.site=="NYC") {
pop <- 8.336817*1000000 # From https://www.census.gov/quickfacts/fact/table/newyorkcitynewyork,bronxcountybronxboroughnewyork,kingscountybrooklynboroughnewyork,newyorkcountymanhattanboroughnewyork,queenscountyqueensboroughnewyork,richmondcountystatenislandboroughnewyork/PST045219
} else if (select.site=="CT") {
pop <- 3.7*1000000
}
#------------------------------------------------------------------------------#
# Create a log-likelihood function for MCMC
#------------------------------------------------------------------------------#
weibull_hazard_LL <- function(set.rho, set.mean, set.sd) {
#-----*-----*-----*-----*-----*-----*-----*-----*-----#
# 1. True number of infections
#-----*-----*-----*-----*-----*-----*-----*-----*-----#
# Calculate the number of true infections by adjusting the reported number of
# deaths for time lag between symptom onset and death and IFR (set.rho)
true_inf <- rep(NA, num_date)
for (t1 in 1:(num_date)) {
true_inf[t1] <- min(round(timing_onset[t1] / set.rho), pop) # True infections should not exceed the whole population
}
#-----*-----*-----*-----*-----*-----*-----*-----*-----#
# 2. Timeline of seroconversion & seroreversion
#-----*-----*-----*-----*-----*-----*-----*-----*-----#
# Specify the shape and scale parameters for a Weibull distribution for time
# from symptom onset to seroconversion
shape_conv <- 2.092141 # From Iyer, et al. medRxiv 2020
scale_conv <- 13.00029 # From Iyer, et al. medRxiv 2020
# Calculate the shape and scale parameters for a Weibull distribution for time
# from seroconversion to seroreversion
mean_serorev <- set.mean # From MCMC sample
sd_serorev <- set.sd # From MCMC sample
sum.r <- (mean_serorev^2 + sd_serorev^2)/mean_serorev^2
fn <- function (x, sum) {lgamma(1+2/x) - 2*lgamma(1+1/x) - log(sum)}
shape_rev <- uniroot(fn, sum=sum.r, c(0.01,100), extendInt = "yes", maxiter = 10000)$root
scale_rev <- mean_serorev/(gamma(1+1/shape_rev))
#-----*-----*-----*-----*-----*-----*-----*-----*-----#
# 3. Seropositive individuals on Day t
#-----*-----*-----*-----*-----*-----*-----*-----*-----#
# Calculate the probability that individuals are seropositive on each day
FQ_cdf <- FQ_dist(1:num_date,
shape1=shape_conv, scale1=scale_conv,
shape2=shape_rev, scale2=scale_rev)
# Calculate the number of seropositive individuals on each day
seropos_cases <- rep(NA, num_date)
seropos_cases[1] <- 0 # No individuals have seroconverted on Day 1
for (t2 in 2:num_date) { # On Day 2, 3, ..., last day,
seropos_cases_day_i <- c()
for (k in 1:(t2-1)) {
seropos_cases_day_i[k] <- true_inf[k] * FQ_cdf[t2-k]
}
seropos_cases[t2] <- sum(seropos_cases_day_i)
seropos_cases[t2] <- min(seropos_cases[t2], pop) # Seropositive cases should not exceed the whole population
}
#-----*-----*-----*-----*-----*-----*-----*-----*-----#
# 4. Log likelihood
#-----*-----*-----*-----*-----*-----*-----*-----*-----#
log_likl <- c()
for (round in 1:nrow(cdc_seroprev)) { # For each round of the CDC serosurvey...
# Calculate the log-likelihood, comparing the simulated seroprevalence and
# observed seroprevalence reported by CDC
log_likl[round] <- dbinom(x = cdc_seroprev$n_pos[round], # Number of seropositives (CDC data)
prob = seropos_cases[(d1$date)==cdc_seroprev$mid[round]]/pop, # Simulated seroprevalence
size = cdc_seroprev$n_sample[round], # Number of samples collected (CDC data)
log=T)
}
log_LL <- sum(log_likl) # Sum of log-likelihood for all rounds of the CDC serosurvey
return(list(log_LL, seropos_cases))
}
#------------------------------------------------------------------------------#
# Use MCMC to optimize parameters
#------------------------------------------------------------------------------#
# Give starting values for each parameter (arbitrary choice)
rho_0 <- 0.02 # IFR
mean_0 <- 80 # Average time from seroconversion to seroreversion (days)
sd_0 <- 50 # Standard deviation of the time from seroconversion to seroreversion (days)
old_rho <- rho_0
old_mean <- mean_0
old_sd <- sd_0
# Number of iterations to run the MCMC sampler
burn_in_steps <- 20000
run_steps <- 100000
total_steps <- burn_in_steps + run_steps
# Get a starting log_likelihood value
old_log_LL <- weibull_hazard_LL(old_rho, old_mean, old_sd)[[1]] # [[1]] is log_LL. [[2]] is seropos_cases
# Make an empty vector to store sampled parameters in
sampled_pars <- matrix(0, total_steps, 2) # 3 columns for rho (IFR), mean
sampled_logLL <- matrix(0, total_steps, 2) # 3 columns for rho (IFR), mean
dt.seropos1 <- matrix(0, total_steps, num_date) # Estimated number of seropositives on each day
dt.seropos2 <- matrix(0, total_steps, num_date) # Estimated number of seropositives on each day
dt.seropos3 <- matrix(0, total_steps, num_date) # Estimated number of seropositives on each day
accept_rho <- rep(0, total_steps) # 1 if a new candidate value is accepted. 0 if rejected
accept_mean <- rep(0, total_steps) # 1 if a new candidate value is accepted. 0 if rejected
accept_sd <- rep(0, total_steps) # 1 if a new candidate value is accepted. 0 if rejected
# Implement MCMC
for (i in 1:total_steps) {
###############################
########## rho (IFR) ##########
###############################
# 1. Propose a new value for rho (IFR)
current_rho <- -999
while (current_rho < sum(timing_onset)/pop | current_rho > 1) {
current_rho <- old_rho + 0.001*rnorm(1,0,1)
}
# 2. Proposal ratio (0 because the proposal distribution was symmetric)
log_proposal_ratio <- 0.0
# 3. Get the log-likelihood for the newly sampled parameter
current_log_LL <- weibull_hazard_LL(current_rho, old_mean, old_sd)[[1]]
ts_seropos <- weibull_hazard_LL(current_rho, old_mean, old_sd)[[2]] # This stores the estimated number of seropositives on each day
# 4. Log-likelihood ratio for the last and current sampled parameters
log_likelihood_ratio <- current_log_LL - old_log_LL
# 5. Get the acceptance probability
acceptance_prob <- exp(log_proposal_ratio + log_likelihood_ratio)
# 6. Draw a random nunber from (0, 1] and if it's less than the acceptance
# probability, accept the current parameter. Otherwise, stay on the last one.
if (runif(1) < acceptance_prob) {
old_rho <- current_rho
old_log_LL <- current_log_LL
accept_rho[i] <- 1 # 1 if accepted. 0 if rejected.
}
# 7. Save the current parameter to the trace
sampled_pars[i,1] <- old_rho
sampled_logLL[i,1] <- old_log_LL
dt.seropos1[i,] <- ts_seropos
##########################
########## Mean ##########
##########################
# 1. Propose a new value for the mean of the duration of seroreversion
current_mean <- -999
# while (current_mean < 15 | current_mean > 300) {
current_mean <- old_mean + 1*rnorm(1,0,1)
# }
# 2. Proposal ratio (0 because the proposal distribution was symmetric)
log_proposal_ratio <- 0.0
# # Prior for mean
# log_prior_new <- dexp(current_mean, rate=0.0001, log=TRUE) # non-informative prior
# log_prior_old <- dexp(old_mean, rate=0.0001, log=TRUE)
# log_prior_ratio <- log_prior_new - log_prior_old
# 3. Get the log-likelihood for the newly sampled parameter
current_log_LL <- weibull_hazard_LL(old_rho, current_mean, old_sd)[[1]]
ts_seropos <- weibull_hazard_LL(old_rho, current_mean, old_sd)[[2]] # This stores the estimated number of seropositives on each day
# 4. Log-likelihood ratio for the last and current sampled parameters
log_likelihood_ratio <- current_log_LL - old_log_LL
# 5. Get the acceptance probability
acceptance_prob <- exp(log_proposal_ratio + log_likelihood_ratio)
# 6. Draw a random nunber from (0, 1] and if it's less than the acceptance
# probability, accept the current parameter. Otherwise, stay on the last one.
if (runif(1) < acceptance_prob) {
old_mean <- current_mean
old_log_LL <- current_log_LL
accept_mean[i] <- 1 # 1 if accepted. 0 if rejected.
}
# 7. Save the current parameter to the trace
sampled_pars[i,2] <- old_mean
sampled_logLL[i,2] <- old_log_LL
dt.seropos2[i,] <- ts_seropos
print(round(i/total_steps, 2))
}
# Save MCMC outputs
dt.seropos <- rbind(dt.seropos1, dt.seropos2)
if (save.results == "yes") {
save(sampled_pars, file=paste0(res.save.folder,"sampled_pars.RData"))
save(sampled_logLL, file=paste0(res.save.folder,"sampled_logLL.RData"))
save(dt.seropos, file=paste0(res.save.folder,"dt.seropos.RData"))
}