diff --git a/R/bglm_me.R b/R/bglm_me.R index f2c880b..bd806f4 100644 --- a/R/bglm_me.R +++ b/R/bglm_me.R @@ -43,63 +43,76 @@ #' @export #' #' @examples -#' -#'\dontrun{ -#' data(ozone) -#' data(health_sim) +#' \dontrun{ #' library(bspme) -#' data(ozone) +#' data(NO2_Jan2012) #' data(health_sim) #' library(fields) -#' # exposure mean and covariance at health participant locations with 06/18/1987 data (date id = 16) -#' # using Gaussian process with prior mean zero and exponential covariance kernel -#' # with fixed range 300 (in miles) and stdev 15 (in ppb) +#' library(maps) +#' # Obtain the predicted exposure mean and covariance at health subject locations based on Jan 10, 2012 data +#' # using Gaussian process prior with mean zero and exponential covariance kernel +#' # with fixed range 5 (in miles) and stdev 0.5 (in ppbv) +#' +#' # exposure data +#' data_jan10 = NO2_Jan2012[NO2_Jan2012$date == as.POSIXct("2012-01-10"),] +#' coords_monitor = cbind(data_jan10$lon, data_jan10$lat) #' -#' ozone16 = ozone[ozone$date_id==16,] +#' # health data +#' coords_health = cbind(health_sim$lon, health_sim$lat) #' -#' Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) -#' Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) -#' Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), -#' cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +#' distmat_xx <- rdist.earth(coords_monitor) +#' distmat_xy <- rdist.earth(coords_monitor, coords_health) +#' distmat_yy <- rdist.earth(coords_health) #' -#' Kxx = Exponential(Dxx, range = 300, phi=15^2) -#' Kyy = Exponential(Dyy, range = 300, phi=15^2) -#' Kxy = Exponential(Dxy, range = 300, phi=15^2) +#' a = 5; sigma = 1; # assume known #' -#' X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb) -#' X_cov = Kyy - t(Kxy) %*% solve(Kxx, Kxy) +#' Sigmaxx = fields::Matern(distmat_xx, smoothness = 0.5, range = a, phi = sigma^2) +#' Sigmaxy = fields::Matern(distmat_xy, smoothness = 0.5, range = a, phi = sigma^2) +#' Sigmayy = fields::Matern(distmat_yy, smoothness = 0.5, range = a, phi = sigma^2) +#' +#' # posterior predictive mean and covariance of exposure at health data +#' X_mean <- t(Sigmaxy) %*% solve(Sigmaxx, data_jan10$lnNO2) +#' X_cov <- Sigmayy - t(Sigmaxy) %*% solve(Sigmaxx,Sigmaxy) # n_y by n_y #' #' # visualize -#' par(mfrow = c(1,3)) -#' quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), -#' ozone16$ozone_ppb, main = "ozone measurements"); US(add= T) +#' # monitoring station exposure data +#' quilt.plot(cbind(data_jan10$lon, data_jan10$lat), +#' data_jan10$lnNO2, main = "NO2 exposures (in log) at 21 stations", +#' xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +#' maps::map("county", "Texas", add = T) +#' +#' # posterior predictive mean of exposure at health data +#' quilt.plot(cbind(health_sim$lon, health_sim$lat), +#' X_mean, main = "posterior predictive mean of exposure at health data locations", +#' xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +#' maps::map("county", "Texas", add = T) #' -#' quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), -#' X_mean, main = "health subjects, mean of exposure"); US(add= T) -#' points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) +#' # posterior predictive sd of exposure at health data +#' quilt.plot(cbind(health_sim$lon, health_sim$lat), +#' sqrt(diag(X_cov)), main = "posterior predictive sd of exposure at health data locations", +#' xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +#' maps::map("county", "Texas", add = T) #' -#' quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), -#' sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) -#' points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) #' #' # vecchia approximation -#' run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), +#' run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$lon, health_sim$lat), #' n.neighbors = 10) #' Q_sparse = run_vecchia$Q #' run_vecchia$cputime #' -#' # fit the model -#' fit_me = blogireg_me(Y = health_sim$Ybinary, +#' # fit the model, continuous data +#' fit = bglm_me(Y = health_sim$Ybinary, #' X_mean = X_mean, #' X_prec = Q_sparse, # sparse precision matrix #' Z = health_sim$Z, -#' nburn = 100, +#' family = binomial(link = "logit"), +#' nburn = 1000, #' nsave = 1000, -#' nthin = 1) -#' fit_me$cputime -#' summary(fit_me$posterior) +#' nthin = 5) +#' fit$cputime +#' summary(fit$posterior) #' library(bayesplot) -#' bayesplot::mcmc_trace(fit_me$posterior) +#' bayesplot::mcmc_trace(fit$posterior) #' } #' bglm_me <- function(Y, @@ -131,8 +144,8 @@ bglm_me <- function(Y, var_beta = prior$var_beta n_y = length(Y) - # check z is binary (temporary for family = "binomial") - if(!all(Z %in% c(0,1))) stop("Z must be binary") + # check Y is binary (temporary for family = "binomial") + if(!all(Y %in% c(0,1))) stop("Y must be binary") if(is.vector(Z)) Z = as.matrix(Z) if(!is.list(X_mean) & !is.list(X_prec)){ diff --git a/R/blm_me.R b/R/blm_me.R index 8791732..ba5e865 100644 --- a/R/blm_me.R +++ b/R/blm_me.R @@ -40,63 +40,75 @@ #' @export #' #' @examples -#' -#'\dontrun{ -#' data(ozone) -#' data(health_sim) +#' \dontrun{ #' library(bspme) -#' data(ozone) +#' data(NO2_Jan2012) #' data(health_sim) #' library(fields) -#' # exposure mean and covariance at health subject locations with 06/18/1987 data (date id = 16) -#' # using Gaussian process with prior mean zero and exponential covariance kernel -#' # with fixed range 300 (in miles) and stdev 15 (in ppb) +#' library(maps) +#' # Obtain the predicted exposure mean and covariance at health subject locations based on Jan 10, 2012 data +#' # using Gaussian process prior with mean zero and exponential covariance kernel +#' # with fixed range 5 (in miles) and stdev 0.5 (in ppbv) +#' +#' # exposure data +#' data_jan10 = NO2_Jan2012[NO2_Jan2012$date == as.POSIXct("2012-01-10"),] +#' coords_monitor = cbind(data_jan10$lon, data_jan10$lat) #' -#' ozone16 = ozone[ozone$date_id==16,] +#' # health data +#' coords_health = cbind(health_sim$lon, health_sim$lat) #' -#' Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) -#' Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) -#' Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), -#' cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +#' distmat_xx <- rdist.earth(coords_monitor) +#' distmat_xy <- rdist.earth(coords_monitor, coords_health) +#' distmat_yy <- rdist.earth(coords_health) #' -#' Kxx = Exponential(Dxx, range = 300, phi=15^2) -#' Kyy = Exponential(Dyy, range = 300, phi=15^2) -#' Kxy = Exponential(Dxy, range = 300, phi=15^2) +#' a = 5; sigma = 1; # assume known #' -#' X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb) -#' X_cov = Kyy - t(Kxy) %*% solve(Kxx, Kxy) +#' Sigmaxx = fields::Matern(distmat_xx, smoothness = 0.5, range = a, phi = sigma^2) +#' Sigmaxy = fields::Matern(distmat_xy, smoothness = 0.5, range = a, phi = sigma^2) +#' Sigmayy = fields::Matern(distmat_yy, smoothness = 0.5, range = a, phi = sigma^2) +#' +#' # posterior predictive mean and covariance of exposure at health data +#' X_mean <- t(Sigmaxy) %*% solve(Sigmaxx, data_jan10$lnNO2) +#' X_cov <- Sigmayy - t(Sigmaxy) %*% solve(Sigmaxx,Sigmaxy) # n_y by n_y #' #' # visualize -#' par(mfrow = c(1,3)) -#' quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), -#' ozone16$ozone_ppb, main = "ozone measurements"); US(add= T) +#' # monitoring station exposure data +#' quilt.plot(cbind(data_jan10$lon, data_jan10$lat), +#' data_jan10$lnNO2, main = "NO2 exposures (in log) at 21 stations", +#' xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +#' maps::map("county", "Texas", add = T) +#' +#' # posterior predictive mean of exposure at health data +#' quilt.plot(cbind(health_sim$lon, health_sim$lat), +#' X_mean, main = "posterior predictive mean of exposure at health data locations", +#' xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +#' maps::map("county", "Texas", add = T) #' -#' quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), -#' X_mean, main = "health subjects, mean of exposure"); US(add= T) -#' points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) +#' # posterior predictive sd of exposure at health data +#' quilt.plot(cbind(health_sim$lon, health_sim$lat), +#' sqrt(diag(X_cov)), main = "posterior predictive sd of exposure at health data locations", +#' xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +#' maps::map("county", "Texas", add = T) #' -#' quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), -#' sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) -#' points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) #' #' # vecchia approximation -#' run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), +#' run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$lon, health_sim$lat), #' n.neighbors = 10) #' Q_sparse = run_vecchia$Q #' run_vecchia$cputime #' -#' # fit the model -#' fit_me = blinreg_me(Y = health_sim$Y, +#' # fit the model, continuous data +#' fit = blm_me(Y = health_sim$Y, #' X_mean = X_mean, #' X_prec = Q_sparse, # sparse precision matrix #' Z = health_sim$Z, -#' nburn = 100, +#' nburn = 1000, #' nsave = 1000, -#' nthin = 1) -#' fit_me$cputime -#' summary(fit_me$posterior) +#' nthin = 5) +#' fit$cputime +#' summary(fit$posterior) #' library(bayesplot) -#' bayesplot::mcmc_trace(fit_me$posterior) +#' bayesplot::mcmc_trace(fit$posterior) #' } #' blm_me <- function(Y, diff --git a/R/bspme-package.R b/R/bspme-package.R index 237bd75..e36fc40 100644 --- a/R/bspme-package.R +++ b/R/bspme-package.R @@ -37,8 +37,8 @@ NULL #' \describe{ #' \item{Y}{n by 1 matrix, numeric, simulated continuous health outcome} #' \item{Ybinary}{n by 1 matrix, numeric, simulated binary health outcome} -#' \item{coords_Y_lon}{n by 1 matrix, numeric, simulated health subject longitude} -#' \item{coords_Y_lat}{n by 1 matrix, numeric, simulated health subject latitude} +#' \item{lon}{n by 1 matrix, numeric, simulated health subject longitude} +#' \item{lat}{n by 1 matrix, numeric, simulated health subject latitude} #' \item{Z}{n by 1 matrix, numeric, covariate} #' \item{X_true}{n by 1 matrix, numeric, true ozone exposure used for simulation} #' } diff --git a/data-raw/health_sim.R b/data-raw/health_sim.R index 359466b..5926f3f 100644 --- a/data-raw/health_sim.R +++ b/data-raw/health_sim.R @@ -3,32 +3,29 @@ n_y = 2000 # number of health data data("NO2_Jan2012") set.seed(1234567) -coords_health = cbind(rnorm(n_y, -95.3, sd = 0.1),rnorm(n_y, 29.8, sd = 0.2)) +coords_health = cbind(rnorm(n_y, -95.5, sd = 0.3),rnorm(n_y, 29.8, sd = 0.2)) # simulate health data based on day 10 (Jan 10, 2012) data_jan10 = NO2_Jan2012[NO2_Jan2012$date == as.POSIXct("2012-01-10"),] coords_monitor = cbind(data_jan10$lon, data_jan10$lat) -quilt.plot(coords_monitor, data_jan10$lnNO2) - +#quilt.plot(coords_monitor, data_jan10$lnNO2) # visualize -plot(coords_monitor, pch = 16, col = "purple") -points(coords_health) -fields::US(add = T) - - +#plot(coords_monitor, pch = 16, col = "purple") +#points(coords_health) +#fields::US(add = T) -sigma <- 0.5 # standard deviation +sigma <- 1 # standard deviation a <- 5 # range, in miles ## Matrices needed for prediction -distmat_xx <- rdist.earth(coords_monitor) -distmat_xy <- rdist.earth(coords_monitor, coords_health) -distmat_yy <- rdist.earth(coords_health) +distmat_xx <- fields::rdist.earth(coords_monitor) +distmat_xy <- fields::rdist.earth(coords_monitor, coords_health) +distmat_yy <- fields::rdist.earth(coords_health) -Sigmaxx = Matern(distmat_xx, smoothness = 0.5, range = a, phi = sigma^2) -Sigmaxy = Matern(distmat_xy, smoothness = 0.5, range = a, phi = sigma^2) -Sigmayy = Matern(distmat_yy, smoothness = 0.5, range = a, phi = sigma^2) +Sigmaxx = fields::Matern(distmat_xx, smoothness = 0.5, range = a, phi = sigma^2) +Sigmaxy = fields::Matern(distmat_xy, smoothness = 0.5, range = a, phi = sigma^2) +Sigmayy = fields::Matern(distmat_yy, smoothness = 0.5, range = a, phi = sigma^2) ## GP mean pred_mean <- t(Sigmaxy) %*% solve(Sigmaxx, data_jan10$lnNO2) @@ -45,14 +42,15 @@ quilt.plot(coords_health, sqrt(diag(pred_cov)), main = "sd"); US(add= T) quilt.plot(coords_health, true_exposure, main = "draw from MVN"); US(add= T) # simulated covariate -z = runif(n_y) +z = rnorm(n_y) # generate health data set.seed(1234) -Y = 2500 -30*true_exposure + 500*z + rnorm(n_y, sd = 200) +Y = 1 -2*true_exposure + 3*z + rnorm(n_y, sd = 1) #summary(Y) +#var(1-2*true_exposure + 3*z) / var(Y) #summary(lm(Y ~ true_exposure + z)) -linpred = 0.2*true_exposure - 2*z +linpred = -1 + 2*true_exposure - 3*z Ybinary = rbinom(n_y, 1, prob = 1/(1+exp(-linpred))) #summary(Ybinary) #summary(glm(Ybinary ~ true_exposure + z), family = "binomial") @@ -61,8 +59,8 @@ health_sim = data.frame(Y = Y, Ybinary = Ybinary, #X_mean = pred_mean, #X_cov = pred_cov, - coords_Y_lon = coords_health[,1], - coords_Y_lat = coords_health[,2], + lon = coords_health[,1], + lat = coords_health[,2], Z = z, X_true = true_exposure) diff --git a/data/health_sim.rda b/data/health_sim.rda index 7c8bf08..368f912 100644 Binary files a/data/health_sim.rda and b/data/health_sim.rda differ diff --git a/man/bglm_me.Rd b/man/bglm_me.Rd index c629e14..131968b 100644 --- a/man/bglm_me.Rd +++ b/man/bglm_me.Rd @@ -68,63 +68,76 @@ We consider normal priors for regression coefficients, where \code{var_beta} corresponds to \eqn{V_\beta}. } \examples{ - \dontrun{ -data(ozone) -data(health_sim) library(bspme) -data(ozone) +data(NO2_Jan2012) data(health_sim) library(fields) -# exposure mean and covariance at health participant locations with 06/18/1987 data (date id = 16) -# using Gaussian process with prior mean zero and exponential covariance kernel -# with fixed range 300 (in miles) and stdev 15 (in ppb) +library(maps) +# Obtain the predicted exposure mean and covariance at health subject locations based on Jan 10, 2012 data +# using Gaussian process prior with mean zero and exponential covariance kernel +# with fixed range 5 (in miles) and stdev 0.5 (in ppbv) + +# exposure data +data_jan10 = NO2_Jan2012[NO2_Jan2012$date == as.POSIXct("2012-01-10"),] +coords_monitor = cbind(data_jan10$lon, data_jan10$lat) -ozone16 = ozone[ozone$date_id==16,] +# health data +coords_health = cbind(health_sim$lon, health_sim$lat) -Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) -Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) -Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), - cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +distmat_xx <- rdist.earth(coords_monitor) +distmat_xy <- rdist.earth(coords_monitor, coords_health) +distmat_yy <- rdist.earth(coords_health) -Kxx = Exponential(Dxx, range = 300, phi=15^2) -Kyy = Exponential(Dyy, range = 300, phi=15^2) -Kxy = Exponential(Dxy, range = 300, phi=15^2) +a = 5; sigma = 1; # assume known -X_mean = t(Kxy) \%*\% solve(Kxx, ozone16$ozone_ppb) -X_cov = Kyy - t(Kxy) \%*\% solve(Kxx, Kxy) +Sigmaxx = fields::Matern(distmat_xx, smoothness = 0.5, range = a, phi = sigma^2) +Sigmaxy = fields::Matern(distmat_xy, smoothness = 0.5, range = a, phi = sigma^2) +Sigmayy = fields::Matern(distmat_yy, smoothness = 0.5, range = a, phi = sigma^2) + +# posterior predictive mean and covariance of exposure at health data +X_mean <- t(Sigmaxy) \%*\% solve(Sigmaxx, data_jan10$lnNO2) +X_cov <- Sigmayy - t(Sigmaxy) \%*\% solve(Sigmaxx,Sigmaxy) # n_y by n_y # visualize -par(mfrow = c(1,3)) -quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), - ozone16$ozone_ppb, main = "ozone measurements"); US(add= T) +# monitoring station exposure data +quilt.plot(cbind(data_jan10$lon, data_jan10$lat), + data_jan10$lnNO2, main = "NO2 exposures (in log) at 21 stations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) + +# posterior predictive mean of exposure at health data +quilt.plot(cbind(health_sim$lon, health_sim$lat), + X_mean, main = "posterior predictive mean of exposure at health data locations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - X_mean, main = "health subjects, mean of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) +# posterior predictive sd of exposure at health data +quilt.plot(cbind(health_sim$lon, health_sim$lat), + sqrt(diag(X_cov)), main = "posterior predictive sd of exposure at health data locations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) # vecchia approximation -run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), +run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$lon, health_sim$lat), n.neighbors = 10) Q_sparse = run_vecchia$Q run_vecchia$cputime -# fit the model -fit_me = blogireg_me(Y = health_sim$Ybinary, +# fit the model, continuous data +fit = bglm_me(Y = health_sim$Ybinary, X_mean = X_mean, X_prec = Q_sparse, # sparse precision matrix Z = health_sim$Z, - nburn = 100, + family = binomial(link = "logit"), + nburn = 1000, nsave = 1000, - nthin = 1) -fit_me$cputime -summary(fit_me$posterior) + nthin = 5) +fit$cputime +summary(fit$posterior) library(bayesplot) -bayesplot::mcmc_trace(fit_me$posterior) +bayesplot::mcmc_trace(fit$posterior) } } diff --git a/man/blm_me.Rd b/man/blm_me.Rd index b250af7..1e1a8e4 100644 --- a/man/blm_me.Rd +++ b/man/blm_me.Rd @@ -63,63 +63,75 @@ We consider semiconjugate priors for regression coefficients and error variance, where \code{var_beta} corresponds to \eqn{V_\beta} and \code{a_Y}, \code{b_Y} corresponds to hyperparameters for \eqn{\sigma^2_Y}. } \examples{ - \dontrun{ -data(ozone) -data(health_sim) library(bspme) -data(ozone) +data(NO2_Jan2012) data(health_sim) library(fields) -# exposure mean and covariance at health subject locations with 06/18/1987 data (date id = 16) -# using Gaussian process with prior mean zero and exponential covariance kernel -# with fixed range 300 (in miles) and stdev 15 (in ppb) +library(maps) +# Obtain the predicted exposure mean and covariance at health subject locations based on Jan 10, 2012 data +# using Gaussian process prior with mean zero and exponential covariance kernel +# with fixed range 5 (in miles) and stdev 0.5 (in ppbv) + +# exposure data +data_jan10 = NO2_Jan2012[NO2_Jan2012$date == as.POSIXct("2012-01-10"),] +coords_monitor = cbind(data_jan10$lon, data_jan10$lat) -ozone16 = ozone[ozone$date_id==16,] +# health data +coords_health = cbind(health_sim$lon, health_sim$lat) -Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) -Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) -Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), - cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +distmat_xx <- rdist.earth(coords_monitor) +distmat_xy <- rdist.earth(coords_monitor, coords_health) +distmat_yy <- rdist.earth(coords_health) -Kxx = Exponential(Dxx, range = 300, phi=15^2) -Kyy = Exponential(Dyy, range = 300, phi=15^2) -Kxy = Exponential(Dxy, range = 300, phi=15^2) +a = 5; sigma = 1; # assume known -X_mean = t(Kxy) \%*\% solve(Kxx, ozone16$ozone_ppb) -X_cov = Kyy - t(Kxy) \%*\% solve(Kxx, Kxy) +Sigmaxx = fields::Matern(distmat_xx, smoothness = 0.5, range = a, phi = sigma^2) +Sigmaxy = fields::Matern(distmat_xy, smoothness = 0.5, range = a, phi = sigma^2) +Sigmayy = fields::Matern(distmat_yy, smoothness = 0.5, range = a, phi = sigma^2) + +# posterior predictive mean and covariance of exposure at health data +X_mean <- t(Sigmaxy) \%*\% solve(Sigmaxx, data_jan10$lnNO2) +X_cov <- Sigmayy - t(Sigmaxy) \%*\% solve(Sigmaxx,Sigmaxy) # n_y by n_y # visualize -par(mfrow = c(1,3)) -quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), - ozone16$ozone_ppb, main = "ozone measurements"); US(add= T) +# monitoring station exposure data +quilt.plot(cbind(data_jan10$lon, data_jan10$lat), + data_jan10$lnNO2, main = "NO2 exposures (in log) at 21 stations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) + +# posterior predictive mean of exposure at health data +quilt.plot(cbind(health_sim$lon, health_sim$lat), + X_mean, main = "posterior predictive mean of exposure at health data locations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - X_mean, main = "health subjects, mean of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) +# posterior predictive sd of exposure at health data +quilt.plot(cbind(health_sim$lon, health_sim$lat), + sqrt(diag(X_cov)), main = "posterior predictive sd of exposure at health data locations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) # vecchia approximation -run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), +run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$lon, health_sim$lat), n.neighbors = 10) Q_sparse = run_vecchia$Q run_vecchia$cputime -# fit the model -fit_me = blinreg_me(Y = health_sim$Y, +# fit the model, continuous data +fit = blm_me(Y = health_sim$Y, X_mean = X_mean, X_prec = Q_sparse, # sparse precision matrix Z = health_sim$Z, - nburn = 100, + nburn = 1000, nsave = 1000, - nthin = 1) -fit_me$cputime -summary(fit_me$posterior) + nthin = 5) +fit$cputime +summary(fit$posterior) library(bayesplot) -bayesplot::mcmc_trace(fit_me$posterior) +bayesplot::mcmc_trace(fit$posterior) } } diff --git a/man/health_sim.Rd b/man/health_sim.Rd index c46baf4..6c66177 100644 --- a/man/health_sim.Rd +++ b/man/health_sim.Rd @@ -8,8 +8,8 @@ A data frame with n = 3000 rows and 4 variables: \describe{ \item{Y}{n by 1 matrix, numeric, simulated continuous health outcome} \item{Ybinary}{n by 1 matrix, numeric, simulated binary health outcome} -\item{coords_Y_lon}{n by 1 matrix, numeric, simulated health subject longitude} -\item{coords_Y_lat}{n by 1 matrix, numeric, simulated health subject latitude} +\item{lon}{n by 1 matrix, numeric, simulated health subject longitude} +\item{lat}{n by 1 matrix, numeric, simulated health subject latitude} \item{Z}{n by 1 matrix, numeric, covariate} \item{X_true}{n by 1 matrix, numeric, true ozone exposure used for simulation} } diff --git a/vignettes/Ozone-exposure-and-health-data-analysis.Rmd b/vignettes/Ozone-exposure-and-health-data-analysis.Rmd deleted file mode 100644 index 8eaebfc..0000000 --- a/vignettes/Ozone-exposure-and-health-data-analysis.Rmd +++ /dev/null @@ -1,180 +0,0 @@ ---- -title: "Ozone exposure and health data analysis" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{ozone_simuldata} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - - - -This is a vignette for the `bspme` package based on ozone exposure data of midwest US, 1987 and simulated health dataset. - -First load the package and data: - - -```r -rm(list=ls()) -library(bspme) -library(fields) -data(ozone) -data(health_sim) -``` - -The ozone data are collected daily at 67 monitoring sites in midwest US during 89 days. We focus on ozone exposure on a specific date 06/18/1987 (date id = 16). We also consider simulated health dataset, with $n=3000$ subject locations and continuous health responses. - - -```r -ozone16 = ozone[ozone$date_id==16,] -par(mfrow = c(1,2)) -quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), - ozone16$ozone_ppb, main = "67 ozone monitoring sites"); US(add= T) -plot(health_sim$coords_y_lon, health_sim$coords_y_lat, pch = 19, cex = 0.5, col = "grey", - xlab = "Longitude", ylab = "Latitude", main = "3000 simulated health subject locations"); US(add = T) -``` - -
-plot of chunk ozone_eda -

plot of chunk ozone_eda

-
- -### Obtain posterior predictive distribution at health subject locations - -In this example, we will use Gaussian process to obtain a predictive distribution of $(X_1,\dots,X_n)$, the ozone exposure at the health subject locations. The posterior predictive distribution is n-dimensional multivariate normal $N(\mu_X, Q_X^{-1})$, which will be used as a prior in health model to incorporate measurement error information of ozone exposure. -Specifically, we will use the exponential covariance kernel with fixed range 300 (in miles) and standard deviation 15 (in ppb). - - -```r - -# distance calculation -Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) -Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) -Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), - cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) - -# kernel matrices -Kxx = Exponential(Dxx, range = 300, phi=15^2) -Kyy = Exponential(Dyy, range = 300, phi=15^2) -Kxy = Exponential(Dxy, range = 300, phi=15^2) - -# posterior predictive mean and covariance -X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb) -X_cov = Kyy - t(Kxy) %*% solve(Kxx, Kxy) - -# visualization of posterior predictive distribution. Black triangle is monitoring station locations. -par(mfrow = c(1,2)) -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - X_mean, main = "health subjects, mean of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) - -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) -``` - -
-plot of chunk ozone_eda3 -

plot of chunk ozone_eda3

-
- -The covariance of $(X_1,\dots,X_n)$ contains all spatial dependency information up to a second order, but its inverse $Q_X$ become a dense matrix. -When it comes to fitting the Bayesian linear model with measurement error, the naive implementation with n-dimensional multivariate normal prior $N(\mu_X, Q_X^{-1})$ takes cubic time complexity of posterior inference for each MCMC iteration, which becomes infeasible to run when $n$ is large such as tens of thousands. - - -### Run Vecchia approximation to get sparse precision matrix - -We propose to use Vecchia approximation to get a new multivariate normal prior of $(X_1,\dots,X_n)$ with sparse precision matrix, which is crucial for scalable implementation of health model with measurement error, but also at the same time it aims to best preserve the spatial dependency information in the original covariance matrix. - - -```r -#Run the Vecchia approximation to get the sparse precision matrix. - -# Vecchia approximation -run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - n.neighbors = 10) -Q_sparse = run_vecchia$Q -run_vecchia$cputime -#> Time difference of 0.796325 secs -``` - - -### Fit bspme with sparse precision matrix - -We can now fit the main function of bspme package with sparse precision matrix. - - -```r -# fit the model -fit_me = blinreg_me(Y = health_sim$Y, - X_mean = X_mean, - X_prec = Q_sparse, # sparse precision matrix - Z = health_sim$Z, - nburn = 100, - nsave = 1000, - nthin = 1) -``` - -```r -fit_me$cputime -#> Time difference of 6.17423 secs -``` -It took less than 10 seconds to (1) run Vecchia approximation and (2) run the MCMC algorithm with total 1100 iterations. - - -```r -summary(fit_me$posterior) -#> -#> Iterations = 1:1000 -#> Thinning interval = 1 -#> Number of chains = 1 -#> Sample size per chain = 1000 -#> -#> 1. Empirical mean and standard deviation for each variable, -#> plus standard error of the mean: -#> -#> Mean SD Naive SE Time-series SE -#> (Intercept) 100.0234 0.624015 0.0197331 0.0259492 -#> exposure.1 -0.5021 0.007004 0.0002215 0.0003135 -#> covariate.1 4.3374 0.337301 0.0106664 0.0126324 -#> sigma2_Y 24.7826 0.712924 0.0225446 0.0280662 -#> -#> 2. Quantiles for each variable: -#> -#> 2.5% 25% 50% 75% 97.5% -#> (Intercept) 98.8360 99.6014 100.0057 100.4462 101.2774 -#> exposure.1 -0.5163 -0.5066 -0.5021 -0.4974 -0.4882 -#> covariate.1 3.6657 4.1060 4.3402 4.5681 4.9898 -#> sigma2_Y 23.3910 24.3080 24.7964 25.2340 26.1523 -bayesplot::mcmc_trace(fit_me$posterior) -``` - -![plot of chunk unnamed-chunk-3](figure/unnamed-chunk-3-1.png) - -### Fit bspme without sparse precision matrix - - -As mentioned before, if precision matrix is dense, the posterior computation becomes infeasible when $n$ becomes large, such as tens of thousands. See the following example when $n=3000$. - - -```r -# fit the model, without vecchia approximation -# I will only run 11 iteration for illustration purpose -Q_dense = solve(X_cov) # inverting 3000 x 3000 matrix, takes some time -# run -fit_me_dense = blinreg_me(Y = health_sim$Y, - X_mean = X_mean, - X_prec = Q_dense, # dense precision - Z = health_sim$Z, - nburn = 1, - nsave = 10, - nthin = 1) -``` - -```r -fit_me_dense$cputime -#> Time difference of 58.06338 secs -``` - -With even only 11 MCMC iterations, It took about 1 mins to run the MCMC algorithm. -If number of health subject locations is tens of thousands, the naive algorithm using dense precision matrix becomes infeasible, and Vecchia approximation becomes necesssary to carry out the inference. diff --git a/vignettes/figure/.DS_Store b/vignettes/figure/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/vignettes/figure/.DS_Store and /dev/null differ diff --git a/vignettes/figure/eda-1.png b/vignettes/figure/eda-1.png new file mode 100644 index 0000000..ad28463 Binary files /dev/null and b/vignettes/figure/eda-1.png differ diff --git a/vignettes/figure/eda2-1.png b/vignettes/figure/eda2-1.png new file mode 100644 index 0000000..c61c9cb Binary files /dev/null and b/vignettes/figure/eda2-1.png differ diff --git a/vignettes/figure/ozone_eda-1.png b/vignettes/figure/ozone_eda-1.png deleted file mode 100644 index c6c53af..0000000 Binary files a/vignettes/figure/ozone_eda-1.png and /dev/null differ diff --git a/vignettes/figure/ozone_eda3-1.png b/vignettes/figure/ozone_eda3-1.png deleted file mode 100644 index c8b8d88..0000000 Binary files a/vignettes/figure/ozone_eda3-1.png and /dev/null differ diff --git a/vignettes/figure/unnamed-chunk-3-1.png b/vignettes/figure/unnamed-chunk-3-1.png index 6d1c0e6..8eb5a2a 100644 Binary files a/vignettes/figure/unnamed-chunk-3-1.png and b/vignettes/figure/unnamed-chunk-3-1.png differ diff --git a/vignettes/no2-exposure-and-health-data-analysis.Rmd b/vignettes/no2-exposure-and-health-data-analysis.Rmd new file mode 100644 index 0000000..667699f --- /dev/null +++ b/vignettes/no2-exposure-and-health-data-analysis.Rmd @@ -0,0 +1,198 @@ +--- +title: "NO2 exposure and health data analysis" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{no2_simuldata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +This is a vignette for the `bspme` package based on January 2012 daily average NO2 data at Houston, Texas and simulated health dataset. + +First load the package and data: + + +```r +rm(list=ls()) +library(bspme) +library(fields) +library(maps) +data("NO2_Jan2012") +data("health_sim") +``` + +The daily average NO2 data are collected at 21 monitoring sites within and around the Houston, Texas during Jan 2012. We focus on NO2 exposure on a specific date (Jan 10th, 2012) for an illustration purpose. We also consider simulated health dataset, with $n=2000$ subject locations and continuous health responses. + + +```r +data_jan10 = NO2_Jan2012[NO2_Jan2012$date == as.POSIXct("2012-01-10"),] +par(mfrow = c(1,2)) +quilt.plot(cbind(data_jan10$lon, data_jan10$lat), + data_jan10$lnNO2, main = "NO2 exposures (in log) at 21 stations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) +plot(health_sim$lon, health_sim$lat, pch = 19, cex = 0.5, col = "grey", + xlab = "Longitude", ylab = "Latitude", main = "2000 simulated health subject locations"); US(add = T) +maps::map("county", "Texas", add = T) +``` + +
+plot of chunk eda +

plot of chunk eda

+
+ +### Obtain posterior predictive distribution at health subject locations + +In this example, we will use Gaussian process to obtain a predictive distribution of $(X_1,\dots,X_n)$, the log(NO2) exposure at the health subject locations. The posterior predictive distribution is n-dimensional multivariate normal $N(\mu_X, Q_X^{-1})$, which will be used as a prior in health model to incorporate measurement error information of ozone exposure. +Specifically, we will use the exponential covariance kernel with fixed range 5 (in miles) and standard deviation 1 (in ppbv). + + +```r +coords_monitor = cbind(data_jan10$lon, data_jan10$lat) +coords_health = cbind(health_sim$lon, health_sim$lat) +a = 5; sigma = 1; # assume known +# distance calculation +distmat_xx <- rdist.earth(coords_monitor) +distmat_xy <- rdist.earth(coords_monitor, coords_health) +distmat_yy <- rdist.earth(coords_health) + +# GP covariance matrices +Sigmaxx = fields::Matern(distmat_xx, smoothness = 0.5, range = a, phi = sigma^2) +Sigmaxy = fields::Matern(distmat_xy, smoothness = 0.5, range = a, phi = sigma^2) +Sigmayy = fields::Matern(distmat_yy, smoothness = 0.5, range = a, phi = sigma^2) + +# posterior predictive mean and covariance +X_mean <- t(Sigmaxy) %*% solve(Sigmaxx, data_jan10$lnNO2) +X_cov <- Sigmayy - t(Sigmaxy) %*% solve(Sigmaxx,Sigmaxy) # n_y by n_y + +# visualization of posterior predictive distribution. Black triangle is monitoring station locations. +par(mfrow = c(1,2)) +quilt.plot(cbind(health_sim$lon, health_sim$lat), + X_mean, main = "posterior predictive mean of exposure at health data locations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) +# posterior predictive sd of exposure at health data +quilt.plot(cbind(health_sim$lon, health_sim$lat), + sqrt(diag(X_cov)), main = "posterior predictive sd of exposure at health data locations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) +``` + +
+plot of chunk eda2 +

plot of chunk eda2

+
+ +The covariance of $(X_1,\dots,X_n)$ contains all spatial dependency information up to a second order, but its inverse $Q_X$ become a dense matrix. +When it comes to fitting the Bayesian linear model with measurement error, the naive implementation with n-dimensional multivariate normal prior $N(\mu_X, Q_X^{-1})$ takes cubic time complexity of posterior inference for each MCMC iteration, which becomes infeasible to run when $n$ is large such as tens of thousands. + + +### Run Vecchia approximation to get sparse precision matrix + +We propose to use Vecchia approximation to get a new multivariate normal prior of $(X_1,\dots,X_n)$ with sparse precision matrix, which is crucial for scalable implementation of health model with measurement error, but also at the same time it aims to best preserve the spatial dependency information in the original covariance matrix. + + +```r +#Run the Vecchia approximation to get the sparse precision matrix. + +# Vecchia approximation +run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$lon, health_sim$lat), + n.neighbors = 10) +Q_sparse = run_vecchia$Q +run_vecchia$cputime +#> Time difference of 0.5999758 secs +``` + + +### Fit bspme with sparse precision matrix + +We can now fit the main function of bspme package with sparse precision matrix. + + +```r +# fit the model +fit_me = blm_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_sparse, # sparse precision matrix + Z = health_sim$Z, + nburn = 100, + nsave = 1000, + nthin = 1) +``` + +```r +fit_me$cputime +#> Time difference of 7.23181 secs +``` +It took less than 10 seconds to (1) run Vecchia approximation and (2) run the MCMC algorithm with total 1100 iterations. + + +```r +summary(fit_me$posterior) +#> +#> Iterations = 1:1000 +#> Thinning interval = 1 +#> Number of chains = 1 +#> Sample size per chain = 1000 +#> +#> 1. Empirical mean and standard deviation for each variable, +#> plus standard error of the mean: +#> +#> Mean SD Naive SE Time-series SE +#> (Intercept) 0.7646 0.07789 0.002463 0.010519 +#> exposure.1 -1.9594 0.05087 0.001609 0.007164 +#> covariate.1 2.9536 0.03997 0.001264 0.003023 +#> sigma2_Y 0.9759 0.15318 0.004844 0.033582 +#> +#> 2. Quantiles for each variable: +#> +#> 2.5% 25% 50% 75% 97.5% +#> (Intercept) 0.6082 0.7191 0.7665 0.8129 0.9187 +#> exposure.1 -2.0572 -1.9954 -1.9582 -1.9243 -1.8647 +#> covariate.1 2.8712 2.9281 2.9534 2.9815 3.0285 +#> sigma2_Y 0.6451 0.8785 0.9800 1.0645 1.3046 +bayesplot::mcmc_trace(fit_me$posterior) +``` + +![plot of chunk unnamed-chunk-3](figure/unnamed-chunk-3-1.png) + +### Fit bspme without sparse precision matrix + + +As mentioned before, if precision matrix is dense, the posterior computation becomes infeasible when $n$ becomes large, such as tens of thousands. See the following example when $n=2000$. + + +```r +# fit the model, without vecchia approximation +# I will only run 11 iteration for illustration purpose +Q_dense = solve(X_cov) # inverting 3000 x 3000 matrix, takes some time +# run +fit_me_dense = blm_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_dense, # dense precision + Z = health_sim$Z, + nburn = 1, + nsave = 10, + nthin = 1) +``` + +```r +fit_me_dense$cputime +#> Time difference of 16.98819 secs +``` + +With even only 11 MCMC iterations, it took about 15 seconds to run the MCMC algorithm. With same number of iterations as before (1100), it will take about 1500 seconds, i.e. 25 minutes to run the MCMC algorithm. This gap becomes much larger when the number of health subject locations grows, such as tens of thousands. Then, the naive algorithm using dense precision matrix becomes infeasible, but we can carry out the inference using Vecchia approximation. + +The quality of Vecchia approximation depends on the size of the nearest neighbors, which is controlled by the argument `n.neighbors` in `vecchia_cov` function. The larger the `n.neighbors`, the better the approximation. In the simulaton study of Lee et al. (2024), Vecchia approximation with small `n.neighbors` can provide scalable inference method where the results (e.g. RMSE, empirical coverage) are close to the results that are based on dense precision matrix or fully Bayesian analysis. + +## References + +Lee, C. J., Symanski, E., Rammah, A., Kang, D. H., Hopke, P. K., & Park, E. S. (2024). A scalable two-stage Bayesian approach accounting for exposure measurement error in environmental epidemiology. arXiv preprint arXiv:2401.00634. + + + + + + diff --git a/vignettes/no2-exposure-and-health-data-analysis.Rmd.orig b/vignettes/no2-exposure-and-health-data-analysis.Rmd.orig new file mode 100644 index 0000000..c6aee03 --- /dev/null +++ b/vignettes/no2-exposure-and-health-data-analysis.Rmd.orig @@ -0,0 +1,157 @@ +--- +title: "NO2 exposure and health data analysis" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{no2_simuldata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +This is a vignette for the `bspme` package based on January 2012 daily average NO2 data at Houston, Texas and simulated health dataset. + +First load the package and data: + +```{r setup} +rm(list=ls()) +library(bspme) +library(fields) +library(maps) +data("NO2_Jan2012") +data("health_sim") +``` + +The daily average NO2 data are collected at 21 monitoring sites within and around the Houston, Texas during Jan 2012. We focus on NO2 exposure on a specific date (Jan 10th, 2012) for an illustration purpose. We also consider simulated health dataset, with $n=2000$ subject locations and continuous health responses. + +```{r, eda, fig.width = 10, fig.height=5, out.width ="100%"} +data_jan10 = NO2_Jan2012[NO2_Jan2012$date == as.POSIXct("2012-01-10"),] +par(mfrow = c(1,2)) +quilt.plot(cbind(data_jan10$lon, data_jan10$lat), + data_jan10$lnNO2, main = "NO2 exposures (in log) at 21 stations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) +plot(health_sim$lon, health_sim$lat, pch = 19, cex = 0.5, col = "grey", + xlab = "Longitude", ylab = "Latitude", main = "2000 simulated health subject locations"); US(add = T) +maps::map("county", "Texas", add = T) +``` + +### Obtain posterior predictive distribution at health subject locations + +In this example, we will use Gaussian process to obtain a predictive distribution of $(X_1,\dots,X_n)$, the log(NO2) exposure at the health subject locations. The posterior predictive distribution is n-dimensional multivariate normal $N(\mu_X, Q_X^{-1})$, which will be used as a prior in health model to incorporate measurement error information of ozone exposure. +Specifically, we will use the exponential covariance kernel with fixed range 5 (in miles) and standard deviation 1 (in ppbv). + +```{r, eda2, fig.width = 10, fig.height=5, out.width ="100%"} +coords_monitor = cbind(data_jan10$lon, data_jan10$lat) +coords_health = cbind(health_sim$lon, health_sim$lat) +a = 5; sigma = 1; # assume known +# distance calculation +distmat_xx <- rdist.earth(coords_monitor) +distmat_xy <- rdist.earth(coords_monitor, coords_health) +distmat_yy <- rdist.earth(coords_health) + +# GP covariance matrices +Sigmaxx = fields::Matern(distmat_xx, smoothness = 0.5, range = a, phi = sigma^2) +Sigmaxy = fields::Matern(distmat_xy, smoothness = 0.5, range = a, phi = sigma^2) +Sigmayy = fields::Matern(distmat_yy, smoothness = 0.5, range = a, phi = sigma^2) + +# posterior predictive mean and covariance +X_mean <- t(Sigmaxy) %*% solve(Sigmaxx, data_jan10$lnNO2) +X_cov <- Sigmayy - t(Sigmaxy) %*% solve(Sigmaxx,Sigmaxy) # n_y by n_y + +# visualization of posterior predictive distribution. Black triangle is monitoring station locations. +par(mfrow = c(1,2)) +quilt.plot(cbind(health_sim$lon, health_sim$lat), + X_mean, main = "posterior predictive mean of exposure at health data locations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) +# posterior predictive sd of exposure at health data +quilt.plot(cbind(health_sim$lon, health_sim$lat), + sqrt(diag(X_cov)), main = "posterior predictive sd of exposure at health data locations", + xlab = "longitude", ylab= "latitude", xlim = c(-96.5, -94.5), ylim = c(29, 30.5)) +maps::map("county", "Texas", add = T) +``` + +The covariance of $(X_1,\dots,X_n)$ contains all spatial dependency information up to a second order, but its inverse $Q_X$ become a dense matrix. +When it comes to fitting the Bayesian linear model with measurement error, the naive implementation with n-dimensional multivariate normal prior $N(\mu_X, Q_X^{-1})$ takes cubic time complexity of posterior inference for each MCMC iteration, which becomes infeasible to run when $n$ is large such as tens of thousands. + + +### Run Vecchia approximation to get sparse precision matrix + +We propose to use Vecchia approximation to get a new multivariate normal prior of $(X_1,\dots,X_n)$ with sparse precision matrix, which is crucial for scalable implementation of health model with measurement error, but also at the same time it aims to best preserve the spatial dependency information in the original covariance matrix. + +```{r, vecchia} +#Run the Vecchia approximation to get the sparse precision matrix. + +# Vecchia approximation +run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$lon, health_sim$lat), + n.neighbors = 10) +Q_sparse = run_vecchia$Q +run_vecchia$cputime +``` + + +### Fit bspme with sparse precision matrix + +We can now fit the main function of bspme package with sparse precision matrix. + +```{r, sparse, results = "hide"} +# fit the model +fit_me = blm_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_sparse, # sparse precision matrix + Z = health_sim$Z, + nburn = 100, + nsave = 1000, + nthin = 1) +``` +```{r} +fit_me$cputime +``` +It took less than 10 seconds to (1) run Vecchia approximation and (2) run the MCMC algorithm with total 1100 iterations. + +```{r} +summary(fit_me$posterior) +bayesplot::mcmc_trace(fit_me$posterior) +``` + +### Fit bspme without sparse precision matrix + + +As mentioned before, if precision matrix is dense, the posterior computation becomes infeasible when $n$ becomes large, such as tens of thousands. See the following example when $n=2000$. + +```{r, dense, results = "hide"} +# fit the model, without vecchia approximation +# I will only run 11 iteration for illustration purpose +Q_dense = solve(X_cov) # inverting 3000 x 3000 matrix, takes some time +# run +fit_me_dense = blm_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_dense, # dense precision + Z = health_sim$Z, + nburn = 1, + nsave = 10, + nthin = 1) +``` +```{r} +fit_me_dense$cputime +``` + +With even only 11 MCMC iterations, it took about 15 seconds to run the MCMC algorithm. With same number of iterations as before (1100), it will take about 1500 seconds, i.e. 25 minutes to run the MCMC algorithm. This gap becomes much larger when the number of health subject locations grows, such as tens of thousands. Then, the naive algorithm using dense precision matrix becomes infeasible, but we can carry out the inference using Vecchia approximation. + +The quality of Vecchia approximation depends on the size of the nearest neighbors, which is controlled by the argument `n.neighbors` in `vecchia_cov` function. The larger the `n.neighbors`, the better the approximation. In the simulaton study of Lee et al. (2024), Vecchia approximation with small `n.neighbors` can provide scalable inference method where the results (e.g. RMSE, empirical coverage) are close to the results that are based on dense precision matrix or fully Bayesian analysis. + +## References + +Lee, C. J., Symanski, E., Rammah, A., Kang, D. H., Hopke, P. K., & Park, E. S. (2024). A scalable two-stage Bayesian approach accounting for exposure measurement error in environmental epidemiology. arXiv preprint arXiv:2401.00634. + + + + + + diff --git a/vignettes/ozone-exposure-and-health-data-analysis.Rmd.orig b/vignettes/ozone-exposure-and-health-data-analysis.Rmd.orig deleted file mode 100644 index f9d676c..0000000 --- a/vignettes/ozone-exposure-and-health-data-analysis.Rmd.orig +++ /dev/null @@ -1,139 +0,0 @@ ---- -title: "Ozone exposure and health data analysis" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{ozone_simuldata} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -This is a vignette for the `bspme` package based on ozone exposure data of midwest US, 1987 and simulated health dataset. - -First load the package and data: - -```{r setup} -rm(list=ls()) -library(bspme) -library(fields) -data(ozone) -data(health_sim) -``` - -The ozone data are collected daily at 67 monitoring sites in midwest US during 89 days. We focus on ozone exposure on a specific date 06/18/1987 (date id = 16). We also consider simulated health dataset, with $n=3000$ subject locations and continuous health responses. - -```{r, ozone_eda, fig.width = 10, fig.height=5, out.width ="100%"} -ozone16 = ozone[ozone$date_id==16,] -par(mfrow = c(1,2)) -quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), - ozone16$ozone_ppb, main = "67 ozone monitoring sites"); US(add= T) -plot(health_sim$coords_y_lon, health_sim$coords_y_lat, pch = 19, cex = 0.5, col = "grey", - xlab = "Longitude", ylab = "Latitude", main = "3000 simulated health subject locations"); US(add = T) -``` - -### Obtain posterior predictive distribution at health subject locations - -In this example, we will use Gaussian process to obtain a predictive distribution of $(X_1,\dots,X_n)$, the ozone exposure at the health subject locations. The posterior predictive distribution is n-dimensional multivariate normal $N(\mu_X, Q_X^{-1})$, which will be used as a prior in health model to incorporate measurement error information of ozone exposure. -Specifically, we will use the exponential covariance kernel with fixed range 300 (in miles) and standard deviation 15 (in ppb). - -```{r, ozone_eda3, fig.width = 10, fig.height=5, out.width ="100%"} - -# distance calculation -Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) -Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) -Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), - cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) - -# kernel matrices -Kxx = Exponential(Dxx, range = 300, phi=15^2) -Kyy = Exponential(Dyy, range = 300, phi=15^2) -Kxy = Exponential(Dxy, range = 300, phi=15^2) - -# posterior predictive mean and covariance -X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb) -X_cov = Kyy - t(Kxy) %*% solve(Kxx, Kxy) - -# visualization of posterior predictive distribution. Black triangle is monitoring station locations. -par(mfrow = c(1,2)) -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - X_mean, main = "health subjects, mean of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) - -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) -``` - -The covariance of $(X_1,\dots,X_n)$ contains all spatial dependency information up to a second order, but its inverse $Q_X$ become a dense matrix. -When it comes to fitting the Bayesian linear model with measurement error, the naive implementation with n-dimensional multivariate normal prior $N(\mu_X, Q_X^{-1})$ takes cubic time complexity of posterior inference for each MCMC iteration, which becomes infeasible to run when $n$ is large such as tens of thousands. - - -### Run Vecchia approximation to get sparse precision matrix - -We propose to use Vecchia approximation to get a new multivariate normal prior of $(X_1,\dots,X_n)$ with sparse precision matrix, which is crucial for scalable implementation of health model with measurement error, but also at the same time it aims to best preserve the spatial dependency information in the original covariance matrix. - -```{r, vecchia} -#Run the Vecchia approximation to get the sparse precision matrix. - -# Vecchia approximation -run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - n.neighbors = 10) -Q_sparse = run_vecchia$Q -run_vecchia$cputime -``` - - -### Fit bspme with sparse precision matrix - -We can now fit the main function of bspme package with sparse precision matrix. - -```{r, sparse, results = "hide"} -# fit the model -fit_me = blinreg_me(Y = health_sim$Y, - X_mean = X_mean, - X_prec = Q_sparse, # sparse precision matrix - Z = health_sim$Z, - nburn = 100, - nsave = 1000, - nthin = 1) -``` -```{r} -fit_me$cputime -``` -It took less than 10 seconds to (1) run Vecchia approximation and (2) run the MCMC algorithm with total 1100 iterations. - -```{r} -summary(fit_me$posterior) -bayesplot::mcmc_trace(fit_me$posterior) -``` - -### Fit bspme without sparse precision matrix - - -As mentioned before, if precision matrix is dense, the posterior computation becomes infeasible when $n$ becomes large, such as tens of thousands. See the following example when $n=3000$. - -```{r, dense, results = "hide"} -# fit the model, without vecchia approximation -# I will only run 11 iteration for illustration purpose -Q_dense = solve(X_cov) # inverting 3000 x 3000 matrix, takes some time -# run -fit_me_dense = blinreg_me(Y = health_sim$Y, - X_mean = X_mean, - X_prec = Q_dense, # dense precision - Z = health_sim$Z, - nburn = 1, - nsave = 10, - nthin = 1) -``` -```{r} -fit_me_dense$cputime -``` - -With even only 11 MCMC iterations, It took about 1 mins to run the MCMC algorithm. -If number of health subject locations is tens of thousands, the naive algorithm using dense precision matrix becomes infeasible, and Vecchia approximation becomes necesssary to carry out the inference.