diff --git a/R/blogireg_me.R b/R/bglm_me.R
similarity index 56%
rename from R/blogireg_me.R
rename to R/bglm_me.R
index 395b061..f2c880b 100644
--- a/R/blogireg_me.R
+++ b/R/bglm_me.R
@@ -1,31 +1,45 @@
-#' Bayesian logistic regression models with correlated measurement errors
+#' Bayesian generalized linear models with spatial exposure measurement error.
 #'
-#' This function implements the Bayesian logistic regression model with correlated measurement error of covariate(s).
-#' Denote \eqn{Y_i} be a binary response, \eqn{X_i} be a \eqn{q\times 1} covariate of \eqn{i}th observation that is subject to measurement error,
+#' This function fits a Bayesian generalized linear model in the presence of spatial exposure measurement error of covariate(s) \eqn{X}, represented as a multivariate normal prior.
+#' One of the most important features of this function is that it allows a sparse matrix input for the prior precision matrix of \eqn{X} for scalable computation.
+#' As of version 1.0.0, only the Bayesian logistic regression model is supported among GLMs, and
+#' the function \code{bglm_me()} runs a Gibbs sampler to carry out posterior inference using Polya-Gamma augmentation (Polson et al., 2013).
+#' See the "Details" section below for the model description, and Lee et al. (2024) for an application example in environmental epidemiology.
+#'
+#' Let \eqn{Y_i} be a binary response, \eqn{X_i} be a \eqn{q\times 1} covariate that is subject to spatial exposure measurement error,
 #' and \eqn{Z_i} be a \eqn{p\times 1} covariate without measurement error.
-#' The likelihood model is Bayesian logistic regression,
-#' \deqn{logit(Pr(Y_i = 1)) = \beta_0 + X_i^\top \beta_x + Z_i^\top \beta_z, \quad i=1,\dots,n, }
-#' independently, and correlated measurement error of \eqn{X_i, i=1,\dots,n} is incorporated into the model as a multivariate normal prior.
-#' For example when \eqn{q=1}, we have \eqn{n-}dimensional multivariate normal prior
+#' Consider the logistic regression model, independently for each \eqn{i=1,\dots,n,}
+#' \deqn{\log(Pr(Y_i = 1)/Pr(Y_i = 0)) = \beta_0 + X_i^\top \beta_X + Z_i^\top \beta_Z,}
+#' and the spatial exposure measurement error of \eqn{X_i} for \eqn{i=1,\dots,n} is incorporated into the model as a multivariate normal prior.
+#' For example, when \eqn{q=1}, we have the \eqn{n-}dimensional multivariate normal prior of \eqn{X = (X_1,\dots,X_n)^\top},
 #' \deqn{(X_1,\dots,X_n)\sim N_n(\mu_X, Q_X^{-1}).}
-#' Also, we consider semiconjugate priors for regression coefficients;
-#' \deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{x,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{z,k} \stackrel{iid}{\sim} N(0, V_\beta)}
-#' The function \code{blogireg_me()} implements the Gibbs sampler for posterior inference using Polya-Gamma augmentation.
-#' Most importantly, it allows sparse matrix input for \eqn{Q_X} for scalable computation.
-#'
-#' @param Y n by 1 matrix, binary responses
-#' @param X_mean n by 1 matrix of mean of X \eqn{\mu_X}, or list of n by 1 matrices of length q.
-#' @param X_prec n by n matrix precision matrix of X \eqn{Q_X}, or list of n by n matrices of length q. Support sparse matrix format from Matrix package.
-#' @param Z n by p matrix, covariates without measurement error
-#' @param nburn integer, burn-in iteration
-#' @param nthin integer, thin-in rate
-#' @param nsave integer, number of posterior samples
-#' @param prior list of prior parameters, default is var_beta = 100,a_Y = 0.01, b_Y = 0.01
-#' @param saveX logical, save X or not
-#'
-#' @return list of (1) posterior, the (nsave)x(q+p) matrix of posterior samples as a coda object,
-#' (2) cputime, cpu time taken in seconds,
-#' and optionally (3) X_save, posterior samples of X
+#' Most importantly, it allows a sparse matrix input for the prior precision matrix \eqn{Q_X} for scalable computation, which can be obtained, for example, by a Vecchia approximation.
+#'
+#' We consider normal priors for the regression coefficients,
+#' \deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{X,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{Z,k} \stackrel{iid}{\sim} N(0, V_\beta),}
+#' where \code{var_beta} corresponds to \eqn{V_\beta}.
+#'
+#' @param Y *vector<int>*, n by 1 binary response vector.
+#' @param X_mean *matrix<num>*, n by 1 prior mean matrix \eqn{\mu_X}. When there are q multiple exposures subject to measurement error, it can be a length q list of n by 1 matrices.
+#' @param X_prec *matrix<num>*, n by n prior precision matrix \eqn{Q_X}, which allows a sparse format from the \link{Matrix} package. When there are q multiple exposures subject to measurement error, it can be a length q list of n by n matrices.
+#' @param Z *matrix<num>*, n by p covariates that are not subject to measurement error.
+#' @param family *class \link[stats]{family}*, a description of the error distribution and link function to be used in the model. Currently only supports \code{binomial(link = "logit")}.
+#' @param nburn *integer*, number of burn-in iterations, default 1000.
+#' @param nsave *integer*, number of posterior samples, default 1000. The total number of MCMC iterations is \code{nburn + nsave * nthin}.
+#' @param nthin *integer*, thinning rate, default 1.
+#' @param prior *list*, list of prior parameters of the regression model. Default is \code{list(var_beta = 100)}.
+#' @param saveX *logical*, default FALSE, whether to save posterior samples of X (exposure).
+#'
+#' @return List of the following:
+#' \describe{
+#' \item{posterior}{The \code{nsave} by (q+p) matrix of posterior samples of \eqn{\beta_X} and \eqn{\beta_Z} as a coda::\link[coda]{mcmc} object.}
+#' \item{time}{Time taken to run the MCMC (in seconds).}
+#' \item{X_save}{(if \code{saveX = TRUE}) Posterior samples of X.}
+#' }
+#'
+#' @references Polson, N. G., Scott, J. G., & Windle, J. (2013). Bayesian inference for logistic models using Pólya–Gamma latent variables. Journal of the American Statistical Association, 108(504), 1339-1349.
+#'
+#' Lee, C. J., Symanski, E., Rammah, A., Kang, D. H., Hopke, P. K., & Park, E. S. (2024). A scalable two-stage Bayesian approach accounting for exposure measurement error in environmental epidemiology. arXiv preprint arXiv:2401.00634.
 #' @export
 #'
 #' @examples
@@ -37,7 +51,7 @@
 #' data(ozone)
 #' data(health_sim)
 #' library(fields)
-#' # exposure mean and covariance at health subject locations with 06/18/1987 data (date id = 16)
+#' # exposure mean and covariance at health participant locations with 06/18/1987 data (date id = 16)
 #' # using Gaussian process with prior mean zero and exponential covariance kernel
 #' # with fixed range 300 (in miles) and stdev 15 (in ppb)
 #'
@@ -88,15 +102,27 @@
 #' bayesplot::mcmc_trace(fit_me$posterior)
 #' }
 #'
-blogireg_me <- function(Y,
+bglm_me <- function(Y,
                        X_mean,
                        X_prec,
                        Z,
-                       nburn = 2000,
-                       nsave = 2000,
-                       nthin = 5,
+                       family = binomial(link = "logit"),
+                       nburn = 1000,
+                       nsave = 1000,
+                       nthin = 1,
                        prior = NULL,
-                       saveX = F){
+                       saveX = FALSE){
+  #### input check ####
+  # check family input
+  # 'family' may be supplied as a character string, a family function, or a family object
+  if (is.character(family)) family <- get(family, mode = "function", envir = parent.frame())
+  if (is.function(family)) family <- family()
+  if (is.null(family$family)) {
+    print(family)
+    stop("'family' not recognized")
+  }
+  if (family$family != "binomial") stop("only binomial family is supported")
+  if (family$link != "logit") stop("only logit link is supported")

   # prior input, default
   if(is.null(prior)){
@@ -105,6 +131,9 @@ blogireg_me <- function(Y,
   var_beta = prior$var_beta

   n_y = length(Y)
+  # check Y is binary (temporary restriction for family = "binomial")
+  if(!all(Y %in% c(0,1))) stop("Y must be binary")
+
   if(is.vector(Z)) Z = as.matrix(Z)
   if(!is.list(X_mean) & !is.list(X_prec)){
     q = 1
@@ -132,6 +161,9 @@ blogireg_me <- function(Y,
       X_spamstruct[[qq]] = spam::chol(X_prec[[qq]])
     }
   }
+
+  #### initialize ####
+
   X = matrix(0, n_y, q)
   for(qq in 1:q) X[,qq] = X_mean[[q]]
   if(is.null(names(X_mean))){
@@ -147,14 +179,13 @@ blogireg_me <- function(Y,
     colnames(Z) = paste0("covariate.",colnames(Z))
   }
   df_temp = as.data.frame(cbind(X,Z))
-  D = model.matrix( ~ ., df_temp)
-
+  D = model.matrix( ~ ., df_temp) # design matrix
   # prior
   Sigma_beta = diag(var_beta, ncol(D))# 3 coefficients(beta0, beta1, betaz)
   Sigma_betainv = solve(Sigma_beta)


-  # initialize
+  # parameter
   beta = rep(0.1, ncol(D))
   omega = rep(0.1, n_y) # aux variable
   Dbeta = D%*%beta
@@ -167,8 +198,9 @@ blogireg_me <- function(Y,
   }
   YtY = crossprod(Y)

-  #browser()
-  # sampler starts
+
+  #### run MCMC ####
+
   isave = 0
   isnegative = numeric(n_y)
   pb <- txtProgressBar(style=3)
@@ -199,7 +231,6 @@ blogireg_me <- function(Y,
       D[,(qq+1)] = as.vector(Xstar)
     }

-
     if((imcmc > nburn)&(imcmc%%nthin==0)){
       isave = isave + 1
       beta_save[isave,] = beta
@@ -207,7 +238,9 @@ blogireg_me <- function(Y,
     }
   }
   t_diff = difftime(Sys.time(), t_start, units = "secs")
-  print(paste0("Exposure components contains negative vaules total ",sum(isnegative)," times among (# exposures) x n_y x (MCMC iter after burnin) = ",q," x ",n_y," x ",nsave*nthin," instances"))
+  #print(paste0("Exposure components contain negative values total ",sum(isnegative)," times among (# exposures) x n_y x (MCMC iter after burnin) = ",q," x ",n_y," x ",nsave*nthin," instances"))
+
+  #### output ####
   out = list()
   out$posterior = beta_save
   out$posterior = coda::mcmc(out$posterior)
diff --git a/R/blinreg_me.R b/R/blm_me.R
similarity index 64%
rename from R/blinreg_me.R
rename to R/blm_me.R
index e2e376c..8791732 100644
--- a/R/blinreg_me.R
+++ b/R/blm_me.R
@@ -1,29 +1,42 @@
-#' Bayesian normal linear regression models with correlated measurement errors
+#' Bayesian linear regression models with spatial exposure measurement error.
 #'
-#' This function implements the Bayesian normal linear regression model with correlated measurement error of covariate(s).
-#' Denote \eqn{Y_i} be a continuous response, \eqn{X_i} be a \eqn{q\times 1} covariate of \eqn{i}th observation that is subject to measurement error,
+#' This function fits a Bayesian linear regression model in the presence of spatial exposure measurement error of covariate(s) \eqn{X}, represented as a multivariate normal prior.
+#' One of the most important features of this function is that it allows a sparse matrix input for the prior precision matrix of \eqn{X} for scalable computation.
+#' The function \code{blm_me()} runs a Gibbs sampler to carry out posterior inference; see the "Details" section below for the model description, and Lee et al. (2024) for an application example in environmental epidemiology.
+#'
+#' Let \eqn{Y_i} be a continuous response, \eqn{X_i} be a \eqn{q\times 1} covariate that is subject to spatial exposure measurement error,
 #' and \eqn{Z_i} be a \eqn{p\times 1} covariate without measurement error.
-#' The likelihood model is Bayesian normal linear regression,
-#' \deqn{Y_i = \beta_0 + X_i^\top \beta_x + Z_i^\top \beta_z + \epsilon_i,\quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n}
-#' and correlated measurement error of \eqn{X_i, i=1,\dots,n} is incorporated into the model as a multivariate normal prior. For example when \eqn{q=1}, we have \eqn{n-}dimensional multivariate normal prior
+#' Consider the normal linear regression model,
+#' \deqn{Y_i = \beta_0 + X_i^\top \beta_X + Z_i^\top \beta_Z + \epsilon_i,\quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n,}
+#' and the spatial exposure measurement error of \eqn{X_i} for \eqn{i=1,\dots,n} is incorporated into the model as a multivariate normal prior.
+#' For example, when \eqn{q=1}, we have the \eqn{n-}dimensional multivariate normal prior of \eqn{X = (X_1,\dots,X_n)^\top},
 #' \deqn{(X_1,\dots,X_n)\sim N_n(\mu_X, Q_X^{-1}).}
-#' Also, we consider semiconjugate priors for regression coefficients and noise variance;
-#' \deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{x,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{z,k} \stackrel{iid}{\sim} N(0, V_\beta), \quad \sigma_Y^2 \sim IG(a_Y, b_Y).}
-#' The function \code{blinreg_me()} implements the Gibbs sampler for posterior inference. Most importantly, it allows sparse matrix input for \eqn{Q_X} for scalable computation.
-#'
-#' @param Y n by 1 matrix, response
-#' @param X_mean n by 1 matrix of mean of X \eqn{\mu_X}, or list of n by 1 matrices of length q.
-#' @param X_prec n by n matrix precision matrix of X \eqn{Q_X}, or list of n by n matrices of length q. Support sparse matrix format from Matrix package.
-#' @param Z n by p matrix, covariates without measurement error
-#' @param nburn integer, burn-in iteration
-#' @param nthin integer, thin-in rate
-#' @param nsave integer, number of posterior samples
-#' @param prior list of prior parameters, default is var_beta = 100,a_Y = 0.01, b_Y = 0.01
-#' @param saveX logical, save X or not
-#'
-#' @return list of (1) posterior, the (nsave)x(q+p) matrix of posterior samples as a coda object,
-#' (2) cputime, cpu time taken in seconds,
-#' and optionally (3) X_save, posterior samples of X
+#' Most importantly, it allows a sparse matrix input for the prior precision matrix \eqn{Q_X} for scalable computation, which can be obtained, for example, by a Vecchia approximation.
+#'
+#' We consider semiconjugate priors for the regression coefficients and the error variance,
+#' \deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{X,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{Z,k} \stackrel{iid}{\sim} N(0, V_\beta), \quad \sigma_Y^2 \sim IG(a_Y, b_Y),}
+#' where \code{var_beta} corresponds to \eqn{V_\beta} and \code{a_Y}, \code{b_Y} correspond to the hyperparameters of the inverse-gamma prior on \eqn{\sigma^2_Y}.
+#'
+#' @param Y *vector<num>*, n by 1 continuous response vector.
+#' @param X_mean *matrix<num>*, n by 1 prior mean matrix \eqn{\mu_X}. When there are q multiple exposures subject to measurement error, it can be a length q list of n by 1 matrices.
+#' @param X_prec *matrix<num>*, n by n prior precision matrix \eqn{Q_X}, which allows a sparse format from the \link{Matrix} package. When there are q multiple exposures subject to measurement error, it can be a length q list of n by n matrices.
+#' @param Z *matrix<num>*, n by p covariates that are not subject to measurement error.
+#' @param nburn *integer*, number of burn-in iterations, default 1000.
+#' @param nsave *integer*, number of posterior samples, default 1000. The total number of MCMC iterations is \code{nburn + nsave * nthin}.
+#' @param nthin *integer*, thinning rate, default 1.
+#' @param prior *list*, list of prior parameters of the regression model. Default is \code{list(var_beta = 100, a_Y = 0.01, b_Y = 0.01)}.
+#' @param saveX *logical*, default FALSE, whether to save posterior samples of X (exposure).
+#'
+#' @return List of the following:
+#' \describe{
+#' \item{posterior}{The \code{nsave} by (q + p + 1) matrix of posterior samples of \eqn{\beta_X}, \eqn{\beta_Z}, \eqn{\sigma_Y^2} as a coda::\link[coda]{mcmc} object.}
+#' \item{time}{Time taken to run the MCMC (in seconds).}
+#' \item{X_save}{(if \code{saveX = TRUE}) Posterior samples of X.}
+#' }
+#'
+#' @references
+#' Lee, C. J., Symanski, E., Rammah, A., Kang, D. H., Hopke, P. K., & Park, E. S. (2024). A scalable two-stage Bayesian approach accounting for exposure measurement error in environmental epidemiology. arXiv preprint arXiv:2401.00634.
+#' #' @export #' #' @examples @@ -86,16 +99,16 @@ #' bayesplot::mcmc_trace(fit_me$posterior) #' } #' -blinreg_me <- function(Y, +blm_me <- function(Y, X_mean, X_prec, Z, - nburn = 2000, - nsave = 2000, - nthin = 5, + nburn = 1000, + nsave = 1000, + nthin = 1, prior = NULL, - saveX = F){ - + saveX = FALSE){ + #### input check #### # prior input, default if(is.null(prior)){ prior = list(var_beta = 100,a_Y = 0.01, b_Y = 0.01) @@ -134,6 +147,8 @@ blinreg_me <- function(Y, } } + #### initialize #### + X = matrix(0, n_y, q) for(qq in 1:q) X[,qq] = X_mean[[q]] if(is.null(names(X_mean))){ @@ -156,7 +171,7 @@ blinreg_me <- function(Y, Sigma_beta = diag(var_beta, ncol(D))# 3 coefficients(beta0, beta1, betaz) Sigma_betainv = solve(Sigma_beta) - # initialize + # parameter sigma2_Y = 1 beta = rep(0.1, ncol(D)) @@ -170,8 +185,9 @@ blinreg_me <- function(Y, } YtY = crossprod(Y) - #browser() - # sampler starts + + #### run MCMC #### + isave = 0 isnegative = numeric(n_y) pb <- txtProgressBar(style=3) @@ -212,6 +228,8 @@ blinreg_me <- function(Y, } t_diff = difftime(Sys.time(), t_start, units = "secs") #print(paste0("Exposure components contains negative vaules total ",sum(isnegative)," times among (# exposures) x n_y x (MCMC iter after burnin) = ",q," x ",n_y," x ",nsave*nthin," instances")) + + #### output #### out = list() out$posterior = cbind(beta_save, sigma2_save) out$posterior = coda::mcmc(out$posterior) diff --git a/man/bglm_me.Rd b/man/bglm_me.Rd new file mode 100644 index 0000000..c629e14 --- /dev/null +++ b/man/bglm_me.Rd @@ -0,0 +1,135 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bglm_me.R +\name{bglm_me} +\alias{bglm_me} +\title{Bayesian generalized linear models with spatial exposure measurement error.} +\usage{ +bglm_me( + Y, + X_mean, + X_prec, + Z, + family = binomial(link = "logit"), + nburn = 1000, + nsave = 1000, + nthin = 1, + prior = NULL, + saveX = FALSE +) +} +\arguments{ +\item{Y}{\emph{vector}, n by 1 binary response vector.} + +\item{X_mean}{\emph{matrix}, n by 1 prior mean matrix \eqn{\mu_X}. When there are q multiple exposures subject to measurement error, it can be a length q list of n by 1 matrices.} + +\item{X_prec}{\emph{matrix}, n by 1 prior precision matrix \eqn{Q_X}, which allows sparse format from \link{Matrix} package. When there are q multiple exposures subject to measurement error, it can be a length q list of n by n matrices.} + +\item{Z}{\emph{matrix}, n by p covariates that are not subject to measurement error.} + +\item{family}{\emph{class \link[stats]{family}}, a description of the error distribution and link function to be used in the model. Currently only supports binomial(link = "logit").} + +\item{nburn}{\emph{integer}, burn-in iteration, default 1000.} + +\item{nsave}{\emph{integer}, number of posterior samples, default 1000. Total number of MCMC iteration is \code{nburn + nsave * nthin}.} + +\item{nthin}{\emph{integer}, thin-in rate, default 1.} + +\item{prior}{\emph{list}, list of prior parameters of regression model. 
Default is \code{list(var_beta = 100)}.} + +\item{saveX}{\emph{logical}, default FALSE, whether save posterior samples of X (exposure).} +} +\value{ +List of the following: +\describe{ +\item{posterior}{The \code{nsave} by (q+p) matrix of posterior samples of \eqn{\beta_x} and \eqn{\beta_z} as a coda::\link[coda]{mcmc} object.} +\item{time}{Time taken for running MCMC (in seconds)} +\item{X_save}{ (if \code{saveX = TRUE}) Posterior samples of X} +} +} +\description{ +This function fits Bayesian generalized linear model in the presence of spatial exposure measurement error of covariate(s) \eqn{X}, represented as a multivariate normal prior. +One of the most important features of this function is that it allows sparse matrix input for the prior precision matrix of \eqn{X} for scalable computation. +As of version 1.0.0, only Bayesian logistic regression model is supported among GLMs, and +function \code{bglm_me()} runs a Gibbs sampler to carry out posterior inference using Polya-Gamma augmentation (Polson et al., 2013). +See below "Details" section for the model description, and Lee et al. (2024) for an application example in environmental epidemiology. +} +\details{ +Denote \eqn{Y_i} be a binary response, \eqn{X_i} be a \eqn{q\times 1} covariate that is subject to spatial exposure measurement error, +and \eqn{Z_i} be a \eqn{p\times 1} covariate without measurement error. +Consider logistic regression model, independently for each \eqn{i=1,\dots,n,} +\deqn{\log(Pr(Y_i = 1)/Pr(Y_i = 0)) = \beta_0 + X_i^\top \beta_X + Z_i^\top \beta_Z,} +and spatial exposure measurement error of \eqn{X_i} for \eqn{i=1,\dots,n} is incorporated into the model as a multivariate normal prior. +For example when \eqn{q=1}, we have \eqn{n-}dimensional multivariate normal prior of \eqn{X = (X_1,\dots,X_n)^\top}, +\deqn{(X_1,\dots,X_n)\sim N_n(\mu_X, Q_X^{-1}).} +Most importantly, it allows sparse matrix input for the prior precision matrix \eqn{Q_X} for scalable computation, which could be obtained by Vecchia approximation. + +We consider normal priors for regression coefficients, +\deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{X,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{Z,k} \stackrel{iid}{\sim} N(0, V_\beta)} +where \code{var_beta} corresponds to \eqn{V_\beta}. 
+}
+\examples{
+
+\dontrun{
+data(ozone)
+data(health_sim)
+library(bspme)
+data(ozone)
+data(health_sim)
+library(fields)
+# exposure mean and covariance at health participant locations with 06/18/1987 data (date id = 16)
+# using Gaussian process with prior mean zero and exponential covariance kernel
+# with fixed range 300 (in miles) and stdev 15 (in ppb)
+
+ozone16 = ozone[ozone$date_id==16,]
+
+Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat))
+Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat))
+Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat),
+                  cbind(health_sim$coords_y_lon, health_sim$coords_y_lat))
+
+Kxx = Exponential(Dxx, range = 300, phi=15^2)
+Kyy = Exponential(Dyy, range = 300, phi=15^2)
+Kxy = Exponential(Dxy, range = 300, phi=15^2)
+
+X_mean = t(Kxy) \%*\% solve(Kxx, ozone16$ozone_ppb)
+X_cov = Kyy - t(Kxy) \%*\% solve(Kxx, Kxy)
+
+# visualize
+par(mfrow = c(1,3))
+quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat),
+           ozone16$ozone_ppb, main = "ozone measurements"); US(add= T)
+
+quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat),
+           X_mean, main = "health subjects, mean of exposure"); US(add= T)
+points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17)
+
+quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat),
+           sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T)
+points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17)
+
+# vecchia approximation
+run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat),
+                          n.neighbors = 10)
+Q_sparse = run_vecchia$Q
+run_vecchia$cputime
+
+# fit the model
+fit_me = bglm_me(Y = health_sim$Ybinary,
+                 X_mean = X_mean,
+                 X_prec = Q_sparse, # sparse precision matrix
+                 Z = health_sim$Z,
+                 nburn = 100,
+                 nsave = 1000,
+                 nthin = 1)
+fit_me$time
+summary(fit_me$posterior)
+library(bayesplot)
+bayesplot::mcmc_trace(fit_me$posterior)
+}
+
+}
+\references{
+Polson, N. G., Scott, J. G., & Windle, J. (2013). Bayesian inference for logistic models using Pólya–Gamma latent variables. Journal of the American Statistical Association, 108(504), 1339-1349.
+
+Lee, C. J., Symanski, E., Rammah, A., Kang, D. H., Hopke, P. K., & Park, E. S. (2024). A scalable two-stage Bayesian approach accounting for exposure measurement error in environmental epidemiology. arXiv preprint arXiv:2401.00634.
+}
diff --git a/man/blinreg_me.Rd b/man/blinreg_me.Rd
deleted file mode 100644
index c90e592..0000000
--- a/man/blinreg_me.Rd
+++ /dev/null
@@ -1,115 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/blinreg_me.R
-\name{blinreg_me}
-\alias{blinreg_me}
-\title{Bayesian normal linear regression models with correlated measurement errors}
-\usage{
-blinreg_me(
-  Y,
-  X_mean,
-  X_prec,
-  Z,
-  nburn = 2000,
-  nsave = 2000,
-  nthin = 5,
-  prior = NULL,
-  saveX = F
-)
-}
-\arguments{
-\item{Y}{n by 1 matrix, response}
-
-\item{X_mean}{n by 1 matrix of mean of X \eqn{\mu_X}, or list of n by 1 matrices of length q.}
-
-\item{X_prec}{n by n matrix precision matrix of X \eqn{Q_X}, or list of n by n matrices of length q.
Support sparse matrix format from Matrix package.} - -\item{Z}{n by p matrix, covariates without measurement error} - -\item{nburn}{integer, burn-in iteration} - -\item{nsave}{integer, number of posterior samples} - -\item{nthin}{integer, thin-in rate} - -\item{prior}{list of prior parameters, default is var_beta = 100,a_Y = 0.01, b_Y = 0.01} - -\item{saveX}{logical, save X or not} -} -\value{ -list of (1) posterior, the (nsave)x(q+p) matrix of posterior samples as a coda object, -(2) cputime, cpu time taken in seconds, -and optionally (3) X_save, posterior samples of X -} -\description{ -This function implements the Bayesian normal linear regression model with correlated measurement error of covariate(s). -Denote \eqn{Y_i} be a continuous response, \eqn{X_i} be a \eqn{q\times 1} covariate of \eqn{i}th observation that is subject to measurement error, -and \eqn{Z_i} be a \eqn{p\times 1} covariate without measurement error. -The likelihood model is Bayesian normal linear regression, -\deqn{Y_i = \beta_0 + X_i^\top \beta_x + Z_i^\top \beta_z + \epsilon_i,\quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n} -and correlated measurement error of \eqn{X_i, i=1,\dots,n} is incorporated into the model as a multivariate normal prior. For example when \eqn{q=1}, we have \eqn{n-}dimensional multivariate normal prior -\deqn{(X_1,\dots,X_n)\sim N_n(\mu_X, Q_X^{-1}).} -Also, we consider semiconjugate priors for regression coefficients and noise variance; -\deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{x,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{z,k} \stackrel{iid}{\sim} N(0, V_\beta), \quad \sigma_Y^2 \sim IG(a_Y, b_Y).} -The function \code{blinreg_me()} implements the Gibbs sampler for posterior inference. Most importantly, it allows sparse matrix input for \eqn{Q_X} for scalable computation. 
-} -\examples{ - -\dontrun{ -data(ozone) -data(health_sim) -library(bspme) -data(ozone) -data(health_sim) -library(fields) -# exposure mean and covariance at health subject locations with 06/18/1987 data (date id = 16) -# using Gaussian process with prior mean zero and exponential covariance kernel -# with fixed range 300 (in miles) and stdev 15 (in ppb) - -ozone16 = ozone[ozone$date_id==16,] - -Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) -Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) -Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), - cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) - -Kxx = Exponential(Dxx, range = 300, phi=15^2) -Kyy = Exponential(Dyy, range = 300, phi=15^2) -Kxy = Exponential(Dxy, range = 300, phi=15^2) - -X_mean = t(Kxy) \%*\% solve(Kxx, ozone16$ozone_ppb) -X_cov = Kyy - t(Kxy) \%*\% solve(Kxx, Kxy) - -# visualize -par(mfrow = c(1,3)) -quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), - ozone16$ozone_ppb, main = "ozone measurements"); US(add= T) - -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - X_mean, main = "health subjects, mean of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) - -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) - -# vecchia approximation -run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - n.neighbors = 10) -Q_sparse = run_vecchia$Q -run_vecchia$cputime - -# fit the model -fit_me = blinreg_me(Y = health_sim$Y, - X_mean = X_mean, - X_prec = Q_sparse, # sparse precision matrix - Z = health_sim$Z, - nburn = 100, - nsave = 1000, - nthin = 1) -fit_me$cputime -summary(fit_me$posterior) -library(bayesplot) -bayesplot::mcmc_trace(fit_me$posterior) -} - -} diff --git a/man/blm_me.Rd b/man/blm_me.Rd new file mode 100644 index 0000000..b250af7 --- /dev/null +++ b/man/blm_me.Rd @@ -0,0 +1,128 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/blm_me.R +\name{blm_me} +\alias{blm_me} +\title{Bayesian linear regression models with spatial exposure measurement error.} +\usage{ +blm_me( + Y, + X_mean, + X_prec, + Z, + nburn = 1000, + nsave = 1000, + nthin = 1, + prior = NULL, + saveX = FALSE +) +} +\arguments{ +\item{Y}{\emph{vector}, n by 1 continuous response vector.} + +\item{X_mean}{\emph{matrix}, n by 1 prior mean matrix \eqn{\mu_X}. When there are q multiple exposures subject to measurement error, it can be a length q list of n by 1 matrices.} + +\item{X_prec}{\emph{matrix}, n by 1 prior precision matrix \eqn{Q_X}, which allows sparse format from \link{Matrix} package. When there are q multiple exposures subject to measurement error, it can be a length q list of n by n matrices.} + +\item{Z}{\emph{matrix}, n by p covariates that are not subject to measurement error.} + +\item{nburn}{\emph{integer}, burn-in iteration, default 1000.} + +\item{nsave}{\emph{integer}, number of posterior samples, default 1000. Total number of MCMC iteration is \code{nburn + nsave * nthin}.} + +\item{nthin}{\emph{integer}, thin-in rate, default 1.} + +\item{prior}{\emph{list}, list of prior parameters of regression model. 
Default is \code{list(var_beta = 100, a_Y = 0.01, b_Y = 0.01)}.}
+
+\item{saveX}{\emph{logical}, default FALSE, whether to save posterior samples of X (exposure).}
+}
+\value{
+List of the following:
+\describe{
+\item{posterior}{The \code{nsave} by (q + p + 1) matrix of posterior samples of \eqn{\beta_X}, \eqn{\beta_Z}, \eqn{\sigma_Y^2} as a coda::\link[coda]{mcmc} object.}
+\item{time}{Time taken to run the MCMC (in seconds).}
+\item{X_save}{(if \code{saveX = TRUE}) Posterior samples of X.}
+}
+}
+\description{
+This function fits a Bayesian linear regression model in the presence of spatial exposure measurement error of covariate(s) \eqn{X}, represented as a multivariate normal prior.
+One of the most important features of this function is that it allows a sparse matrix input for the prior precision matrix of \eqn{X} for scalable computation.
+The function \code{blm_me()} runs a Gibbs sampler to carry out posterior inference; see the "Details" section below for the model description, and Lee et al. (2024) for an application example in environmental epidemiology.
+}
+\details{
+Let \eqn{Y_i} be a continuous response, \eqn{X_i} be a \eqn{q\times 1} covariate that is subject to spatial exposure measurement error,
+and \eqn{Z_i} be a \eqn{p\times 1} covariate without measurement error.
+Consider the normal linear regression model,
+\deqn{Y_i = \beta_0 + X_i^\top \beta_X + Z_i^\top \beta_Z + \epsilon_i,\quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n,}
+and the spatial exposure measurement error of \eqn{X_i} for \eqn{i=1,\dots,n} is incorporated into the model as a multivariate normal prior.
+For example, when \eqn{q=1}, we have the \eqn{n-}dimensional multivariate normal prior of \eqn{X = (X_1,\dots,X_n)^\top},
+\deqn{(X_1,\dots,X_n)\sim N_n(\mu_X, Q_X^{-1}).}
+Most importantly, it allows a sparse matrix input for the prior precision matrix \eqn{Q_X} for scalable computation, which can be obtained, for example, by a Vecchia approximation.
+
+We consider semiconjugate priors for the regression coefficients and the error variance,
+\deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{X,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{Z,k} \stackrel{iid}{\sim} N(0, V_\beta), \quad \sigma_Y^2 \sim IG(a_Y, b_Y),}
+where \code{var_beta} corresponds to \eqn{V_\beta} and \code{a_Y}, \code{b_Y} correspond to the hyperparameters of the inverse-gamma prior on \eqn{\sigma^2_Y}.
+}
+\examples{
+
+\dontrun{
+data(ozone)
+data(health_sim)
+library(bspme)
+data(ozone)
+data(health_sim)
+library(fields)
+# exposure mean and covariance at health subject locations with 06/18/1987 data (date id = 16)
+# using Gaussian process with prior mean zero and exponential covariance kernel
+# with fixed range 300 (in miles) and stdev 15 (in ppb)
+
+ozone16 = ozone[ozone$date_id==16,]
+
+Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat))
+Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat))
+Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat),
+                  cbind(health_sim$coords_y_lon, health_sim$coords_y_lat))
+
+Kxx = Exponential(Dxx, range = 300, phi=15^2)
+Kyy = Exponential(Dyy, range = 300, phi=15^2)
+Kxy = Exponential(Dxy, range = 300, phi=15^2)
+
+X_mean = t(Kxy) \%*\% solve(Kxx, ozone16$ozone_ppb)
+X_cov = Kyy - t(Kxy) \%*\% solve(Kxx, Kxy)
+
+# visualize
+par(mfrow = c(1,3))
+quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat),
+           ozone16$ozone_ppb, main = "ozone measurements"); US(add= T)
+
+quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat),
+           X_mean, main = "health subjects, mean of exposure"); US(add= T)
+points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17)
+
+quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat),
+           sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T)
+points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17)
+
+# vecchia approximation
+run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat),
+                          n.neighbors = 10)
+Q_sparse = run_vecchia$Q
+run_vecchia$cputime
+
+# fit the model
+fit_me = blm_me(Y = health_sim$Y,
+                X_mean = X_mean,
+                X_prec = Q_sparse, # sparse precision matrix
+                Z = health_sim$Z,
+                nburn = 100,
+                nsave = 1000,
+                nthin = 1)
+fit_me$time
+summary(fit_me$posterior)
+library(bayesplot)
+bayesplot::mcmc_trace(fit_me$posterior)
+}
+
+}
+\references{
+Lee, C. J., Symanski, E., Rammah, A., Kang, D. H., Hopke, P. K., & Park, E. S. (2024). A scalable two-stage Bayesian approach accounting for exposure measurement error in environmental epidemiology. arXiv preprint arXiv:2401.00634.
+}
diff --git a/man/blogireg_me.Rd b/man/blogireg_me.Rd
deleted file mode 100644
index 7d53dab..0000000
--- a/man/blogireg_me.Rd
+++ /dev/null
@@ -1,117 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/blogireg_me.R
-\name{blogireg_me}
-\alias{blogireg_me}
-\title{Bayesian logistic regression models with correlated measurement errors}
-\usage{
-blogireg_me(
-  Y,
-  X_mean,
-  X_prec,
-  Z,
-  nburn = 2000,
-  nsave = 2000,
-  nthin = 5,
-  prior = NULL,
-  saveX = F
-)
-}
-\arguments{
-\item{Y}{n by 1 matrix, binary responses}
-
-\item{X_mean}{n by 1 matrix of mean of X \eqn{\mu_X}, or list of n by 1 matrices of length q.}
-
-\item{X_prec}{n by n matrix precision matrix of X \eqn{Q_X}, or list of n by n matrices of length q.
Support sparse matrix format from Matrix package.} - -\item{Z}{n by p matrix, covariates without measurement error} - -\item{nburn}{integer, burn-in iteration} - -\item{nsave}{integer, number of posterior samples} - -\item{nthin}{integer, thin-in rate} - -\item{prior}{list of prior parameters, default is var_beta = 100,a_Y = 0.01, b_Y = 0.01} - -\item{saveX}{logical, save X or not} -} -\value{ -list of (1) posterior, the (nsave)x(q+p) matrix of posterior samples as a coda object, -(2) cputime, cpu time taken in seconds, -and optionally (3) X_save, posterior samples of X -} -\description{ -This function implements the Bayesian logistic regression model with correlated measurement error of covariate(s). -Denote \eqn{Y_i} be a binary response, \eqn{X_i} be a \eqn{q\times 1} covariate of \eqn{i}th observation that is subject to measurement error, -and \eqn{Z_i} be a \eqn{p\times 1} covariate without measurement error. -The likelihood model is Bayesian logistic regression, -\deqn{logit(Pr(Y_i = 1)) = \beta_0 + X_i^\top \beta_x + Z_i^\top \beta_z, \quad i=1,\dots,n, } -independently, and correlated measurement error of \eqn{X_i, i=1,\dots,n} is incorporated into the model as a multivariate normal prior. -For example when \eqn{q=1}, we have \eqn{n-}dimensional multivariate normal prior -\deqn{(X_1,\dots,X_n)\sim N_n(\mu_X, Q_X^{-1}).} -Also, we consider semiconjugate priors for regression coefficients; -\deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{x,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{z,k} \stackrel{iid}{\sim} N(0, V_\beta)} -The function \code{blogireg_me()} implements the Gibbs sampler for posterior inference using Polya-Gamma augmentation. -Most importantly, it allows sparse matrix input for \eqn{Q_X} for scalable computation. -} -\examples{ - -\dontrun{ -data(ozone) -data(health_sim) -library(bspme) -data(ozone) -data(health_sim) -library(fields) -# exposure mean and covariance at health subject locations with 06/18/1987 data (date id = 16) -# using Gaussian process with prior mean zero and exponential covariance kernel -# with fixed range 300 (in miles) and stdev 15 (in ppb) - -ozone16 = ozone[ozone$date_id==16,] - -Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) -Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) -Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), - cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) - -Kxx = Exponential(Dxx, range = 300, phi=15^2) -Kyy = Exponential(Dyy, range = 300, phi=15^2) -Kxy = Exponential(Dxy, range = 300, phi=15^2) - -X_mean = t(Kxy) \%*\% solve(Kxx, ozone16$ozone_ppb) -X_cov = Kyy - t(Kxy) \%*\% solve(Kxx, Kxy) - -# visualize -par(mfrow = c(1,3)) -quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), - ozone16$ozone_ppb, main = "ozone measurements"); US(add= T) - -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - X_mean, main = "health subjects, mean of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) - -quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) -points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) - -# vecchia approximation -run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), - n.neighbors = 10) -Q_sparse = run_vecchia$Q -run_vecchia$cputime - -# fit the model -fit_me = blogireg_me(Y = health_sim$Ybinary, - X_mean = X_mean, - X_prec = Q_sparse, # 
sparse precision matrix - Z = health_sim$Z, - nburn = 100, - nsave = 1000, - nthin = 1) -fit_me$cputime -summary(fit_me$posterior) -library(bayesplot) -bayesplot::mcmc_trace(fit_me$posterior) -} - -}
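
Editor's note, not part of the patch: a minimal end-to-end usage sketch of the two renamed entry points, assembled from the examples embedded in this diff. It assumes the bspme package with the ozone and health_sim datasets, the vecchia_cov() helper, and the renamed output elements (posterior, time) exactly as documented above; nothing beyond what the patch itself states has been verified.

# minimal usage sketch (assumes bspme as documented in this patch)
library(bspme)
library(fields)
data(ozone); data(health_sim)

ozone16 <- ozone[ozone$date_id == 16, ]

# exposure mean and covariance at health participant locations (as in the examples above)
Dxx <- rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat))
Dyy <- rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat))
Dxy <- rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat),
                   cbind(health_sim$coords_y_lon, health_sim$coords_y_lat))
Kxx <- Exponential(Dxx, range = 300, phi = 15^2)
Kyy <- Exponential(Dyy, range = 300, phi = 15^2)
Kxy <- Exponential(Dxy, range = 300, phi = 15^2)
X_mean <- t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb)
X_cov  <- Kyy - t(Kxy) %*% solve(Kxx, Kxy)

# sparse prior precision of X via Vecchia approximation
run_vecchia <- vecchia_cov(X_cov,
                           coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat),
                           n.neighbors = 10)
Q_sparse <- run_vecchia$Q

# linear model, continuous outcome
fit_lm <- blm_me(Y = health_sim$Y, X_mean = X_mean, X_prec = Q_sparse,
                 Z = health_sim$Z, nburn = 100, nsave = 1000, nthin = 1)

# logistic model, binary outcome (only binomial/logit is supported as of 1.0.0)
fit_glm <- bglm_me(Y = health_sim$Ybinary, X_mean = X_mean, X_prec = Q_sparse,
                   Z = health_sim$Z, family = binomial(link = "logit"),
                   nburn = 100, nsave = 1000, nthin = 1)

fit_lm$time                 # elapsed seconds (renamed from cputime)
summary(fit_glm$posterior)  # coda::mcmc object of regression coefficients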