diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..7fff937 Binary files /dev/null and b/.DS_Store differ diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..e1da2e5 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,9 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^LICENSE\.md$ +^README\.Rmd$ +^data-raw$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ +^\.github$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 0000000..ed7650c --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,48 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + release: + types: [published] + workflow_dispatch: + +name: pkgdown + +jobs: + pkgdown: + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. + needs: website + + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to GitHub pages 🚀 + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@v4.4.1 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d7f03b --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata +docs +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..8d67895 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,27 @@ +Package: bspme +Type: Package +Title: Bayesian Spatial Measurement Error Models +Version: 0.2.0 +Authors@R: c(person("Changwoo", "Lee", role=c("aut", "cre"), email="c.lee@stat.tamu.edu"), person("Elaine", "Symanski", role = c('aut')), person("Amal", "Rammah", role = c('aut')), person("Dong Hun", "Kang", role = c('aut')), person("Philip", "Hopke", role = c('aut')), person("Eun Sug", "Park", role = c("aut"))) +Author: Changwoo Lee[aut, cre], Eun Sug Park[aut], Elaine Symanski[aut], Amal Rammah[aut], Dong Hun Kang[aut], Philip Hopke[aut] +Maintainer: Changwoo Lee +Description: Functions for fitting Bayesian linear and genearlized linear models in presence of spatial measurement error of the covariates. +License: MIT + file LICENSE +Encoding: UTF-8 +LazyData: true +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.2.3 +Imports: + coda, + fields, + spam, + spNNGP +Depends: + Matrix, + R (>= 2.10) +URL: https://changwoo-lee.github.io/bspme/ +BugReports: https://github.com/changwoo-lee/bspme/issues +Suggests: + knitr, + rmarkdown +VignetteBuilder: knitr diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..f5cb406 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +# MIT License + +Copyright (c) 2023 bspme authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..5aeb339 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,5 @@ +# Generated by roxygen2: do not edit by hand + +export(blinreg_me) +export(vecchia_cov) +importFrom(Matrix,Diagonal) diff --git a/R/blinreg_me.R b/R/blinreg_me.R new file mode 100644 index 0000000..38c2400 --- /dev/null +++ b/R/blinreg_me.R @@ -0,0 +1,221 @@ +#' Bayesian normal linear regression models with correlated measurement errors +#' +#' This function implements the Bayesian normal linear regression model with correlated measurement error of covariate(s). +#' Denote \eqn{Y_i} be a continuous response, \eqn{X_i} be a \eqn{q\times 1} covariate of \eqn{i}th observation that is subject to measurement error, +#' and \eqn{Z_i} be a \eqn{p\times 1} covariate without measurement error. +#' The likelihood model is Bayesian normal linear regression, +#' \deqn{Y_i = \beta_0 + X_i^\top \beta_x + Z_i^\top \beta_z + \epsilon_i,\quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n} +#' and correlated measurement error of \eqn{X_i, i=1,\dots,n} is incorporated into the model as a multivariate normal prior. For example when \eqn{q=1}, we have \eqn{n-}dimensional multivariate normal prior +#' \deqn{(X_1,\dots,X_n)\sim N_n(\mu_X, Q_X^{-1}).} +#' Also, we consider semiconjugate priors for regression coefficients and noise variance; +#' \deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{x,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{z,k} \stackrel{iid}{\sim} N(0, V_\beta), \quad \sigma_Y^2 \sim IG(a_Y, b_Y).} +#' The function \code{blinreg_me()} implements the Gibbs sampler for posterior inference. Most importantly, it allows sparse matrix input for \eqn{Q_X} for scalable computation. +#' +#' @param Y n by 1 matrix, response +#' @param X_mean n by 1 matrix or list of n by 1 matrices of length q, mean of X \eqn{\mu_X}. +#' @param X_prec n by n matrix or list of n by n matrices of length q, precision matrix of X \eqn{Q_X}. Support sparse matrix format from Matrix package. +#' @param Z n by p matrix, covariates without measurement error +#' @param nburn integer, burn-in iteration +#' @param nthin integer, thin-in rate +#' @param nsave integer, number of posterior samples +#' @param prior list of prior parameters, default is var_beta = 100,a_Y = 0.01, b_Y = 0.01 +#' @param saveX logical, save X or not +#' +#' @return list of (1) posterior, the (nsave)x(q+p) matrix of posterior samples as a coda object, +#' (2) cputime, cpu time taken in seconds, +#' and optionally (3) X_save, posterior samples of X +#' @export +#' +#' @examples +#' +#'\dontrun{ +#' data(ozone) +#' data(health_sim) +#' library(bspme) +#' data(ozone) +#' data(health_sim) +#' library(fields) +#' # exposure mean and covariance at health subject locations with 06/18/1987 data (date id = 16) +#' # using Gaussian process with prior mean zero and exponential covariance kernel +#' # with fixed range 300 (in miles) and stdev 15 (in ppb) +#' +#' ozone16 = ozone[ozone$date_id==16,] +#' +#' Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) +#' Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +#' Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), +#' cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +#' +#' Kxx = Exponential(Dxx, range = 300, phi=15^2) +#' Kyy = Exponential(Dyy, range = 300, phi=15^2) +#' Kxy = Exponential(Dxy, range = 300, phi=15^2) +#' +#' X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb) +#' X_cov = Kyy - t(Kxy) %*% solve(Kxx, Kxy) +#' +#' # visualize +#' par(mfrow = c(1,3)) +#' quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), +#' ozone16$ozone_ppb, main = "ozone measurements"); US(add= T) +#' +#' quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), +#' X_mean, main = "health subjects, mean of exposure"); US(add= T) +#' points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) +#' +#' quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), +#' sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) +#' points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) +#' +#' # vecchia approximation +#' run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), +#' n.neighbors = 10) +#' Q_sparse = run_vecchia$Q +#' run_vecchia$cputime +#' +#' # fit the model +#' fit_me = blm_me(Y = health_sim$Y, +#' X_mean = X_mean, +#' X_prec = Q_sparse, # sparse precision matrix +#' Z = health_sim$Z, +#' nburn = 100, +#' nsave = 1000, +#' nthin = 1) +#' fit_me$cputime +#' summary(fit_me$posterior) +#' library(bayesplot) +#' bayesplot::mcmc_trace(fit_me$posterior) +#' } +#' +blinreg_me <- function(Y, + X_mean, + X_prec, + Z, + nburn = 2000, + nsave = 2000, + nthin = 5, + prior = NULL, + saveX = F){ + + # prior input, default + if(is.null(prior)){ + prior = list(var_beta = 100,a_Y = 0.01, b_Y = 0.01) + } + var_beta = 100 + a_Y = 0.01 + b_Y = 0.01 + + n_y = length(Y) + if(is.vector(Z)) Z = as.matrix(Z) + + if(!is.list(X_mean) & !is.list(X_prec)){ + q = 1 + X_mean = list(X_mean) + X_prec = list(X_prec) + }else if(is.list(X_mean) & is.list(X_prec)){ + q = length(X_mean) + if(length(X_prec)!=q) stop("list length does not match") + }else{ + stop("X_mean is not vector/matrix or list") + } + X_prec_X_mean = list() + X_spamstruct = vector(mode = 'list', length = q) + sparsealgo = rep(T,q) + + for(qq in 1:q){ + X_prec_X_mean[[qq]] = as.numeric(X_prec[[qq]]%*%X_mean[[qq]]) + + if(!("sparseMatrix" %in% is(X_prec[[qq]]))){ + print(paste0(qq,"th X_prec is not a sparse matrix! Using dense algorithm, which may very slow when n is large")) + sparsealgo[qq] = F + }else{ + X_prec[[qq]] = as(as(X_prec[[qq]], "generalMatrix"), "CsparseMatrix") + X_prec[[qq]] = spam::as.spam.dgCMatrix(X_prec[[qq]])# spam object + X_spamstruct[[qq]] = spam::chol(X_prec[[qq]]) + } + } + + X = matrix(0, n_y, q) + for(qq in 1:q) X[,qq] = X_mean[[q]] + if(is.null(names(X_mean))){ + colnames(X) = paste0("exposure.",1:q) + }else{ + colnames(X) = paste0("exposure.",names(X_mean)) + } + + p = ncol(Z) + if(is.null(colnames(Z))){ + colnames(Z) = paste0("covariate.",1:p) + }else{ + colnames(Z) = paste0("covariate.",colnames(Z)) + } + df_temp = as.data.frame(cbind(X,Z)) + D = model.matrix( ~ ., df_temp) + + + # prior + Sigma_beta = diag(var_beta, ncol(D))# 3 coefficients(beta0, beta1, betaz) + Sigma_betainv = solve(Sigma_beta) + + # initialize + sigma2_Y = 1 + beta = rep(0.1, ncol(D)) + + sigma2_save = matrix(0, nsave, 1) + colnames(sigma2_save) = "sigma2_Y" + beta_save = matrix(0, nsave, ncol(D)) + colnames(beta_save) <- colnames(D) + if(saveX){ + X_save = array(0, dim = c(nsave, n_y, q)) + dimnames(X_save)[[3]] = names(X_mean) + } + + YtY = crossprod(Y) + #browser() + # sampler starts + isave = 0 + isnegative = numeric(n_y) + pb <- txtProgressBar(style=3) + t_start = Sys.time() + for(imcmc in 1:(nsave*nthin + nburn)){ + setTxtProgressBar(pb, imcmc/(nsave*nthin + nburn)) + # sample beta + Vbetainv = Sigma_betainv + crossprod(D)/sigma2_Y + betatilde = solve(Vbetainv,crossprod(D,Y)/sigma2_Y) + beta = as.numeric(spam::rmvnorm.prec(1, mu = betatilde, Q = Vbetainv)) + # sample sigma2_Y + SSR = crossprod(Y - D%*%beta) + sigma2_Y = 1/rgamma(1, a_Y + n_y/2, b_Y + SSR/2 ) + + for(qq in 1:q){ + # 1st is intercept + b_G = X_prec_X_mean[[qq]] + beta[qq + 1]/sigma2_Y*(Y-D[,-(qq+1)]%*%beta[-(qq+1)]) + Qtilde = X_prec[[qq]] # dense or spam + if(sparsealgo[qq]){ + Qtilde = Qtilde + spam::diag.spam(beta[qq + 1]^2/sigma2_Y, n_y, n_y) + }else{ + diag(Qtilde) = diag(Qtilde) + beta[qq + 1]^2/sigma2_Y + } + Xstar = spam::rmvnorm.canonical(1, b = as.vector(b_G), + Q = Qtilde,# dense or spam + Rstruct = X_spamstruct[[qq]]) #browser() + if(imcmc > nburn) isnegative = isnegative + (Xstar<0) + D[,(qq+1)] = as.vector(Xstar) + } + + + if((imcmc > nburn)&(imcmc%%nthin==0)){ + isave = isave + 1 + sigma2_save[isave] = sigma2_Y + beta_save[isave,] = beta + if(saveX) X_save[isave,,] = D[,2:(q+1)] + } + } + t_diff = difftime(Sys.time(), t_start, units = "secs") + #print(paste0("Exposure components contains negative vaules total ",sum(isnegative)," times among (# exposures) x n_y x (MCMC iter after burnin) = ",q," x ",n_y," x ",nsave*nthin," instances")) + out = list() + out$posterior = cbind(beta_save, sigma2_save) + out$posterior = coda::mcmc(out$posterior) + out$cputime = t_diff + if(saveX) out$X_save = X_save + out +} diff --git a/R/bspme-package.R b/R/bspme-package.R new file mode 100644 index 0000000..829e0ae --- /dev/null +++ b/R/bspme-package.R @@ -0,0 +1,39 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +## usethis namespace: end +NULL + + +#' Dataset, ozone exposure +#' +#' This is a subset of "ozone2" dataset in fields package, only containing data from monitoring station with no missing values. +#' The 8-hour average (surface) ozone (from 9AM-4PM) measured in parts per billion (PPB) for 67 sites in the midwestern US over the period June 3,1987 through August 31, 1987, 89 days. +#' +#' @format A data frame with 5963 rows and 6 variables: +#' \describe{ +#' \item{date_id}{integer, 1 corresponds to 06/03/1987 and 89 corresponds to 08/31/1987} +#' \item{date}{POIXct, date} +#' \item{station_id}{character, station id} +#' \item{coords_lon}{numeric, longitude of monitoring station} +#' \item{coords_lat}{numeric, latitude of monitoring station} +#' \item{ozone_ppb}{8-hour average surface ozone from 9am-4pm in parts per billion (PPB)} +#' } +"ozone" + + +#' Dataset, simulated health data +#' +#' Simulated health data based on ozone exposures on 06/18/1987. For details, see \code{health_sim.R}. +#' +#' @format A data frame with n = 3000 rows and 4 variables: +#' \describe{ +#' \item{Y}{n by 1 matrix, numeric, simulated continuous health outcome} +#' \item{Ybinary}{n by 1 matrix, numeric, simulated binary health outcome} +#' \item{coords_y_lon}{n by 1 matrix, numeric, simulated health subject longitude} +#' \item{coords_y_lat}{n by 1 matrix, numeric, simulated health subject latitude} +#' \item{Z}{n by 1 matrix, numeric, covariate} +#' \item{X_true}{n by 1 matrix, numeric, true ozone exposure used for simulation} +#' } +"health_sim" diff --git a/R/vecchia_cov.R b/R/vecchia_cov.R new file mode 100644 index 0000000..44412bb --- /dev/null +++ b/R/vecchia_cov.R @@ -0,0 +1,117 @@ +#' +#' Vecchia approximation given a covariance matrix +#' +#' This function finds a sparse precision matrix (inverse precision matrix) Q that approximates the covariance matrix Sigma based on Vecchia's approximation. +#' Specifically, the nearest neighbor conditioning set are used based on the coords argument. +#' +#' @param Sigma n by n covariance matrix +#' @param coords n by 2 matrix, coordinates to determine neighborhood +#' @param n.neighbors positive integer, number of nearest neighbors to construct knn graph +#' @param ord length n vector, ordering of data. If NULL, ordering based on first coordinate will be used. +#' @param KLdiv logical, return KL divergence? +#' +#' @return list of (1) Q, the sparse precision matrix, +#' (2) ord, the ordering used for Vecchia approximation, +#' (3) cputime, time taken in seconds, +#' and optionally (4) KLdiv, the KL divergence KL(p, ptilde), where p is multivariate normal with mean 0 and covariance Sigma, and ptilde is multivariate normal with mean 0 and precision Q. +#' @importFrom Matrix Diagonal +#' +#' @export +#' +#' @examples +#' +#' n = 1000 +#' coords = cbind(runif(n), runif(n)) +#' Sigma = fields::Exp.cov(coords, aRange = 1) +#' fit5 = vecchia_cov(Sigma, coords, n.neighbors = 5, KLdiv = TRUE) +#' fit5$KLdiv +#' fit10 = vecchia_cov(Sigma, coords, n.neighbors = 10, KLdiv = TRUE) +#' fit10$KLdiv +#' +vecchia_cov = function(Sigma, coords, n.neighbors, ord = NULL, KLdiv = FALSE){ + start.time = Sys.time() + + n = ncol(Sigma) + if(nrow(Sigma) != n) stop("Sigma must be square matrix") + if(nrow(coords) != n) stop("number of coordinates are not same as dimension of Sigma") + if (missing(ord)) { + ord <- order(coords[, 1]) + } + else { + if (length(ord) != n) { + stop("error: supplied order vector ord must be of length n") + } + } + + indx <- spNNGP:::mkNNIndxCB(coords, n.neighbors, n.omp.threads = 1) # search.type == "cb" + nn.indx <- indx$nnIndx + n.indx = spNNGP:::mk.n.indx.list(nn.indx, n, n.neighbors) + + if(n.neighbors > 1){ + NN_ind <- t(sapply(1: (n - 1), get_NN_ind, n.indx[-1], n.neighbors)) + }else if(n.neighbors == 1){ # n.neighbors == 1 + NN_ind <- as.matrix(sapply(1: (n - 1), get_NN_ind, n.indx[-1], n.neighbors)) + }else{ # n.neighbors == 0 + Q = Matrix::Diagonal(n, x=1/diag(Sigma)) + Q = as(Q, "dgCMatrix") + out = list(Q = Q, ord = ord, cputime = difftime(Sys.time(),start.time, units = "secs")) + if(KLdiv){ + out$KLdiv = as.numeric(0.5*(- determinant(Q)$modulus - determinant(Sigma)$modulus)) + } + return(out) + } + Sigma = Sigma[ord,ord] # Sigma is now reordered, overwrite to reduce memory usage + + Nlist = list() + Nlist[[1]] = c() + for(i in 1:(n-1)){ + if(i<=n.neighbors){ + Nlist[[i+1]] <- NN_ind[i,1:i] + }else{ + Nlist[[i+1]] <- NN_ind[i,] + } + } + # sparse matrix + A = Matrix::Matrix(0, n, n); + D = numeric(n) #D = Matrix(0, n, n); + D[1] = 1 + # pseudocode 2 of Finley et al. + #Nlist[[i]] be the set of indices j + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "man/figures/README-", + out.width = "100%") +``` + + +# bspme + + +Go to package website: [[link]](https://changwoo-lee.github.io/bspme/) + +See a vignette with ozone exposure data: [[link]](https://changwoo-lee.github.io/bspme/articles/Ozone-exposure-and-health-data-analysis.html) + +**bspme** is an R package that provides a set of functions for **B**ayesian **sp**atially correlated **m**easurement **e**rror models, the Bayesian linear models with the presence of (spatially) correlated measurement error of covariate(s). For more details, please see the following paper: + + +> A scalable two-stage Bayesian approach accounting for exposure measurement error in +environmental epidemiology. +> Lee, C. J., Symanski, E., Rammah, A., Kang, D.H., Hopke, P., Park, E.S. (2023+). preprint (url here). + + +For illustration, consider a normal linear regression model with intercept, one covariate $X$ with measurement error, and one covariate $Z$ without measurement error: + +$$ +Y_i = \beta_0 + \beta_x X_i + \beta_z Z_i + \epsilon_i, \quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n, +$$ + +In the context of environmental epidemiology, the covariate $X_i$ can be an exposure to air pollution of subject $i$ at different locations, $Z_i$ can be demographic information, and $Y_i$ can be the associated health outcome. Since exposure $X_i$, $i=1,\dots,n$ are not directly measured at health subject locations but are predicted from air pollution monitoring station locations, this induces spatially correlated measurement error in $X_i$. Also, the uncertainty information should be taken account, which depends on the proximity of the monitoring station to the subject location. One way to incorporate this information is to use a multivariate normal (MVN) prior on a covariate $(X_1,\dots,X_n)$ for all subjects, + +$$ +(X_1,\dots,X_n)\sim \mathrm{N}_n(\mathbf{m}, \mathbf{Q}^{-1}), +$$ + +with some mean $\mathbf{m}$ and precision (inverse covariance) matrix $\mathbf{Q}$, referred as a MVN prior approach. + +The **bspme** package provides fast, scalable inference tools for Bayesian spatially correlated measurement error models, where measurement error information is represented as an MVN prior distribution with a **sparse precision matrix** $\mathbf{Q}$. +Naive choices of $\mathbf{Q}$, such as a sample precision matrix, make the MCMC posterior inference algorithm infeasible to run for a large number of subjects $n$ because of the $O(n^3)$ computational cost associated with the $n$-dimensional MVN prior. +With the sparse precision matrix $\mathbf{Q}$ obtained from the Vecchia approximation, the **bspme** package offers a fast, scalable algorithm to conduct posterior inference for large health datasets, with the number of subjects $n$ possibly reaching tens of thousands. + +## Installation + +You can install the development version of bspme with the following code: + +``` r +# install.packages("devtools") +devtools::install_github("changwoo-lee/bspme") +``` + + +## Functionality + +| Function | Description | +| ---------------------- | -------------------------------------------------------------------------| +| `blinreg_me()` | Bayesian normal linear regression models with a MVN prior for measurement errors | +| `blogireg_me()` | Bayesian logistic regression models with a MVN prior for measurement errors | +| `vecchia_cov()` | Perform the Vecchia approximation given a MVN covariance matrix | + + + +## datasets + +| Dataset call | Description | +| ---------------------- | -------------------------------------------------------------------------| +| `data("ozone")` | 1987 midwest ozone exposure data | +| `data("health_sim")` | Simulated health data corresponding to ozone exposure data | + +## Examples + +Please see the vignette "Ozone exposure and simulated health data". diff --git a/README.md b/README.md new file mode 100644 index 0000000..7fe8297 --- /dev/null +++ b/README.md @@ -0,0 +1,92 @@ + + + +# bspme + + + + +Go to package website: [\[link\]](https://changwoo-lee.github.io/bspme/) + +See a vignette with ozone exposure data: +[\[link\]](https://changwoo-lee.github.io/bspme/articles/Ozone-exposure-and-health-data-analysis.html) + +**bspme** is an R package that provides a set of functions for +**B**ayesian **sp**atially correlated **m**easurement **e**rror models, +the Bayesian linear models with the presence of (spatially) correlated +measurement error of covariate(s). For more details, please see the +following paper: + +> A scalable two-stage Bayesian approach accounting for exposure +> measurement error in environmental epidemiology. Lee, C. J., Symanski, +> E., Rammah, A., Kang, D.H., Hopke, P., Park, E.S. (2023+). preprint +> (url here). + +For illustration, consider a normal linear regression model with +intercept, one covariate $X$ with measurement error, and one covariate +$Z$ without measurement error: + +$$ +Y_i = \beta_0 + \beta_x X_i + \beta_z Z_i + \epsilon_i, \quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n, +$$ + +In the context of environmental epidemiology, the covariate $X_i$ can be +an exposure to air pollution of subject $i$ at different locations, +$Z_i$ can be demographic information, and $Y_i$ can be the associated +health outcome. Since exposure $X_i$, $i=1,\dots,n$ are not directly +measured at health subject locations but are predicted from air +pollution monitoring station locations, this induces spatially +correlated measurement error in $X_i$. Also, the uncertainty information +should be taken account, which depends on the proximity of the +monitoring station to the subject location. One way to incorporate this +information is to use a multivariate normal (MVN) prior on a covariate +$(X_1,\dots,X_n)$ for all subjects, + +$$ +(X_1,\dots,X_n)\sim \mathrm{N}_n(\mathbf{m}, \mathbf{Q}^{-1}), +$$ + +with some mean $\mathbf{m}$ and precision (inverse covariance) matrix +$\mathbf{Q}$, referred as a MVN prior approach. + +The **bspme** package provides fast, scalable inference tools for +Bayesian spatially correlated measurement error models, where +measurement error information is represented as an MVN prior +distribution with a **sparse precision matrix** $\mathbf{Q}$. Naive +choices of $\mathbf{Q}$, such as a sample precision matrix, make the +MCMC posterior inference algorithm infeasible to run for a large number +of subjects $n$ because of the $O(n^3)$ computational cost associated +with the $n$-dimensional MVN prior. With the sparse precision matrix +$\mathbf{Q}$ obtained from the Vecchia approximation, the **bspme** +package offers a fast, scalable algorithm to conduct posterior inference +for large health datasets, with the number of subjects $n$ possibly +reaching tens of thousands. + +## Installation + +You can install the development version of bspme with the following +code: + +``` r +# install.packages("devtools") +devtools::install_github("changwoo-lee/bspme") +``` + +## Functionality + +| Function | Description | +|-----------------|----------------------------------------------------------------------------------| +| `blinreg_me()` | Bayesian normal linear regression models with a MVN prior for measurement errors | +| `blogireg_me()` | Bayesian logistic regression models with a MVN prior for measurement errors | +| `vecchia_cov()` | Perform the Vecchia approximation given a MVN covariance matrix | + +## datasets + +| Dataset call | Description | +|----------------------|------------------------------------------------------------| +| `data("ozone")` | 1987 midwest ozone exposure data | +| `data("health_sim")` | Simulated health data corresponding to ozone exposure data | + +## Examples + +Please see the vignette “Ozone exposure and simulated health data”. diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 0000000..fffc7d9 --- /dev/null +++ b/_pkgdown.yml @@ -0,0 +1,6 @@ +url: https://changwoo-lee.github.io/bspme/ +template: + bootstrap: 5 +repo: + url: + home: https://github.com/changwoo-lee/bspme diff --git a/bspme.Rproj b/bspme.Rproj new file mode 100644 index 0000000..497f8bf --- /dev/null +++ b/bspme.Rproj @@ -0,0 +1,20 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source diff --git a/data-raw/health_sim.R b/data-raw/health_sim.R new file mode 100644 index 0000000..d7bfa56 --- /dev/null +++ b/data-raw/health_sim.R @@ -0,0 +1,128 @@ + +## code to prepare `healthdata_simul` dataset goes here +n_y = 3000 +library(fields) +data(ozone2) +lon.lat = ozone2$lon.lat +set.seed(1) +#coords_test = cbind(runif(2*n_y,min(lon.lat[,1]),max(lon.lat[,1])), runif(2*n_y,min(lon.lat[,2]),max(lon.lat[,2]))) + +# Chicago +coords_test1 = cbind(rnorm(1000, -87.73, 0.4), rnorm(1000, 41.89, 0.4)) +# Milwaukee +coords_test2 = cbind(rnorm(500, -87.91, 0.4), rnorm(500, 43.03, 0.4)) +# Indianapolis +coords_test3 = cbind(rnorm(500, -86.16, 0.4), rnorm(500, 39.76)) +# St Louis +coords_test4 = cbind(rnorm(500, -90.20, 0.4), rnorm(500, 38.63, 0.4)) +# Detroit +coords_test5 = cbind(rnorm(500, -83.1, 0.3), rnorm(500, 42.34, 0.3)) +# Louisville +coords_test6 = cbind(rnorm(200, -85.76, 0.3), rnorm(200, 38.25, 0.3)) +# Cincinnati +coords_test7 = cbind(rnorm(500, -84.51, 0.3), rnorm(500, 39.10, 0.3)) +# Columbus +coords_test8 = cbind(rnorm(500, -82.99, 0.4), rnorm(500, 39.96, 0.4)) +# Champaign +coords_test9 = cbind(rnorm(200, -88.24, 0.4), rnorm(200, 40.11, 0.4)) +# grand rapids +coords_test10 = cbind(rnorm(200, -85.67, 0.4), rnorm(200, 42.96, 0.4)) +# lansing +coords_test11 = cbind(rnorm(200, -84.55, 0.4), rnorm(200, 42.73, 0.4)) +# lexington +coords_test12 = cbind(rnorm(200, -84.49, 0.2), rnorm(200, 38.05, 0.2)) +# madison +coords_test13 = cbind(rnorm(100, -89.40, 0.4), rnorm(100, 43.07, 0.4)) +# davenport +coords_test14 = cbind(rnorm(100, -90.58, 0.4), rnorm(100, 41.52, 0.4)) +# peoria +coords_test15 = cbind(rnorm(100, -89.61, 0.2), rnorm(100, 40.69, 0.2)) +# evansville +coords_test16 = cbind(rnorm(100, -87.57, 0.2), rnorm(100, 37.97, 0.2)) +# la crosse +coords_test17 = cbind(rnorm(100, -91.25, 0.2), rnorm(100, 43.81, 0.2)) +# fort wayne +coords_test18 = cbind(rnorm(100, -85.14, 0.2), rnorm(100, 41.08, 0.2)) + +coords_test = rbind(coords_test1, coords_test2, coords_test3, coords_test4, coords_test5, coords_test6, + coords_test7, coords_test8, coords_test9, coords_test10, coords_test11, coords_test12, + coords_test13, coords_test14, coords_test15, coords_test16, coords_test17, coords_test18) + +# extract index so that x is between -87.9 and -86 +idx1 = which(coords_test[,1] > -87.9 & coords_test[,1] < -86) +# extract index so that y is between 41.7 and 45 +idx2 = which(coords_test[,2] > 41.7 & coords_test[,2] < 45) +idx = intersect(idx1, idx2) +if(length(idx) > 0) coords_test = coords_test[-idx, ] + +# extract index so that x is between -84 and -83 +idx1 = which(coords_test[,1] > -84 & coords_test[,1] < -82) +# extract index so that y is between 43.5 and 45 +idx2 = which(coords_test[,2] > 43.5 & coords_test[,2] < 45) +idx = intersect(idx1, idx2) +if(length(idx) > 0) coords_test = coords_test[-idx, ] + +# extract index so that x is between -84 and -82 +idx1 = which(coords_test[,1] > -83.5 & coords_test[,1] < -82) +# extract index so that y is between 41.5 and 42.5 +idx2 = which(coords_test[,2] > 41.5 & coords_test[,2] < 42.5) +idx = intersect(idx1, idx2) +if(length(idx) > 0) coords_test = coords_test[-idx, ] +nrow(coords_test) + +coords_test = coords_test[-sample.int(nrow(coords_test), nrow(coords_test) - n_y),] + +# simulate health data based on day 16 +load("data/ozone.rda") +ozone_day16 = ozone[ozone$date_id == 16,] +coords_train = cbind(ozone_day16$coords_lon, ozone_day16$coords_lat) +quilt.plot(coords_train, ozone_day16$ozone_ppb) +sigma <- 15 # standard deviation +a <- 300 # range, in miles +## Matrices needed for prediction + +distmat_xx <- rdist.earth(coords_train) +distmat_xy <- rdist.earth(coords_train, coords_test) +distmat_yy <- rdist.earth(coords_test) + +Sigmaxx = Matern(distmat_xx, smoothness = 0.5, range = a, phi = sigma^2) +Sigmaxy = Matern(distmat_xy, smoothness = 0.5, range = a, phi = sigma^2) +Sigmayy = Matern(distmat_yy, smoothness = 0.5, range = a, phi = sigma^2) + +## GP mean +pred_mean <- t(Sigmaxy) %*% solve(Sigmaxx,ozone_day16$ozone_ppb) +## GP cov +pred_cov <- Sigmayy - t(Sigmaxy) %*% solve(Sigmaxx,Sigmaxy) + +# draw predictive sample +set.seed(1) +true_exposure = mvnfast::rmvn(1, pred_mean, pred_cov) +true_exposure = as.vector(true_exposure) +# par(mfrow = c(1,3)) +# quilt.plot(coords_test, pred_mean, main = "mean"); US(add= T) +# quilt.plot(coords_test, sqrt(diag(pred_cov)), main = "sd"); US(add= T) +# quilt.plot(coords_test, true_exposure, main = "draw from MVN"); US(add= T) + +# simulated covariate +z = runif(n_y) + +# generate health data +Y = 100 -0.5*true_exposure + 5*z + rnorm(n_y, sd = 5) +var(Y) +var(-0.5*true_exposure + 5*z) + +linpred = -5 +0.05*true_exposure + 0.5*z +Ybinary = rbinom(n_y, 1, prob = 1/(1+exp(-linpred))) + +health_sim = data.frame(Y = Y, + Ybinary = Ybinary, + #X_mean = pred_mean, + #X_cov = pred_cov, + coords_y_lon = coords_test[,1], + coords_y_lat = coords_test[,2], + Z = z, + X_true = true_exposure) + + +usethis::use_data(health_sim, overwrite = TRUE) + diff --git a/data-raw/ozone.R b/data-raw/ozone.R new file mode 100644 index 0000000..06d8e84 --- /dev/null +++ b/data-raw/ozone.R @@ -0,0 +1,17 @@ +## code to prepare `ozone` dataset goes here + +library(fields) +data(ozone2) +idx = which(colSums(is.na(ozone2$y)) == 0) +station_id = ozone2$station.id[idx] +date = as.POSIXct(paste0("19",ozone2$dates), format = "%Y%m%d", tz = "UTC") +ymat = ozone2$y[,idx] +str(ymat) +ozone = data.frame( date_id = rep(1:89, each = 67), + date = rep(date, each = 67), + station_id = rep(station_id, 89), + coords_lon = rep(ozone2$lon.lat[idx,1], 89), + coords_lat = rep(ozone2$lon.lat[idx,2], 89), + ozone_ppb = as.vector(t(ymat))) + +usethis::use_data(ozone, overwrite = TRUE) diff --git a/data/health_sim.rda b/data/health_sim.rda new file mode 100644 index 0000000..ce5a313 Binary files /dev/null and b/data/health_sim.rda differ diff --git a/data/ozone.rda b/data/ozone.rda new file mode 100644 index 0000000..4f836a6 Binary files /dev/null and b/data/ozone.rda differ diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 0000000..4c1d952 --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,8 @@ +bibentry( +title = "A scalable two-stage Bayesian approach accounting for exposure measurement error in environmental epidemiology.", + year = "2023", + author = "Lee, Changwoo J. and Elaine Symanski and Amal Rammah and Dong Hun Kang and Philip K. Hopke and Eun Sug Park", + bibtype = "article", + url = "tobeadded", + journal = "arXiv preprint" +) diff --git a/man/blinreg_me.Rd b/man/blinreg_me.Rd new file mode 100644 index 0000000..fdde82e --- /dev/null +++ b/man/blinreg_me.Rd @@ -0,0 +1,115 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/blinreg_me.R +\name{blinreg_me} +\alias{blinreg_me} +\title{Bayesian normal linear regression models with correlated measurement errors} +\usage{ +blinreg_me( + Y, + X_mean, + X_prec, + Z, + nburn = 2000, + nsave = 2000, + nthin = 5, + prior = NULL, + saveX = F +) +} +\arguments{ +\item{Y}{n by 1 matrix, response} + +\item{X_mean}{n by 1 matrix or list of n by 1 matrices of length q, mean of X \eqn{\mu_X}.} + +\item{X_prec}{n by n matrix or list of n by n matrices of length q, precision matrix of X \eqn{Q_X}. Support sparse matrix format from Matrix package.} + +\item{Z}{n by p matrix, covariates without measurement error} + +\item{nburn}{integer, burn-in iteration} + +\item{nsave}{integer, number of posterior samples} + +\item{nthin}{integer, thin-in rate} + +\item{prior}{list of prior parameters, default is var_beta = 100,a_Y = 0.01, b_Y = 0.01} + +\item{saveX}{logical, save X or not} +} +\value{ +list of (1) posterior, the (nsave)x(q+p) matrix of posterior samples as a coda object, +(2) cputime, cpu time taken in seconds, +and optionally (3) X_save, posterior samples of X +} +\description{ +This function implements the Bayesian normal linear regression model with correlated measurement error of covariate(s). +Denote \eqn{Y_i} be a continuous response, \eqn{X_i} be a \eqn{q\times 1} covariate of \eqn{i}th observation that is subject to measurement error, +and \eqn{Z_i} be a \eqn{p\times 1} covariate without measurement error. +The likelihood model is Bayesian normal linear regression, +\deqn{Y_i = \beta_0 + X_i^\top \beta_x + Z_i^\top \beta_z + \epsilon_i,\quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n} +and correlated measurement error of \eqn{X_i, i=1,\dots,n} is incorporated into the model as a multivariate normal prior. For example when \eqn{q=1}, we have \eqn{n-}dimensional multivariate normal prior +\deqn{(X_1,\dots,X_n)\sim N_n(\mu_X, Q_X^{-1}).} +Also, we consider semiconjugate priors for regression coefficients and noise variance; +\deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{x,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{z,k} \stackrel{iid}{\sim} N(0, V_\beta), \quad \sigma_Y^2 \sim IG(a_Y, b_Y).} +The function \code{blinreg_me()} implements the Gibbs sampler for posterior inference. Most importantly, it allows sparse matrix input for \eqn{Q_X} for scalable computation. +} +\examples{ + +\dontrun{ +data(ozone) +data(health_sim) +library(bspme) +data(ozone) +data(health_sim) +library(fields) +# exposure mean and covariance at health subject locations with 06/18/1987 data (date id = 16) +# using Gaussian process with prior mean zero and exponential covariance kernel +# with fixed range 300 (in miles) and stdev 15 (in ppb) + +ozone16 = ozone[ozone$date_id==16,] + +Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) +Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), + cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) + +Kxx = Exponential(Dxx, range = 300, phi=15^2) +Kyy = Exponential(Dyy, range = 300, phi=15^2) +Kxy = Exponential(Dxy, range = 300, phi=15^2) + +X_mean = t(Kxy) \%*\% solve(Kxx, ozone16$ozone_ppb) +X_cov = Kyy - t(Kxy) \%*\% solve(Kxx, Kxy) + +# visualize +par(mfrow = c(1,3)) +quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), + ozone16$ozone_ppb, main = "ozone measurements"); US(add= T) + +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + X_mean, main = "health subjects, mean of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) + +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) + +# vecchia approximation +run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + n.neighbors = 10) +Q_sparse = run_vecchia$Q +run_vecchia$cputime + +# fit the model +fit_me = blm_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_sparse, # sparse precision matrix + Z = health_sim$Z, + nburn = 100, + nsave = 1000, + nthin = 1) +fit_me$cputime +summary(fit_me$posterior) +library(bayesplot) +bayesplot::mcmc_trace(fit_me$posterior) +} + +} diff --git a/man/bspme-package.Rd b/man/bspme-package.Rd new file mode 100644 index 0000000..ce3536a --- /dev/null +++ b/man/bspme-package.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bspme-package.R +\docType{package} +\name{bspme-package} +\alias{bspme} +\alias{bspme-package} +\title{bspme: Bayesian Spatial Measurement Error Models} +\description{ +Functions for fitting Bayesian linear and genearlized linear models in presence of measurement error of the covariate. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://changwoo-lee.github.io/bspme/} +} + +} +\keyword{internal} diff --git a/man/health_sim.Rd b/man/health_sim.Rd new file mode 100644 index 0000000..3f524ed --- /dev/null +++ b/man/health_sim.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bspme-package.R +\docType{data} +\name{health_sim} +\alias{health_sim} +\title{Dataset, simulated health data} +\format{ +A data frame with n = 3000 rows and 4 variables: +\describe{ +\item{Y}{n by 1 matrix, numeric, simulated continuous health outcome} +\item{Ybinary}{n by 1 matrix, numeric, simulated binary health outcome} +\item{coords_y_lon}{n by 1 matrix, numeric, simulated health subject longitude} +\item{coords_y_lat}{n by 1 matrix, numeric, simulated health subject latitude} +\item{Z}{n by 1 matrix, numeric, covariate} +\item{X_true}{n by 1 matrix, numeric, true ozone exposure used for simulation} +} +} +\usage{ +health_sim +} +\description{ +Simulated health data based on ozone exposures on 06/18/1987. For details, see \code{health_sim.R}. +} +\keyword{datasets} diff --git a/man/ozone.Rd b/man/ozone.Rd new file mode 100644 index 0000000..4037bd5 --- /dev/null +++ b/man/ozone.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bspme-package.R +\docType{data} +\name{ozone} +\alias{ozone} +\title{Dataset, ozone exposure} +\format{ +A data frame with 5963 rows and 6 variables: +\describe{ +\item{date_id}{integer, 1 corresponds to 06/03/1987 and 89 corresponds to 08/31/1987} +\item{date}{POIXct, date} +\item{station_id}{character, station id} +\item{coords_lon}{numeric, longitude of monitoring station} +\item{coords_lat}{numeric, latitude of monitoring station} +\item{ozone_ppb}{8-hour average surface ozone from 9am-4pm in parts per billion (PPB)} +} +} +\usage{ +ozone +} +\description{ +This is a subset of "ozone2" dataset in fields package, only containing data from monitoring station with no missing values. +The 8-hour average (surface) ozone (from 9AM-4PM) measured in parts per billion (PPB) for 67 sites in the midwestern US over the period June 3,1987 through August 31, 1987, 89 days. +} +\keyword{datasets} diff --git a/man/vecchia_cov.Rd b/man/vecchia_cov.Rd new file mode 100644 index 0000000..f91941a --- /dev/null +++ b/man/vecchia_cov.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/vecchia_cov.R +\name{vecchia_cov} +\alias{vecchia_cov} +\title{Vecchia approximation given a covariance matrix} +\usage{ +vecchia_cov(Sigma, coords, n.neighbors, ord = NULL, KLdiv = FALSE) +} +\arguments{ +\item{Sigma}{n by n covariance matrix} + +\item{coords}{n by 2 matrix, coordinates to determine neighborhood} + +\item{n.neighbors}{positive integer, number of nearest neighbors to construct knn graph} + +\item{ord}{length n vector, ordering of data. If NULL, ordering based on first coordinate will be used.} + +\item{KLdiv}{logical, return KL divergence?} +} +\value{ +list of (1) Q, the sparse precision matrix, +(2) ord, the ordering used for Vecchia approximation, +(3) cputime, time taken in seconds, +and optionally (4) KLdiv, the KL divergence KL(p, ptilde), where p is multivariate normal with mean 0 and covariance Sigma, and ptilde is multivariate normal with mean 0 and precision Q. +} +\description{ +This function finds a sparse precision matrix (inverse precision matrix) Q that approximates the covariance matrix Sigma based on Vecchia's approximation. +Specifically, the nearest neighbor conditioning set are used based on the coords argument. +} +\examples{ + +n = 1000 +coords = cbind(runif(n), runif(n)) +Sigma = fields::Exp.cov(coords, aRange = 1) +fit5 = vecchia_cov(Sigma, coords, n.neighbors = 5, KLdiv = TRUE) +fit5$KLdiv +fit10 = vecchia_cov(Sigma, coords, n.neighbors = 10, KLdiv = TRUE) +fit10$KLdiv + +} diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/Ozone-exposure-and-health-data-analysis.Rmd b/vignettes/Ozone-exposure-and-health-data-analysis.Rmd new file mode 100644 index 0000000..8eaebfc --- /dev/null +++ b/vignettes/Ozone-exposure-and-health-data-analysis.Rmd @@ -0,0 +1,180 @@ +--- +title: "Ozone exposure and health data analysis" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{ozone_simuldata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +This is a vignette for the `bspme` package based on ozone exposure data of midwest US, 1987 and simulated health dataset. + +First load the package and data: + + +```r +rm(list=ls()) +library(bspme) +library(fields) +data(ozone) +data(health_sim) +``` + +The ozone data are collected daily at 67 monitoring sites in midwest US during 89 days. We focus on ozone exposure on a specific date 06/18/1987 (date id = 16). We also consider simulated health dataset, with $n=3000$ subject locations and continuous health responses. + + +```r +ozone16 = ozone[ozone$date_id==16,] +par(mfrow = c(1,2)) +quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), + ozone16$ozone_ppb, main = "67 ozone monitoring sites"); US(add= T) +plot(health_sim$coords_y_lon, health_sim$coords_y_lat, pch = 19, cex = 0.5, col = "grey", + xlab = "Longitude", ylab = "Latitude", main = "3000 simulated health subject locations"); US(add = T) +``` + +
+plot of chunk ozone_eda +

plot of chunk ozone_eda

+
+ +### Obtain posterior predictive distribution at health subject locations + +In this example, we will use Gaussian process to obtain a predictive distribution of $(X_1,\dots,X_n)$, the ozone exposure at the health subject locations. The posterior predictive distribution is n-dimensional multivariate normal $N(\mu_X, Q_X^{-1})$, which will be used as a prior in health model to incorporate measurement error information of ozone exposure. +Specifically, we will use the exponential covariance kernel with fixed range 300 (in miles) and standard deviation 15 (in ppb). + + +```r + +# distance calculation +Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) +Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), + cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) + +# kernel matrices +Kxx = Exponential(Dxx, range = 300, phi=15^2) +Kyy = Exponential(Dyy, range = 300, phi=15^2) +Kxy = Exponential(Dxy, range = 300, phi=15^2) + +# posterior predictive mean and covariance +X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb) +X_cov = Kyy - t(Kxy) %*% solve(Kxx, Kxy) + +# visualization of posterior predictive distribution. Black triangle is monitoring station locations. +par(mfrow = c(1,2)) +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + X_mean, main = "health subjects, mean of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) + +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) +``` + +
+plot of chunk ozone_eda3 +

plot of chunk ozone_eda3

+
+ +The covariance of $(X_1,\dots,X_n)$ contains all spatial dependency information up to a second order, but its inverse $Q_X$ become a dense matrix. +When it comes to fitting the Bayesian linear model with measurement error, the naive implementation with n-dimensional multivariate normal prior $N(\mu_X, Q_X^{-1})$ takes cubic time complexity of posterior inference for each MCMC iteration, which becomes infeasible to run when $n$ is large such as tens of thousands. + + +### Run Vecchia approximation to get sparse precision matrix + +We propose to use Vecchia approximation to get a new multivariate normal prior of $(X_1,\dots,X_n)$ with sparse precision matrix, which is crucial for scalable implementation of health model with measurement error, but also at the same time it aims to best preserve the spatial dependency information in the original covariance matrix. + + +```r +#Run the Vecchia approximation to get the sparse precision matrix. + +# Vecchia approximation +run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + n.neighbors = 10) +Q_sparse = run_vecchia$Q +run_vecchia$cputime +#> Time difference of 0.796325 secs +``` + + +### Fit bspme with sparse precision matrix + +We can now fit the main function of bspme package with sparse precision matrix. + + +```r +# fit the model +fit_me = blinreg_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_sparse, # sparse precision matrix + Z = health_sim$Z, + nburn = 100, + nsave = 1000, + nthin = 1) +``` + +```r +fit_me$cputime +#> Time difference of 6.17423 secs +``` +It took less than 10 seconds to (1) run Vecchia approximation and (2) run the MCMC algorithm with total 1100 iterations. + + +```r +summary(fit_me$posterior) +#> +#> Iterations = 1:1000 +#> Thinning interval = 1 +#> Number of chains = 1 +#> Sample size per chain = 1000 +#> +#> 1. Empirical mean and standard deviation for each variable, +#> plus standard error of the mean: +#> +#> Mean SD Naive SE Time-series SE +#> (Intercept) 100.0234 0.624015 0.0197331 0.0259492 +#> exposure.1 -0.5021 0.007004 0.0002215 0.0003135 +#> covariate.1 4.3374 0.337301 0.0106664 0.0126324 +#> sigma2_Y 24.7826 0.712924 0.0225446 0.0280662 +#> +#> 2. Quantiles for each variable: +#> +#> 2.5% 25% 50% 75% 97.5% +#> (Intercept) 98.8360 99.6014 100.0057 100.4462 101.2774 +#> exposure.1 -0.5163 -0.5066 -0.5021 -0.4974 -0.4882 +#> covariate.1 3.6657 4.1060 4.3402 4.5681 4.9898 +#> sigma2_Y 23.3910 24.3080 24.7964 25.2340 26.1523 +bayesplot::mcmc_trace(fit_me$posterior) +``` + +![plot of chunk unnamed-chunk-3](figure/unnamed-chunk-3-1.png) + +### Fit bspme without sparse precision matrix + + +As mentioned before, if precision matrix is dense, the posterior computation becomes infeasible when $n$ becomes large, such as tens of thousands. See the following example when $n=3000$. + + +```r +# fit the model, without vecchia approximation +# I will only run 11 iteration for illustration purpose +Q_dense = solve(X_cov) # inverting 3000 x 3000 matrix, takes some time +# run +fit_me_dense = blinreg_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_dense, # dense precision + Z = health_sim$Z, + nburn = 1, + nsave = 10, + nthin = 1) +``` + +```r +fit_me_dense$cputime +#> Time difference of 58.06338 secs +``` + +With even only 11 MCMC iterations, It took about 1 mins to run the MCMC algorithm. +If number of health subject locations is tens of thousands, the naive algorithm using dense precision matrix becomes infeasible, and Vecchia approximation becomes necesssary to carry out the inference. diff --git a/vignettes/figure/.DS_Store b/vignettes/figure/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/vignettes/figure/.DS_Store differ diff --git a/vignettes/figure/ozone_eda-1.png b/vignettes/figure/ozone_eda-1.png new file mode 100644 index 0000000..c6c53af Binary files /dev/null and b/vignettes/figure/ozone_eda-1.png differ diff --git a/vignettes/figure/ozone_eda3-1.png b/vignettes/figure/ozone_eda3-1.png new file mode 100644 index 0000000..c8b8d88 Binary files /dev/null and b/vignettes/figure/ozone_eda3-1.png differ diff --git a/vignettes/figure/unnamed-chunk-3-1.png b/vignettes/figure/unnamed-chunk-3-1.png new file mode 100644 index 0000000..6d1c0e6 Binary files /dev/null and b/vignettes/figure/unnamed-chunk-3-1.png differ diff --git a/vignettes/ozone-exposure-and-health-data-analysis.Rmd.orig b/vignettes/ozone-exposure-and-health-data-analysis.Rmd.orig new file mode 100644 index 0000000..f9d676c --- /dev/null +++ b/vignettes/ozone-exposure-and-health-data-analysis.Rmd.orig @@ -0,0 +1,139 @@ +--- +title: "Ozone exposure and health data analysis" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{ozone_simuldata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +This is a vignette for the `bspme` package based on ozone exposure data of midwest US, 1987 and simulated health dataset. + +First load the package and data: + +```{r setup} +rm(list=ls()) +library(bspme) +library(fields) +data(ozone) +data(health_sim) +``` + +The ozone data are collected daily at 67 monitoring sites in midwest US during 89 days. We focus on ozone exposure on a specific date 06/18/1987 (date id = 16). We also consider simulated health dataset, with $n=3000$ subject locations and continuous health responses. + +```{r, ozone_eda, fig.width = 10, fig.height=5, out.width ="100%"} +ozone16 = ozone[ozone$date_id==16,] +par(mfrow = c(1,2)) +quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), + ozone16$ozone_ppb, main = "67 ozone monitoring sites"); US(add= T) +plot(health_sim$coords_y_lon, health_sim$coords_y_lat, pch = 19, cex = 0.5, col = "grey", + xlab = "Longitude", ylab = "Latitude", main = "3000 simulated health subject locations"); US(add = T) +``` + +### Obtain posterior predictive distribution at health subject locations + +In this example, we will use Gaussian process to obtain a predictive distribution of $(X_1,\dots,X_n)$, the ozone exposure at the health subject locations. The posterior predictive distribution is n-dimensional multivariate normal $N(\mu_X, Q_X^{-1})$, which will be used as a prior in health model to incorporate measurement error information of ozone exposure. +Specifically, we will use the exponential covariance kernel with fixed range 300 (in miles) and standard deviation 15 (in ppb). + +```{r, ozone_eda3, fig.width = 10, fig.height=5, out.width ="100%"} + +# distance calculation +Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) +Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), + cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) + +# kernel matrices +Kxx = Exponential(Dxx, range = 300, phi=15^2) +Kyy = Exponential(Dyy, range = 300, phi=15^2) +Kxy = Exponential(Dxy, range = 300, phi=15^2) + +# posterior predictive mean and covariance +X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb) +X_cov = Kyy - t(Kxy) %*% solve(Kxx, Kxy) + +# visualization of posterior predictive distribution. Black triangle is monitoring station locations. +par(mfrow = c(1,2)) +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + X_mean, main = "health subjects, mean of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) + +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) +``` + +The covariance of $(X_1,\dots,X_n)$ contains all spatial dependency information up to a second order, but its inverse $Q_X$ become a dense matrix. +When it comes to fitting the Bayesian linear model with measurement error, the naive implementation with n-dimensional multivariate normal prior $N(\mu_X, Q_X^{-1})$ takes cubic time complexity of posterior inference for each MCMC iteration, which becomes infeasible to run when $n$ is large such as tens of thousands. + + +### Run Vecchia approximation to get sparse precision matrix + +We propose to use Vecchia approximation to get a new multivariate normal prior of $(X_1,\dots,X_n)$ with sparse precision matrix, which is crucial for scalable implementation of health model with measurement error, but also at the same time it aims to best preserve the spatial dependency information in the original covariance matrix. + +```{r, vecchia} +#Run the Vecchia approximation to get the sparse precision matrix. + +# Vecchia approximation +run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + n.neighbors = 10) +Q_sparse = run_vecchia$Q +run_vecchia$cputime +``` + + +### Fit bspme with sparse precision matrix + +We can now fit the main function of bspme package with sparse precision matrix. + +```{r, sparse, results = "hide"} +# fit the model +fit_me = blinreg_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_sparse, # sparse precision matrix + Z = health_sim$Z, + nburn = 100, + nsave = 1000, + nthin = 1) +``` +```{r} +fit_me$cputime +``` +It took less than 10 seconds to (1) run Vecchia approximation and (2) run the MCMC algorithm with total 1100 iterations. + +```{r} +summary(fit_me$posterior) +bayesplot::mcmc_trace(fit_me$posterior) +``` + +### Fit bspme without sparse precision matrix + + +As mentioned before, if precision matrix is dense, the posterior computation becomes infeasible when $n$ becomes large, such as tens of thousands. See the following example when $n=3000$. + +```{r, dense, results = "hide"} +# fit the model, without vecchia approximation +# I will only run 11 iteration for illustration purpose +Q_dense = solve(X_cov) # inverting 3000 x 3000 matrix, takes some time +# run +fit_me_dense = blinreg_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_dense, # dense precision + Z = health_sim$Z, + nburn = 1, + nsave = 10, + nthin = 1) +``` +```{r} +fit_me_dense$cputime +``` + +With even only 11 MCMC iterations, It took about 1 mins to run the MCMC algorithm. +If number of health subject locations is tens of thousands, the naive algorithm using dense precision matrix becomes infeasible, and Vecchia approximation becomes necesssary to carry out the inference.