
Commit

recommit all things again
changwoo-lee committed Dec 29, 2023
0 parents commit 33c8f7f
Showing 32 changed files with 1,390 additions and 0 deletions.
Binary file added .DS_Store
Binary file not shown.
9 changes: 9 additions & 0 deletions .Rbuildignore
@@ -0,0 +1,9 @@
^.*\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
^README\.Rmd$
^data-raw$
^_pkgdown\.yml$
^docs$
^pkgdown$
^\.github$
1 change: 1 addition & 0 deletions .github/.gitignore
@@ -0,0 +1 @@
*.html
48 changes: 48 additions & 0 deletions .github/workflows/pkgdown.yaml
@@ -0,0 +1,48 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
release:
types: [published]
workflow_dispatch:

name: pkgdown

jobs:
pkgdown:
runs-on: ubuntu-latest
# Only restrict concurrency for non-PR jobs
concurrency:
group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
permissions:
contents: write
steps:
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v2
with:
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::pkgdown, local::.
needs: website

- name: Build site
run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
shell: Rscript {0}

- name: Deploy to GitHub pages 🚀
if: github.event_name != 'pull_request'
uses: JamesIves/[email protected]
with:
clean: false
branch: gh-pages
folder: docs
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
docs
inst/doc
27 changes: 27 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,27 @@
Package: bspme
Type: Package
Title: Bayesian Spatial Measurement Error Models
Version: 0.2.0
Authors@R: c(person("Changwoo", "Lee", role=c("aut", "cre"), email="[email protected]"), person("Elaine", "Symanski", role = c('aut')), person("Amal", "Rammah", role = c('aut')), person("Dong Hun", "Kang", role = c('aut')), person("Philip", "Hopke", role = c('aut')), person("Eun Sug", "Park", role = c("aut")))
Author: Changwoo Lee[aut, cre], Eun Sug Park[aut], Elaine Symanski[aut], Amal Rammah[aut], Dong Hun Kang[aut], Philip Hopke[aut]
Maintainer: Changwoo Lee <[email protected]>
Description: Functions for fitting Bayesian linear and generalized linear models in the presence of spatial measurement error in the covariates.
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Imports:
coda,
fields,
spam,
spNNGP
Depends:
Matrix,
R (>= 2.10)
URL: https://changwoo-lee.github.io/bspme/
BugReports: https://github.com/changwoo-lee/bspme/issues
Suggests:
knitr,
rmarkdown
VignetteBuilder: knitr
21 changes: 21 additions & 0 deletions LICENSE.md
@@ -0,0 +1,21 @@
# MIT License

Copyright (c) 2023 bspme authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
5 changes: 5 additions & 0 deletions NAMESPACE
@@ -0,0 +1,5 @@
# Generated by roxygen2: do not edit by hand

export(blinreg_me)
export(vecchia_cov)
importFrom(Matrix,Diagonal)
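The NAMESPACE above exports two functions and imports `Matrix::Diagonal`, reflecting that `blinreg_me()` is documented to accept sparse precision matrices for `X_prec`. A minimal sketch of constructing such a sparse matrix with the Matrix package (a toy diagonal precision for independent errors, purely illustrative):

```r
library(Matrix)

n <- 5
# Sparse diagonal precision matrix (independent errors with sd = 0.5);
# Diagonal() returns a sparse-class object rather than a dense base matrix
Q <- Diagonal(n, x = 1 / 0.5^2)

is(Q, "sparseMatrix")  # inherits from the Matrix sparse classes
as.numeric(Q %*% rep(1, n))  # matrix-vector products work as usual
```

A realistic `X_prec` would instead come from something like the package's `vecchia_cov()` output, but the storage format is the same.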
221 changes: 221 additions & 0 deletions R/blinreg_me.R
@@ -0,0 +1,221 @@
#' Bayesian normal linear regression models with correlated measurement errors
#'
#' This function implements a Bayesian normal linear regression model with correlated measurement errors in the covariate(s).
#' Let \eqn{Y_i} be a continuous response, \eqn{X_i} a \eqn{q\times 1} covariate of the \eqn{i}th observation that is subject to measurement error,
#' and \eqn{Z_i} a \eqn{p\times 1} covariate without measurement error.
#' The likelihood model is Bayesian normal linear regression,
#' \deqn{Y_i = \beta_0 + X_i^\top \beta_x + Z_i^\top \beta_z + \epsilon_i,\quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n}
#' and correlated measurement error in \eqn{X_i, i=1,\dots,n} is incorporated into the model as a multivariate normal prior. For example, when \eqn{q=1}, we have the \eqn{n}-dimensional multivariate normal prior
#' \deqn{(X_1,\dots,X_n)\sim N_n(\mu_X, Q_X^{-1}).}
#' Also, we consider semiconjugate priors for regression coefficients and noise variance;
#' \deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{x,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{z,k} \stackrel{iid}{\sim} N(0, V_\beta), \quad \sigma_Y^2 \sim IG(a_Y, b_Y).}
#' The function \code{blinreg_me()} implements the Gibbs sampler for posterior inference. Most importantly, it allows sparse matrix input for \eqn{Q_X} for scalable computation.
#'
#' @param Y n by 1 matrix, response
#' @param X_mean n by 1 matrix, or list of length q of n by 1 matrices, mean of X, \eqn{\mu_X}.
#' @param X_prec n by n matrix, or list of length q of n by n matrices, precision matrix of X, \eqn{Q_X}. Supports the sparse matrix format from the Matrix package.
#' @param Z n by p matrix, covariates without measurement error
#' @param nburn integer, number of burn-in iterations
#' @param nthin integer, thinning rate; every nthin-th draw after burn-in is saved
#' @param nsave integer, number of posterior samples to save
#' @param prior list of prior hyperparameters; default is var_beta = 100, a_Y = 0.01, b_Y = 0.01
#' @param saveX logical, whether to save posterior samples of X
#'
#' @return list of (1) posterior, a (nsave) x (2+q+p) matrix of posterior samples (intercept, exposure and covariate coefficients, and sigma2_Y) as a coda::mcmc object,
#' (2) cputime, CPU time taken in seconds,
#' and optionally (3) X_save, posterior samples of X
#' @export
#'
#' @examples
#'
#'\dontrun{
#' library(bspme)
#' library(fields)
#' data(ozone)
#' data(health_sim)
#' # exposure mean and covariance at health subject locations with 06/18/1987 data (date id = 16)
#' # using Gaussian process with prior mean zero and exponential covariance kernel
#' # with fixed range 300 (in miles) and stdev 15 (in ppb)
#'
#' ozone16 = ozone[ozone$date_id==16,]
#'
#' Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat))
#' Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat))
#' Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat),
#' cbind(health_sim$coords_y_lon, health_sim$coords_y_lat))
#'
#' Kxx = Exponential(Dxx, range = 300, phi=15^2)
#' Kyy = Exponential(Dyy, range = 300, phi=15^2)
#' Kxy = Exponential(Dxy, range = 300, phi=15^2)
#'
#' X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb)
#' X_cov = Kyy - t(Kxy) %*% solve(Kxx, Kxy)
#'
#' # visualize
#' par(mfrow = c(1,3))
#' quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat),
#' ozone16$ozone_ppb, main = "ozone measurements"); US(add= T)
#'
#' quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat),
#' X_mean, main = "health subjects, mean of exposure"); US(add= T)
#' points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17)
#'
#' quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat),
#' sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T)
#' points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17)
#'
#' # vecchia approximation
#' run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat),
#' n.neighbors = 10)
#' Q_sparse = run_vecchia$Q
#' run_vecchia$cputime
#'
#' # fit the model
#' fit_me = blinreg_me(Y = health_sim$Y,
#' X_mean = X_mean,
#' X_prec = Q_sparse, # sparse precision matrix
#' Z = health_sim$Z,
#' nburn = 100,
#' nsave = 1000,
#' nthin = 1)
#' fit_me$cputime
#' summary(fit_me$posterior)
#' library(bayesplot)
#' bayesplot::mcmc_trace(fit_me$posterior)
#' }
#'
blinreg_me <- function(Y,
X_mean,
X_prec,
Z,
nburn = 2000,
nsave = 2000,
nthin = 5,
prior = NULL,
saveX = FALSE){

# prior input, default
if(is.null(prior)){
prior = list(var_beta = 100, a_Y = 0.01, b_Y = 0.01)
}
var_beta = prior$var_beta
a_Y = prior$a_Y
b_Y = prior$b_Y

n_y = length(Y)
if(is.vector(Z)) Z = as.matrix(Z)

if(!is.list(X_mean) & !is.list(X_prec)){
q = 1
X_mean = list(X_mean)
X_prec = list(X_prec)
}else if(is.list(X_mean) & is.list(X_prec)){
q = length(X_mean)
if(length(X_prec)!=q) stop("list length does not match")
}else{
stop("X_mean is not vector/matrix or list")
}
X_prec_X_mean = list()
X_spamstruct = vector(mode = 'list', length = q)
sparsealgo = rep(TRUE, q)

for(qq in 1:q){
X_prec_X_mean[[qq]] = as.numeric(X_prec[[qq]]%*%X_mean[[qq]])

if(!("sparseMatrix" %in% is(X_prec[[qq]]))){
print(paste0(qq,"th X_prec is not a sparse matrix! Using a dense algorithm, which may be very slow when n is large"))
sparsealgo[qq] = F
}else{
X_prec[[qq]] = as(as(X_prec[[qq]], "generalMatrix"), "CsparseMatrix")
X_prec[[qq]] = spam::as.spam.dgCMatrix(X_prec[[qq]])# spam object
X_spamstruct[[qq]] = spam::chol(X_prec[[qq]])
}
}

X = matrix(0, n_y, q)
for(qq in 1:q) X[,qq] = X_mean[[qq]]
if(is.null(names(X_mean))){
colnames(X) = paste0("exposure.",1:q)
}else{
colnames(X) = paste0("exposure.",names(X_mean))
}

p = ncol(Z)
if(is.null(colnames(Z))){
colnames(Z) = paste0("covariate.",1:p)
}else{
colnames(Z) = paste0("covariate.",colnames(Z))
}
df_temp = as.data.frame(cbind(X,Z))
D = model.matrix( ~ ., df_temp)


# prior
Sigma_beta = diag(var_beta, ncol(D)) # prior covariance of all ncol(D) coefficients (intercept, beta_x, beta_z)
Sigma_betainv = solve(Sigma_beta)

# initialize
sigma2_Y = 1
beta = rep(0.1, ncol(D))

sigma2_save = matrix(0, nsave, 1)
colnames(sigma2_save) = "sigma2_Y"
beta_save = matrix(0, nsave, ncol(D))
colnames(beta_save) <- colnames(D)
if(saveX){
X_save = array(0, dim = c(nsave, n_y, q))
dimnames(X_save)[[3]] = names(X_mean)
}

YtY = crossprod(Y)
#browser()
# sampler starts
isave = 0
isnegative = numeric(n_y)
pb <- txtProgressBar(style=3)
t_start = Sys.time()
for(imcmc in 1:(nsave*nthin + nburn)){
setTxtProgressBar(pb, imcmc/(nsave*nthin + nburn))
# sample beta
Vbetainv = Sigma_betainv + crossprod(D)/sigma2_Y
betatilde = solve(Vbetainv,crossprod(D,Y)/sigma2_Y)
beta = as.numeric(spam::rmvnorm.prec(1, mu = betatilde, Q = Vbetainv))
# sample sigma2_Y
SSR = crossprod(Y - D%*%beta)
sigma2_Y = 1/rgamma(1, a_Y + n_y/2, b_Y + SSR/2 )

for(qq in 1:q){
# 1st is intercept
b_G = X_prec_X_mean[[qq]] + beta[qq + 1]/sigma2_Y*(Y-D[,-(qq+1)]%*%beta[-(qq+1)])
Qtilde = X_prec[[qq]] # dense or spam
if(sparsealgo[qq]){
Qtilde = Qtilde + spam::diag.spam(beta[qq + 1]^2/sigma2_Y, n_y, n_y)
}else{
diag(Qtilde) = diag(Qtilde) + beta[qq + 1]^2/sigma2_Y
}
Xstar = spam::rmvnorm.canonical(1, b = as.vector(b_G),
Q = Qtilde,# dense or spam
Rstruct = X_spamstruct[[qq]]) #browser()
if(imcmc > nburn) isnegative = isnegative + (Xstar<0)
D[,(qq+1)] = as.vector(Xstar)
}


if((imcmc > nburn)&(imcmc%%nthin==0)){
isave = isave + 1
sigma2_save[isave] = sigma2_Y
beta_save[isave,] = beta
if(saveX) X_save[isave,,] = D[,2:(q+1)]
}
}
t_diff = difftime(Sys.time(), t_start, units = "secs")
#print(paste0("Exposure components contain negative values a total of ",sum(isnegative)," times among (# exposures) x n_y x (MCMC iterations after burn-in) = ",q," x ",n_y," x ",nsave*nthin," instances"))
out = list()
out$posterior = cbind(beta_save, sigma2_save)
out$posterior = coda::mcmc(out$posterior)
out$cputime = t_diff
if(saveX) out$X_save = X_save
out
}
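The semiconjugate updates that `blinreg_me()` documents (normal priors on the coefficients, inverse-gamma on the noise variance) can be illustrated with a self-contained base-R toy sampler. This is a sketch of the two conjugate Gibbs steps only, on simulated data with no measurement error; it is not the package's sampler, and the names `V_beta`, `a_Y`, `b_Y` simply follow the priors stated in the docblock:

```r
# Toy Gibbs sampler for y = D %*% beta + eps, eps ~ N(0, sigma2_Y),
# with beta_j ~ N(0, V_beta) and sigma2_Y ~ IG(a_Y, b_Y)
set.seed(1)
n <- 200
D <- cbind(1, rnorm(n))                      # intercept + one covariate
beta_true <- c(1, 2)
y <- as.numeric(D %*% beta_true + rnorm(n, sd = 0.5))

V_beta <- 100; a_Y <- 0.01; b_Y <- 0.01      # default hyperparameters
Sigma_beta_inv <- diag(1 / V_beta, ncol(D))
sigma2 <- 1
nsave <- 1000
beta_save <- matrix(0, nsave, ncol(D))
sigma2_save <- numeric(nsave)

for (i in seq_len(nsave)) {
  # beta | sigma2, y  ~  N(betatilde, Vbetainv^{-1})
  Vbetainv  <- Sigma_beta_inv + crossprod(D) / sigma2
  betatilde <- solve(Vbetainv, crossprod(D, y) / sigma2)
  R <- chol(Vbetainv)                        # upper triangular, R'R = Vbetainv
  beta <- as.numeric(betatilde + backsolve(R, rnorm(ncol(D))))
  # sigma2 | beta, y  ~  IG(a_Y + n/2, b_Y + SSR/2)
  SSR <- sum((y - D %*% beta)^2)
  sigma2 <- 1 / rgamma(1, shape = a_Y + n / 2, rate = b_Y + SSR / 2)
  beta_save[i, ]  <- beta
  sigma2_save[i]  <- sigma2
}
colMeans(beta_save)   # close to beta_true
mean(sigma2_save)     # close to 0.5^2
```

The package's sampler adds a third block, drawing the mismeasured exposure `X` from its multivariate normal full conditional (via `spam::rmvnorm.canonical` when `X_prec` is sparse), which is the step the toy above omits.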
39 changes: 39 additions & 0 deletions R/bspme-package.R
@@ -0,0 +1,39 @@
#' @keywords internal
"_PACKAGE"

## usethis namespace: start
## usethis namespace: end
NULL


#' Dataset, ozone exposure
#'
#' This is a subset of the "ozone2" dataset in the fields package, containing only monitoring stations with no missing values.
#' The 8-hour average (surface) ozone (from 9AM to 4PM), measured in parts per billion (ppb), for 67 sites in the midwestern US over the period June 3, 1987 through August 31, 1987 (89 days).
#'
#' @format A data frame with 5963 rows and 6 variables:
#' \describe{
#' \item{date_id}{integer, 1 corresponds to 06/03/1987 and 89 corresponds to 08/31/1987}
#'   \item{date}{POSIXct, date}
#' \item{station_id}{character, station id}
#' \item{coords_lon}{numeric, longitude of monitoring station}
#' \item{coords_lat}{numeric, latitude of monitoring station}
#' \item{ozone_ppb}{8-hour average surface ozone from 9am-4pm in parts per billion (PPB)}
#' }
"ozone"


#' Dataset, simulated health data
#'
#' Simulated health data based on ozone exposures on 06/18/1987. For details, see \code{health_sim.R}.
#'
#' @format A data frame with n = 3000 rows and 6 variables:
#' \describe{
#' \item{Y}{n by 1 matrix, numeric, simulated continuous health outcome}
#' \item{Ybinary}{n by 1 matrix, numeric, simulated binary health outcome}
#' \item{coords_y_lon}{n by 1 matrix, numeric, simulated health subject longitude}
#' \item{coords_y_lat}{n by 1 matrix, numeric, simulated health subject latitude}
#' \item{Z}{n by 1 matrix, numeric, covariate}
#' \item{X_true}{n by 1 matrix, numeric, true ozone exposure used for simulation}
#' }
"health_sim"