diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
index d4c3cf9..0f2fe08 100644
--- a/.github/workflows/R-CMD-check.yaml
+++ b/.github/workflows/R-CMD-check.yaml
@@ -43,7 +43,7 @@ jobs:
 
       - uses: r-lib/actions/setup-r-dependencies@v2
         with:
-          extra-packages: any::rcmdcheck, margins=?ignore
+          extra-packages: any::rcmdcheck
           needs: check
 
       - uses: r-lib/actions/check-r-package@v2
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
index b6e0377..c9f0165 100644
--- a/.github/workflows/pkgdown.yaml
+++ b/.github/workflows/pkgdown.yaml
@@ -34,14 +34,9 @@ jobs:
 
       - uses: r-lib/actions/setup-r-dependencies@v2
         with:
-          extra-packages: any::pkgdown, local::., margins=?ignore
+          extra-packages: any::pkgdown, local::.
           needs: website
 
-      - name: Install archived dependencies (margins)
-        run: |
-          R -e 'install.packages("https://cran.r-project.org/src/contrib/Archive/prediction/prediction_0.3.17.tar.gz")'
-          R -e 'install.packages("https://cran.r-project.org/src/contrib/Archive/margins/margins_0.3.26.tar.gz")'
-
       - name: Build site
         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
         shell: Rscript {0}
diff --git a/.github/workflows/rhub.yaml b/.github/workflows/rhub.yaml
index e16437f..a66ce37 100644
--- a/.github/workflows/rhub.yaml
+++ b/.github/workflows/rhub.yaml
@@ -60,7 +60,7 @@ jobs:
         with:
           token: ${{ secrets.RHUB_TOKEN }}
           job-config: ${{ matrix.config.job-config }}
-          extra-packages: any::rcmdcheck, margins=?ignore
+          extra-packages: any::rcmdcheck
       - uses: r-hub/actions/run-check@v1
         with:
           token: ${{ secrets.RHUB_TOKEN }}
@@ -90,7 +90,7 @@ jobs:
         with:
           job-config: ${{ matrix.config.job-config }}
           token: ${{ secrets.RHUB_TOKEN }}
-          extra-packages: any::rcmdcheck, margins=?ignore
+          extra-packages: any::rcmdcheck
       - uses: r-hub/actions/run-check@v1
         with:
           job-config: ${{ matrix.config.job-config }}
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
index 6365cf6..e9e4c6f 100644
--- a/.github/workflows/test-coverage.yaml
+++ b/.github/workflows/test-coverage.yaml
@@ -23,7 +23,7 @@ jobs:
 
       - uses: r-lib/actions/setup-r-dependencies@v2
         with:
-          extra-packages: any::covr, margins=?ignore
+          extra-packages: any::covr
           needs: coverage
 
       - name: Test coverage
diff --git a/DESCRIPTION b/DESCRIPTION
index 9969e12..f7800dd 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: beeca
 Title: Binary Endpoint Estimation with Covariate Adjustment
-Version: 0.1.3.9000
+Version: 0.2.0
 Authors@R: 
   c(person(given = "Alex", 
            family = "Przybylski", 
@@ -33,7 +33,7 @@ Maintainer: Alex Przybylski <alexander.przybylski@novartis.com>
 License: LGPL (>= 3)
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Suggests: 
     knitr,
     rmarkdown,
@@ -41,7 +41,7 @@ Suggests:
     tidyr,
     marginaleffects,
     margins,
-    RobinCar (== 0.3.0)
+    RobinCar (>= 0.3.0)
 Config/testthat/edition: 3
 Depends: 
     R (>= 2.10)
diff --git a/NAMESPACE b/NAMESPACE
index 0d6a44d..0ce768e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -17,4 +17,5 @@ importFrom(stats,predict)
 importFrom(stats,terms)
 importFrom(stats,var)
 importFrom(stats,vcov)
+importFrom(utils,combn)
 importFrom(utils,packageVersion)
diff --git a/NEWS.md b/NEWS.md
index b165148..93fc6ee 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,6 @@
-# beeca (development version)
+# beeca 0.2.0
+
+- Extensions to allow for more than two treatment arms in the model fit.
 
 # beeca 0.1.3
 
diff --git a/R/apply_contrast.R b/R/apply_contrast.R
index d4b042e..d6d6b7c 100644
--- a/R/apply_contrast.R
+++ b/R/apply_contrast.R
@@ -27,9 +27,10 @@
 #' The choice of contrast affects how treatment effects are calculated and
 #' interpreted. Default is `diff`.
 #'
-#' @param reference a string indicating which treatment group should be considered as
-#' the reference level. Accepted values are one of the levels in the treatment
-#' variable. Default to the first level used in the `glm` object.
+#' @param reference a string or list of strings indicating which treatment
+#' group(s) to use as reference level for pairwise comparisons. Accepted values
+#' must be a subset of the levels in the treatment variable. Default to the
+#' first n-1 treatment levels used in the `glm` object.
 #'
 #' This parameter influences the calculation of treatment effects
 #' relative to the chosen reference group.
@@ -68,10 +69,11 @@
 #' fit3$marginal_est
 #' fit3$marginal_se
 #'
+#' @importFrom utils combn
 #' @export
 #'
 apply_contrast <- function(object, contrast = c("diff", "rr", "or", "logrr", "logor"), reference) {
-  # assert means are available in object
+  # Assert means are available in object
   if (!"counterfactual.means" %in% names(object)) {
     msg <- sprintf(
       "Missing counterfactual means. First run `%1$s <- average_predictions(%1$s)`.",
@@ -79,7 +81,7 @@ apply_contrast <- function(object, contrast = c("diff", "rr", "or", "logrr", "lo
     )
     stop(msg, call. = FALSE)
   }
-  # assert varcov is available in object
+  # Assert varcov is available in object
   if (!"robust_varcov" %in% names(object)) {
     msg <- sprintf(
       "Missing robust varcov. First run `%1$s <- get_varcov(%1$s, ...)`.",
@@ -92,53 +94,81 @@ apply_contrast <- function(object, contrast = c("diff", "rr", "or", "logrr", "lo
   object <- .assert_sanitized(object, trt, warn = T)
 
   data <- .get_data(object)
-  # assert non-missing reference
+
+  # Assert non-missing reference or set default
   if (missing(reference)) {
-    reference <- object$xlevels[[trt]][1]
-    warning(sprintf("No reference argument was provided, using %s as the reference level", reference), call. = FALSE)
+    reference <- object$xlevels[[trt]][-nlevels(data[[trt]])]
+    warning(sprintf("No reference argument was provided, using {%s} as the reference level(s)", paste(reference, collapse = ", ")), call. = FALSE)
+  }
+
+  # Assert reference to be subset of the treatment levels
+  if (!all(reference %in% levels(data[[trt]]))) {
+    stop("Reference levels must be a subset of treatment levels : ", paste(levels(data[[trt]]), collapse = ", "), ".")
   }
 
-  # assert reference to be one of the treatment levels
-  if (!reference %in% levels(data[[trt]])) {
-    stop("Reference must be one of : ", paste(levels(data[[trt]]), collapse = ", "), ".")
+  # Only accept at most nlevels(trt)-1 reference levels
+  if (length(reference) > (nlevels(data[[trt]])-1)) {
+    stop(sprintf("Too many reference levels provided. Expecting at most %s value(s) from the possible treatment levels: {%s}",
+                 nlevels(data[[trt]]) - 1,
+                 paste(levels(data[[trt]]), collapse = ", ")))
   }
 
-  # match contrast argument and retrieve relevant functions
+  # Match contrast argument and retrieve relevant helper functions
   contrast <- match.arg(contrast)
   c_est <- get(contrast)
   c_str <- get(paste0(contrast, "_str"))
   c_grad <- get(paste0("grad_", contrast))
 
-  # set reference to 1 if it is the first level
-  reference_n <- which(levels(data[[trt]]) == reference)
-  cf_mean_ref <- object$counterfactual.means[reference_n]
-  cf_mean_inv <- object$counterfactual.means[3 - reference_n]
-
-  trt_ref <- reference
-  trt_inv <- levels(data[[trt]])[-reference_n]
+  # Extract counterfactual means
+  cf_means <- object$counterfactual.means
 
-  # apply contrast to point estimate
-  marginal_est <- c_est(cf_mean_inv, cf_mean_ref)
+  # Get all pairwise arm comparisons for contrasts
+  # Format according to specified reference levels
+  l <- levels(data[[trt]])
+  combos <- combn(l, 2, \(x) {
+    if (any(reference %in% x)) {
+      ref <- intersect(reference, x)[[1]]
+      comp <- x[x!=ref]
+      c(which(ref == l), which(comp==l))
+    } else {
+      c(NA, NA)
+    }
+  })
+  combos <- combos[, is.finite(colSums(combos)), drop=F]
+  if (length(reference) > 1) {
+    combos <- combos[,order(combos[1,])]
+  }
+  idxs <- matrix(F, nrow=length(l), ncol=length(l))
+  rownames(idxs) <- colnames(idxs) <- l
+  idxs[cbind(combos[2,], combos[1,])] <- T
 
-  object$marginal_est <- marginal_est
-  names(object$marginal_est) <- "marginal_est"
-  attr(object$marginal_est, "reference") <- trt_ref
-  attr(object$marginal_est, "contrast") <- paste0(contrast, ": ", c_str(trt_inv, trt_ref))
 
-  # apply contrast to varcov
-  gr <- c_grad(cf_mean_inv, cf_mean_ref)
+  # Get marginal estimates for the contrasts of interest, using correct ref levels
+  marginal_est <- outer(cf_means, cf_means, c_est)
+  marginal_est <- marginal_est[idxs]
 
-  robust_varcov <- object$robust_varcov
+  # Define contrast jacobian
+  gr <- matrix(0, nrow=ncol(combos), ncol=length(l))
+  # Apply the grad function to all combinations
+  contrast_values <- t(apply(combos, 2, function(idx) c_grad(cf_means[idx[2]], cf_means[idx[1]])))
+  gr[cbind(1:ncol(combos), combos[1,])] <- contrast_values[, 1]
+  gr[cbind(1:ncol(combos), combos[2,])] <- contrast_values[, 2]
+  # Get marginal standard error for contrasts of interest
+  marginal_se <- sqrt(diag(gr %*% object$robust_varcov %*% t(gr)))
 
-  # correct varcov order based on reference
-  if (reference_n == 2) diag(robust_varcov) <- rev(diag(robust_varcov))
+  # Add string description for each comparison
+  contrast_desc <- paste0(contrast, ": ",
+                          apply(combos, 2, \(x) c_str(l[x[2]], l[x[1]])))
+  names(marginal_est) <- contrast_desc
+  names(marginal_se) <- contrast_desc
 
-  marginal_se <- sqrt(t(gr) %*% robust_varcov %*% gr)
+  object$marginal_est <- marginal_est
+  attr(object$marginal_est, "reference") <- reference
+  attr(object$marginal_est, "contrast") <- contrast_desc
 
-  object$marginal_se <- marginal_se[[1]]
-  names(object$marginal_se) <- "marginal_se"
-  attr(object$marginal_se, "reference") <- trt_ref
-  attr(object$marginal_se, "contrast") <- paste0(contrast, ": ", c_str(trt_inv, trt_ref))
+  object$marginal_se <- marginal_se
+  attr(object$marginal_se, "reference") <- reference
+  attr(object$marginal_se, "contrast") <- contrast_desc
   attr(object$marginal_se, "type") <- attr(object$robust_varcov, "type")
 
   return(object)
diff --git a/R/estimate_varcov.R b/R/estimate_varcov.R
index ed8c3a6..435682c 100644
--- a/R/estimate_varcov.R
+++ b/R/estimate_varcov.R
@@ -98,7 +98,7 @@
 estimate_varcov <- function(object, strata = NULL, method = c("Ge", "Ye"),
                             type = c("HC0", "model-based", "HC3", "HC", "HC1", "HC2", "HC4", "HC4m", "HC5"),
                             mod = FALSE) {
-  # assert means are available in object
+  # Assert means are available in object
   if (!"counterfactual.means" %in% names(object)) {
     msg <- sprintf(
       "Missing counterfactual means. First run `%1$s <- average_predictions(%1$s)`.",
@@ -115,14 +115,18 @@ estimate_varcov <- function(object, strata = NULL, method = c("Ge", "Ye"),
     varcov <- varcov_ye(object, trt, strata, mod)
   } else if (method == "Ge") {
     type <- match.arg(type)
-    # throw warning is strata supplied but method == "Ge" is specified
+    # Throw warning is strata supplied but method == "Ge" is specified
     if (!is.null(strata)) {
       warning("Strata is supplied but will be ignored when using method='Ge'", call. = FALSE)
     }
     varcov <- varcov_ge(object, trt, type)
   }
 
-  rownames(varcov) <- colnames(varcov) <- levels(.get_data(object)[[trt]])
+  # Assert varcov is symmetric
+  if (isFALSE(all.equal(varcov[lower.tri(varcov)], varcov[upper.tri(varcov)]))) {
+    stop("Error in calculation of variance covariance matrix: `varcov` is not symmetric.",
+         call. = FALSE)
+  }
 
   object$robust_varcov <- varcov
   attr(object$robust_varcov, "type") <- ifelse(method == "Ge", paste0(method, " - ", type), method)
@@ -133,58 +137,86 @@ estimate_varcov <- function(object, strata = NULL, method = c("Ge", "Ye"),
 
 ## Re-implementation based on Ye et al. (https://doi.org/10.1080/24754269.2023.2205802)
 varcov_ye <- function(object, trt, strata, mod) {
-  ## calculate quantities
-  # get data
+
   data <- .get_data(object)
 
-  # counterfactual predictions
+  # Extract allocation information
+  pi_all <- prop.table(table(data[, trt]))
+  n <- length(data[, trt])
+
+  # Extract observed responses
+  y_observed <- lapply(levels(data[[trt]]), \(x) object$y[data[, trt] == x])
+  names(y_observed) <- levels(data[[trt]])
+
+  # Extract counterfactual predictions
   cf_pred <- object$counterfactual.predictions
+  # Append actual treatment assignment to cf predictions
+  cf_pred[[trt]] <- data[,trt]
 
-  pred_0 <- cf_pred$cf_pred_0
-  pred_1 <- cf_pred$cf_pred_1
+  # Estimate varcov using Ye et al (2022)
 
-  ## Extract allocation information
-  pi_0 <- mean(data[, trt] == levels(data[[trt]])[1])
-  pi_1 <- 1 - pi_0
-  n <- length(data[, trt])
+  # diag_vars stores variances per treatment arm
+  diag_vars <- list()
+  if (mod) {
 
-  ## Extract responses on observed arm i and arm j
-  y_0 <- object$y[data[, trt] == levels(data[[trt]])[1]]
-  y_1 <- object$y[data[, trt] == levels(data[[trt]])[2]]
+    for (trtlvl in levels(data[[trt]])) {
+      var_resid <- var(y_observed[[trtlvl]] - cf_pred[cf_pred[[trt]] == trtlvl, trtlvl])
+      cov_obs_pred <- cov(y_observed[[trtlvl]], cf_pred[cf_pred[[trt]] == trtlvl, trtlvl])
+      var_pred <- var(cf_pred[, trtlvl])
 
-  ## Predictions on arm i from patients in arm i only
-  pred_0_0 <- pred_0[data[, trt] == levels(data[[trt]])[1]]
-  pred_1_1 <- pred_1[data[, trt] == levels(data[[trt]])[2]]
+      diag_vars[[trtlvl]] <- var_resid / pi_all[[trtlvl]] + 2 * cov_obs_pred - var_pred
 
-  ## Predictions on arm i from patients in arm j only
-  pred_0_1 <- pred_0[data[, trt] == levels(data[[trt]])[2]]
-  pred_1_0 <- pred_1[data[, trt] == levels(data[[trt]])[1]]
+    }
 
-  ## estimate varcov using Ye et al (2022)
-  if (mod) {
-    v_0_0 <- var(y_0 - pred_0_0) / pi_0 + 2 * cov(y_0, pred_0_0) - var(pred_0)
-    v_1_1 <- var(y_1 - pred_1_1) / pi_1 + 2 * cov(y_1, pred_1_1) - var(pred_1)
   } else {
-    # use variance decomposition to match RobinCar implementation
-    var_y0_pred00 <- var(y_0) - 2 * cov(y_0, pred_0_0) + var(pred_0)
-    var_y1_pred11 <- var(y_1) - 2 * cov(y_1, pred_1_1) + var(pred_1)
+    # Use variance decomposition to match RobinCar implementation
+
+    for (trtlvl in levels(data[[trt]])) {
+      var_obs <- var(y_observed[[trtlvl]])
+      cov_obs_pred <- cov(y_observed[[trtlvl]], cf_pred[cf_pred[[trt]] == trtlvl, trtlvl])
+      var_pred <- var(cf_pred[, trtlvl])
+
+      var_obs_pred <- var_obs - 2 * cov_obs_pred + var_pred
+
+      diag_vars[[trtlvl]] <- var_obs_pred / pi_all[[trtlvl]] + 2 * cov_obs_pred - var_pred
+
+    }
 
-    v_0_0 <- var_y0_pred00 / pi_0 + 2 * cov(y_0, pred_0_0) - var(pred_0)
-    v_1_1 <- var_y1_pred11 / pi_1 + 2 * cov(y_1, pred_1_1) - var(pred_1)
   }
 
-  v_0_1 <- v_1_0 <- cov(y_0, pred_1_0) + cov(y_1, pred_0_1) - cov(pred_0, pred_1)
+  # tri_vars stores trt arm covariances
+  tri_vars <- list()
+  for (trtlvl in levels(data[[trt]])) {
+
+    # Covar of observed arm i and corresponding counterfactual predictions for those on arm j
+    cov_obsi_predj <- sapply(levels(data[[trt]])[levels(data[[trt]]) != trtlvl],
+                             \(x) cov(y_observed[[trtlvl]], cf_pred[cf_pred[[trt]] == trtlvl, x]))
 
-  ## Variance of theta_i = p_i(Y=1) on arm i = 0,1
-  varcov <- matrix(c(
-    v_0_0, v_0_1,
-    v_1_0, v_1_1
-  ), nrow = 2, ncol = 2)
+    # Covar of observed arm j and corresponding counterfactual predictions for those on arm i
+    cov_obsj_predi <- sapply(levels(data[[trt]])[levels(data[[trt]]) != trtlvl],
+                             \(x) cov(y_observed[[x]], cf_pred[cf_pred[[trt]] == x, trtlvl]))
 
-  ## Variance adjustment based on stratified CAR scheme, formula 18: https://doi.org/10.1080/01621459.2022.2049278
+    # Covar of counterfactual predictions for arm i and arm j for all subjects
+    cov_predi_predj <- sapply(levels(data[[trt]])[levels(data[[trt]]) != trtlvl],
+                              \(x) cov(cf_pred[, trtlvl], cf_pred[, x]))
+
+    tri_vars[[trtlvl]] <- cov_obsi_predj + cov_obsj_predi - cov_predi_predj
+
+  }
+
+  # Construct variance covariance matrix
+  varcov <- matrix(nrow = nlevels(data[[trt]]),
+                   ncol = nlevels(data[[trt]]))
+  rownames(varcov) <- colnames(varcov) <- levels(data[[trt]])
+
+  for (trtlvl in rownames(varcov)) {
+    varcov[trtlvl, c(trtlvl, names(tri_vars[[trtlvl]]))] <- c(diag_vars[[trtlvl]], tri_vars[[trtlvl]])
+  }
+
+  # Variance adjustment based on stratified CAR scheme, formula 18: https://doi.org/10.1080/01621459.2022.2049278
   if (!is.null(strata)) {
 
-    # warn if any single stratification variable has suspiciously many unique values
+    # Warn if any single stratification variable has suspiciously many unique values
     n.strata <- sapply(strata, function(x) length(unique(.get_data(object)[[x]])))
     if (any(n.strata > 3)) {
       msg <- sprintf(
@@ -201,10 +233,10 @@ varcov_ye <- function(object, trt, strata, mod) {
 }
 
 ye_strata_adjustment <- function(object, trt, strata) {
-  # get data
+
   data <- .get_data(object)
 
-  # assert strata is a column name found in model data
+  # Assert strata is a column name found in model data
   if (!all(strata %in% colnames(data))) {
     msg <- sprintf('Stratification variable(s) "%s" must be found in the model', paste(strata[!strata %in% colnames(data)], collapse = ", "))
     stop(msg, call. = FALSE)
@@ -213,27 +245,28 @@ ye_strata_adjustment <- function(object, trt, strata) {
   strata_all <- Reduce(function(...) paste(..., sep = "-"), data[strata])
   data <- cbind(data, strata_all)
 
-  pi_0 <- mean(data[, trt] == levels(data[[trt]])[1])
-  pi_1 <- 1 - pi_0
+  # Extract allocation information
+  pi_all <- prop.table(table(data[,trt]))
+  # Get predictions for observed data and calculate residuals
   data$preds <- predict(object, type = "response")
   data$resid <- object$y - data$preds
 
   get_rb <- function(z) {
-    # assume always two treatment levels
-    resid_0_z <- data$resid[(data[, trt] == levels(data[[trt]])[1]) & (data$strata_all == z)]
-    resid_1_z <- data$resid[(data[, trt] == levels(data[[trt]])[2]) & (data$strata_all == z)]
-    rb_0 <- (1 / pi_0) * mean(resid_0_z)
-    rb_1 <- (1 / pi_1) * mean(resid_1_z)
-    return(diag(c(rb_0, rb_1)))
+    rb_list <- list()
+    for (trtlvl in levels(data[[trt]])) {
+      resid_i_z <- data$resid[(data[[trt]] == trtlvl) & (data$strata_all == z)]
+      rb_list[[trtlvl]] <- (1 / pi_all[[trtlvl]]) * mean(resid_i_z)
+    }
+    return(diag(rb_list))
   }
 
-  strata_levels <- levels(as.factor(data$strata_all))
+  omega_sr <- diag(pi_all) - pi_all %*% t(pi_all)
 
-  omega_sr <- diag(c(pi_0, pi_1)) - c(pi_0, pi_1) %*% t(c(pi_0, pi_1))
-  # always assume permuted block stratified randomization
-  omega_z <- matrix(0, nrow = length(unique((data[[trt]]))), ncol = length(unique((data[[trt]]))))
+  # Always assume permuted block stratified randomization
+  omega_z <- matrix(0, nrow = nlevels(data[[trt]]), ncol = nlevels(data[[trt]]))
 
-  # compute outer expectation
+  # Compute outer expectation
+  strata_levels <- levels(as.factor(data$strata_all))
   erb_s <- lapply(strata_levels, function(x) {
     erb_i <- get_rb(x) %*% (omega_sr - omega_z) %*% get_rb(x)
     erb_i * (sum(data$strata_all == x) / nrow(data))
@@ -258,36 +291,25 @@ varcov_ge <- function(object, trt, type) {
     V <- sandwich::vcovHC(object, type = type)
   }
 
-  # counterfactual predictions
+  # Counterfactual predictions
   cf_pred <- object$counterfactual.predictions
 
-  pred_0 <- cf_pred$cf_pred_0
-  pred_1 <- cf_pred$cf_pred_1
-
-  # get order of treatment column and assign counterfactual treatments
-  X0 <- X1 <- data
-  X0[, trt] <- factor(levels(data[[trt]])[1], levels = levels(data[[trt]]))
-  X1[, trt] <- factor(levels(data[[trt]])[2], levels = levels(data[[trt]]))
-  X0 <- model.matrix(object$formula, X0)
-  X1 <- model.matrix(object$formula, X1)
-  # compute derivatives
-  pderiv0 <- pred_0 * (1 - pred_0)
-  pderiv1 <- pred_1 * (1 - pred_1)
-  d0 <- (t(pderiv0) %*% as.matrix(X0)) / nrow(X0)
-  d1 <- (t(pderiv1) %*% as.matrix(X1)) / nrow(X1)
-
-  # variance components aligned with varcov_ye implementation
-  v_0_0 <- d0 %*% V %*% t(d0)
-  v_1_1 <- d1 %*% V %*% t(d1)
-  v_0_1 <- d0 %*% V %*% t(d1)
-  v_1_0 <- d1 %*% V %*% t(d0)
-
-  varcov_ge <- matrix(
-    c(
-      v_0_0, v_0_1,
-      v_1_0, v_1_1
-    ),
-    nrow = 2, ncol = 2
-  )
+  # Get derivatives per treatment level counterfactual outcomes
+  d_list <- list()
+  for (trtlvl in levels(data[[trt]])) {
+    X_i <- data
+    X_i[, trt] <- factor(trtlvl, levels = levels(data[[trt]]))
+    X_i <- model.matrix(object$formula, X_i)
+
+    # Compute derivative
+    pderiv_i <- cf_pred[, trtlvl] * (1 - cf_pred[, trtlvl])
+    d_i <- (t(pderiv_i) %*% as.matrix(X_i)) / nrow(X_i)
+    d_list[[trtlvl]] <- d_i
+  }
+
+  all_d <- do.call(rbind, d_list)
+  varcov_ge <- all_d %*% V %*% t(all_d)
+  rownames(varcov_ge) <- colnames(varcov_ge) <- levels(data[[trt]])
+
   return(varcov_ge)
 }
diff --git a/R/get_marginal_effect.R b/R/get_marginal_effect.R
index 1e56ff0..89a785e 100644
--- a/R/get_marginal_effect.R
+++ b/R/get_marginal_effect.R
@@ -81,38 +81,46 @@ get_marginal_effect <- function(object, trt, strata = NULL,
     apply_contrast(contrast, reference)
 
   data <- .get_data(object)
-
-  trt_ref <- attr(object$marginal_est, "reference")
-  trt_inv <- levels(data[[trt]])[-which(levels(data[[trt]]) == trt_ref)]
+  n_trt_levels <- nlevels(data[[trt]])
   outcome <- paste(object$formula[[2]])
 
-  marginal_results <- data.frame(
-    TRTVAR = c(rep(trt, 12)),
-    TRTVAL = c(
-      rep(trt_inv, 5), rep(trt_ref, 5),
-      rep(attributes(object$marginal_est)[["contrast"]], 2)
-    ),
-    PARAM = rep(outcome, 12),
-    ANALTYP1 = c(rep(c(rep("DESCRIPTIVE", 3), rep("INFERENTIAL", 2)), 2), rep("INFERENTIAL", 2)),
-    STAT = c("N", "n", "%", "risk", "risk_se", "N", "n", "%", "risk", "risk_se", contrast, paste0(contrast, "_se")),
-    STATVAL = c(
-      sum(data[trt] == trt_inv), sum(data[outcome][data[trt] == trt_inv] == "1"),
-      sum(data[outcome][data[trt] == trt_inv] == "1") / sum(data[trt] == trt_inv) * 100,
-      object$counterfactual.means[trt_inv], sqrt(object$robust_varcov[trt_inv, trt_inv]),
-      sum(data[trt] == trt_ref), sum(data[outcome][data[trt] == trt_ref] == "1"),
-      sum(data[outcome][data[trt] == trt_ref] == "1") / sum(data[trt] == trt_ref) * 100,
-      object$counterfactual.means[trt_ref], sqrt(object$robust_varcov[trt_ref, trt_ref]),
-      object$marginal_est, object$marginal_se
-    ),
-    ANALMETH = c(
-      rep(c("count", "count", "percentage", "g-computation", attributes(object$marginal_se)[["type"]]), 2),
-      "g-computation", attributes(object$marginal_se)[["type"]]
+  # Marginal results for each trt level
+  marginal_responses <- data.frame(
+    TRTVAR = rep(trt, 5*n_trt_levels),
+    TRTVAL = rep(levels(data[[trt]]), each=5),
+    PARAM = rep(outcome, 5),
+    ANALTYP1 = rep(c(rep("DESCRIPTIVE", 3), rep("INFERENTIAL", 2)), n_trt_levels),
+    STAT = rep(c("N", "n", "%", "risk", "risk_se"), n_trt_levels),
+    STATVAL = c(sapply(levels(data[[trt]]), \(x) {
+      c(
+        sum(data[trt] == x),
+        sum(data[outcome][data[trt] == x] == "1"),
+        sum(data[outcome][data[trt] == x] == "1") / sum(data[trt] == x) * 100,
+        object$counterfactual.means[x],
+        sqrt(object$robust_varcov[x, x])
+      )
+    })
     ),
-    ANALDESC = paste0("Computed using beeca@", packageVersion("beeca"))
-  ) |>
-    dplyr::as_tibble()
+    ANALMETH = rep(c("count", "count", "percentage", "g-computation",
+                     attributes(object$marginal_se)[["type"]]), n_trt_levels)
+  )
+
+  # Contrast results
+  n_contrasts <- length(object$marginal_est)
+
+  marginal_contrasts <- data.frame(TRTVAR = rep(trt, 2*n_contrasts),
+                                   TRTVAL = c(rbind(attributes(object$marginal_est)[["contrast"]],
+                                                    attributes(object$marginal_se)[["contrast"]])),
+                                   PARAM = rep(outcome, 2*n_contrasts),
+                                   ANALTYP1 = c(rep("INFERENTIAL", 2*n_contrasts)),
+                                   STAT = rep(c(contrast, paste0(contrast, "_se")), n_contrasts),
+                                   STATVAL = c(rbind(object$marginal_est, object$marginal_se)),
+                                   ANALMETH = rep(c("g-computation", attributes(object$marginal_se)[["type"]]), n_contrasts)
+  )
 
-  object$marginal_results <- marginal_results
+  object$marginal_results <- rbind(marginal_responses,
+                                   marginal_contrasts) |> dplyr::as_tibble()
+  object$marginal_results$ANALDESC <- paste0("Computed using beeca@", packageVersion("beeca"))
 
   return(object)
 }
diff --git a/R/predict_counterfactuals.R b/R/predict_counterfactuals.R
index 9b6a488..64eca24 100644
--- a/R/predict_counterfactuals.R
+++ b/R/predict_counterfactuals.R
@@ -61,22 +61,21 @@ predict_counterfactuals <- function(object, trt) {
 
   data <- .get_data(object)
 
-  # calculate counterfactual predictions for each trt level
-  X0 <- X1 <- data
-  X0[, trt] <- levels(data[[trt]])[1]
-  X1[, trt] <- levels(data[[trt]])[2]
-  cf_pred_0 <- predict(object, newdata = X0, type = "response")
-  cf_pred_1 <- predict(object, newdata = X1, type = "response")
+  # Calculate counterfactual predictions for each trt level
+  cf_preds <- list()
+  for (trtlevel in levels(data[[trt]])) {
+    X_cf <- data
+    X_cf[, trt] <- trtlevel
+    cf_pred <- predict(object, newdata = X_cf, type = "response")
+    cf_preds[[trtlevel]] <- cf_pred
+  }
 
-  # store predictions in tidy dataframe
-  cf_predictions <- data.frame(
-    cf_pred_0 = cf_pred_0,
-    cf_pred_1 = cf_pred_1
-  ) |>
+  # Store predictions in tidy dataframe
+  cf_predictions <- do.call(cbind, cf_preds) |>
     dplyr::as_tibble()
 
   cf_lbl <- \(trt, x) paste0("Counterfactual predictions setting ", trt, "=", x, " for all records")
-  attr(cf_predictions, "label") <- c(cf_lbl(trt, levels(data[[trt]])[1]), cf_lbl(trt, levels(data[[trt]])[2]))
+  attr(cf_predictions, "label") <- sapply(levels(data[[trt]]), \(x) cf_lbl(trt, x))
   attr(cf_predictions, "treatment.variable") <- trt
 
   object$counterfactual.predictions <- cf_predictions
diff --git a/R/sanitize.R b/R/sanitize.R
index 018e853..75d867e 100644
--- a/R/sanitize.R
+++ b/R/sanitize.R
@@ -117,9 +117,9 @@ sanitize_variable <- function(model, trt) {
     msg <- sprintf('Treatment variable "%s" must be of type factor, not "%s".', trt, typeof(data[[trt]]))
     stop(msg, call. = FALSE)
   }
-  # assert trt has 2 levels
-  if (nlevels(data[[trt]]) != 2) {
-    msg <- sprintf('Treatment variable "%s" must have 2 levels. Found %s: {%s}.', trt, length(levels(data[[trt]])), paste(levels(data[[trt]]), collapse = ","))
+  # assert trt has at least 2 levels
+  if (nlevels(data[[trt]]) < 2) {
+    msg <- sprintf('Treatment variable "%s" must have at least 2 levels. Found %s: {%s}.', trt, nlevels(data[[trt]]), paste(levels(data[[trt]]), collapse = ","))
     stop(msg, call. = FALSE)
   }
   # assert response is coded 0/1
@@ -142,7 +142,7 @@ sanitize_variable <- function(model, trt) {
   if (is.null(object$sanitized) || object$sanitized == FALSE) {
     if (warn) {
       warning("Input object did not meet the expected format for beeca. Results may not be valid. Consider using ?get_marginal_effect",
-        call. = FALSE
+              call. = FALSE
       )
     }
     object <- sanitize_model(object, trt)
diff --git a/cran-comments.md b/cran-comments.md
index e670830..a57d509 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,18 +1,6 @@
-## Resubmission
 
-This is a resubmission. In this version I have:
-
-* Updated the description to avoid starting with "A lightweight package..."
-
-* Removed unnecessary \dontrun{} wrapper for an example
 
 ## R CMD check results
 
-0 errors | 0 warnings | 1 note
-
-I am aware of one possible note: The suggested package 'margins' was very recently reinstated to CRAN and binaries may not be available for certain platforms / R versions.
+0 errors | 0 warnings | 0 notes
 
-```
-* checking package dependencies ... NOTE
-Package suggested but not available for checking: ‘margins’
-```
diff --git a/data-raw/DATASET.R b/data-raw/DATASET.R
index 72154ee..566d5be 100644
--- a/data-raw/DATASET.R
+++ b/data-raw/DATASET.R
@@ -24,7 +24,6 @@ trial02_cdisc <- safetyData::adam_adtte |>
       TRTP == "Xanomeline High Dose" ~ 3
     )
   ) |>
-  dplyr::filter(TRTPN != 2) |> # restrict to 2 arms in the data
   dplyr::select(
     USUBJID, PARAM, AGE, AGEGR1, AGEGR1N, RACE, RACEN, SEX,
     TRTP, TRTPN, AVAL, AVALC, FASFL
diff --git a/data/trial02_cdisc.rda b/data/trial02_cdisc.rda
index 923af6a..6f64fd6 100644
Binary files a/data/trial02_cdisc.rda and b/data/trial02_cdisc.rda differ
diff --git a/man/apply_contrast.Rd b/man/apply_contrast.Rd
index e896b12..c40590f 100644
--- a/man/apply_contrast.Rd
+++ b/man/apply_contrast.Rd
@@ -22,9 +22,10 @@ and or when computing confidence intervals using normal approximation.
 The choice of contrast affects how treatment effects are calculated and
 interpreted. Default is \code{diff}.}
 
-\item{reference}{a string indicating which treatment group should be considered as
-the reference level. Accepted values are one of the levels in the treatment
-variable. Default to the first level used in the \code{glm} object.
+\item{reference}{a string or list of strings indicating which treatment
+group(s) to use as reference level for pairwise comparisons. Accepted values
+must be a subset of the levels in the treatment variable. Default to the
+first n-1 treatment levels used in the \code{glm} object.
 
 This parameter influences the calculation of treatment effects
 relative to the chosen reference group.}
diff --git a/man/beeca-package.Rd b/man/beeca-package.Rd
index a137665..9661251 100644
--- a/man/beeca-package.Rd
+++ b/man/beeca-package.Rd
@@ -12,6 +12,7 @@ Performs estimation of marginal treatment effects for binary outcomes when using
 Useful links:
 \itemize{
   \item \url{https://openpharma.github.io/beeca/}
+  \item Report bugs at \url{https://github.com/openpharma/beeca/issues}
 }
 
 }
diff --git a/tests/testthat/test-apply_contrasts.R b/tests/testthat/test-apply_contrasts.R
index 66328ff..5c5d5be 100644
--- a/tests/testthat/test-apply_contrasts.R
+++ b/tests/testthat/test-apply_contrasts.R
@@ -1,5 +1,6 @@
-# set up example ----------------------------------------------------------
+# set up examples ----------------------------------------------------------
 
+# 2 arm example
 data01 <- trial01 |>
   transform(trtp = as.factor(trtp)) |>
   dplyr::filter(!is.na(aval))
@@ -13,6 +14,14 @@ example_varcov <- fit1$robust_varcov
 example_theta_s <- fit1$counterfactual.means[["0"]]
 example_theta_t <- fit1$counterfactual.means[["1"]]
 
+# 3 arm example
+data02 <- trial02_cdisc |>
+  transform(TRTPN = as.factor(TRTPN))
+
+fit_3arm <- glm(AVAL ~ TRTPN + SEX, family = "binomial", data = data02) |>
+  predict_counterfactuals(trt = "TRTPN") |>
+  average_predictions() |>
+  estimate_varcov(method="Ye")
 
 
 # test warnings/errors ------------------------------------------------------------
@@ -69,9 +78,7 @@ test_that("Test valid contrast types", {
 test_that("Correctly throwing errors on mismatched arguments", {
   expect_error(
     fit1 |>
-      apply_contrast(contrast = "diffs", reference = "0"),
-    "'arg' should be one of"
-  )
+      apply_contrast(contrast = "diffs", reference = "0"))
 })
 
 test_that("Test handling of reference levels", {
@@ -91,15 +98,25 @@ test_that("Test handling of reference levels", {
   expect_error(
     fit |>
       apply_contrast(contrast = "diffs", reference = "C"),
-    "Reference must be one of : A, B",
+    "Reference levels must be a subset of treatment levels : A, B.",
+    fixed = TRUE
+  )
+
+  # Too many reference levels
+  expect_error(
+    fit |>
+      apply_contrast(contrast = "diffs", reference = c("A", "B")),
+    "Too many reference levels provided.",
     fixed = TRUE
   )
+
 })
 
 
 test_that("Check model is sanitized", {
   fit1$sanitized <- NULL
-  expect_warning(apply_contrast(fit1, reference = "0"),
+  expect_warning(
+    apply_contrast(fit1, reference = "0"),
     "Input object did not meet the expected format for beeca. Results may not be valid. Consider using ?get_marginal_effect",
     fixed = TRUE
   )
@@ -272,3 +289,215 @@ test_that("logor contrast is correct for variance estimate", {
     )
   )
 })
+
+
+# tests for data with 3 arms ----------------------------------------
+
+test_that("Reference levels correctly handled when >2 treatment levels", {
+
+  expect_warning(
+    fit_ref12 <- fit_3arm |> apply_contrast()
+  )
+
+  # Default reference levels are first n-1 treatment levels
+  expect_equal(
+    attr(fit_ref12$marginal_est, "reference"),
+    fit_3arm$xlevels$TRTPN[-nlevels(fit_3arm$data$TRTPN)]
+  )
+
+  expect_equal(
+    attr(fit_ref12$marginal_est, "contrast"),
+    c("diff: 2-1", "diff: 3-1", "diff: 3-2")
+  )
+
+  # Custom order of reference levels is correctly handled
+  fit_ref32 <- fit_3arm |>
+    apply_contrast(contrast = "diff", reference = c("3", "2"))
+
+  expect_equal(
+    attr(fit_ref32$marginal_est, "contrast"),
+    c("diff: 1-2", "diff: 1-3", "diff: 2-3")
+  )
+
+  expect_equal(
+    fit_ref32$marginal_est[["diff: 1-2"]],
+    -fit_ref12$marginal_est[["diff: 2-1"]]
+  )
+  expect_equal(
+    fit_ref32$marginal_est[["diff: 1-3"]],
+    -fit_ref12$marginal_est[["diff: 3-1"]]
+  )
+  expect_equal(
+    fit_ref32$marginal_est[["diff: 2-3"]],
+    -fit_ref12$marginal_est[["diff: 3-2"]]
+  )
+
+  expect_equal(
+    unname(fit_ref32$marginal_se)[1:3],
+    unname(fit_ref12$marginal_se)[1:3]
+  )
+
+
+  gr <- matrix(c(-1,-1,0,
+                 1,0,-1,
+                 0,1,1),
+               nrow=3, ncol=3)
+  expect_equal(
+    unname(fit_ref12$marginal_se)[1:3],
+    sqrt(diag(gr %*% fit_ref12$robust_varcov %*% t(gr)))
+  )
+
+
+  # Switched references
+  fit_ref23 <- fit_3arm |>
+    apply_contrast(contrast = "diff", reference = c("2", "3"))
+
+  gr <- matrix(c(1,0,1,
+                 -1,-1,0,
+                 0,1,-1),
+               nrow=3, ncol=3)
+
+  expect_equal(
+    unname(fit_ref23$marginal_se)[1:3],
+    sqrt(diag(gr %*% fit_ref23$robust_varcov %*% t(gr)))
+  )
+
+
+  # Single reference
+  fit_ref2 <- fit_3arm |>
+    apply_contrast(contrast = "diff", reference = "2")
+
+  gr <- matrix(c(1,0,-1,
+                 -1,0,1),
+               nrow=2, ncol=3)
+  expect_equal(
+    unname(fit_ref2$marginal_se)[1:2],
+    sqrt(diag(gr %*% fit_ref2$robust_varcov %*% t(gr)))
+  )
+
+
+  # Check expected values for other contrast type
+  # (logor)
+  fit_ref12_lor <- fit_3arm |>
+    apply_contrast(contrast = "logor", reference = c("1", "2"))
+
+  fit_ref23_lor <- fit_3arm |>
+    apply_contrast(contrast = "logor", reference = c("2", "3"))
+
+  expect_equal(
+    unname(fit_ref12_lor$marginal_est)[1:3],
+    c(1.70521672372, 1.60865867380, -0.09655804992)
+  )
+  expect_equal(
+    unname(fit_ref12_lor$marginal_se)[1:3],
+    c(0.3385774965, 0.3336867307, 0.3470551734)
+  )
+
+  expect_equal(
+    unname(fit_ref23_lor$marginal_est)[1:3],
+    c(-1.70521672372, -0.09655804992, -1.60865867380)
+  )
+  expect_equal(
+    unname(fit_ref23_lor$marginal_se)[1:3],
+    c(0.3385774965, 0.3470551734, 0.3336867307)
+  )
+
+})
+
+
+
+
+
+# if RobinCar is available compare contrast results
+robincar_available <- requireNamespace("RobinCar", quietly = T)
+
+if (robincar_available){
+
+  test_that("Contrast results match RobinCar: diff", {
+    rc_diff <- RobinCar::robincar_glm(data.frame(fit_3arm$data),
+                                      response_col = "AVAL",
+                                      treat_col = "TRTPN",
+                                      formula = fit_3arm$formula,
+                                      g_family = fit_3arm$family,
+                                      contrast_h = "diff")
+    fit_ref1 <- fit_3arm |>
+      apply_contrast(contrast = "diff", reference = "1")
+
+    expect_equal(
+      unname(fit_ref1$marginal_est[1:2]),
+      unname(rc_diff$contrast$result$estimate)
+    )
+
+    expect_equal(
+      unname(fit_ref1$marginal_se[1:2]),
+      unname(rc_diff$contrast$result$se)
+    )
+
+    # switch reference
+    rc_diff_switched <- RobinCar::robincar_glm(data.frame(fit_3arm$data) |>
+                                                 transform(TRTPN = factor(TRTPN, levels=c("3", "1", "2"))),
+                                               response_col = "AVAL",
+                                               treat_col = "TRTPN",
+                                               formula = fit_3arm$formula,
+                                               g_family = fit_3arm$family,
+                                               contrast_h = "diff")
+    fit_ref3 <- fit_3arm |>
+      apply_contrast(contrast = "diff", reference = "3")
+
+    expect_equal(
+      unname(fit_ref3$marginal_est[1:2]),
+      unname(rc_diff_switched$contrast$result$estimate)
+    )
+
+    expect_equal(
+      unname(fit_ref3$marginal_se[1:2]),
+      unname(rc_diff_switched$contrast$result$se)
+    )
+
+  })
+
+
+  test_that("Contrast results match RobinCar: risk ratio", {
+    rc_rr <- RobinCar::robincar_glm(data.frame(fit_3arm$data),
+                                    response_col = "AVAL",
+                                    treat_col = "TRTPN",
+                                    formula = fit_3arm$formula,
+                                    g_family = fit_3arm$family,
+                                    contrast_h = "ratio")
+    fit_ref1 <- fit_3arm |>
+      apply_contrast(contrast = "rr", reference = "1")
+
+    expect_equal(
+      unname(fit_ref1$marginal_est[1:2]),
+      unname(rc_rr$contrast$result$estimate)
+    )
+
+    expect_equal(
+      unname(fit_ref1$marginal_se[1:2]),
+      unname(rc_rr$contrast$result$se)
+    )
+
+    # switch reference
+    rc_rr_switched <- RobinCar::robincar_glm(data.frame(fit_3arm$data) |>
+                                               transform(TRTPN = factor(TRTPN, levels=c("3", "1", "2"))),
+                                             response_col = "AVAL",
+                                             treat_col = "TRTPN",
+                                             formula = fit_3arm$formula,
+                                             g_family = fit_3arm$family,
+                                             contrast_h = "ratio")
+    fit_ref3 <- fit_3arm |>
+      apply_contrast(contrast = "rr", reference = "3")
+
+    expect_equal(
+      unname(fit_ref3$marginal_est[1:2]),
+      unname(rc_rr_switched$contrast$result$estimate)
+    )
+
+    expect_equal(
+      unname(fit_ref3$marginal_se[1:2]),
+      unname(rc_rr_switched$contrast$result$se)
+    )
+
+  })
+
+}
diff --git a/tests/testthat/test-average_predictions.R b/tests/testthat/test-average_predictions.R
index 94b52a6..f840308 100644
--- a/tests/testthat/test-average_predictions.R
+++ b/tests/testthat/test-average_predictions.R
@@ -12,8 +12,8 @@ test_that("averaging counterfactual predictions works", {
   fit1 <- average_predictions(fit1)
 
   expected_output <- c(
-    "0" = mean(fit1$counterfactual.predictions$cf_pred_0),
-    "1" = mean(fit1$counterfactual.predictions$cf_pred_1)
+    "0" = mean(fit1$counterfactual.predictions[["0"]]),
+    "1" = mean(fit1$counterfactual.predictions[["1"]])
   )
 
   expect_equal(fit1$counterfactual.means, expected_output)
diff --git a/tests/testthat/test-estimate_varcov.R b/tests/testthat/test-estimate_varcov.R
index c789e77..f754b11 100644
--- a/tests/testthat/test-estimate_varcov.R
+++ b/tests/testthat/test-estimate_varcov.R
@@ -1,4 +1,4 @@
-# set up example ----------------------------------------------------------
+# set up examples ----------------------------------------------------------
 
 data01 <- trial01 |>
   transform(trtp = as.factor(trtp)) |>
@@ -9,6 +9,14 @@ fit1 <- glm(aval ~ trtp + bl_cov, family = "binomial", data = data01) |>
 
 gr <- c(-1, 1)
 
+# 3 arm example
+data02 <- trial02_cdisc |>
+  transform(TRTPN = as.factor(TRTPN))
+
+fit_3arm <- glm(AVAL ~ TRTPN + SEX, family = "binomial", data = data02) |>
+  beeca::predict_counterfactuals(trt = "TRTPN") |>
+  beeca::average_predictions()
+
 
 # test warnings/errors ------------------------------------------------------------
 
@@ -20,7 +28,7 @@ test_that("Correctly throwing errors on missing components", {
   )
 })
 
-test_that("Correctly throwing warnings on arugment mismatch", {
+test_that("Correctly throwing errors on argument mismatch", {
   expect_error(
     fit1 |>
       estimate_varcov(method = "Fe")
@@ -34,6 +42,13 @@ test_that("Correctly throwing errors on missing stratification variable", {
   )
 })
 
+test_that("Correctly throwing warnings when strata supplied with method Ge", {
+  expect_warning(
+    fit1 |>
+      estimate_varcov(method = "Ge", strata = "bl_cov_c")
+  )
+})
+
 test_that("Check model is sanitized", {
   fit1$sanitized <- NULL
 
@@ -50,8 +65,10 @@ test_that("Check model is sanitized", {
 
 V1 <- estimate_varcov(fit1, method = "Ye")$robust_varcov
 
+V1_3arm <- estimate_varcov(fit_3arm, method = "Ye")$robust_varcov
+
 # if RobinCar is available
-robincar_available <- requireNamespace("RobinCar", versionCheck = list(name = "RobinCar", op = "==", version = "0.3.0"), quietly = T)
+robincar_available <- requireNamespace("RobinCar", quietly = T)
 
 if (robincar_available){
   test_that("Correct variance calculation for Ye's method matching RobinCar", {
@@ -66,6 +83,20 @@ if (robincar_available){
       )$contrast$varcov[1,1]
     )
   })
+
+
+  test_that("3-arm: Correct variance calculation for Ye's method matching RobinCar", {
+    expect_equal(
+      matrix(V1_3arm, nrow=3, ncol=3),
+      unname(RobinCar::robincar_glm(data.frame(fit_3arm$data),
+                                    response_col = "AVAL",
+                                    treat_col = "TRTPN",
+                                    formula = fit_3arm$formula,
+                                    g_family = fit_3arm$family)$varcov)
+    )
+  })
+
+
 } else {
   test_that("Correct variance calculation for Ye's method matching RobinCar", {
     expect_equal(
@@ -74,9 +105,27 @@ if (robincar_available){
     )
   })
 
+  test_that("3-arm: Correct variance calculation for Ye's method matching RobinCar", {
+    expect_equal(
+      matrix(V1_3arm, nrow=3, ncol=3),
+      matrix(c(2.652272364e-03, 2.576978801e-06, -5.244417729e-07,
+               2.576978801e-06, 2.303420439e-03,  1.053457633e-05,
+               -5.244417729e-07, 1.053457633e-05,  2.379076091e-03),
+             nrow=3, ncol=3)
+    )
+  })
+
 }
 
 
+# varcov must be symmetric
+test_that("Ye varcov is symmetric", {
+  expect_equal(
+    V1[lower.tri(V1)],
+    V1[upper.tri(V1)]
+  )
+})
+
 
 
 ## Ye - stratification -----------------------------------------------------
@@ -85,23 +134,43 @@ fit2 <- glm(aval ~ trtp + bl_cov + bl_cov_c2, data = data01 |> transform(bl_cov_
   predict_counterfactuals(trt = "trtp") |>
   average_predictions()
 
-V2 <- estimate_varcov(fit2, method = "Ye", strata = "bl_cov_c2")$robust_varcov
+expect_warning(
+  V2 <- estimate_varcov(fit2, method = "Ye", strata = "bl_cov_c2")$robust_varcov,
+  "More than three unique values are found in stratification variable bl_cov_c2. Please double check if the correct stratification variables are supplied via `strata` argument.",
+  fixed = TRUE
+)
+
+V2_3arm <- estimate_varcov(fit_3arm, method = "Ye", strata = "SEX")$robust_varcov
 
 if (robincar_available){
-test_that("Correct variance calculation for Ye's method matcing RobinCar", {
-  expect_equal(
-    (t(gr) %*% V2 %*% gr)[1, 1],
-    RobinCar::robincar_glm(data.frame(fit2$data),
-      response_col = as.character(fit2$formula[2]),
-      treat_col = "trtp",
-      formula = fit2$formula,
-      car_strata_cols = "bl_cov_c2",
-      car_scheme = "permuted-block",
-      g_family = fit2$family,
-      contrast_h = "diff",
-    )$contrast$varcov[1,1]
-  )
-})
+  test_that("Correct variance calculation for Ye's method matcing RobinCar", {
+    expect_equal(
+      (t(gr) %*% V2 %*% gr)[1, 1],
+      RobinCar::robincar_glm(data.frame(fit2$data),
+                             response_col = as.character(fit2$formula[2]),
+                             treat_col = "trtp",
+                             formula = fit2$formula,
+                             car_strata_cols = "bl_cov_c2",
+                             car_scheme = "permuted-block",
+                             g_family = fit2$family,
+                             contrast_h = "diff",
+      )$contrast$varcov[1,1]
+    )
+  })
+
+  test_that("3-arm: Correct variance calculation for Ye-stratified method matching RobinCar", {
+    expect_equal(
+      matrix(V2_3arm, nrow=3, ncol=3),
+      unname(RobinCar::robincar_glm(data.frame(fit_3arm$data),
+                                    response_col = "AVAL",
+                                    treat_col = "TRTPN",
+                                    formula = fit_3arm$formula,
+                                    car_strata_cols = "SEX",
+                                    car_scheme = "permuted-block",
+                                    g_family = fit_3arm$family)$varcov)
+    )
+  })
+
 } else {
   test_that("Correct variance calculation for Ye's method matching RobinCar", {
     expect_equal(
@@ -110,8 +179,27 @@ test_that("Correct variance calculation for Ye's method matcing RobinCar", {
     )
   })
 
+
+  test_that("3-arm: Correct variance calculation for Ye-stratified method matching RobinCar", {
+    expect_equal(
+      matrix(V2_3arm, nrow=3, ncol=3),
+      matrix(c(2.622886513e-03, -7.797320489e-06, -4.923527008e-06,
+               -7.797320489e-06,  2.288915038e-03,  1.360521021e-05,
+               -4.923527008e-06,  1.360521021e-05,  2.376256827e-03),
+             nrow=3, ncol=3)
+    )
+  })
+
 }
 
+# varcov must be symmetric
+test_that("Ye-stratified varcov is symmetric", {
+  expect_equal(
+    V2[lower.tri(V2)],
+    V2[upper.tri(V2)]
+  )
+})
+
 
 
 ## Ye - mod ----------------------------------------------------------------
@@ -133,6 +221,15 @@ test_that("Correct variance calculation for Ye's method based on original manusc
 
 
 
+# varcov must be symmetric
+test_that("Ye-mod varcov is symmetric", {
+  expect_equal(
+    V3[lower.tri(V3)],
+    V3[upper.tri(V3)]
+  )
+})
+
+
 ## Ge ----------------------------------------------------------------------
 
 ge_var_paper <- function(glmfit, trt) {
@@ -179,11 +276,20 @@ ge_var_paper <- function(glmfit, trt) {
   return(difP(glmfit)$var[[1]])
 }
 
-V3 <- estimate_varcov(fit1, method = "Ge", type = "model-based")$robust_varcov
+V4 <- estimate_varcov(fit1, method = "Ge", type = "model-based")$robust_varcov
 
-test_that("Correct variance calculation for Ge's method matcing manuscript code", {
+test_that("Correct variance calculation for Ge's method matching manuscript code", {
   expect_equal(
-    (t(gr) %*% V3 %*% gr)[1, 1],
+    (t(gr) %*% V4 %*% gr)[1, 1],
     ge_var_paper(fit1, "trtp")
   )
 })
+
+
+# varcov must be symmetric
+test_that("Ge varcov is symmetric", {
+  expect_equal(
+    V4[lower.tri(V4)],
+    V4[upper.tri(V4)]
+  )
+})
diff --git a/tests/testthat/test-get_marginal_effect.R b/tests/testthat/test-get_marginal_effect.R
index 8dd2ad4..396f6c9 100644
--- a/tests/testthat/test-get_marginal_effect.R
+++ b/tests/testthat/test-get_marginal_effect.R
@@ -7,26 +7,32 @@ data01 <- trial01 |>
 fit1 <- glm(aval ~ trtp + bl_cov, family = "binomial", data = data01) |>
   get_marginal_effect(trt = "trtp", method = "Ye", contrast = "diff", reference = "0")
 
+# 3-arm example
+data02 <- trial02_cdisc |>
+  transform(TRTPN = as.factor(TRTPN))
+
+fit_3arm <- glm(AVAL ~ TRTPN + SEX, family = "binomial", data = data02) |>
+  get_marginal_effect(trt = "TRTPN", method = "Ye", contrast = "diff", reference = "1")
+
+
 # test warnings/errors ------------------------------------------------------------
 
 test_that("Correctly throwing errors on missing argument", {
   expect_error(
     glm(aval ~ trtp + bl_cov, family = "binomial", data = data01) |>
-      get_marginal_effect(method = "Ye"), 'argument "trt" is missing, with no default'
-  )
+      get_marginal_effect(method = "Ye"))
 })
 
-test_that("Correctly throwing errors on arugment mismatch", {
+test_that("Correctly throwing errors on argument mismatch", {
   expect_error(
     glm(aval ~ trtp + bl_cov, family = "binomial", data = data01) |>
-      get_marginal_effect(method = "Fe"), 'argument "trt" is missing, with no default'
-  )
+      get_marginal_effect(trt = "trtp", method = "Fe"))
 })
 
 test_that("Correctly throwing warnings on missing value", {
   expect_warning(
     glm(aval ~ trtp + bl_cov, family = "binomial", data = trial01 |>
-      transform(trtp = as.factor(trtp))) |>
+          transform(trtp = as.factor(trtp))) |>
       get_marginal_effect(trt = "trtp", method = "Ye", reference = "0"),
     "There is 1 record omitted from the original data due to missing values, please check if they should be imputed prior to model fitting."
   )
@@ -42,3 +48,12 @@ test_that("Correctly producing results in ARD format", {
 test_that("Correctly producing results", {
   expect_false(any(is.na(fit1$marginal_results)))
 })
+
+
+test_that("3-arm: Correctly producing results in ARD format", {
+  expect_equal(dim(fit_3arm$marginal_results), c(19, 8))
+})
+
+test_that("3-arm: Correctly producing results", {
+  expect_false(any(is.na(fit_3arm$marginal_results)))
+})
diff --git a/tests/testthat/test-sanitize.R b/tests/testthat/test-sanitize.R
index a17108f..3b5172b 100644
--- a/tests/testthat/test-sanitize.R
+++ b/tests/testthat/test-sanitize.R
@@ -55,20 +55,13 @@ test_that("Treatment variable in model data", {
 test_that("Check treatment variable is a factor", {
   data_complete$trtp <- as.numeric(data_complete$trtp)
   fit1 <- glm(aval ~ trtp + bl_cov, family = binomial(link = "logit"), data = data_complete)
-  expect_error(sanitize_model(fit1, "trtp"),
+  expect_error(
+    sanitize_model(fit1, "trtp"),
     'Treatment variable "trtp" must be of type factor, not "double".',
     fixed = TRUE
   )
 })
 
-test_that("Correctly throwing errors on treatment variable > 2 levels", {
-  fit1 <- glm(aval ~ trtp + bl_cov, family = "binomial", data = trial01 |> transform(trtp = factor(as.numeric(trtp) + rbinom(268, size = 1, prob = 0.5))))
-  expect_error(sanitize_model(fit1, "trtp"),
-    'Treatment variable "trtp" must have 2 levels. Found 3: {0,1,2}.',
-    fixed = TRUE
-  )
-})
-
 test_that("Check response variable is 0/1", {
   data_complete$aval <- replace(data_complete$aval, data_complete$aval == "0", "0.5")
   levels(data_complete[["aval"]]) <- c("0.5", "1")
@@ -80,7 +73,8 @@ test_that("Check response variable is 0/1", {
 test_that("Correctly throw error on incorrect model class", {
   lm_fit <- stats::lm(aval ~ trtp + bl_cov, data = data_complete)
 
-  expect_error(sanitize_model(lm_fit, "trtp"),
+  expect_error(
+    sanitize_model(lm_fit, "trtp"),
     'Model of class "lm" is not supported.',
     fixed = TRUE
   )
@@ -98,7 +92,8 @@ test_that("Throw warning if model matrix not full rank", {
   mat[["x3"]] <- mat$x1 + mat$x2
   fit1 <- glm(y ~ trtp + x1 + x2 + x3, family = "binomial", data = mat)
 
-  expect_error(sanitize_model(fit1, "trtp"),
+  expect_error(
+    sanitize_model(fit1, "trtp"),
     "The data does not have full rank, please check glm model fitting.",
     fixed = TRUE
   )
@@ -109,12 +104,14 @@ test_that("Throw warning if model not converged", {
   # fit glm with reduced max iterations so does not converge
   suppressWarnings(
     fit1 <- glm(aval ~ trtp,
-      family = "binomial", data = data,
-      control = glm.control(maxit = 1)
+                family = "binomial",
+                data = data_complete,
+                control = glm.control(maxit = 1)
     )
   )
 
-  expect_warning(sanitize_model(fit1, "trtp"),
+  expect_warning(
+    sanitize_model(fit1, "trtp"),
     "The glm model was not converged, please check glm model fitting.",
     fixed = TRUE
   )
@@ -124,7 +121,8 @@ test_that("Throw warning if model not converged", {
 test_that("Throw error if treatment not on right hand side of model formula", {
   fit1 <- glm(trtp ~ aval, family = "binomial", data = data)
 
-  expect_error(sanitize_variable(fit1, "trtp"),
+  expect_error(
+    sanitize_variable(fit1, "trtp"),
     'Did not find the treatment variable "trtp" on right hand side of the model formula',
     fixed = TRUE
   )
diff --git a/vignettes/estimand_and_implementations.Rmd b/vignettes/estimand_and_implementations.Rmd
index a15fb81..d22fb1e 100644
--- a/vignettes/estimand_and_implementations.Rmd
+++ b/vignettes/estimand_and_implementations.Rmd
@@ -70,8 +70,6 @@ library(dplyr)
 ```{r}
 ## Prepare the dataset for input
 dat <- trial02_cdisc %>%
-  ## In this case we are only interested in comparing two arms: Placebo vs Xanomeline High Dose
-  dplyr::filter(TRTP %in% c("Placebo", "Xanomeline High Dose")) %>%
   ## Treatment variable must be coded as a factor
   dplyr::mutate(TRTP = factor(TRTP))
 
@@ -113,8 +111,12 @@ ci_u <- diff_est + (qnorm(0.975) * diff_se)
 z_score <- diff_est / diff_se
 p_value <- 2 * (1 - pnorm(abs(z_score)))
 
-sprintf("The risk difference is %s with 95%% CI: (%s - %s)", round(diff_est, 2), round(ci_l, 2), round(ci_u, 2))
-sprintf("p-value: %s", formatC(p_value, format = "e", digits = 2))
+tibble(
+  Contrast = marginal_results[marginal_results$STAT == "diff", ]$TRTVAL,
+  `Risk difference` = round(diff_est, 2),
+  `95% CI` = sprintf("(%s - %s)", round(ci_l, 2), round(ci_u, 2)),
+  `p-value` = formatC(p_value, format = "e", digits = 2)
+)
 ```
 
 ## Comparing different implementations
@@ -233,7 +235,7 @@ cat("Point estimate", beeca_ye$marginal_est, "\nStandard error estimate", beeca_
 **Version 1**: from `RobinCar` package 
 
 ```{r}
-if (requireNamespace("RobinCar", versionCheck = list(name = "RobinCar", op = "==", version = "0.3.0"), quietly = T)) {
+if (requireNamespace("RobinCar", quietly = T)) {
   robincar_ye <- RobinCar::robincar_glm(data.frame(fit$data), response_col = as.character(fit$formula[2]),
       treat_col = "trtp", formula = fit$formula, g_family = fit$family,
       contrast_h = "diff")$contrast$result