easystats · strengejacke · May 26, 2023 · May 25, 2023 · May 25, 2023 · May 25, 2023
diff --git a/NAMESPACE b/NAMESPACE
@@ -230,6 +230,7 @@ export(data_to_wide)
 export(data_transpose)
 export(data_ungroup)
 export(data_unique)
+export(data_unite)
 export(data_write)
 export(degroup)
 export(demean)

diff --git a/R/data_unite.R b/R/data_unite.R
@@ -0,0 +1,111 @@
+#' @title Unite ("merge") multiple variables
+#' @name data_unite
+#'
+#' @description
+#' Merges values of multiple variables per observation into one new variable.
+#'
+#' @param data A data frame.
+#' @param new_column The name of the new column, as a string.
+#' @param separator Separator, as string, to use between values.
+#' @param append Logical, if `FALSE` (default), removes original columns that
+#' were united. If `TRUE`, all columns are preserved and the new column is
+#' appended to the data frame.
+#' @param remove_na Logical, if `TRUE`, missing values (`NA`) are not included
+#' in the united values. If `FALSE`, missing values are represented as `"NA"`
+#' in the united values.
+#' @param ... Currently not used.
+#' @inheritParams find_columns
+#'
+#' @inheritSection center Selection of variables - the `select` argument
+#'
+#' @return `data`, with a newly created variable.
+#'
+#' @examples
+#' d <- data.frame(
+#'   x = 1:3,
+#'   y = letters[1:3],
+#'   z = 6:8
+#' )
+#' d
+#' data_unite(d, new_column = "xyz")
+#' data_unite(d, new_column = "xyz", remove = FALSE)
+#' data_unite(d, new_column = "xyz", select = c("x", "z"))
+#' data_unite(d, new_column = "xyz", select = c("x", "z"), append = TRUE)
+#' @export
+data_unite <- function(data,
+                       new_column = NULL,
+                       select = NULL,
+                       exclude = NULL,
+                       separator = "_",
+                       append = FALSE,
+                       remove_na = FALSE,
+                       ignore_case = FALSE,
+                       verbose = TRUE,
+                       regex = FALSE,
+                       ...) {
+  # we need a name for the new column
+  if (is.null(new_column)) {
+    insight::format_error(
+      "No name for the new column was provided.",
+      "Please use `new_column` to define a name for the newly created column."
+    )
+  }
+
+  # only one column name
+  if (length(new_column) > 1) {
+    insight::format_error(
+      "Please provide only a single string for `new_column`, no character vector with multiple values."
+    )
+  }
+
+  # evaluate select/exclude, may be select-helpers
+  select <- .select_nse(select,
+    data,
+    exclude,
+    ignore_case,
+    regex = regex,
+    verbose = verbose
+  )
+
+  if (is.null(select) || length(select) <= 1) {
+    insight::format_error(
+      "At least two columns in `select` are required for `data_unite()`."
+    )
+  }
+
+  # unite
+  out <- data.frame(
+    new_col = do.call(paste, c(data[select], sep = separator)),
+    stringsAsFactors = FALSE
+  )
+  colnames(out) <- new_column
+
+  # remove missings
+  if (remove_na) {
+    # remove trailing and leading "NA_" and "_NA"
+    out[[new_column]] <- gsub(paste0("^NA", separator), "", out[[new_column]])
+    out[[new_column]] <- gsub(paste0(separator, "NA$"), "", out[[new_column]])
+    # remove _NA_ inside string, add separator back. This ensure we match
+    # whole-word NA and do not break strings like "COUNTRY_NATION"
+    out[[new_column]] <- gsub(paste0(separator, "NA", separator), separator, out[[new_column]], fixed = TRUE)
+  }
+
+  # remove old columns
+  if (!isTRUE(append)) {
+    data[select] <- NULL
+  }
+
+  # overwrite?
+  if (new_column %in% colnames(data) && verbose) {
+    insight::format_alert(
+      "The name for `new_column` already exists as variable name in the data.",
+      "This variable will be replaced by `new_column`."
+    )
+  }
+
+  # overwrite or append
+  data[[new_column]] <- out[[new_column]]
+
+  # fin
+  data
+}
diff --git a/man/data_unite.Rd b/man/data_unite.Rd
diff --git a/tests/testthat/test-data_unite.R b/tests/testthat/test-data_unite.R
@@ -0,0 +1,154 @@
+d_unite <- data.frame(
+  x = c(NA, 1:3),
+  y = c(letters[1:3], NA_character_),
+  z = 6:9,
+  m = c("X", NA_character_, "Y", "Z"),
+  n = c("NATION", "COUNTRY", "NATION", NA_character_),
+  stringsAsFactors = FALSE
+)
+
+
+# for following tests, we need to check for correct column names,
+# and correct values in new variable
+
+test_that("data_unite: simple use case", {
+  # basic
+  out <- data_unite(d_unite, new_column = "xyz")
+  expect_identical(colnames(out), "xyz")
+  expect_identical(
+    out$xyz,
+    c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
+  )
+  # use existing column name
+  out <- data_unite(d_unite, new_column = "x")
+  expect_identical(colnames(out), "x")
+  expect_identical(
+    out$x,
+    c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
+  )
+  # select
+  out <- data_unite(d_unite, new_column = "xyz", select = c("x", "n"))
+  expect_identical(colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "xyz"))
+  expect_identical(
+    out$xyz,
+    c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
+  )
+  # select, use existing column name
+  out <- data_unite(d_unite, new_column = "x", select = c("x", "n"))
+  expect_identical(colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "x"))
+  expect_identical(
+    out$x,
+    c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
+  )
+})
+
+
+test_that("data_unite: remove_na", {
+  # basic
+  out <- data_unite(d_unite, new_column = "xyz", remove_na = TRUE)
+  expect_identical(colnames(out), "xyz")
+  expect_identical(
+    out$xyz,
+    c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
+  )
+  # use existing column name
+  out <- data_unite(d_unite, new_column = "x", remove_na = TRUE)
+  expect_identical(colnames(out), "x")
+  expect_identical(
+    out$x,
+    c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
+  )
+  # select
+  out <- data_unite(d_unite, new_column = "xyz", remove_na = TRUE, select = c("x", "n"))
+  expect_identical(colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "xyz"))
+  expect_identical(
+    out$xyz,
+    c("NATION", "1_COUNTRY", "2_NATION", "3")
+  )
+  # select, use existing column name
+  out <- data_unite(d_unite, new_column = "x", remove_na = TRUE, select = c("x", "n"))
+  expect_identical(colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "x"))
+  expect_identical(
+    out$x,
+    c("NATION", "1_COUNTRY", "2_NATION", "3")
+  )
+})
+
+
+test_that("data_unite: append", {
+  # basic
+  out <- data_unite(d_unite, new_column = "xyz", append = TRUE)
+  expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
+  expect_identical(
+    out$xyz,
+    c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
+  )
+  # remove NA
+  out <- data_unite(d_unite, new_column = "xyz", remove_na = TRUE, append = TRUE)
+  expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
+  expect_identical(
+    out$xyz,
+    c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
+  )
+  # append, using existing column name
+  expect_message({
+    out <- data_unite(d_unite, new_column = "x", append = TRUE)
+  })
+  expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
+  expect_identical(
+    out$x,
+    c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
+  )
+  # append, using existing column name, and remove NA
+  expect_message({
+    out <- data_unite(d_unite, new_column = "x", remove_na = TRUE, append = TRUE)
+  })
+  expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
+  expect_identical(
+    out$x,
+    c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
+  )
+})
+
+
+test_that("data_unite: combine select and append", {
+  # basic
+  out <- data_unite(d_unite, new_column = "xyz", append = TRUE, select = c("x", "n"))
+  expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
+  expect_identical(
+    out$xyz,
+    c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
+  )
+  # remove NA
+  out <- data_unite(d_unite, new_column = "xyz", remove_na = TRUE, append = TRUE, select = c("x", "n"))
+  expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
+  expect_identical(
+    out$xyz,
+    c("NATION", "1_COUNTRY", "2_NATION", "3")
+  )
+  # append, using existing column name
+  expect_message({
+    out <- data_unite(d_unite, new_column = "x", append = TRUE, select = c("x", "n"))
+  })
+  expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
+  expect_identical(
+    out$x,
+    c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
+  )
+  # append, using existing column name, and remove NA
+  expect_message({
+    out <- data_unite(d_unite, new_column = "x", remove_na = TRUE, append = TRUE, select = c("x", "n"))
+  })
+  expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
+  expect_identical(
+    out$x,
+    c("NATION", "1_COUNTRY", "2_NATION", "3")
+  )
+})
+
+
+test_that("data_unite: errors", {
+  expect_error(data_unite(d_unite), regex = "No name")
+  expect_error(data_unite(d_unite, new_column = c("a", "b")), regex = "a single string")
+  expect_error(expect_warning(data_unite(d_unite, new_column = "a", select = "huhu")), regex = "At least")
+})