Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

datawizard version of unite() #424

Merged
merged 13 commits into from
May 26, 2023
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ export(data_to_wide)
export(data_transpose)
export(data_ungroup)
export(data_unique)
export(data_unite)
export(data_write)
export(degroup)
export(demean)
Expand Down
111 changes: 111 additions & 0 deletions R/data_unite.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#' @title Unite ("merge") multiple variables
#' @name data_unite
#'
#' @description
#' Merges values of multiple variables per observation into one new variable.
strengejacke marked this conversation as resolved.
Show resolved Hide resolved
#'
#' @param data A data frame.
#' @param new_column The name of the new column, as a string.
#' @param separator Separator, as string, to use between values.
strengejacke marked this conversation as resolved.
Show resolved Hide resolved
#' @param append Logical, if `FALSE` (default), removes original columns that
#' were united. If `TRUE`, all columns are preserved and the new column is
#' appended to the data frame.
etiennebacher marked this conversation as resolved.
Show resolved Hide resolved
#' @param remove_na Logical, if `TRUE`, missing values (`NA`) are not included
#' in the united values. If `FALSE`, missing values are represented as `"NA"`
#' in the united values.
#' @param ... Currently not used.
#' @inheritParams find_columns
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @return `data`, with a newly created variable.
#'
#' @examples
#' d <- data.frame(
#' x = 1:3,
#' y = letters[1:3],
#' z = 6:8
#' )
#' d
#' data_unite(d, new_column = "xyz")
#' data_unite(d, new_column = "xyz", remove = FALSE)
#' data_unite(d, new_column = "xyz", select = c("x", "z"))
#' data_unite(d, new_column = "xyz", select = c("x", "z"), append = TRUE)
#' @export
data_unite <- function(data,
new_column = NULL,
select = NULL,
exclude = NULL,
separator = "_",
append = FALSE,
remove_na = FALSE,
ignore_case = FALSE,
verbose = TRUE,
regex = FALSE,
...) {
# we need a name for the new column
if (is.null(new_column)) {
insight::format_error(
"No name for the new column was provided.",
"Please use `new_column` to define a name for the newly created column."
)
}

# only one column name
if (length(new_column) > 1) {
insight::format_error(
"Please provide only a single string for `new_column`, no character vector with multiple values."
)
}

# evaluate select/exclude, may be select-helpers
select <- .select_nse(select,
data,
exclude,
ignore_case,
regex = regex,
verbose = verbose
)

if (is.null(select) || length(select) <= 1) {
insight::format_error(
"At least two columns in `select` are required for `data_unite()`."
)
}

# unite
out <- data.frame(
new_col = do.call(paste, c(data[select], sep = separator)),
stringsAsFactors = FALSE
)
colnames(out) <- new_column
strengejacke marked this conversation as resolved.
Show resolved Hide resolved

# remove missings
if (remove_na) {
# remove trailing and leading "NA_" and "_NA"
out[[new_column]] <- gsub(paste0("^NA", separator), "", out[[new_column]])
out[[new_column]] <- gsub(paste0(separator, "NA$"), "", out[[new_column]])
# remove _NA_ inside string, add separator back. This ensure we match
# whole-word NA and do not break strings like "COUNTRY_NATION"
out[[new_column]] <- gsub(paste0(separator, "NA", separator), separator, out[[new_column]], fixed = TRUE)
}
strengejacke marked this conversation as resolved.
Show resolved Hide resolved

# remove old columns
if (!isTRUE(append)) {
data[select] <- NULL
}

# overwrite?
if (new_column %in% colnames(data) && verbose) {
insight::format_alert(
"The name for `new_column` already exists as variable name in the data.",
"This variable will be replaced by `new_column`."
)
}
etiennebacher marked this conversation as resolved.
Show resolved Hide resolved

# overwrite or append
data[[new_column]] <- out[[new_column]]

# fin
data
strengejacke marked this conversation as resolved.
Show resolved Hide resolved
}
117 changes: 117 additions & 0 deletions man/data_unite.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

154 changes: 154 additions & 0 deletions tests/testthat/test-data_unite.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
d_unite <- data.frame(
x = c(NA, 1:3),
y = c(letters[1:3], NA_character_),
z = 6:9,
m = c("X", NA_character_, "Y", "Z"),
n = c("NATION", "COUNTRY", "NATION", NA_character_),
stringsAsFactors = FALSE
)


# for following tests, we need to check for correct column names,
# and correct values in new variable

test_that("data_unite: simple use case", {
# basic
out <- data_unite(d_unite, new_column = "xyz")
expect_identical(colnames(out), "xyz")
expect_identical(
out$xyz,
c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
)
# use existing column name
out <- data_unite(d_unite, new_column = "x")
expect_identical(colnames(out), "x")
expect_identical(
out$x,
c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
)
# select
out <- data_unite(d_unite, new_column = "xyz", select = c("x", "n"))
expect_identical(colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "xyz"))
expect_identical(
out$xyz,
c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
)
# select, use existing column name
out <- data_unite(d_unite, new_column = "x", select = c("x", "n"))
expect_identical(colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "x"))
expect_identical(
out$x,
c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
)
})


test_that("data_unite: remove_na", {
# basic
out <- data_unite(d_unite, new_column = "xyz", remove_na = TRUE)
expect_identical(colnames(out), "xyz")
expect_identical(
out$xyz,
c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
)
# use existing column name
out <- data_unite(d_unite, new_column = "x", remove_na = TRUE)
expect_identical(colnames(out), "x")
expect_identical(
out$x,
c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
)
# select
out <- data_unite(d_unite, new_column = "xyz", remove_na = TRUE, select = c("x", "n"))
expect_identical(colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "xyz"))
expect_identical(
out$xyz,
c("NATION", "1_COUNTRY", "2_NATION", "3")
)
# select, use existing column name
out <- data_unite(d_unite, new_column = "x", remove_na = TRUE, select = c("x", "n"))
expect_identical(colnames(out), c(setdiff(colnames(d_unite), c("x", "n")), "x"))
expect_identical(
out$x,
c("NATION", "1_COUNTRY", "2_NATION", "3")
)
})


test_that("data_unite: append", {
# basic
out <- data_unite(d_unite, new_column = "xyz", append = TRUE)
expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
expect_identical(
out$xyz,
c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
)
# remove NA
out <- data_unite(d_unite, new_column = "xyz", remove_na = TRUE, append = TRUE)
expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
expect_identical(
out$xyz,
c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
)
# append, using existing column name
expect_message({
out <- data_unite(d_unite, new_column = "x", append = TRUE)
})
expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
expect_identical(
out$x,
c("NA_a_6_X_NATION", "1_b_7_NA_COUNTRY", "2_c_8_Y_NATION", "3_NA_9_Z_NA")
)
# append, using existing column name, and remove NA
expect_message({
out <- data_unite(d_unite, new_column = "x", remove_na = TRUE, append = TRUE)
})
expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
expect_identical(
out$x,
c("a_6_X_NATION", "1_b_7_COUNTRY", "2_c_8_Y_NATION", "3_9_Z")
)
})


test_that("data_unite: combine select and append", {
# basic
out <- data_unite(d_unite, new_column = "xyz", append = TRUE, select = c("x", "n"))
expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
expect_identical(
out$xyz,
c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
)
# remove NA
out <- data_unite(d_unite, new_column = "xyz", remove_na = TRUE, append = TRUE, select = c("x", "n"))
expect_identical(colnames(out), c("x", "y", "z", "m", "n", "xyz"))
expect_identical(
out$xyz,
c("NATION", "1_COUNTRY", "2_NATION", "3")
)
# append, using existing column name
expect_message({
out <- data_unite(d_unite, new_column = "x", append = TRUE, select = c("x", "n"))
})
expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
expect_identical(
out$x,
c("NA_NATION", "1_COUNTRY", "2_NATION", "3_NA")
)
# append, using existing column name, and remove NA
expect_message({
out <- data_unite(d_unite, new_column = "x", remove_na = TRUE, append = TRUE, select = c("x", "n"))
})
expect_identical(colnames(out), c("x", "y", "z", "m", "n"))
expect_identical(
out$x,
c("NATION", "1_COUNTRY", "2_NATION", "3")
)
})


test_that("data_unite: errors", {
expect_error(data_unite(d_unite), regex = "No name")
expect_error(data_unite(d_unite, new_column = c("a", "b")), regex = "a single string")
expect_error(expect_warning(data_unite(d_unite, new_column = "a", select = "huhu")), regex = "At least")
})