diff --git a/DESCRIPTION b/DESCRIPTION index e19219cf1..f3e9e0c0e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.7.1.10 +Version: 0.7.1.11 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NAMESPACE b/NAMESPACE index deacfec21..c2b4ecdb7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -228,6 +228,7 @@ export(data_reorder) export(data_restoretype) export(data_rotate) export(data_select) +export(data_separate) export(data_tabulate) export(data_to_long) export(data_to_wide) diff --git a/NEWS.md b/NEWS.md index f67f7fdeb..e08aba37a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -36,6 +36,9 @@ NEW FUNCTIONS * `data_unite()`, to merge values of multiple variables into one new variable. +* `data_separate()`, as counterpart to `data_unite()`, to separate a single + variable into multiple new variables. + * `data_modify()`, to create new variables, or modify or remove existing variables in a data frame. diff --git a/R/data_separate.R b/R/data_separate.R new file mode 100644 index 000000000..866866d31 --- /dev/null +++ b/R/data_separate.R @@ -0,0 +1,401 @@ +#' @title Separate single variable into multiple variables +#' @name data_separate +#' +#' @description +#' Separates a single variable into multiple new variables. +#' +#' @param data A data frame. +#' @param new_columns The names of the new columns, as character vector. If +#' more than one variable was selected (in `select`), the new names are prefixed +#' with the name of the original column. `new_columns` can also be a list of +#' (named) character vectors when multiple variables should be separated. See +#' 'Examples'. +#' @param separator Separator between columns. Can be a character vector, which +#' is then treated as regular expression, or a numeric vector that indicates at +#' which positions the string values will be split. +#' @param append Logical, if `FALSE` (default), removes original columns that +#' were separated. If `TRUE`, all columns are preserved and the new columns are +#' appended to the data frame. +#' @param guess_columns If `new_columns` is not given, the required number of +#' new columns is guessed based on the results of value splitting. For example, +#' if a variable is split into three new columns, this will be considered as +#' the required number of new columns, and columns are named `"split_1"`, +#' `"split_2"` and `"split_3"`. When values from a variable are split into +#' different amount of new columns, the `guess_column` can be either `"mode"` +#' (number of new columns is based on the most common number of splits), `"min"` +#' or `"max"` to use the minimum resp. maximum number of possible splits as +#' required number of columns. +#' @param fill How to deal with values that return fewer new columns after +#' splitting? Can be `"left"` (fill missing columns from the left with `NA`), +#' `"right"` (fill missing columns from the right with `NA`) or `"value_left"` +#' or `"value_right"` to fill missing columns from left or right with the +#' left-most or right-most values. +#' @param extra How to deal with values that return too many new columns after +#' splitting? Can be `"drop_left"` or `"drop_right"` to drop the left-most or +#' right-most values, or `"merge_left"` or `"merge_right"` to merge the left- +#' or right-most value together, and keeping all remaining values as is. +#' @param merge_multiple Logical, if `TRUE` and more than one variable is selected +#' for separating, new columns can be merged. Value pairs of all split variables +#' are merged. +#' @param merge_separator Separator string when `merge_multiple = TRUE`. Defines +#' the string that is used to merge values together. +#' @param convert_na Logical, if `TRUE`, character `"NA"` values are converted +#' into real `NA` values. +#' @param ... Currently not used. +#' @inheritParams find_columns +#' +#' @seealso [`data_unite()`] +#' +#' @return A data frame with the newly created variable(s), or - when `append = TRUE` - +#' `data` including new variables. +#' +#' @examples +#' # simple case +#' d <- data.frame( +#' x = c("1.a.6", "2.b.7", "3.c.8"), +#' stringsAsFactors = FALSE +#' ) +#' d +#' data_separate(d, new_columns = c("a", "b", "c")) +#' +#' # guess number of columns +#' d <- data.frame( +#' x = c("1.a.6", NA, "2.b.6.7", "3.c", "x.y.z"), +#' stringsAsFactors = FALSE +#' ) +#' d +#' data_separate(d, guess_columns = "mode") +#' +#' data_separate(d, guess_columns = "max") +#' +#' # drop left-most column +#' data_separate(d, guess_columns = "mode", extra = "drop_left") +#' +#' # merge right-most column +#' data_separate(d, guess_columns = "mode", extra = "merge_right") +#' +#' # fill columns with fewer values with left-most values +#' data_separate(d, guess_columns = "mode", fill = "value_left") +#' +#' # fill and merge +#' data_separate( +#' d, +#' guess_columns = "mode", +#' fill = "value_left", +#' extra = "merge_right" +#' ) +#' +#' # multiple columns to split +#' d <- data.frame( +#' x = c("1.a.6", "2.b.7", "3.c.8"), +#' y = c("x.y.z", "10.11.12", "m.n.o"), +#' stringsAsFactors = FALSE +#' ) +#' d +#' # split two columns, default column names +#' data_separate(d, guess_columns = "mode") +#' +#' # split into new named columns, repeating column names +#' data_separate(d, new_columns = c("a", "b", "c")) +#' +#' # split selected variable new columns +#' data_separate(d, select = "y", new_columns = c("a", "b", "c")) +#' +#' # merge multiple split columns +#' data_separate( +#' d, +#' new_columns = c("a", "b", "c"), +#' merge_multiple = TRUE +#' ) +#' +#' # merge multiple split columns +#' data_separate( +#' d, +#' new_columns = c("a", "b", "c"), +#' merge_multiple = TRUE, +#' merge_separator = "-" +#' ) +#' +#' # separate multiple columns, give proper column names +#' d_sep <- data.frame( +#' x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), +#' y = c("m.n.99.22", "77.f.g.34", "44.9", NA), +#' stringsAsFactors = FALSE +#' ) +#' +#' data_separate( +#' d_sep, +#' select = c("x", "y"), +#' new_columns = list( +#' x = c("A", "B", "C"), # separate "x" into three columns +#' y = c("EE", "FF", "GG", "HH") # separate "y" into four columns +#' ), +#' verbose = FALSE +#' ) +#' @export +data_separate <- function(data, + select = NULL, + new_columns = NULL, + separator = "[^[:alnum:]]+", + guess_columns = NULL, + merge_multiple = FALSE, + merge_separator = "", + fill = "right", + extra = "drop_right", + convert_na = TRUE, + exclude = NULL, + append = FALSE, + ignore_case = FALSE, + verbose = TRUE, + regex = FALSE, + ...) { + # we need at least one explicit choice for either `new_columns` or `guess_columns` + if (is.null(new_columns) && is.null(guess_columns)) { + insight::format_error("Cannot separate values. Either `new_columns` or `guess_columns` must be provided.") + } + # in case user did not provide names of new columns, we can try + # to guess number of columns per variable + guess_columns <- match.arg(guess_columns, choices = c("min", "max", "mode")) + + # make sure we have valid options for fill and extra + fill <- match.arg(fill, choices = c("left", "right", "value_left", "value_right")) + extra <- match.arg(extra, choices = c("drop_left", "drop_right", "merge_left", "merge_right")) + + # evaluate select/exclude, may be select-helpers + select <- .select_nse(select, + data, + exclude, + ignore_case, + regex = regex, + verbose = verbose + ) + + # make new_columns as list, this works with single and multiple columns + if (!is.null(new_columns) && !is.list(new_columns)) { + new_columns <- rep(list(new_columns), times = length(select)) + # if we have multiple columns that were separated, we avoid duplicated + # column names of created variables by appending name of original column + # however, we don't have duplicated column names when we merge them together + # so don't create new column names when "merge_multiple" is FALSE. + make_unique_colnames <- length(select) > 1 && !merge_multiple + } else { + # we don't want to create own unique column names when user explicitly + # provided column names as a list, i.e. column names for each separated + # variable + make_unique_colnames <- FALSE + } + + # make sure list of new column names is named + if (!is.null(new_columns) && is.null(names(new_columns))) { + names(new_columns) <- select + } + + # iterate columns that should be split + split_data <- lapply(select, function(sep_column) { + + # do we have known number of columns? + if (is.null(new_columns)) { + n_columns <- NULL + } else { + n_columns <- length(new_columns[[sep_column]]) + } + + # make sure we have a character that we can split + x <- data[[sep_column]] + if (!is.character(x)) { + x <- as.character(x) + } + + # separate column into multiple strings + if (is.numeric(separator)) { + maxlen <- max(nchar(x), na.rm = TRUE) + starts <- c(0, separator) + ends <- c(separator - 1, maxlen) + separated_columns <- lapply(seq_along(starts), function(i) { + substr(x, starts[i], ends[i]) + }) + separated_columns <- as.data.frame( + do.call(rbind, separated_columns), + stringsAsFactors = FALSE + ) + } else { + separated_columns <- strsplit(x, separator, perl = TRUE) + } + + # how many new columns do we need? + if (is.null(n_columns)) { + # lengths of all split strings + l <- lengths(separated_columns) + # but without NA values + l <- l[!vapply(l, function(i) all(is.na(i)), TRUE)] + # define number of new columns, based on user-choice + n_cols <- switch( + guess_columns, + "min" = min(l, na.rm = TRUE), + "max" = max(l, na.rm = TRUE), + "mode" = distribution_mode(l), + ) + # tell user + if (verbose && insight::n_unique(l) != 1 && !is.numeric(separator)) { + insight::format_alert(paste0( + "Column `", sep_column, "` had different number of values after splitting. Variable was split into ", + n_cols, " column", ifelse(n_cols > 1, "s", ""), "." + )) + } + } else { + # else, if we know number of columns, use that number + n_cols <- n_columns + } + + # main task here - fill or drop values for all columns + separated_columns <- tryCatch( + .fix_separated_columns(separated_columns, fill, extra, n_cols, sep_column, verbose), + error = function(e) NULL + ) + + # catch error + if (is.null(separated_columns)) { + insight::format_error( + "Something went wrong. Probably the number of provided column names did not match number of newly created columns?" + ) + } + + # bind separated columns into data frame and set column names + out <- as.data.frame(do.call(rbind, separated_columns)) + + # if no column names provided, use standard names + if (is.null(new_columns[[sep_column]])) { + new_column_names <- paste0(sep_column, "_", seq_along(out)) + } else { + # if we have multiple columns that were separated, we avoid duplicated + # column names of created variables by appending name of original column + if (make_unique_colnames) { + new_column_names <- paste0(sep_column, "_", new_columns[[sep_column]]) + } else { + new_column_names <- new_columns[[sep_column]] + } + } + + colnames(out) <- new_column_names + out + }) + + # any split performed? + if (all(lengths(split_data) == 1)) { + if (verbose) { + insight::format_alert("Separator probably not found. No values were split. Returning original data.") + } + return(data) + } + + # final preparation, bind or merge columns, make unique columm names + if (isTRUE(merge_multiple) && length(split_data) > 1) { + # we merge all split columns, which are currently saved as list + # of data frames, together into one data frame + for (i in 2:length(split_data)) { + for (j in seq_along(split_data[[1]])) { + split_data[[1]][[j]] <- gsub(" ", "", + paste( + split_data[[1]][[j]], + split_data[[i]][[j]], + sep = merge_separator + ), + fixed = TRUE + ) + } + } + split_data <- split_data[[1]] + } else { + # bind all columns + split_data <- do.call(cbind, split_data) + } + + # convert "NA" strings into real NA? + if (convert_na) { + split_data[] <- lapply(split_data, function(i) { + i[i == "NA"] <- NA_character_ + i + }) + } + + data <- cbind(data, split_data) + if (!isTRUE(append)) { + data[select] <- NULL + } + + # fin + data +} + + +#' @keywords internal +.fix_separated_columns <- function(separated_columns, fill, extra, n_cols, sep_column, verbose = TRUE) { + warn_extra <- warn_fill <- FALSE + for (sc in seq_along(separated_columns)) { + i <- separated_columns[[sc]] + # determine number of values in separated column + n_values <- length(i) + if (all(is.na(i))) { + # we have NA values - so fill everything with NA + out <- rep(NA_character_, times = n_cols) + } else if (n_values > n_cols) { + # we have more values than required - drop extra columns + if (extra == "drop_left") { + out <- i[(n_values - n_cols + 1):n_values] + } else if (extra == "drop_right") { + out <- i[1:n_cols] + } else if (extra == "merge_left") { + out <- paste(i[1:(n_values - n_cols + 1)], collapse = " ") + out <- c(out, i[(n_values - n_cols + 2):n_values]) + } else { + out <- i[1:(n_cols - 1)] + out <- c(out, paste(i[n_cols:n_values], collapse = " ")) + } + warn_extra <- TRUE + } else if (n_values < n_cols) { + # we have fewer values than required - fill columns + if (fill == "left") { + out <- c(rep(NA_character_, times = n_cols - n_values), i) + } else if (fill == "right") { + out <- c(i, rep(NA_character_, times = n_cols - n_values)) + } else if (fill == "value_left") { + out <- c(rep(i[1], times = n_cols - n_values), i) + } else { + out <- c(i, rep(i[length(i)], times = n_cols - n_values)) + } + warn_fill <- TRUE + } else { + out <- i + } + separated_columns[[sc]] <- out + } + + if (verbose) { + if (warn_extra) { + insight::format_alert(paste0( + "`", sep_column, "`", + " returned more columns than expected after splitting. ", + switch(extra, + "drop_left" = "Left-most columns have been dropped.", + "drop_right" = "Right-most columns have been dropped.", + "merge_left" = "Left-most columns have been merged together.", + "merge_right" = "Right-most columns have been merged together." + ) + )) + } + if (warn_fill) { + insight::format_alert(paste0( + "`", sep_column, "`", + "returned fewer columns than expected after splitting. ", + switch(fill, + "left" = "Left-most columns were filled with `NA`.", + "right" = "Right-most columns were filled with `NA`.", + "value_left" = "Left-most columns were filled with first value.", + "value_right" = "Right-most columns were filled with last value." + ) + )) + } + } + + separated_columns +} diff --git a/R/data_unite.R b/R/data_unite.R index 97c4e3356..a4cf9dea5 100644 --- a/R/data_unite.R +++ b/R/data_unite.R @@ -16,6 +16,8 @@ #' @param ... Currently not used. #' @inheritParams find_columns #' +#' @seealso [`data_separate()`] +#' #' @return `data`, with a newly created variable. #' #' @examples diff --git a/_pkgdown.yaml b/_pkgdown.yaml index dca8791f7..038a405a3 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -28,6 +28,7 @@ reference: Functions for transforming variables contents: - data_modify + - data_separate - data_unite - categorize - recode_into diff --git a/man/data_separate.Rd b/man/data_separate.Rd new file mode 100644 index 000000000..2905adaaf --- /dev/null +++ b/man/data_separate.Rd @@ -0,0 +1,220 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_separate.R +\name{data_separate} +\alias{data_separate} +\title{Separate single variable into multiple variables} +\usage{ +data_separate( + data, + select = NULL, + new_columns = NULL, + separator = "[^[:alnum:]]+", + guess_columns = NULL, + merge_multiple = FALSE, + merge_separator = "", + fill = "right", + extra = "drop_right", + convert_na = TRUE, + exclude = NULL, + append = FALSE, + ignore_case = FALSE, + verbose = TRUE, + regex = FALSE, + ... +) +} +\arguments{ +\item{data}{A data frame.} + +\item{select}{Variables that will be included when performing the required +tasks. Can be either +\itemize{ +\item a variable specified as a literal variable name (e.g., \code{column_name}), +\item a string with the variable name (e.g., \code{"column_name"}), or a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a formula with variable names (e.g., \code{~column_1 + column_2}), +\item a vector of positive integers, giving the positions counting from the left +(e.g. \code{1} or \code{c(1, 3, 5)}), +\item a vector of negative integers, giving the positions counting from the +right (e.g., \code{-1} or \code{-1:-3}), +\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, +\code{contains()}, a range using \code{:} or \code{regex("")}. \code{starts_with()}, +\code{ends_with()}, and \code{contains()} accept several patterns, e.g +\code{starts_with("Sep", "Petal")}. +\item or a function testing for logical conditions, e.g. \code{is.numeric()} (or +\code{is.numeric}), or any user-defined function that selects the variables +for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), +\item ranges specified via literal variable names, select-helpers (except +\code{regex()}) and (user-defined) functions can be negated, i.e. return +non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with("")}, +\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means +that matches are \emph{excluded}, and thus, the \code{exclude} argument can be +used alternatively. For instance, \code{select=-ends_with("Length")} (with +\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case +negation should not work as expected, use the \code{exclude} argument instead. +} + +If \code{NULL}, selects all columns. Patterns that found no matches are silently +ignored, e.g. \code{find_columns(iris, select = c("Species", "Test"))} will just +return \code{"Species"}.} + +\item{new_columns}{The names of the new columns, as character vector. If +more than one variable was selected (in \code{select}), names are duplicated and +then made unique using \code{make.unique()}. \code{new_columns} can also be a list of +(named) character vectors when multiple variables should be separated. See +'Examples'.} + +\item{separator}{Separator between columns. Can be a character vector, which +is then treated as regular expression, or a numeric vector that indicates at +which positions the string values will be split.} + +\item{guess_columns}{If \code{new_columns} is not given, the required number of +new columns is guessed based on the results of value splitting. For example, +if a variable is split into three new columns, this will be considered as +the required number of new columns, and columns are named \code{"split_1"}, +\code{"split_2"} and \code{"split_3"}. When values from a variable are split into +different amount of new columns, the \code{guess_column} can be either \code{"mode"} +(number of new columns is based on the most common number of splits), \code{"min"} +or \code{"max"} to use the minimum resp. maximum number of possible splits as +required number of columns.} + +\item{merge_multiple}{Logical, if \code{TRUE} and more than one variable is selected +for separating, new columns can be merged. Value pairs of all split variables +are merged.} + +\item{merge_separator}{Separator string when \code{merge_multiple = TRUE}. Defines +the string that is used to merge values together.} + +\item{fill}{How to deal with values that return fewer new columns after +splitting? Can be \code{"left"} (fill missing columns from the left with \code{NA}), +\code{"right"} (fill missing columns from the right with \code{NA}) or \code{"value_left"} +or \code{"value_right"} to fill missing columns from left or right with the +left-most or right-most values.} + +\item{extra}{How to deal with values that return too many new columns after +splitting? Can be \code{"drop_left"} or \code{"drop_right"} to drop the left-most or +right-most values, or \code{"merge_left"} or \code{"merge_right"} to merge the left- +or right-most value together, and keeping all remaining values as is.} + +\item{convert_na}{Logical, if \code{TRUE}, character \code{"NA"} values are converted +into real \code{NA} values.} + +\item{exclude}{See \code{select}, however, column names matched by the pattern +from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), +excludes no columns.} + +\item{append}{Logical, if \code{FALSE} (default), removes original columns that +were united. If \code{TRUE}, all columns are preserved and the new column is +appended to the data frame.} + +\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or +a regular expression is used in \code{select}, ignores lower/upper case in the +search pattern when matching against variable names.} + +\item{verbose}{Toggle warnings.} + +\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be +treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a +character string (or a variable containing a character string) and is not +allowed to be one of the supported select-helpers or a character vector +of length > 1. \code{regex = TRUE} is comparable to using one of the two +select-helpers, \code{select = contains("")} or \code{select = regex("")}, however, +since the select-helpers may not work when called from inside other +functions (see 'Details'), this argument may be used as workaround.} + +\item{...}{Currently not used.} +} +\value{ +A data frame with the newly created variable(s), or - when \code{append = TRUE} - +\code{data} including new variables. +} +\description{ +Separates a single variable into multiple new variables. +} +\examples{ +# simple case +d <- data.frame( + x = c("1.a.6", "2.b.7", "3.c.8"), + stringsAsFactors = FALSE +) +d +data_separate(d, new_columns = c("a", "b", "c")) + +# guess number of columns +d <- data.frame( + x = c("1.a.6", NA, "2.b.6.7", "3.c", "x.y.z"), + stringsAsFactors = FALSE +) +d +data_separate(d, guess_columns = "mode") + +data_separate(d, guess_columns = "max") + +# drop left-most column +data_separate(d, guess_columns = "mode", extra = "drop_left") + +# merge right-most column +data_separate(d, guess_columns = "mode", extra = "merge_right") + +# fill columns with fewer values with left-most values +data_separate(d, guess_columns = "mode", fill = "value_left") + +# fill and merge +data_separate( + d, + guess_columns = "mode", + fill = "value_left", + extra = "merge_right" +) + +# multiple columns to split +d <- data.frame( + x = c("1.a.6", "2.b.7", "3.c.8"), + y = c("x.y.z", "10.11.12", "m.n.o"), + stringsAsFactors = FALSE +) +d +# split two columns, default column names +data_separate(d, guess_columns = "mode") + +# split into new named columns, repeating column names +data_separate(d, new_columns = c("a", "b", "c")) + +# split selected variable new columns +data_separate(d, select = "y", new_columns = c("a", "b", "c")) + +# merge multiple split columns +data_separate( + d, + new_columns = c("a", "b", "c"), + merge_multiple = TRUE +) + +# merge multiple split columns +data_separate( + d, + new_columns = c("a", "b", "c"), + merge_multiple = TRUE, + merge_separator = "-" +) + +# separate multiple columns, give proper column names +d_sep <- data.frame( + x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), + y = c("m.n.99.22", "77.f.g.34", "44.9", NA), + stringsAsFactors = FALSE +) + +data_separate( + d_sep, + select = c("x", "y"), + new_columns = list( + x = c("A", "B", "C"), # separate "x" into three columns + y = c("EE", "FF", "GG", "HH") # separate "y" into four columns + ), + verbose = FALSE +) +} +\seealso{ +\code{\link[=data_unite]{data_unite()}} +} diff --git a/man/data_unite.Rd b/man/data_unite.Rd index fd7112d4d..63c2e73a6 100644 --- a/man/data_unite.Rd +++ b/man/data_unite.Rd @@ -104,3 +104,6 @@ data_unite(d, new_column = "xyz", remove = FALSE) data_unite(d, new_column = "xyz", select = c("x", "z")) data_unite(d, new_column = "xyz", select = c("x", "z"), append = TRUE) } +\seealso{ +\code{\link[=data_separate]{data_separate()}} +} diff --git a/tests/testthat/_snaps/data_separate.md b/tests/testthat/_snaps/data_separate.md new file mode 100644 index 000000000..e46ee77c2 --- /dev/null +++ b/tests/testthat/_snaps/data_separate.md @@ -0,0 +1,154 @@ +# data_separate: multiple columns + + Code + out + Output + x_1 x_2 x_3 y_1 y_2 y_3 + 1 1 a 6 m n 99 + 2 2 b 7 77 f g + 3 3 c 8 44 9 + 4 5 j + +--- + + Code + out + Output + x_1 x_2 x_3 y_1 y_2 y_3 + 1 1 a 6 m n 99 + 2 2 b 7 d 77 f g + 3 3 c 8 44 9 + 4 5 j + +--- + + Code + out + Output + x_A x_B x_C y_A y_B y_C + 1 1 a 6 m n 99 + 2 2 b 7 d 77 f g + 3 3 c 8 44 9 + 4 5 j + +--- + + Code + out + Output + x y x_A x_B x_C y_A y_B y_C + 1 1.a.6 m.n.99 1 a 6 m n 99 + 2 2.b.7.d 77.f.g 2 b 7 d 77 f g + 3 3.c.8 44.9 3 c 8 44 9 + 4 5.j 5 j + +--- + + Code + out + Output + x_1 x_2 x_3 y_1 y_2 y_3 + 1 1 a 6 m n 99 + 2 b 7 d 77 f g + 3 3 c 8 44 9 + 4 5 j + +--- + + Code + out + Output + x y x_A x_B x_C y_A y_B y_C + 1 1.a.6 m.n.99 1 a 6 m n 99 + 2 2.b.7.d 77.f.g 2 b 7 d 77 f g + 3 3.c.8 44.9 3 c 8 44 9 9 + 4 5.j 5 j j + +--- + + Code + out + Output + x y A B C + 1 1.a.6 m.n.99 1m an 699 + 2 2.b.7.d 77.f.g 277 bf 7dg + 3 3.c.8 44.9 344 c9 89 + 4 5.j 5NA jNA jNA + +--- + + Code + out + Output + x y A B C + 1 1.a.6 m.n.99 1m an 699 + 2 2.b.7.d 77.f.g 277 bf 7g + 3 3.c.8 44.9 344 c9 8NA + 4 5.j 5NA jNA NANA + +--- + + Code + out + Output + x_1 x_2 x_3 y_1 y_2 y_3 + 1 1 a 6 m n 99 + 2 2 b 7 77 f g + 3 3 c 8 44 44 9 + 4 5 5 j + +# data_separate: multiple columns, different lengths + + Code + out + Output + A B C EE FF GG + 1 1 a 6 m n 99 + 2 2 b 7 77 f g + 3 3 c 8 44 9 + 4 5 j + +--- + + Code + out + Output + A B C EE FF GG HH + 1 1 a 6 m n 99 22 + 2 2 b 7 77 f g 34 + 3 3 c 8 44 9 + 4 5 j + +# data_separate: fail if invalid column selected + + Code + data_separate(d_sep, guess_columns = "mode", select = NULL) + Message + Column `x` had different number of values after splitting. Variable was + split into 3 columns. + `x` returned more columns than expected after splitting. Right-most + columns have been dropped. + `x`returned fewer columns than expected after splitting. Right-most + columns were filled with `NA`. + Column `y` had different number of values after splitting. Variable was + split into 3 columns. + `y`returned fewer columns than expected after splitting. Right-most + columns were filled with `NA`. + Output + x_1 x_2 x_3 y_1 y_2 y_3 + 1 1 a 6 m n 99 + 2 2 b 7 77 f g + 3 3 c 8 44 9 + 4 5 j + +# data_separate: numeric column + + Code + out + Output + y x_1 x_2 x_3 x_4 + V1 m.n.99 15 435 352 3 + V2 77.f.g 53 554 353 2 + V3 44.9 12 342 422 + V4 15 454 334 535 + diff --git a/tests/testthat/test-data_separate.R b/tests/testthat/test-data_separate.R new file mode 100644 index 000000000..fa49fa3d2 --- /dev/null +++ b/tests/testthat/test-data_separate.R @@ -0,0 +1,336 @@ +test_that("data_separate: simple use case", { + # simple case + d_sep <- data.frame( + x = c("1.a.6", "2.b.7", "3.c.8"), + stringsAsFactors = FALSE + ) + + expect_error(data_separate(d_sep), regex = "Either") + + # basic + expect_silent(data_separate(d_sep, guess_columns = "mode", verbose = FALSE)) + expect_silent( + { + out <- data_separate(d_sep, guess_columns = "mode") + } + ) + expect_identical(colnames(out), c("x_1", "x_2", "x_3")) + expect_identical(out$x_1, c("1", "2", "3")) + expect_identical(out$x_2, c("a", "b", "c")) + + # manual separator char + out2 <- data_separate(d_sep, separator = "\\.", guess_columns = "mode", verbose = FALSE) + expect_identical(out, out2) + + # non-existing separator char + expect_message( + data_separate(d_sep, separator = "_", guess_columns = "mode"), + regex = "Separator probably not found" + ) + + # column names + out <- data_separate(d_sep, new_columns = c("A1", "B2", "C3"), verbose = FALSE) + expect_identical(colnames(out), c("A1", "B2", "C3")) + expect_identical(out$A1, c("1", "2", "3")) + expect_identical(out$B2, c("a", "b", "c")) + + out <- data_separate(d_sep, new_columns = letters[1:3], append = TRUE) + expect_equal( + out, + data.frame( + x = c("1.a.6", "2.b.7", "3.c.8"), + a = c("1", "2", "3"), + b = c("a", "b", "c"), + c = c("6", "7", "8"), + stringsAsFactors = FALSE + ), + ignore_attr = TRUE + ) +}) + + +test_that("data_separate: convert between data_unite and data_separate", { + d_unite <- data.frame( + x = as.character(c(NA, 1:3)), + y = c(letters[1:3], NA_character_), + z = as.character(6:9), + m = c("X", NA_character_, "Y", "Z"), + n = c("NATION", "COUNTRY", "NATION", NA_character_), + stringsAsFactors = FALSE + ) + + out1 <- data_unite(d_unite, new_column = "test") + d_sep <- data_separate(out1, new_columns = c("x", "y", "z", "m", "n"), separator = "_") + + expect_identical(d_unite, d_sep) +}) + + +test_that("data_separate: different number of values", { + d_sep <- data.frame( + x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), + stringsAsFactors = FALSE + ) + + # basic use-case + expect_silent(data_separate(d_sep, guess_columns = "mode", verbose = FALSE)) + expect_message( + expect_message( + expect_message( + data_separate(d_sep, guess_columns = "mode"), + regex = "3 columns" + ), + regex = "have been dropped" + ), + regex = "filled with `NA`" + ) + out <- data_separate(d_sep, guess_columns = "mode", verbose = FALSE) + expect_identical(colnames(out), c("x_1", "x_2", "x_3")) + expect_identical(out$x_1, c("1", "2", "3", "5")) + expect_identical(out$x_2, c("a", "b", "c", "j")) + expect_identical(out$x_3, c("6", "7", "8", NA)) + + # fill missings left + out <- data_separate(d_sep, guess_columns = "mode", fill = "left", verbose = FALSE) + expect_identical(colnames(out), c("x_1", "x_2", "x_3")) + expect_identical(out$x_1, c("1", "2", "3", NA)) + expect_identical(out$x_2, c("a", "b", "c", "5")) + expect_identical(out$x_3, c("6", "7", "8", "j")) + + # merge extra right + out <- data_separate(d_sep, guess_columns = "mode", extra = "merge_right", verbose = FALSE) + expect_identical(colnames(out), c("x_1", "x_2", "x_3")) + expect_identical(out$x_1, c("1", "2", "3", "5")) + expect_identical(out$x_2, c("a", "b", "c", "j")) + expect_identical(out$x_3, c("6", "7 d", "8", NA)) + + # max columns + out <- data_separate(d_sep, guess_columns = "max", verbose = FALSE) + expect_equal( + out, + data.frame( + x_1 = c("1", "2", "3", "5"), + x_2 = c("a", "b", "c", "j"), + x_3 = c("6", "7", "8", NA), + x_4 = c(NA, "d", NA, NA), + stringsAsFactors = FALSE + ), + ignore_attr = TRUE + ) + + # min columns + out <- data_separate(d_sep, guess_columns = "min", verbose = FALSE) + expect_equal( + out, + data.frame( + x_1 = c("1", "2", "3", "5"), + x_2 = c("a", "b", "c", "j"), + stringsAsFactors = FALSE + ), + ignore_attr = TRUE + ) + + out <- data_separate(d_sep, guess_columns = "min", extra = "merge_left", verbose = FALSE) + expect_equal( + out, + data.frame( + x_1 = c("1 a", "2 b 7", "3 c", "5"), + x_2 = c("6", "d", "8", "j"), + stringsAsFactors = FALSE + ), + ignore_attr = TRUE + ) + + out <- data_separate(d_sep, guess_columns = "max", fill = "left", verbose = FALSE) + expect_equal( + out, + data.frame( + x_1 = c(NA, "2", NA, NA), + x_2 = c("1", "b", "3", NA), + x_3 = c("a", "7", "c", "5"), + x_4 = c("6", "d", "8", "j"), + stringsAsFactors = FALSE + ), + ignore_attr = TRUE + ) +}) + + +test_that("data_separate: multiple columns", { + d_sep <- data.frame( + x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), + y = c("m.n.99", "77.f.g", "44.9", NA), + stringsAsFactors = FALSE + ) + + # select works + out <- data_separate(d_sep, select = "x", guess_columns = "mode", verbose = FALSE) + expect_identical(colnames(out), c("y", "x_1", "x_2", "x_3")) + expect_identical(out$x_1, c("1", "2", "3", "5")) + expect_identical(out$x_2, c("a", "b", "c", "j")) + expect_identical(out$x_3, c("6", "7", "8", NA)) + + out <- data_separate(d_sep, guess_columns = "mode", verbose = FALSE) + expect_snapshot(out) + + out <- data_separate(d_sep, guess_columns = "mode", extra = "merge_right", verbose = FALSE) + expect_snapshot(out) + + out <- data_separate(d_sep, new_columns = c("A", "B", "C"), extra = "merge_right", verbose = FALSE) + expect_snapshot(out) + + out <- data_separate(d_sep, new_columns = c("A", "B", "C"), extra = "merge_right", append = TRUE, verbose = FALSE) + expect_snapshot(out) + + out <- data_separate(d_sep, guess_columns = "mode", extra = "drop_left", verbose = FALSE) + expect_snapshot(out) + + out <- data_separate( + d_sep, + new_columns = c("A", "B", "C"), + fill = "value_right", + extra = "merge_right", + append = TRUE, + verbose = FALSE + ) + expect_snapshot(out) + + out <- data_separate( + d_sep, + new_columns = c("A", "B", "C"), + fill = "value_right", + extra = "merge_right", + merge_multiple = TRUE, + append = TRUE, + verbose = FALSE + ) + expect_snapshot(out) + + out <- data_separate( + d_sep, + new_columns = c("A", "B", "C"), + merge_multiple = TRUE, + append = TRUE, + verbose = FALSE + ) + expect_snapshot(out) + + out <- data_separate(d_sep, guess_columns = "mode", fill = "value_left", verbose = FALSE) + expect_snapshot(out) +}) + + +test_that("data_separate: multiple columns, different lengths", { + d_sep <- data.frame( + x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), + y = c("m.n.99.22", "77.f.g.34", "44.9", NA), + stringsAsFactors = FALSE + ) + + # separate column names + out <- data_separate( + d_sep, + select = c("x", "y"), + new_columns = list(x = c("A", "B", "C"), y = c("EE", "FF", "GG")), + verbose = FALSE + ) + expect_named(out, c("A", "B", "C", "EE", "FF", "GG")) + expect_snapshot(out) + + out <- data_separate( + d_sep, + select = c("x", "y"), + new_columns = list(x = c("A", "B", "C"), y = c("EE", "FF", "GG", "HH")), + verbose = FALSE + ) + expect_named(out, c("A", "B", "C", "EE", "FF", "GG", "HH")) + expect_snapshot(out) +}) + + +test_that("data_separate: numeric separator", { + d_sep <- data.frame( + x = c("Thisisalongstring", "Doeshe1losteverything", "Wereme2longornot"), + stringsAsFactors = FALSE + ) + + expect_silent({ + out <- data_separate(d_sep, guess_columns = "mode", separator = c(5, 7, 8, 12), verbose = TRUE) + }) + expect_equal( + out, + data.frame( + x_1 = c("This", "Does", "Were"), + x_2 = c("is", "he", "me"), + x_3 = c("a", "1", "2"), + x_4 = c("long", "lost", "long"), + x_5 = c("string", "everything", "ornot"), + stringsAsFactors = FALSE + ), + ignore_attr = TRUE + ) + + d_sep <- data.frame( + x = c("Thisisalongstring", "Doeshe1losteverything"), + y = c("Wereme2longornot", NA), + stringsAsFactors = FALSE + ) + expect_silent({ + out <- data_separate(d_sep, separator = c(5, 7, 8, 12), new_columns = LETTERS[1:5]) + }) + expect_equal( + out, + data.frame( + A = c("This", "Does"), + B = c("is", "he"), + C = c("a", "1"), + D = c("long", "lost"), + E = c("string", "everything"), + A.1 = c("Were", NA), + B.1 = c("me", NA), + C.1 = c("2", NA), + D.1 = c("long", NA), + E.1 = c("ornot", NA), + stringsAsFactors = FALSE + ), + ignore_attr = TRUE + ) + + expect_error( + data_separate(d_sep, separator = c(5, 7, 8, 12), new_columns = LETTERS[1:6]), + regex = "went wrong" + ) +}) + + +test_that("data_separate: fail if invalid column selected", { + d_sep <- data.frame( + x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"), + y = c("m.n.99", "77.f.g", "44.9", NA), + stringsAsFactors = FALSE + ) + expect_warning( + expect_message( + data_separate(d_sep, guess_columns = "mode", select = "z"), + reg = "not found" + ), + regex = "misspelled?" + ) + expect_identical( + data_separate(d_sep, guess_columns = "mode", select = "z", verbose = FALSE), + d_sep + ) + expect_snapshot(data_separate(d_sep, guess_columns = "mode", select = NULL)) +}) + + +test_that("data_separate: numeric column", { + d_sep <- data.frame( + x = c(154353523, 535543532, 12342422, 15454334535), + y = c("m.n.99", "77.f.g", "44.9", NA), + stringsAsFactors = FALSE + ) + expect_message(data_separate(d_sep, guess_columns = "mode", select = "x"), regex = "Separator probably") + out <- data_separate(d_sep, guess_columns = "mode", select = "x", separator = c(3, 6, 9)) + expect_snapshot(out) +}) diff --git a/vignettes/tidyverse_translation.Rmd b/vignettes/tidyverse_translation.Rmd index d70026b29..b03402468 100644 --- a/vignettes/tidyverse_translation.Rmd +++ b/vignettes/tidyverse_translation.Rmd @@ -98,7 +98,8 @@ Before we look at their *tidyverse* equivalents, we can first have a look at | `data_to_long()` | [to convert data from wide to long](#reshaping) | | `data_to_wide()` | [to convert data from long to wide](#reshaping) | | `data_join()` | [to join two data frames](#joining) | -| `data_unite()` | [to concatenate several columns into a single one](#uniting) | +| `data_unite()` | [to concatenate several columns into a single one](#uniting) | +| `data_separate()` | [to separate a single column into multiple columns](#separating) | Note that there are a few functions in `{datawizard}` that have no strict equivalent in `{dplyr}` or `{tidyr}` (e.g `data_rotate()`), and so we won't discuss them in @@ -122,7 +123,8 @@ Before we look at them individually, let's first have a look at the summary tabl | `data_join()` | `dplyr::inner_join()`, `dplyr::left_join()`, `dplyr::right_join()`, | | | `dplyr::full_join()`, `dplyr::anti_join()`, `dplyr::semi_join()` | | `data_peek()` | `dplyr::glimpse()` | -| `data_unite()` | `tidyr::unite()` | +| `data_unite()` | `tidyr::unite()` | +| `data_separate()` | `tidyr::separate()` | ## Filtering {#filtering} @@ -977,6 +979,65 @@ test %>% ``` +## Separating {#separating} + +Separating variables is the counterpart to uniting variables and is useful to split values into multiple columns, e.g. when splitting a date into values for years, months and days. `data_separate()` offers an interface very close to `tidyr::separate()`: + +```{r eval=evaluate_chunk} +test <- data.frame( + date_arrival = c("2002-02-11", "2003-03-22", "2004-09-28"), + date_departure = c("2002-03-15", "2003-03-28", "2004-09-30"), + stringsAsFactors = FALSE +) +test +``` + + +:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} + +::: {} + +```{r separate1, class.source = "datawizard"} +# ---------- datawizard ----------- +test %>% + data_separate( + select = "date_arrival", + new_columns = c("Year", "Month", "Day") + ) +``` +::: + +::: {} + +```{r, class.source = "tidyverse"} +# ---------- tidyverse ----------- +test %>% + separate( + date_arrival, + into = c("Year", "Month", "Day") + ) +``` +::: + +:::: + +```{r separate1, eval = evaluate_chunk, echo = FALSE} +``` + + +Unlike `tidyr::separate()`, you can separate multiple columns in one step with `data_separate()`. + +```{r eval = evaluate_chunk} +test %>% + data_separate( + new_columns = list( + date_arrival = c("Arr_Year", "Arr_Month", "Arr_Day"), + date_departure = c("Dep_Year", "Dep_Month", "Dep_Day") + ) + ) +``` + + # Other useful functions `{datawizard}` contains other functions that are not necessarily included in