From 1dcec608f908e134e5adb15581b19361af17cd59 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 17:05:14 +0200 Subject: [PATCH 01/12] Draft `row_count()` --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 2 + R/row_count.R | 68 ++++++++++++++++++++++ man/row_count.Rd | 99 +++++++++++++++++++++++++++++++++ pkgdown/_pkgdown.yaml | 1 + tests/testthat/test-row_count.R | 25 +++++++++ 7 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 R/row_count.R create mode 100644 man/row_count.Rd create mode 100644 tests/testthat/test-row_count.R diff --git a/DESCRIPTION b/DESCRIPTION index 4758f601c..00574ecb1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.2 +Version: 0.13.0.4 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NAMESPACE b/NAMESPACE index c435c0cc5..1775af562 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -296,6 +296,7 @@ export(reshape_longer) export(reshape_wider) export(reverse) export(reverse_scale) +export(row_count) export(row_means) export(row_to_colnames) export(rowid_as_column) diff --git a/NEWS.md b/NEWS.md index 388c5a822..b4154449d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ CHANGES variables, can now also be a character vector with quoted variable names, including a colon to indicate a range of several variables (e.g. `"cyl:gear"`). +* New function `row_count()`, to calculate row-wise sums of specific values. + BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute diff --git a/R/row_count.R b/R/row_count.R new file mode 100644 index 000000000..7ece22514 --- /dev/null +++ b/R/row_count.R @@ -0,0 +1,68 @@ +#' @title Row means or sums (optionally with minimum amount of valid values) +#' @name row_count +#' @description `row_count()` mimics base R's `rowSums()`, with sums for a +#' specific value indicated by `count`. Hence, it is equivalent to +#' `rowSums(x == count, na.rm = TRUE)`. +#' +#' @param data A data frame with at least two columns, where number of specific +#' values are counted row-wise. +#' @param count The value for which the row sum should be computed. May be a +#' numeric value, a character string (for factors or character vectors), `NA` or +#' `Inf`. +#' @inheritParams extract_column_names +#' @inheritParams row_means +#' +#' @return A vector with row-wise counts of values specified in `count`. +#' +#' @examples +#' dat <- data.frame( +#' c1 = c(1, 2, NA, 4), +#' c2 = c(NA, 2, NA, 5), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, 8) +#' ) +#' +#' # count all 2s per row +#' row_count(dat, count = 2) +#' # count all missing values per row +#' row_count(dat, count = NA) +#' +#' @export +row_count <- function(data, + select = NULL, + exclude = NULL, + count = NULL, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE) { + # evaluate arguments + select <- .select_nse(select, + data, + exclude, + ignore_case = ignore_case, + regex = regex, + verbose = verbose + ) + + if (is.null(count)) { + insight::format_error("`count` must be a valid value (including `NA` or `Inf`), but not `NULL`.") + } + + if (is.null(select) || length(select) == 0) { + insight::format_error("No columns selected.") + } + + data <- .coerce_to_dataframe(data[select]) + + # check if we have a data framme with at least two columns + if (ncol(data) < 2) { + insight::format_error("`data` must be a data frame with at least two numeric columns.") + } + + # special case: count missing + if (is.na(count)) { + rowSums(is.na(data)) + } else { + rowSums(data == count, na.rm = TRUE) + } +} diff --git a/man/row_count.Rd b/man/row_count.Rd new file mode 100644 index 000000000..820baad8c --- /dev/null +++ b/man/row_count.Rd @@ -0,0 +1,99 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/row_count.R +\name{row_count} +\alias{row_count} +\title{Row means or sums (optionally with minimum amount of valid values)} +\usage{ +row_count( + data, + select = NULL, + exclude = NULL, + count = NULL, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE +) +} +\arguments{ +\item{data}{A data frame with at least two columns, where number of specific +values are counted row-wise.} + +\item{select}{Variables that will be included when performing the required +tasks. Can be either +\itemize{ +\item a variable specified as a literal variable name (e.g., \code{column_name}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), +\item a formula with variable names (e.g., \code{~column_1 + column_2}), +\item a vector of positive integers, giving the positions counting from the left +(e.g. \code{1} or \code{c(1, 3, 5)}), +\item a vector of negative integers, giving the positions counting from the +right (e.g., \code{-1} or \code{-1:-3}), +\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, +\code{contains()}, a range using \code{:} or \code{regex("")}. \code{starts_with()}, +\code{ends_with()}, and \code{contains()} accept several patterns, e.g +\code{starts_with("Sep", "Petal")}. +\item or a function testing for logical conditions, e.g. \code{is.numeric()} (or +\code{is.numeric}), or any user-defined function that selects the variables +for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), +\item ranges specified via literal variable names, select-helpers (except +\code{regex()}) and (user-defined) functions can be negated, i.e. return +non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with("")}, +\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means +that matches are \emph{excluded}, and thus, the \code{exclude} argument can be +used alternatively. For instance, \code{select=-ends_with("Length")} (with +\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case +negation should not work as expected, use the \code{exclude} argument instead. +} + +If \code{NULL}, selects all columns. Patterns that found no matches are silently +ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} +will just return \code{"Species"}.} + +\item{exclude}{See \code{select}, however, column names matched by the pattern +from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), +excludes no columns.} + +\item{count}{The value for which the row sum should be computed. May be a +numeric value, a character string (for factors or character vectors), \code{NA} or +\code{Inf}.} + +\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or +a regular expression is used in \code{select}, ignores lower/upper case in the +search pattern when matching against variable names.} + +\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be +treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a +character string (or a variable containing a character string) and is not +allowed to be one of the supported select-helpers or a character vector +of length > 1. \code{regex = TRUE} is comparable to using one of the two +select-helpers, \code{select = contains("")} or \code{select = regex("")}, however, +since the select-helpers may not work when called from inside other +functions (see 'Details'), this argument may be used as workaround.} + +\item{verbose}{Toggle warnings.} +} +\value{ +A vector with row-wise counts of values specified in \code{count}. +} +\description{ +\code{row_count()} mimics base R's \code{rowSums()}, with sums for a +specific value indicated by \code{count}. Hence, it is equivalent to +\code{rowSums(x == count, na.rm = TRUE)}. +} +\examples{ +dat <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) +) + +# count all 2s per row +row_count(dat, count = 2) +# count all missing values per row +row_count(dat, count = NA) + +} diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml index 6e6feb5b2..31ec901d0 100644 --- a/pkgdown/_pkgdown.yaml +++ b/pkgdown/_pkgdown.yaml @@ -71,6 +71,7 @@ reference: - kurtosis - smoothness - skewness + - row_count - row_means - weighted_mean - mean_sd diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R new file mode 100644 index 000000000..f40c7f69b --- /dev/null +++ b/tests/testthat/test-row_count.R @@ -0,0 +1,25 @@ +test_that("row_count", { + d_mn <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) + ) + expect_identical(row_count(d_mn, count = 2), c(1, 2, 0, 0)) + expect_identical(row_count(d_mn, count = NA), c(2, 0, 3, 1)) + d_mn <- data.frame( + c1 = c("a", "b", NA, "c"), + c2 = c(NA, "b", NA, "d"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf) + ) + expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0)) + expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1)) +}) + +test_that("row_means, errors or messages", { + data(iris) + expect_error(expect_warning(row_count(iris, select = "abc")), regex = "must be a valid") + expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "no columns") + expect_error(row_count(iris[1], count = 3), regex = "with at least") +}) From 05550c777cb773db9f6adb56ad5d336a48a43fbd Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 17:29:56 +0200 Subject: [PATCH 02/12] docs, type safe comparisons --- R/row_count.R | 35 ++++++++++++++++++++++++++++++++- man/row_count.Rd | 23 +++++++++++++++++++++- tests/testthat/test-row_count.R | 16 +++++++++++++-- 3 files changed, 70 insertions(+), 4 deletions(-) diff --git a/R/row_count.R b/R/row_count.R index 7ece22514..f6e946952 100644 --- a/R/row_count.R +++ b/R/row_count.R @@ -2,13 +2,22 @@ #' @name row_count #' @description `row_count()` mimics base R's `rowSums()`, with sums for a #' specific value indicated by `count`. Hence, it is equivalent to -#' `rowSums(x == count, na.rm = TRUE)`. +#' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including +#' strict comparisons: Comparisons using `==` coerce values to atomic vectors, +#' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also +#' possible to make "type safe" comparisons using the `exact` argument, where +#' `"2" == 2` is not treated as identical. #' #' @param data A data frame with at least two columns, where number of specific #' values are counted row-wise. #' @param count The value for which the row sum should be computed. May be a #' numeric value, a character string (for factors or character vectors), `NA` or #' `Inf`. +#' @param exact Logical, if `TRUE`, `count` matches only values of same type +#' (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). +#' By default, when `exact = FALSE`, `count = 2` also matches `"2"`. See +#' 'Examples'. +#' #' @inheritParams extract_column_names #' @inheritParams row_means #' @@ -27,11 +36,23 @@ #' # count all missing values per row #' row_count(dat, count = NA) #' +#' dat <- data.frame( +#' c1 = c("1", "2", NA, "3"), +#' c2 = c(NA, "2", NA, "3"), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, Inf) +#' ) +#' # count all 2s and "2"s per row +#' row_count(dat, count = 2) +#' # only count 2s, but not "2"s +#' row_count(dat, count = 2, exact = TRUE) +#' #' @export row_count <- function(data, select = NULL, exclude = NULL, count = NULL, + exact = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE) { @@ -63,6 +84,18 @@ row_count <- function(data, if (is.na(count)) { rowSums(is.na(data)) } else { + # comparisons in R using == coerce values into a atomic vector, i.e. + # 2 == "2" is TRUE. If `exact = TRUE`, we only want 2 == 2 or "2" == "2". + # to achieve this, we simply compute the comparison on numeric or non-numeric + # columns only + if (isTRUE(exact)) { + numeric_columns <- vapply(data, is.numeric, TRUE) + if (is.numeric(count)) { + data <- data[numeric_columns] + } else { + data <- data[!numeric_columns] + } + } rowSums(data == count, na.rm = TRUE) } } diff --git a/man/row_count.Rd b/man/row_count.Rd index 820baad8c..02389e781 100644 --- a/man/row_count.Rd +++ b/man/row_count.Rd @@ -9,6 +9,7 @@ row_count( select = NULL, exclude = NULL, count = NULL, + exact = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE @@ -60,6 +61,11 @@ excludes no columns.} numeric value, a character string (for factors or character vectors), \code{NA} or \code{Inf}.} +\item{exact}{Logical, if \code{TRUE}, \code{count} matches only values of same type +(i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa). +By default, when \code{exact = FALSE}, \code{count = 2} also matches \code{"2"}. See +'Examples'.} + \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the search pattern when matching against variable names.} @@ -81,7 +87,11 @@ A vector with row-wise counts of values specified in \code{count}. \description{ \code{row_count()} mimics base R's \code{rowSums()}, with sums for a specific value indicated by \code{count}. Hence, it is equivalent to -\code{rowSums(x == count, na.rm = TRUE)}. +\code{rowSums(x == count, na.rm = TRUE)}, but offers some more options, including +strict comparisons: Comparisons using \code{==} coerce values to atomic vectors, +thus both \code{2 == 2} and \code{"2" == 2} are \code{TRUE}. In \code{row_count()}, it is also +possible to make "type safe" comparisons using the \code{exact} argument, where +\code{"2" == 2} is not treated as identical. } \examples{ dat <- data.frame( @@ -96,4 +106,15 @@ row_count(dat, count = 2) # count all missing values per row row_count(dat, count = NA) +dat <- data.frame( + c1 = c("1", "2", NA, "3"), + c2 = c(NA, "2", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf) +) +# count all 2s and "2"s per row +row_count(dat, count = 2) +# only count 2s, but not "2"s +row_count(dat, count = 2, exact = TRUE) + } diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R index f40c7f69b..741d7fc41 100644 --- a/tests/testthat/test-row_count.R +++ b/tests/testthat/test-row_count.R @@ -17,9 +17,21 @@ test_that("row_count", { expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1)) }) -test_that("row_means, errors or messages", { +test_that("row_count, errors or messages", { data(iris) expect_error(expect_warning(row_count(iris, select = "abc")), regex = "must be a valid") - expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "no columns") + expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "No columns") expect_error(row_count(iris[1], count = 3), regex = "with at least") }) + +test_that("row_count, exact match", { + d_mn <- data.frame( + c1 = c("1", "2", NA, "3"), + c2 = c(NA, "2", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf) + ) + expect_identical(row_count(d_mn, count = 2, exact = FALSE), c(1, 2, 0, 0)) + expect_identical(row_count(d_mn, count = 2, exact = TRUE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", exact = TRUE), c(0, 2, 0, 0)) +}) From 4e811ec2471192d23f3370a3aa2b2e14027dde33 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 17:43:18 +0200 Subject: [PATCH 03/12] lintr --- tests/testthat/test-row_count.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R index 741d7fc41..312eb3415 100644 --- a/tests/testthat/test-row_count.R +++ b/tests/testthat/test-row_count.R @@ -11,7 +11,8 @@ test_that("row_count", { c1 = c("a", "b", NA, "c"), c2 = c(NA, "b", NA, "d"), c3 = c(NA, 4, NA, NA), - c4 = c(2, 3, 7, Inf) + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE ) expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0)) expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1)) @@ -29,7 +30,8 @@ test_that("row_count, exact match", { c1 = c("1", "2", NA, "3"), c2 = c(NA, "2", NA, "3"), c3 = c(NA, 4, NA, NA), - c4 = c(2, 3, 7, Inf) + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE ) expect_identical(row_count(d_mn, count = 2, exact = FALSE), c(1, 2, 0, 0)) expect_identical(row_count(d_mn, count = 2, exact = TRUE), c(1, 0, 0, 0)) From 30f6ba5b9b24954f28bc47e3f0fcfc52a0a8d035 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 22:02:08 +0200 Subject: [PATCH 04/12] apply suggestions --- NEWS.md | 2 +- R/row_count.R | 10 +++++----- man/row_count.Rd | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index 69d6f4ea6..da3296536 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,7 +11,7 @@ CHANGES variables, can now also be a character vector with quoted variable names, including a colon to indicate a range of several variables (e.g. `"cyl:gear"`). -* New function `row_count()`, to calculate row-wise sums of specific values. +* New function `row_count()`, to count specific values row-wise. BUG FIXES diff --git a/R/row_count.R b/R/row_count.R index f6e946952..b7e249755 100644 --- a/R/row_count.R +++ b/R/row_count.R @@ -1,19 +1,19 @@ -#' @title Row means or sums (optionally with minimum amount of valid values) +#' @title Count specific values row-wise #' @name row_count #' @description `row_count()` mimics base R's `rowSums()`, with sums for a #' specific value indicated by `count`. Hence, it is equivalent to #' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including -#' strict comparisons: Comparisons using `==` coerce values to atomic vectors, +#' strict comparisons. Comparisons using `==` coerce values to atomic vectors, #' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also #' possible to make "type safe" comparisons using the `exact` argument, where -#' `"2" == 2` is not treated as identical. +#' `"2" == 2` is not true. #' #' @param data A data frame with at least two columns, where number of specific #' values are counted row-wise. #' @param count The value for which the row sum should be computed. May be a #' numeric value, a character string (for factors or character vectors), `NA` or #' `Inf`. -#' @param exact Logical, if `TRUE`, `count` matches only values of same type +#' @param exact Logical. If `TRUE`, `count` matches only values of same type #' (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). #' By default, when `exact = FALSE`, `count = 2` also matches `"2"`. See #' 'Examples'. @@ -32,7 +32,7 @@ #' ) #' #' # count all 2s per row -#' row_count(dat, count = 2) +#' row_count(dat, count = 4) #' # count all missing values per row #' row_count(dat, count = NA) #' diff --git a/man/row_count.Rd b/man/row_count.Rd index 02389e781..c6a2bbdbc 100644 --- a/man/row_count.Rd +++ b/man/row_count.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/row_count.R \name{row_count} \alias{row_count} -\title{Row means or sums (optionally with minimum amount of valid values)} +\title{Count specific values row-wise} \usage{ row_count( data, @@ -61,7 +61,7 @@ excludes no columns.} numeric value, a character string (for factors or character vectors), \code{NA} or \code{Inf}.} -\item{exact}{Logical, if \code{TRUE}, \code{count} matches only values of same type +\item{exact}{Logical. If \code{TRUE}, \code{count} matches only values of same type (i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa). By default, when \code{exact = FALSE}, \code{count = 2} also matches \code{"2"}. See 'Examples'.} @@ -88,10 +88,10 @@ A vector with row-wise counts of values specified in \code{count}. \code{row_count()} mimics base R's \code{rowSums()}, with sums for a specific value indicated by \code{count}. Hence, it is equivalent to \code{rowSums(x == count, na.rm = TRUE)}, but offers some more options, including -strict comparisons: Comparisons using \code{==} coerce values to atomic vectors, +strict comparisons. Comparisons using \code{==} coerce values to atomic vectors, thus both \code{2 == 2} and \code{"2" == 2} are \code{TRUE}. In \code{row_count()}, it is also possible to make "type safe" comparisons using the \code{exact} argument, where -\code{"2" == 2} is not treated as identical. +\code{"2" == 2} is not true. } \examples{ dat <- data.frame( @@ -102,7 +102,7 @@ dat <- data.frame( ) # count all 2s per row -row_count(dat, count = 2) +row_count(dat, count = 4) # count all missing values per row row_count(dat, count = NA) From b45a51797066555c8524ffead629e40edbc9b582 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 22:04:46 +0200 Subject: [PATCH 05/12] add test --- R/row_count.R | 5 +++++ tests/testthat/test-row_count.R | 1 + 2 files changed, 6 insertions(+) diff --git a/R/row_count.R b/R/row_count.R index b7e249755..d63596472 100644 --- a/R/row_count.R +++ b/R/row_count.R @@ -75,6 +75,11 @@ row_count <- function(data, data <- .coerce_to_dataframe(data[select]) + # check if we have a data framme with at least two columns + if (nrow(data) < 1) { + insight::format_error("`data` must be a data frame with at least one row.") + } + # check if we have a data framme with at least two columns if (ncol(data) < 2) { insight::format_error("`data` must be a data frame with at least two numeric columns.") diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R index 312eb3415..91b7ddd49 100644 --- a/tests/testthat/test-row_count.R +++ b/tests/testthat/test-row_count.R @@ -23,6 +23,7 @@ test_that("row_count, errors or messages", { expect_error(expect_warning(row_count(iris, select = "abc")), regex = "must be a valid") expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "No columns") expect_error(row_count(iris[1], count = 3), regex = "with at least") + expect_error(row_count(d_mn[-c(1:4), , drop = FALSE], count = 2), regex = "one row") }) test_that("row_count, exact match", { From f820ffa7b45889daae2de0ac6fe7d849a08550a0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 22:11:53 +0200 Subject: [PATCH 06/12] fix test --- tests/testthat/test-row_count.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R index 91b7ddd49..7eea5d6a6 100644 --- a/tests/testthat/test-row_count.R +++ b/tests/testthat/test-row_count.R @@ -23,7 +23,7 @@ test_that("row_count, errors or messages", { expect_error(expect_warning(row_count(iris, select = "abc")), regex = "must be a valid") expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "No columns") expect_error(row_count(iris[1], count = 3), regex = "with at least") - expect_error(row_count(d_mn[-c(1:4), , drop = FALSE], count = 2), regex = "one row") + expect_error(row_count(iris[-seq_len(nrow(iris)), , drop = FALSE], count = 2), regex = "one row") }) test_that("row_count, exact match", { From 3a9b8a6aa3adc82b71ec85aa4ea5f418e83b9afd Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 22:13:58 +0200 Subject: [PATCH 07/12] rename arg --- R/row_count.R | 24 ++++++++++++------------ man/row_count.Rd | 16 ++++++++-------- tests/testthat/test-row_count.R | 8 ++++---- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/R/row_count.R b/R/row_count.R index d63596472..93f1ebbca 100644 --- a/R/row_count.R +++ b/R/row_count.R @@ -5,18 +5,18 @@ #' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including #' strict comparisons. Comparisons using `==` coerce values to atomic vectors, #' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also -#' possible to make "type safe" comparisons using the `exact` argument, where -#' `"2" == 2` is not true. +#' possible to make "type safe" comparisons using the `allow_coercion` argument, +#' where `"2" == 2` is not true. #' #' @param data A data frame with at least two columns, where number of specific #' values are counted row-wise. #' @param count The value for which the row sum should be computed. May be a #' numeric value, a character string (for factors or character vectors), `NA` or #' `Inf`. -#' @param exact Logical. If `TRUE`, `count` matches only values of same type -#' (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). -#' By default, when `exact = FALSE`, `count = 2` also matches `"2"`. See -#' 'Examples'. +#' @param allow_coercion Logical. If `TRUE`, `count` matches only values of same +#' type (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). +#' By default, when `allow_coercion = FALSE`, `count = 2` also matches `"2"`. +#' See 'Examples'. #' #' @inheritParams extract_column_names #' @inheritParams row_means @@ -45,14 +45,14 @@ #' # count all 2s and "2"s per row #' row_count(dat, count = 2) #' # only count 2s, but not "2"s -#' row_count(dat, count = 2, exact = TRUE) +#' row_count(dat, count = 2, allow_coercion = TRUE) #' #' @export row_count <- function(data, select = NULL, exclude = NULL, count = NULL, - exact = FALSE, + allow_coercion = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE) { @@ -90,10 +90,10 @@ row_count <- function(data, rowSums(is.na(data)) } else { # comparisons in R using == coerce values into a atomic vector, i.e. - # 2 == "2" is TRUE. If `exact = TRUE`, we only want 2 == 2 or "2" == "2". - # to achieve this, we simply compute the comparison on numeric or non-numeric - # columns only - if (isTRUE(exact)) { + # 2 == "2" is TRUE. If `allow_coercion = TRUE`, we only want 2 == 2 or + # "2" == "2". to achieve this, we simply compute the comparison on numeric + # or non-numeric columns only + if (isTRUE(allow_coercion)) { numeric_columns <- vapply(data, is.numeric, TRUE) if (is.numeric(count)) { data <- data[numeric_columns] diff --git a/man/row_count.Rd b/man/row_count.Rd index c6a2bbdbc..cfcb051f1 100644 --- a/man/row_count.Rd +++ b/man/row_count.Rd @@ -9,7 +9,7 @@ row_count( select = NULL, exclude = NULL, count = NULL, - exact = FALSE, + allow_coercion = FALSE, ignore_case = FALSE, regex = FALSE, verbose = TRUE @@ -61,10 +61,10 @@ excludes no columns.} numeric value, a character string (for factors or character vectors), \code{NA} or \code{Inf}.} -\item{exact}{Logical. If \code{TRUE}, \code{count} matches only values of same type -(i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa). -By default, when \code{exact = FALSE}, \code{count = 2} also matches \code{"2"}. See -'Examples'.} +\item{allow_coercion}{Logical. If \code{TRUE}, \code{count} matches only values of same +type (i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa). +By default, when \code{allow_coercion = FALSE}, \code{count = 2} also matches \code{"2"}. +See 'Examples'.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the @@ -90,8 +90,8 @@ specific value indicated by \code{count}. Hence, it is equivalent to \code{rowSums(x == count, na.rm = TRUE)}, but offers some more options, including strict comparisons. Comparisons using \code{==} coerce values to atomic vectors, thus both \code{2 == 2} and \code{"2" == 2} are \code{TRUE}. In \code{row_count()}, it is also -possible to make "type safe" comparisons using the \code{exact} argument, where -\code{"2" == 2} is not true. +possible to make "type safe" comparisons using the \code{allow_coercion} argument, +where \code{"2" == 2} is not true. } \examples{ dat <- data.frame( @@ -115,6 +115,6 @@ dat <- data.frame( # count all 2s and "2"s per row row_count(dat, count = 2) # only count 2s, but not "2"s -row_count(dat, count = 2, exact = TRUE) +row_count(dat, count = 2, allow_coercion = TRUE) } diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R index 7eea5d6a6..57c7e76ed 100644 --- a/tests/testthat/test-row_count.R +++ b/tests/testthat/test-row_count.R @@ -26,7 +26,7 @@ test_that("row_count, errors or messages", { expect_error(row_count(iris[-seq_len(nrow(iris)), , drop = FALSE], count = 2), regex = "one row") }) -test_that("row_count, exact match", { +test_that("row_count, allow_coercion match", { d_mn <- data.frame( c1 = c("1", "2", NA, "3"), c2 = c(NA, "2", NA, "3"), @@ -34,7 +34,7 @@ test_that("row_count, exact match", { c4 = c(2, 3, 7, Inf), stringsAsFactors = FALSE ) - expect_identical(row_count(d_mn, count = 2, exact = FALSE), c(1, 2, 0, 0)) - expect_identical(row_count(d_mn, count = 2, exact = TRUE), c(1, 0, 0, 0)) - expect_identical(row_count(d_mn, count = "2", exact = TRUE), c(0, 2, 0, 0)) + expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 2, 0, 0)) + expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = TRUE), c(0, 2, 0, 0)) }) From 640557ca3d91b1076e88adc76f3844d8687e7aab Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 22:15:08 +0200 Subject: [PATCH 08/12] switch TRUE and FALSE --- R/row_count.R | 6 +++--- man/row_count.Rd | 2 +- tests/testthat/test-row_count.R | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/R/row_count.R b/R/row_count.R index 93f1ebbca..0635688c8 100644 --- a/R/row_count.R +++ b/R/row_count.R @@ -52,7 +52,7 @@ row_count <- function(data, select = NULL, exclude = NULL, count = NULL, - allow_coercion = FALSE, + allow_coercion = TRUE, ignore_case = FALSE, regex = FALSE, verbose = TRUE) { @@ -90,10 +90,10 @@ row_count <- function(data, rowSums(is.na(data)) } else { # comparisons in R using == coerce values into a atomic vector, i.e. - # 2 == "2" is TRUE. If `allow_coercion = TRUE`, we only want 2 == 2 or + # 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or # "2" == "2". to achieve this, we simply compute the comparison on numeric # or non-numeric columns only - if (isTRUE(allow_coercion)) { + if (isFALSE(allow_coercion)) { numeric_columns <- vapply(data, is.numeric, TRUE) if (is.numeric(count)) { data <- data[numeric_columns] diff --git a/man/row_count.Rd b/man/row_count.Rd index cfcb051f1..8e66ba9bb 100644 --- a/man/row_count.Rd +++ b/man/row_count.Rd @@ -9,7 +9,7 @@ row_count( select = NULL, exclude = NULL, count = NULL, - allow_coercion = FALSE, + allow_coercion = TRUE, ignore_case = FALSE, regex = FALSE, verbose = TRUE diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R index 57c7e76ed..46125587b 100644 --- a/tests/testthat/test-row_count.R +++ b/tests/testthat/test-row_count.R @@ -34,7 +34,7 @@ test_that("row_count, allow_coercion match", { c4 = c(2, 3, 7, Inf), stringsAsFactors = FALSE ) - expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 2, 0, 0)) - expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 0, 0, 0)) - expect_identical(row_count(d_mn, count = "2", allow_coercion = TRUE), c(0, 2, 0, 0)) + expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 2, 0, 0)) + expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(0, 2, 0, 0)) }) From cbad289b2cd9b932aa40013bff79f65ac7ed65c6 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 22:15:57 +0200 Subject: [PATCH 09/12] update docs --- R/row_count.R | 4 ++-- man/row_count.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/row_count.R b/R/row_count.R index 0635688c8..b4ac2f385 100644 --- a/R/row_count.R +++ b/R/row_count.R @@ -13,9 +13,9 @@ #' @param count The value for which the row sum should be computed. May be a #' numeric value, a character string (for factors or character vectors), `NA` or #' `Inf`. -#' @param allow_coercion Logical. If `TRUE`, `count` matches only values of same +#' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same #' type (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). -#' By default, when `allow_coercion = FALSE`, `count = 2` also matches `"2"`. +#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. #' See 'Examples'. #' #' @inheritParams extract_column_names diff --git a/man/row_count.Rd b/man/row_count.Rd index 8e66ba9bb..3bd2a0281 100644 --- a/man/row_count.Rd +++ b/man/row_count.Rd @@ -61,9 +61,9 @@ excludes no columns.} numeric value, a character string (for factors or character vectors), \code{NA} or \code{Inf}.} -\item{allow_coercion}{Logical. If \code{TRUE}, \code{count} matches only values of same +\item{allow_coercion}{Logical. If \code{FALSE}, \code{count} matches only values of same type (i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa). -By default, when \code{allow_coercion = FALSE}, \code{count = 2} also matches \code{"2"}. +By default, when \code{allow_coercion = TRUE}, \code{count = 2} also matches \code{"2"}. See 'Examples'.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or From 867e2377ca35962f576f7aeb46db7d78508f3286 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 23:02:56 +0200 Subject: [PATCH 10/12] resolve comment --- R/row_count.R | 44 +++++++++++++++++++++++---------- man/row_count.Rd | 22 +++++++++++++---- tests/testthat/test-row_count.R | 17 +++++++++++++ 3 files changed, 65 insertions(+), 18 deletions(-) diff --git a/R/row_count.R b/R/row_count.R index b4ac2f385..a7dd509ca 100644 --- a/R/row_count.R +++ b/R/row_count.R @@ -1,7 +1,7 @@ #' @title Count specific values row-wise #' @name row_count #' @description `row_count()` mimics base R's `rowSums()`, with sums for a -#' specific value indicated by `count`. Hence, it is equivalent to +#' specific value indicated by `count`. Hence, it is similar to #' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including #' strict comparisons. Comparisons using `==` coerce values to atomic vectors, #' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also @@ -14,9 +14,10 @@ #' numeric value, a character string (for factors or character vectors), `NA` or #' `Inf`. #' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same -#' type (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). -#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. -#' See 'Examples'. +#' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). +#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In +#' order to count factor levels in the data, use `count = factor("level")`. See +#' 'Examples'. #' #' @inheritParams extract_column_names #' @inheritParams row_means @@ -45,7 +46,18 @@ #' # count all 2s and "2"s per row #' row_count(dat, count = 2) #' # only count 2s, but not "2"s -#' row_count(dat, count = 2, allow_coercion = TRUE) +#' row_count(dat, count = 2, allow_coercion = FALSE) +#' +#' dat <- data.frame( +#' c1 = factor(c("1", "2", NA, "3")), +#' c2 = c("2", "1", NA, "3"), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, Inf) +#' ) +#' # find only character "2"s +#' row_count(dat, count = "2", allow_coercion = FALSE) +#' # find only factor level "2"s +#' row_count(dat, count = factor("2"), allow_coercion = FALSE) #' #' @export row_count <- function(data, @@ -84,23 +96,29 @@ row_count <- function(data, if (ncol(data) < 2) { insight::format_error("`data` must be a data frame with at least two numeric columns.") } - # special case: count missing if (is.na(count)) { rowSums(is.na(data)) } else { # comparisons in R using == coerce values into a atomic vector, i.e. # 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or - # "2" == "2". to achieve this, we simply compute the comparison on numeric - # or non-numeric columns only + # "2" == "2" (i.e. we want exact types to be compared only) if (isFALSE(allow_coercion)) { - numeric_columns <- vapply(data, is.numeric, TRUE) - if (is.numeric(count)) { - data <- data[numeric_columns] - } else { - data <- data[!numeric_columns] + # we need the "type" of the count-value - we use class() instead of typeof(), + # because the latter sometimes returns unsuitable classes/types. compare + # typeof(as.Date("2020-01-01")), which returns "double". + count_type <- class(count)[1] + valid_columns <- vapply(data, function(i) identical(class(i)[1], count_type), TRUE) + # check if any columns left? + if (!any(valid_columns)) { + insight::format_error("No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`.") # nolint } + data <- data[valid_columns] } + # coerce - we have only valid columns anyway, and we need to coerce factors + # to vectors, else comparison with `==` errors. + count <- as.vector(count) + # finally, count rowSums(data == count, na.rm = TRUE) } } diff --git a/man/row_count.Rd b/man/row_count.Rd index 3bd2a0281..d24404f5f 100644 --- a/man/row_count.Rd +++ b/man/row_count.Rd @@ -62,9 +62,10 @@ numeric value, a character string (for factors or character vectors), \code{NA} \code{Inf}.} \item{allow_coercion}{Logical. If \code{FALSE}, \code{count} matches only values of same -type (i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa). -By default, when \code{allow_coercion = TRUE}, \code{count = 2} also matches \code{"2"}. -See 'Examples'.} +class (i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa). +By default, when \code{allow_coercion = TRUE}, \code{count = 2} also matches \code{"2"}. In +order to count factor levels in the data, use \code{count = factor("level")}. See +'Examples'.} \item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or a regular expression is used in \code{select}, ignores lower/upper case in the @@ -86,7 +87,7 @@ A vector with row-wise counts of values specified in \code{count}. } \description{ \code{row_count()} mimics base R's \code{rowSums()}, with sums for a -specific value indicated by \code{count}. Hence, it is equivalent to +specific value indicated by \code{count}. Hence, it is similar to \code{rowSums(x == count, na.rm = TRUE)}, but offers some more options, including strict comparisons. Comparisons using \code{==} coerce values to atomic vectors, thus both \code{2 == 2} and \code{"2" == 2} are \code{TRUE}. In \code{row_count()}, it is also @@ -115,6 +116,17 @@ dat <- data.frame( # count all 2s and "2"s per row row_count(dat, count = 2) # only count 2s, but not "2"s -row_count(dat, count = 2, allow_coercion = TRUE) +row_count(dat, count = 2, allow_coercion = FALSE) + +dat <- data.frame( + c1 = factor(c("1", "2", NA, "3")), + c2 = c("2", "1", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf) +) +# find only character "2"s +row_count(dat, count = "2", allow_coercion = FALSE) +# find only factor level "2"s +row_count(dat, count = factor("2"), allow_coercion = FALSE) } diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R index 46125587b..0c7d67691 100644 --- a/tests/testthat/test-row_count.R +++ b/tests/testthat/test-row_count.R @@ -37,4 +37,21 @@ test_that("row_count, allow_coercion match", { expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 2, 0, 0)) expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0)) expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(0, 2, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(1, 2, 0, 0)) + expect_error(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), regex = "No column has") + + # mix character / factor + d_mn <- data.frame( + c1 = factor(c("1", "2", NA, "3")), + c2 = c("2", "1", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE + ) + expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(2, 1, 0, 0)) + expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = TRUE), c(2, 1, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), c(0, 1, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(2, 1, 0, 0)) }) From 0e4e256be31eb6117a31c5aee0ec28194faad1ff Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 23:40:24 +0200 Subject: [PATCH 11/12] comments --- R/row_count.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/row_count.R b/R/row_count.R index a7dd509ca..5ab971795 100644 --- a/R/row_count.R +++ b/R/row_count.R @@ -108,7 +108,7 @@ row_count <- function(data, # because the latter sometimes returns unsuitable classes/types. compare # typeof(as.Date("2020-01-01")), which returns "double". count_type <- class(count)[1] - valid_columns <- vapply(data, function(i) identical(class(i)[1], count_type), TRUE) + valid_columns <- vapply(data, inherits, TRUE, what = count_type) # check if any columns left? if (!any(valid_columns)) { insight::format_error("No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`.") # nolint From dc517121912f80b710f91c8cca0a4d9225ec65b5 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 11 Oct 2024 08:13:55 +0200 Subject: [PATCH 12/12] typo --- R/row_count.R | 2 +- man/row_count.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/row_count.R b/R/row_count.R index 5ab971795..02b1c16dc 100644 --- a/R/row_count.R +++ b/R/row_count.R @@ -32,7 +32,7 @@ #' c4 = c(2, 3, 7, 8) #' ) #' -#' # count all 2s per row +#' # count all 4s per row #' row_count(dat, count = 4) #' # count all missing values per row #' row_count(dat, count = NA) diff --git a/man/row_count.Rd b/man/row_count.Rd index d24404f5f..7bf54fe5f 100644 --- a/man/row_count.Rd +++ b/man/row_count.Rd @@ -102,7 +102,7 @@ dat <- data.frame( c4 = c(2, 3, 7, 8) ) -# count all 2s per row +# count all 4s per row row_count(dat, count = 4) # count all missing values per row row_count(dat, count = NA)