Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add row_count() to count specific values row-wise #553

Merged
merged 13 commits into from
Oct 11, 2024
Merged
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.13.0.5
Version: 0.13.0.6
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ export(reshape_longer)
export(reshape_wider)
export(reverse)
export(reverse_scale)
export(row_count)
export(row_means)
export(row_to_colnames)
export(rowid_as_column)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ CHANGES
variables, can now also be a character vector with quoted variable names,
including a colon to indicate a range of several variables (e.g. `"cyl:gear"`).

* New function `row_count()`, to count specific values row-wise.

BUG FIXES

* `describe_distribution()` no longer errors if the sample was too sparse to compute
Expand Down
124 changes: 124 additions & 0 deletions R/row_count.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#' @title Count specific values row-wise
#' @name row_count
#' @description `row_count()` mimics base R's `rowSums()`, with sums for a
#' specific value indicated by `count`. Hence, it is similar to
#' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including
#' strict comparisons. Comparisons using `==` coerce values to atomic vectors,
#' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also
#' possible to make "type safe" comparisons using the `allow_coercion` argument,
#' where `"2" == 2` is not true.
#'
#' @param data A data frame with at least two columns, where number of specific
#' values are counted row-wise.
#' @param count The value for which the row sum should be computed. May be a
#' numeric value, a character string (for factors or character vectors), `NA` or
#' `Inf`.
#' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same
#' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa).
#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In
#' order to count factor levels in the data, use `count = factor("level")`. See
#' 'Examples'.
#'
#' @inheritParams extract_column_names
#' @inheritParams row_means
#'
#' @return A vector with row-wise counts of values specified in `count`.
#'
#' @examples
#' dat <- data.frame(
#' c1 = c(1, 2, NA, 4),
#' c2 = c(NA, 2, NA, 5),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, 8)
#' )
#'
#' # count all 4s per row
#' row_count(dat, count = 4)
#' # count all missing values per row
#' row_count(dat, count = NA)
#'
#' dat <- data.frame(
#' c1 = c("1", "2", NA, "3"),
#' c2 = c(NA, "2", NA, "3"),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, Inf)
#' )
#' # count all 2s and "2"s per row
#' row_count(dat, count = 2)
strengejacke marked this conversation as resolved.
Show resolved Hide resolved
#' # only count 2s, but not "2"s
#' row_count(dat, count = 2, allow_coercion = FALSE)
#'
#' dat <- data.frame(
#' c1 = factor(c("1", "2", NA, "3")),
#' c2 = c("2", "1", NA, "3"),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, Inf)
#' )
#' # find only character "2"s
#' row_count(dat, count = "2", allow_coercion = FALSE)
#' # find only factor level "2"s
#' row_count(dat, count = factor("2"), allow_coercion = FALSE)
#'
#' @export
row_count <- function(data,
select = NULL,
exclude = NULL,
count = NULL,
allow_coercion = TRUE,
ignore_case = FALSE,
regex = FALSE,
verbose = TRUE) {
# evaluate arguments
select <- .select_nse(select,
data,
exclude,
ignore_case = ignore_case,
regex = regex,
verbose = verbose
)

if (is.null(count)) {
insight::format_error("`count` must be a valid value (including `NA` or `Inf`), but not `NULL`.")
}

if (is.null(select) || length(select) == 0) {
insight::format_error("No columns selected.")
}

data <- .coerce_to_dataframe(data[select])

# check if we have a data framme with at least two columns
if (nrow(data) < 1) {
insight::format_error("`data` must be a data frame with at least one row.")
}

# check if we have a data framme with at least two columns
if (ncol(data) < 2) {
insight::format_error("`data` must be a data frame with at least two numeric columns.")
}
# special case: count missing
if (is.na(count)) {
rowSums(is.na(data))
} else {
# comparisons in R using == coerce values into a atomic vector, i.e.
# 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or
# "2" == "2" (i.e. we want exact types to be compared only)
if (isFALSE(allow_coercion)) {
# we need the "type" of the count-value - we use class() instead of typeof(),
# because the latter sometimes returns unsuitable classes/types. compare
# typeof(as.Date("2020-01-01")), which returns "double".
count_type <- class(count)[1]
valid_columns <- vapply(data, inherits, TRUE, what = count_type)
# check if any columns left?
if (!any(valid_columns)) {
insight::format_error("No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`.") # nolint
}
data <- data[valid_columns]
}
# coerce - we have only valid columns anyway, and we need to coerce factors
# to vectors, else comparison with `==` errors.
count <- as.vector(count)
# finally, count
rowSums(data == count, na.rm = TRUE)
}
}
132 changes: 132 additions & 0 deletions man/row_count.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkgdown/_pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ reference:
- kurtosis
- smoothness
- skewness
- row_count
- row_means
- weighted_mean
- mean_sd
Expand Down
57 changes: 57 additions & 0 deletions tests/testthat/test-row_count.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
test_that("row_count", {
d_mn <- data.frame(
c1 = c(1, 2, NA, 4),
c2 = c(NA, 2, NA, 5),
c3 = c(NA, 4, NA, NA),
c4 = c(2, 3, 7, 8)
)
expect_identical(row_count(d_mn, count = 2), c(1, 2, 0, 0))
expect_identical(row_count(d_mn, count = NA), c(2, 0, 3, 1))
strengejacke marked this conversation as resolved.
Show resolved Hide resolved
d_mn <- data.frame(
c1 = c("a", "b", NA, "c"),
c2 = c(NA, "b", NA, "d"),
c3 = c(NA, 4, NA, NA),
c4 = c(2, 3, 7, Inf),
stringsAsFactors = FALSE
)
expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0))
expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1))
})

test_that("row_count, errors or messages", {
data(iris)
expect_error(expect_warning(row_count(iris, select = "abc")), regex = "must be a valid")
expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "No columns")
expect_error(row_count(iris[1], count = 3), regex = "with at least")
expect_error(row_count(iris[-seq_len(nrow(iris)), , drop = FALSE], count = 2), regex = "one row")
})

test_that("row_count, allow_coercion match", {
d_mn <- data.frame(
c1 = c("1", "2", NA, "3"),
c2 = c(NA, "2", NA, "3"),
c3 = c(NA, 4, NA, NA),
c4 = c(2, 3, 7, Inf),
stringsAsFactors = FALSE
)
expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 2, 0, 0))
expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0))
expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(0, 2, 0, 0))
expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(1, 2, 0, 0))
expect_error(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), regex = "No column has")

# mix character / factor
d_mn <- data.frame(
c1 = factor(c("1", "2", NA, "3")),
c2 = c("2", "1", NA, "3"),
c3 = c(NA, 4, NA, NA),
c4 = c(2, 3, 7, Inf),
stringsAsFactors = FALSE
)
expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(2, 1, 0, 0))
expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0))
expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(1, 0, 0, 0))
expect_identical(row_count(d_mn, count = "2", allow_coercion = TRUE), c(2, 1, 0, 0))
expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), c(0, 1, 0, 0))
expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(2, 1, 0, 0))
})
Loading