From 23bbb402adbc2562de1b4c6d47ea5eaf2c0d1d1a Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Fri, 6 Jan 2023 10:34:55 +0100 Subject: [PATCH 1/4] replace `interaction()` --- R/group_utils.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/group_utils.R b/R/group_utils.R index 4859841..e41d828 100644 --- a/R/group_utils.R +++ b/R/group_utils.R @@ -65,8 +65,13 @@ calculate_groups <- function(data, groups, drop = group_by_drop_default(data)) { n_comb <- nrow(unique_groups) rows <- rep(list(NA), n_comb) data_groups <- interaction(data[, groups, drop = TRUE]) + + # Concatenate group variables + pasted_groups <- do.call(paste, c(unique_groups[, groups, drop = FALSE], sep = ".")) + pasted_groups[is.na(unique_groups)] <- NA + for (i in seq_len(n_comb)) { - rows[[i]] <- which(data_groups %in% interaction(unique_groups[i, groups])) + rows[[i]] <- which(data_groups %in% pasted_groups[i]) } if (!isTRUE(drop) && any(is_factor)) { From 59bf4d9a815ab90ee355554c297ea9c4f6fa0982 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Fri, 6 Jan 2023 13:45:13 +0100 Subject: [PATCH 2/4] add some tests when several groups contain NA --- inst/tinytest/test_group_by.R | 44 +++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/inst/tinytest/test_group_by.R b/inst/tinytest/test_group_by.R index eeb56b2..a59ade1 100644 --- a/inst/tinytest/test_group_by.R +++ b/inst/tinytest/test_group_by.R @@ -154,8 +154,48 @@ df <- data.frame(x = 1:2, y = 1:2) %>% structure(class = c("grouped_df", "data.frame")) expect_true(group_by_drop_default(df), info = "group_by_drop_default() is forgiving about corrupt grouped df") + +# with NA in groups --------------------------------------------------- + +# One group res <- data.frame(x = c("apple", NA, "banana"), y = 1:3, stringsAsFactors = FALSE) %>% group_by(x) %>% group_data() -expect_identical(res$x, c("apple", "banana", NA_character_), info = "group_by() puts NA groups last in STRSXP") -expect_identical(res$.rows, list(1L, 3L, 2L), info = "group_by() puts NA groups last in STRSXP") + +expect_identical( + res$x, + c("apple", "banana", NA_character_), + info = "group_by() puts NA groups last in STRSXP" +) +expect_identical( + res$.rows, + list(1L, 3L, 2L), + info = "group_by() puts NA groups last in STRSXP" +) + +# Several groups +d <- data.frame( + orig = rep(c("France", "UK"), each = 4), + dest = rep(c("Spain", "Germany"), times = 4), + year = rep(c(2010, 2011), each = 4), + value = 1:8 +) +d[2, 1] <- NA +d[7, 2] <- NA + +res <- d %>% + group_by(orig, dest) %>% + group_data() + +expect_identical(nrow(res), 6L) +expect_identical( + res[5:6, 1:2], + structure( + data.frame(orig = c("UK", NA), dest = c(NA, "Germany")), + row.names = 5:6 + ) +) +expect_identical( + vapply(res$.rows, length, FUN.VALUE = numeric(1L)), + c(1, 2, 2, 1, 0, 2) +) From 3584621515274dc4e5bd2bd2f490030e6dd28e28 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Fri, 6 Jan 2023 13:47:43 +0100 Subject: [PATCH 3/4] fix previous test --- inst/tinytest/test_group_by.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tinytest/test_group_by.R b/inst/tinytest/test_group_by.R index a59ade1..8d1aeb1 100644 --- a/inst/tinytest/test_group_by.R +++ b/inst/tinytest/test_group_by.R @@ -177,7 +177,7 @@ expect_identical( d <- data.frame( orig = rep(c("France", "UK"), each = 4), dest = rep(c("Spain", "Germany"), times = 4), - year = rep(c(2010, 2011), each = 4), + year = rep(rep(c(2010, 2011), each = 2), 2), value = 1:8 ) d[2, 1] <- NA From e6e28e81082abf4a21cdd42e012f69b402300b3d Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Fri, 6 Jan 2023 14:42:54 +0100 Subject: [PATCH 4/4] fix case where there are NA in several groups --- R/group_utils.R | 14 ++++++++------ inst/tinytest/test_group_by.R | 12 +++++++++--- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/R/group_utils.R b/R/group_utils.R index e41d828..8209dff 100644 --- a/R/group_utils.R +++ b/R/group_utils.R @@ -63,16 +63,18 @@ calculate_groups <- function(data, groups, drop = group_by_drop_default(data)) { unique_groups <- unique(data[, groups, drop = FALSE]) is_factor <- do.call(c, lapply(unique_groups, function(x) is.factor(x))) n_comb <- nrow(unique_groups) - rows <- rep(list(NA), n_comb) - data_groups <- interaction(data[, groups, drop = TRUE]) + temp_id <- paste(sample(letters), collapse = "") + # Concatenate group variables - pasted_groups <- do.call(paste, c(unique_groups[, groups, drop = FALSE], sep = ".")) - pasted_groups[is.na(unique_groups)] <- NA - + pasted_groups <- do.call(paste, c(unique_groups, sep = ".")) + data[[temp_id]] <- do.call(paste, c(data[, groups, drop = FALSE], sep = ".")) + + rows <- rep(list(NA), n_comb) for (i in seq_len(n_comb)) { - rows[[i]] <- which(data_groups %in% pasted_groups[i]) + rows[[i]] <- which(data[[temp_id]] %in% pasted_groups[i]) } + data[[temp_id]] <- NULL if (!isTRUE(drop) && any(is_factor)) { na_lvls <- do.call( diff --git a/inst/tinytest/test_group_by.R b/inst/tinytest/test_group_by.R index 8d1aeb1..888ac18 100644 --- a/inst/tinytest/test_group_by.R +++ b/inst/tinytest/test_group_by.R @@ -189,13 +189,19 @@ res <- d %>% expect_identical(nrow(res), 6L) expect_identical( - res[5:6, 1:2], + res[5:6, 1:3], structure( - data.frame(orig = c("UK", NA), dest = c(NA, "Germany")), + list2DF( + list( + orig = c("UK", NA), + dest = c(NA, "Germany"), + .rows = list(7L, 2L) + ) + ), row.names = 5:6 ) ) expect_identical( vapply(res$.rows, length, FUN.VALUE = numeric(1L)), - c(1, 2, 2, 1, 0, 2) + c(1, 2, 2, 1, 1, 1) )