Fix invalid multibyte string (#452)

* Fix invalid multibyte string * desc, news * fix * fix * add test * styler
easystats · Aug 29, 2023 · dad5f20 · dad5f20
1 parent 7c86d71
commit dad5f20
Show file tree

Hide file tree

Showing 8 changed files with 107 additions and 81 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.8.0.7
+Version: 0.8.0.8
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NEWS.md b/NEWS.md
@@ -15,6 +15,9 @@ CHANGES
 * `recode_into()` gains an `overwrite` argument to skip overwriting already
   recoded cases when multiple recode patterns apply to the same case.
 
+* `data_read()` now passes the `encoding` argument to `data.table::fread()`.
+  This allows to read files with non-ASCII characters.
+
 * `datawizard` moves from the GPL-3 license to the MIT license.
 
 BUG FIXES
@@ -29,6 +32,10 @@ BUG FIXES
   naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as
   faulty syntax. 
 
+* Fixed issue in `empty_column()` for strings with invalid multibyte strings.
+  For such data frames or files, `empty_column()` or `data_read()` no longer
+  fails.
+
 # datawizard 0.8.0
 
 BREAKING CHANGES

diff --git a/R/contrs.R b/R/contrs.R
@@ -34,52 +34,52 @@
 #'
 #' @examples
 #' if (FALSE) {
-#' data("mtcars")
+#'   data("mtcars")
 #'
-#' mtcars <- data_modify(mtcars, cyl = factor(cyl))
+#'   mtcars <- data_modify(mtcars, cyl = factor(cyl))
 #'
-#' c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl))
-#' solve(c.treatment)
-#' #>            4 6 8
-#' #> Intercept  1 0 0  # mean of the 1st level
-#' #> 6         -1 1 0  # 2nd level - 1st level
-#' #> 8         -1 0 1  # 3rd level - 1st level
+#'   c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl))
+#'   solve(c.treatment)
+#'   #>            4 6 8
+#'   #> Intercept  1 0 0  # mean of the 1st level
+#'   #> 6         -1 1 0  # 2nd level - 1st level
+#'   #> 8         -1 0 1  # 3rd level - 1st level
 #'
-#' contrasts(mtcars$cyl) <- contr.sum
-#' c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl))
-#' solve(c.sum)
-#' #>                4      6      8
-#' #> Intercept  0.333  0.333  0.333   # overall mean
-#' #>            0.667 -0.333 -0.333   # deviation of 1st from overall mean
-#' #>           -0.333  0.667 -0.333   # deviation of 2nd from overall mean
+#'   contrasts(mtcars$cyl) <- contr.sum
+#'   c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl))
+#'   solve(c.sum)
+#'   #>                4      6      8
+#'   #> Intercept  0.333  0.333  0.333   # overall mean
+#'   #>            0.667 -0.333 -0.333   # deviation of 1st from overall mean
+#'   #>           -0.333  0.667 -0.333   # deviation of 2nd from overall mean
 #'
 #'
-#' contrasts(mtcars$cyl) <- contr.deviation
-#' c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl))
-#' solve(c.deviation)
-#' #>                4     6     8
-#' #> Intercept  0.333 0.333 0.333   # overall mean
-#' #> 6         -1.000 1.000 0.000   # 2nd level - 1st level
-#' #> 8         -1.000 0.000 1.000   # 3rd level - 1st level
+#'   contrasts(mtcars$cyl) <- contr.deviation
+#'   c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl))
+#'   solve(c.deviation)
+#'   #>                4     6     8
+#'   #> Intercept  0.333 0.333 0.333   # overall mean
+#'   #> 6         -1.000 1.000 0.000   # 2nd level - 1st level
+#'   #> 8         -1.000 0.000 1.000   # 3rd level - 1st level
 #'
-#' ## With Interactions -----------------------------------------
-#' mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation))
-#' mtcars <- data_arrange(mtcars, select = c("cyl", "am"))
+#'   ## With Interactions -----------------------------------------
+#'   mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation))
+#'   mtcars <- data_arrange(mtcars, select = c("cyl", "am"))
 #'
-#' mm <- unique(model.matrix(~ cyl * am, data = mtcars))
-#' rownames(mm) <- c(
-#'   "cyl4.am0", "cyl4.am1", "cyl6.am0",
-#'   "cyl6.am1", "cyl8.am0", "cyl8.am1"
-#' )
+#'   mm <- unique(model.matrix(~ cyl * am, data = mtcars))
+#'   rownames(mm) <- c(
+#'     "cyl4.am0", "cyl4.am1", "cyl6.am0",
+#'     "cyl6.am1", "cyl8.am0", "cyl8.am1"
+#'   )
 #'
-#' solve(mm)
-#' #>             cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1
-#' #> (Intercept)    0.167    0.167    0.167    0.167    0.167    0.167  # overall mean
-#' #> cyl6          -0.500   -0.500    0.500    0.500    0.000    0.000  # cyl MAIN eff: 2nd - 1st
-#' #> cyl8          -0.500   -0.500    0.000    0.000    0.500    0.500  # cyl MAIN eff: 2nd - 1st
-#' #> am1           -0.333    0.333   -0.333    0.333   -0.333    0.333  # am MAIN eff
-#' #> cyl6:am1       1.000   -1.000   -1.000    1.000    0.000    0.000
-#' #> cyl8:am1       1.000   -1.000    0.000    0.000   -1.000    1.000
+#'   solve(mm)
+#'   #>             cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1
+#'   #> (Intercept)    0.167    0.167    0.167    0.167    0.167    0.167  # overall mean
+#'   #> cyl6          -0.500   -0.500    0.500    0.500    0.000    0.000  # cyl MAIN eff: 2nd - 1st
+#'   #> cyl8          -0.500   -0.500    0.000    0.000    0.500    0.500  # cyl MAIN eff: 2nd - 1st
+#'   #> am1           -0.333    0.333   -0.333    0.333   -0.333    0.333  # am MAIN eff
+#'   #> cyl6:am1       1.000   -1.000   -1.000    1.000    0.000    0.000
+#'   #> cyl8:am1       1.000   -1.000    0.000    0.000   -1.000    1.000
 #' }
 #'
 #' @export

diff --git a/R/data_read.R b/R/data_read.R
@@ -271,7 +271,11 @@ data_read <- function(path,
 
 .read_text <- function(path, encoding, verbose, ...) {
   if (insight::check_if_installed("data.table", quietly = TRUE)) {
-    out <- data.table::fread(input = path, ...)
+    # set proper default encoding-value for fread
+    if (is.null(encoding)) {
+      encoding <- "unknown"
+    }
+    out <- data.table::fread(input = path, encoding = encoding, ...)
     class(out) <- "data.frame"
     return(out)
   }

diff --git a/R/remove_empty.R b/R/remove_empty.R
@@ -63,7 +63,7 @@ empty_columns <- function(x) {
   } else {
     all_na <- colSums(is.na(x)) == nrow(x)
     all_empty <- vapply(x, function(i) {
-      (is.character(i) || is.factor(i)) && max(c(0, nchar(as.character(i))), na.rm = TRUE) == 0
+      (is.character(i) || is.factor(i)) && !any(nzchar(as.character(i[!is.na(i)])))
     }, FUN.VALUE = logical(1L))
 
     which(all_na | all_empty)

diff --git a/man/contr.deviation.Rd b/man/contr.deviation.Rd
diff --git a/tests/testthat/test-contr.deviation.R b/tests/testthat/test-contr.deviation.R
@@ -21,8 +21,10 @@ test_that("contr.deviation | snapshot", {
   expect_snapshot(solve(c.deviation))
 
   mm <- unique(model.matrix(~ cyl * am, data = mtcars))
-  rownames(mm) <- c("cyl4.am0", "cyl4.am1", "cyl6.am0",
-                    "cyl6.am1", "cyl8.am0", "cyl8.am1")
+  rownames(mm) <- c(
+    "cyl4.am0", "cyl4.am1", "cyl6.am0",
+    "cyl6.am1", "cyl8.am0", "cyl8.am1"
+  )
 
   expect_snapshot(solve(mm))
 })
diff --git a/tests/testthat/test-empty-dataframe.R b/tests/testthat/test-empty-dataframe.R
@@ -59,3 +59,16 @@ test_that("empty_columns with only NA characters", {
   )
   expect_identical(empty_columns(tmp), c(var2 = 2L))
 })
+
+
+test_that("works with non-ascii chars", {
+  tmp <- data.frame(
+    a = c(1, 2, 3, NA, 5),
+    b = c("", NA, "", NA, ""),
+    c = c(NA, NA, NA, NA, NA),
+    d = c("test", "Se\x96ora", "works fine", "this too", "yeah"),
+    e = c("", "", "", "", ""),
+    stringsAsFactors = FALSE
+  )
+  expect_identical(empty_columns(tmp), c(b = 2L, c = 3L, e = 5L))
+})