Skip to content

Commit

Permalink
Fix invalid multibyte string (#452)
Browse files Browse the repository at this point in the history
* Fix invalid multibyte string

* desc, news

* fix

* fix

* add test

* styler
  • Loading branch information
strengejacke authored Aug 29, 2023
1 parent 7c86d71 commit dad5f20
Show file tree
Hide file tree
Showing 8 changed files with 107 additions and 81 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.8.0.7
Version: 0.8.0.8
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ CHANGES
* `recode_into()` gains an `overwrite` argument to skip overwriting already
recoded cases when multiple recode patterns apply to the same case.

* `data_read()` now passes the `encoding` argument to `data.table::fread()`.
This allows to read files with non-ASCII characters.

* `datawizard` moves from the GPL-3 license to the MIT license.

BUG FIXES
Expand All @@ -29,6 +32,10 @@ BUG FIXES
naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as
faulty syntax.

* Fixed issue in `empty_column()` for strings with invalid multibyte strings.
For such data frames or files, `empty_column()` or `data_read()` no longer
fails.

# datawizard 0.8.0

BREAKING CHANGES
Expand Down
76 changes: 38 additions & 38 deletions R/contrs.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,52 +34,52 @@
#'
#' @examples
#' if (FALSE) {
#' data("mtcars")
#' data("mtcars")
#'
#' mtcars <- data_modify(mtcars, cyl = factor(cyl))
#' mtcars <- data_modify(mtcars, cyl = factor(cyl))
#'
#' c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.treatment)
#' #> 4 6 8
#' #> Intercept 1 0 0 # mean of the 1st level
#' #> 6 -1 1 0 # 2nd level - 1st level
#' #> 8 -1 0 1 # 3rd level - 1st level
#' c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.treatment)
#' #> 4 6 8
#' #> Intercept 1 0 0 # mean of the 1st level
#' #> 6 -1 1 0 # 2nd level - 1st level
#' #> 8 -1 0 1 # 3rd level - 1st level
#'
#' contrasts(mtcars$cyl) <- contr.sum
#' c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.sum)
#' #> 4 6 8
#' #> Intercept 0.333 0.333 0.333 # overall mean
#' #> 0.667 -0.333 -0.333 # deviation of 1st from overall mean
#' #> -0.333 0.667 -0.333 # deviation of 2nd from overall mean
#' contrasts(mtcars$cyl) <- contr.sum
#' c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.sum)
#' #> 4 6 8
#' #> Intercept 0.333 0.333 0.333 # overall mean
#' #> 0.667 -0.333 -0.333 # deviation of 1st from overall mean
#' #> -0.333 0.667 -0.333 # deviation of 2nd from overall mean
#'
#'
#' contrasts(mtcars$cyl) <- contr.deviation
#' c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.deviation)
#' #> 4 6 8
#' #> Intercept 0.333 0.333 0.333 # overall mean
#' #> 6 -1.000 1.000 0.000 # 2nd level - 1st level
#' #> 8 -1.000 0.000 1.000 # 3rd level - 1st level
#' contrasts(mtcars$cyl) <- contr.deviation
#' c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl))
#' solve(c.deviation)
#' #> 4 6 8
#' #> Intercept 0.333 0.333 0.333 # overall mean
#' #> 6 -1.000 1.000 0.000 # 2nd level - 1st level
#' #> 8 -1.000 0.000 1.000 # 3rd level - 1st level
#'
#' ## With Interactions -----------------------------------------
#' mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation))
#' mtcars <- data_arrange(mtcars, select = c("cyl", "am"))
#' ## With Interactions -----------------------------------------
#' mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation))
#' mtcars <- data_arrange(mtcars, select = c("cyl", "am"))
#'
#' mm <- unique(model.matrix(~ cyl * am, data = mtcars))
#' rownames(mm) <- c(
#' "cyl4.am0", "cyl4.am1", "cyl6.am0",
#' "cyl6.am1", "cyl8.am0", "cyl8.am1"
#' )
#' mm <- unique(model.matrix(~ cyl * am, data = mtcars))
#' rownames(mm) <- c(
#' "cyl4.am0", "cyl4.am1", "cyl6.am0",
#' "cyl6.am1", "cyl8.am0", "cyl8.am1"
#' )
#'
#' solve(mm)
#' #> cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1
#' #> (Intercept) 0.167 0.167 0.167 0.167 0.167 0.167 # overall mean
#' #> cyl6 -0.500 -0.500 0.500 0.500 0.000 0.000 # cyl MAIN eff: 2nd - 1st
#' #> cyl8 -0.500 -0.500 0.000 0.000 0.500 0.500 # cyl MAIN eff: 2nd - 1st
#' #> am1 -0.333 0.333 -0.333 0.333 -0.333 0.333 # am MAIN eff
#' #> cyl6:am1 1.000 -1.000 -1.000 1.000 0.000 0.000
#' #> cyl8:am1 1.000 -1.000 0.000 0.000 -1.000 1.000
#' solve(mm)
#' #> cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1
#' #> (Intercept) 0.167 0.167 0.167 0.167 0.167 0.167 # overall mean
#' #> cyl6 -0.500 -0.500 0.500 0.500 0.000 0.000 # cyl MAIN eff: 2nd - 1st
#' #> cyl8 -0.500 -0.500 0.000 0.000 0.500 0.500 # cyl MAIN eff: 2nd - 1st
#' #> am1 -0.333 0.333 -0.333 0.333 -0.333 0.333 # am MAIN eff
#' #> cyl6:am1 1.000 -1.000 -1.000 1.000 0.000 0.000
#' #> cyl8:am1 1.000 -1.000 0.000 0.000 -1.000 1.000
#' }
#'
#' @export
Expand Down
6 changes: 5 additions & 1 deletion R/data_read.R
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,11 @@ data_read <- function(path,

.read_text <- function(path, encoding, verbose, ...) {
if (insight::check_if_installed("data.table", quietly = TRUE)) {
out <- data.table::fread(input = path, ...)
# set proper default encoding-value for fread
if (is.null(encoding)) {
encoding <- "unknown"
}
out <- data.table::fread(input = path, encoding = encoding, ...)
class(out) <- "data.frame"
return(out)
}
Expand Down
2 changes: 1 addition & 1 deletion R/remove_empty.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ empty_columns <- function(x) {
} else {
all_na <- colSums(is.na(x)) == nrow(x)
all_empty <- vapply(x, function(i) {
(is.character(i) || is.factor(i)) && max(c(0, nchar(as.character(i))), na.rm = TRUE) == 0
(is.character(i) || is.factor(i)) && !any(nzchar(as.character(i[!is.na(i)])))
}, FUN.VALUE = logical(1L))

which(all_na | all_empty)
Expand Down
76 changes: 38 additions & 38 deletions man/contr.deviation.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions tests/testthat/test-contr.deviation.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ test_that("contr.deviation | snapshot", {
expect_snapshot(solve(c.deviation))

mm <- unique(model.matrix(~ cyl * am, data = mtcars))
rownames(mm) <- c("cyl4.am0", "cyl4.am1", "cyl6.am0",
"cyl6.am1", "cyl8.am0", "cyl8.am1")
rownames(mm) <- c(
"cyl4.am0", "cyl4.am1", "cyl6.am0",
"cyl6.am1", "cyl8.am0", "cyl8.am1"
)

expect_snapshot(solve(mm))
})
13 changes: 13 additions & 0 deletions tests/testthat/test-empty-dataframe.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,16 @@ test_that("empty_columns with only NA characters", {
)
expect_identical(empty_columns(tmp), c(var2 = 2L))
})


test_that("works with non-ascii chars", {
tmp <- data.frame(
a = c(1, 2, 3, NA, 5),
b = c("", NA, "", NA, ""),
c = c(NA, NA, NA, NA, NA),
d = c("test", "Se\x96ora", "works fine", "this too", "yeah"),
e = c("", "", "", "", ""),
stringsAsFactors = FALSE
)
expect_identical(empty_columns(tmp), c(b = 2L, c = 3L, e = 5L))
})

0 comments on commit dad5f20

Please sign in to comment.