-
-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement data_separate()
#431
Conversation
This comment was marked as outdated.
This comment was marked as outdated.
This is really a long list of examples... library(datawizard)
d <- data.frame(x = c(NA, "x.y", "x.z.y", "y.z", "1.2.3"))
d
#> x
#> 1 <NA>
#> 2 x.y
#> 3 x.z.y
#> 4 y.z
#> 5 1.2.3
data_separate(d)
#> split_1 split_2
#> 1 <NA> <NA>
#> 2 x y
#> 3 x z
#> 4 y z
#> 5 1 2
data_separate(d, guess_columns = "max")
#> split_1 split_2 split_3
#> 1 <NA> <NA> <NA>
#> 2 x y <NA>
#> 3 x z y
#> 4 y z <NA>
#> 5 1 2 3
data_separate(d, guess_columns = "min")
#> split_1
#> 1 <NA>
#> 2 x
#> 3 x
#> 4 y
#> 5 1
# new_columns overwrites guess_columns
data_separate(d, new_columns = c("A", "B"), guess_columns = "min")
#> A B
#> 1 <NA> <NA>
#> 2 x y
#> 3 x z
#> 4 y z
#> 5 1 2
data_separate(d, new_columns = c("A", "B"))
#> A B
#> 1 <NA> <NA>
#> 2 x y
#> 3 x z
#> 4 y z
#> 5 1 2
data_separate(d, new_columns = c("A", "B", "C"))
#> A B C
#> 1 <NA> <NA> <NA>
#> 2 x y <NA>
#> 3 x z y
#> 4 y z <NA>
#> 5 1 2 3
data_separate(d, new_columns = c("A", "B", "C"), fill = "left")
#> A B C
#> 1 <NA> <NA> <NA>
#> 2 <NA> x y
#> 3 x z y
#> 4 <NA> y z
#> 5 1 2 3
data_separate(d, new_columns = c("A", "B", "C"), fill = "right")
#> A B C
#> 1 <NA> <NA> <NA>
#> 2 x y <NA>
#> 3 x z y
#> 4 y z <NA>
#> 5 1 2 3
data_separate(d, new_columns = c("A", "B", "C"), fill = "value_left")
#> A B C
#> 1 <NA> <NA> <NA>
#> 2 x x y
#> 3 x z y
#> 4 y y z
#> 5 1 2 3
data_separate(d, new_columns = c("A", "B", "C"), fill = "value_right")
#> A B C
#> 1 <NA> <NA> <NA>
#> 2 x y y
#> 3 x z y
#> 4 y z z
#> 5 1 2 3
data_separate(d, new_columns = c("A", "B"), extra = "merge_right")
#> A B
#> 1 <NA> <NA>
#> 2 x y
#> 3 x z y
#> 4 y z
#> 5 1 2 3
data_separate(d, new_columns = c("A", "B"), extra = "merge_left")
#> A B
#> 1 <NA> <NA>
#> 2 x y
#> 3 x z y
#> 4 y z
#> 5 1 2 3
data_separate(d, new_columns = c("A", "B"), extra = "drop_right")
#> A B
#> 1 <NA> <NA>
#> 2 x y
#> 3 x z
#> 4 y z
#> 5 1 2
data_separate(d, new_columns = c("A", "B"), extra = "drop_left")
#> A B
#> 1 <NA> <NA>
#> 2 x y
#> 3 z y
#> 4 y z
#> 5 2 3
d <- data.frame(
x = c(NA, "x.y", "x.z.y", "y.z", "1.2.3"),
y = c(NA, "a.b", "a.b.c", "a.c", "5.6.7")
)
d
#> x y
#> 1 <NA> <NA>
#> 2 x.y a.b
#> 3 x.z.y a.b.c
#> 4 y.z a.c
#> 5 1.2.3 5.6.7
data_separate(d)
#> split_1 split_2 split_1.1 split_2.1
#> 1 <NA> <NA> <NA> <NA>
#> 2 x y a b
#> 3 x z a b
#> 4 y z a c
#> 5 1 2 5 6
data_separate(d, new_columns = c("A", "B"))
#> A B A.1 B.1
#> 1 <NA> <NA> <NA> <NA>
#> 2 x y a b
#> 3 x z a b
#> 4 y z a c
#> 5 1 2 5 6
data_separate(d, new_columns = c("A", "B", "C"))
#> A B C A.1 B.1 C.1
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 x y <NA> a b <NA>
#> 3 x z y a b c
#> 4 y z <NA> a c <NA>
#> 5 1 2 3 5 6 7
data_separate(d, select = "x")
#> split_1 split_2
#> 1 <NA> <NA>
#> 2 x y
#> 3 x z
#> 4 y z
#> 5 1 2
data_separate(d, select = "x", new_columns = c("A", "B"))
#> A B
#> 1 <NA> <NA>
#> 2 x y
#> 3 x z
#> 4 y z
#> 5 1 2
data_separate(d, select = "x", new_columns = c("A", "B"), append = TRUE)
#> x y A B
#> 1 <NA> <NA> <NA> <NA>
#> 2 x.y a.b x y
#> 3 x.z.y a.b.c x z
#> 4 y.z a.c y z
#> 5 1.2.3 5.6.7 1 2
data_separate(d, append = TRUE)
#> x y split_1 split_2 split_1.1 split_2.1
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 x.y a.b x y a b
#> 3 x.z.y a.b.c x z a b
#> 4 y.z a.c y z a c
#> 5 1.2.3 5.6.7 1 2 5 6
data_separate(d, new_columns = c("A", "B"), append = TRUE)
#> x y A B A.1 B.1
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 x.y a.b x y a b
#> 3 x.z.y a.b.c x z a b
#> 4 y.z a.c y z a c
#> 5 1.2.3 5.6.7 1 2 5 6
data_separate(d, extra = "drop_left")
#> split_1 split_2 split_1.1 split_2.1
#> 1 <NA> <NA> <NA> <NA>
#> 2 x y a b
#> 3 z y b c
#> 4 y z a c
#> 5 2 3 6 7
data_separate(d, extra = "drop_right")
#> split_1 split_2 split_1.1 split_2.1
#> 1 <NA> <NA> <NA> <NA>
#> 2 x y a b
#> 3 x z a b
#> 4 y z a c
#> 5 1 2 5 6
data_separate(d, extra = "merge_left")
#> split_1 split_2 split_1.1 split_2.1
#> 1 <NA> <NA> <NA> <NA>
#> 2 x y a b
#> 3 x z y a b c
#> 4 y z a c
#> 5 1 2 3 5 6 7
data_separate(d, extra = "merge_right")
#> split_1 split_2 split_1.1 split_2.1
#> 1 <NA> <NA> <NA> <NA>
#> 2 x y a b
#> 3 x z y a b c
#> 4 y z a c
#> 5 1 2 3 5 6 7
data_separate(d, new_columns = c("A", "B", "C"))
#> A B C A.1 B.1 C.1
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 x y <NA> a b <NA>
#> 3 x z y a b c
#> 4 y z <NA> a c <NA>
#> 5 1 2 3 5 6 7
data_separate(d, new_columns = c("A", "B", "C"), fill = "left")
#> A B C A.1 B.1 C.1
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 <NA> x y <NA> a b
#> 3 x z y a b c
#> 4 <NA> y z <NA> a c
#> 5 1 2 3 5 6 7
data_separate(d, new_columns = c("A", "B", "C"), fill = "right")
#> A B C A.1 B.1 C.1
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 x y <NA> a b <NA>
#> 3 x z y a b c
#> 4 y z <NA> a c <NA>
#> 5 1 2 3 5 6 7
data_separate(d, new_columns = c("A", "B", "C"), fill = "value_left")
#> A B C A.1 B.1 C.1
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 x x y a a b
#> 3 x z y a b c
#> 4 y y z a a c
#> 5 1 2 3 5 6 7
data_separate(d, new_columns = c("A", "B", "C"), fill = "value_right")
#> A B C A.1 B.1 C.1
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 x y y a b b
#> 3 x z y a b c
#> 4 y z z a c c
#> 5 1 2 3 5 6 7
data_separate(d, select = "x")
#> split_1 split_2
#> 1 <NA> <NA>
#> 2 x y
#> 3 x z
#> 4 y z
#> 5 1 2
data_separate(d, select = "x", new_columns = c("A", "B"))
#> A B
#> 1 <NA> <NA>
#> 2 x y
#> 3 x z
#> 4 y z
#> 5 1 2
data_separate(d, select = "x", new_columns = c("A", "B"), append = TRUE)
#> x y A B
#> 1 <NA> <NA> <NA> <NA>
#> 2 x.y a.b x y
#> 3 x.z.y a.b.c x z
#> 4 y.z a.c y z
#> 5 1.2.3 5.6.7 1 2
data_separate(d, append = TRUE)
#> x y split_1 split_2 split_1.1 split_2.1
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 x.y a.b x y a b
#> 3 x.z.y a.b.c x z a b
#> 4 y.z a.c y z a c
#> 5 1.2.3 5.6.7 1 2 5 6
data_separate(d, new_columns = c("A", "B"), append = TRUE)
#> x y A B A.1 B.1
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 x.y a.b x y a b
#> 3 x.z.y a.b.c x z a b
#> 4 y.z a.c y z a c
#> 5 1.2.3 5.6.7 1 2 5 6
d <- data.frame(
x = c(NA, "abcdefghijk", "hijklmnopqr", "lmnopqrstuvw", "pqrstuvwxyz"),
y = c(NA, "12234567i89", "545643543j5jkjkl", "434234234jlk432423", "45543kljkjk45435345345")
)
d
#> x y
#> 1 <NA> <NA>
#> 2 abcdefghijk 12234567i89
#> 3 hijklmnopqr 545643543j5jkjkl
#> 4 lmnopqrstuvw 434234234jlk432423
#> 5 pqrstuvwxyz 45543kljkjk45435345345
data_separate(d, separator = c(2, 5, 7))
#> split_1 split_2 split_3 split_4 split_5 split_1.1 split_2.1 split_3.1
#> 1 <NA> ab hi lm pq <NA> 12 54
#> 2 <NA> cdefg jklmn nopqr rstuv <NA> 23456 56435
#> 3 <NA> hijk opqr stuvw wxyz <NA> 7i89 43j5jkj
#> 4 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> split_4.1 split_5.1
#> 1 43 45
#> 2 42342 543kl
#> 3 34jlk43 jkjk454
#> 4 <NA> <NA> Created on 2023-06-09 with reprex v2.0.2 |
This comment was marked as outdated.
This comment was marked as outdated.
ok, merge multiple split columns and numeric separator works now: library(datawizard)
d <- data.frame(
x = c(NA, "x.y", "x.z.y", "y.z", "1.2.3"),
y = c(NA, "a.b", "a.b.c", "a.c", "5.6.7")
)
d
#> x y
#> 1 <NA> <NA>
#> 2 x.y a.b
#> 3 x.z.y a.b.c
#> 4 y.z a.c
#> 5 1.2.3 5.6.7
data_separate(d, new_columns = c("A", "B"))
#> A B A.1 B.1
#> 1 <NA> <NA> <NA> <NA>
#> 2 x y a b
#> 3 x z a b
#> 4 y z a c
#> 5 1 2 5 6
data_separate(d, new_columns = c("A", "B"), merge_multiple = TRUE)
#> A B
#> 1 NA NA NA NA
#> 2 x a y b
#> 3 x a z b
#> 4 y a z c
#> 5 1 5 2 6
data_separate(d, new_columns = c("A", "B", "C"))
#> A B C A.1 B.1 C.1
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 x y <NA> a b <NA>
#> 3 x z y a b c
#> 4 y z <NA> a c <NA>
#> 5 1 2 3 5 6 7
data_separate(d, new_columns = c("A", "B", "C"), merge_multiple = TRUE)
#> A B C
#> 1 NA NA NA NA NA NA
#> 2 x a y b NA NA
#> 3 x a z b y c
#> 4 y a z c NA NA
#> 5 1 5 2 6 3 7
data_separate(d, extra = "drop_left")
#> split_1 split_2 split_1.1 split_2.1
#> 1 <NA> <NA> <NA> <NA>
#> 2 x y a b
#> 3 z y b c
#> 4 y z a c
#> 5 2 3 6 7
data_separate(d, extra = "drop_left", merge_multiple = TRUE)
#> split_1 split_2
#> 1 NA NA NA NA
#> 2 x a y b
#> 3 z b y c
#> 4 y a z c
#> 5 2 6 3 7
d <- data.frame(
x = c(NA, "abcdefghijk", "hijklmnopqr", "lmnopqrstuvw", "pqrstuvwxyz"),
y = c(NA, "12234567i89", "545643543j5jkjkl", "434234234jlk432423", "45543kljkjk45435345345")
)
d
#> x y
#> 1 <NA> <NA>
#> 2 abcdefghijk 12234567i89
#> 3 hijklmnopqr 545643543j5jkjkl
#> 4 lmnopqrstuvw 434234234jlk432423
#> 5 pqrstuvwxyz 45543kljkjk45435345345
data_separate(d, separator = c(2, 5, 7))
#> split_1 split_2 split_3 split_4 split_1.1 split_2.1 split_3.1
#> V1 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> V2 a bcd ef ghijk 1 223 45
#> V3 h ijk lm nopqr 5 456 43
#> V4 l mno pq rstuvw 4 342 34
#> V5 p qrs tu vwxyz 4 554 3k
#> split_4.1
#> V1 <NA>
#> V2 67i89
#> V3 543j5jkjkl
#> V4 234jlk432423
#> V5 ljkjk45435345345
data_separate(d, separator = c(2, 5, 7), merge_multiple = TRUE)
#> split_1 split_2 split_3 split_4
#> V1 NA NA NA NA NA NA NA NA
#> V2 a 1 bcd 223 ef 45 ghijk 67i89
#> V3 h 5 ijk 456 lm 43 nopqr 543j5jkjkl
#> V4 l 4 mno 342 pq 34 rstuvw 234jlk432423
#> V5 p 4 qrs 554 tu 3k vwxyz ljkjk45435345345
data_separate(d, separator = c(3, 7, 11))
#> split_1 split_2 split_3 split_4 split_1.1 split_2.1 split_3.1 split_4.1
#> V1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> V2 ab cdef ghij k 12 2345 67i8 9
#> V3 hi jklm nopq r 54 5643 543j 5jkjkl
#> V4 lm nopq rstu vw 43 4234 234j lk432423
#> V5 pq rstu vwxy z 45 543k ljkj k45435345345
data_separate(d, separator = c(3, 7, 11), merge_multiple = TRUE)
#> split_1 split_2 split_3 split_4
#> V1 NA NA NA NA NA NA NA NA
#> V2 ab 12 cdef 2345 ghij 67i8 k 9
#> V3 hi 54 jklm 5643 nopq 543j r 5jkjkl
#> V4 lm 43 nopq 4234 rstu 234j vw lk432423
#> V5 pq 45 rstu 543k vwxy ljkj z k45435345345 Created on 2023-06-09 with reprex v2.0.2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Before reviewing the code, I'd like to discuss a bit about the behavior of data_separate()
because there are things that are weird to me (sorry I feel like I'm the guy always criticizing your implementation of functions ^^).
Basically there are two things that I don't like in the current behavior:
- the fact that it can be applied on several cols
- automatic naming of new cols (which is a consequence of the first point)
I feel that separating several columns at once is not desirable because it's harder to control the output. Maybe you have use cases for this, but personally I've always needed separate()
on one column at a time because I want to control exactly the new columns that will be generated.
Also, I think being able to split only one col at a time leads to more readable code. For instance, some code with the current implementation would be:
library(datawizard)
# multiple columns to split
d <- data.frame(
dep_date = c("2022-07-02", "2001-09-11", "2010-12-24"),
arr_date = c("2023-01-02", "2011-12-10", "2011-01-24"),
stringsAsFactors = FALSE
)
d
#> dep_date arr_date
#> 1 2022-07-02 2023-01-02
#> 2 2001-09-11 2011-12-10
#> 3 2010-12-24 2011-01-24
# split two columns, default column names
d |>
data_separate(select = contains("date")) |>
data_rename(
pattern = c("split_1", "split_2", "split_3",
"split_1.1", "split_2.1", "split_3.1"),
replacement = c("year_dep", "month_dep", "day_dep",
"year_arr", "month_arr", "day_arr")
)
#> Column `dep_date` had different number of values after splitting.
#> Variable was split into 3 columns.
#> Column `arr_date` had different number of values after splitting.
#> Variable was split into 3 columns.
#> year_dep month_dep day_dep year_arr month_arr day_arr
#> 1 2022 07 02 2023 01 02
#> 2 2001 09 11 2011 12 10
#> 3 2010 12 24 2011 01 24
We are forced to rename the new cols because it's very unlikely that the generated names are satisfying. The problem is that if the order of the two columns changes in the original data, the renaming will silently become wrong.
I think it would be more readable and safe to have something like this:
d |>
data_separate(dep_date, new_columns = c("year_dep", "month_dep", "day_dep")) |>
data_separate(arr_date, new_columns = c("year_arr", "month_arr", "day_arr"))
In summary, I think we should remove:
- the automatic column renaming and therefore the arg
guess_columns
- separating several columns at once and therefore the args
merge_multiple
and merge_separator
(which I find quite confusing and I really don't see a use case for them).
What do you think?
I'm not sure if this code can be reached: # check if column names should be recycled
if (ncol(out) != length(new_column_names)) {
# recycle names, avoid duplicates
new_column_names <- make.unique(rep(new_column_names, times = ncol(out) / new_column_names))
} else, we have 100% code coverage in tests. Snapshot outputs are validated. |
What about this: library(datawizard)
# separate multiple columns, give proper column names
d_sep <- data.frame(
x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"),
y = c("m.n.99.22", "77.f.g.34", "44.9", NA),
stringsAsFactors = FALSE
)
data_separate(
d_sep,
select = c("x", "y"),
new_columns = list(
x = c("A", "B", "C"), # separate "x" into three columns
y = c("EE", "FF", "GG", "HH") # separate "y" into four columns
),
verbose = FALSE
)
#> A B C EE FF GG HH
#> 1 1 a 6 m n 99 22
#> 2 2 b 7 77 f g 34
#> 3 3 c 8 44 9 <NA> <NA>
#> 4 5 j <NA> <NA> <NA> <NA> <NA> Created on 2023-06-11 with reprex v2.0.2 |
Separating multiple cols
Actually, I'm fine with separating several columns at the same time. I can see this happening if you have multiple columns with dates for example. I like specifying data_separate(data, c("date1", "date2", "date3", etc.), new_columns = c("year", "month", "day")) would create "date1_year", "date1_month", "date1_day", "date2_year", etc. instead of "year", "month", "day", "year.1", "month.1"...
Guessing columns
I still don't see the point of
d <- data.frame(
x = c("1.a.6", "2.b.7", "3.c.8"),
stringsAsFactors = FALSE
)
d
#> x
#> 1 1.a.6
#> 2 2.b.7
#> 3 3.c.8
datawizard::data_separate(d)
#> Column `x` had different number of values after splitting. Variable was
#> split into 3 columns.
#> split_1 split_2 split_3
#> 1 1 a 6
#> 2 2 b 7
#> 3 3 c 8 Same for the |
This comment was marked as outdated.
This comment was marked as outdated.
library(datawizard)
d_sep <- data.frame(
x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"),
y = c("m.n.99.22", "77.f.g.34", "44.9", NA),
stringsAsFactors = FALSE
)
data_separate(d_sep)
#> Error: Cannot separate values. Either `new_columns` or `guess_columns` must be
#> provided.
data_separate(d_sep, guess_columns = "mode")
#> Column `x` had different number of values after splitting. Variable was
#> split into 3 columns.
#> `x` returned more columns than expected after splitting. Right-most
#> columns have been dropped.
#> `x`returned fewer columns than expected after splitting. Right-most
#> columns were filled with `NA`.
#> Column `y` had different number of values after splitting. Variable was
#> split into 4 columns.
#> `y`returned fewer columns than expected after splitting. Right-most
#> columns were filled with `NA`.
#> x_1 x_2 x_3 y_1 y_2 y_3 y_4
#> 1 1 a 6 m n 99 22
#> 2 2 b 7 77 f g 34
#> 3 3 c 8 44 9 <NA> <NA>
#> 4 5 j <NA> <NA> <NA> <NA> <NA>
data_separate(d_sep, new_columns = c("AA", "BB"))
#> `x` returned more columns than expected after splitting. Right-most
#> columns have been dropped.
#> `y` returned more columns than expected after splitting. Right-most
#> columns have been dropped.
#> x_AA x_BB y_AA y_BB
#> 1 1 a m n
#> 2 2 b 77 f
#> 3 3 c 44 9
#> 4 5 j <NA> <NA>
data_separate(d_sep, new_columns = c("AA", "BB"), merge_multiple = TRUE)
#> `x` returned more columns than expected after splitting. Right-most
#> columns have been dropped.
#> `y` returned more columns than expected after splitting. Right-most
#> columns have been dropped.
#> AA BB
#> 1 1m an
#> 2 277 bf
#> 3 344 c9
#> 4 5NA jNA
data_separate(d_sep, new_columns = list(x = c("AA", "BB"), y = c("KK", "LL")))
#> `x` returned more columns than expected after splitting. Right-most
#> columns have been dropped.
#> `y` returned more columns than expected after splitting. Right-most
#> columns have been dropped.
#> AA BB KK LL
#> 1 1 a m n
#> 2 2 b 77 f
#> 3 3 c 44 9
#> 4 5 j <NA> <NA> Created on 2023-06-12 with reprex v2.0.2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, tests are very complete, just a few points (mostly about an unwanted message and the append
arg). Thank you @strengejacke
Also, can you update the "coming from tidyverse" vignette with this function? |
I'm not sure about all the |
I'll give it a try |
Can you look at the changes I made to the vignette? |
Yes it looks fine |
Fixes #423