Skip to content

Commit

Permalink
Created first working version of function variant of data key harmoni…
Browse files Browse the repository at this point in the history
…zation workflow
  • Loading branch information
njlyon0 committed Jan 12, 2024
1 parent bc33e12 commit 12273ca
Showing 1 changed file with 113 additions and 13 deletions.
126 changes: 113 additions & 13 deletions dev/harmonize.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,15 @@ librarian::shelf(devtools, tidyverse)
write.csv(df1, file = file.path("dev", "test_df1.csv"), na = "", row.names = F)
write.csv(df2, file = file.path("dev", "test_df2.csv"), na = "", row.names = F)

# Export one with a different filename that we won't include in the data key
write.csv(df2, file = file.path("dev", "test_df3.csv"), na = "", row.names = F)


# Create data key
(key <- data.frame("source" = c("test_df1.csv", "test_df1.csv", "test_df1.csv",
(data_key <- data.frame("source" = c("test_df1.csv", "test_df1.csv", "test_df1.csv",
"test_df2.csv", "test_df2.csv", "test_df2.csv"),
"raw_name" = c("x", "gar_typo", "y", "NUMBERS", "LETTERS", "BONUS"),
"harmony_name" = c("numbers", "garbage", "letters", "numbers", "letters", NA)) )
"tidy_name" = c("numbers", "garbage", "letters", "numbers", "letters", NA)) )

## ----------------------------------- ##
# Script Variant ----
Expand All @@ -41,15 +45,15 @@ write.csv(df2, file = file.path("dev", "test_df2.csv"), na = "", row.names = F)
file_list <- list()

# Loop across files named in the data key
for(file in unique(key$source)){
for(file in unique(data_key$source)){
## file <- "test_df1.csv"

# Prepare the data key
key_sub <- key %>%
data_key_sub <- data_key %>%
# Subset to just this file
dplyr::filter(source == file) %>%
# Drop any instances where the harmonized name is absent (i.e., the raw column is unwanted)
dplyr::filter(is.na(harmony_name) != TRUE & nchar(harmony_name) != 0)
dplyr::filter(is.na(tidy_name) != TRUE & nchar(tidy_name) != 0)

# Read in file
df_v1 <- read.csv(file = file.path("dev", file))
Expand All @@ -69,7 +73,7 @@ for(file in unique(key$source)){
values_to = "values")

# Identify any columns in the data key but apparently not in the data key
missing_cols <- setdiff(x = unique(key_sub$raw_name),
missing_cols <- setdiff(x = unique(data_key_sub$raw_name),
y = unique(df_v2$raw_name))

# Warn the user if any are found
Expand All @@ -83,13 +87,13 @@ for(file in unique(key$source)){
# Perform actual harmonization
df_v3 <- df_v2 %>%
# Attach the data key to the data
dplyr::left_join(y = key_sub, by = c("source", "raw_name")) %>%
dplyr::left_join(y = data_key_sub, by = c("source", "raw_name")) %>%
# Drop any columns without a standardized name
dplyr::filter(is.na(harmony_name) != TRUE & nchar(harmony_name) != 0) %>%
dplyr::filter(is.na(tidy_name) != TRUE & nchar(tidy_name) != 0) %>%
# Trash the old column names
dplyr::select(-raw_name) %>%
# Pivot back to original format
tidyr::pivot_wider(names_from = harmony_name, values_from = values) %>%
tidyr::pivot_wider(names_from = tidy_name, values_from = values) %>%
# Trash row number column created in loop
dplyr::select(-xxxx_row_num)

Expand All @@ -102,18 +106,114 @@ harmonized_df <- purrr::list_rbind(x = file_list)
# Check that out
harmonized_df

# Clear environment of everything except data key
rm(list = setdiff(x = ls(), y = "data_key"))

## ----------------------------------- ##
# Function Variant ----
# Function Variant ----
## ----------------------------------- ##
# Data key harmonization _in function format_

# Define function
harmonize <- function(key = NULL, raw_folder = NULL, quiet = TRUE){
# key <- data_key
# raw_folder <- "dev"
# quiet <- F

# Error out if data key does not contain all needed information
if(all(c("source", "raw_name", "tidy_name") %in% names(key)) != TRUE)
stop("Data key must include 'source', 'raw_name' and 'tidy_name' columns")

# Drop any unnecessary data key columns
key_actual <- key %>%
dplyr::select(source, raw_name, tidy_name) %>%
dplyr::distinct()

# Identify available raw files *that are also present in the data key*
raw_files <- generics::intersect(x = dir(path = raw_folder, pattern = ".csv"),
y = unique(key_actual$source))

# Identify any files present but not in data key
unk_files <- generics::setdiff(x = dir(path = raw_folder, pattern = ".csv"),
y = unique(key_actual$source))

# If any are found and `quiet` isn't `TRUE`, report on these files
if(length(unk_files) != 0 & quiet != TRUE){
message("Following files found in raw path but not in data key:")
print(paste0("'", unk_files, "'", collapse = " & ")) }

# Create an empty list for storing each harmonized raw file (pre-combination)
file_list <- list()

# Loop across files that *are* in the data key
for(focal_file in raw_files){
# focal_file <- "test_df1.csv"

# Prepare the data key
key_sub <- key_actual %>%
# Subset to just this file
dplyr::filter(source == focal_file) %>%
# Drop any instances where the harmonized name is absent
## (assuming that raw column is unwanted in harmonized data object)
dplyr::filter(is.na(tidy_name) != TRUE & nchar(tidy_name) != 0)

# Read in the first file
dat_v1 <- read.csv(file = file.path(raw_folder, focal_file))

# Prepare the data for harmonization via data key
dat_v2 <- dat_v1 %>%
# Add a source folumn and row number column to preserve original rows
## (name is bizarre to avoid overwriting extant column name)
dplyr::mutate(xxxx_row_num = 1:nrow(.),
source = focal_file) %>%
# Make all columns characters
dplyr::mutate(dplyr::across(.cols = dplyr::everything(),
.fns = as.character)) %>%
# Pivot the data into 'ultimate' long format
tidyr::pivot_longer(cols = -xxxx_row_num:-source,
names_to = "raw_name",
values_to = "values")

# Identify any columns in the data key but apparently not in the data key
missing_cols <- generics::setdiff(x = unique(key_sub$raw_name),
y = unique(dat_v2$raw_name))

# Warn the user if any are found (this is a warning so no `quiet` argument used)
if(length(missing_cols) != 0){
rlang::warn(message = paste0("Removing the following columns in data key NOT found in '", focal_file, "': '", missing_cols, "'", collapse = " & ")) }

# Perform actual harmonization
dat_v3 <- dat_v2 %>%
# Attach the data key to the data
dplyr::left_join(y = key_sub, by = c("source", "raw_name")) %>%
# Drop any columns without a standardized name
dplyr::filter(is.na(tidy_name) != TRUE & nchar(tidy_name) != 0) %>%
# Trash the old column names
dplyr::select(-raw_name) %>%
# Pivot back to original format
tidyr::pivot_wider(names_from = tidy_name, values_from = values) %>%
# Trash row number column created in loop
dplyr::select(-xxxx_row_num)

# Add to list
file_list[[focal_file]] <- dat_v3

} # Close harmonization loop

# Unlist the list
files_df <- purrr::list_rbind(x = file_list)

# Return that
return(files_df) }

# Invoke function
fxn_out <- harmonize(key = data_key, raw_folder = "dev", quiet = F)

# Invoke function where `quiet` is allowed to default to `TRUE`
fxn_out <- harmonize(key = data_key, raw_folder = "dev")




# Look at output
fxn_out

# Clear environment / collect garbage
rm(list = ls()); gc()
Expand Down

0 comments on commit 12273ca

Please sign in to comment.