Skip to content

Commit

Permalink
update wide_format_2_long_format for 2017-2019
Browse files Browse the repository at this point in the history
  • Loading branch information
tvolegtv committed May 1, 2024
1 parent b027383 commit 91c8429
Show file tree
Hide file tree
Showing 10 changed files with 385 additions and 2 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(calculate_most_frequent)
export(check_patterns_in_column)
export(convert_to)
export(correct_decimal_sign)
export(cut_numeric_value)
Expand Down
4 changes: 3 additions & 1 deletion R/read_product_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ reading_product_data_step1 <-
product_df <- create_new_rows(product_df)
}

if (any(grepl("2017_Mandalay|2018_Mandalay|2019_Mandalay", basename(tracker_data_file)))) {
if (any(grepl("2017_Mandalay|2018_Mandalay|2019_Mandalay", basename(tracker_data_file))) &
(check_patterns_in_column(product_df, (find_string_cols(product_df, "Released To"))))) {
logWarn(
log_to_json(
message = "Sheet {values['sheet']} Tracker data file contains Mandalay Children's Hospital data (2017-2019), may contain wide-format cells. The wide-format cells should be changed to standart long-format!",
Expand All @@ -91,6 +92,7 @@ reading_product_data_step1 <-
warningCode = "invalid_tracker"
)
)
product_df <- wide_cells_2_rows(product_df)
}

# If after extraction, dataframe is empty, this iteration is also skipped.
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
227 changes: 226 additions & 1 deletion R/wide_format_2_long_format.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,223 @@ check_wide_format_columns <- function(df) {
all(required_columns %in% names(df))
}

#' @title Find Columns Containing Specific String
#'
#' @description
#' This function returns the column names of a dataframe that contain a specific string.
#'
#' @param df A dataframe in which to search for the string.
#' @param string The string to search for in the column names.
#'
#' @return A character vector of matching column names.
#'
#' @examples
#' data(mtcars)
#' find_string_cols(mtcars, "mpg")
find_string_cols <- function(df, string) {
matching_cols <- colnames(df)[grepl(string, names(df))]
return(matching_cols)
}

#' @title Check for Specific Patterns in a Dataframe Column
#'
#' @description
#' This function checks if any cell in a specific column of a dataframe contains a pattern with a hyphen followed by a digit, or a comma (which could be leading with a space). It returns TRUE if any cell in the column contains the pattern, and FALSE otherwise.
#'
#' @param df A dataframe to check for the specific patterns.
#' @param column_name The name of the column to check for the patterns.
#'
#' @return A logical value indicating whether any cell in the specified column contains the patterns.
#'
#' @examples
#' data(mtcars)
#' check_patterns_in_column(mtcars, "mpg")
#'
#' @export
check_patterns_in_column <- function(df, column_name) {
# pattern <- "(-\\d)|(,\\s*)"
pattern <- "(-\\s*\\d)|(,\\s*)"
return(any(grepl(pattern, df[[column_name]])))
}

#' @title Remove Original Cells based on Relationship
#'
#' @description
#' This function removes rows from the column `col_units_rel` where the corresponding value in `col_rel_to` contains a comma (",").
#' It also removes those comma-separated values from the `col_rel_to` column itself.
#'
#' @param df A data frame containing the columns `col_units_rel` and `col_rel_to`.
#' @param col_units_rel The name of the column containing units (cells) potentially related to multiple other units.
#' @param col_rel_to The name of the column containing relationships between units, where comma (",") indicates a relationship with multiple units.
#'
#' @return
#' The modified data frame where rows with comma-separated relationships in `col_rel_to` have corresponding values in `col_units_rel` set to NA and the comma-separated values removed from `col_rel_to`.
#' @examples
#' # Create a sample data frame
#' df <- data.frame(col_units_rel = c(1, 2, 3, 4), col_rel_to = c("A", "B,C", "D", "A,B"))
#'
#' # Remove original cells based on relationship
#' df_modified <- remove_original_cells(df, col_units_rel = "col_units_rel", col_rel_to = "col_rel_to")
#'
#' # Print the modified data frame
#' print(df_modified)
remove_original_cells <- function(df, col_units_rel, col_rel_to) {
df[[col_units_rel]][df[[col_rel_to]] %in% grep(",", df[[col_rel_to]], value = TRUE)] <- NA
df[[col_rel_to]][df[[col_rel_to]] %in% grep(",", df[[col_rel_to]], value = TRUE)] <- NA
return(df)
}

#' @title Replace Extra "Total" Values with NA
#'
#' @description
#' This function replaces values in a specified column (`column_name`) with NA if one of the two preceding columns contains the word "Total".
#' This is intended to handle cases where there might be multiple "Total" values in close proximity, keeping only the first.
#'
#' @param df A data frame containing the column to be processed.
#' @param column_name The name of the column where extra "Total" values will be replaced with NA.
#'
#' @return
#' The modified data frame where any value in `column_name` preceded by "Total" in one of the two preceding columns is replaced with NA.
#'
#' @examples
#' # Create a sample data frame
#' df <- data.frame(col1 = c("A", "B", "Total"), col2 = c(1, 2, 3), col3 = c(4, 5, 6), col4 = c(7, "Total", 9))
#'
#' # Replace extra "Total" values with NA
#' df_modified <- replace_extra_total_values_with_NA(df, column_name = "col4")
#'
#' # Print the modified data frame
#' print(df_modified)
replace_extra_total_values_with_NA <- function(df, column_name) {
# Get the column index
col_index <- which(names(df) == column_name)

# Check if the column index is greater than 2 (to avoid negative indexing)
if (col_index > 2) {
# Loop over rows of the dataframe
for (i in 1:nrow(df)) {
# If one of the two previous columns contains 'Total', replace the cell with NA
if ("Total" %in% df[i, (col_index - 2):(col_index - 1)]) {
df[i, column_name] <- NA
}
}
}

return(df)
}

#' @title Convert Wide-Format Cells to Long-Format Rows
#'
#' @description
#' This function converts a data frame from wide-format to long-format. It assumes specific column naming conventions:
#' - Columns containing "Released To" are expected to have comma-separated values representing units released to.
#' - Columns containing "Units Released" are expected to store the corresponding unit quantity for each "Released To" value.
#' - A column containing "Date" (or similar) should exist to hold the date information.
#'
#' @param df A data frame in wide-format with columns named as described.
#'
#' @return
#' A new data frame in long-format where each row represents a single release event with columns:
#' - The original columns from the input data frame.
#' - "Released To": The recipient unit for a specific release.
#' - "Units Released": The quantity of units released to the corresponding recipient.
#' - "Date": The date of the release event (unchanged from the input).
#' - "Received From": unchanged from the input.
#'
#' @examples
#' # Create a sample data frame in wide-format
#' df <- data.frame(
#' ID = c(1, 2, 3),
#' "Units Available" = c(10, 15, 20),
#' "Released To A,B" = c(NA, "5-2", "8-1"),
#' Date = c("2024-01-01", "2024-02-01", "2024-03-01")
#' )
#'
#' # Convert wide-format cells to long-format rows
#' df_long <- wide_cells_2_rows(df)
#'
#' # Print the long-format data frame
#' print(df_long)
wide_cells_2_rows <- function(df) {
tryCatch(
{
col_released_to <- find_string_cols(df, "Released To")
col_units_released <- find_string_cols(df, "Units Released")
col_date <- find_string_cols(df, "Date")
col_recived_from <- find_string_cols(df, "Received From")

# Create an empty dataframe to store the new rows
new_df <- df[FALSE, ]
# Loop over rows of the dataframe
for (i in 1:nrow(df)) {
# Add the current row to the new dataframe
new_df <- rbind(new_df, df[i, ])
# If the cell in 'Released To ...' is not empty, split its value by comma
if (!is.na(df[[col_released_to]][i]) & grepl(",", df[[col_released_to]][i])) {
# Split the cell content by comma
cell_content <- strsplit(as.character(df[[col_released_to]][i]), ",")[[1]]
for (content in cell_content) {
# Split the content by '-'
split_content <- strsplit(content, "-")[[1]]
new_row <- df[i, ]
new_row[] <- NA
new_row[[col_date]] <- df[[col_date]][i]
new_row[[col_recived_from]] <- df[[col_recived_from]][i]
# Assign the part before '-' to 'Released To ...'
new_row[[col_released_to]] <- split_content[1]
# Assign the part after '-' to 'Units Released'
new_row[[col_units_released]] <- split_content[2]
# Add the new row to the new dataframe
new_df <- rbind(new_df, new_row)
}
}
}

new_df <- remove_original_cells(new_df, col_units_released, col_released_to)
new_df <- replace_extra_total_values_with_NA(new_df, col_units_released)

logDebug(
log_to_json(
"The wide-format cells has been converted to long-format!",
script = "script1",
file = "wide_format_2_long_format.R",
functionName = "wide_cells_2_rows"
)
)
return(new_df)
},
error = function(e) {
# Handle errors
logError(
log_to_json(
"Error occurred while converting a wide-format cells to a long-format, dataframe was returned without changes. Error: {values['e']}",
values = list(e = e$message),
script = "script1",
file = "wide_format_2_long_format.R",
errorCode = "tryCatch",
functionName = "wide_cells_2_rows"
)
)
# Return original df
return(df)
},
warning = function(w) {
# Handle warnings
logWarn(
log_to_json(
"Warning occurred while converting a wide-format cells to a long-format, dataframe was returned without changes. Warning: {values['w']}",
values = list(w = w$message),
script = "script1",
file = "wide_format_2_long_format.R",
warningCode = "tryCatch",
functionName = "wide_cells_2_rows"
)
)
# Return original df
return(df)
}
)
}

#' @title Create New Rows in a DataFrame
#'
Expand Down Expand Up @@ -65,7 +282,15 @@ create_new_rows <- function(df) {
}
}
}
logDebug("The wide-format area has been converted to long-format!")
logDebug(
log_to_json(
"The wide-format area has been converted to long-format!",
script = "script1",
file = "wide_format_2_long_format.R",
functionName = "create_new_rows"
)
)

return(new_df)
},
error = function(e) {
Expand Down
24 changes: 24 additions & 0 deletions man/check_patterns_in_column.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions man/find_string_cols.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 32 additions & 0 deletions man/remove_original_cells.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions man/replace_extra_total_values_with_NA.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 91c8429

Please sign in to comment.