From cfdd7da9837afde6347ff46ecac283f60881b2b8 Mon Sep 17 00:00:00 2001
From: Michael Aydinbas
Date: Sat, 23 Dec 2023 23:31:30 +0100
Subject: [PATCH 1/8] wip: adapt first script to use future and furrr for
 parallelizing main loop

---
 .Rprofile                               |   2 +-
 DESCRIPTION                             |   4 +-
 NAMESPACE                               |   6 +
 R/helper_main.R                         |   3 +
 R/helper_scipt_1.R                      | 154 +++++++++++++++++++
 man/process_patient_data.Rd             |  25 ++++
 man/process_product_data.Rd             |  25 ++++
 man/process_tracker_file.Rd             |  18 +++
 renv.lock                               | 151 +++++++++++++++----
 scripts/run_script_1_extract_raw_data.R | 189 ++++--------------------
 10 files changed, 381 insertions(+), 196 deletions(-)
 create mode 100644 R/helper_scipt_1.R
 create mode 100644 man/process_patient_data.Rd
 create mode 100644 man/process_product_data.Rd
 create mode 100644 man/process_tracker_file.Rd

diff --git a/.Rprofile b/.Rprofile
index b3b3274..eec9766 100644
--- a/.Rprofile
+++ b/.Rprofile
@@ -3,5 +3,5 @@ source("renv/activate.R")
 if (interactive()) {
     require("devtools", quietly = TRUE)
     # automatically attaches usethis
-    devtools::load_all()
+    devtools::install()
 }
diff --git a/DESCRIPTION b/DESCRIPTION
index 4be403b..a1d7052 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -25,7 +25,9 @@ Imports:
     tidyr,
     yaml,
     zoo,
-    arrow
+    arrow,
+    furrr,
+    progressr
 Suggests:
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
index 8e7d6d3..f8e779f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -14,9 +14,15 @@ export(fix_id)
 export(fix_sex)
 export(fix_t1d_diagnosis_age)
 export(fix_testing_frequency)
+export(get_files)
+export(get_synonyms)
 export(get_tracker_year)
 export(harmonize_patient_data_columns)
 export(id_2_county_hospisal)
+export(init_paths)
+export(process_patient_data)
+export(process_product_data)
+export(process_tracker_file)
 export(read_cleaned_patient_data)
 export(read_column_synonyms)
 export(report_empty_intersections)
diff --git a/R/helper_main.R b/R/helper_main.R
index 50c9383..ca8d1a3 100644
--- a/R/helper_main.R
+++ b/R/helper_main.R
@@ -12,6 +12,7 @@
 #' @param delete If TRUE, delete all files under output.
 #'
 #' @return A list with tracker_root_path and output_root path
+#' @export
 init_paths <- function(names, output_dir_name = "output", delete = FALSE) {
     paths <- list()
     tracker_root_path <- select_A4D_directory()
@@ -57,6 +58,7 @@ init_paths <- function(names, output_dir_name = "output", delete = FALSE) {
 #' @param pattern The search pattern to filter files.
 #'
 #' @return A vector with file names.
+#' @export
 get_files <- function(tracker_root, pattern = "\\.xlsx$") {
     tracker_files <- list.files(path = tracker_root, recursive = T, pattern = pattern)
     tracker_files <-
@@ -70,6 +72,7 @@ get_files <- function(tracker_root, pattern = "\\.xlsx$") {
 #' Read in all defined synonyms from the YAML files inside the synonyms folder.
 #'
 #' @return A list with both patient and product data synonyms as tibble.
+#' @export
 get_synonyms <- function() {
     ## Extract synonyms for products and patients
     ## If you encounter new columns, just add the synonyms to these YAML files
diff --git a/R/helper_scipt_1.R b/R/helper_scipt_1.R
new file mode 100644
index 0000000..222d5f1
--- /dev/null
+++ b/R/helper_scipt_1.R
@@ -0,0 +1,154 @@
+
+
+#' @title Process a single tracker file and extract patient and product data.
+#'
+#' @param tracker_file Filename of the tracker.
+#' @param paths A list with the paths to the tracker root dir, the patient and product output dir and the root output dir.
+#' @param p Progressor from the progressr package.
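+#'
+#' @examples
+#' \dontrun{
+#' # Illustrative sketch of the intended call pattern, assuming A4D_DATA_ROOT
+#' # is set so that init_paths()/get_files() can resolve the tracker directory:
+#' paths <- init_paths(c("patient_data_raw", "product_data_raw"))
+#' files <- get_files(paths$tracker_root)
+#' progressr::with_progress({
+#'     p <- progressr::progressor(steps = length(files))
+#'     process_tracker_file(files[[1]], paths = paths, p = p)
+#' })
+#' }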
+#' +#' @export +process_tracker_file <- function(tracker_file, paths, p) { + p() + tracker_name <- tools::file_path_sans_ext(basename(tracker_file)) + synonyms <- get_synonyms() + tracker_data_file <- + file.path(paths$tracker_root, tracker_file) + logDebug("Start process_tracker_file.") + logInfo( + "Current file: ", + tracker_name + ) + + logfile <- paste0(tracker_name, "_", "patient") + with_file_logger(logfile, + { + tryCatch( + process_patient_data( + tracker_name = tracker_name, + tracker_data_file = tracker_data_file, + output_root = paths$patient_data_raw, + synonyms_patient = synonyms$patient + ), + error = function(e) { + logError("Could not process patient data. Error = ", e$message, ".") + }, + warning = function(w) { + logWarn("Could not process patient data. Warning = ", w$message, ".") + } + ) + }, + output_root = paths$output_root + ) + + logfile <- paste0(tracker_name, "_", "product") + + with_file_logger(logfile, + { + tryCatch( + process_product_data( + tracker_name = tracker_name, + tracker_data_file = tracker_data_file, + output_root = paths$product_data_raw, + synonyms_product = synonyms$product + ), + error = function(e) { + logError("Could not process product data. Error = ", e$message, ".") + }, + warning = function(w) { + logWarn("Could not process product data. Warning = ", w$message, ".") + } + ) + }, + output_root = paths$output_root + ) + + logInfo("Finish process_tracker_file.") +} + + +#' @title Extract patient data. +#' +#' @param tracker_name Filename without extension. +#' @param tracker_data_file Filename of the tracker. +#' @param output_root Directory for storing extracted patient data. +#' @param synonyms_patient Synonyms for patient data header names. +#' +#' @export +process_patient_data <- + function(tracker_name, + tracker_data_file, + output_root, + synonyms_patient) { + logDebug("Start process_patient_data.") + + df_raw_patient <- + reading_patient_data( + tracker_data_file = tracker_data_file, + columns_synonyms = synonyms_patient + ) + + df_raw_patient <- df_raw_patient %>% dplyr::mutate(file_name = tracker_name) + + logDebug( + "df_raw_patient dim: ", + dim(df_raw_patient) %>% as.data.frame(), + "." + ) + + export_data_as_parquet( + data = df_raw_patient, + filename = tracker_name, + output_root = output_root, + suffix = "_patient_raw" + ) + + logInfo("Finish process_patient_data.") + } + + +#' @title Extract product data. +#' +#' @param tracker_name Filename without extension. +#' @param tracker_data_file Filename of the tracker. +#' @param output_root Directory for storing extracted product data. +#' @param synonyms_product Synonyms for product data header names. +#' +#' @export +process_product_data <- + function(tracker_name, + tracker_data_file, + output_root, + synonyms_product) { + logDebug("Start process_product_data.") + + df_raw_product <- + reading_product_data_step1( + tracker_data_file = tracker_data_file, + columns_synonyms = synonyms_product + ) + + if (!is.null(df_raw_product)) { + df_raw_product <- df_raw_product %>% dplyr::mutate(file_name = tracker_name) + } else { + logDebug("Empty product data") + } + + logDebug( + "df_raw_product dim: ", + dim(df_raw_product) %>% as.data.frame(), + "." 
+ ) + + # product set sensitive column to NA and add tracker file name as a column + if (!is.null(df_raw_product)) { + export_data_as_parquet( + data = df_raw_product, + filename = tracker_name, + output_root = output_root, + suffix = "_product_raw" + ) + } else { + logWarn("No product data in the file") + } + logDebug("Finish process_product_data.") + } diff --git a/man/process_patient_data.Rd b/man/process_patient_data.Rd new file mode 100644 index 0000000..e1334b2 --- /dev/null +++ b/man/process_patient_data.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper_scipt_1.R +\name{process_patient_data} +\alias{process_patient_data} +\title{Extract patient data.} +\usage{ +process_patient_data( + tracker_name, + tracker_data_file, + output_root, + synonyms_patient +) +} +\arguments{ +\item{tracker_name}{Filename without extension.} + +\item{tracker_data_file}{Filename of the tracker.} + +\item{output_root}{Directory for storing extracted patient data.} + +\item{synonyms_patient}{Synonyms for patient data header names.} +} +\description{ +Extract patient data. +} diff --git a/man/process_product_data.Rd b/man/process_product_data.Rd new file mode 100644 index 0000000..0109c18 --- /dev/null +++ b/man/process_product_data.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper_scipt_1.R +\name{process_product_data} +\alias{process_product_data} +\title{Extract product data.} +\usage{ +process_product_data( + tracker_name, + tracker_data_file, + output_root, + synonyms_product +) +} +\arguments{ +\item{tracker_name}{Filename without extension.} + +\item{tracker_data_file}{Filename of the tracker.} + +\item{output_root}{Directory for storing extracted product data.} + +\item{synonyms_product}{Synonyms for product data header names.} +} +\description{ +Extract product data. +} diff --git a/man/process_tracker_file.Rd b/man/process_tracker_file.Rd new file mode 100644 index 0000000..b9728ae --- /dev/null +++ b/man/process_tracker_file.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper_scipt_1.R +\name{process_tracker_file} +\alias{process_tracker_file} +\title{Process a single tracker file and extract patient and product data.} +\usage{ +process_tracker_file(paths, tracker_file, tracker_name) +} +\arguments{ +\item{paths}{a list with the paths to the tracker root dir, the patient and product output dir and the root output dir.} + +\item{tracker_file}{filename.} + +\item{tracker_name}{filename without extension.} +} +\description{ +Process a single tracker file and extract patient and product data. 
+} diff --git a/renv.lock b/renv.lock index 9a44c60..5cb4660 100644 --- a/renv.lock +++ b/renv.lock @@ -65,7 +65,7 @@ }, "R.utils": { "Package": "R.utils", - "Version": "2.12.2", + "Version": "2.12.3", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -76,7 +76,7 @@ "tools", "utils" ], - "Hash": "325f01db13da12c04d8f6e7be36ff514" + "Hash": "3dc2829b790254bfba21e60965787651" }, "R6": { "Package": "R6", @@ -101,7 +101,7 @@ }, "arrow": { "Package": "arrow", - "Version": "13.0.0.1", + "Version": "14.0.0.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -119,7 +119,7 @@ "utils", "vctrs" ], - "Hash": "5f5124b727a451be32d23dcd2c41ebe8" + "Hash": "042f2ee2286a91abe5a3d66c9be92380" }, "askpass": { "Package": "askpass", @@ -247,14 +247,14 @@ }, "cli": { "Package": "cli", - "Version": "3.6.1", + "Version": "3.6.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "utils" ], - "Hash": "89e6d8219950eac806ae0c489052048a" + "Hash": "1216ac65ac55ec0058a6f75d7ca0fd52" }, "clipr": { "Package": "clipr", @@ -266,6 +266,16 @@ ], "Hash": "3f038e5ac7f41d4ac41ce658c85e3042" }, + "codetools": { + "Package": "codetools", + "Version": "0.2-19", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "c089a619a7fae175d149d89164f8c7d8" + }, "commonmark": { "Package": "commonmark", "Version": "1.9.0", @@ -275,13 +285,13 @@ }, "cpp11": { "Package": "cpp11", - "Version": "0.4.6", + "Version": "0.4.7", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "707fae4bbf73697ec8d85f9d7076c061" + "Hash": "5a295d7d963cc5035284dcdbaf334f4e" }, "crayon": { "Package": "crayon", @@ -321,14 +331,14 @@ }, "data.table": { "Package": "data.table", - "Version": "1.14.8", + "Version": "1.14.10", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "methods" ], - "Hash": "b4c06e554f33344e044ccd7fdca750a9" + "Hash": "6ea17a32294d8ca00455825ab0cf71b9" }, "desc": { "Package": "desc", @@ -426,7 +436,7 @@ }, "dplyr": { "Package": "dplyr", - "Version": "1.1.3", + "Version": "1.1.4", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -445,7 +455,7 @@ "utils", "vctrs" ], - "Hash": "e85ffbebaad5f70e1a2e2ef4302b4949" + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" }, "ellipsis": { "Package": "ellipsis", @@ -471,7 +481,7 @@ }, "fansi": { "Package": "fansi", - "Version": "1.0.5", + "Version": "1.0.6", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -479,7 +489,7 @@ "grDevices", "utils" ], - "Hash": "3e8583a60163b4bc1a80016e63b9959e" + "Hash": "962174cf2aeb5b9eea581522286a911f" }, "fastmap": { "Package": "fastmap", @@ -511,6 +521,37 @@ ], "Hash": "47b5f30c720c23999b913a1a635cf0bb" }, + "furrr": { + "Package": "furrr", + "Version": "0.3.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "future", + "globals", + "lifecycle", + "purrr", + "rlang", + "vctrs" + ], + "Hash": "da7a4c32196cb2262a41dd5a25d486ff" + }, + "future": { + "Package": "future", + "Version": "1.33.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "digest", + "globals", + "listenv", + "parallel", + "parallelly", + "utils" + ], + "Hash": "e57e292737f7a4efa9d8a91c5908222c" + }, "generics": { "Package": "generics", "Version": "0.1.3", @@ -563,6 +604,17 @@ ], "Hash": "ab08ac61f3e1be454ae21911eb8bc2fe" }, + "globals": { + "Package": "globals", + "Version": "0.16.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "codetools" + ], + "Hash": 
"baa9585ab4ce47a9f4618e671778cc6f" + }, "glue": { "Package": "glue", "Version": "1.6.2", @@ -709,13 +761,13 @@ }, "jsonlite": { "Package": "jsonlite", - "Version": "1.8.7", + "Version": "1.8.8", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "methods" ], - "Hash": "266a20443ca13c65688b2116d5220f76" + "Hash": "e1b9c55281c5adc4dd113652d9e26768" }, "knitr": { "Package": "knitr", @@ -772,6 +824,16 @@ ], "Hash": "b8552d117e1b808b09a832f589b79035" }, + "listenv": { + "Package": "listenv", + "Version": "0.9.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "4fbd3679ec8ee169ba28d4b1ea7d0e8f" + }, "lubridate": { "Package": "lubridate", "Version": "1.9.3", @@ -855,6 +917,18 @@ ], "Hash": "c03b4c18d42da881fb8e15a085c2b9d6" }, + "parallelly": { + "Package": "parallelly", + "Version": "1.36.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "parallel", + "tools", + "utils" + ], + "Hash": "bca377e1c87ec89ebed77bba00635b2e" + }, "pillar": { "Package": "pillar", "Version": "1.9.0", @@ -998,16 +1072,29 @@ }, "progress": { "Package": "progress", - "Version": "1.2.2", + "Version": "1.2.3", "Source": "Repository", "Repository": "CRAN", "Requirements": [ + "R", "R6", "crayon", "hms", "prettyunits" ], - "Hash": "14dc9f7a3c91ebb14ec5bb9208a07061" + "Hash": "f4625e061cb2865f111b47ff163a5ca6" + }, + "progressr": { + "Package": "progressr", + "Version": "0.14.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "digest", + "utils" + ], + "Hash": "ac50c4ffa8f6a46580dd4d7813add3c4" }, "promises": { "Package": "promises", @@ -1347,7 +1434,7 @@ }, "stringi": { "Package": "stringi", - "Version": "1.7.12", + "Version": "1.8.3", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1356,11 +1443,11 @@ "tools", "utils" ], - "Hash": "ca8bd84263c77310739d2cf64d84d7c9" + "Hash": "058aebddea264f4c99401515182e656a" }, "stringr": { "Package": "stringr", - "Version": "1.5.0", + "Version": "1.5.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1373,7 +1460,7 @@ "stringi", "vctrs" ], - "Hash": "671a4d384ae9d32fc47a14e98bfa3dc8" + "Hash": "960e2ae9e09656611e0b8214ad543207" }, "styler": { "Package": "styler", @@ -1608,7 +1695,7 @@ }, "vctrs": { "Package": "vctrs", - "Version": "0.6.4", + "Version": "0.6.5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1618,11 +1705,11 @@ "lifecycle", "rlang" ], - "Hash": "266c1ca411266ba8f365fcc726444b87" + "Hash": "c03fa420630029418f7e6da3667aac4a" }, "vroom": { "Package": "vroom", - "Version": "1.6.4", + "Version": "1.6.5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1644,7 +1731,7 @@ "vctrs", "withr" ], - "Hash": "9db52c1656cf19c124f93124ea57f0fd" + "Hash": "390f9315bc0025be03012054103d227c" }, "waldo": { "Package": "waldo", @@ -1697,14 +1784,16 @@ }, "xml2": { "Package": "xml2", - "Version": "1.3.5", + "Version": "1.3.6", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", - "methods" + "cli", + "methods", + "rlang" ], - "Hash": "6c40e5cfcc6aefd88110666e18c31f40" + "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61" }, "xopen": { "Package": "xopen", @@ -1731,10 +1820,10 @@ }, "yaml": { "Package": "yaml", - "Version": "2.3.7", + "Version": "2.3.8", "Source": "Repository", "Repository": "CRAN", - "Hash": "0d0056cc5383fbc240ccd0cb584bf436" + "Hash": "29240487a071f535f5e5d5a323b7afbd" }, "zip": { "Package": "zip", diff --git a/scripts/run_script_1_extract_raw_data.R 
b/scripts/run_script_1_extract_raw_data.R index 5f6f5c3..d4961ca 100644 --- a/scripts/run_script_1_extract_raw_data.R +++ b/scripts/run_script_1_extract_raw_data.R @@ -1,168 +1,31 @@ options(readxl.show_progress = FALSE) - -main <- function() { - paths <- init_paths(c("patient_data_raw", "product_data_raw"), delete = TRUE) - setup_logger(paths$output_root, "script1") - tracker_files <- get_files(paths$tracker_root) - logInfo( - "Found ", - length(tracker_files), - " xlsx files under ", - paths$tracker_root, - "." - ) - - synonyms <- get_synonyms() - - logInfo("Start processing tracker files.") - - for (i in seq_along(tracker_files)) { - tracker_file <- tracker_files[i] - tracker_name <- tools::file_path_sans_ext(basename(tracker_file)) - tryCatch( - process_tracker_file(paths, tracker_file, tracker_name, synonyms), - error = function(e) { - logError("Could not process ", tracker_name, ". Error = ", e$message, ".") - }, - warning = function(w) { - logWarn("Could not process ", tracker_name, ". Warning = ", w$message, ".") - } - ) - cat(paste("Processed ", i, " of ", length(tracker_files), " (", round(i / length(tracker_files) * 100, 0), "%) tracker files.\n")) - } - logInfo("Finish processing all tracker files.") -} - - -process_tracker_file <- function(paths, tracker_file, tracker_name, synonyms) { - tracker_data_file <- - file.path(paths$tracker_root, tracker_file) - logDebug("Start process_tracker_file.") - logInfo( - "Current file: ", - tracker_name +future::plan(future::multisession, workers = 4) + +Sys.setenv(A4D_DATA_ROOT = "data") +paths <- a4d::init_paths(c("patient_data_raw", "product_data_raw"), delete = TRUE) +setup_logger(paths$output_root, "script1") +tracker_files <- a4d::get_files(paths$tracker_root) +logInfo( + "Found ", + length(tracker_files), + " xlsx files under ", + paths$tracker_root, + "." +) + +logInfo("Start processing tracker files.") + +progressr::with_progress({ + p <- progressr::progressor(steps = length(tracker_files)) + + result <- furrr::future_map( + tracker_files, + a4d::process_tracker_file, + paths=paths, + p=p ) +}) - - logfile <- paste0(tracker_name, "_", "patient") - with_file_logger(logfile, - { - tryCatch( - process_patient_data( - tracker_name = tracker_name, - tracker_data_file = tracker_data_file, - output_root = paths$patient_data_raw, - synonyms_patient = synonyms$patient - ), - error = function(e) { - logError("Could not process patient data. Error = ", e$message, ".") - }, - warning = function(w) { - logWarn("Could not process patient data. Warning = ", w$message, ".") - } - ) - }, - output_root = paths$output_root - ) - - logfile <- paste0(tracker_name, "_", "product") - - with_file_logger(logfile, - { - tryCatch( - process_product_data( - tracker_name = tracker_name, - tracker_data_file = tracker_data_file, - output_root = paths$product_data_raw, - synonyms_product = synonyms$product - ), - error = function(e) { - logError("Could not process product data. Error = ", e$message, ".") - }, - warning = function(w) { - logWarn("Could not process product data. 
Warning = ", w$message, ".") - } - ) - }, - output_root = paths$output_root - ) - - logInfo("Finish process_tracker_file.") -} - - -process_patient_data <- - function(tracker_name, - tracker_data_file, - output_root, - synonyms_patient) { - logDebug("Start process_patient_data.") - - df_raw_patient <- - reading_patient_data( - tracker_data_file = tracker_data_file, - columns_synonyms = synonyms_patient - ) - - df_raw_patient <- df_raw_patient %>% dplyr::mutate(file_name = tracker_name) - - logDebug( - "df_raw_patient dim: ", - dim(df_raw_patient) %>% as.data.frame(), - "." - ) - - export_data_as_parquet( - data = df_raw_patient, - filename = tracker_name, - output_root = output_root, - suffix = "_patient_raw" - ) - - logInfo("Finish process_patient_data.") - } - - -process_product_data <- - function(tracker_name, - tracker_data_file, - output_root, - synonyms_product) { - logDebug("Start process_product_data.") - - df_raw_product <- - reading_product_data_step1( - tracker_data_file = tracker_data_file, - columns_synonyms = synonyms_product - ) - - if (!is.null(df_raw_product)) { - df_raw_product <- df_raw_product %>% dplyr::mutate(file_name = tracker_name) - } else { - logDebug("Empty product data") - } - - logDebug( - "df_raw_product dim: ", - dim(df_raw_product) %>% as.data.frame(), - "." - ) - - # product set sensitive column to NA and add tracker file name as a column - if (!is.null(df_raw_product)) { - export_data_as_parquet( - data = df_raw_product, - filename = tracker_name, - output_root = output_root, - suffix = "_product_raw" - ) - } else { - logWarn("No product data in the file") - } - logDebug("Finish process_product_data.") - } - -# profvis(main()) -main() +logInfo("Finish processing all tracker files.") clearLoggers() From 7c5d157f7ffd538d59515d8961432cfdbf9439b2 Mon Sep 17 00:00:00 2001 From: Michael Aydinbas Date: Thu, 28 Dec 2023 21:27:53 +0100 Subject: [PATCH 2/8] working future code for script 1 and 2 --- DESCRIPTION | 4 +- NAMESPACE | 3 + R/create_table_patient_data.R | 4 +- R/create_table_patient_data_changes_only.R | 4 +- R/create_table_patient_data_static.R | 4 +- R/create_table_product_data.R | 42 +- R/helper_main.R | 18 +- R/helper_patient_data_fix.R | 38 +- R/helper_product_data.R | 12 +- R/helper_read_patient_data.R | 32 +- R/helper_scipt_1.R | 93 +++-- R/helper_script_2.R | 364 ++++++++++++++++++ R/link_product_patient.R | 12 +- R/logger.R | 19 +- R/read_cleaned_patient_data.R | 4 +- R/read_patient_data.R | 26 +- R/read_product_data.R | 60 +-- R/sysdata.rda | Bin 2763 -> 2763 bytes man/process_patient_file.Rd | 25 ++ man/process_product_file.Rd | 18 + man/process_tracker_file.Rd | 8 +- reference_data/build_package_data.R | 2 +- scripts/run_script_1_extract_raw_data.R | 18 +- scripts/run_script_2_clean_data.R | 422 +++------------------ scripts/run_script_3_create_tables.R | 32 +- 25 files changed, 678 insertions(+), 586 deletions(-) create mode 100644 R/helper_script_2.R create mode 100644 man/process_patient_file.Rd create mode 100644 man/process_product_file.Rd diff --git a/DESCRIPTION b/DESCRIPTION index a1d7052..c273a8f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,8 +8,7 @@ License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 -Depends: - ParallelLogger +Depends: Imports: data.table, digest, @@ -17,6 +16,7 @@ Imports: here, lubridate, openxlsx, + ParallelLogger, readr, readxl, stringr, diff --git a/NAMESPACE b/NAMESPACE index f8e779f..ca93149 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,7 +21,9 @@ 
export(harmonize_patient_data_columns) export(id_2_county_hospisal) export(init_paths) export(process_patient_data) +export(process_patient_file) export(process_product_data) +export(process_product_file) export(process_tracker_file) export(read_cleaned_patient_data) export(read_column_synonyms) @@ -29,6 +31,7 @@ export(report_empty_intersections) export(sanitize_str) export(select_A4D_directory) export(set_a4d_data_root) +export(setup_file_logger) export(setup_logger) export(with_file_logger) importFrom(data.table,"%like%") diff --git a/R/create_table_patient_data.R b/R/create_table_patient_data.R index 3f5c864..c6793ed 100644 --- a/R/create_table_patient_data.R +++ b/R/create_table_patient_data.R @@ -9,7 +9,7 @@ #' @param input_root root directory of the input CSV files. #' @param output_root root directory of the output folder. create_table_patient_data_monthly <- function(patient_data_files, input_root, output_root) { - logInfo("Start creating single csv for table patient_data_monthly.") + ParallelLogger::logInfo("Start creating single csv for table patient_data_monthly.") # THERE MIGHT BE MONTHLY COLUMNS MISSING - PLEASE ADD THEM dynamic_patient_columns <- @@ -58,5 +58,5 @@ create_table_patient_data_monthly <- function(patient_data_files, input_root, ou suffix = "" ) - logInfo("Finish creating single csv for table patient_data_monthly.") + ParallelLogger::logInfo("Finish creating single csv for table patient_data_monthly.") } diff --git a/R/create_table_patient_data_changes_only.R b/R/create_table_patient_data_changes_only.R index 6a06fe8..1a4178c 100644 --- a/R/create_table_patient_data_changes_only.R +++ b/R/create_table_patient_data_changes_only.R @@ -16,7 +16,7 @@ create_table_longitudinal_data <- output_root, variable, name) { - logInfo("Start creating single csv for table create_table_longitudinal_data and variable ", variable) + ParallelLogger::logInfo("Start creating single csv for table create_table_longitudinal_data and variable ", variable) dynamic_patient_columns <- c( @@ -79,5 +79,5 @@ create_table_longitudinal_data <- suffix = "" ) - logInfo("Finish creating single csv for table create_table_longitudinal_data and variable ", variable) + ParallelLogger::logInfo("Finish creating single csv for table create_table_longitudinal_data and variable ", variable) } diff --git a/R/create_table_patient_data_static.R b/R/create_table_patient_data_static.R index aaf180d..55c8c1d 100644 --- a/R/create_table_patient_data_static.R +++ b/R/create_table_patient_data_static.R @@ -9,7 +9,7 @@ #' @param input_root root directory of the input CSV files. #' @param output_root root directory of the output folder. 
 create_table_patient_data_static <- function(patient_data_files, input_root, output_root) {
-    logInfo("Start creating single csv for table patient_data_static.")
+    ParallelLogger::logInfo("Start creating single csv for table patient_data_static.")

     # THERE MIGHT BE STATIC COLUMNS MISSING - PLEASE ADD THEM
     static_patient_columns <-
@@ -59,5 +59,5 @@ create_table_patient_data_static <- function(patient_data_files, input_root, out
         suffix = "_static"
     )

-    logInfo("Finish creating single csv for table patient_data_static.")
+    ParallelLogger::logInfo("Finish creating single csv for table patient_data_static.")
 }
diff --git a/R/create_table_product_data.R b/R/create_table_product_data.R
index aaa6448..4ee7eaa 100644
--- a/R/create_table_product_data.R
+++ b/R/create_table_product_data.R
@@ -17,7 +17,7 @@
 #' create_table_product_data("path/to/input/directory", "path/to/output/directory")
 #' }
 create_table_product_data <- function(input_root, output_root) {
-    logInfo("Start creating single file for table product_data.")
+    ParallelLogger::logInfo("Start creating single file for table product_data.")

     # Get a list of all CSV files in the input_root directory
     files <- list.files(input_root, pattern = "*.parquet", full.names = TRUE)
@@ -25,7 +25,7 @@ create_table_product_data <- function(input_root, output_root) {
     # Read all CSV files and store them in a list
     data_list <- lapply(files, function(x) arrow::read_parquet(x))

-    logInfo(length(data_list), " files will be processed for creating the single file for table product_data.")
+    ParallelLogger::logInfo(length(data_list), " files will be processed for creating the single file for table product_data.")

     # Get the union of all column names
     all_names <- unique(unlist(lapply(data_list, colnames)))
@@ -39,25 +39,25 @@ create_table_product_data <- function(input_root, output_root) {
     # Merge all data frames
     merged_data <- do.call(rbind, data_list)

-    logDebug("Copying original parient IDs...")
+    ParallelLogger::logDebug("Copying original patient IDs...")
     merged_data$orig_product_released_to <- merged_data$product_released_to
-    logDebug("Trying to fix patient IDs...")
+    ParallelLogger::logDebug("Trying to fix patient IDs...")
     merged_data$product_released_to <- sapply(merged_data$product_released_to, fix_id)

-    logDebug("Extracting product_county and product_hospisal from patients IDs...")
+    ParallelLogger::logDebug("Extracting product_country and product_hospital from patient IDs...")
     merged_data <- id_2_county_hospisal(
         merged_data,
         "product_released_to", "product_country", "product_hospital"
     )

-    logDebug("Calculating the most frequent 'product_hospital' for each 'file_name'...")
+    ParallelLogger::logDebug("Calculating the most frequent 'product_hospital' for each 'file_name'...")
     tryCatch(
         {
             merged_data <- calculate_most_frequent(merged_data, "file_name", "product_hospital", "table_hospital")
         },
         error = function(e) {
-            logError("Error in calculating the most frequent 'product_hospital': ", e)
+            ParallelLogger::logError("Error in calculating the most frequent 'product_hospital': ", e)
         }
     )

     tryCatch(
         {
             merged_data <- calculate_most_frequent(merged_data, "file_name", "product_country", "table_country")
         },
         error = function(e) {
-            logError("Error in calculating the most frequent 'product_country': ", e)
+            ParallelLogger::logError("Error in calculating the most frequent 'product_country': ", e)
         }
     )

     # Reorder, add, and ensures the correct data type for each column according to the list of fields
     merged_data <- preparing_product_fields(merged_data)

-    logDebug("Checking 'table_country' for each 'file_name'...")
+    ParallelLogger::logDebug("Checking 'table_country' for each 'file_name'...")
     report_empty_intersections(merged_data, "file_name", "table_country")

     # Write the merged and processed data to a file in the output_root directory
@@ -85,7 +85,7 @@ create_table_product_data <- function(input_root, output_root) {
         suffix = ""
     )

-    logInfo("Finish creating single file for table product_data.")
+    ParallelLogger::logInfo("Finish creating single file for table product_data.")
 }
@@ -168,7 +168,7 @@ preparing_product_fields <- function(merged_data) {
         "table_hospital" = "character"
     )

-    logInfo("Start processing fields for the single csv product_data...")
+    ParallelLogger::logInfo("Start processing fields for the single csv product_data...")

     # Check if all fields are present in merged_data
     missing_fields <- setdiff(names(fields), names(merged_data))
@@ -186,7 +186,7 @@
                 merged_data[[field]] <- suppressWarnings(as.Date(original_values))
                 incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values))
                 if (length(incorrect_rows) > 0) {
-                    logWarn(paste(
+                    ParallelLogger::logWarn(paste(
                         "In", field, "incorrect date values were replaced with", ERROR_VAL_DATE,
                         "in", length(incorrect_rows), "rows:",
                         paste(incorrect_rows, collapse = ", ")
@@ -198,7 +198,7 @@
                 merged_data[[field]] <- suppressWarnings(as.numeric(original_values))
                 incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values))
                 if (length(incorrect_rows) > 0) {
-                    logWarn(paste(
+                    ParallelLogger::logWarn(paste(
                         "In", field, "incorrect numeric values were replaced with", ERROR_VAL_NUMERIC,
                         "in", length(incorrect_rows), "rows:",
                         paste(incorrect_rows, collapse = ", ")
@@ -210,7 +210,7 @@
                 merged_data[[field]] <- suppressWarnings(as.integer(original_values))
                 incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values))
                 if (length(incorrect_rows) > 0) {
-                    logWarn(paste(
+                    ParallelLogger::logWarn(paste(
                         "In", field, "incorrect integer values were replaced with", ERROR_VAL_NUMERIC,
                         "in", length(incorrect_rows), "rows:",
                         paste(incorrect_rows, collapse = ", ")
@@ -222,7 +222,7 @@
                 merged_data[[field]] <- as.character(original_values)
                 incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values))
                 if (length(incorrect_rows) > 0) {
-                    logWarn(paste(
+                    ParallelLogger::logWarn(paste(
                         "In", field, "incorrect character values were replaced with", ERROR_VAL_CHARACTER,
                         "in", length(incorrect_rows), "rows:",
                         paste(incorrect_rows, collapse = ", ")
@@ -231,19 +231,19 @@
                 }
             }
         }, warning = function(w) {
-            logError(paste("Warning in converting", field, ": ", w))
+            ParallelLogger::logWarn(paste("Warning in converting", field, ": ", w))
         }, error = function(e) {
-            logWarn(paste("Error in converting", field, ": ", e))
+            ParallelLogger::logError(paste("Error in converting", field, ": ", e))
         }, finally = {
-            logDebug(paste("Finished converting", field))
+            ParallelLogger::logDebug(paste("Finished converting", field))
         })
     }

     # Reorder the columns according to the list of fields
-    logInfo("Reorder the columns according to the list of fields...")
+    ParallelLogger::logInfo("Reorder the columns according to the list of fields...")
     merged_data <- merged_data[, 
c(names(fields), setdiff(names(merged_data), names(fields)))] - logInfo("Finished processing fields for the single csv product_data.") + ParallelLogger::logInfo("Finished processing fields for the single csv product_data.") return(merged_data) } @@ -313,7 +313,7 @@ report_empty_intersections <- function(df, row_category, col_category) { df_row_sums <- df_row_sums[df_row_sums$sum == 0, ] if (nrow(df_row_sums) > 0) { - logWarn( + ParallelLogger::logWarn( "The number of ", row_category, " with empty ", col_category, " is ", nrow(df_row_sums), ": ", paste(df_row_sums$row_name, sep = "", collapse = ", ") diff --git a/R/helper_main.R b/R/helper_main.R index ca8d1a3..b5cd0a1 100644 --- a/R/helper_main.R +++ b/R/helper_main.R @@ -139,7 +139,7 @@ read_column_synonyms <- function(synonym_file, path_prefixes = c("reference_data #' ) #' } export_data <- function(data, filename, output_root, suffix) { - logDebug("Start export_data. Suffix = ", suffix, ".") + ParallelLogger::logDebug("Start export_data. Suffix = ", suffix, ".") data %>% write.csv( file = @@ -156,7 +156,7 @@ export_data <- function(data, filename, output_root, suffix) { fileEncoding = "UTF-16LE", quote = T ) - logInfo("Finish export_data. Suffix = ", suffix, ".") + ParallelLogger::logInfo("Finish export_data. Suffix = ", suffix, ".") } @@ -177,12 +177,12 @@ export_data <- function(data, filename, output_root, suffix) { #' ) #' } export_data_as_parquet <- function(data, filename, output_root, suffix) { - logDebug("Start export_data. Suffix = ", suffix, ".") + ParallelLogger::logDebug("Start export_data. Suffix = ", suffix, ".") data %>% arrow::write_parquet( sink = file.path(output_root, paste0(filename, suffix, ".parquet")), ) - logInfo("Finish export_data. Suffix = ", suffix, ".") + ParallelLogger::logInfo("Finish export_data. Suffix = ", suffix, ".") } @@ -193,7 +193,7 @@ export_data_as_parquet <- function(data, filename, output_root, suffix) { #' #' @return tibble with patient data read_raw_csv <- function(file) { - logDebug("Start reading data with read_csv.") + ParallelLogger::logDebug("Start reading data with read_csv.") df_patient_raw <- readr::read_csv( file, name_repair = "check_unique", @@ -202,9 +202,9 @@ read_raw_csv <- function(file) { col_types = readr::cols(.default = "c"), locale = readr::locale(encoding = "UTF-16LE") ) - logDebug("Finished loading data with read_csv.") - logInfo("Dim: ", dim(df_patient_raw)) - logInfo("Columns: ", spec(df_patient_raw)) + ParallelLogger::logDebug("Finished loading data with read_csv.") + ParallelLogger::logInfo("Dim: ", dim(df_patient_raw)) + ParallelLogger::logInfo("Columns: ", spec(df_patient_raw)) df_patient_raw } @@ -218,6 +218,6 @@ read_raw_csv <- function(file) { #' @return A named character vector with all allowed provinces. 
get_allowed_provinces <- function() { ## Should new countries and provinces be added, update the YAML file - provinces <- yaml::read_yaml("reference_data/provinces/allowed_provinces.yaml") %>% unlist() + provinces <- yaml::read_yaml("reference_data/provinces/allowed_provinces.yaml") |> unlist() return(provinces) } diff --git a/R/helper_patient_data_fix.R b/R/helper_patient_data_fix.R index 05bb168..5b10d72 100644 --- a/R/helper_patient_data_fix.R +++ b/R/helper_patient_data_fix.R @@ -24,11 +24,11 @@ convert_to <- function(x, cast_fnc, error_val, col_name = "", id = "") { x <- tryCatch( cast_fnc(x), error = function(e) { - logError("Could not convert value ", x, " in column ", col_name, " for patient: ", id) + ParallelLogger::logError("Could not convert value ", x, " in column ", col_name, " for patient: ", id) x <- error_val }, warning = function(w) { - logWarn("Could not convert value ", x, " in column ", col_name, " for patient: ", id) + ParallelLogger::logWarn("Could not convert value ", x, " in column ", col_name, " for patient: ", id) x <- error_val } ) @@ -55,7 +55,7 @@ cut_numeric_value <- function(x, } if (x < min || x > max) { - logWarn( + ParallelLogger::logWarn( "Found invalid value ", x, " for column ", col_name, " outside [", min, ", ", max, "]. ", "Value was replaced with ", ERROR_VAL_NUMERIC, "." ) @@ -154,7 +154,7 @@ parse_dates <- function(date) { parsed_date <- suppressWarnings(lubridate::as_date(date)) if (is.na(parsed_date)) { - logWarn( + ParallelLogger::logWarn( "Could not parse date value ", date, ". ", "Trying to parse with lubridate::parse_date_time and orders = c('dmy', 'dmY', 'by', 'bY')." ) @@ -195,9 +195,9 @@ check_allowed_values <- function(x, valid_values, id, replace_invalid = TRUE, er valid_value_mapping <- setNames(as.list(valid_values), sanitize_str(valid_values)) if (!sanitize_str(x) %in% names(valid_value_mapping)) { - logWarn("Patient ", id, ": Value ", x, " for column ", col, " is not in the list of allowed values. ") + ParallelLogger::logWarn("Patient ", id, ": Value ", x, " for column ", col, " is not in the list of allowed values. ") if (replace_invalid) { - logInfo("Replacing ", x, " with ", error_val, ".") + ParallelLogger::logInfo("Replacing ", x, " with ", error_val, ".") return(error_val) } else { return(x) @@ -275,13 +275,13 @@ fix_age <- function(age, dob, tracker_year, tracker_month, id) { } if (is.na(age)) { - logWarn( + ParallelLogger::logWarn( "Patient ", id, ": age is missing. Using calculated age ", calc_age, " instead of original age." ) } else { if (calc_age != age) { - logWarn( + ParallelLogger::logWarn( "Patient ", id, ": age ", age, " is different from calculated age ", calc_age, ". Using calculated age instead of original age." ) @@ -289,7 +289,7 @@ fix_age <- function(age, dob, tracker_year, tracker_month, id) { } if (calc_age < 0) { - logWarn("Patient ", id, ": calculated age is negative. Something went wrong.") + ParallelLogger::logWarn("Patient ", id, ": calculated age is negative. 
Something went wrong.") calc_age <- ERROR_VAL_NUMERIC } } @@ -322,11 +322,11 @@ fix_bmi <- function(weight, height, id) { if (!is.na(weight) && weight == ERROR_VAL_CHARACTER) { - logWarn("Patient ", id, ": the weight is out of bounds.") + ParallelLogger::logWarn("Patient ", id, ": the weight is out of bounds.") } if (!is.na(height) && height == ERROR_VAL_CHARACTER) { - logWarn("Patient ", id, ": the height is out of bounds.") + ParallelLogger::logWarn("Patient ", id, ": the height is out of bounds.") } bmi } @@ -353,7 +353,7 @@ fix_sex <- function(sex, id) { ) if (!is.na(fixed_sex) && fixed_sex == ERROR_VAL_CHARACTER) { - logWarn("Patient ", id, ": sex ", sex, " is not in the list of synonyms. Replacing it with ", fixed_sex, ".") + ParallelLogger::logWarn("Patient ", id, ": sex ", sex, " is not in the list of synonyms. Replacing it with ", fixed_sex, ".") } fixed_sex } @@ -463,7 +463,7 @@ fix_testing_frequency <- function(test_frq) { } if (grepl("-", test_frq, fixed = TRUE)) { - logInfo("Found a range for testing_frequency. Replacing it with the mean.") + ParallelLogger::logInfo("Found a range for testing_frequency. Replacing it with the mean.") test_frq <- try(as.character(replace_range_with_mean(test_frq), silent = TRUE)) } @@ -492,7 +492,7 @@ replace_range_with_mean <- function(x) { #' #' @return data frame with two new columns: blood_pressure_sys_mmhg and blood_pressure_dias_mmhg. split_bp_in_sys_and_dias <- function(df) { - logInfo("Splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") + ParallelLogger::logInfo("Splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") df <- df %>% dplyr::mutate( blood_pressure_mmhg = dplyr::case_when( @@ -502,7 +502,7 @@ split_bp_in_sys_and_dias <- function(df) { ) if (paste(ERROR_VAL_NUMERIC, ERROR_VAL_NUMERIC, sep = "/") %in% df$blood_pressure_mmhg) { - logWarn( + ParallelLogger::logWarn( "Found invalid values for column blood_pressure_mmhg that do not follow the format X/Y. ", "Values were replaced with ", ERROR_VAL_NUMERIC, "." ) @@ -514,7 +514,7 @@ split_bp_in_sys_and_dias <- function(df) { delim = "/", names = c("blood_pressure_sys_mmhg", "blood_pressure_dias_mmhg"), ) - logDebug("Finished splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") + ParallelLogger::logDebug("Finished splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") df } @@ -550,12 +550,12 @@ fix_id <- function(id) { id <- stringr::str_replace(id, "-", "_") if (!grepl("^[[:upper:]]{2}_[[:upper:]]{2}[[:digit:]]{3}$", id)) { - logWarn("Patient ", id, ": id cannot be matched to a 7 letter alpha numeric code like XX_YY001. ") + ParallelLogger::logWarn("Patient ", id, ": id cannot be matched to a 7 letter alpha numeric code like XX_YY001. 
") if (stringr::str_length(id) > 8) { - logWarn("Patient ", id, ": id was truncated because it is longer than 8 characters.") + ParallelLogger::logWarn("Patient ", id, ": id was truncated because it is longer than 8 characters.") id <- stringr::str_sub(id, 1, 8) } else { - logError("Patient ", id, ": id is not valid.") + ParallelLogger::logError("Patient ", id, ": id is not valid.") id <- ERROR_VAL_CHARACTER } } diff --git a/R/helper_product_data.R b/R/helper_product_data.R index 6f04cd7..3e56f6b 100644 --- a/R/helper_product_data.R +++ b/R/helper_product_data.R @@ -83,7 +83,7 @@ get_patient_end <- function(df, j) { # @Description: Reads product data from a monthly file based on extraction logic extract_product_data <- function(monthly_tracker_df) { - logDebug("Starting extract_product_data.") + ParallelLogger::logDebug("Starting extract_product_data.") start_df_msd <- NULL end_df_msd <- NULL @@ -103,7 +103,7 @@ extract_product_data <- function(monthly_tracker_df) { # Clean empty remaining first row product_data_df <- set_second_row_as_headers_and_remove_first_row(product_data_df) - logDebug("Finish extract_product_data.") + ParallelLogger::logDebug("Finish extract_product_data.") return(product_data_df) } @@ -134,7 +134,7 @@ extract_product_data <- function(monthly_tracker_df) { # column synonyms to unify column names # @columns_synonyms: Long format output of read_column_synonyms to match columns harmonize_input_data_columns <- function(product_df, columns_synonyms) { - logDebug("Start harmonize_input_data_columns.") + ParallelLogger::logDebug("Start harmonize_input_data_columns.") # In case that there is additional data in strange columns, keep only relevant columns # keep.cols <- names(product_df) %in% c("") @@ -147,14 +147,14 @@ harmonize_input_data_columns <- function(product_df, columns_synonyms) { ## report all column names which have not been found unknown_column_names <- colnames(product_df)[!colnames(product_df) %in% synonym_headers] if (length(unknown_column_names) > 0) { - logWarn("Unknown column names in sheet: ", paste(unknown_column_names, collapse = ", ")) + ParallelLogger::logWarn("Unknown column names in sheet: ", paste(unknown_column_names, collapse = ", ")) } # replacing var codes colnames_found <- match(colnames(product_df), synonym_headers, nomatch = 0) colnames(product_df)[colnames(product_df) %in% synonym_headers] <- columns_synonyms$name_clean[colnames_found] - logDebug("Finish harmonize_input_data_columns.") + ParallelLogger::logDebug("Finish harmonize_input_data_columns.") if (sum(colnames_found == 0) != 0) { "Non-matching column names found (see 0)" # SK: remove non matching column names @@ -282,7 +282,7 @@ update_receivedfrom <- function(product_df) { grepl("Balance", product_units_received, ignore.case = TRUE) & !is.na(product_received_from) ~ product_received_from )) %>% dplyr::mutate(product_units_released = ifelse(!is.na(product_received_from), NA, product_units_released)) - logInfo("The rule for the case was applied successfully- Released (product_units_released) column also includes values for Start/End Balance") + ParallelLogger::logInfo("The rule for the case was applied successfully- Released (product_units_released) column also includes values for Start/End Balance") } return(product_df) } diff --git a/R/helper_read_patient_data.R b/R/helper_read_patient_data.R index cca5779..8f61b04 100644 --- a/R/helper_read_patient_data.R +++ b/R/helper_read_patient_data.R @@ -1,7 +1,7 @@ # extracting country and clinic code from patient ID # expects that patient 
ID has a certain format extract_country_clinic_code <- function(patient_data) { - logDebug("Start extract_country_clinic_code.") + ParallelLogger::logDebug("Start extract_country_clinic_code.") patient_ids <- patient_data["id"] %>% dplyr::filter(id != "0") %>% tidyr::drop_na() %>% @@ -19,9 +19,9 @@ extract_country_clinic_code <- function(patient_data) { clinic_code <- names(sort(table(patient_ids$clinic), decreasing = T))[1] - logDebug("country_code = ", country_code, ".") - logDebug("clinic_code = ", clinic_code, ".") - logDebug("Finish extract_country_clinic_code.") + ParallelLogger::logDebug("country_code = ", country_code, ".") + ParallelLogger::logDebug("clinic_code = ", clinic_code, ".") + ParallelLogger::logDebug("Finish extract_country_clinic_code.") return(list("country_code" = country_code, "clinic_code" = clinic_code)) } @@ -40,9 +40,9 @@ extract_country_clinic_code <- function(patient_data) { #' @return data.frame with the patient data #' @export extract_patient_data <- function(tracker_data_file, sheet, year) { - logDebug("Start extract_patient_data for sheet = ", sheet, ".") + ParallelLogger::logDebug("Start extract_patient_data for sheet = ", sheet, ".") - logDebug("Start openxlsx::read.xlsx to get tracker_data.") + ParallelLogger::logDebug("Start openxlsx::read.xlsx to get tracker_data.") tracker_data <- openxlsx::read.xlsx( xlsxFile = tracker_data_file, sheet = sheet, @@ -59,7 +59,7 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { # col_names = F, # .name_repair = "unique_quiet" # ) - logDebug("Finish openxlsx::read.xlsx.") + ParallelLogger::logDebug("Finish openxlsx::read.xlsx.") # Assumption: first column is always empty until patient data begins patient_data_range <- which(!is.na(tracker_data[, 1])) @@ -80,8 +80,8 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { row_max <- row_max + 1 } - logInfo("Patient data found in rows ", row_min, " to ", row_max, ".") - logDebug("Start readxl::read_excel to get patient data.") + ParallelLogger::logInfo("Patient data found in rows ", row_min, " to ", row_max, ".") + ParallelLogger::logDebug("Start readxl::read_excel to get patient data.") df_patient <- readxl::read_excel( path = tracker_data_file, sheet = sheet, @@ -91,11 +91,11 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { col_types = c("text"), .name_repair = "unique_quiet" ) - logDebug("Finish readxl::read_excel.") + ParallelLogger::logDebug("Finish readxl::read_excel.") if (header_cols[2] == header_cols_2[2]) { # take into account that date info gets separated from the updated values (not in the same row, usually in the bottom row) - logInfo("Read in multiline header.") + ParallelLogger::logInfo("Read in multiline header.") diff_colnames <- which((header_cols != header_cols_2)) header_cols[diff_colnames] <- @@ -106,7 +106,7 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { } colnames(df_patient) <- header_cols - logDebug("Found patient column names = ", paste(header_cols, collapse = ","), ".") + ParallelLogger::logDebug("Found patient column names = ", paste(header_cols, collapse = ","), ".") # delete columns without a header (=NA) df_patient <- df_patient[, !is.na(colnames(df_patient))] @@ -117,7 +117,7 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { df_patient <- df_patient[rowSums(is.na(df_patient)) != ncol(df_patient), ] - logDebug("Finish extract_patient_data.") + ParallelLogger::logDebug("Finish extract_patient_data.") df_patient } @@ -137,7 +137,7 @@ 
extract_patient_data <- function(tracker_data_file, sheet, year) {
 #' @export
 harmonize_patient_data_columns <-
     function(patient_df, columns_synonyms) {
-        logDebug("Start harmonize_patient_data_columns.")
+        ParallelLogger::logDebug("Start harmonize_patient_data_columns.")

         patient_df <- patient_df[!is.na(names(patient_df))]
@@ -151,11 +151,11 @@ harmonize_patient_data_columns <-
         mismatching_column_ids <- which(colnames_found == 0)

         if (length(mismatching_column_ids) > 0) {
-            logWarn(
+            ParallelLogger::logWarn(
                 "Non-matching column names found: ",
                 paste(colnames(patient_df)[mismatching_column_ids], collapse = ","), "."
             )
         }

-        logDebug("Finish harmonize_patient_data_columns.")
+        ParallelLogger::logDebug("Finish harmonize_patient_data_columns.")
         patient_df
     }
diff --git a/R/helper_scipt_1.R b/R/helper_scipt_1.R
index 222d5f1..f1cf102 100644
--- a/R/helper_scipt_1.R
+++ b/R/helper_scipt_1.R
@@ -1,5 +1,3 @@
-
-
 #' @title Process a single tracker file and extract patient and product data.
 #'
 #' @param tracker_file Filename of the tracker.
 #' @param paths A list with the paths to the tracker root dir, the patient and product output dir and the root output dir.
 #' @param p Progressor from the progressr package.
@@ -13,56 +11,57 @@ process_tracker_file <- function(tracker_file, paths, p) {
     synonyms <- get_synonyms()
     tracker_data_file <-
         file.path(paths$tracker_root, tracker_file)
-    logDebug("Start process_tracker_file.")
-    logInfo(
+
+    ParallelLogger::logDebug("Start process_tracker_file.")
+    ParallelLogger::logInfo(
         "Current file: ",
         tracker_name
     )

     logfile <- paste0(tracker_name, "_", "patient")
     with_file_logger(logfile,
-            {
-                tryCatch(
-                    process_patient_data(
-                        tracker_name = tracker_name,
-                        tracker_data_file = tracker_data_file,
-                        output_root = paths$patient_data_raw,
-                        synonyms_patient = synonyms$patient
-                    ),
-                    error = function(e) {
-                        logError("Could not process patient data. Error = ", e$message, ".")
-                    },
-                    warning = function(w) {
-                        logWarn("Could not process patient data. Warning = ", w$message, ".")
-                    }
-                )
-            },
-            output_root = paths$output_root
+        {
+            tryCatch(
+                process_patient_data(
+                    tracker_name = tracker_name,
+                    tracker_data_file = tracker_data_file,
+                    output_root = paths$patient_data_raw,
+                    synonyms_patient = synonyms$patient
+                ),
+                error = function(e) {
+                    ParallelLogger::logError("Could not process patient data. Error = ", e$message, ".")
+                },
+                warning = function(w) {
+                    ParallelLogger::logWarn("Could not process patient data. 
Warning = ", w$message, ".") + } + ) + }, + output_root = paths$output_root ) - logInfo("Finish process_tracker_file.") + ParallelLogger::logDebug("Finish process_tracker_file.") } @@ -79,7 +78,7 @@ process_patient_data <- tracker_data_file, output_root, synonyms_patient) { - logDebug("Start process_patient_data.") + ParallelLogger::logDebug("Start process_patient_data.") df_raw_patient <- reading_patient_data( @@ -89,7 +88,7 @@ process_patient_data <- df_raw_patient <- df_raw_patient %>% dplyr::mutate(file_name = tracker_name) - logDebug( + ParallelLogger::logDebug( "df_raw_patient dim: ", dim(df_raw_patient) %>% as.data.frame(), "." @@ -102,7 +101,7 @@ process_patient_data <- suffix = "_patient_raw" ) - logInfo("Finish process_patient_data.") + ParallelLogger::logDebug("Finish process_patient_data.") } @@ -119,7 +118,7 @@ process_product_data <- tracker_data_file, output_root, synonyms_product) { - logDebug("Start process_product_data.") + ParallelLogger::logDebug("Start process_product_data.") df_raw_product <- reading_product_data_step1( @@ -130,10 +129,10 @@ process_product_data <- if (!is.null(df_raw_product)) { df_raw_product <- df_raw_product %>% dplyr::mutate(file_name = tracker_name) } else { - logDebug("Empty product data") + ParallelLogger::logDebug("Empty product data") } - logDebug( + ParallelLogger::logDebug( "df_raw_product dim: ", dim(df_raw_product) %>% as.data.frame(), "." @@ -148,7 +147,7 @@ process_product_data <- suffix = "_product_raw" ) } else { - logWarn("No product data in the file") + ParallelLogger::logWarn("No product data in the file") } - logDebug("Finish process_product_data.") + ParallelLogger::logDebug("Finish process_product_data.") } diff --git a/R/helper_script_2.R b/R/helper_script_2.R new file mode 100644 index 0000000..035b5db --- /dev/null +++ b/R/helper_script_2.R @@ -0,0 +1,364 @@ +#' Process and clean patient raw data +#' +#' @description +#' Read in the output of the first script as parquet (all character). +#' Merge extracted patient data with the metadata schema that contains all +#' necessary columns. Process each variable in three steps: +#' 1. Fix any problems with character representation. +#' 2. Convert column to target data type, for example a date or number. +#' 3. Apply any post-processing logic for the target data type. +#' +#' +#' @param patient_file Path to patient raw parquet file from first script. +#' @param paths List with paths to patient_data_cleaned and product_data_cleaned folders. +#' @param p progressor from progressr package. +#' +#' @export +process_patient_file <- function(patient_file, paths, p) { + p() + + ERROR_VAL_NUMERIC <<- 999999 + ERROR_VAL_CHARACTER <<- "Undefined" + ERROR_VAL_DATE <<- "9999-09-09" + + patient_file_name <- tools::file_path_sans_ext(basename(patient_file)) + patient_file_path <- + file.path(paths$tracker_root, patient_file) + output_root <- paths$patient_data_cleaned + + ParallelLogger::logDebug("Start process_patient_file.") + ParallelLogger::logInfo( + "Current file: ", + patient_file_name + ) + + logfile <- paste0(patient_file_name) + + with_file_logger(logfile, + { + tryCatch( + process_patient_file_worker( + patient_file_path = patient_file_path, + patient_file_name = patient_file_name, + output_root = output_root + ), + error = function(e) { + ParallelLogger::logError("Could not process raw patient data. Error = ", e$message, ".") + }, + warning = function(w) { + ParallelLogger::logWarn("Could not process raw patient data. 
Warning = ", w$message, ".") + } + ) + }, + output_root = paths$output_root + ) + + ParallelLogger::logInfo("Finish process_patient_file.") +} + + +process_patient_file_worker <- function(patient_file_path, patient_file_name, output_root) { + allowed_provinces <- get_allowed_provinces() + + df_patient_raw <- arrow::read_parquet(patient_file_path) + + # filter all rows with no patient id or patient name + df_patient_raw <- df_patient_raw %>% + dplyr::filter(!(is.na(id) & is.na(name))) %>% + dplyr::filter(!(id == "0" & name == "0")) + + # --- TRANSFORMATIONS --- + # data before 2019 had only one column for updated hba1c and fbg + # with date as part of the value + if (!"hba1c_updated_date" %in% colnames(df_patient_raw) && "hba1c_updated" %in% colnames(df_patient_raw)) { + ParallelLogger::logInfo("Column updated_hba1c_date not found. Trying to parse from hba1c_updated.") + df_patient_raw <- + extract_date_from_measurement(df_patient_raw, "hba1c_updated") + ParallelLogger::logDebug("Finished parsing dates from hba1c_updated.") + } + + if (!"fbg_updated_date" %in% colnames(df_patient_raw) && "fbg_updated_mg" %in% colnames(df_patient_raw)) { + ParallelLogger::logInfo("Column updated_fbg_date not found. Trying to parse from fbg_updated_mg.") + df_patient_raw <- + extract_date_from_measurement(df_patient_raw, "fbg_updated_mg") + ParallelLogger::logDebug("Finished parsing dates from fbg_updated_mg.") + } + + if (!"fbg_updated_date" %in% colnames(df_patient_raw) && "fbg_updated_mmol" %in% colnames(df_patient_raw)) { + ParallelLogger::logInfo("Column fbg_updated_date not found. Trying to parse from fbg_updated_mmol.") + df_patient_raw <- + extract_date_from_measurement(df_patient_raw, "fbg_updated_mmol") + ParallelLogger::logDebug("Finished parsing dates from fbg_updated_mmol.") + } + + # blood pressure is given as sys/dias value pair, + # so we split this column in two separate columns + if ("blood_pressure_mmhg" %in% colnames(df_patient_raw)) { + df_patient_raw <- split_bp_in_sys_and_dias(df_patient_raw) + } + + # The maximum value available for hba1c will be around 14% - 18%, + # depending on the equipment being used. + # If the reading is above the maximum available value the > sign is used - + # we would prefer to retain this character in the database as it is important for data analysis. 
+ ParallelLogger::logInfo("Adding columns hba1c_baseline_exceeds and hba1c_updated_exceeds.") + df_patient_raw <- df_patient_raw %>% + dplyr::mutate( + hba1c_baseline_exceeds = ifelse(grepl(">|<", hba1c_baseline), TRUE, FALSE), + hba1c_updated_exceeds = ifelse(grepl(">|<", hba1c_updated), TRUE, FALSE) + ) + + # --- META SCHEMA --- + # meta schema has all final columns for the database + # along with their corresponding data types + ParallelLogger::logInfo("Creating meta schema.") + # short type string for read_csv: + # iiinDccDcnnDnncnlnlDncDccDDDccccDccccciDciiiDn + schema <- tibble::tibble( + # clinic_visit = logical(), + # complication_screening = character(), + # complication_screening_date = lubridate::as_date(1), + # complication_screening_results = character(), + # dm_complication_comment = character(), # TODO + # dm_complication_eye = character(), # TODO + # dm_complication_kidney = character(), # TODO + # dm_complication_other = character(), # TODO + # est_strips_pmonth = integer(), + # family_support_scale = character(), # TODO + # inactive_reason = character(), + # insulin_dosage = character(), + # meter_received_date = lubridate::as_date(1), # TODO + # remarks = character(), + # remote_followup = logical(), + # additional_support = character(), + age = integer(), + blood_pressure_dias_mmhg = integer(), + blood_pressure_sys_mmhg = integer(), + bmi = numeric(), + bmi_date = lubridate::as_date(1), + clinic_code = character(), + country_code = character(), + dob = lubridate::as_date(1), + edu_occ = character(), + fbg_baseline_mg = numeric(), + fbg_baseline_mmol = numeric(), + fbg_updated_date = lubridate::as_date(1), + fbg_updated_mg = numeric(), + fbg_updated_mmol = numeric(), + file_name = character(), + hba1c_baseline = numeric(), + hba1c_baseline_exceeds = logical(), + hba1c_updated = numeric(), + hba1c_updated_exceeds = logical(), + hba1c_updated_date = lubridate::as_date(1), + height = numeric(), + hospitalisation_cause = character(), + hospitalisation_date = lubridate::as_date(1), + id = character(), + insulin_regimen = character(), + last_clinic_visit_date = lubridate::as_date(1), + last_remote_followup_date = lubridate::as_date(1), + lost_date = lubridate::as_date(1), + name = character(), + observations = character(), + observations_category = character(), + province = character(), + recruitment_date = lubridate::as_date(1), + sex = character(), + sheet_name = character(), + status = character(), + status_out = character(), + support_from_a4d = character(), + t1d_diagnosis_age = integer(), + t1d_diagnosis_date = lubridate::as_date(1), + t1d_diagnosis_with_dka = character(), + testing_frequency = integer(), + tracker_date = lubridate::as_date(1), + tracker_month = integer(), + tracker_year = integer(), + updated_2022_date = lubridate::as_date(1), + weight = numeric() + ) + + cols_extra <- colnames(df_patient_raw)[!colnames(df_patient_raw) %in% colnames(schema)] + ParallelLogger::logWarn("Extra columns in patient data: ", paste(cols_extra, collapse = ", ")) + + cols_missing <- + colnames(schema)[!colnames(schema) %in% colnames(df_patient_raw)] + ParallelLogger::logWarn("Missing columns in patient data: ", paste(cols_missing, collapse = ", ")) + + # add all columns of schema to df_patient_raw + # keep all rows, only append missing cols + ParallelLogger::logInfo("Merging df_patient with meta schema and selecting all columns of meta schema.") + df_patient <- merge.default(df_patient_raw, schema, all.x = T) + df_patient <- df_patient[colnames(schema)] + + # the cleaning, fixing 
and validating happens in three major steps: + # 1. make sure we fix any known problems in the raw character columns + df_patient <- + df_patient %>% + dplyr::rowwise() %>% + # 1. handle known problems before converting to target type + dplyr::mutate( + t1d_diagnosis_age = fix_t1d_diagnosis_age(t1d_diagnosis_age, id), + hba1c_baseline = stringr::str_replace(hba1c_baseline, "<|>", ""), + hba1c_updated = stringr::str_replace(hba1c_updated, "<|>", ""), + fbg_baseline_mg = fix_fbg(fbg_baseline_mg), + fbg_baseline_mmol = fix_fbg(fbg_baseline_mmol), + fbg_updated_mg = fix_fbg(fbg_updated_mg), + fbg_updated_mmol = fix_fbg(fbg_updated_mmol), + testing_frequency = fix_testing_frequency(testing_frequency) + ) %>% + # 2. convert the refined character columns into the target data type + dplyr::mutate( + dplyr::across( + schema %>% dplyr::select(tidyselect::where(is.numeric)) %>% names(), + \(x) convert_to(correct_decimal_sign(x), as.numeric, ERROR_VAL_NUMERIC, dplyr::cur_column(), id = id) + ), + dplyr::across( + schema %>% dplyr::select(tidyselect::where(is.logical)) %>% names(), + \(x) convert_to(x, as.logical, FALSE, dplyr::cur_column(), id = id) + ), + dplyr::across( + schema %>% dplyr::select(tidyselect::where(lubridate::is.Date)) %>% names(), + \(x) convert_to(fix_digit_date(x), parse_dates, as.Date(ERROR_VAL_DATE), dplyr::cur_column(), id = id) + ), + dplyr::across( + schema %>% dplyr::select(tidyselect::where(is.integer)) %>% names(), + \(x) convert_to(x, function(x) as.integer(round(as.double(x))), ERROR_VAL_NUMERIC, dplyr::cur_column(), id = id) + ) + ) %>% + # 3. fix remaining problems in the target data type + dplyr::mutate( + # height and weight are needed to calculate bmi + height = transform_cm_to_m(height) %>% + cut_numeric_value(min = 0, max = 2.3, col_name = "height"), + weight = cut_numeric_value(weight, min = 0, max = 200, col_name = "weight"), + bmi = fix_bmi(weight, height, id) %>% + cut_numeric_value(min = 4, max = 60, "bmi"), + age = fix_age(age, dob, tracker_year, tracker_month, id) %>% + cut_numeric_value(min = 0, max = 25, "age"), + sex = fix_sex(sex, id), + hba1c_baseline = cut_numeric_value(hba1c_baseline, min = 4, max = 18, "hba1c_baseline"), + # https://www.cleveland19.com/story/1425584/ohio-man-holds-world-record-of-highest-blood-sugar/ + fbg_baseline_mmol = cut_numeric_value(fbg_baseline_mmol, min = 0, max = 136.5, "fbg_baseline_mmol"), + # https://www.cleveland19.com/story/1425584/ohio-man-holds-world-record-of-highest-blood-sugar/ + fbg_updated_mmol = cut_numeric_value(fbg_updated_mmol, min = 0, max = 136.5, "fbg_updated_mmol"), + blood_pressure_sys_mmhg = cut_numeric_value(blood_pressure_sys_mmhg, min = 20, max = 250, "blood_pressure_sys_mmhg"), + blood_pressure_dias_mmhg = cut_numeric_value(blood_pressure_dias_mmhg, min = 20, max = 220, "blood_pressure_dias_mmhg"), + tracker_date = lubridate::ym(paste(tracker_year, tracker_month, sep = "-")), + !!!parse_character_cleaning_config(a4d:::config$cleaning), + # should be fixed last as other fix functions use id to log invalid rows! 
+            id = fix_id(id)
+        ) %>%
+        dplyr::ungroup()
+
+    # add clinic and country code after having fixed all issues with patient id
+    cc_codes <- extract_country_clinic_code(df_patient)
+    df_patient["clinic_code"] <- cc_codes$clinic_code
+    df_patient["country_code"] <- cc_codes$country_code
+
+    # Formula to calculate mmol/l from mg/dl: mmol/l = mg/dl / 18
+    if (all(is.na(df_patient$fbg_baseline_mmol))) {
+        df_patient <- df_patient %>%
+            dplyr::mutate(fbg_baseline_mmol = dplyr::case_when(
+                fbg_baseline_mg != ERROR_VAL_NUMERIC ~ fbg_baseline_mg / 18
+            ))
+    }
+    if (all(is.na(df_patient$fbg_updated_mmol))) {
+        df_patient <- df_patient %>%
+            dplyr::mutate(fbg_updated_mmol = dplyr::case_when(
+                fbg_updated_mg != ERROR_VAL_NUMERIC ~ fbg_updated_mg / 18
+            ))
+    }
+
+    # Formula to calculate mg/dl from mmol/l: mg/dl = 18 × mmol/l
+    if (all(is.na(df_patient$fbg_baseline_mg))) {
+        df_patient <- df_patient %>%
+            dplyr::mutate(fbg_baseline_mg = dplyr::case_when(
+                fbg_baseline_mmol != ERROR_VAL_NUMERIC ~ fbg_baseline_mmol * 18
+            ))
+    }
+    if (all(is.na(df_patient$fbg_updated_mg))) {
+        df_patient <- df_patient %>%
+            dplyr::mutate(fbg_updated_mg = dplyr::case_when(
+                fbg_updated_mmol != ERROR_VAL_NUMERIC ~ fbg_updated_mmol * 18
+            ))
+    }
+
+    # sort by year and month like it is in the tracker files
+    df_patient <- df_patient %>%
+        dplyr::arrange(tracker_date, id)
+
+    ParallelLogger::logDebug(
+        "df_patient dim: ",
+        dim(df_patient) %>% as.data.frame(),
+        "."
+    )
+
+    export_data_as_parquet(
+        data = df_patient,
+        filename = stringr::str_replace(patient_file_name, "_patient_raw", ""),
+        output_root = output_root,
+        suffix = "_patient_cleaned"
+    )
+}
+
+
+#' Process and clean product raw data
+#'
+#' @description
+#' Read the raw product data of one tracker (parquet output of the first script), clean it with reading_product_data_step2, and export it as a cleaned parquet file.
+#'
+#'
+#' @param product_file Path to product raw parquet file from first script.
+#' @param paths List with paths to patient_data_cleaned and product_data_cleaned folders.
+#' @param p progressor from progressr package.
+#'
+#' @export
+process_product_file <- function(product_file, paths, p) {
+    p()
+    synonyms <- get_synonyms()
+    synonyms_product <- synonyms$product
+    product_file_name <- tools::file_path_sans_ext(basename(product_file))
+    product_file_path <-
+        file.path(paths$tracker_root, product_file)
+
+    ParallelLogger::logDebug("Start process_product_file.")
+    ParallelLogger::logInfo(
+        "Current file: ",
+        product_file_name
+    )
+    df_product_raw <- arrow::read_parquet(product_file_path)
+
+    logfile <- paste0(product_file_name)
+    with_file_logger(logfile,
+        {
+            tryCatch(
+                df_product_raw <- reading_product_data_step2(df_product_raw, synonyms_product),
+                error = function(e) {
+                    ParallelLogger::logError("Could not process raw product data. Error = ", e$message, ".")
+                },
+                warning = function(w) {
+                    ParallelLogger::logWarn("Could not process raw product data. Warning = ", w$message, ".")
+                }
+            )
+
+            ParallelLogger::logDebug(
+                "df_product_raw dim: ",
+                dim(df_product_raw) %>% as.data.frame(),
+                "."
+ ) + + export_data_as_parquet( + data = df_product_raw, + filename = stringr::str_replace(product_file_name, "_product_raw", ""), + output_root = paths$product_data_cleaned, + suffix = "_product_cleaned" + ) + }, + output_root = paths$output_root + ) + + ParallelLogger::logInfo("Finish process_product_file.") +} diff --git a/R/link_product_patient.R b/R/link_product_patient.R index 119de01..8816d57 100644 --- a/R/link_product_patient.R +++ b/R/link_product_patient.R @@ -14,7 +14,7 @@ #' link_product_patient("path/to/product_data.parquet", "path/to/patient_data.parquet") #' } link_product_patient <- function(product_file, patient_file) { - logInfo("Trying to link product file ", product_file, " with patient file ", patient_file) + ParallelLogger::logInfo("Trying to link product file ", product_file, " with patient file ", patient_file) patient_data <- arrow::read_parquet(patient_file) product_data <- arrow::read_parquet(product_file) @@ -44,7 +44,7 @@ link_product_patient <- function(product_file, patient_file) { tryCatch( { if (nrow(summary_df) > 0) { - logWarn( + ParallelLogger::logWarn( "The number of mismatched patient IDs between the product and patient data is ", nrow(summary_df), ". ", paste("File Name: ", summary_df$file_name, @@ -56,18 +56,18 @@ link_product_patient <- function(product_file, patient_file) { } }, error = function(e) { - logError("Could not link csv files for product and patient data. Error: ", e$message) + ParallelLogger::logError("Could not link csv files for product and patient data. Error: ", e$message) }, warning = function(w) { - logWarn("Could not link csv files for product and patient data. Warning: ", w$message) + ParallelLogger::logWarn("Could not link csv files for product and patient data. Warning: ", w$message) } ) } else { - logInfo( + ParallelLogger::logInfo( "There are no mismatched patient IDs between the product data - ", product_file, " and patient data - ", patient_file ) } - logInfo("Finished attempting to link product csv file with patient csv file.") + ParallelLogger::logInfo("Finished attempting to link product csv file with patient csv file.") } diff --git a/R/logger.R b/R/logger.R index 57958bd..e2a111b 100644 --- a/R/logger.R +++ b/R/logger.R @@ -9,17 +9,17 @@ setup_logger <- function(output_dir, log_name) { file.remove(logFileName) } - logger <- createLogger( + logger <- ParallelLogger::createLogger( name = "MAIN", threshold = "TRACE", appenders = list( - createFileAppender( - layout = layoutParallel, + ParallelLogger::createFileAppender( + layout = ParallelLogger::layoutParallel, fileName = logFileName ) ) ) - registerLogger(logger) + ParallelLogger::registerLogger(logger) log_dir <- file.path(output_dir, "logs") @@ -33,14 +33,15 @@ setup_logger <- function(output_dir, log_name) { #' @param output_root Output root directory for the current process. 
#' #' @return returns the loggers that where previously set for usage with with_ +#' @export setup_file_logger <- function(logfile, output_root) { - loggers <- getLoggers() - clearLoggers() + loggers <- ParallelLogger::getLoggers() + ParallelLogger::clearLoggers() logFileName <- file.path(output_root, "logs", paste0(logfile, ".log")) if (file.exists(logFileName)) { file.remove(logFileName) } - addDefaultFileLogger(logFileName, name = logfile) + ParallelLogger::addDefaultFileLogger(logFileName, name = logfile) loggers } @@ -58,9 +59,9 @@ setup_file_logger <- function(logfile, output_root) { with_file_logger <- withr::with_( setup_file_logger, function(loggers) { - clearLoggers() + ParallelLogger::clearLoggers() for (logger in loggers) { - registerLogger(logger) + ParallelLogger::registerLogger(logger) } } ) diff --git a/R/read_cleaned_patient_data.R b/R/read_cleaned_patient_data.R index ae56e34..b217da4 100644 --- a/R/read_cleaned_patient_data.R +++ b/R/read_cleaned_patient_data.R @@ -7,7 +7,7 @@ #' @export read_cleaned_patient_data <- function(input_root, patient_data_files) { - logInfo("Start read_cleaned_patient_data") + ParallelLogger::logInfo("Start read_cleaned_patient_data") patient_data <- patient_data_files %>% purrr::map(function(patient_file) { @@ -17,6 +17,6 @@ read_cleaned_patient_data <- - logInfo("Finish read_cleaned_patient_data") + ParallelLogger::logInfo("Finish read_cleaned_patient_data") patient_data } diff --git a/R/read_patient_data.R b/R/read_patient_data.R index b292f1c..61f6839 100644 --- a/R/read_patient_data.R +++ b/R/read_patient_data.R @@ -1,9 +1,9 @@ reading_patient_data <- function(tracker_data_file, columns_synonyms) { - logDebug("Start reading_patient_data.") + ParallelLogger::logDebug("Start reading_patient_data.") sheet_list <- readxl::excel_sheets(tracker_data_file) testit::assert(length(sheet_list) > 0) - logInfo( + ParallelLogger::logInfo( "Found ", length(sheet_list), " sheets inside the current file = ", @@ -13,7 +13,7 @@ reading_patient_data <- month_list <- sheet_list[na.omit(pmatch(month.abb, sheet_list))] - logInfo( + ParallelLogger::logInfo( "Found ", length(month_list), " month sheets inside the current file = ", @@ -24,18 +24,18 @@ reading_patient_data <- # Extract year year <- get_tracker_year(tracker_data_file, month_list) - logInfo("Tracker year = ", year, ".") + ParallelLogger::logInfo("Tracker year = ", year, ".") testit::assert(year %in% c(2017, 2018, 2019, 2020, 2021, 2022)) tidy_tracker_list <- NULL - logDebug("Start processing sheets.") + ParallelLogger::logDebug("Start processing sheets.") for (curr_sheet in month_list) { - logDebug("Start processing sheet ", curr_sheet, ".") + ParallelLogger::logDebug("Start processing sheet ", curr_sheet, ".") df_patient <- extract_patient_data(tracker_data_file, curr_sheet, year) testit::assert(nrow(df_patient) > 0) - logDebug("df_patient dim: ", dim(df_patient) %>% as.data.frame(), ".") + ParallelLogger::logDebug("df_patient dim: ", dim(df_patient) %>% as.data.frame(), ".") df_patient <- harmonize_patient_data_columns(df_patient, columns_synonyms) @@ -66,15 +66,15 @@ reading_patient_data <- ) tidy_tracker_list[[curr_sheet]] <- df_patient - logDebug("Finish processing sheet ", curr_sheet, ".") + ParallelLogger::logDebug("Finish processing sheet ", curr_sheet, ".") } - logDebug("Start combining sheet data into single data frame.") + ParallelLogger::logDebug("Start combining sheet data into single data frame.") df_raw <- dplyr::bind_rows(tidy_tracker_list) - logDebug("Finish combining sheet data into 
single data frame.") + ParallelLogger::logDebug("Finish combining sheet data into single data frame.") if ("Patient List" %in% sheet_list) { - logDebug("Start extracting patient list.") + ParallelLogger::logDebug("Start extracting patient list.") patient_list <- extract_patient_data( tracker_data_file, "Patient List", @@ -98,9 +98,9 @@ reading_patient_data <- by = "id", relationship = "many-to-one" ) - logDebug("Finish extracting patient list.") + ParallelLogger::logDebug("Finish extracting patient list.") } - logInfo("Finish reading_patient_data.") + ParallelLogger::logInfo("Finish reading_patient_data.") return(df_raw) } diff --git a/R/read_product_data.R b/R/read_product_data.R index 10d8886..d1fc051 100644 --- a/R/read_product_data.R +++ b/R/read_product_data.R @@ -4,7 +4,7 @@ # function based on parts from run_a4d_product_data.R and helper functions reading_product_data_step1 <- function(tracker_data_file, columns_synonyms) { - logDebug("Start reading_product_data_step1.") + ParallelLogger::logDebug("Start reading_product_data_step1.") # rename column names to match colnames(columns_synonyms) <- c("name_clean", "name_to_be_matched") @@ -13,13 +13,13 @@ reading_product_data_step1 <- month_list <- sheet_list[na.omit(pmatch(month.abb, sheet_list))] year <- get_tracker_year(tracker_data_file, month_list) - logDebug("Found ", length(sheet_list), " sheets: ", paste(sheet_list, collapse = ", "), ".") - logDebug("Found ", length(month_list), " month sheets: ", paste(month_list, collapse = ", "), ".") + ParallelLogger::logDebug("Found ", length(sheet_list), " sheets: ", paste(sheet_list, collapse = ", "), ".") + ParallelLogger::logDebug("Found ", length(month_list), " month sheets: ", paste(month_list, collapse = ", "), ".") # loop through all months for (curr_sheet in month_list) { - logDebug("Start processing the following sheet: ", curr_sheet) + ParallelLogger::logDebug("Start processing the following sheet: ", curr_sheet) # open tracker data tracker_data <- data.frame( @@ -37,7 +37,7 @@ reading_product_data_step1 <- grepl("Description of Support", tracker_data[, ]))) ) { # go to next month - logInfo("Could not find product data in tracker data. Skipping ", curr_sheet, ".") + ParallelLogger::logInfo("Could not find product data in tracker data. Skipping ", curr_sheet, ".") next } @@ -46,7 +46,7 @@ reading_product_data_step1 <- # If after extraction, dataframe is empty, this iteration is also skipped. if (all(is.na(product_df))) { - logInfo("Product data is empty. Skipping ", curr_sheet, ".") + ParallelLogger::logInfo("Product data is empty. Skipping ", curr_sheet, ".") next } @@ -69,11 +69,11 @@ reading_product_data_step1 <- tryCatch( { if (num_na_rows > 0) { - logInfo(curr_sheet, " the number of rows where the patient's name is missing: ", col_released, " is not NA and ", col_released_to, " (patient's name) is NA = ", num_na_rows) + ParallelLogger::logInfo(curr_sheet, " the number of rows where the patient's name is missing: ", col_released, " is not NA and ", col_released_to, " (patient's name) is NA = ", num_na_rows) } }, error = function(e) { - logError(curr_sheet, " trying with num_na_rows for products. Error: ", e$message) + ParallelLogger::logError(curr_sheet, " trying with num_na_rows for products. 
Error: ", e$message) } ) @@ -83,7 +83,7 @@ reading_product_data_step1 <- tryCatch( { if (nrow(non_processed_dates) > 0) { - logWarn( + ParallelLogger::logWarn( curr_sheet, " the number of rows with non-processed dates in product_entry_date is ", nrow(non_processed_dates), ": ", @@ -92,7 +92,7 @@ reading_product_data_step1 <- } }, error = function(e) { - logError(curr_sheet, " trying with non_processed_dates in product_entry_date. Error: ", e$message) + ParallelLogger::logError(curr_sheet, " trying with non_processed_dates in product_entry_date. Error: ", e$message) } ) @@ -107,7 +107,7 @@ reading_product_data_step1 <- # Check if the entry dates for the balance match the month/year on the sheet check_entry_dates(product_df, curr_sheet) - logDebug("Finish processing sheet: ", curr_sheet) + ParallelLogger::logDebug("Finish processing sheet: ", curr_sheet) # combine all months if (!exists("df_final")) { @@ -121,7 +121,7 @@ reading_product_data_step1 <- } else { return(NULL) } - logDebug("Finish reading_product_data_step1.") + ParallelLogger::logDebug("Finish reading_product_data_step1.") } @@ -151,7 +151,7 @@ count_na_rows <- function(df, units_released_col, released_to_col) { #' #' @return This function does not return a value. It logs a warning message if there are any dates in 'product_entry_date' that don't match the month/year on the sheet. check_entry_dates <- function(df, Sheet) { - logDebug("Start check_entry_dates.") + ParallelLogger::logDebug("Start check_entry_dates.") # Check if the entry dates for the balance match the month/year on the sheet entry_dates_df <- df %>% dplyr::filter(grepl("^[0-9]+$", product_entry_date)) @@ -167,14 +167,14 @@ check_entry_dates <- function(df, Sheet) { not_same <- entry_dates_df[entry_dates_df$ed_month != entry_dates_df$product_table_month | entry_dates_df$ed_year != entry_dates_df$product_table_year, ] if (nrow(not_same) > 0) { - logWarn( + ParallelLogger::logWarn( Sheet, " the number of dates in product_entry_date that don't match the month/year on the sheet is ", nrow(not_same), ": ", paste(not_same$ed_date, collapse = ", ") ) } - logDebug("Finish check_entry_dates.") + ParallelLogger::logDebug("Finish check_entry_dates.") } #' @title Remove Rows with NA Values in Specified Columns. 
@@ -192,7 +192,7 @@ remove_rows_with_na_columns <- na_rows <- apply(df[column_names], 1, function(x) all(is.na(x))) # log message - logInfo(paste(length(na_rows[na_rows == T]), " rows deleted out of ", nrow(df), " rows (reason: rows not containing additional info).", sep = "")) + ParallelLogger::logInfo(paste(length(na_rows[na_rows == T]), " rows deleted out of ", nrow(df), " rows (reason: rows not containing additional info).", sep = "")) # Return the data frame without the NA rows return(df[!na_rows, ]) @@ -215,7 +215,7 @@ check_negative_balance <- function(df, Sheet) { # Check if there are any rows in the new data frame if (nrow(negative_df) > 0) { # Log a warning message with the number of negative values and their corresponding product_balance values - logWarn( + ParallelLogger::logWarn( Sheet, " number of negative values in product_balance on the sheet is ", nrow(negative_df), ": ", @@ -242,7 +242,7 @@ switch_columns_stock <- "product_units_received" = "product_received_from", "product_received_from" = "product_units_received" ) - logDebug("Columns product_units_received and product_received_from were switched") + ParallelLogger::logDebug("Columns product_units_received and product_received_from were switched") return(df) } else { return(df) @@ -290,14 +290,14 @@ report_unknown_products <- function(df, Sheet, stock_list_df) { # Check if there are any unknown products names if (length(unmatched_products) > 0) { # Log a warning message with the number of unknown products names - logWarn( + ParallelLogger::logWarn( Sheet, " the number of unknown product names on the sheet is ", length(unmatched_products), ": ", paste(unmatched_products, collapse = ", ") ) } else { - logInfo(Sheet, " no unknown product names on the sheet") + ParallelLogger::logInfo(Sheet, " no unknown product names on the sheet") } } @@ -317,16 +317,16 @@ report_unknown_products <- function(df, Sheet, stock_list_df) { #' product_list <- load_product_reference_data("your_file.xlsx") #' } load_product_reference_data <- function(stock_summary_xlsx = "reference_data/master_tracker_variables.xlsx") { - logDebug("Trying to load the product list from the Stock Summary, ", stock_summary_xlsx, "...") + ParallelLogger::logDebug("Trying to load the product list from the Stock Summary, ", stock_summary_xlsx, "...") tryCatch( { product_names_df <- readxl::read_excel(stock_summary_xlsx, "Stock_Summary") colnames(product_names_df) <- tolower(colnames(product_names_df)) - logDebug(nrow(product_names_df), " product names were loaded from the Stock Summary.") + ParallelLogger::logDebug(nrow(product_names_df), " product names were loaded from the Stock Summary.") return(product_names_df) }, error = function(e) { - logError("Error in loading stock product list: ", e) + ParallelLogger::logError("Error in loading stock product list: ", e) } ) } @@ -371,7 +371,7 @@ add_product_categories <- function(inventory_data, product_category_mapping) { #' df <- extract_unit_capacity(df, "product") #' } extract_unit_capacity <- function(df, column_name) { - logDebug("Trying to extract Unit Capacity from ", column_name, " column") + ParallelLogger::logDebug("Trying to extract Unit Capacity from ", column_name, " column") # Extract all symbols between parentheses df$product_unit_capacity <- stringr::str_extract(df[[column_name]], "\\(([^)]+)\\)") @@ -393,7 +393,7 @@ extract_unit_capacity <- function(df, column_name) { # Add 1 to NA values df$product_unit_capacity[is.na(df$product_unit_capacity)] <- 1 - logDebug("Finished extracting Unit Capacity from ", 
column_name, " column") + ParallelLogger::logDebug("Finished extracting Unit Capacity from ", column_name, " column") return(df) } @@ -410,7 +410,7 @@ extract_unit_capacity <- function(df, column_name) { #' @return Cleaned product data for one specified tracker. reading_product_data_step2 <- function(df, columns_synonyms) { - logDebug("Start reading_product_data_step2.") + ParallelLogger::logDebug("Start reading_product_data_step2.") # rename column names to match colnames(columns_synonyms) <- c("name_clean", "name_to_be_matched") @@ -425,7 +425,7 @@ reading_product_data_step2 <- # loop through all months for (sheet_month in unique(df$product_sheet_name)) { - logDebug(paste("Start processing the following sheet:", sheet_month)) + ParallelLogger::logDebug(paste("Start processing the following sheet:", sheet_month)) # filter on month sheet product_df <- df %>% @@ -453,7 +453,7 @@ reading_product_data_step2 <- product_df <- remove_rows_with_na_columns(product_df, column_names_check) # jump to next sheet if dataframe empty from here if (nrow(product_df) == 0) { - logDebug(paste(sheet_month, " sheet is empty after filtering and skipped", sep = "")) + ParallelLogger::logDebug(paste(sheet_month, " sheet is empty after filtering and skipped", sep = "")) next } @@ -527,12 +527,12 @@ reading_product_data_step2 <- df_final <- df_final %>% rbind(product_df) - logDebug(paste("Finished processing the following sheet:", sheet_month)) + ParallelLogger::logDebug(paste("Finished processing the following sheet:", sheet_month)) } if (nrow(df_final) > 0) { return(df_final) } else { - logDebug(paste("No product data extracted for the following tracker:", df$file_name[1])) + ParallelLogger::logDebug(paste("No product data extracted for the following tracker:", df$file_name[1])) } } diff --git a/R/sysdata.rda b/R/sysdata.rda index eb303cf804b77f1badf0b2d97168f0a3f6920e66..be6b002cff04d1329d0fdca86a74da91bb6a0978 100644 GIT binary patch delta 19 XcmX>tdRmlAzMF#q445}^9pnN4Eu91g delta 19 XcmX>tdRmlAzMF#q4A?eu9pnN4Evy6y diff --git a/man/process_patient_file.Rd b/man/process_patient_file.Rd new file mode 100644 index 0000000..69c9a62 --- /dev/null +++ b/man/process_patient_file.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper_script_2.R +\name{process_patient_file} +\alias{process_patient_file} +\title{Process and clean patient raw data} +\usage{ +process_patient_file(patient_file, paths, p) +} +\arguments{ +\item{patient_file}{Path to patient raw parquet file from first script.} + +\item{paths}{List with paths to patient_data_cleaned and product_data_cleaned folders.} + +\item{p}{progressor from progressr package.} +} +\description{ +Read in the output of the first script as parquet (all character). +Merge extracted patient data with the metadata schema that contains all +necessary columns. Process each variable in three steps: +\enumerate{ +\item Fix any problems with character representation. +\item Convert column to target data type, for example a date or number. +\item Apply any post-processing logic for the target data type. 
+}
+}
diff --git a/man/process_product_file.Rd b/man/process_product_file.Rd
new file mode 100644
index 0000000..07d01d9
--- /dev/null
+++ b/man/process_product_file.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/helper_script_2.R
+\name{process_product_file}
+\alias{process_product_file}
+\title{Process and clean product raw data}
+\usage{
+process_product_file(product_file, paths, p)
+}
+\arguments{
+\item{product_file}{Path to product raw parquet file from first script.}
+
+\item{paths}{List with paths to patient_data_cleaned and product_data_cleaned folders.}
+
+\item{p}{progressor from progressr package.}
+}
+\description{
+Read the raw product data of one tracker, clean it with reading_product_data_step2, and export it as a cleaned parquet file.
+}
diff --git a/man/process_tracker_file.Rd b/man/process_tracker_file.Rd
index b9728ae..2453d08 100644
--- a/man/process_tracker_file.Rd
+++ b/man/process_tracker_file.Rd
@@ -4,14 +4,14 @@
 \alias{process_tracker_file}
 \title{Process a single tracker file and extract patient and product data.}
 \usage{
-process_tracker_file(paths, tracker_file, tracker_name)
+process_tracker_file(tracker_file, paths, p)
 }
 \arguments{
-\item{paths}{a list with the paths to the tracker root dir, the patient and product output dir and the root output dir.}
+\item{tracker_file}{Filename of the tracker.}
 
-\item{tracker_file}{filename.}
+\item{paths}{a list with the paths to the tracker root dir, the patient and product output dir and the root output dir.}
 
-\item{tracker_name}{filename without extension.}
+\item{p}{progressor from progressr package.}
 }
 \description{
 Process a single tracker file and extract patient and product data.
diff --git a/reference_data/build_package_data.R b/reference_data/build_package_data.R
index 3d34392..7d3dc8c 100644
--- a/reference_data/build_package_data.R
+++ b/reference_data/build_package_data.R
@@ -1,5 +1,5 @@
 cleaning_config <- yaml::read_yaml(here::here("reference_data", "data_cleaning.yaml"))
-allowed_provinces <- yaml::read_yaml(here::here("reference_data", "provinces", "allowed_provinces.yaml")) %>% unlist()
+allowed_provinces <- yaml::read_yaml(here::here("reference_data", "provinces", "allowed_provinces.yaml")) |> unlist()
 
 for (i in length(cleaning_config$province$steps)) {
     if (cleaning_config$province$steps[[i]]$type == "allowed_values") {
diff --git a/scripts/run_script_1_extract_raw_data.R b/scripts/run_script_1_extract_raw_data.R
index d4961ca..7fe3684 100644
--- a/scripts/run_script_1_extract_raw_data.R
+++ b/scripts/run_script_1_extract_raw_data.R
@@ -1,11 +1,11 @@
 options(readxl.show_progress = FALSE)
-future::plan(future::multisession, workers = 4)
+future::plan(future::multisession)
 
-Sys.setenv(A4D_DATA_ROOT = "data")
 paths <- a4d::init_paths(c("patient_data_raw", "product_data_raw"), delete = TRUE)
-setup_logger(paths$output_root, "script1")
+a4d::setup_logger(paths$output_root, "script1")
 tracker_files <- a4d::get_files(paths$tracker_root)
-logInfo(
+
+ParallelLogger::logInfo(
     "Found ",
     length(tracker_files),
     " xlsx files under ",
@@ -13,7 +13,7 @@
     "."
) -logInfo("Start processing tracker files.") +ParallelLogger::logInfo("Start processing tracker files.") progressr::with_progress({ p <- progressr::progressor(steps = length(tracker_files)) @@ -21,11 +21,11 @@ progressr::with_progress({ result <- furrr::future_map( tracker_files, a4d::process_tracker_file, - paths=paths, - p=p + paths = paths, + p = p ) }) -logInfo("Finish processing all tracker files.") +ParallelLogger::logInfo("Finish processing all tracker files.") -clearLoggers() +ParallelLogger::clearLoggers() diff --git a/scripts/run_script_2_clean_data.R b/scripts/run_script_2_clean_data.R index e20727a..21986c9 100644 --- a/scripts/run_script_2_clean_data.R +++ b/scripts/run_script_2_clean_data.R @@ -1,371 +1,53 @@ options(readxl.show_progress = FALSE) - -ERROR_VAL_NUMERIC <<- 999999 -ERROR_VAL_CHARACTER <<- "Undefined" -ERROR_VAL_DATE <<- "9999-09-09" - -main <- function() { - paths <- init_paths(c("patient_data_cleaned", "product_data_cleaned"), delete = TRUE) - setup_logger(paths$output_root, "script2") - patient_data_files <- get_files(paths$tracker_root, pattern = "patient_raw.parquet$") - product_data_files <- get_files(paths$tracker_root, pattern = "product_raw.parquet$") - logInfo( - "Found ", - length(patient_data_files), - " patient csv files under ", - paths$tracker_root, - "." - ) - logInfo( - "Found ", - length(product_data_files), - " product csv files under ", - paths$tracker_root, - "." - ) - - logInfo("Start processing patient csv files.") - - for (i in seq_along(patient_data_files)) { - patient_file <- patient_data_files[i] - patient_file_name <- tools::file_path_sans_ext(basename(patient_file)) - logfile <- paste0(patient_file_name) - with_file_logger(logfile, - { - tryCatch( - process_patient_file(paths, patient_file, patient_file_name, paths$patient_data_cleaned), - error = function(e) { - logError("Could not process raw patient data. Error = ", e$message, ".") - }, - warning = function(w) { - logWarn("Could not process raw patient data. Warning = ", w$message, ".") - } - ) - }, - output_root = paths$output_root - ) - cat(paste("Processed ", i, " of ", length(patient_data_files), " (", round(i / length(patient_data_files) * 100, 0), "%) patient files.\n")) - } - - logInfo("Finish processing all patient csv files.") - - logDebug("Start processing product csv files.") - synonyms <- get_synonyms() - synonyms_product <- synonyms$product - - for (i in seq_along(product_data_files)) { - product_file <- product_data_files[i] - product_file_name <- tools::file_path_sans_ext(basename(product_file)) - logfile <- paste0(product_file_name) - - with_file_logger(logfile, - { - tryCatch( - process_product_file(paths, product_file, product_file_name, synonyms_product, paths$product_data_cleaned), - error = function(e) { - logError("Could not process raw product data. Error = ", e$message, ".") - }, - warning = function(w) { - logWarn("Could not process raw product data. 
Warning = ", w$message, ".") - } - ) - }, - output_root = paths$output_root - ) - cat(paste("Processed ", i, " of ", length(product_data_files), " (", round(i / length(product_data_files) * 100, 0), "%) product files.\n")) - } - - logInfo("Finish processing all csv files.") -} - - -process_patient_file <- function(paths, patient_file, patient_file_name, output_root) { - patient_file_path <- - file.path(paths$tracker_root, patient_file) - logDebug("Start process_patient_file.") - logInfo( - "Current file: ", - patient_file_name - ) - - allowed_provinces <- get_allowed_provinces() - - df_patient_raw <- arrow::read_parquet(patient_file_path) - - # filter all rows with no patient id or patient name - df_patient_raw <- df_patient_raw %>% - dplyr::filter(!(is.na(id) & is.na(name))) %>% - dplyr::filter(!(id == "0" & name == "0")) - - # --- TRANSFORMATIONS --- - # data before 2019 had only one column for updated hba1c and fbg - # with date as part of the value - if (!"hba1c_updated_date" %in% colnames(df_patient_raw) && "hba1c_updated" %in% colnames(df_patient_raw)) { - logInfo("Column updated_hba1c_date not found. Trying to parse from hba1c_updated.") - df_patient_raw <- - extract_date_from_measurement(df_patient_raw, "hba1c_updated") - logDebug("Finished parsing dates from hba1c_updated.") - } - - if (!"fbg_updated_date" %in% colnames(df_patient_raw) && "fbg_updated_mg" %in% colnames(df_patient_raw)) { - logInfo("Column updated_fbg_date not found. Trying to parse from fbg_updated_mg.") - df_patient_raw <- - extract_date_from_measurement(df_patient_raw, "fbg_updated_mg") - logDebug("Finished parsing dates from fbg_updated_mg.") - } - - if (!"fbg_updated_date" %in% colnames(df_patient_raw) && "fbg_updated_mmol" %in% colnames(df_patient_raw)) { - logInfo("Column fbg_updated_date not found. Trying to parse from fbg_updated_mmol.") - df_patient_raw <- - extract_date_from_measurement(df_patient_raw, "fbg_updated_mmol") - logDebug("Finished parsing dates from fbg_updated_mmol.") - } - - # blood pressure is given as sys/dias value pair, - # so we split this column in two separate columns - if ("blood_pressure_mmhg" %in% colnames(df_patient_raw)) { - df_patient_raw <- split_bp_in_sys_and_dias(df_patient_raw) - } - - # The maximum value available for hba1c will be around 14% - 18%, - # depending on the equipment being used. - # If the reading is above the maximum available value the > sign is used - - # we would prefer to retain this character in the database as it is important for data analysis. 
- logInfo("Adding columns hba1c_baseline_exceeds and hba1c_updated_exceeds.") - df_patient_raw <- df_patient_raw %>% - dplyr::mutate( - hba1c_baseline_exceeds = ifelse(grepl(">|<", hba1c_baseline), TRUE, FALSE), - hba1c_updated_exceeds = ifelse(grepl(">|<", hba1c_updated), TRUE, FALSE) - ) - - # --- META SCHEMA --- - # meta schema has all final columns for the database - # along with their corresponding data types - logInfo("Creating meta schema.") - # short type string for read_csv: - # iiinDccDcnnDnncnlnlDncDccDDDccccDccccciDciiiDn - schema <- tibble::tibble( - # clinic_visit = logical(), - # complication_screening = character(), - # complication_screening_date = lubridate::as_date(1), - # complication_screening_results = character(), - # dm_complication_comment = character(), # TODO - # dm_complication_eye = character(), # TODO - # dm_complication_kidney = character(), # TODO - # dm_complication_other = character(), # TODO - # est_strips_pmonth = integer(), - # family_support_scale = character(), # TODO - # inactive_reason = character(), - # insulin_dosage = character(), - # meter_received_date = lubridate::as_date(1), # TODO - # remarks = character(), - # remote_followup = logical(), - # additional_support = character(), - age = integer(), - blood_pressure_dias_mmhg = integer(), - blood_pressure_sys_mmhg = integer(), - bmi = numeric(), - bmi_date = lubridate::as_date(1), - clinic_code = character(), - country_code = character(), - dob = lubridate::as_date(1), - edu_occ = character(), - fbg_baseline_mg = numeric(), - fbg_baseline_mmol = numeric(), - fbg_updated_date = lubridate::as_date(1), - fbg_updated_mg = numeric(), - fbg_updated_mmol = numeric(), - file_name = character(), - hba1c_baseline = numeric(), - hba1c_baseline_exceeds = logical(), - hba1c_updated = numeric(), - hba1c_updated_exceeds = logical(), - hba1c_updated_date = lubridate::as_date(1), - height = numeric(), - hospitalisation_cause = character(), - hospitalisation_date = lubridate::as_date(1), - id = character(), - insulin_regimen = character(), - last_clinic_visit_date = lubridate::as_date(1), - last_remote_followup_date = lubridate::as_date(1), - lost_date = lubridate::as_date(1), - name = character(), - observations = character(), - observations_category = character(), - province = character(), - recruitment_date = lubridate::as_date(1), - sex = character(), - sheet_name = character(), - status = character(), - status_out = character(), - support_from_a4d = character(), - t1d_diagnosis_age = integer(), - t1d_diagnosis_date = lubridate::as_date(1), - t1d_diagnosis_with_dka = character(), - testing_frequency = integer(), - tracker_date = lubridate::as_date(1), - tracker_month = integer(), - tracker_year = integer(), - updated_2022_date = lubridate::as_date(1), - weight = numeric() - ) - - cols_extra <- colnames(df_patient_raw)[!colnames(df_patient_raw) %in% colnames(schema)] - logWarn("Extra columns in patient data: ", paste(cols_extra, collapse = ", ")) - - cols_missing <- - colnames(schema)[!colnames(schema) %in% colnames(df_patient_raw)] - logWarn("Missing columns in patient data: ", paste(cols_missing, collapse = ", ")) - - # add all columns of schema to df_patient_raw - # keep all rows, only append missing cols - logInfo("Merging df_patient with meta schema and selecting all columns of meta schema.") - df_patient <- merge.default(df_patient_raw, schema, all.x = T) - df_patient <- df_patient[colnames(schema)] - - # the cleaning, fixing and validating happens in three major steps: - # 1. 
make sure we fix any known problems in the raw character columns - df_patient <- - df_patient %>% - dplyr::rowwise() %>% - # 1. handle known problems before converting to target type - dplyr::mutate( - t1d_diagnosis_age = fix_t1d_diagnosis_age(t1d_diagnosis_age, id), - hba1c_baseline = stringr::str_replace(hba1c_baseline, "<|>", ""), - hba1c_updated = stringr::str_replace(hba1c_updated, "<|>", ""), - fbg_baseline_mg = fix_fbg(fbg_baseline_mg), - fbg_baseline_mmol = fix_fbg(fbg_baseline_mmol), - fbg_updated_mg = fix_fbg(fbg_updated_mg), - fbg_updated_mmol = fix_fbg(fbg_updated_mmol), - testing_frequency = fix_testing_frequency(testing_frequency) - ) %>% - # 2. convert the refined character columns into the target data type - dplyr::mutate( - dplyr::across( - schema %>% dplyr::select(tidyselect::where(is.numeric)) %>% names(), - \(x) convert_to(correct_decimal_sign(x), as.numeric, ERROR_VAL_NUMERIC, dplyr::cur_column(), id = id) - ), - dplyr::across( - schema %>% dplyr::select(tidyselect::where(is.logical)) %>% names(), - \(x) convert_to(x, as.logical, FALSE, dplyr::cur_column(), id = id) - ), - dplyr::across( - schema %>% dplyr::select(tidyselect::where(lubridate::is.Date)) %>% names(), - \(x) convert_to(fix_digit_date(x), parse_dates, as.Date(ERROR_VAL_DATE), dplyr::cur_column(), id = id) - ), - dplyr::across( - schema %>% dplyr::select(tidyselect::where(is.integer)) %>% names(), - \(x) convert_to(x, function(x) as.integer(round(as.double(x))), ERROR_VAL_NUMERIC, dplyr::cur_column(), id = id) - ) - ) %>% - # 3. fix remaining problems in the target data type - dplyr::mutate( - # height and weight are needed to calculate bmi - height = transform_cm_to_m(height) %>% - cut_numeric_value(min = 0, max = 2.3, col_name = "height"), - weight = cut_numeric_value(weight, min = 0, max = 200, col_name = "weight"), - bmi = fix_bmi(weight, height, id) %>% - cut_numeric_value(min = 4, max = 60, "bmi"), - age = fix_age(age, dob, tracker_year, tracker_month, id) %>% - cut_numeric_value(min = 0, max = 25, "age"), - sex = fix_sex(sex, id), - hba1c_baseline = cut_numeric_value(hba1c_baseline, min = 4, max = 18, "hba1c_baseline"), - # https://www.cleveland19.com/story/1425584/ohio-man-holds-world-record-of-highest-blood-sugar/ - fbg_baseline_mmol = cut_numeric_value(fbg_baseline_mmol, min = 0, max = 136.5, "fbg_baseline_mmol"), - # https://www.cleveland19.com/story/1425584/ohio-man-holds-world-record-of-highest-blood-sugar/ - fbg_updated_mmol = cut_numeric_value(fbg_updated_mmol, min = 0, max = 136.5, "fbg_updated_mmol"), - blood_pressure_sys_mmhg = cut_numeric_value(blood_pressure_sys_mmhg, min = 20, max = 250, "blood_pressure_sys_mmhg"), - blood_pressure_dias_mmhg = cut_numeric_value(blood_pressure_dias_mmhg, min = 20, max = 220, "blood_pressure_dias_mmhg"), - tracker_date = lubridate::ym(paste(tracker_year, tracker_month, sep="-")), - !!!parse_character_cleaning_config(a4d:::config$cleaning), - # should be fixed last as other fix functions use id to log invalid rows! 
- id = fix_id(id) - ) %>% - dplyr::ungroup() - - # add clinic and country code after having fixed all issues with patient id - cc_codes <- extract_country_clinic_code(df_patient) - df_patient["clinic_code"] <- cc_codes$clinic_code - df_patient["country_code"] <- cc_codes$country_code - - # Formula to calculate mmol/l from mg/dl: mmol/l = mg/dl / 18 - if (all(is.na(df_patient$fbg_baseline_mmol))) { - df_patient <- df_patient %>% - dplyr::mutate(fbg_baseline_mmol = dplyr::case_when( - fbg_baseline_mg != ERROR_VAL_NUMERIC ~ fbg_baseline_mg / 18 - )) - } - if (all(is.na(df_patient$fbg_updated_mmol))) { - df_patient <- df_patient %>% - dplyr::mutate(fbg_updated_mmol = dplyr::case_when( - fbg_updated_mg != ERROR_VAL_NUMERIC ~ fbg_updated_mg / 18 - )) - } - - # Formula to calculate mg/dl from mmol/l: mg/dl = 18 × mmol/l - if (all(is.na(df_patient$fbg_baseline_mg))) { - df_patient <- df_patient %>% - dplyr::mutate(fbg_baseline_mg = dplyr::case_when( - fbg_baseline_mmol != ERROR_VAL_NUMERIC ~ fbg_baseline_mmol * 18 - )) - } - if (all(is.na(df_patient$fbg_updated_mg))) { - df_patient <- df_patient %>% - dplyr::mutate(fbg_updated_mg = dplyr::case_when( - fbg_updated_mmol != ERROR_VAL_NUMERIC ~ fbg_updated_mmol * 18 - )) - } - - # sort by year and month like it is in the tracker files - df_patient <- df_patient %>% - dplyr::arrange(tracker_date, id) - - logDebug( - "df_patient dim: ", - dim(df_patient) %>% as.data.frame(), - "." - ) - - export_data_as_parquet( - data = df_patient, - filename = stringr::str_replace(patient_file_name, "_patient_raw", ""), - output_root = output_root, - suffix = "_patient_cleaned" - ) - - logInfo("Finish process_patient_file.") -} - - -process_product_file <- function(paths, product_file, product_file_name, synonyms_product, output_root) { - product_file_path <- - file.path(paths$tracker_root, product_file) - logDebug("Start process_product_file.") - logInfo( - "Current file: ", - product_file_name - ) - - df_product_raw <- arrow::read_parquet(product_file_path) - - df_product_raw <- reading_product_data_step2(df_product_raw, synonyms_product) - - logDebug( - "df_product_raw dim: ", - dim(df_product_raw) %>% as.data.frame(), - "." - ) - - export_data_as_parquet( - data = df_product_raw, - filename = stringr::str_replace(product_file_name, "_product_raw", ""), - output_root = output_root, - suffix = "_product_cleaned" - ) - - logInfo("Finish process_product_file.") -} - -main() - -clearLoggers() +future::plan(future::multisession) + +paths <- a4d::init_paths(c("patient_data_cleaned", "product_data_cleaned"), delete = TRUE) +a4d::setup_logger(paths$output_root, "script2") +patient_data_files <- a4d::get_files(paths$tracker_root, pattern = "patient_raw.parquet$") +product_data_files <- a4d::get_files(paths$tracker_root, pattern = "product_raw.parquet$") +ParallelLogger::logInfo( + "Found ", + length(patient_data_files), + " patient csv files under ", + paths$tracker_root, + "." +) +ParallelLogger::logInfo( + "Found ", + length(product_data_files), + " product csv files under ", + paths$tracker_root, + "." 
+) + +ParallelLogger::logInfo("Start processing patient csv files.") + +progressr::with_progress({ + p <- progressr::progressor(steps = length(patient_data_files)) + + result <- furrr::future_map( + patient_data_files, + a4d::process_patient_file, + paths = paths, + p = p + ) +}) + +ParallelLogger::logInfo("Finish processing all patient csv files.") + +ParallelLogger::logDebug("Start processing product csv files.") + +progressr::with_progress({ + p <- progressr::progressor(steps = length(product_data_files)) + + result <- furrr::future_map( + product_data_files, + a4d::process_product_file, + paths = paths, + p = p + ) +}) + +ParallelLogger::logInfo("Finish processing all csv files.") + +ParallelLogger::clearLoggers() diff --git a/scripts/run_script_3_create_tables.R b/scripts/run_script_3_create_tables.R index 7962867..4cd6216 100644 --- a/scripts/run_script_3_create_tables.R +++ b/scripts/run_script_3_create_tables.R @@ -9,14 +9,14 @@ main <- function() { setup_logger(paths$output_root, "script3") patient_data_files <- get_files(file.path(paths$output_root, "patient_data_cleaned"), pattern = "\\.parquet$") product_data_files <- get_files(file.path(paths$output_root, "product_data_cleaned"), pattern = "\\.parquet$") - logInfo( + ParallelLogger::logInfo( "Found ", length(patient_data_files), " patient csv files under ", paths$tracker_root, "." ) - logInfo( + ParallelLogger::logInfo( "Found ", length(product_data_files), " product csv files under ", @@ -24,7 +24,7 @@ main <- function() { "." ) - logInfo("Start creating table csv files.") + ParallelLogger::logInfo("Start creating table csv files.") logfile <- "table_patient_data_static" with_file_logger(logfile, @@ -34,10 +34,10 @@ main <- function() { create_table_patient_data_static(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) }, error = function(e) { - logError("Could not create table csv for static patient data. Error: ", e$message) + ParallelLogger::logError("Could not create table csv for static patient data. Error: ", e$message) }, warning = function(w) { - logWarn("Could not create table csv for static patient data. Error: ", w$message) + ParallelLogger::logWarn("Could not create table csv for static patient data. Error: ", w$message) } ) }, @@ -52,10 +52,10 @@ main <- function() { create_table_patient_data_monthly(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) }, error = function(e) { - logError("Could not create table csv for monthly patient data. Error: ", e$message) + ParallelLogger::logError("Could not create table csv for monthly patient data. Error: ", e$message) }, warning = function(w) { - logWarn("Could not create table csv for monthly patient data. Error: ", w$message) + ParallelLogger::logWarn("Could not create table csv for monthly patient data. Error: ", w$message) } ) }, @@ -76,10 +76,10 @@ main <- function() { ) }, error = function(e) { - logError("Could not create table csv for longitudinal patient data. Error: ", e$message) + ParallelLogger::logError("Could not create table csv for longitudinal patient data. Error: ", e$message) }, warning = function(w) { - logWarn("Could not create table csv for longitudinal patient data. Error: ", w$message) + ParallelLogger::logWarn("Could not create table csv for longitudinal patient data. 
Error: ", w$message) } ) }, @@ -94,19 +94,19 @@ main <- function() { create_table_product_data(file.path(paths$output_root, "product_data_cleaned"), paths$tables) }, error = function(e) { - logError("Could not create table for product data. Error: ", e$message) + ParallelLogger::logError("Could not create table for product data. Error: ", e$message) }, warning = function(w) { - logWarn("Could not create table for product data. Warning: ", w$message) + ParallelLogger::logWarn("Could not create table for product data. Warning: ", w$message) } ) }, output_root = paths$output_root ) - logInfo("Finish creating table files.") + ParallelLogger::logInfo("Finish creating table files.") - logInfo("Trying to link files for product and patient data.") + ParallelLogger::logInfo("Trying to link files for product and patient data.") logfile <- "link_product_patient_data" @@ -120,17 +120,17 @@ main <- function() { ) }, error = function(e) { - logError("Could not link files for product and patient data. Error: ", e$message) + ParallelLogger::logError("Could not link files for product and patient data. Error: ", e$message) }, warning = function(w) { - logWarn("Could not link files for product and patient data. Warning: ", w$message) + ParallelLogger::logWarn("Could not link files for product and patient data. Warning: ", w$message) } ) }, output_root = paths$output_root ) - logInfo("Finished linking files for product and patient data.") + ParallelLogger::logInfo("Finished linking files for product and patient data.") } main() From 7e3b3f0fed472f761cfd00ecc4d48797b7aead99 Mon Sep 17 00:00:00 2001 From: Michael Aydinbas Date: Thu, 28 Dec 2023 21:27:53 +0100 Subject: [PATCH 3/8] working future code for script 1 and 2 --- DESCRIPTION | 4 +- NAMESPACE | 3 + R/create_table_patient_data.R | 4 +- R/create_table_patient_data_changes_only.R | 4 +- R/create_table_patient_data_static.R | 4 +- R/create_table_product_data.R | 42 +- R/helper_main.R | 18 +- R/helper_patient_data_fix.R | 38 +- R/helper_product_data.R | 12 +- R/helper_read_patient_data.R | 32 +- R/helper_scipt_1.R | 93 +++-- R/helper_script_2.R | 364 ++++++++++++++++++ R/link_product_patient.R | 12 +- R/logger.R | 19 +- R/read_cleaned_patient_data.R | 4 +- R/read_patient_data.R | 26 +- R/read_product_data.R | 60 +-- R/sysdata.rda | Bin 2763 -> 2763 bytes man/process_patient_file.Rd | 25 ++ man/process_product_file.Rd | 18 + man/process_tracker_file.Rd | 8 +- reference_data/build_package_data.R | 2 +- scripts/run_script_1_extract_raw_data.R | 18 +- scripts/run_script_2_clean_data.R | 422 +++------------------ scripts/run_script_3_create_tables.R | 32 +- 25 files changed, 678 insertions(+), 586 deletions(-) create mode 100644 R/helper_script_2.R create mode 100644 man/process_patient_file.Rd create mode 100644 man/process_product_file.Rd diff --git a/DESCRIPTION b/DESCRIPTION index a1d7052..c273a8f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,8 +8,7 @@ License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 -Depends: - ParallelLogger +Depends: Imports: data.table, digest, @@ -17,6 +16,7 @@ Imports: here, lubridate, openxlsx, + ParallelLogger, readr, readxl, stringr, diff --git a/NAMESPACE b/NAMESPACE index f8e779f..ca93149 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,7 +21,9 @@ export(harmonize_patient_data_columns) export(id_2_county_hospisal) export(init_paths) export(process_patient_data) +export(process_patient_file) export(process_product_data) +export(process_product_file) export(process_tracker_file) 
export(read_cleaned_patient_data) export(read_column_synonyms) @@ -29,6 +31,7 @@ export(report_empty_intersections) export(sanitize_str) export(select_A4D_directory) export(set_a4d_data_root) +export(setup_file_logger) export(setup_logger) export(with_file_logger) importFrom(data.table,"%like%") diff --git a/R/create_table_patient_data.R b/R/create_table_patient_data.R index 3f5c864..c6793ed 100644 --- a/R/create_table_patient_data.R +++ b/R/create_table_patient_data.R @@ -9,7 +9,7 @@ #' @param input_root root directory of the input CSV files. #' @param output_root root directory of the output folder. create_table_patient_data_monthly <- function(patient_data_files, input_root, output_root) { - logInfo("Start creating single csv for table patient_data_monthly.") + ParallelLogger::logInfo("Start creating single csv for table patient_data_monthly.") # THERE MIGHT BE MONTHLY COLUMNS MISSING - PLEASE ADD THEM dynamic_patient_columns <- @@ -58,5 +58,5 @@ create_table_patient_data_monthly <- function(patient_data_files, input_root, ou suffix = "" ) - logInfo("Finish creating single csv for table patient_data_monthly.") + ParallelLogger::logInfo("Finish creating single csv for table patient_data_monthly.") } diff --git a/R/create_table_patient_data_changes_only.R b/R/create_table_patient_data_changes_only.R index 6a06fe8..1a4178c 100644 --- a/R/create_table_patient_data_changes_only.R +++ b/R/create_table_patient_data_changes_only.R @@ -16,7 +16,7 @@ create_table_longitudinal_data <- output_root, variable, name) { - logInfo("Start creating single csv for table create_table_longitudinal_data and variable ", variable) + ParallelLogger::logInfo("Start creating single csv for table create_table_longitudinal_data and variable ", variable) dynamic_patient_columns <- c( @@ -79,5 +79,5 @@ create_table_longitudinal_data <- suffix = "" ) - logInfo("Finish creating single csv for table create_table_longitudinal_data and variable ", variable) + ParallelLogger::logInfo("Finish creating single csv for table create_table_longitudinal_data and variable ", variable) } diff --git a/R/create_table_patient_data_static.R b/R/create_table_patient_data_static.R index aaf180d..55c8c1d 100644 --- a/R/create_table_patient_data_static.R +++ b/R/create_table_patient_data_static.R @@ -9,7 +9,7 @@ #' @param input_root root directory of the input CSV files. #' @param output_root root directory of the output folder. 
 create_table_patient_data_static <- function(patient_data_files, input_root, output_root) {
-    logInfo("Start creating single csv for table patient_data_static.")
+    ParallelLogger::logInfo("Start creating single csv for table patient_data_static.")
 
     # THERE MIGHT BE STATIC COLUMNS MISSING - PLEASE ADD THEM
     static_patient_columns <-
@@ -59,5 +59,5 @@ create_table_patient_data_static <- function(patient_data_files, input_root, out
         suffix = "_static"
     )
 
-    logInfo("Finish creating single csv for table patient_data_static.")
+    ParallelLogger::logInfo("Finish creating single csv for table patient_data_static.")
 }
diff --git a/R/create_table_product_data.R b/R/create_table_product_data.R
index aaa6448..4ee7eaa 100644
--- a/R/create_table_product_data.R
+++ b/R/create_table_product_data.R
@@ -17,7 +17,7 @@
 #' create_table_product_data("path/to/input/directory", "path/to/output/directory")
 #' }
 create_table_product_data <- function(input_root, output_root) {
-    logInfo("Start creating single file for table product_data.")
+    ParallelLogger::logInfo("Start creating single file for table product_data.")
 
     # Get a list of all CSV files in the input_root directory
     files <- list.files(input_root, pattern = "*.parquet", full.names = TRUE)
@@ -25,7 +25,7 @@ create_table_product_data <- function(input_root, output_root) {
     # Read all CSV files and store them in a list
     data_list <- lapply(files, function(x) arrow::read_parquet(x))
 
-    logInfo(length(data_list), " files will be processed for creating the single file for table product_data.")
+    ParallelLogger::logInfo(length(data_list), " files will be processed for creating the single file for table product_data.")
 
     # Get the union of all column names
     all_names <- unique(unlist(lapply(data_list, colnames)))
@@ -39,25 +39,25 @@ create_table_product_data <- function(input_root, output_root) {
     # Merge all data frames
     merged_data <- do.call(rbind, data_list)
 
-    logDebug("Copying original parient IDs...")
+    ParallelLogger::logDebug("Copying original patient IDs...")
     merged_data$orig_product_released_to <- merged_data$product_released_to
-    logDebug("Trying to fix patient IDs...")
+    ParallelLogger::logDebug("Trying to fix patient IDs...")
     merged_data$product_released_to <- sapply(merged_data$product_released_to, fix_id)
-    logDebug("Extracting product_county and product_hospisal from patients IDs...")
+    ParallelLogger::logDebug("Extracting product_country and product_hospital from patient IDs...")
     merged_data <- id_2_county_hospisal(
         merged_data,
         "product_released_to",
         "product_country", "product_hospital"
     )
-    logDebug("Calculating the most frequent 'product_hospital' for each 'file_name'...")
+    ParallelLogger::logDebug("Calculating the most frequent 'product_hospital' for each 'file_name'...")
     tryCatch(
         {
             merged_data <- calculate_most_frequent(merged_data, "file_name", "product_hospital", "table_hospital")
         },
         error = function(e) {
-            logError("Error in calculating the most frequent 'product_hospital': ", e)
+            ParallelLogger::logError("Error in calculating the most frequent 'product_hospital': ", e)
         }
     )
 
@@ -67,14 +67,14 @@
             merged_data <- calculate_most_frequent(merged_data, "file_name", "product_country", "table_country")
         },
         error = function(e) {
-            logError("Error in calculating the most frequent 'product_country': ", e)
+            ParallelLogger::logError("Error in calculating the most frequent 'product_country': ", e)
         }
     )
 
     # Reorder, add, and ensures the correct data type for each column according to the list of fields
merged_data <- preparing_product_fields(merged_data) - logDebug("Checking 'table_country' for each 'file_name'...") + ParallelLogger::logDebug("Checking 'table_country' for each 'file_name'...") report_empty_intersections(merged_data, "file_name", "table_country") # Write the merged and processed data to a file in the output_root directory @@ -85,7 +85,7 @@ create_table_product_data <- function(input_root, output_root) { suffix = "" ) - logInfo("Finish creating single file for table product_data.") + ParallelLogger::logInfo("Finish creating single file for table product_data.") } @@ -168,7 +168,7 @@ preparing_product_fields <- function(merged_data) { "table_hospital" = "character" ) - logInfo("Start processing fields for the single csv product_data...") + ParallelLogger::logInfo("Start processing fields for the single csv product_data...") # Check if all fields are present in merged_data missing_fields <- setdiff(names(fields), names(merged_data)) @@ -186,7 +186,7 @@ preparing_product_fields <- function(merged_data) { merged_data[[field]] <- suppressWarnings(as.Date(original_values)) incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values)) if (length(incorrect_rows) > 0) { - logWarn(paste( + ParallelLogger::logWarn(paste( "In", field, "incorrect date values were replaced with", ERROR_VAL_DATE, "in", length(incorrect_rows), "rows:", paste(incorrect_rows, collapse = ", ") @@ -198,7 +198,7 @@ preparing_product_fields <- function(merged_data) { merged_data[[field]] <- suppressWarnings(as.numeric(original_values)) incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values)) if (length(incorrect_rows) > 0) { - logWarn(paste( + ParallelLogger::logWarn(paste( "In", field, "incorrect numeric values were replaced with", ERROR_VAL_NUMERIC, "in", length(incorrect_rows), "rows:", paste(incorrect_rows, collapse = ", ") @@ -210,7 +210,7 @@ preparing_product_fields <- function(merged_data) { merged_data[[field]] <- suppressWarnings(as.integer(original_values)) incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values)) if (length(incorrect_rows) > 0) { - logWarn(paste( + ParallelLogger::logWarn(paste( "In", field, "incorrect integer values were replaced with", ERROR_VAL_NUMERIC, "in", length(incorrect_rows), "rows:", paste(incorrect_rows, collapse = ", ") @@ -222,7 +222,7 @@ preparing_product_fields <- function(merged_data) { merged_data[[field]] <- as.character(original_values) incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values)) if (length(incorrect_rows) > 0) { - logWarn(paste( + ParallelLogger::logWarn(paste( "In", field, "incorrect character values were replaced with", ERROR_VAL_CHARACTER, "in", length(incorrect_rows), "rows:", paste(incorrect_rows, collapse = ", ") @@ -231,19 +231,19 @@ preparing_product_fields <- function(merged_data) { } } }, warning = function(w) { - logError(paste("Warning in converting", field, ": ", w)) + ParallelLogger::logError(paste("Warning in converting", field, ": ", w)) }, error = function(e) { - logWarn(paste("Error in converting", field, ": ", e)) + ParallelLogger::logWarn(paste("Error in converting", field, ": ", e)) }, finally = { - logDebug(paste("Finished converting", field)) + ParallelLogger::logDebug(paste("Finished converting", field)) }) } # Reorder the columns according to the list of fields - logInfo("Reorder the columns according to the list of fields...") + ParallelLogger::logInfo("Reorder the columns according to the list of fields...") merged_data <- merged_data[, 
c(names(fields), setdiff(names(merged_data), names(fields)))] - logInfo("Finished processing fields for the single csv product_data.") + ParallelLogger::logInfo("Finished processing fields for the single csv product_data.") return(merged_data) } @@ -313,7 +313,7 @@ report_empty_intersections <- function(df, row_category, col_category) { df_row_sums <- df_row_sums[df_row_sums$sum == 0, ] if (nrow(df_row_sums) > 0) { - logWarn( + ParallelLogger::logWarn( "The number of ", row_category, " with empty ", col_category, " is ", nrow(df_row_sums), ": ", paste(df_row_sums$row_name, sep = "", collapse = ", ") diff --git a/R/helper_main.R b/R/helper_main.R index ca8d1a3..b5cd0a1 100644 --- a/R/helper_main.R +++ b/R/helper_main.R @@ -139,7 +139,7 @@ read_column_synonyms <- function(synonym_file, path_prefixes = c("reference_data #' ) #' } export_data <- function(data, filename, output_root, suffix) { - logDebug("Start export_data. Suffix = ", suffix, ".") + ParallelLogger::logDebug("Start export_data. Suffix = ", suffix, ".") data %>% write.csv( file = @@ -156,7 +156,7 @@ export_data <- function(data, filename, output_root, suffix) { fileEncoding = "UTF-16LE", quote = T ) - logInfo("Finish export_data. Suffix = ", suffix, ".") + ParallelLogger::logInfo("Finish export_data. Suffix = ", suffix, ".") } @@ -177,12 +177,12 @@ export_data <- function(data, filename, output_root, suffix) { #' ) #' } export_data_as_parquet <- function(data, filename, output_root, suffix) { - logDebug("Start export_data. Suffix = ", suffix, ".") + ParallelLogger::logDebug("Start export_data_as_parquet. Suffix = ", suffix, ".") data %>% arrow::write_parquet( sink = file.path(output_root, paste0(filename, suffix, ".parquet")), ) - logInfo("Finish export_data. Suffix = ", suffix, ".") + ParallelLogger::logInfo("Finish export_data_as_parquet. Suffix = ", suffix, ".") } @@ -193,7 +193,7 @@ export_data_as_parquet <- function(data, filename, output_root, suffix) { #' #' @return tibble with patient data read_raw_csv <- function(file) { - logDebug("Start reading data with read_csv.") + ParallelLogger::logDebug("Start reading data with read_csv.") df_patient_raw <- readr::read_csv( file, name_repair = "check_unique", @@ -202,9 +202,9 @@ read_raw_csv <- function(file) { col_types = readr::cols(.default = "c"), locale = readr::locale(encoding = "UTF-16LE") ) - logDebug("Finished loading data with read_csv.") - logInfo("Dim: ", dim(df_patient_raw)) - logInfo("Columns: ", spec(df_patient_raw)) + ParallelLogger::logDebug("Finished loading data with read_csv.") + ParallelLogger::logInfo("Dim: ", dim(df_patient_raw)) + ParallelLogger::logInfo("Columns: ", readr::spec(df_patient_raw)) df_patient_raw } @@ -218,6 +218,6 @@ read_raw_csv <- function(file) { #' @return A named character vector with all allowed provinces.
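# The hunk just below swaps magrittr's %>% for the base R native pipe |>
# (available since R 4.1), so this helper no longer needs magrittr attached.
# A minimal sketch of the equivalence, with a toy list in place of the
# provinces YAML:
x <- list(KH = c("Phnom Penh", "Siem Reap"), LA = "Vientiane")
x |> unlist() # named character vector, same result as unlist(x)
identical(x |> unlist(), unlist(x)) # TRUE: |> is plain function application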
get_allowed_provinces <- function() { ## Should new countries and provinces be added, update the YAML file - provinces <- yaml::read_yaml("reference_data/provinces/allowed_provinces.yaml") %>% unlist() + provinces <- yaml::read_yaml("reference_data/provinces/allowed_provinces.yaml") |> unlist() return(provinces) } diff --git a/R/helper_patient_data_fix.R b/R/helper_patient_data_fix.R index 05bb168..5b10d72 100644 --- a/R/helper_patient_data_fix.R +++ b/R/helper_patient_data_fix.R @@ -24,11 +24,11 @@ convert_to <- function(x, cast_fnc, error_val, col_name = "", id = "") { x <- tryCatch( cast_fnc(x), error = function(e) { - logError("Could not convert value ", x, " in column ", col_name, " for patient: ", id) + ParallelLogger::logError("Could not convert value ", x, " in column ", col_name, " for patient: ", id) x <- error_val }, warning = function(w) { - logWarn("Could not convert value ", x, " in column ", col_name, " for patient: ", id) + ParallelLogger::logWarn("Could not convert value ", x, " in column ", col_name, " for patient: ", id) x <- error_val } ) @@ -55,7 +55,7 @@ cut_numeric_value <- function(x, } if (x < min || x > max) { - logWarn( + ParallelLogger::logWarn( "Found invalid value ", x, " for column ", col_name, " outside [", min, ", ", max, "]. ", "Value was replaced with ", ERROR_VAL_NUMERIC, "." ) @@ -154,7 +154,7 @@ parse_dates <- function(date) { parsed_date <- suppressWarnings(lubridate::as_date(date)) if (is.na(parsed_date)) { - logWarn( + ParallelLogger::logWarn( "Could not parse date value ", date, ". ", "Trying to parse with lubridate::parse_date_time and orders = c('dmy', 'dmY', 'by', 'bY')." ) @@ -195,9 +195,9 @@ check_allowed_values <- function(x, valid_values, id, replace_invalid = TRUE, er valid_value_mapping <- setNames(as.list(valid_values), sanitize_str(valid_values)) if (!sanitize_str(x) %in% names(valid_value_mapping)) { - logWarn("Patient ", id, ": Value ", x, " for column ", col, " is not in the list of allowed values. ") + ParallelLogger::logWarn("Patient ", id, ": Value ", x, " for column ", col, " is not in the list of allowed values. ") if (replace_invalid) { - logInfo("Replacing ", x, " with ", error_val, ".") + ParallelLogger::logInfo("Replacing ", x, " with ", error_val, ".") return(error_val) } else { return(x) @@ -275,13 +275,13 @@ fix_age <- function(age, dob, tracker_year, tracker_month, id) { } if (is.na(age)) { - logWarn( + ParallelLogger::logWarn( "Patient ", id, ": age is missing. Using calculated age ", calc_age, " instead of original age." ) } else { if (calc_age != age) { - logWarn( + ParallelLogger::logWarn( "Patient ", id, ": age ", age, " is different from calculated age ", calc_age, ". Using calculated age instead of original age." ) @@ -289,7 +289,7 @@ fix_age <- function(age, dob, tracker_year, tracker_month, id) { } if (calc_age < 0) { - logWarn("Patient ", id, ": calculated age is negative. Something went wrong.") + ParallelLogger::logWarn("Patient ", id, ": calculated age is negative. 
Something went wrong.") calc_age <- ERROR_VAL_NUMERIC } } @@ -322,11 +322,11 @@ fix_bmi <- function(weight, height, id) { if (!is.na(weight) && weight == ERROR_VAL_CHARACTER) { - logWarn("Patient ", id, ": the weight is out of bounds.") + ParallelLogger::logWarn("Patient ", id, ": the weight is out of bounds.") } if (!is.na(height) && height == ERROR_VAL_CHARACTER) { - logWarn("Patient ", id, ": the height is out of bounds.") + ParallelLogger::logWarn("Patient ", id, ": the height is out of bounds.") } bmi } @@ -353,7 +353,7 @@ fix_sex <- function(sex, id) { ) if (!is.na(fixed_sex) && fixed_sex == ERROR_VAL_CHARACTER) { - logWarn("Patient ", id, ": sex ", sex, " is not in the list of synonyms. Replacing it with ", fixed_sex, ".") + ParallelLogger::logWarn("Patient ", id, ": sex ", sex, " is not in the list of synonyms. Replacing it with ", fixed_sex, ".") } fixed_sex } @@ -463,7 +463,7 @@ fix_testing_frequency <- function(test_frq) { } if (grepl("-", test_frq, fixed = TRUE)) { - logInfo("Found a range for testing_frequency. Replacing it with the mean.") + ParallelLogger::logInfo("Found a range for testing_frequency. Replacing it with the mean.") test_frq <- try(as.character(replace_range_with_mean(test_frq), silent = TRUE)) } @@ -492,7 +492,7 @@ replace_range_with_mean <- function(x) { #' #' @return data frame with two new columns: blood_pressure_sys_mmhg and blood_pressure_dias_mmhg. split_bp_in_sys_and_dias <- function(df) { - logInfo("Splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") + ParallelLogger::logInfo("Splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") df <- df %>% dplyr::mutate( blood_pressure_mmhg = dplyr::case_when( @@ -502,7 +502,7 @@ split_bp_in_sys_and_dias <- function(df) { ) if (paste(ERROR_VAL_NUMERIC, ERROR_VAL_NUMERIC, sep = "/") %in% df$blood_pressure_mmhg) { - logWarn( + ParallelLogger::logWarn( "Found invalid values for column blood_pressure_mmhg that do not follow the format X/Y. ", "Values were replaced with ", ERROR_VAL_NUMERIC, "." ) @@ -514,7 +514,7 @@ split_bp_in_sys_and_dias <- function(df) { delim = "/", names = c("blood_pressure_sys_mmhg", "blood_pressure_dias_mmhg"), ) - logDebug("Finished splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") + ParallelLogger::logDebug("Finished splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") df } @@ -550,12 +550,12 @@ fix_id <- function(id) { id <- stringr::str_replace(id, "-", "_") if (!grepl("^[[:upper:]]{2}_[[:upper:]]{2}[[:digit:]]{3}$", id)) { - logWarn("Patient ", id, ": id cannot be matched to a 7 letter alpha numeric code like XX_YY001. ") + ParallelLogger::logWarn("Patient ", id, ": id cannot be matched to a 7 letter alpha numeric code like XX_YY001. 
") if (stringr::str_length(id) > 8) { - logWarn("Patient ", id, ": id was truncated because it is longer than 8 characters.") + ParallelLogger::logWarn("Patient ", id, ": id was truncated because it is longer than 8 characters.") id <- stringr::str_sub(id, 1, 8) } else { - logError("Patient ", id, ": id is not valid.") + ParallelLogger::logError("Patient ", id, ": id is not valid.") id <- ERROR_VAL_CHARACTER } } diff --git a/R/helper_product_data.R b/R/helper_product_data.R index 6f04cd7..3e56f6b 100644 --- a/R/helper_product_data.R +++ b/R/helper_product_data.R @@ -83,7 +83,7 @@ get_patient_end <- function(df, j) { # @Description: Reads product data from a monthly file based on extraction logic extract_product_data <- function(monthly_tracker_df) { - logDebug("Starting extract_product_data.") + ParallelLogger::logDebug("Starting extract_product_data.") start_df_msd <- NULL end_df_msd <- NULL @@ -103,7 +103,7 @@ extract_product_data <- function(monthly_tracker_df) { # Clean empty remaining first row product_data_df <- set_second_row_as_headers_and_remove_first_row(product_data_df) - logDebug("Finish extract_product_data.") + ParallelLogger::logDebug("Finish extract_product_data.") return(product_data_df) } @@ -134,7 +134,7 @@ extract_product_data <- function(monthly_tracker_df) { # column synonyms to unify column names # @columns_synonyms: Long format output of read_column_synonyms to match columns harmonize_input_data_columns <- function(product_df, columns_synonyms) { - logDebug("Start harmonize_input_data_columns.") + ParallelLogger::logDebug("Start harmonize_input_data_columns.") # In case that there is additional data in strange columns, keep only relevant columns # keep.cols <- names(product_df) %in% c("") @@ -147,14 +147,14 @@ harmonize_input_data_columns <- function(product_df, columns_synonyms) { ## report all column names which have not been found unknown_column_names <- colnames(product_df)[!colnames(product_df) %in% synonym_headers] if (length(unknown_column_names) > 0) { - logWarn("Unknown column names in sheet: ", paste(unknown_column_names, collapse = ", ")) + ParallelLogger::logWarn("Unknown column names in sheet: ", paste(unknown_column_names, collapse = ", ")) } # replacing var codes colnames_found <- match(colnames(product_df), synonym_headers, nomatch = 0) colnames(product_df)[colnames(product_df) %in% synonym_headers] <- columns_synonyms$name_clean[colnames_found] - logDebug("Finish harmonize_input_data_columns.") + ParallelLogger::logDebug("Finish harmonize_input_data_columns.") if (sum(colnames_found == 0) != 0) { "Non-matching column names found (see 0)" # SK: remove non matching column names @@ -282,7 +282,7 @@ update_receivedfrom <- function(product_df) { grepl("Balance", product_units_received, ignore.case = TRUE) & !is.na(product_received_from) ~ product_received_from )) %>% dplyr::mutate(product_units_released = ifelse(!is.na(product_received_from), NA, product_units_released)) - logInfo("The rule for the case was applied successfully- Released (product_units_released) column also includes values for Start/End Balance") + ParallelLogger::logInfo("The rule for the case was applied successfully- Released (product_units_released) column also includes values for Start/End Balance") } return(product_df) } diff --git a/R/helper_read_patient_data.R b/R/helper_read_patient_data.R index cca5779..8f61b04 100644 --- a/R/helper_read_patient_data.R +++ b/R/helper_read_patient_data.R @@ -1,7 +1,7 @@ # extracting country and clinic code from patient ID # expects that patient 
ID has a certain format extract_country_clinic_code <- function(patient_data) { - logDebug("Start extract_country_clinic_code.") + ParallelLogger::logDebug("Start extract_country_clinic_code.") patient_ids <- patient_data["id"] %>% dplyr::filter(id != "0") %>% tidyr::drop_na() %>% @@ -19,9 +19,9 @@ extract_country_clinic_code <- function(patient_data) { clinic_code <- names(sort(table(patient_ids$clinic), decreasing = T))[1] - logDebug("country_code = ", country_code, ".") - logDebug("clinic_code = ", clinic_code, ".") - logDebug("Finish extract_country_clinic_code.") + ParallelLogger::logDebug("country_code = ", country_code, ".") + ParallelLogger::logDebug("clinic_code = ", clinic_code, ".") + ParallelLogger::logDebug("Finish extract_country_clinic_code.") return(list("country_code" = country_code, "clinic_code" = clinic_code)) } @@ -40,9 +40,9 @@ extract_country_clinic_code <- function(patient_data) { #' @return data.frame with the patient data #' @export extract_patient_data <- function(tracker_data_file, sheet, year) { - logDebug("Start extract_patient_data for sheet = ", sheet, ".") + ParallelLogger::logDebug("Start extract_patient_data for sheet = ", sheet, ".") - logDebug("Start openxlsx::read.xlsx to get tracker_data.") + ParallelLogger::logDebug("Start openxlsx::read.xlsx to get tracker_data.") tracker_data <- openxlsx::read.xlsx( xlsxFile = tracker_data_file, sheet = sheet, @@ -59,7 +59,7 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { # col_names = F, # .name_repair = "unique_quiet" # ) - logDebug("Finish openxlsx::read.xlsx.") + ParallelLogger::logDebug("Finish openxlsx::read.xlsx.") # Assumption: first column is always empty until patient data begins patient_data_range <- which(!is.na(tracker_data[, 1])) @@ -80,8 +80,8 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { row_max <- row_max + 1 } - logInfo("Patient data found in rows ", row_min, " to ", row_max, ".") - logDebug("Start readxl::read_excel to get patient data.") + ParallelLogger::logInfo("Patient data found in rows ", row_min, " to ", row_max, ".") + ParallelLogger::logDebug("Start readxl::read_excel to get patient data.") df_patient <- readxl::read_excel( path = tracker_data_file, sheet = sheet, @@ -91,11 +91,11 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { col_types = c("text"), .name_repair = "unique_quiet" ) - logDebug("Finish readxl::read_excel.") + ParallelLogger::logDebug("Finish readxl::read_excel.") if (header_cols[2] == header_cols_2[2]) { # take into account that date info gets separated from the updated values (not in the same row, usually in the bottom row) - logInfo("Read in multiline header.") + ParallelLogger::logInfo("Read in multiline header.") diff_colnames <- which((header_cols != header_cols_2)) header_cols[diff_colnames] <- @@ -106,7 +106,7 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { } colnames(df_patient) <- header_cols - logDebug("Found patient column names = ", paste(header_cols, collapse = ","), ".") + ParallelLogger::logDebug("Found patient column names = ", paste(header_cols, collapse = ","), ".") # delete columns without a header (=NA) df_patient <- df_patient[, !is.na(colnames(df_patient))] @@ -117,7 +117,7 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { df_patient <- df_patient[rowSums(is.na(df_patient)) != ncol(df_patient), ] - logDebug("Finish extract_patient_data.") + ParallelLogger::logDebug("Finish extract_patient_data.") df_patient } @@ -137,7 +137,7 @@ 
extract_patient_data <- function(tracker_data_file, sheet, year) { #' @export harmonize_patient_data_columns <- function(patient_df, columns_synonyms) { - logDebug("Start harmonize_patient_data_columns.") + ParallelLogger::logDebug("Start harmonize_patient_data_columns.") patient_df <- patient_df[!is.na(names(patient_df))] @@ -151,11 +151,11 @@ harmonize_patient_data_columns <- mismatching_column_ids <- which(colnames_found == 0) if (length(mismatching_column_ids) > 0) { - logWarn( + ParallelLogger::logWarn( "Non-matching column names found: ", paste(colnames(patient_df)[mismatching_column_ids], collapse = ","), "." ) } - logDebug("Finish harmonize_patient_data_columns.") + ParallelLogger::logDebug("Finish harmonize_patient_data_columns.") patient_df } diff --git a/R/helper_scipt_1.R b/R/helper_scipt_1.R index 222d5f1..f1cf102 100644 --- a/R/helper_scipt_1.R +++ b/R/helper_scipt_1.R @@ -1,5 +1,3 @@ - - #' @title Process a single tracker file and extract patient and product data. #' #' @param tracker_file Filename of the tracker. @@ -13,56 +11,57 @@ process_tracker_file <- function(tracker_file, paths, p) { synonyms <- get_synonyms() tracker_data_file <- file.path(paths$tracker_root, tracker_file) - logDebug("Start process_tracker_file.") - logInfo( + + ParallelLogger::logDebug("Start process_tracker_file.") + ParallelLogger::logInfo( "Current file: ", tracker_name ) logfile <- paste0(tracker_name, "_", "patient") with_file_logger(logfile, - { - tryCatch( - process_patient_data( - tracker_name = tracker_name, - tracker_data_file = tracker_data_file, - output_root = paths$patient_data_raw, - synonyms_patient = synonyms$patient - ), - error = function(e) { - logError("Could not process patient data. Error = ", e$message, ".") - }, - warning = function(w) { - logWarn("Could not process patient data. Warning = ", w$message, ".") - } - ) - }, - output_root = paths$output_root + { + tryCatch( + process_patient_data( + tracker_name = tracker_name, + tracker_data_file = tracker_data_file, + output_root = paths$patient_data_raw, + synonyms_patient = synonyms$patient + ), + error = function(e) { + ParallelLogger::logError("Could not process patient data. Error = ", e$message, ".") + }, + warning = function(w) { + ParallelLogger::logWarn("Could not process patient data. Warning = ", w$message, ".") + } + ) + }, + output_root = paths$output_root ) logfile <- paste0(tracker_name, "_", "product") with_file_logger(logfile, - { - tryCatch( - process_product_data( - tracker_name = tracker_name, - tracker_data_file = tracker_data_file, - output_root = paths$product_data_raw, - synonyms_product = synonyms$product - ), - error = function(e) { - logError("Could not process product data. Error = ", e$message, ".") - }, - warning = function(w) { - logWarn("Could not process product data. Warning = ", w$message, ".") - } - ) - }, - output_root = paths$output_root + { + tryCatch( + process_product_data( + tracker_name = tracker_name, + tracker_data_file = tracker_data_file, + output_root = paths$product_data_raw, + synonyms_product = synonyms$product + ), + error = function(e) { + ParallelLogger::logError("Could not process product data. Error = ", e$message, ".") + }, + warning = function(w) { + ParallelLogger::logWarn("Could not process product data. 
Warning = ", w$message, ".") + } + ) + }, + output_root = paths$output_root ) - logInfo("Finish process_tracker_file.") + ParallelLogger::logDebug("Finish process_tracker_file.") } @@ -79,7 +78,7 @@ process_patient_data <- tracker_data_file, output_root, synonyms_patient) { - logDebug("Start process_patient_data.") + ParallelLogger::logDebug("Start process_patient_data.") df_raw_patient <- reading_patient_data( @@ -89,7 +88,7 @@ process_patient_data <- df_raw_patient <- df_raw_patient %>% dplyr::mutate(file_name = tracker_name) - logDebug( + ParallelLogger::logDebug( "df_raw_patient dim: ", dim(df_raw_patient) %>% as.data.frame(), "." @@ -102,7 +101,7 @@ process_patient_data <- suffix = "_patient_raw" ) - logInfo("Finish process_patient_data.") + ParallelLogger::logDebug("Finish process_patient_data.") } @@ -119,7 +118,7 @@ process_product_data <- tracker_data_file, output_root, synonyms_product) { - logDebug("Start process_product_data.") + ParallelLogger::logDebug("Start process_product_data.") df_raw_product <- reading_product_data_step1( @@ -130,10 +129,10 @@ process_product_data <- if (!is.null(df_raw_product)) { df_raw_product <- df_raw_product %>% dplyr::mutate(file_name = tracker_name) } else { - logDebug("Empty product data") + ParallelLogger::logDebug("Empty product data") } - logDebug( + ParallelLogger::logDebug( "df_raw_product dim: ", dim(df_raw_product) %>% as.data.frame(), "." @@ -148,7 +147,7 @@ process_product_data <- suffix = "_product_raw" ) } else { - logWarn("No product data in the file") + ParallelLogger::logWarn("No product data in the file") } - logDebug("Finish process_product_data.") + ParallelLogger::logDebug("Finish process_product_data.") } diff --git a/R/helper_script_2.R b/R/helper_script_2.R new file mode 100644 index 0000000..035b5db --- /dev/null +++ b/R/helper_script_2.R @@ -0,0 +1,364 @@ +#' Process and clean patient raw data +#' +#' @description +#' Read in the output of the first script as parquet (all character). +#' Merge extracted patient data with the metadata schema that contains all +#' necessary columns. Process each variable in three steps: +#' 1. Fix any problems with character representation. +#' 2. Convert column to target data type, for example a date or number. +#' 3. Apply any post-processing logic for the target data type. +#' +#' +#' @param patient_file Path to patient raw parquet file from first script. +#' @param paths List with paths to patient_data_cleaned and product_data_cleaned folders. +#' @param p progressor from progressr package. +#' +#' @export +process_patient_file <- function(patient_file, paths, p) { + p() + + ERROR_VAL_NUMERIC <<- 999999 + ERROR_VAL_CHARACTER <<- "Undefined" + ERROR_VAL_DATE <<- "9999-09-09" + + patient_file_name <- tools::file_path_sans_ext(basename(patient_file)) + patient_file_path <- + file.path(paths$tracker_root, patient_file) + output_root <- paths$patient_data_cleaned + + ParallelLogger::logDebug("Start process_patient_file.") + ParallelLogger::logInfo( + "Current file: ", + patient_file_name + ) + + logfile <- paste0(patient_file_name) + + with_file_logger(logfile, + { + tryCatch( + process_patient_file_worker( + patient_file_path = patient_file_path, + patient_file_name = patient_file_name, + output_root = output_root + ), + error = function(e) { + ParallelLogger::logError("Could not process raw patient data. Error = ", e$message, ".") + }, + warning = function(w) { + ParallelLogger::logWarn("Could not process raw patient data. 
Warning = ", w$message, ".") + } + ) + }, + output_root = paths$output_root + ) + + ParallelLogger::logInfo("Finish process_patient_file.") +} + + +process_patient_file_worker <- function(patient_file_path, patient_file_name, output_root) { + allowed_provinces <- get_allowed_provinces() + + df_patient_raw <- arrow::read_parquet(patient_file_path) + + # filter all rows with no patient id or patient name + df_patient_raw <- df_patient_raw %>% + dplyr::filter(!(is.na(id) & is.na(name))) %>% + dplyr::filter(!(id == "0" & name == "0")) + + # --- TRANSFORMATIONS --- + # data before 2019 had only one column for updated hba1c and fbg + # with date as part of the value + if (!"hba1c_updated_date" %in% colnames(df_patient_raw) && "hba1c_updated" %in% colnames(df_patient_raw)) { + ParallelLogger::logInfo("Column updated_hba1c_date not found. Trying to parse from hba1c_updated.") + df_patient_raw <- + extract_date_from_measurement(df_patient_raw, "hba1c_updated") + ParallelLogger::logDebug("Finished parsing dates from hba1c_updated.") + } + + if (!"fbg_updated_date" %in% colnames(df_patient_raw) && "fbg_updated_mg" %in% colnames(df_patient_raw)) { + ParallelLogger::logInfo("Column updated_fbg_date not found. Trying to parse from fbg_updated_mg.") + df_patient_raw <- + extract_date_from_measurement(df_patient_raw, "fbg_updated_mg") + ParallelLogger::logDebug("Finished parsing dates from fbg_updated_mg.") + } + + if (!"fbg_updated_date" %in% colnames(df_patient_raw) && "fbg_updated_mmol" %in% colnames(df_patient_raw)) { + ParallelLogger::logInfo("Column fbg_updated_date not found. Trying to parse from fbg_updated_mmol.") + df_patient_raw <- + extract_date_from_measurement(df_patient_raw, "fbg_updated_mmol") + ParallelLogger::logDebug("Finished parsing dates from fbg_updated_mmol.") + } + + # blood pressure is given as sys/dias value pair, + # so we split this column in two separate columns + if ("blood_pressure_mmhg" %in% colnames(df_patient_raw)) { + df_patient_raw <- split_bp_in_sys_and_dias(df_patient_raw) + } + + # The maximum value available for hba1c will be around 14% - 18%, + # depending on the equipment being used. + # If the reading is above the maximum available value the > sign is used - + # we would prefer to retain this character in the database as it is important for data analysis. 
+ ParallelLogger::logInfo("Adding columns hba1c_baseline_exceeds and hba1c_updated_exceeds.") + df_patient_raw <- df_patient_raw %>% + dplyr::mutate( + hba1c_baseline_exceeds = ifelse(grepl(">|<", hba1c_baseline), TRUE, FALSE), + hba1c_updated_exceeds = ifelse(grepl(">|<", hba1c_updated), TRUE, FALSE) + ) + + # --- META SCHEMA --- + # meta schema has all final columns for the database + # along with their corresponding data types + ParallelLogger::logInfo("Creating meta schema.") + # short type string for read_csv: + # iiinDccDcnnDnncnlnlDncDccDDDccccDccccciDciiiDn + schema <- tibble::tibble( + # clinic_visit = logical(), + # complication_screening = character(), + # complication_screening_date = lubridate::as_date(1), + # complication_screening_results = character(), + # dm_complication_comment = character(), # TODO + # dm_complication_eye = character(), # TODO + # dm_complication_kidney = character(), # TODO + # dm_complication_other = character(), # TODO + # est_strips_pmonth = integer(), + # family_support_scale = character(), # TODO + # inactive_reason = character(), + # insulin_dosage = character(), + # meter_received_date = lubridate::as_date(1), # TODO + # remarks = character(), + # remote_followup = logical(), + # additional_support = character(), + age = integer(), + blood_pressure_dias_mmhg = integer(), + blood_pressure_sys_mmhg = integer(), + bmi = numeric(), + bmi_date = lubridate::as_date(1), + clinic_code = character(), + country_code = character(), + dob = lubridate::as_date(1), + edu_occ = character(), + fbg_baseline_mg = numeric(), + fbg_baseline_mmol = numeric(), + fbg_updated_date = lubridate::as_date(1), + fbg_updated_mg = numeric(), + fbg_updated_mmol = numeric(), + file_name = character(), + hba1c_baseline = numeric(), + hba1c_baseline_exceeds = logical(), + hba1c_updated = numeric(), + hba1c_updated_exceeds = logical(), + hba1c_updated_date = lubridate::as_date(1), + height = numeric(), + hospitalisation_cause = character(), + hospitalisation_date = lubridate::as_date(1), + id = character(), + insulin_regimen = character(), + last_clinic_visit_date = lubridate::as_date(1), + last_remote_followup_date = lubridate::as_date(1), + lost_date = lubridate::as_date(1), + name = character(), + observations = character(), + observations_category = character(), + province = character(), + recruitment_date = lubridate::as_date(1), + sex = character(), + sheet_name = character(), + status = character(), + status_out = character(), + support_from_a4d = character(), + t1d_diagnosis_age = integer(), + t1d_diagnosis_date = lubridate::as_date(1), + t1d_diagnosis_with_dka = character(), + testing_frequency = integer(), + tracker_date = lubridate::as_date(1), + tracker_month = integer(), + tracker_year = integer(), + updated_2022_date = lubridate::as_date(1), + weight = numeric() + ) + + cols_extra <- colnames(df_patient_raw)[!colnames(df_patient_raw) %in% colnames(schema)] + ParallelLogger::logWarn("Extra columns in patient data: ", paste(cols_extra, collapse = ", ")) + + cols_missing <- + colnames(schema)[!colnames(schema) %in% colnames(df_patient_raw)] + ParallelLogger::logWarn("Missing columns in patient data: ", paste(cols_missing, collapse = ", ")) + + # add all columns of schema to df_patient_raw + # keep all rows, only append missing cols + ParallelLogger::logInfo("Merging df_patient with meta schema and selecting all columns of meta schema.") + df_patient <- merge.default(df_patient_raw, schema, all.x = T) + df_patient <- df_patient[colnames(schema)] + + # the cleaning, fixing 
and validating happens in three major steps: + # 1. make sure we fix any known problems in the raw character columns + df_patient <- + df_patient %>% + dplyr::rowwise() %>% + # 1. handle known problems before converting to target type + dplyr::mutate( + t1d_diagnosis_age = fix_t1d_diagnosis_age(t1d_diagnosis_age, id), + hba1c_baseline = stringr::str_replace(hba1c_baseline, "<|>", ""), + hba1c_updated = stringr::str_replace(hba1c_updated, "<|>", ""), + fbg_baseline_mg = fix_fbg(fbg_baseline_mg), + fbg_baseline_mmol = fix_fbg(fbg_baseline_mmol), + fbg_updated_mg = fix_fbg(fbg_updated_mg), + fbg_updated_mmol = fix_fbg(fbg_updated_mmol), + testing_frequency = fix_testing_frequency(testing_frequency) + ) %>% + # 2. convert the refined character columns into the target data type + dplyr::mutate( + dplyr::across( + schema %>% dplyr::select(tidyselect::where(is.numeric)) %>% names(), + \(x) convert_to(correct_decimal_sign(x), as.numeric, ERROR_VAL_NUMERIC, dplyr::cur_column(), id = id) + ), + dplyr::across( + schema %>% dplyr::select(tidyselect::where(is.logical)) %>% names(), + \(x) convert_to(x, as.logical, FALSE, dplyr::cur_column(), id = id) + ), + dplyr::across( + schema %>% dplyr::select(tidyselect::where(lubridate::is.Date)) %>% names(), + \(x) convert_to(fix_digit_date(x), parse_dates, as.Date(ERROR_VAL_DATE), dplyr::cur_column(), id = id) + ), + dplyr::across( + schema %>% dplyr::select(tidyselect::where(is.integer)) %>% names(), + \(x) convert_to(x, function(x) as.integer(round(as.double(x))), ERROR_VAL_NUMERIC, dplyr::cur_column(), id = id) + ) + ) %>% + # 3. fix remaining problems in the target data type + dplyr::mutate( + # height and weight are needed to calculate bmi + height = transform_cm_to_m(height) %>% + cut_numeric_value(min = 0, max = 2.3, col_name = "height"), + weight = cut_numeric_value(weight, min = 0, max = 200, col_name = "weight"), + bmi = fix_bmi(weight, height, id) %>% + cut_numeric_value(min = 4, max = 60, "bmi"), + age = fix_age(age, dob, tracker_year, tracker_month, id) %>% + cut_numeric_value(min = 0, max = 25, "age"), + sex = fix_sex(sex, id), + hba1c_baseline = cut_numeric_value(hba1c_baseline, min = 4, max = 18, "hba1c_baseline"), + # https://www.cleveland19.com/story/1425584/ohio-man-holds-world-record-of-highest-blood-sugar/ + fbg_baseline_mmol = cut_numeric_value(fbg_baseline_mmol, min = 0, max = 136.5, "fbg_baseline_mmol"), + # https://www.cleveland19.com/story/1425584/ohio-man-holds-world-record-of-highest-blood-sugar/ + fbg_updated_mmol = cut_numeric_value(fbg_updated_mmol, min = 0, max = 136.5, "fbg_updated_mmol"), + blood_pressure_sys_mmhg = cut_numeric_value(blood_pressure_sys_mmhg, min = 20, max = 250, "blood_pressure_sys_mmhg"), + blood_pressure_dias_mmhg = cut_numeric_value(blood_pressure_dias_mmhg, min = 20, max = 220, "blood_pressure_dias_mmhg"), + tracker_date = lubridate::ym(paste(tracker_year, tracker_month, sep = "-")), + !!!parse_character_cleaning_config(a4d:::config$cleaning), + # should be fixed last as other fix functions use id to log invalid rows! 
+ id = fix_id(id) + ) %>% + dplyr::ungroup() + + # add clinic and country code after having fixed all issues with patient id + cc_codes <- extract_country_clinic_code(df_patient) + df_patient["clinic_code"] <- cc_codes$clinic_code + df_patient["country_code"] <- cc_codes$country_code + + # Formula to calculate mmol/l from mg/dl: mmol/l = mg/dl / 18 + if (all(is.na(df_patient$fbg_baseline_mmol))) { + df_patient <- df_patient %>% + dplyr::mutate(fbg_baseline_mmol = dplyr::case_when( + fbg_baseline_mg != ERROR_VAL_NUMERIC ~ fbg_baseline_mg / 18 + )) + } + if (all(is.na(df_patient$fbg_updated_mmol))) { + df_patient <- df_patient %>% + dplyr::mutate(fbg_updated_mmol = dplyr::case_when( + fbg_updated_mg != ERROR_VAL_NUMERIC ~ fbg_updated_mg / 18 + )) + } + + # Formula to calculate mg/dl from mmol/l: mg/dl = 18 × mmol/l + if (all(is.na(df_patient$fbg_baseline_mg))) { + df_patient <- df_patient %>% + dplyr::mutate(fbg_baseline_mg = dplyr::case_when( + fbg_baseline_mmol != ERROR_VAL_NUMERIC ~ fbg_baseline_mmol * 18 + )) + } + if (all(is.na(df_patient$fbg_updated_mg))) { + df_patient <- df_patient %>% + dplyr::mutate(fbg_updated_mg = dplyr::case_when( + fbg_updated_mmol != ERROR_VAL_NUMERIC ~ fbg_updated_mmol * 18 + )) + } + + # sort by year and month like it is in the tracker files + df_patient <- df_patient %>% + dplyr::arrange(tracker_date, id) + + ParallelLogger::logDebug( + "df_patient dim: ", + dim(df_patient) %>% as.data.frame(), + "." + ) + + export_data_as_parquet( + data = df_patient, + filename = stringr::str_replace(patient_file_name, "_patient_raw", ""), + output_root = output_root, + suffix = "_patient_cleaned" + ) +} + + +#' Process and clean product raw data +#' +#' @description +#' Read the raw product parquet file produced by the first script, clean it with reading_product_data_step2, and export the result as parquet. +#' +#' +#' @param product_file Path to product raw parquet file from first script. +#' @param paths List with paths to patient_data_cleaned and product_data_cleaned folders. +#' @param p progressor from progressr package. +#' +#' @export +process_product_file <- function(product_file, paths, p) { + p() + synonyms <- get_synonyms() + synonyms_product <- synonyms$product + product_file_name <- tools::file_path_sans_ext(basename(product_file)) + product_file_path <- + file.path(paths$tracker_root, product_file) + + ParallelLogger::logDebug("Start process_product_file.") + ParallelLogger::logInfo( + "Current file: ", + product_file_name + ) + df_product_raw <- arrow::read_parquet(product_file_path) + + logfile <- paste0(product_file_name) + with_file_logger(logfile, + { + tryCatch( + df_product_raw <- reading_product_data_step2(df_product_raw, synonyms_product), + error = function(e) { + ParallelLogger::logError("Could not process raw product data. Error = ", e$message, ".") + }, + warning = function(w) { + ParallelLogger::logWarn("Could not process raw product data. Warning = ", w$message, ".") + } + ) + + ParallelLogger::logDebug( + "df_product_raw dim: ", + dim(df_product_raw) %>% as.data.frame(), + "." 
+ ) + + export_data_as_parquet( + data = df_product_raw, + filename = stringr::str_replace(product_file_name, "_product_raw", ""), + output_root = paths$product_data_cleaned, + suffix = "_product_cleaned" + ) + }, + output_root = paths$output_root + ) + + ParallelLogger::logInfo("Finish process_product_file.") +} diff --git a/R/link_product_patient.R b/R/link_product_patient.R index 119de01..8816d57 100644 --- a/R/link_product_patient.R +++ b/R/link_product_patient.R @@ -14,7 +14,7 @@ #' link_product_patient("path/to/product_data.parquet", "path/to/patient_data.parquet") #' } link_product_patient <- function(product_file, patient_file) { - logInfo("Trying to link product file ", product_file, " with patient file ", patient_file) + ParallelLogger::logInfo("Trying to link product file ", product_file, " with patient file ", patient_file) patient_data <- arrow::read_parquet(patient_file) product_data <- arrow::read_parquet(product_file) @@ -44,7 +44,7 @@ link_product_patient <- function(product_file, patient_file) { tryCatch( { if (nrow(summary_df) > 0) { - logWarn( + ParallelLogger::logWarn( "The number of mismatched patient IDs between the product and patient data is ", nrow(summary_df), ". ", paste("File Name: ", summary_df$file_name, @@ -56,18 +56,18 @@ link_product_patient <- function(product_file, patient_file) { } }, error = function(e) { - logError("Could not link csv files for product and patient data. Error: ", e$message) + ParallelLogger::logError("Could not link csv files for product and patient data. Error: ", e$message) }, warning = function(w) { - logWarn("Could not link csv files for product and patient data. Warning: ", w$message) + ParallelLogger::logWarn("Could not link csv files for product and patient data. Warning: ", w$message) } ) } else { - logInfo( + ParallelLogger::logInfo( "There are no mismatched patient IDs between the product data - ", product_file, " and patient data - ", patient_file ) } - logInfo("Finished attempting to link product csv file with patient csv file.") + ParallelLogger::logInfo("Finished attempting to link product csv file with patient csv file.") } diff --git a/R/logger.R b/R/logger.R index 57958bd..e2a111b 100644 --- a/R/logger.R +++ b/R/logger.R @@ -9,17 +9,17 @@ setup_logger <- function(output_dir, log_name) { file.remove(logFileName) } - logger <- createLogger( + logger <- ParallelLogger::createLogger( name = "MAIN", threshold = "TRACE", appenders = list( - createFileAppender( - layout = layoutParallel, + ParallelLogger::createFileAppender( + layout = ParallelLogger::layoutParallel, fileName = logFileName ) ) ) - registerLogger(logger) + ParallelLogger::registerLogger(logger) log_dir <- file.path(output_dir, "logs") @@ -33,14 +33,15 @@ setup_logger <- function(output_dir, log_name) { #' @param output_root Output root directory for the current process. 
#' #' @return returns the loggers that were previously set for usage with with_ +#' @export setup_file_logger <- function(logfile, output_root) { - loggers <- getLoggers() - clearLoggers() + loggers <- ParallelLogger::getLoggers() + ParallelLogger::clearLoggers() logFileName <- file.path(output_root, "logs", paste0(logfile, ".log")) if (file.exists(logFileName)) { file.remove(logFileName) } - addDefaultFileLogger(logFileName, name = logfile) + ParallelLogger::addDefaultFileLogger(logFileName, name = logfile) loggers } @@ -58,9 +59,9 @@ setup_file_logger <- function(logfile, output_root) { with_file_logger <- withr::with_( setup_file_logger, function(loggers) { - clearLoggers() + ParallelLogger::clearLoggers() for (logger in loggers) { - registerLogger(logger) + ParallelLogger::registerLogger(logger) } } ) diff --git a/R/read_cleaned_patient_data.R b/R/read_cleaned_patient_data.R index ae56e34..b217da4 100644 --- a/R/read_cleaned_patient_data.R +++ b/R/read_cleaned_patient_data.R @@ -7,7 +7,7 @@ #' @export read_cleaned_patient_data <- function(input_root, patient_data_files) { - logInfo("Start read_cleaned_patient_data") + ParallelLogger::logInfo("Start read_cleaned_patient_data") patient_data <- patient_data_files %>% purrr::map(function(patient_file) { @@ -17,6 +17,6 @@ read_cleaned_patient_data <- - logInfo("Finish read_cleaned_patient_data") + ParallelLogger::logInfo("Finish read_cleaned_patient_data") patient_data } diff --git a/R/read_patient_data.R b/R/read_patient_data.R index b292f1c..61f6839 100644 --- a/R/read_patient_data.R +++ b/R/read_patient_data.R @@ -1,9 +1,9 @@ reading_patient_data <- function(tracker_data_file, columns_synonyms) { - logDebug("Start reading_patient_data.") + ParallelLogger::logDebug("Start reading_patient_data.") sheet_list <- readxl::excel_sheets(tracker_data_file) testit::assert(length(sheet_list) > 0) - logInfo( + ParallelLogger::logInfo( "Found ", length(sheet_list), " sheets inside the current file = ", @@ -13,7 +13,7 @@ reading_patient_data <- month_list <- sheet_list[na.omit(pmatch(month.abb, sheet_list))] - logInfo( + ParallelLogger::logInfo( "Found ", length(month_list), " month sheets inside the current file = ", @@ -24,18 +24,18 @@ reading_patient_data <- # Extract year year <- get_tracker_year(tracker_data_file, month_list) - logInfo("Tracker year = ", year, ".") + ParallelLogger::logInfo("Tracker year = ", year, ".") testit::assert(year %in% c(2017, 2018, 2019, 2020, 2021, 2022)) tidy_tracker_list <- NULL - logDebug("Start processing sheets.") + ParallelLogger::logDebug("Start processing sheets.") for (curr_sheet in month_list) { - logDebug("Start processing sheet ", curr_sheet, ".") + ParallelLogger::logDebug("Start processing sheet ", curr_sheet, ".") df_patient <- extract_patient_data(tracker_data_file, curr_sheet, year) testit::assert(nrow(df_patient) > 0) - logDebug("df_patient dim: ", dim(df_patient) %>% as.data.frame(), ".") + ParallelLogger::logDebug("df_patient dim: ", dim(df_patient) %>% as.data.frame(), ".") df_patient <- harmonize_patient_data_columns(df_patient, columns_synonyms) @@ -66,15 +66,15 @@ reading_patient_data <- ) tidy_tracker_list[[curr_sheet]] <- df_patient - logDebug("Finish processing sheet ", curr_sheet, ".") + ParallelLogger::logDebug("Finish processing sheet ", curr_sheet, ".") } - logDebug("Start combining sheet data into single data frame.") + ParallelLogger::logDebug("Start combining sheet data into single data frame.") df_raw <- dplyr::bind_rows(tidy_tracker_list) - logDebug("Finish combining sheet data into 
single data frame.") + ParallelLogger::logDebug("Finish combining sheet data into single data frame.") if ("Patient List" %in% sheet_list) { - logDebug("Start extracting patient list.") + ParallelLogger::logDebug("Start extracting patient list.") patient_list <- extract_patient_data( tracker_data_file, "Patient List", @@ -98,9 +98,9 @@ reading_patient_data <- by = "id", relationship = "many-to-one" ) - logDebug("Finish extracting patient list.") + ParallelLogger::logDebug("Finish extracting patient list.") } - logInfo("Finish reading_patient_data.") + ParallelLogger::logInfo("Finish reading_patient_data.") return(df_raw) } diff --git a/R/read_product_data.R b/R/read_product_data.R index 10d8886..d1fc051 100644 --- a/R/read_product_data.R +++ b/R/read_product_data.R @@ -4,7 +4,7 @@ # function based on parts from run_a4d_product_data.R and helper functions reading_product_data_step1 <- function(tracker_data_file, columns_synonyms) { - logDebug("Start reading_product_data_step1.") + ParallelLogger::logDebug("Start reading_product_data_step1.") # rename column names to match colnames(columns_synonyms) <- c("name_clean", "name_to_be_matched") @@ -13,13 +13,13 @@ reading_product_data_step1 <- month_list <- sheet_list[na.omit(pmatch(month.abb, sheet_list))] year <- get_tracker_year(tracker_data_file, month_list) - logDebug("Found ", length(sheet_list), " sheets: ", paste(sheet_list, collapse = ", "), ".") - logDebug("Found ", length(month_list), " month sheets: ", paste(month_list, collapse = ", "), ".") + ParallelLogger::logDebug("Found ", length(sheet_list), " sheets: ", paste(sheet_list, collapse = ", "), ".") + ParallelLogger::logDebug("Found ", length(month_list), " month sheets: ", paste(month_list, collapse = ", "), ".") # loop through all months for (curr_sheet in month_list) { - logDebug("Start processing the following sheet: ", curr_sheet) + ParallelLogger::logDebug("Start processing the following sheet: ", curr_sheet) # open tracker data tracker_data <- data.frame( @@ -37,7 +37,7 @@ reading_product_data_step1 <- grepl("Description of Support", tracker_data[, ]))) ) { # go to next month - logInfo("Could not find product data in tracker data. Skipping ", curr_sheet, ".") + ParallelLogger::logInfo("Could not find product data in tracker data. Skipping ", curr_sheet, ".") next } @@ -46,7 +46,7 @@ reading_product_data_step1 <- # If after extraction, dataframe is empty, this iteration is also skipped. if (all(is.na(product_df))) { - logInfo("Product data is empty. Skipping ", curr_sheet, ".") + ParallelLogger::logInfo("Product data is empty. Skipping ", curr_sheet, ".") next } @@ -69,11 +69,11 @@ reading_product_data_step1 <- tryCatch( { if (num_na_rows > 0) { - logInfo(curr_sheet, " the number of rows where the patient's name is missing: ", col_released, " is not NA and ", col_released_to, " (patient's name) is NA = ", num_na_rows) + ParallelLogger::logInfo(curr_sheet, " the number of rows where the patient's name is missing: ", col_released, " is not NA and ", col_released_to, " (patient's name) is NA = ", num_na_rows) } }, error = function(e) { - logError(curr_sheet, " trying with num_na_rows for products. Error: ", e$message) + ParallelLogger::logError(curr_sheet, " trying with num_na_rows for products. 
Error: ", e$message) } ) @@ -83,7 +83,7 @@ reading_product_data_step1 <- tryCatch( { if (nrow(non_processed_dates) > 0) { - logWarn( + ParallelLogger::logWarn( curr_sheet, " the number of rows with non-processed dates in product_entry_date is ", nrow(non_processed_dates), ": ", @@ -92,7 +92,7 @@ reading_product_data_step1 <- } }, error = function(e) { - logError(curr_sheet, " trying with non_processed_dates in product_entry_date. Error: ", e$message) + ParallelLogger::logError(curr_sheet, " trying with non_processed_dates in product_entry_date. Error: ", e$message) } ) @@ -107,7 +107,7 @@ reading_product_data_step1 <- # Check if the entry dates for the balance match the month/year on the sheet check_entry_dates(product_df, curr_sheet) - logDebug("Finish processing sheet: ", curr_sheet) + ParallelLogger::logDebug("Finish processing sheet: ", curr_sheet) # combine all months if (!exists("df_final")) { @@ -121,7 +121,7 @@ reading_product_data_step1 <- } else { return(NULL) } - logDebug("Finish reading_product_data_step1.") + ParallelLogger::logDebug("Finish reading_product_data_step1.") } @@ -151,7 +151,7 @@ count_na_rows <- function(df, units_released_col, released_to_col) { #' #' @return This function does not return a value. It logs a warning message if there are any dates in 'product_entry_date' that don't match the month/year on the sheet. check_entry_dates <- function(df, Sheet) { - logDebug("Start check_entry_dates.") + ParallelLogger::logDebug("Start check_entry_dates.") # Check if the entry dates for the balance match the month/year on the sheet entry_dates_df <- df %>% dplyr::filter(grepl("^[0-9]+$", product_entry_date)) @@ -167,14 +167,14 @@ check_entry_dates <- function(df, Sheet) { not_same <- entry_dates_df[entry_dates_df$ed_month != entry_dates_df$product_table_month | entry_dates_df$ed_year != entry_dates_df$product_table_year, ] if (nrow(not_same) > 0) { - logWarn( + ParallelLogger::logWarn( Sheet, " the number of dates in product_entry_date that don't match the month/year on the sheet is ", nrow(not_same), ": ", paste(not_same$ed_date, collapse = ", ") ) } - logDebug("Finish check_entry_dates.") + ParallelLogger::logDebug("Finish check_entry_dates.") } #' @title Remove Rows with NA Values in Specified Columns. 
@@ -192,7 +192,7 @@ remove_rows_with_na_columns <- na_rows <- apply(df[column_names], 1, function(x) all(is.na(x))) # log message - logInfo(paste(length(na_rows[na_rows == T]), " rows deleted out of ", nrow(df), " rows (reason: rows not containing additional info).", sep = "")) + ParallelLogger::logInfo(paste(length(na_rows[na_rows == T]), " rows deleted out of ", nrow(df), " rows (reason: rows not containing additional info).", sep = "")) # Return the data frame without the NA rows return(df[!na_rows, ]) @@ -215,7 +215,7 @@ check_negative_balance <- function(df, Sheet) { # Check if there are any rows in the new data frame if (nrow(negative_df) > 0) { # Log a warning message with the number of negative values and their corresponding product_balance values - logWarn( + ParallelLogger::logWarn( Sheet, " number of negative values in product_balance on the sheet is ", nrow(negative_df), ": ", @@ -242,7 +242,7 @@ switch_columns_stock <- "product_units_received" = "product_received_from", "product_received_from" = "product_units_received" ) - logDebug("Columns product_units_received and product_received_from were switched") + ParallelLogger::logDebug("Columns product_units_received and product_received_from were switched") return(df) } else { return(df) @@ -290,14 +290,14 @@ report_unknown_products <- function(df, Sheet, stock_list_df) { # Check if there are any unknown products names if (length(unmatched_products) > 0) { # Log a warning message with the number of unknown products names - logWarn( + ParallelLogger::logWarn( Sheet, " the number of unknown product names on the sheet is ", length(unmatched_products), ": ", paste(unmatched_products, collapse = ", ") ) } else { - logInfo(Sheet, " no unknown product names on the sheet") + ParallelLogger::logInfo(Sheet, " no unknown product names on the sheet") } } @@ -317,16 +317,16 @@ report_unknown_products <- function(df, Sheet, stock_list_df) { #' product_list <- load_product_reference_data("your_file.xlsx") #' } load_product_reference_data <- function(stock_summary_xlsx = "reference_data/master_tracker_variables.xlsx") { - logDebug("Trying to load the product list from the Stock Summary, ", stock_summary_xlsx, "...") + ParallelLogger::logDebug("Trying to load the product list from the Stock Summary, ", stock_summary_xlsx, "...") tryCatch( { product_names_df <- readxl::read_excel(stock_summary_xlsx, "Stock_Summary") colnames(product_names_df) <- tolower(colnames(product_names_df)) - logDebug(nrow(product_names_df), " product names were loaded from the Stock Summary.") + ParallelLogger::logDebug(nrow(product_names_df), " product names were loaded from the Stock Summary.") return(product_names_df) }, error = function(e) { - logError("Error in loading stock product list: ", e) + ParallelLogger::logError("Error in loading stock product list: ", e) } ) } @@ -371,7 +371,7 @@ add_product_categories <- function(inventory_data, product_category_mapping) { #' df <- extract_unit_capacity(df, "product") #' } extract_unit_capacity <- function(df, column_name) { - logDebug("Trying to extract Unit Capacity from ", column_name, " column") + ParallelLogger::logDebug("Trying to extract Unit Capacity from ", column_name, " column") # Extract all symbols between parentheses df$product_unit_capacity <- stringr::str_extract(df[[column_name]], "\\(([^)]+)\\)") @@ -393,7 +393,7 @@ extract_unit_capacity <- function(df, column_name) { # Add 1 to NA values df$product_unit_capacity[is.na(df$product_unit_capacity)] <- 1 - logDebug("Finished extracting Unit Capacity from ", 
column_name, " column") + ParallelLogger::logDebug("Finished extracting Unit Capacity from ", column_name, " column") return(df) } @@ -410,7 +410,7 @@ extract_unit_capacity <- function(df, column_name) { #' @return Cleaned product data for one specified tracker. reading_product_data_step2 <- function(df, columns_synonyms) { - logDebug("Start reading_product_data_step2.") + ParallelLogger::logDebug("Start reading_product_data_step2.") # rename column names to match colnames(columns_synonyms) <- c("name_clean", "name_to_be_matched") @@ -425,7 +425,7 @@ reading_product_data_step2 <- # loop through all months for (sheet_month in unique(df$product_sheet_name)) { - logDebug(paste("Start processing the following sheet:", sheet_month)) + ParallelLogger::logDebug(paste("Start processing the following sheet:", sheet_month)) # filter on month sheet product_df <- df %>% @@ -453,7 +453,7 @@ reading_product_data_step2 <- product_df <- remove_rows_with_na_columns(product_df, column_names_check) # jump to next sheet if dataframe empty from here if (nrow(product_df) == 0) { - logDebug(paste(sheet_month, " sheet is empty after filtering and skipped", sep = "")) + ParallelLogger::logDebug(paste(sheet_month, " sheet is empty after filtering and skipped", sep = "")) next } @@ -527,12 +527,12 @@ reading_product_data_step2 <- df_final <- df_final %>% rbind(product_df) - logDebug(paste("Finished processing the following sheet:", sheet_month)) + ParallelLogger::logDebug(paste("Finished processing the following sheet:", sheet_month)) } if (nrow(df_final) > 0) { return(df_final) } else { - logDebug(paste("No product data extracted for the following tracker:", df$file_name[1])) + ParallelLogger::logDebug(paste("No product data extracted for the following tracker:", df$file_name[1])) } } diff --git a/R/sysdata.rda b/R/sysdata.rda index eb303cf804b77f1badf0b2d97168f0a3f6920e66..be6b002cff04d1329d0fdca86a74da91bb6a0978 100644 GIT binary patch delta 19 XcmX>tdRmlAzMF#q445}^9pnN4Eu91g delta 19 XcmX>tdRmlAzMF#q4A?eu9pnN4Evy6y diff --git a/man/process_patient_file.Rd b/man/process_patient_file.Rd new file mode 100644 index 0000000..69c9a62 --- /dev/null +++ b/man/process_patient_file.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper_script_2.R +\name{process_patient_file} +\alias{process_patient_file} +\title{Process and clean patient raw data} +\usage{ +process_patient_file(patient_file, paths, p) +} +\arguments{ +\item{patient_file}{Path to patient raw parquet file from first script.} + +\item{paths}{List with paths to patient_data_cleaned and product_data_cleaned folders.} + +\item{p}{progressor from progressr package.} +} +\description{ +Read in the output of the first script as parquet (all character). +Merge extracted patient data with the metadata schema that contains all +necessary columns. Process each variable in three steps: +\enumerate{ +\item Fix any problems with character representation. +\item Convert column to target data type, for example a date or number. +\item Apply any post-processing logic for the target data type. 
+} +} diff --git a/man/process_product_file.Rd b/man/process_product_file.Rd new file mode 100644 index 0000000..07d01d9 --- /dev/null +++ b/man/process_product_file.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper_script_2.R +\name{process_product_file} +\alias{process_product_file} +\title{Process and clean product raw data} +\usage{ +process_product_file(product_file, paths, p) +} +\arguments{ +\item{product_file}{Path to product raw parquet file from first script.} + +\item{paths}{List with paths to patient_data_cleaned and product_data_cleaned folders.} + +\item{p}{progressor from progressr package.} +} +\description{ +Read the raw product parquet file produced by the first script, clean it with reading_product_data_step2, and export the result as parquet. +} diff --git a/man/process_tracker_file.Rd b/man/process_tracker_file.Rd index b9728ae..2453d08 100644 --- a/man/process_tracker_file.Rd +++ b/man/process_tracker_file.Rd @@ -4,14 +4,14 @@ \alias{process_tracker_file} \title{Process a single tracker file and extract patient and product data.} \usage{ -process_tracker_file(paths, tracker_file, tracker_name) +process_tracker_file(tracker_file, paths, p) } \arguments{ -\item{paths}{a list with the paths to the tracker root dir, the patient and product output dir and the root output dir.} +\item{tracker_file}{Filename of the tracker.} -\item{tracker_file}{filename.} +\item{paths}{a list with the paths to the tracker root dir, the patient and product output dir and the root output dir.} -\item{tracker_name}{filename without extension.} +\item{p}{progressor from progressr package.} } \description{ Process a single tracker file and extract patient and product data. diff --git a/reference_data/build_package_data.R b/reference_data/build_package_data.R index 3d34392..7d3dc8c 100644 --- a/reference_data/build_package_data.R +++ b/reference_data/build_package_data.R @@ -1,5 +1,5 @@ cleaning_config <- yaml::read_yaml(here::here("reference_data", "data_cleaning.yaml")) -allowed_provinces <- yaml::read_yaml(here::here("reference_data", "provinces", "allowed_provinces.yaml")) %>% unlist() +allowed_provinces <- yaml::read_yaml(here::here("reference_data", "provinces", "allowed_provinces.yaml")) |> unlist() for (i in seq_along(cleaning_config$province$steps)) { if (cleaning_config$province$steps[[i]]$type == "allowed_values") { diff --git a/scripts/run_script_1_extract_raw_data.R b/scripts/run_script_1_extract_raw_data.R index d4961ca..696e610 100644 --- a/scripts/run_script_1_extract_raw_data.R +++ b/scripts/run_script_1_extract_raw_data.R @@ -1,11 +1,11 @@ options(readxl.show_progress = FALSE) -future::plan(future::multisession, workers = 4) +future::plan("multisession") -Sys.setenv(A4D_DATA_ROOT = "data") paths <- a4d::init_paths(c("patient_data_raw", "product_data_raw"), delete = TRUE) -setup_logger(paths$output_root, "script1") +a4d::setup_logger(paths$output_root, "script1") tracker_files <- a4d::get_files(paths$tracker_root) -logInfo( + +ParallelLogger::logInfo( "Found ", length(tracker_files), " xlsx files under ", @@ -13,7 +13,7 @@ "." 
) -logInfo("Start processing tracker files.") +ParallelLogger::logInfo("Start processing tracker files.") progressr::with_progress({ p <- progressr::progressor(steps = length(tracker_files)) @@ -21,11 +21,11 @@ progressr::with_progress({ result <- furrr::future_map( tracker_files, a4d::process_tracker_file, - paths=paths, - p=p + paths = paths, + p = p ) }) -logInfo("Finish processing all tracker files.") +ParallelLogger::logInfo("Finish processing all tracker files.") -clearLoggers() +ParallelLogger::clearLoggers() diff --git a/scripts/run_script_2_clean_data.R b/scripts/run_script_2_clean_data.R index e20727a..03c36da 100644 --- a/scripts/run_script_2_clean_data.R +++ b/scripts/run_script_2_clean_data.R @@ -1,371 +1,53 @@ options(readxl.show_progress = FALSE) - -ERROR_VAL_NUMERIC <<- 999999 -ERROR_VAL_CHARACTER <<- "Undefined" -ERROR_VAL_DATE <<- "9999-09-09" - -main <- function() { - paths <- init_paths(c("patient_data_cleaned", "product_data_cleaned"), delete = TRUE) - setup_logger(paths$output_root, "script2") - patient_data_files <- get_files(paths$tracker_root, pattern = "patient_raw.parquet$") - product_data_files <- get_files(paths$tracker_root, pattern = "product_raw.parquet$") - logInfo( - "Found ", - length(patient_data_files), - " patient csv files under ", - paths$tracker_root, - "." - ) - logInfo( - "Found ", - length(product_data_files), - " product csv files under ", - paths$tracker_root, - "." - ) - - logInfo("Start processing patient csv files.") - - for (i in seq_along(patient_data_files)) { - patient_file <- patient_data_files[i] - patient_file_name <- tools::file_path_sans_ext(basename(patient_file)) - logfile <- paste0(patient_file_name) - with_file_logger(logfile, - { - tryCatch( - process_patient_file(paths, patient_file, patient_file_name, paths$patient_data_cleaned), - error = function(e) { - logError("Could not process raw patient data. Error = ", e$message, ".") - }, - warning = function(w) { - logWarn("Could not process raw patient data. Warning = ", w$message, ".") - } - ) - }, - output_root = paths$output_root - ) - cat(paste("Processed ", i, " of ", length(patient_data_files), " (", round(i / length(patient_data_files) * 100, 0), "%) patient files.\n")) - } - - logInfo("Finish processing all patient csv files.") - - logDebug("Start processing product csv files.") - synonyms <- get_synonyms() - synonyms_product <- synonyms$product - - for (i in seq_along(product_data_files)) { - product_file <- product_data_files[i] - product_file_name <- tools::file_path_sans_ext(basename(product_file)) - logfile <- paste0(product_file_name) - - with_file_logger(logfile, - { - tryCatch( - process_product_file(paths, product_file, product_file_name, synonyms_product, paths$product_data_cleaned), - error = function(e) { - logError("Could not process raw product data. Error = ", e$message, ".") - }, - warning = function(w) { - logWarn("Could not process raw product data. 
Warning = ", w$message, ".") - } - ) - }, - output_root = paths$output_root - ) - cat(paste("Processed ", i, " of ", length(product_data_files), " (", round(i / length(product_data_files) * 100, 0), "%) product files.\n")) - } - - logInfo("Finish processing all csv files.") -} - - -process_patient_file <- function(paths, patient_file, patient_file_name, output_root) { - patient_file_path <- - file.path(paths$tracker_root, patient_file) - logDebug("Start process_patient_file.") - logInfo( - "Current file: ", - patient_file_name - ) - - allowed_provinces <- get_allowed_provinces() - - df_patient_raw <- arrow::read_parquet(patient_file_path) - - # filter all rows with no patient id or patient name - df_patient_raw <- df_patient_raw %>% - dplyr::filter(!(is.na(id) & is.na(name))) %>% - dplyr::filter(!(id == "0" & name == "0")) - - # --- TRANSFORMATIONS --- - # data before 2019 had only one column for updated hba1c and fbg - # with date as part of the value - if (!"hba1c_updated_date" %in% colnames(df_patient_raw) && "hba1c_updated" %in% colnames(df_patient_raw)) { - logInfo("Column updated_hba1c_date not found. Trying to parse from hba1c_updated.") - df_patient_raw <- - extract_date_from_measurement(df_patient_raw, "hba1c_updated") - logDebug("Finished parsing dates from hba1c_updated.") - } - - if (!"fbg_updated_date" %in% colnames(df_patient_raw) && "fbg_updated_mg" %in% colnames(df_patient_raw)) { - logInfo("Column updated_fbg_date not found. Trying to parse from fbg_updated_mg.") - df_patient_raw <- - extract_date_from_measurement(df_patient_raw, "fbg_updated_mg") - logDebug("Finished parsing dates from fbg_updated_mg.") - } - - if (!"fbg_updated_date" %in% colnames(df_patient_raw) && "fbg_updated_mmol" %in% colnames(df_patient_raw)) { - logInfo("Column fbg_updated_date not found. Trying to parse from fbg_updated_mmol.") - df_patient_raw <- - extract_date_from_measurement(df_patient_raw, "fbg_updated_mmol") - logDebug("Finished parsing dates from fbg_updated_mmol.") - } - - # blood pressure is given as sys/dias value pair, - # so we split this column in two separate columns - if ("blood_pressure_mmhg" %in% colnames(df_patient_raw)) { - df_patient_raw <- split_bp_in_sys_and_dias(df_patient_raw) - } - - # The maximum value available for hba1c will be around 14% - 18%, - # depending on the equipment being used. - # If the reading is above the maximum available value the > sign is used - - # we would prefer to retain this character in the database as it is important for data analysis. 
- logInfo("Adding columns hba1c_baseline_exceeds and hba1c_updated_exceeds.") - df_patient_raw <- df_patient_raw %>% - dplyr::mutate( - hba1c_baseline_exceeds = ifelse(grepl(">|<", hba1c_baseline), TRUE, FALSE), - hba1c_updated_exceeds = ifelse(grepl(">|<", hba1c_updated), TRUE, FALSE) - ) - - # --- META SCHEMA --- - # meta schema has all final columns for the database - # along with their corresponding data types - logInfo("Creating meta schema.") - # short type string for read_csv: - # iiinDccDcnnDnncnlnlDncDccDDDccccDccccciDciiiDn - schema <- tibble::tibble( - # clinic_visit = logical(), - # complication_screening = character(), - # complication_screening_date = lubridate::as_date(1), - # complication_screening_results = character(), - # dm_complication_comment = character(), # TODO - # dm_complication_eye = character(), # TODO - # dm_complication_kidney = character(), # TODO - # dm_complication_other = character(), # TODO - # est_strips_pmonth = integer(), - # family_support_scale = character(), # TODO - # inactive_reason = character(), - # insulin_dosage = character(), - # meter_received_date = lubridate::as_date(1), # TODO - # remarks = character(), - # remote_followup = logical(), - # additional_support = character(), - age = integer(), - blood_pressure_dias_mmhg = integer(), - blood_pressure_sys_mmhg = integer(), - bmi = numeric(), - bmi_date = lubridate::as_date(1), - clinic_code = character(), - country_code = character(), - dob = lubridate::as_date(1), - edu_occ = character(), - fbg_baseline_mg = numeric(), - fbg_baseline_mmol = numeric(), - fbg_updated_date = lubridate::as_date(1), - fbg_updated_mg = numeric(), - fbg_updated_mmol = numeric(), - file_name = character(), - hba1c_baseline = numeric(), - hba1c_baseline_exceeds = logical(), - hba1c_updated = numeric(), - hba1c_updated_exceeds = logical(), - hba1c_updated_date = lubridate::as_date(1), - height = numeric(), - hospitalisation_cause = character(), - hospitalisation_date = lubridate::as_date(1), - id = character(), - insulin_regimen = character(), - last_clinic_visit_date = lubridate::as_date(1), - last_remote_followup_date = lubridate::as_date(1), - lost_date = lubridate::as_date(1), - name = character(), - observations = character(), - observations_category = character(), - province = character(), - recruitment_date = lubridate::as_date(1), - sex = character(), - sheet_name = character(), - status = character(), - status_out = character(), - support_from_a4d = character(), - t1d_diagnosis_age = integer(), - t1d_diagnosis_date = lubridate::as_date(1), - t1d_diagnosis_with_dka = character(), - testing_frequency = integer(), - tracker_date = lubridate::as_date(1), - tracker_month = integer(), - tracker_year = integer(), - updated_2022_date = lubridate::as_date(1), - weight = numeric() - ) - - cols_extra <- colnames(df_patient_raw)[!colnames(df_patient_raw) %in% colnames(schema)] - logWarn("Extra columns in patient data: ", paste(cols_extra, collapse = ", ")) - - cols_missing <- - colnames(schema)[!colnames(schema) %in% colnames(df_patient_raw)] - logWarn("Missing columns in patient data: ", paste(cols_missing, collapse = ", ")) - - # add all columns of schema to df_patient_raw - # keep all rows, only append missing cols - logInfo("Merging df_patient with meta schema and selecting all columns of meta schema.") - df_patient <- merge.default(df_patient_raw, schema, all.x = T) - df_patient <- df_patient[colnames(schema)] - - # the cleaning, fixing and validating happens in three major steps: - # 1. 
make sure we fix any known problems in the raw character columns - df_patient <- - df_patient %>% - dplyr::rowwise() %>% - # 1. handle known problems before converting to target type - dplyr::mutate( - t1d_diagnosis_age = fix_t1d_diagnosis_age(t1d_diagnosis_age, id), - hba1c_baseline = stringr::str_replace(hba1c_baseline, "<|>", ""), - hba1c_updated = stringr::str_replace(hba1c_updated, "<|>", ""), - fbg_baseline_mg = fix_fbg(fbg_baseline_mg), - fbg_baseline_mmol = fix_fbg(fbg_baseline_mmol), - fbg_updated_mg = fix_fbg(fbg_updated_mg), - fbg_updated_mmol = fix_fbg(fbg_updated_mmol), - testing_frequency = fix_testing_frequency(testing_frequency) - ) %>% - # 2. convert the refined character columns into the target data type - dplyr::mutate( - dplyr::across( - schema %>% dplyr::select(tidyselect::where(is.numeric)) %>% names(), - \(x) convert_to(correct_decimal_sign(x), as.numeric, ERROR_VAL_NUMERIC, dplyr::cur_column(), id = id) - ), - dplyr::across( - schema %>% dplyr::select(tidyselect::where(is.logical)) %>% names(), - \(x) convert_to(x, as.logical, FALSE, dplyr::cur_column(), id = id) - ), - dplyr::across( - schema %>% dplyr::select(tidyselect::where(lubridate::is.Date)) %>% names(), - \(x) convert_to(fix_digit_date(x), parse_dates, as.Date(ERROR_VAL_DATE), dplyr::cur_column(), id = id) - ), - dplyr::across( - schema %>% dplyr::select(tidyselect::where(is.integer)) %>% names(), - \(x) convert_to(x, function(x) as.integer(round(as.double(x))), ERROR_VAL_NUMERIC, dplyr::cur_column(), id = id) - ) - ) %>% - # 3. fix remaining problems in the target data type - dplyr::mutate( - # height and weight are needed to calculate bmi - height = transform_cm_to_m(height) %>% - cut_numeric_value(min = 0, max = 2.3, col_name = "height"), - weight = cut_numeric_value(weight, min = 0, max = 200, col_name = "weight"), - bmi = fix_bmi(weight, height, id) %>% - cut_numeric_value(min = 4, max = 60, "bmi"), - age = fix_age(age, dob, tracker_year, tracker_month, id) %>% - cut_numeric_value(min = 0, max = 25, "age"), - sex = fix_sex(sex, id), - hba1c_baseline = cut_numeric_value(hba1c_baseline, min = 4, max = 18, "hba1c_baseline"), - # https://www.cleveland19.com/story/1425584/ohio-man-holds-world-record-of-highest-blood-sugar/ - fbg_baseline_mmol = cut_numeric_value(fbg_baseline_mmol, min = 0, max = 136.5, "fbg_baseline_mmol"), - # https://www.cleveland19.com/story/1425584/ohio-man-holds-world-record-of-highest-blood-sugar/ - fbg_updated_mmol = cut_numeric_value(fbg_updated_mmol, min = 0, max = 136.5, "fbg_updated_mmol"), - blood_pressure_sys_mmhg = cut_numeric_value(blood_pressure_sys_mmhg, min = 20, max = 250, "blood_pressure_sys_mmhg"), - blood_pressure_dias_mmhg = cut_numeric_value(blood_pressure_dias_mmhg, min = 20, max = 220, "blood_pressure_dias_mmhg"), - tracker_date = lubridate::ym(paste(tracker_year, tracker_month, sep="-")), - !!!parse_character_cleaning_config(a4d:::config$cleaning), - # should be fixed last as other fix functions use id to log invalid rows! 
-      id = fix_id(id)
-    ) %>%
-    dplyr::ungroup()
-
-  # add clinic and country code after having fixed all issues with patient id
-  cc_codes <- extract_country_clinic_code(df_patient)
-  df_patient["clinic_code"] <- cc_codes$clinic_code
-  df_patient["country_code"] <- cc_codes$country_code
-
-  # Formula to calculate mmol/l from mg/dl: mmol/l = mg/dl / 18
-  if (all(is.na(df_patient$fbg_baseline_mmol))) {
-    df_patient <- df_patient %>%
-      dplyr::mutate(fbg_baseline_mmol = dplyr::case_when(
-        fbg_baseline_mg != ERROR_VAL_NUMERIC ~ fbg_baseline_mg / 18
-      ))
-  }
-  if (all(is.na(df_patient$fbg_updated_mmol))) {
-    df_patient <- df_patient %>%
-      dplyr::mutate(fbg_updated_mmol = dplyr::case_when(
-        fbg_updated_mg != ERROR_VAL_NUMERIC ~ fbg_updated_mg / 18
-      ))
-  }
-
-  # Formula to calculate mg/dl from mmol/l: mg/dl = 18 × mmol/l
-  if (all(is.na(df_patient$fbg_baseline_mg))) {
-    df_patient <- df_patient %>%
-      dplyr::mutate(fbg_baseline_mg = dplyr::case_when(
-        fbg_baseline_mmol != ERROR_VAL_NUMERIC ~ fbg_baseline_mmol * 18
-      ))
-  }
-  if (all(is.na(df_patient$fbg_updated_mg))) {
-    df_patient <- df_patient %>%
-      dplyr::mutate(fbg_updated_mg = dplyr::case_when(
-        fbg_updated_mmol != ERROR_VAL_NUMERIC ~ fbg_updated_mmol * 18
-      ))
-  }
-
-  # sort by year and month like it is in the tracker files
-  df_patient <- df_patient %>%
-    dplyr::arrange(tracker_date, id)
-
-  logDebug(
-    "df_patient dim: ",
-    dim(df_patient) %>% as.data.frame(),
-    "."
-  )
-
-  export_data_as_parquet(
-    data = df_patient,
-    filename = stringr::str_replace(patient_file_name, "_patient_raw", ""),
-    output_root = output_root,
-    suffix = "_patient_cleaned"
-  )
-
-  logInfo("Finish process_patient_file.")
-}
-
-
-process_product_file <- function(paths, product_file, product_file_name, synonyms_product, output_root) {
-  product_file_path <-
-    file.path(paths$tracker_root, product_file)
-  logDebug("Start process_product_file.")
-  logInfo(
-    "Current file: ",
-    product_file_name
-  )
-
-  df_product_raw <- arrow::read_parquet(product_file_path)
-
-  df_product_raw <- reading_product_data_step2(df_product_raw, synonyms_product)
-
-  logDebug(
-    "df_product_raw dim: ",
-    dim(df_product_raw) %>% as.data.frame(),
-    "."
-  )
-
-  export_data_as_parquet(
-    data = df_product_raw,
-    filename = stringr::str_replace(product_file_name, "_product_raw", ""),
-    output_root = output_root,
-    suffix = "_product_cleaned"
-  )
-
-  logInfo("Finish process_product_file.")
-}
-
-main()
-
-clearLoggers()
+future::plan("multisession")
+
+paths <- a4d::init_paths(c("patient_data_cleaned", "product_data_cleaned"), delete = TRUE)
+a4d::setup_logger(paths$output_root, "script2")
+patient_data_files <- a4d::get_files(paths$tracker_root, pattern = "patient_raw.parquet$")
+product_data_files <- a4d::get_files(paths$tracker_root, pattern = "product_raw.parquet$")
+ParallelLogger::logInfo(
+    "Found ",
+    length(patient_data_files),
+    " patient parquet files under ",
+    paths$tracker_root,
+    "."
+)
+ParallelLogger::logInfo(
+    "Found ",
+    length(product_data_files),
+    " product parquet files under ",
+    paths$tracker_root,
+    "."
+)
+
+ParallelLogger::logInfo("Start processing patient parquet files.")
+
+progressr::with_progress({
+    p <- progressr::progressor(steps = length(patient_data_files))
+
+    result <- furrr::future_map(
+        patient_data_files,
+        a4d::process_patient_file,
+        paths = paths,
+        p = p
+    )
+})
+
+ParallelLogger::logInfo("Finish processing all patient parquet files.")
+
+ParallelLogger::logDebug("Start processing product parquet files.")
+
+progressr::with_progress({
+    p <- progressr::progressor(steps = length(product_data_files))
+
+    result <- furrr::future_map(
+        product_data_files,
+        a4d::process_product_file,
+        paths = paths,
+        p = p
+    )
+})
+
+ParallelLogger::logInfo("Finish processing all parquet files.")
+
+ParallelLogger::clearLoggers()
diff --git a/scripts/run_script_3_create_tables.R b/scripts/run_script_3_create_tables.R
index 7962867..4cd6216 100644
--- a/scripts/run_script_3_create_tables.R
+++ b/scripts/run_script_3_create_tables.R
@@ -9,14 +9,14 @@ main <- function() {
   setup_logger(paths$output_root, "script3")
   patient_data_files <- get_files(file.path(paths$output_root, "patient_data_cleaned"), pattern = "\\.parquet$")
   product_data_files <- get_files(file.path(paths$output_root, "product_data_cleaned"), pattern = "\\.parquet$")
-  logInfo(
+  ParallelLogger::logInfo(
     "Found ",
     length(patient_data_files),
     " patient csv files under ",
     paths$tracker_root,
     "."
   )
-  logInfo(
+  ParallelLogger::logInfo(
     "Found ",
     length(product_data_files),
     " product csv files under ",
@@ -24,7 +24,7 @@ main <- function() {
     "."
   )

-  logInfo("Start creating table csv files.")
+  ParallelLogger::logInfo("Start creating table csv files.")

   logfile <- "table_patient_data_static"
   with_file_logger(logfile,
@@ -34,10 +34,10 @@
         create_table_patient_data_static(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables)
       },
       error = function(e) {
-        logError("Could not create table csv for static patient data. Error: ", e$message)
+        ParallelLogger::logError("Could not create table csv for static patient data. Error: ", e$message)
       },
       warning = function(w) {
-        logWarn("Could not create table csv for static patient data. Error: ", w$message)
+        ParallelLogger::logWarn("Could not create table csv for static patient data. Error: ", w$message)
       }
     )
   },
@@ -52,10 +52,10 @@
         create_table_patient_data_monthly(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables)
       },
       error = function(e) {
-        logError("Could not create table csv for monthly patient data. Error: ", e$message)
+        ParallelLogger::logError("Could not create table csv for monthly patient data. Error: ", e$message)
       },
       warning = function(w) {
-        logWarn("Could not create table csv for monthly patient data. Error: ", w$message)
+        ParallelLogger::logWarn("Could not create table csv for monthly patient data. Error: ", w$message)
       }
     )
   },
@@ -76,10 +76,10 @@
       )
     },
     error = function(e) {
-      logError("Could not create table csv for longitudinal patient data. Error: ", e$message)
+      ParallelLogger::logError("Could not create table csv for longitudinal patient data. Error: ", e$message)
    },
     warning = function(w) {
-      logWarn("Could not create table csv for longitudinal patient data. Error: ", w$message)
+      ParallelLogger::logWarn("Could not create table csv for longitudinal patient data.
Error: ", w$message) } ) }, @@ -94,19 +94,19 @@ main <- function() { create_table_product_data(file.path(paths$output_root, "product_data_cleaned"), paths$tables) }, error = function(e) { - logError("Could not create table for product data. Error: ", e$message) + ParallelLogger::logError("Could not create table for product data. Error: ", e$message) }, warning = function(w) { - logWarn("Could not create table for product data. Warning: ", w$message) + ParallelLogger::logWarn("Could not create table for product data. Warning: ", w$message) } ) }, output_root = paths$output_root ) - logInfo("Finish creating table files.") + ParallelLogger::logInfo("Finish creating table files.") - logInfo("Trying to link files for product and patient data.") + ParallelLogger::logInfo("Trying to link files for product and patient data.") logfile <- "link_product_patient_data" @@ -120,17 +120,17 @@ main <- function() { ) }, error = function(e) { - logError("Could not link files for product and patient data. Error: ", e$message) + ParallelLogger::logError("Could not link files for product and patient data. Error: ", e$message) }, warning = function(w) { - logWarn("Could not link files for product and patient data. Warning: ", w$message) + ParallelLogger::logWarn("Could not link files for product and patient data. Warning: ", w$message) } ) }, output_root = paths$output_root ) - logInfo("Finished linking files for product and patient data.") + ParallelLogger::logInfo("Finished linking files for product and patient data.") } main() From fef6e826915692ae4ff3baba5b88010517469849 Mon Sep 17 00:00:00 2001 From: Michael Aydinbas Date: Sat, 30 Dec 2023 16:05:59 +0100 Subject: [PATCH 4/8] working version for script 1 and 2 --- DESCRIPTION | 5 +- NAMESPACE | 1 + R/create_table_patient_data.R | 6 +- R/create_table_patient_data_changes_only.R | 6 +- R/create_table_patient_data_static.R | 6 +- R/create_table_product_data.R | 43 ++-- R/helper_main.R | 17 +- R/helper_patient_data_fix.R | 38 ++-- R/helper_product_data.R | 12 +- R/helper_read_patient_data.R | 32 +-- R/{helper_scipt_1.R => helper_script_1.R} | 38 ++-- R/helper_script_2.R | 66 +++--- R/link_product_patient.R | 13 +- R/read_cleaned_patient_data.R | 4 +- R/read_patient_data.R | 26 +-- R/read_product_data.R | 60 +++--- man/process_patient_data.Rd | 2 +- man/process_patient_file.Rd | 4 +- man/process_product_data.Rd | 2 +- man/process_product_file.Rd | 4 +- man/process_tracker_file.Rd | 8 +- renv.lock | 92 ++++---- scripts/pre_commit.R | 2 + scripts/run_script_1_extract_raw_data.R | 34 +-- scripts/run_script_2_clean_data.R | 60 ++++-- scripts/run_script_3_create_tables.R | 237 ++++++++++----------- 26 files changed, 430 insertions(+), 388 deletions(-) rename R/{helper_scipt_1.R => helper_script_1.R} (76%) diff --git a/DESCRIPTION b/DESCRIPTION index c273a8f..1ad85e1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,6 +9,7 @@ Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 Depends: + ParallelLogger Imports: data.table, digest, @@ -16,7 +17,6 @@ Imports: here, lubridate, openxlsx, - ParallelLogger, readr, readxl, stringr, @@ -27,7 +27,8 @@ Imports: zoo, arrow, furrr, - progressr + progressr, + tictoc Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index ca93149..298a603 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,6 +14,7 @@ export(fix_id) export(fix_sex) export(fix_t1d_diagnosis_age) export(fix_testing_frequency) +export(get_allowed_provinces) export(get_files) export(get_synonyms) export(get_tracker_year) 
diff --git a/R/create_table_patient_data.R b/R/create_table_patient_data.R index c6793ed..3eec539 100644 --- a/R/create_table_patient_data.R +++ b/R/create_table_patient_data.R @@ -8,8 +8,10 @@ #' @param patient_data_files list of CSV files with cleaned patient data from step 2. #' @param input_root root directory of the input CSV files. #' @param output_root root directory of the output folder. +#' +#' @export create_table_patient_data_monthly <- function(patient_data_files, input_root, output_root) { - ParallelLogger::logInfo("Start creating single csv for table patient_data_monthly.") + logInfo("Start creating single csv for table patient_data_monthly.") # THERE MIGHT BE MONTHLY COLUMNS MISSING - PLEASE ADD THEM dynamic_patient_columns <- @@ -58,5 +60,5 @@ create_table_patient_data_monthly <- function(patient_data_files, input_root, ou suffix = "" ) - ParallelLogger::logInfo("Finish creating single csv for table patient_data_monthly.") + logInfo("Finish creating single csv for table patient_data_monthly.") } diff --git a/R/create_table_patient_data_changes_only.R b/R/create_table_patient_data_changes_only.R index 1a4178c..8c28365 100644 --- a/R/create_table_patient_data_changes_only.R +++ b/R/create_table_patient_data_changes_only.R @@ -10,13 +10,15 @@ #' @param output_root root directory of the output folder. #' @param variable name of the column that should be exported. #' @param name name used to create the export file name. +#' +#' @export create_table_longitudinal_data <- function(patient_data_files, input_root, output_root, variable, name) { - ParallelLogger::logInfo("Start creating single csv for table create_table_longitudinal_data and variable ", variable) + logInfo("Start creating single csv for table create_table_longitudinal_data and variable ", variable) dynamic_patient_columns <- c( @@ -79,5 +81,5 @@ create_table_longitudinal_data <- suffix = "" ) - ParallelLogger::logInfo("Finish creating single csv for table create_table_longitudinal_data and variable ", variable) + logInfo("Finish creating single csv for table create_table_longitudinal_data and variable ", variable) } diff --git a/R/create_table_patient_data_static.R b/R/create_table_patient_data_static.R index 55c8c1d..cd4e56b 100644 --- a/R/create_table_patient_data_static.R +++ b/R/create_table_patient_data_static.R @@ -8,8 +8,10 @@ #' @param patient_data_files list of CSV files with cleaned patient data from step 2. #' @param input_root root directory of the input CSV files. #' @param output_root root directory of the output folder. +#' +#' @export create_table_patient_data_static <- function(patient_data_files, input_root, output_root) { - ParallelLogger::logInfo("Start creating single csv for table patient_data_static.") + logInfo("Start creating single csv for table patient_data_static.") # THERE MIGHT BE STATIC COLUMNS MISSING - PLEASE ADD THEM static_patient_columns <- @@ -59,5 +61,5 @@ create_table_patient_data_static <- function(patient_data_files, input_root, out suffix = "_static" ) - ParallelLogger::logInfo("Finish creating single csv for table patient_data_static.") + logInfo("Finish creating single csv for table patient_data_static.") } diff --git a/R/create_table_product_data.R b/R/create_table_product_data.R index 4ee7eaa..e88b09d 100644 --- a/R/create_table_product_data.R +++ b/R/create_table_product_data.R @@ -11,13 +11,14 @@ #' #' @return This function does not return a value. 
It writes the merged data to a new CSV file
 #' (with reordered columns according to the list of fields) in the output_root directory.
+#' @export
 #'
 #' @examples
 #' \dontrun{
 #' create_table_product_data("path/to/input/directory", "path/to/output/directory")
 #' }
 create_table_product_data <- function(input_root, output_root) {
-  ParallelLogger::logInfo("Start creating single file for table product_data.")
+  logInfo("Start creating single file for table product_data.")

   # Get a list of all CSV files in the input_root directory
   files <- list.files(input_root, pattern = "*.parquet", full.names = TRUE)
@@ -25,7 +26,7 @@ create_table_product_data <- function(input_root, output_root) {
   # Read all CSV files and store them in a list
   data_list <- lapply(files, function(x) arrow::read_parquet(x))

-  ParallelLogger::logInfo(length(data_list), " files will be processed for creating the single file for table product_data.")
+  logInfo(length(data_list), " files will be processed for creating the single file for table product_data.")

   # Get the union of all column names
   all_names <- unique(unlist(lapply(data_list, colnames)))
@@ -39,25 +40,25 @@
   # Merge all data frames
   merged_data <- do.call(rbind, data_list)

-  ParallelLogger::logDebug("Copying original parient IDs...")
+  logDebug("Copying original patient IDs...")
   merged_data$orig_product_released_to <- merged_data$product_released_to

-  ParallelLogger::logDebug("Trying to fix patient IDs...")
+  logDebug("Trying to fix patient IDs...")
   merged_data$product_released_to <- sapply(merged_data$product_released_to, fix_id)

-  ParallelLogger::logDebug("Extracting product_county and product_hospisal from patients IDs...")
+  logDebug("Extracting product_country and product_hospital from patient IDs...")
   merged_data <- id_2_county_hospisal(
     merged_data,
     "product_released_to", "product_country", "product_hospital"
   )

-  ParallelLogger::logDebug("Calculating the most frequent 'product_hospital' for each 'file_name'...")
+  logDebug("Calculating the most frequent 'product_hospital' for each 'file_name'...")
   tryCatch(
     {
       merged_data <- calculate_most_frequent(merged_data, "file_name", "product_hospital", "table_hospital")
     },
     error = function(e) {
-      ParallelLogger::logError("Error in calculating the most frequent 'product_hospital': ", e)
+      logError("Error in calculating the most frequent 'product_hospital': ", e)
     }
   )

@@ -67,14 +68,14 @@
       merged_data <- calculate_most_frequent(merged_data, "file_name", "product_country", "table_country")
     },
     error = function(e) {
-      ParallelLogger::logError("Error in calculating the most frequent 'product_country': ", e)
+      logError("Error in calculating the most frequent 'product_country': ", e)
     }
   )

   # Reorder, add, and ensures the correct data type for each column according to the list of fields
   merged_data <- preparing_product_fields(merged_data)

-  ParallelLogger::logDebug("Checking 'table_country' for each 'file_name'...")
+  logDebug("Checking 'table_country' for each 'file_name'...")
   report_empty_intersections(merged_data, "file_name", "table_country")

   # Write the merged and processed data to a file in the output_root directory
@@ -85,7 +86,7 @@
     suffix = ""
   )

-  ParallelLogger::logInfo("Finish creating single file for table product_data.")
+  logInfo("Finish creating single file for table product_data.")
 }


@@ -168,7 +169,7 @@ preparing_product_fields <-
function(merged_data) {
     "table_hospital" = "character"
   )

-  ParallelLogger::logInfo("Start processing fields for the single csv product_data...")
+  logInfo("Start processing fields for the single csv product_data...")

   # Check if all fields are present in merged_data
   missing_fields <- setdiff(names(fields), names(merged_data))
@@ -186,7 +187,7 @@ preparing_product_fields <- function(merged_data) {
         merged_data[[field]] <- suppressWarnings(as.Date(original_values))
         incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values))
         if (length(incorrect_rows) > 0) {
-          ParallelLogger::logWarn(paste(
+          logWarn(paste(
             "In", field, "incorrect date values were replaced with",
             ERROR_VAL_DATE, "in", length(incorrect_rows), "rows:",
             paste(incorrect_rows, collapse = ", ")
@@ -198,7 +199,7 @@
         merged_data[[field]] <- suppressWarnings(as.numeric(original_values))
         incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values))
         if (length(incorrect_rows) > 0) {
-          ParallelLogger::logWarn(paste(
+          logWarn(paste(
             "In", field, "incorrect numeric values were replaced with",
             ERROR_VAL_NUMERIC, "in", length(incorrect_rows), "rows:",
             paste(incorrect_rows, collapse = ", ")
@@ -210,7 +211,7 @@
         merged_data[[field]] <- suppressWarnings(as.integer(original_values))
         incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values))
         if (length(incorrect_rows) > 0) {
-          ParallelLogger::logWarn(paste(
+          logWarn(paste(
             "In", field, "incorrect integer values were replaced with",
             ERROR_VAL_NUMERIC, "in", length(incorrect_rows), "rows:",
             paste(incorrect_rows, collapse = ", ")
@@ -222,7 +223,7 @@
         merged_data[[field]] <- as.character(original_values)
         incorrect_rows <- which(is.na(merged_data[[field]]) & !is.na(original_values))
         if (length(incorrect_rows) > 0) {
-          ParallelLogger::logWarn(paste(
+          logWarn(paste(
             "In", field, "incorrect character values were replaced with",
             ERROR_VAL_CHARACTER, "in", length(incorrect_rows), "rows:",
             paste(incorrect_rows, collapse = ", ")
@@ -231,19 +232,19 @@
       }
     }, warning = function(w) {
-      ParallelLogger::logError(paste("Warning in converting", field, ": ", w))
+      logWarn(paste("Warning in converting", field, ": ", w))
     }, error = function(e) {
-      ParallelLogger::logWarn(paste("Error in converting", field, ": ", e))
+      logError(paste("Error in converting", field, ": ", e))
     }, finally = {
-      ParallelLogger::logDebug(paste("Finished converting", field))
+      logDebug(paste("Finished converting", field))
     })
   }

   # Reorder the columns according to the list of fields
-  ParallelLogger::logInfo("Reorder the columns according to the list of fields...")
+  logInfo("Reorder the columns according to the list of fields...")
   merged_data <- merged_data[, c(names(fields), setdiff(names(merged_data), names(fields)))]

-  ParallelLogger::logInfo("Finished processing fields for the single csv product_data.")
+  logInfo("Finished processing fields for the single csv product_data.")

   return(merged_data)
 }
@@ -313,7 +314,7 @@ report_empty_intersections <- function(df, row_category, col_category) {
   df_row_sums <- df_row_sums[df_row_sums$sum == 0, ]

   if (nrow(df_row_sums) > 0) {
-    ParallelLogger::logWarn(
+    logWarn(
       "The number of ", row_category, " with empty ", col_category, " is ",
       nrow(df_row_sums), ": ",
       paste(df_row_sums$row_name, sep = "", collapse = ", ")
diff --git a/R/helper_main.R
b/R/helper_main.R index b5cd0a1..97bd8c0 100644 --- a/R/helper_main.R +++ b/R/helper_main.R @@ -139,7 +139,7 @@ read_column_synonyms <- function(synonym_file, path_prefixes = c("reference_data #' ) #' } export_data <- function(data, filename, output_root, suffix) { - ParallelLogger::logDebug("Start export_data. Suffix = ", suffix, ".") + logDebug("Start export_data. Suffix = ", suffix, ".") data %>% write.csv( file = @@ -156,7 +156,7 @@ export_data <- function(data, filename, output_root, suffix) { fileEncoding = "UTF-16LE", quote = T ) - ParallelLogger::logInfo("Finish export_data. Suffix = ", suffix, ".") + logInfo("Finish export_data. Suffix = ", suffix, ".") } @@ -177,12 +177,12 @@ export_data <- function(data, filename, output_root, suffix) { #' ) #' } export_data_as_parquet <- function(data, filename, output_root, suffix) { - ParallelLogger::logDebug("Start export_data. Suffix = ", suffix, ".") + logDebug("Start export_data. Suffix = ", suffix, ".") data %>% arrow::write_parquet( sink = file.path(output_root, paste0(filename, suffix, ".parquet")), ) - ParallelLogger::logInfo("Finish export_data. Suffix = ", suffix, ".") + logInfo("Finish export_data. Suffix = ", suffix, ".") } @@ -193,7 +193,7 @@ export_data_as_parquet <- function(data, filename, output_root, suffix) { #' #' @return tibble with patient data read_raw_csv <- function(file) { - ParallelLogger::logDebug("Start reading data with read_csv.") + logDebug("Start reading data with read_csv.") df_patient_raw <- readr::read_csv( file, name_repair = "check_unique", @@ -202,9 +202,9 @@ read_raw_csv <- function(file) { col_types = readr::cols(.default = "c"), locale = readr::locale(encoding = "UTF-16LE") ) - ParallelLogger::logDebug("Finished loading data with read_csv.") - ParallelLogger::logInfo("Dim: ", dim(df_patient_raw)) - ParallelLogger::logInfo("Columns: ", spec(df_patient_raw)) + logDebug("Finished loading data with read_csv.") + logInfo("Dim: ", dim(df_patient_raw)) + logInfo("Columns: ", spec(df_patient_raw)) df_patient_raw } @@ -216,6 +216,7 @@ read_raw_csv <- function(file) { #' Read in all provinces from a YAML file inside the provinces folder. #' #' @return A named character vector with all allowed provinces. +#' @export get_allowed_provinces <- function() { ## Should new countries and provinces be added, update the YAML file provinces <- yaml::read_yaml("reference_data/provinces/allowed_provinces.yaml") |> unlist() diff --git a/R/helper_patient_data_fix.R b/R/helper_patient_data_fix.R index 5b10d72..05bb168 100644 --- a/R/helper_patient_data_fix.R +++ b/R/helper_patient_data_fix.R @@ -24,11 +24,11 @@ convert_to <- function(x, cast_fnc, error_val, col_name = "", id = "") { x <- tryCatch( cast_fnc(x), error = function(e) { - ParallelLogger::logError("Could not convert value ", x, " in column ", col_name, " for patient: ", id) + logError("Could not convert value ", x, " in column ", col_name, " for patient: ", id) x <- error_val }, warning = function(w) { - ParallelLogger::logWarn("Could not convert value ", x, " in column ", col_name, " for patient: ", id) + logWarn("Could not convert value ", x, " in column ", col_name, " for patient: ", id) x <- error_val } ) @@ -55,7 +55,7 @@ cut_numeric_value <- function(x, } if (x < min || x > max) { - ParallelLogger::logWarn( + logWarn( "Found invalid value ", x, " for column ", col_name, " outside [", min, ", ", max, "]. ", "Value was replaced with ", ERROR_VAL_NUMERIC, "." 
) @@ -154,7 +154,7 @@ parse_dates <- function(date) { parsed_date <- suppressWarnings(lubridate::as_date(date)) if (is.na(parsed_date)) { - ParallelLogger::logWarn( + logWarn( "Could not parse date value ", date, ". ", "Trying to parse with lubridate::parse_date_time and orders = c('dmy', 'dmY', 'by', 'bY')." ) @@ -195,9 +195,9 @@ check_allowed_values <- function(x, valid_values, id, replace_invalid = TRUE, er valid_value_mapping <- setNames(as.list(valid_values), sanitize_str(valid_values)) if (!sanitize_str(x) %in% names(valid_value_mapping)) { - ParallelLogger::logWarn("Patient ", id, ": Value ", x, " for column ", col, " is not in the list of allowed values. ") + logWarn("Patient ", id, ": Value ", x, " for column ", col, " is not in the list of allowed values. ") if (replace_invalid) { - ParallelLogger::logInfo("Replacing ", x, " with ", error_val, ".") + logInfo("Replacing ", x, " with ", error_val, ".") return(error_val) } else { return(x) @@ -275,13 +275,13 @@ fix_age <- function(age, dob, tracker_year, tracker_month, id) { } if (is.na(age)) { - ParallelLogger::logWarn( + logWarn( "Patient ", id, ": age is missing. Using calculated age ", calc_age, " instead of original age." ) } else { if (calc_age != age) { - ParallelLogger::logWarn( + logWarn( "Patient ", id, ": age ", age, " is different from calculated age ", calc_age, ". Using calculated age instead of original age." ) @@ -289,7 +289,7 @@ fix_age <- function(age, dob, tracker_year, tracker_month, id) { } if (calc_age < 0) { - ParallelLogger::logWarn("Patient ", id, ": calculated age is negative. Something went wrong.") + logWarn("Patient ", id, ": calculated age is negative. Something went wrong.") calc_age <- ERROR_VAL_NUMERIC } } @@ -322,11 +322,11 @@ fix_bmi <- function(weight, height, id) { if (!is.na(weight) && weight == ERROR_VAL_CHARACTER) { - ParallelLogger::logWarn("Patient ", id, ": the weight is out of bounds.") + logWarn("Patient ", id, ": the weight is out of bounds.") } if (!is.na(height) && height == ERROR_VAL_CHARACTER) { - ParallelLogger::logWarn("Patient ", id, ": the height is out of bounds.") + logWarn("Patient ", id, ": the height is out of bounds.") } bmi } @@ -353,7 +353,7 @@ fix_sex <- function(sex, id) { ) if (!is.na(fixed_sex) && fixed_sex == ERROR_VAL_CHARACTER) { - ParallelLogger::logWarn("Patient ", id, ": sex ", sex, " is not in the list of synonyms. Replacing it with ", fixed_sex, ".") + logWarn("Patient ", id, ": sex ", sex, " is not in the list of synonyms. Replacing it with ", fixed_sex, ".") } fixed_sex } @@ -463,7 +463,7 @@ fix_testing_frequency <- function(test_frq) { } if (grepl("-", test_frq, fixed = TRUE)) { - ParallelLogger::logInfo("Found a range for testing_frequency. Replacing it with the mean.") + logInfo("Found a range for testing_frequency. Replacing it with the mean.") test_frq <- try(as.character(replace_range_with_mean(test_frq), silent = TRUE)) } @@ -492,7 +492,7 @@ replace_range_with_mean <- function(x) { #' #' @return data frame with two new columns: blood_pressure_sys_mmhg and blood_pressure_dias_mmhg. 
split_bp_in_sys_and_dias <- function(df) { - ParallelLogger::logInfo("Splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") + logInfo("Splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") df <- df %>% dplyr::mutate( blood_pressure_mmhg = dplyr::case_when( @@ -502,7 +502,7 @@ split_bp_in_sys_and_dias <- function(df) { ) if (paste(ERROR_VAL_NUMERIC, ERROR_VAL_NUMERIC, sep = "/") %in% df$blood_pressure_mmhg) { - ParallelLogger::logWarn( + logWarn( "Found invalid values for column blood_pressure_mmhg that do not follow the format X/Y. ", "Values were replaced with ", ERROR_VAL_NUMERIC, "." ) @@ -514,7 +514,7 @@ split_bp_in_sys_and_dias <- function(df) { delim = "/", names = c("blood_pressure_sys_mmhg", "blood_pressure_dias_mmhg"), ) - ParallelLogger::logDebug("Finished splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") + logDebug("Finished splitting blood_pressure_mmhg into blood_pressure_sys_mmhg and blood_pressure_dias_mmhg.") df } @@ -550,12 +550,12 @@ fix_id <- function(id) { id <- stringr::str_replace(id, "-", "_") if (!grepl("^[[:upper:]]{2}_[[:upper:]]{2}[[:digit:]]{3}$", id)) { - ParallelLogger::logWarn("Patient ", id, ": id cannot be matched to a 7 letter alpha numeric code like XX_YY001. ") + logWarn("Patient ", id, ": id cannot be matched to a 7 letter alpha numeric code like XX_YY001. ") if (stringr::str_length(id) > 8) { - ParallelLogger::logWarn("Patient ", id, ": id was truncated because it is longer than 8 characters.") + logWarn("Patient ", id, ": id was truncated because it is longer than 8 characters.") id <- stringr::str_sub(id, 1, 8) } else { - ParallelLogger::logError("Patient ", id, ": id is not valid.") + logError("Patient ", id, ": id is not valid.") id <- ERROR_VAL_CHARACTER } } diff --git a/R/helper_product_data.R b/R/helper_product_data.R index 3e56f6b..6f04cd7 100644 --- a/R/helper_product_data.R +++ b/R/helper_product_data.R @@ -83,7 +83,7 @@ get_patient_end <- function(df, j) { # @Description: Reads product data from a monthly file based on extraction logic extract_product_data <- function(monthly_tracker_df) { - ParallelLogger::logDebug("Starting extract_product_data.") + logDebug("Starting extract_product_data.") start_df_msd <- NULL end_df_msd <- NULL @@ -103,7 +103,7 @@ extract_product_data <- function(monthly_tracker_df) { # Clean empty remaining first row product_data_df <- set_second_row_as_headers_and_remove_first_row(product_data_df) - ParallelLogger::logDebug("Finish extract_product_data.") + logDebug("Finish extract_product_data.") return(product_data_df) } @@ -134,7 +134,7 @@ extract_product_data <- function(monthly_tracker_df) { # column synonyms to unify column names # @columns_synonyms: Long format output of read_column_synonyms to match columns harmonize_input_data_columns <- function(product_df, columns_synonyms) { - ParallelLogger::logDebug("Start harmonize_input_data_columns.") + logDebug("Start harmonize_input_data_columns.") # In case that there is additional data in strange columns, keep only relevant columns # keep.cols <- names(product_df) %in% c("") @@ -147,14 +147,14 @@ harmonize_input_data_columns <- function(product_df, columns_synonyms) { ## report all column names which have not been found unknown_column_names <- colnames(product_df)[!colnames(product_df) %in% synonym_headers] if (length(unknown_column_names) > 0) { - ParallelLogger::logWarn("Unknown column names in sheet: ", paste(unknown_column_names, collapse = ", ")) 
+ logWarn("Unknown column names in sheet: ", paste(unknown_column_names, collapse = ", ")) } # replacing var codes colnames_found <- match(colnames(product_df), synonym_headers, nomatch = 0) colnames(product_df)[colnames(product_df) %in% synonym_headers] <- columns_synonyms$name_clean[colnames_found] - ParallelLogger::logDebug("Finish harmonize_input_data_columns.") + logDebug("Finish harmonize_input_data_columns.") if (sum(colnames_found == 0) != 0) { "Non-matching column names found (see 0)" # SK: remove non matching column names @@ -282,7 +282,7 @@ update_receivedfrom <- function(product_df) { grepl("Balance", product_units_received, ignore.case = TRUE) & !is.na(product_received_from) ~ product_received_from )) %>% dplyr::mutate(product_units_released = ifelse(!is.na(product_received_from), NA, product_units_released)) - ParallelLogger::logInfo("The rule for the case was applied successfully- Released (product_units_released) column also includes values for Start/End Balance") + logInfo("The rule for the case was applied successfully- Released (product_units_released) column also includes values for Start/End Balance") } return(product_df) } diff --git a/R/helper_read_patient_data.R b/R/helper_read_patient_data.R index 8f61b04..cca5779 100644 --- a/R/helper_read_patient_data.R +++ b/R/helper_read_patient_data.R @@ -1,7 +1,7 @@ # extracting country and clinic code from patient ID # expects that patient ID has a certain format extract_country_clinic_code <- function(patient_data) { - ParallelLogger::logDebug("Start extract_country_clinic_code.") + logDebug("Start extract_country_clinic_code.") patient_ids <- patient_data["id"] %>% dplyr::filter(id != "0") %>% tidyr::drop_na() %>% @@ -19,9 +19,9 @@ extract_country_clinic_code <- function(patient_data) { clinic_code <- names(sort(table(patient_ids$clinic), decreasing = T))[1] - ParallelLogger::logDebug("country_code = ", country_code, ".") - ParallelLogger::logDebug("clinic_code = ", clinic_code, ".") - ParallelLogger::logDebug("Finish extract_country_clinic_code.") + logDebug("country_code = ", country_code, ".") + logDebug("clinic_code = ", clinic_code, ".") + logDebug("Finish extract_country_clinic_code.") return(list("country_code" = country_code, "clinic_code" = clinic_code)) } @@ -40,9 +40,9 @@ extract_country_clinic_code <- function(patient_data) { #' @return data.frame with the patient data #' @export extract_patient_data <- function(tracker_data_file, sheet, year) { - ParallelLogger::logDebug("Start extract_patient_data for sheet = ", sheet, ".") + logDebug("Start extract_patient_data for sheet = ", sheet, ".") - ParallelLogger::logDebug("Start openxlsx::read.xlsx to get tracker_data.") + logDebug("Start openxlsx::read.xlsx to get tracker_data.") tracker_data <- openxlsx::read.xlsx( xlsxFile = tracker_data_file, sheet = sheet, @@ -59,7 +59,7 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { # col_names = F, # .name_repair = "unique_quiet" # ) - ParallelLogger::logDebug("Finish openxlsx::read.xlsx.") + logDebug("Finish openxlsx::read.xlsx.") # Assumption: first column is always empty until patient data begins patient_data_range <- which(!is.na(tracker_data[, 1])) @@ -80,8 +80,8 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { row_max <- row_max + 1 } - ParallelLogger::logInfo("Patient data found in rows ", row_min, " to ", row_max, ".") - ParallelLogger::logDebug("Start readxl::read_excel to get patient data.") + logInfo("Patient data found in rows ", row_min, " to ", row_max, ".") + 
logDebug("Start readxl::read_excel to get patient data.") df_patient <- readxl::read_excel( path = tracker_data_file, sheet = sheet, @@ -91,11 +91,11 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { col_types = c("text"), .name_repair = "unique_quiet" ) - ParallelLogger::logDebug("Finish readxl::read_excel.") + logDebug("Finish readxl::read_excel.") if (header_cols[2] == header_cols_2[2]) { # take into account that date info gets separated from the updated values (not in the same row, usually in the bottom row) - ParallelLogger::logInfo("Read in multiline header.") + logInfo("Read in multiline header.") diff_colnames <- which((header_cols != header_cols_2)) header_cols[diff_colnames] <- @@ -106,7 +106,7 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { } colnames(df_patient) <- header_cols - ParallelLogger::logDebug("Found patient column names = ", paste(header_cols, collapse = ","), ".") + logDebug("Found patient column names = ", paste(header_cols, collapse = ","), ".") # delete columns without a header (=NA) df_patient <- df_patient[, !is.na(colnames(df_patient))] @@ -117,7 +117,7 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { df_patient <- df_patient[rowSums(is.na(df_patient)) != ncol(df_patient), ] - ParallelLogger::logDebug("Finish extract_patient_data.") + logDebug("Finish extract_patient_data.") df_patient } @@ -137,7 +137,7 @@ extract_patient_data <- function(tracker_data_file, sheet, year) { #' @export harmonize_patient_data_columns <- function(patient_df, columns_synonyms) { - ParallelLogger::logDebug("Start harmonize_patient_data_columns.") + logDebug("Start harmonize_patient_data_columns.") patient_df <- patient_df[!is.na(names(patient_df))] @@ -151,11 +151,11 @@ harmonize_patient_data_columns <- mismatching_column_ids <- which(colnames_found == 0) if (length(mismatching_column_ids) > 0) { - ParallelLogger::logWarn( + logWarn( "Non-matching column names found: ", paste(colnames(patient_df)[mismatching_column_ids], collapse = ","), "." ) } - ParallelLogger::logDebug("Finish harmonize_patient_data_columns.") + logDebug("Finish harmonize_patient_data_columns.") patient_df } diff --git a/R/helper_scipt_1.R b/R/helper_script_1.R similarity index 76% rename from R/helper_scipt_1.R rename to R/helper_script_1.R index f1cf102..67d2901 100644 --- a/R/helper_scipt_1.R +++ b/R/helper_script_1.R @@ -1,19 +1,17 @@ #' @title Process a single tracker file and extract patient and product data. #' -#' @param tracker_file Filename of the tracler. +#' @param tracker_file File name of the tracker. #' @param paths a list with the paths to the tracker root dir, the patient and product output dir and the root output dir. -#' @param p progressor from progressr package. +#' @param synonyms a list with the synonyms for patient and product data header names. #' #' @export -process_tracker_file <- function(tracker_file, paths, p) { - p() +process_tracker_file <- function(tracker_file, paths, synonyms) { tracker_name <- tools::file_path_sans_ext(basename(tracker_file)) - synonyms <- get_synonyms() tracker_data_file <- file.path(paths$tracker_root, tracker_file) - ParallelLogger::logDebug("Start process_tracker_file.") - ParallelLogger::logInfo( + logDebug("Start process_tracker_file.") + logInfo( "Current file: ", tracker_name ) @@ -29,10 +27,10 @@ process_tracker_file <- function(tracker_file, paths, p) { synonyms_patient = synonyms$patient ), error = function(e) { - ParallelLogger::logError("Could not process patient data. 
Error = ", e$message, ".") + logError("Could not process patient data. Error = ", e$message, ".") }, warning = function(w) { - ParallelLogger::logWarn("Could not process patient data. Warning = ", w$message, ".") + logWarn("Could not process patient data. Warning = ", w$message, ".") } ) }, @@ -51,17 +49,17 @@ process_tracker_file <- function(tracker_file, paths, p) { synonyms_product = synonyms$product ), error = function(e) { - ParallelLogger::logError("Could not process product data. Error = ", e$message, ".") + logError("Could not process product data. Error = ", e$message, ".") }, warning = function(w) { - ParallelLogger::logWarn("Could not process product data. Warning = ", w$message, ".") + logWarn("Could not process product data. Warning = ", w$message, ".") } ) }, output_root = paths$output_root ) - ParallelLogger::logDebug("Finish process_tracker_file.") + logDebug("Finish process_tracker_file.") } @@ -78,7 +76,7 @@ process_patient_data <- tracker_data_file, output_root, synonyms_patient) { - ParallelLogger::logDebug("Start process_patient_data.") + logDebug("Start process_patient_data.") df_raw_patient <- reading_patient_data( @@ -88,7 +86,7 @@ process_patient_data <- df_raw_patient <- df_raw_patient %>% dplyr::mutate(file_name = tracker_name) - ParallelLogger::logDebug( + logDebug( "df_raw_patient dim: ", dim(df_raw_patient) %>% as.data.frame(), "." @@ -101,7 +99,7 @@ process_patient_data <- suffix = "_patient_raw" ) - ParallelLogger::logDebug("Finish process_patient_data.") + logDebug("Finish process_patient_data.") } @@ -118,7 +116,7 @@ process_product_data <- tracker_data_file, output_root, synonyms_product) { - ParallelLogger::logDebug("Start process_product_data.") + logDebug("Start process_product_data.") df_raw_product <- reading_product_data_step1( @@ -129,10 +127,10 @@ process_product_data <- if (!is.null(df_raw_product)) { df_raw_product <- df_raw_product %>% dplyr::mutate(file_name = tracker_name) } else { - ParallelLogger::logDebug("Empty product data") + logDebug("Empty product data") } - ParallelLogger::logDebug( + logDebug( "df_raw_product dim: ", dim(df_raw_product) %>% as.data.frame(), "." @@ -147,7 +145,7 @@ process_product_data <- suffix = "_product_raw" ) } else { - ParallelLogger::logWarn("No product data in the file") + logWarn("No product data in the file") } - ParallelLogger::logDebug("Finish process_product_data.") + logDebug("Finish process_product_data.") } diff --git a/R/helper_script_2.R b/R/helper_script_2.R index 035b5db..e1c444b 100644 --- a/R/helper_script_2.R +++ b/R/helper_script_2.R @@ -11,12 +11,10 @@ #' #' @param patient_file Path to patient raw parquet file from first script. #' @param paths List with paths to patient_data_cleaned and product_data_cleaned folders. -#' @param p progressor from progressr package. +#' @param allowed_provinces A named character vector with names of allowed provinces. 
#' #' @export -process_patient_file <- function(patient_file, paths, p) { - p() - +process_patient_file <- function(patient_file, paths, allowed_provinces) { ERROR_VAL_NUMERIC <<- 999999 ERROR_VAL_CHARACTER <<- "Undefined" ERROR_VAL_DATE <<- "9999-09-09" @@ -26,8 +24,8 @@ process_patient_file <- function(patient_file, paths, p) { file.path(paths$tracker_root, patient_file) output_root <- paths$patient_data_cleaned - ParallelLogger::logDebug("Start process_patient_file.") - ParallelLogger::logInfo( + logDebug("Start process_patient_file.") + logInfo( "Current file: ", patient_file_name ) @@ -40,26 +38,25 @@ process_patient_file <- function(patient_file, paths, p) { process_patient_file_worker( patient_file_path = patient_file_path, patient_file_name = patient_file_name, - output_root = output_root + output_root = output_root, + allowed_provinces = allowed_provinces ), error = function(e) { - ParallelLogger::logError("Could not process raw patient data. Error = ", e$message, ".") + logError("Could not process raw patient data. Error = ", e$message, ".") }, warning = function(w) { - ParallelLogger::logWarn("Could not process raw patient data. Warning = ", w$message, ".") + logWarn("Could not process raw patient data. Warning = ", w$message, ".") } ) }, output_root = paths$output_root ) - ParallelLogger::logInfo("Finish process_patient_file.") + logInfo("Finish process_patient_file.") } -process_patient_file_worker <- function(patient_file_path, patient_file_name, output_root) { - allowed_provinces <- get_allowed_provinces() - +process_patient_file_worker <- function(patient_file_path, patient_file_name, output_root, allowed_provinces) { df_patient_raw <- arrow::read_parquet(patient_file_path) # filter all rows with no patient id or patient name @@ -71,24 +68,24 @@ process_patient_file_worker <- function(patient_file_path, patient_file_name, ou # data before 2019 had only one column for updated hba1c and fbg # with date as part of the value if (!"hba1c_updated_date" %in% colnames(df_patient_raw) && "hba1c_updated" %in% colnames(df_patient_raw)) { - ParallelLogger::logInfo("Column updated_hba1c_date not found. Trying to parse from hba1c_updated.") + logInfo("Column updated_hba1c_date not found. Trying to parse from hba1c_updated.") df_patient_raw <- extract_date_from_measurement(df_patient_raw, "hba1c_updated") - ParallelLogger::logDebug("Finished parsing dates from hba1c_updated.") + logDebug("Finished parsing dates from hba1c_updated.") } if (!"fbg_updated_date" %in% colnames(df_patient_raw) && "fbg_updated_mg" %in% colnames(df_patient_raw)) { - ParallelLogger::logInfo("Column updated_fbg_date not found. Trying to parse from fbg_updated_mg.") + logInfo("Column updated_fbg_date not found. Trying to parse from fbg_updated_mg.") df_patient_raw <- extract_date_from_measurement(df_patient_raw, "fbg_updated_mg") - ParallelLogger::logDebug("Finished parsing dates from fbg_updated_mg.") + logDebug("Finished parsing dates from fbg_updated_mg.") } if (!"fbg_updated_date" %in% colnames(df_patient_raw) && "fbg_updated_mmol" %in% colnames(df_patient_raw)) { - ParallelLogger::logInfo("Column fbg_updated_date not found. Trying to parse from fbg_updated_mmol.") + logInfo("Column fbg_updated_date not found. 
Trying to parse from fbg_updated_mmol.") df_patient_raw <- extract_date_from_measurement(df_patient_raw, "fbg_updated_mmol") - ParallelLogger::logDebug("Finished parsing dates from fbg_updated_mmol.") + logDebug("Finished parsing dates from fbg_updated_mmol.") } # blood pressure is given as sys/dias value pair, @@ -101,7 +98,7 @@ process_patient_file_worker <- function(patient_file_path, patient_file_name, ou # depending on the equipment being used. # If the reading is above the maximum available value the > sign is used - # we would prefer to retain this character in the database as it is important for data analysis. - ParallelLogger::logInfo("Adding columns hba1c_baseline_exceeds and hba1c_updated_exceeds.") + logInfo("Adding columns hba1c_baseline_exceeds and hba1c_updated_exceeds.") df_patient_raw <- df_patient_raw %>% dplyr::mutate( hba1c_baseline_exceeds = ifelse(grepl(">|<", hba1c_baseline), TRUE, FALSE), @@ -111,7 +108,7 @@ process_patient_file_worker <- function(patient_file_path, patient_file_name, ou # --- META SCHEMA --- # meta schema has all final columns for the database # along with their corresponding data types - ParallelLogger::logInfo("Creating meta schema.") + logInfo("Creating meta schema.") # short type string for read_csv: # iiinDccDcnnDnncnlnlDncDccDDDccccDccccciDciiiDn schema <- tibble::tibble( @@ -181,15 +178,15 @@ process_patient_file_worker <- function(patient_file_path, patient_file_name, ou ) cols_extra <- colnames(df_patient_raw)[!colnames(df_patient_raw) %in% colnames(schema)] - ParallelLogger::logWarn("Extra columns in patient data: ", paste(cols_extra, collapse = ", ")) + logWarn("Extra columns in patient data: ", paste(cols_extra, collapse = ", ")) cols_missing <- colnames(schema)[!colnames(schema) %in% colnames(df_patient_raw)] - ParallelLogger::logWarn("Missing columns in patient data: ", paste(cols_missing, collapse = ", ")) + logWarn("Missing columns in patient data: ", paste(cols_missing, collapse = ", ")) # add all columns of schema to df_patient_raw # keep all rows, only append missing cols - ParallelLogger::logInfo("Merging df_patient with meta schema and selecting all columns of meta schema.") + logInfo("Merging df_patient with meta schema and selecting all columns of meta schema.") df_patient <- merge.default(df_patient_raw, schema, all.x = T) df_patient <- df_patient[colnames(schema)] @@ -290,7 +287,7 @@ process_patient_file_worker <- function(patient_file_path, patient_file_name, ou df_patient <- df_patient %>% dplyr::arrange(tracker_date, id) - ParallelLogger::logDebug( + logDebug( "df_patient dim: ", dim(df_patient) %>% as.data.frame(), "." @@ -313,19 +310,16 @@ process_patient_file_worker <- function(patient_file_path, patient_file_name, ou #' #' @param product_file Path to product raw parquet file from first script. #' @param paths List with paths to patient_data_cleaned and product_data_cleaned folders. -#' @param p progressor from progressr package. +#' @param synonyms_product Synonyms for product data header names. 
#' #' @export -process_product_file <- function(product_file, paths, p) { - p() - synonyms <- get_synonyms() - synonyms_product <- synonyms$product +process_product_file <- function(product_file, paths, synonyms_product) { product_file_name <- tools::file_path_sans_ext(basename(product_file)) product_file_path <- file.path(paths$tracker_root, product_file) - ParallelLogger::logDebug("Start process_product_file.") - ParallelLogger::logInfo( + logDebug("Start process_product_file.") + logInfo( "Current file: ", product_file_name ) @@ -337,14 +331,14 @@ process_product_file <- function(product_file, paths, p) { tryCatch( df_product_raw <- reading_product_data_step2(df_product_raw, synonyms_product), error = function(e) { - ParallelLogger::logError("Could not process raw product data. Error = ", e$message, ".") + logError("Could not process raw product data. Error = ", e$message, ".") }, warning = function(w) { - ParallelLogger::logWarn("Could not process raw product data. Warning = ", w$message, ".") + logWarn("Could not process raw product data. Warning = ", w$message, ".") } ) - ParallelLogger::logDebug( + logDebug( "df_product_raw dim: ", dim(df_product_raw) %>% as.data.frame(), "." @@ -360,5 +354,5 @@ process_product_file <- function(product_file, paths, p) { output_root = paths$output_root ) - ParallelLogger::logInfo("Finish process_product_file.") + logInfo("Finish process_product_file.") } diff --git a/R/link_product_patient.R b/R/link_product_patient.R index 8816d57..2a29378 100644 --- a/R/link_product_patient.R +++ b/R/link_product_patient.R @@ -8,13 +8,14 @@ #' @param patient_file A string specifying the path to the patient file. #' #' @return This function does not return a value. It prints log messages about the linking process. +#' @export #' #' @examples #' \dontrun{ #' link_product_patient("path/to/product_data.parquet", "path/to/patient_data.parquet") #' } link_product_patient <- function(product_file, patient_file) { - ParallelLogger::logInfo("Trying to link product file ", product_file, " with patient file ", patient_file) + logInfo("Trying to link product file ", product_file, " with patient file ", patient_file) patient_data <- arrow::read_parquet(patient_file) product_data <- arrow::read_parquet(product_file) @@ -44,7 +45,7 @@ link_product_patient <- function(product_file, patient_file) { tryCatch( { if (nrow(summary_df) > 0) { - ParallelLogger::logWarn( + logWarn( "The number of mismatched patient IDs between the product and patient data is ", nrow(summary_df), ". ", paste("File Name: ", summary_df$file_name, @@ -56,18 +57,18 @@ link_product_patient <- function(product_file, patient_file) { } }, error = function(e) { - ParallelLogger::logError("Could not link csv files for product and patient data. Error: ", e$message) + logError("Could not link csv files for product and patient data. Error: ", e$message) }, warning = function(w) { - ParallelLogger::logWarn("Could not link csv files for product and patient data. Warning: ", w$message) + logWarn("Could not link csv files for product and patient data. 
Warning: ", w$message) } ) } else { - ParallelLogger::logInfo( + logInfo( "There are no mismatched patient IDs between the product data - ", product_file, " and patient data - ", patient_file ) } - ParallelLogger::logInfo("Finished attempting to link product csv file with patient csv file.") + logInfo("Finished attempting to link product csv file with patient csv file.") } diff --git a/R/read_cleaned_patient_data.R b/R/read_cleaned_patient_data.R index b217da4..ae56e34 100644 --- a/R/read_cleaned_patient_data.R +++ b/R/read_cleaned_patient_data.R @@ -7,7 +7,7 @@ #' @export read_cleaned_patient_data <- function(input_root, patient_data_files) { - ParallelLogger::logInfo("Start read_cleaned_patient_data") + logInfo("Start read_cleaned_patient_data") patient_data <- patient_data_files %>% purrr::map(function(patient_file) { @@ -17,6 +17,6 @@ read_cleaned_patient_data <- - ParallelLogger::logInfo("Finish read_cleaned_patient_data") + logInfo("Finish read_cleaned_patient_data") patient_data } diff --git a/R/read_patient_data.R b/R/read_patient_data.R index 61f6839..b292f1c 100644 --- a/R/read_patient_data.R +++ b/R/read_patient_data.R @@ -1,9 +1,9 @@ reading_patient_data <- function(tracker_data_file, columns_synonyms) { - ParallelLogger::logDebug("Start reading_patient_data.") + logDebug("Start reading_patient_data.") sheet_list <- readxl::excel_sheets(tracker_data_file) testit::assert(length(sheet_list) > 0) - ParallelLogger::logInfo( + logInfo( "Found ", length(sheet_list), " sheets inside the current file = ", @@ -13,7 +13,7 @@ reading_patient_data <- month_list <- sheet_list[na.omit(pmatch(month.abb, sheet_list))] - ParallelLogger::logInfo( + logInfo( "Found ", length(month_list), " month sheets inside the current file = ", @@ -24,18 +24,18 @@ reading_patient_data <- # Extract year year <- get_tracker_year(tracker_data_file, month_list) - ParallelLogger::logInfo("Tracker year = ", year, ".") + logInfo("Tracker year = ", year, ".") testit::assert(year %in% c(2017, 2018, 2019, 2020, 2021, 2022)) tidy_tracker_list <- NULL - ParallelLogger::logDebug("Start processing sheets.") + logDebug("Start processing sheets.") for (curr_sheet in month_list) { - ParallelLogger::logDebug("Start processing sheet ", curr_sheet, ".") + logDebug("Start processing sheet ", curr_sheet, ".") df_patient <- extract_patient_data(tracker_data_file, curr_sheet, year) testit::assert(nrow(df_patient) > 0) - ParallelLogger::logDebug("df_patient dim: ", dim(df_patient) %>% as.data.frame(), ".") + logDebug("df_patient dim: ", dim(df_patient) %>% as.data.frame(), ".") df_patient <- harmonize_patient_data_columns(df_patient, columns_synonyms) @@ -66,15 +66,15 @@ reading_patient_data <- ) tidy_tracker_list[[curr_sheet]] <- df_patient - ParallelLogger::logDebug("Finish processing sheet ", curr_sheet, ".") + logDebug("Finish processing sheet ", curr_sheet, ".") } - ParallelLogger::logDebug("Start combining sheet data into single data frame.") + logDebug("Start combining sheet data into single data frame.") df_raw <- dplyr::bind_rows(tidy_tracker_list) - ParallelLogger::logDebug("Finish combining sheet data into single data frame.") + logDebug("Finish combining sheet data into single data frame.") if ("Patient List" %in% sheet_list) { - ParallelLogger::logDebug("Start extracting patient list.") + logDebug("Start extracting patient list.") patient_list <- extract_patient_data( tracker_data_file, "Patient List", @@ -98,9 +98,9 @@ reading_patient_data <- by = "id", relationship = "many-to-one" ) - ParallelLogger::logDebug("Finish 
extracting patient list.") + logDebug("Finish extracting patient list.") } - ParallelLogger::logInfo("Finish reading_patient_data.") + logInfo("Finish reading_patient_data.") return(df_raw) } diff --git a/R/read_product_data.R b/R/read_product_data.R index d1fc051..10d8886 100644 --- a/R/read_product_data.R +++ b/R/read_product_data.R @@ -4,7 +4,7 @@ # function based on parts from run_a4d_product_data.R and helper functions reading_product_data_step1 <- function(tracker_data_file, columns_synonyms) { - ParallelLogger::logDebug("Start reading_product_data_step1.") + logDebug("Start reading_product_data_step1.") # rename column names to match colnames(columns_synonyms) <- c("name_clean", "name_to_be_matched") @@ -13,13 +13,13 @@ reading_product_data_step1 <- month_list <- sheet_list[na.omit(pmatch(month.abb, sheet_list))] year <- get_tracker_year(tracker_data_file, month_list) - ParallelLogger::logDebug("Found ", length(sheet_list), " sheets: ", paste(sheet_list, collapse = ", "), ".") - ParallelLogger::logDebug("Found ", length(month_list), " month sheets: ", paste(month_list, collapse = ", "), ".") + logDebug("Found ", length(sheet_list), " sheets: ", paste(sheet_list, collapse = ", "), ".") + logDebug("Found ", length(month_list), " month sheets: ", paste(month_list, collapse = ", "), ".") # loop through all months for (curr_sheet in month_list) { - ParallelLogger::logDebug("Start processing the following sheet: ", curr_sheet) + logDebug("Start processing the following sheet: ", curr_sheet) # open tracker data tracker_data <- data.frame( @@ -37,7 +37,7 @@ reading_product_data_step1 <- grepl("Description of Support", tracker_data[, ]))) ) { # go to next month - ParallelLogger::logInfo("Could not find product data in tracker data. Skipping ", curr_sheet, ".") + logInfo("Could not find product data in tracker data. Skipping ", curr_sheet, ".") next } @@ -46,7 +46,7 @@ reading_product_data_step1 <- # If after extraction, dataframe is empty, this iteration is also skipped. if (all(is.na(product_df))) { - ParallelLogger::logInfo("Product data is empty. Skipping ", curr_sheet, ".") + logInfo("Product data is empty. Skipping ", curr_sheet, ".") next } @@ -69,11 +69,11 @@ reading_product_data_step1 <- tryCatch( { if (num_na_rows > 0) { - ParallelLogger::logInfo(curr_sheet, " the number of rows where the patient's name is missing: ", col_released, " is not NA and ", col_released_to, " (patient's name) is NA = ", num_na_rows) + logInfo(curr_sheet, " the number of rows where the patient's name is missing: ", col_released, " is not NA and ", col_released_to, " (patient's name) is NA = ", num_na_rows) } }, error = function(e) { - ParallelLogger::logError(curr_sheet, " trying with num_na_rows for products. Error: ", e$message) + logError(curr_sheet, " trying with num_na_rows for products. Error: ", e$message) } ) @@ -83,7 +83,7 @@ reading_product_data_step1 <- tryCatch( { if (nrow(non_processed_dates) > 0) { - ParallelLogger::logWarn( + logWarn( curr_sheet, " the number of rows with non-processed dates in product_entry_date is ", nrow(non_processed_dates), ": ", @@ -92,7 +92,7 @@ reading_product_data_step1 <- } }, error = function(e) { - ParallelLogger::logError(curr_sheet, " trying with non_processed_dates in product_entry_date. Error: ", e$message) + logError(curr_sheet, " trying with non_processed_dates in product_entry_date. 
Error: ", e$message) } ) @@ -107,7 +107,7 @@ reading_product_data_step1 <- # Check if the entry dates for the balance match the month/year on the sheet check_entry_dates(product_df, curr_sheet) - ParallelLogger::logDebug("Finish processing sheet: ", curr_sheet) + logDebug("Finish processing sheet: ", curr_sheet) # combine all months if (!exists("df_final")) { @@ -121,7 +121,7 @@ reading_product_data_step1 <- } else { return(NULL) } - ParallelLogger::logDebug("Finish reading_product_data_step1.") + logDebug("Finish reading_product_data_step1.") } @@ -151,7 +151,7 @@ count_na_rows <- function(df, units_released_col, released_to_col) { #' #' @return This function does not return a value. It logs a warning message if there are any dates in 'product_entry_date' that don't match the month/year on the sheet. check_entry_dates <- function(df, Sheet) { - ParallelLogger::logDebug("Start check_entry_dates.") + logDebug("Start check_entry_dates.") # Check if the entry dates for the balance match the month/year on the sheet entry_dates_df <- df %>% dplyr::filter(grepl("^[0-9]+$", product_entry_date)) @@ -167,14 +167,14 @@ check_entry_dates <- function(df, Sheet) { not_same <- entry_dates_df[entry_dates_df$ed_month != entry_dates_df$product_table_month | entry_dates_df$ed_year != entry_dates_df$product_table_year, ] if (nrow(not_same) > 0) { - ParallelLogger::logWarn( + logWarn( Sheet, " the number of dates in product_entry_date that don't match the month/year on the sheet is ", nrow(not_same), ": ", paste(not_same$ed_date, collapse = ", ") ) } - ParallelLogger::logDebug("Finish check_entry_dates.") + logDebug("Finish check_entry_dates.") } #' @title Remove Rows with NA Values in Specified Columns. @@ -192,7 +192,7 @@ remove_rows_with_na_columns <- na_rows <- apply(df[column_names], 1, function(x) all(is.na(x))) # log message - ParallelLogger::logInfo(paste(length(na_rows[na_rows == T]), " rows deleted out of ", nrow(df), " rows (reason: rows not containing additional info).", sep = "")) + logInfo(paste(length(na_rows[na_rows == T]), " rows deleted out of ", nrow(df), " rows (reason: rows not containing additional info).", sep = "")) # Return the data frame without the NA rows return(df[!na_rows, ]) @@ -215,7 +215,7 @@ check_negative_balance <- function(df, Sheet) { # Check if there are any rows in the new data frame if (nrow(negative_df) > 0) { # Log a warning message with the number of negative values and their corresponding product_balance values - ParallelLogger::logWarn( + logWarn( Sheet, " number of negative values in product_balance on the sheet is ", nrow(negative_df), ": ", @@ -242,7 +242,7 @@ switch_columns_stock <- "product_units_received" = "product_received_from", "product_received_from" = "product_units_received" ) - ParallelLogger::logDebug("Columns product_units_received and product_received_from were switched") + logDebug("Columns product_units_received and product_received_from were switched") return(df) } else { return(df) @@ -290,14 +290,14 @@ report_unknown_products <- function(df, Sheet, stock_list_df) { # Check if there are any unknown products names if (length(unmatched_products) > 0) { # Log a warning message with the number of unknown products names - ParallelLogger::logWarn( + logWarn( Sheet, " the number of unknown product names on the sheet is ", length(unmatched_products), ": ", paste(unmatched_products, collapse = ", ") ) } else { - ParallelLogger::logInfo(Sheet, " no unknown product names on the sheet") + logInfo(Sheet, " no unknown product names on the sheet") } } @@ 
-317,16 +317,16 @@ report_unknown_products <- function(df, Sheet, stock_list_df) { #' product_list <- load_product_reference_data("your_file.xlsx") #' } load_product_reference_data <- function(stock_summary_xlsx = "reference_data/master_tracker_variables.xlsx") { - ParallelLogger::logDebug("Trying to load the product list from the Stock Summary, ", stock_summary_xlsx, "...") + logDebug("Trying to load the product list from the Stock Summary, ", stock_summary_xlsx, "...") tryCatch( { product_names_df <- readxl::read_excel(stock_summary_xlsx, "Stock_Summary") colnames(product_names_df) <- tolower(colnames(product_names_df)) - ParallelLogger::logDebug(nrow(product_names_df), " product names were loaded from the Stock Summary.") + logDebug(nrow(product_names_df), " product names were loaded from the Stock Summary.") return(product_names_df) }, error = function(e) { - ParallelLogger::logError("Error in loading stock product list: ", e) + logError("Error in loading stock product list: ", e) } ) } @@ -371,7 +371,7 @@ add_product_categories <- function(inventory_data, product_category_mapping) { #' df <- extract_unit_capacity(df, "product") #' } extract_unit_capacity <- function(df, column_name) { - ParallelLogger::logDebug("Trying to extract Unit Capacity from ", column_name, " column") + logDebug("Trying to extract Unit Capacity from ", column_name, " column") # Extract all symbols between parentheses df$product_unit_capacity <- stringr::str_extract(df[[column_name]], "\\(([^)]+)\\)") @@ -393,7 +393,7 @@ extract_unit_capacity <- function(df, column_name) { # Add 1 to NA values df$product_unit_capacity[is.na(df$product_unit_capacity)] <- 1 - ParallelLogger::logDebug("Finished extracting Unit Capacity from ", column_name, " column") + logDebug("Finished extracting Unit Capacity from ", column_name, " column") return(df) } @@ -410,7 +410,7 @@ extract_unit_capacity <- function(df, column_name) { #' @return Cleaned product data for one specified tracker. 
reading_product_data_step2 <- function(df, columns_synonyms) { - ParallelLogger::logDebug("Start reading_product_data_step2.") + logDebug("Start reading_product_data_step2.") # rename column names to match colnames(columns_synonyms) <- c("name_clean", "name_to_be_matched") @@ -425,7 +425,7 @@ reading_product_data_step2 <- # loop through all months for (sheet_month in unique(df$product_sheet_name)) { - ParallelLogger::logDebug(paste("Start processing the following sheet:", sheet_month)) + logDebug(paste("Start processing the following sheet:", sheet_month)) # filter on month sheet product_df <- df %>% @@ -453,7 +453,7 @@ reading_product_data_step2 <- product_df <- remove_rows_with_na_columns(product_df, column_names_check) # jump to next sheet if dataframe empty from here if (nrow(product_df) == 0) { - ParallelLogger::logDebug(paste(sheet_month, " sheet is empty after filtering and skipped", sep = "")) + logDebug(paste(sheet_month, " sheet is empty after filtering and skipped", sep = "")) next } @@ -527,12 +527,12 @@ reading_product_data_step2 <- df_final <- df_final %>% rbind(product_df) - ParallelLogger::logDebug(paste("Finished processing the following sheet:", sheet_month)) + logDebug(paste("Finished processing the following sheet:", sheet_month)) } if (nrow(df_final) > 0) { return(df_final) } else { - ParallelLogger::logDebug(paste("No product data extracted for the following tracker:", df$file_name[1])) + logDebug(paste("No product data extracted for the following tracker:", df$file_name[1])) } } diff --git a/man/process_patient_data.Rd b/man/process_patient_data.Rd index e1334b2..f92d060 100644 --- a/man/process_patient_data.Rd +++ b/man/process_patient_data.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper_scipt_1.R +% Please edit documentation in R/helper_script_1.R \name{process_patient_data} \alias{process_patient_data} \title{Extract patient data.} diff --git a/man/process_patient_file.Rd b/man/process_patient_file.Rd index 69c9a62..21e3f60 100644 --- a/man/process_patient_file.Rd +++ b/man/process_patient_file.Rd @@ -4,14 +4,14 @@ \alias{process_patient_file} \title{Process and clean patient raw data} \usage{ -process_patient_file(patient_file, paths, p) +process_patient_file(patient_file, paths, allowed_provinces) } \arguments{ \item{patient_file}{Path to patient raw parquet file from first script.} \item{paths}{List with paths to patient_data_cleaned and product_data_cleaned folders.} -\item{p}{progressor from progressr package.} +\item{allowed_provinces}{A named character vector with names of allowed provinces.} } \description{ Read in the output of the first script as parquet (all character). 
diff --git a/man/process_product_data.Rd b/man/process_product_data.Rd index 0109c18..e6a18c5 100644 --- a/man/process_product_data.Rd +++ b/man/process_product_data.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper_scipt_1.R +% Please edit documentation in R/helper_script_1.R \name{process_product_data} \alias{process_product_data} \title{Extract product data.} diff --git a/man/process_product_file.Rd b/man/process_product_file.Rd index 07d01d9..a88ee9e 100644 --- a/man/process_product_file.Rd +++ b/man/process_product_file.Rd @@ -4,14 +4,14 @@ \alias{process_product_file} \title{Process and clean product raw data} \usage{ -process_product_file(product_file, paths, p) +process_product_file(product_file, paths, synonyms_product) } \arguments{ \item{product_file}{Path to product raw parquet file from first script.} \item{paths}{List with paths to patient_data_cleaned and product_data_cleaned folders.} -\item{p}{progressor from progressr package.} +\item{synonyms_product}{Synonyms for product data header names.} } \description{ A short description... diff --git a/man/process_tracker_file.Rd b/man/process_tracker_file.Rd index 2453d08..67c566b 100644 --- a/man/process_tracker_file.Rd +++ b/man/process_tracker_file.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper_scipt_1.R +% Please edit documentation in R/helper_script_1.R \name{process_tracker_file} \alias{process_tracker_file} \title{Process a single tracker file and extract patient and product data.} \usage{ -process_tracker_file(tracker_file, paths, p) +process_tracker_file(tracker_file, paths, synonyms) } \arguments{ -\item{tracker_file}{Filename of the tracler.} +\item{tracker_file}{File name of the tracker.} \item{paths}{a list with the paths to the tracker root dir, the patient and product output dir and the root output dir.} -\item{p}{progressor from progressr package.} +\item{synonyms}{a list with the synonyms for patient and product data header names.} } \description{ Process a single tracker file and extract patient and product data. 
diff --git a/renv.lock b/renv.lock index 5cb4660..f684844 100644 --- a/renv.lock +++ b/renv.lock @@ -177,21 +177,24 @@ }, "brew": { "Package": "brew", - "Version": "1.0-8", + "Version": "1.0-10", "Source": "Repository", "Repository": "CRAN", - "Hash": "d69a786e85775b126bddbee185ae6084" + "Hash": "8f4a384e19dccd8c65356dc096847b76" }, "brio": { "Package": "brio", - "Version": "1.1.3", + "Version": "1.1.4", "Source": "Repository", "Repository": "CRAN", - "Hash": "976cf154dfb043c012d87cddd8bca363" + "Requirements": [ + "R" + ], + "Hash": "68bd2b066e1fe780bbf62fc8bcc36de3" }, "bslib": { "Package": "bslib", - "Version": "0.5.1", + "Version": "0.6.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -202,12 +205,13 @@ "htmltools", "jquerylib", "jsonlite", + "lifecycle", "memoise", "mime", "rlang", "sass" ], - "Hash": "283015ddfbb9d7bf15ea9f0b5698f0d9" + "Hash": "c0d8599494bc7fb408cd206bbdd9cab0" }, "cachem": { "Package": "cachem", @@ -321,13 +325,13 @@ }, "curl": { "Package": "curl", - "Version": "5.1.0", + "Version": "5.2.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "9123f3ef96a2c1a93927d828b2fe7d4c" + "Hash": "ce88d13c0b10fe88a37d9c59dba2d7f9" }, "data.table": { "Package": "data.table", @@ -342,17 +346,16 @@ }, "desc": { "Package": "desc", - "Version": "1.4.2", + "Version": "1.4.3", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "R6", "cli", - "rprojroot", "utils" ], - "Hash": "6b9602c7ebbe87101a9c8edb6e8b6d21" + "Hash": "99b79fcbd6c4d1ce087f5c5c758b384f" }, "devtools": { "Package": "devtools", @@ -565,7 +568,7 @@ }, "gert": { "Package": "gert", - "Version": "2.0.0", + "Version": "2.0.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -576,7 +579,7 @@ "sys", "zip" ], - "Hash": "bbbd21a253d473f4671d7dcbd6d8971f" + "Hash": "f70d3fe2d9e7654213a946963d1591eb" }, "gh": { "Package": "gh", @@ -680,7 +683,7 @@ }, "htmlwidgets": { "Package": "htmlwidgets", - "Version": "1.6.2", + "Version": "1.6.4", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -691,11 +694,11 @@ "rmarkdown", "yaml" ], - "Hash": "a865aa85bcb2697f47505bfd70422471" + "Hash": "04291cc45198225444a397606810ac37" }, "httpuv": { "Package": "httpuv", - "Version": "1.6.12", + "Version": "1.6.13", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -706,7 +709,7 @@ "promises", "utils" ], - "Hash": "c992f75861325961c29a188b45e549f7" + "Hash": "d23d2879001f3d82ee9dc38a9ef53c4c" }, "httr": { "Package": "httr", @@ -725,7 +728,7 @@ }, "httr2": { "Package": "httr2", - "Version": "0.2.3", + "Version": "1.0.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -734,13 +737,15 @@ "cli", "curl", "glue", + "lifecycle", "magrittr", "openssl", "rappdirs", "rlang", + "vctrs", "withr" ], - "Hash": "193bb297368afbbb42dc85784a46b36e" + "Hash": "e2b30f1fc039a0bab047dd52bb20ef71" }, "ini": { "Package": "ini", @@ -787,14 +792,14 @@ }, "later": { "Package": "later", - "Version": "1.3.1", + "Version": "1.3.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "Rcpp", "rlang" ], - "Hash": "40401c9cf2bc2259dfe83311c9384710" + "Hash": "a3e051d405326b8b0012377434c62b37" }, "lattice": { "Package": "lattice", @@ -948,7 +953,7 @@ }, "pkgbuild": { "Package": "pkgbuild", - "Version": "1.4.2", + "Version": "1.4.3", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -956,13 +961,10 @@ "R6", "callr", "cli", - "crayon", "desc", - "prettyunits", - "processx", - "rprojroot" + "processx" ], - "Hash": 
"beb25b32a957a22a5c301a9e441190b3" + "Hash": "c0143443203205e6a2760ce553dafc24" }, "pkgconfig": { "Package": "pkgconfig", @@ -1044,7 +1046,7 @@ }, "processx": { "Package": "processx", - "Version": "3.8.2", + "Version": "3.8.3", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1053,7 +1055,7 @@ "ps", "utils" ], - "Hash": "3efbd8ac1be0296a46c55387aeace0f3" + "Hash": "82d48b1aec56084d9438dbf98087a7e9" }, "profvis": { "Package": "profvis", @@ -1140,14 +1142,14 @@ }, "ragg": { "Package": "ragg", - "Version": "1.2.6", + "Version": "1.2.7", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "systemfonts", "textshaping" ], - "Hash": "6ba2fa8740abdc2cc148407836509901" + "Hash": "90a1b8b7e518d7f90480d56453b4d062" }, "rappdirs": { "Package": "rappdirs", @@ -1352,7 +1354,7 @@ }, "sass": { "Package": "sass", - "Version": "0.4.7", + "Version": "0.4.8", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1362,7 +1364,7 @@ "rappdirs", "rlang" ], - "Hash": "6bd4d33b50ff927191ec9acbf52fd056" + "Hash": "168f9353c76d4c4b0a0bbf72e2c2d035" }, "sessioninfo": { "Package": "sessioninfo", @@ -1379,7 +1381,7 @@ }, "shiny": { "Package": "shiny", - "Version": "1.7.5.1", + "Version": "1.8.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1409,7 +1411,7 @@ "withr", "xtable" ], - "Hash": "5ec01cc255f2138fc2f0dc74d2b1a1a1" + "Hash": "3a1f41807d648a908e3c7f0334bf85e6" }, "snow": { "Package": "snow", @@ -1508,7 +1510,7 @@ }, "testthat": { "Package": "testthat", - "Version": "3.2.0", + "Version": "3.2.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1519,7 +1521,6 @@ "cli", "desc", "digest", - "ellipsis", "evaluate", "jsonlite", "lifecycle", @@ -1534,7 +1535,7 @@ "waldo", "withr" ], - "Hash": "877508719fcb8c9525eccdadf07a5102" + "Hash": "4767a686ebe986e6cb01d075b3f09729" }, "textshaping": { "Package": "textshaping", @@ -1567,6 +1568,17 @@ ], "Hash": "a84e2cc86d07289b3b6f5069df7a004c" }, + "tictoc": { + "Package": "tictoc", + "Version": "1.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "2c6c63f3f4f103d41cb2aae1ab1c7ce1" + }, "tidyr": { "Package": "tidyr", "Version": "1.3.0", @@ -1619,13 +1631,13 @@ }, "tinytex": { "Package": "tinytex", - "Version": "0.48", + "Version": "0.49", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "xfun" ], - "Hash": "8f96d229b7311beb32b94cf413b13f84" + "Hash": "5ac22900ae0f386e54f1c307eca7d843" }, "tzdb": { "Package": "tzdb", diff --git a/scripts/pre_commit.R b/scripts/pre_commit.R index b1df661..e5f2ba7 100644 --- a/scripts/pre_commit.R +++ b/scripts/pre_commit.R @@ -11,3 +11,5 @@ do.call(styler::style_dir, c(list(path = here::here("tests")), styler_confs)) source(here::here("reference_data", "build_package_data.R")) devtools::document() devtools::test() +devtools::uninstall() +devtools::install() diff --git a/scripts/run_script_1_extract_raw_data.R b/scripts/run_script_1_extract_raw_data.R index febdb0f..e536bec 100644 --- a/scripts/run_script_1_extract_raw_data.R +++ b/scripts/run_script_1_extract_raw_data.R @@ -1,12 +1,17 @@ +library(furrr) +library(progressr) +library(tictoc) + options(readxl.show_progress = FALSE) -future::plan("multisession") +plan("multisession") paths <- a4d::init_paths(c("patient_data_raw", "product_data_raw"), delete = TRUE) a4d::setup_logger(paths$output_root, "script1") tracker_files <- a4d::get_files(paths$tracker_root) +synonyms <- a4d::get_synonyms() -ParallelLogger::logInfo( +logInfo( "Found ", 
length(tracker_files), " xlsx files under ", @@ -14,21 +19,26 @@ ParallelLogger::logInfo( "." ) -ParallelLogger::logInfo("Start processing tracker files.") +logInfo("Start processing tracker files.") -progressr::with_progress({ - p <- progressr::progressor(steps = length(tracker_files)) +tic() +with_progress({ + p <- progressor(steps = length(tracker_files), label = "Tracker files") - result <- furrr::future_map( + future_map( tracker_files, - a4d::process_tracker_file, - paths = paths, - p = p + function(tracker_file) { + a4d::process_tracker_file(tracker_file, paths, synonyms) + p() + return() + }, + .options = furrr_options(seed = NULL, packages = c("a4d", "ParallelLogger"), scheduling = FALSE) ) }) +toc() -ParallelLogger::logInfo("Finish processing all tracker files.") +logInfo("Finish processing all tracker files.") -ParallelLogger::clearLoggers() +clearLoggers() -future::plan("sequential") +plan("sequential") diff --git a/scripts/run_script_2_clean_data.R b/scripts/run_script_2_clean_data.R index 92ab7d2..96ca452 100644 --- a/scripts/run_script_2_clean_data.R +++ b/scripts/run_script_2_clean_data.R @@ -1,19 +1,25 @@ -options(readxl.show_progress = FALSE) -future::plan("multisession") +library(furrr) +library(progressr) +library(tictoc) + +plan("multisession") paths <- a4d::init_paths(c("patient_data_cleaned", "product_data_cleaned"), delete = TRUE) a4d::setup_logger(paths$output_root, "script2") patient_data_files <- a4d::get_files(paths$tracker_root, pattern = "patient_raw.parquet$") product_data_files <- a4d::get_files(paths$tracker_root, pattern = "product_raw.parquet$") -ParallelLogger::logInfo( +allowed_provinces <- a4d::get_allowed_provinces() +synonyms <- a4d::get_synonyms() + +logInfo( "Found ", length(patient_data_files), " patient csv files under ", paths$tracker_root, "." ) -ParallelLogger::logInfo( +logInfo( "Found ", length(product_data_files), " product csv files under ", @@ -21,36 +27,46 @@ ParallelLogger::logInfo( "." 
)

-ParallelLogger::logInfo("Start processing patient csv files.")
+logInfo("Start processing patient csv files.")

-progressr::with_progress({
-    p <- progressr::progressor(steps = length(patient_data_files))
+tic()
+with_progress({
+    p <- progressor(steps = length(patient_data_files))

-    result <- furrr::future_map(
+    result <- future_map(
        patient_data_files,
-        a4d::process_patient_file,
-        paths = paths,
-        p = p
+        function(patient_file) {
+            a4d::process_patient_file(patient_file, paths, allowed_provinces)
+            p()
+            return()
+        },
+        .options = furrr_options(seed = NULL, packages = c("a4d", "ParallelLogger"), scheduling = FALSE)
    )
 })
+toc()

-ParallelLogger::logInfo("Finish processing all patient csv files.")
+logInfo("Finish processing all patient csv files.")

-ParallelLogger::logDebug("Start processing product csv files.")
+logDebug("Start processing product csv files.")

-progressr::with_progress({
-    p <- progressr::progressor(steps = length(product_data_files))
+tic()
+with_progress({
+    p <- progressor(steps = length(product_data_files))

-    result <- furrr::future_map(
+    result <- future_map(
        product_data_files,
-        a4d::process_product_file,
-        paths = paths,
-        p = p
+        function(product_file) {
+            a4d::process_product_file(product_file, paths, synonyms_product = synonyms$product)
+            p()
+            return()
+        },
+        .options = furrr_options(seed = NULL, packages = c("a4d", "ParallelLogger"), scheduling = FALSE)
    )
 })
+toc()

-ParallelLogger::logInfo("Finish processing all csv files.")
+logInfo("Finish processing all csv files.")

-ParallelLogger::clearLoggers()
+clearLoggers()

-future::plan("sequential")
+plan("sequential")
diff --git a/scripts/run_script_3_create_tables.R b/scripts/run_script_3_create_tables.R
index 4cd6216..4f6a6ad 100644
--- a/scripts/run_script_3_create_tables.R
+++ b/scripts/run_script_3_create_tables.R
@@ -4,133 +4,132 @@ ERROR_VAL_NUMERIC <<- 999999
 ERROR_VAL_CHARACTER <<- "Undefined"
 ERROR_VAL_DATE <<- "9999-09-09"

-main <- function() {
-    paths <- init_paths(c("tables"), delete = TRUE)
-    setup_logger(paths$output_root, "script3")
-    patient_data_files <- get_files(file.path(paths$output_root, "patient_data_cleaned"), pattern = "\\.parquet$")
-    product_data_files <- get_files(file.path(paths$output_root, "product_data_cleaned"), pattern = "\\.parquet$")
-    ParallelLogger::logInfo(
-        "Found ",
-        length(patient_data_files),
-        " patient csv files under ",
-        paths$tracker_root,
-        "."
-    )
-    ParallelLogger::logInfo(
-        "Found ",
-        length(product_data_files),
-        " product csv files under ",
-        paths$tracker_root,
-        "."
-    )
-    ParallelLogger::logInfo("Start creating table csv files.")
+paths <- a4d::init_paths(c("tables"), delete = TRUE)
+a4d::setup_logger(paths$output_root, "script3")
+patient_data_files <- a4d::get_files(file.path(paths$output_root, "patient_data_cleaned"), pattern = "\\.parquet$")
+product_data_files <- a4d::get_files(file.path(paths$output_root, "product_data_cleaned"), pattern = "\\.parquet$")

-    logfile <- "table_patient_data_static"
-    with_file_logger(logfile,
-        {
-            tryCatch(
-                {
-                    create_table_patient_data_static(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables)
-                },
-                error = function(e) {
-                    ParallelLogger::logError("Could not create table csv for static patient data. Error: ", e$message)
-                },
-                warning = function(w) {
-                    ParallelLogger::logWarn("Could not create table csv for static patient data. 
Error: ", w$message) - } - ) - }, - output_root = paths$output_root - ) +logInfo( + "Found ", + length(patient_data_files), + " patient csv files under ", + paths$tracker_root, + "." +) +logInfo( + "Found ", + length(product_data_files), + " product csv files under ", + paths$tracker_root, + "." +) - logfile <- "table_patient_data_monthly" - with_file_logger(logfile, - { - tryCatch( - { - create_table_patient_data_monthly(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) - }, - error = function(e) { - ParallelLogger::logError("Could not create table csv for monthly patient data. Error: ", e$message) - }, - warning = function(w) { - ParallelLogger::logWarn("Could not create table csv for monthly patient data. Error: ", w$message) - } - ) - }, - output_root = paths$output_root - ) +logInfo("Start creating table csv files.") - logfile <- "table_longitudinal_data_hba1c" - with_file_logger(logfile, - { - tryCatch( - { - create_table_longitudinal_data( - patient_data_files, - file.path(paths$output_root, "patient_data_cleaned"), - paths$tables, - "hba1c_updated", - "hba1c" - ) - }, - error = function(e) { - ParallelLogger::logError("Could not create table csv for longitudinal patient data. Error: ", e$message) - }, - warning = function(w) { - ParallelLogger::logWarn("Could not create table csv for longitudinal patient data. Error: ", w$message) - } - ) - }, - output_root = paths$output_root - ) +logfile <- "table_patient_data_static" +with_file_logger(logfile, + { + tryCatch( + { + a4d::create_table_patient_data_static(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) + }, + error = function(e) { + logError("Could not create table csv for static patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table csv for static patient data. Error: ", w$message) + } + ) + }, + output_root = paths$output_root +) - logfile <- "table_product_data" - with_file_logger(logfile, - { - tryCatch( - { - create_table_product_data(file.path(paths$output_root, "product_data_cleaned"), paths$tables) - }, - error = function(e) { - ParallelLogger::logError("Could not create table for product data. Error: ", e$message) - }, - warning = function(w) { - ParallelLogger::logWarn("Could not create table for product data. Warning: ", w$message) - } - ) - }, - output_root = paths$output_root - ) +logfile <- "table_patient_data_monthly" +with_file_logger(logfile, + { + tryCatch( + { + a4d::create_table_patient_data_monthly(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) + }, + error = function(e) { + logError("Could not create table csv for monthly patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table csv for monthly patient data. Error: ", w$message) + } + ) + }, + output_root = paths$output_root +) - ParallelLogger::logInfo("Finish creating table files.") +logfile <- "table_longitudinal_data_hba1c" +with_file_logger(logfile, + { + tryCatch( + { + a4d::create_table_longitudinal_data( + patient_data_files, + file.path(paths$output_root, "patient_data_cleaned"), + paths$tables, + "hba1c_updated", + "hba1c" + ) + }, + error = function(e) { + logError("Could not create table csv for longitudinal patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table csv for longitudinal patient data. 
Error: ", w$message) + } + ) + }, + output_root = paths$output_root +) - ParallelLogger::logInfo("Trying to link files for product and patient data.") +logfile <- "table_product_data" +with_file_logger(logfile, + { + tryCatch( + { + a4d::create_table_product_data(file.path(paths$output_root, "product_data_cleaned"), paths$tables) + }, + error = function(e) { + logError("Could not create table for product data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table for product data. Warning: ", w$message) + } + ) + }, + output_root = paths$output_root +) - logfile <- "link_product_patient_data" +logInfo("Finish creating table files.") - with_file_logger(logfile, - { - tryCatch( - { - link_product_patient( - file.path(paths$tables, "product_data.parquet"), - file.path(paths$tables, "patient_data_monthly.parquet") - ) - }, - error = function(e) { - ParallelLogger::logError("Could not link files for product and patient data. Error: ", e$message) - }, - warning = function(w) { - ParallelLogger::logWarn("Could not link files for product and patient data. Warning: ", w$message) - } - ) - }, - output_root = paths$output_root - ) +logInfo("Trying to link files for product and patient data.") - ParallelLogger::logInfo("Finished linking files for product and patient data.") -} +logfile <- "link_product_patient_data" + +with_file_logger(logfile, + { + tryCatch( + { + a4d::link_product_patient( + file.path(paths$tables, "product_data.parquet"), + file.path(paths$tables, "patient_data_monthly.parquet") + ) + }, + error = function(e) { + logError("Could not link files for product and patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not link files for product and patient data. Warning: ", w$message) + } + ) + }, + output_root = paths$output_root +) + +logInfo("Finished linking files for product and patient data.") -main() From bc15cd71ae345786e18105695cb0057a3be8807b Mon Sep 17 00:00:00 2001 From: Luke Boelling Date: Wed, 3 Jan 2024 11:23:32 +0100 Subject: [PATCH 5/8] Update Namespace Run Styler on helper Main and script 3 Export function to convert csv to parquet for static clinic data --- NAMESPACE | 6 + R/helper_main.R | 201 +++++++++++++------------- R/sysdata.rda | Bin 2763 -> 0 bytes scripts/run_script_3_create_tables.R | 207 +++++++++++++-------------- 4 files changed, 210 insertions(+), 204 deletions(-) delete mode 100644 R/sysdata.rda diff --git a/NAMESPACE b/NAMESPACE index 298a603..c69c3f1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,12 @@ export(calculate_most_frequent) export(convert_to) export(correct_decimal_sign) +export(create_table_longitudinal_data) +export(create_table_patient_data_monthly) +export(create_table_patient_data_static) +export(create_table_product_data) export(cut_numeric_value) +export(export_data_as_parquet) export(extract_patient_data) export(extract_unit_capacity) export(extract_year_from_age) @@ -21,6 +26,7 @@ export(get_tracker_year) export(harmonize_patient_data_columns) export(id_2_county_hospisal) export(init_paths) +export(link_product_patient) export(process_patient_data) export(process_patient_file) export(process_product_data) diff --git a/R/helper_main.R b/R/helper_main.R index 97bd8c0..1bb1bc6 100644 --- a/R/helper_main.R +++ b/R/helper_main.R @@ -14,36 +14,36 @@ #' @return A list with tracker_root_path and output_root path #' @export init_paths <- function(names, output_dir_name = "output", delete = FALSE) { - paths <- list() - tracker_root_path <- select_A4D_directory() - 
paths$tracker_root <- tracker_root_path - - output_root <- file.path( - tracker_root_path, - output_dir_name + paths <- list() + tracker_root_path <- select_A4D_directory() + paths$tracker_root <- tracker_root_path + + output_root <- file.path( + tracker_root_path, + output_dir_name + ) + + paths$output_root <- output_root + + for (name in names) { + subdir <- file.path( + tracker_root_path, + output_dir_name, + name ) - paths$output_root <- output_root - - for (name in names) { - subdir <- file.path( - tracker_root_path, - output_dir_name, - name - ) - - if (fs::dir_exists(subdir)) { - if (delete) { - fs::dir_delete(subdir) - } - } + if (fs::dir_exists(subdir)) { + if (delete) { + fs::dir_delete(subdir) + } + } - fs::dir_create(subdir) + fs::dir_create(subdir) - paths[[name]] <- subdir - } + paths[[name]] <- subdir + } - paths + paths } @@ -60,9 +60,9 @@ init_paths <- function(names, output_dir_name = "output", delete = FALSE) { #' @return A vector with file names. #' @export get_files <- function(tracker_root, pattern = "\\.xlsx$") { - tracker_files <- list.files(path = tracker_root, recursive = T, pattern = pattern) - tracker_files <- - tracker_files[stringr::str_detect(tracker_files, "~", negate = T)] + tracker_files <- list.files(path = tracker_root, recursive = T, pattern = pattern) + tracker_files <- + tracker_files[stringr::str_detect(tracker_files, "~", negate = T)] } @@ -74,14 +74,14 @@ get_files <- function(tracker_root, pattern = "\\.xlsx$") { #' @return A list with both patient and product data synonyms as tibble. #' @export get_synonyms <- function() { - ## Extract synonyms for products and patients - ## If you encounter new columns, just add the synonyms to these YAML files - synonyms_patient <- - read_column_synonyms(synonym_file = "synonyms_patient.yaml") - synonyms_product <- - read_column_synonyms(synonym_file = "synonyms_product.yaml") - - list(patient = synonyms_patient, product = synonyms_product) + ## Extract synonyms for products and patients + ## If you encounter new columns, just add the synonyms to these YAML files + synonyms_patient <- + read_column_synonyms(synonym_file = "synonyms_patient.yaml") + synonyms_product <- + read_column_synonyms(synonym_file = "synonyms_product.yaml") + + list(patient = synonyms_patient, product = synonyms_product) } @@ -104,21 +104,21 @@ get_synonyms <- function() { #' read_column_synonyms(synonym_file = "synonyms_product.yaml") #' } read_column_synonyms <- function(synonym_file, path_prefixes = c("reference_data", "synonyms")) { - path <- do.call(file.path, as.list(c(path_prefixes, synonym_file))) - columns_synonyms <- - yaml::read_yaml(path) %>% - unlist() %>% - as.data.frame() %>% - tibble::rownames_to_column() %>% - # remove digits that were created when converting to data frame - dplyr::mutate( - rowname = stringr::str_replace(rowname, pattern = "[:digit:]$", "") - ) %>% - dplyr::rename( - "variable_name" = "rowname", - "tracker_name" = "." - ) %>% - dplyr::as_tibble() + path <- do.call(file.path, as.list(c(path_prefixes, synonym_file))) + columns_synonyms <- + yaml::read_yaml(path) %>% + unlist() %>% + as.data.frame() %>% + tibble::rownames_to_column() %>% + # remove digits that were created when converting to data frame + dplyr::mutate( + rowname = stringr::str_replace(rowname, pattern = "[:digit:]$", "") + ) %>% + dplyr::rename( + "variable_name" = "rowname", + "tracker_name" = "." 
+ ) %>% + dplyr::as_tibble() } @@ -132,31 +132,31 @@ read_column_synonyms <- function(synonym_file, path_prefixes = c("reference_data #' @examples #' \dontrun{ #' export_data( -#' data = df_raw_product, -#' filename = tracker_name, -#' output_root = output_root, -#' suffix = "_product_data" +#' data = df_raw_product, +#' filename = tracker_name, +#' output_root = output_root, +#' suffix = "_product_data" #' ) #' } export_data <- function(data, filename, output_root, suffix) { - logDebug("Start export_data. Suffix = ", suffix, ".") - data %>% - write.csv( - file = - file.path( - output_root, - paste0( - filename, - suffix, - ".csv" - ) - ), - row.names = F, - na = "", - fileEncoding = "UTF-16LE", - quote = T - ) - logInfo("Finish export_data. Suffix = ", suffix, ".") + logDebug("Start export_data. Suffix = ", suffix, ".") + data %>% + write.csv( + file = + file.path( + output_root, + paste0( + filename, + suffix, + ".csv" + ) + ), + row.names = F, + na = "", + fileEncoding = "UTF-16LE", + quote = T + ) + logInfo("Finish export_data. Suffix = ", suffix, ".") } @@ -170,19 +170,20 @@ export_data <- function(data, filename, output_root, suffix) { #' @examples #' \dontrun{ #' export_data( -#' data = df_raw_product, -#' filename = tracker_name, -#' output_root = output_root, -#' suffix = "_product_data" +#' data = df_raw_product, +#' filename = tracker_name, +#' output_root = output_root, +#' suffix = "_product_data" #' ) #' } +#' @export export_data_as_parquet <- function(data, filename, output_root, suffix) { - logDebug("Start export_data. Suffix = ", suffix, ".") - data %>% - arrow::write_parquet( - sink = file.path(output_root, paste0(filename, suffix, ".parquet")), - ) - logInfo("Finish export_data. Suffix = ", suffix, ".") + logDebug("Start export_data. Suffix = ", suffix, ".") + data %>% + arrow::write_parquet( + sink = file.path(output_root, paste0(filename, suffix, ".parquet")), + ) + logInfo("Finish export_data. Suffix = ", suffix, ".") } @@ -193,20 +194,20 @@ export_data_as_parquet <- function(data, filename, output_root, suffix) { #' #' @return tibble with patient data read_raw_csv <- function(file) { - logDebug("Start reading data with read_csv.") - df_patient_raw <- readr::read_csv( - file, - name_repair = "check_unique", - progress = FALSE, - show_col_types = FALSE, - col_types = readr::cols(.default = "c"), - locale = readr::locale(encoding = "UTF-16LE") - ) - logDebug("Finished loading data with read_csv.") - logInfo("Dim: ", dim(df_patient_raw)) - logInfo("Columns: ", spec(df_patient_raw)) - - df_patient_raw + logDebug("Start reading data with read_csv.") + df_patient_raw <- readr::read_csv( + file, + name_repair = "check_unique", + progress = FALSE, + show_col_types = FALSE, + col_types = readr::cols(.default = "c"), + locale = readr::locale(encoding = "UTF-16LE") + ) + logDebug("Finished loading data with read_csv.") + logInfo("Dim: ", dim(df_patient_raw)) + logInfo("Columns: ", spec(df_patient_raw)) + + df_patient_raw } @@ -218,7 +219,7 @@ read_raw_csv <- function(file) { #' @return A named character vector with all allowed provinces. 
#' @export get_allowed_provinces <- function() { - ## Should new countries and provinces be added, update the YAML file - provinces <- yaml::read_yaml("reference_data/provinces/allowed_provinces.yaml") |> unlist() - return(provinces) + ## Should new countries and provinces be added, update the YAML file + provinces <- yaml::read_yaml("reference_data/provinces/allowed_provinces.yaml") |> unlist() + return(provinces) } diff --git a/R/sysdata.rda b/R/sysdata.rda deleted file mode 100644 index be6b002cff04d1329d0fdca86a74da91bb6a0978..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2763 zcmZvZXFL=P0Edl`z3FVx*|U(n#~G27vz=8qWIH($_nM)*iwntnMqPF~cV=c}WM@a> zIHNM+$avT9{nr1(^WpjS{QjaD7peXo?RNffh+r>ouRjQFl8)RWGUm^TxX>E)oEpAv zyotPpI}vxbV*CWU^u+rcdPT8Lzw}2gHbo?zhCWO-wr4|3`n{N>F0`MK+jEmvoc#3B^rJA*WE@xA6)^MT}yS@3*CGz z%tbIiafjBtO%qjB%^X77m_y|%6lU?Qlqs+AaHsX7!R>Il^2Q{}o+97_g5Nt-rZAJN zA@*_7=0#RYsnzf^X06S{{mD2IB0)lQ&(NZ`qs7?o7N6Gx>X3dqM)2VYXeG*+)cES> zso09*ktqDayTX*unlJZ8XPG^|_BCOC6gC$wA4M5DaZ=ZGNalHs4KP(S&Jino~!&?RB@TR zLJbIZi|Mj}zGt^bewx>xh<6A)r3lWkv6;@bHn7_+NWFatnA67v2R8qrnJn*;a@hmQ z#82^^)Yn1RTjo6^c+&@hgx-{Ke6y@C+V_m(^E@**e>A=c^CV*jA9D|t;y*XaEX%p|C|wei!)R&|&8jD^&D z;cGo@Ln}?FZkA-It5uii8U_i{q$C4LD)g0%g0pOAg z!#!>2YGPwXiO9303bL+>oYihqOssUOZ7#QMncQE2Y1>@HQrr0LiH|yIoqiiGP0fTb zV1_Ar7n@eSQ&aZKu4kJBNWZm39;|w2`?L30%Bg?BNxLe|_$XT;43X=_kFj=1KN&9g zM>Q?_Rd(3ZO2O_q!j8+v7yatjf!hA{4<+09w=M!%QOY$loI2p(*ozi9n$MKi43X?# zy;4%8SG41IuPH<#lbaueiI1yW+c@Zu6N`#sx0A~w364B#f4H8GmQlkxd7(I$(##{O z4J#G7Y82DpRi8cU?cO>a89yG>_k%-PtZ! z%t?SB&ye-qcQbsB<#)eo@%Z|5)@CC@~X6GcEb7^|MwN_`A16VbW!4cQ; z_i-?I@%hiO?{Kj-K3XAf9+MHYwy-&;@$S1{b5!B_0qLoM(+q8NzRB+Q zh0iC6QVY$S43D7MF#@$Gw{LjF{^Fs>2Zx+}I}nu=xUM`RaFS4pFHb;uPc7>C5<=ox z**UK(9VY$SpOHs^$3he%3(5ExQ*a_2Q3BqV5u!4St#Bk#`AhfRCH);z$3W(MPEbN^ zYZF^x9U})lUUSdli>?#tmn)>}=SkFJM!qWWGBxpFcBHSq)GV~VZ1(1p`oo$b@_`ewd>k|F8`>kTKTZKbIUESe0tk!#0QHXB3y#3U%+xsalZt# zCM21+F7>}Qs`zezM(UXsVE`{IuU;vzzVBfZV$clVb5R+!25GjPU8HYWS_=-SRuc!- zp-!Do$;_NTkUXeflc+7qQX*!gD1MY~b-M;Rza&WWL84u2d=0g)6R0x^rVoTG;*Uoq zLyP-tJw%8~-42J1T?v<$G}xCuQU8dOd8nPU^=?o^43WhSipj5Z-54ZIiI{?3F3Z^3 zKW4D}F~r;UVmVRq2b^v>kaN1(uICT5^;_WTq{!3W4aEt74_YhDs>Nc(sWuPoU64j1 zdW=ur2$Kf#`;nJtt<0I!+uOy$rr=x7mB&=_(OEUZM^ilWIBc*Ye%lkX`uR%I2We^7cSV zelGfyMHQWA~CP{kzx+K;;_ zq8W>wfsX1SVyDVWnANQVKC34rfx=SCMV`kM9KXqV1Wf4bW)7_rM(7>Ub(_oQ5&m@G zYLRQX(~9(V49Z%}YvYCFiQ|v$NF{`OJF_n9LKAF)_)qZwW-UGzc?`I=)~>^dmsuN3 z!I|)T&ZXPm;c>gkVyeR1V~d=-|t`rY2fku z4M^#ot~W7qOyQ=GYUx2@{I!4!TQQq4q)D~a;S8#gQOu@Ll|tsOJF667%c z{>CiFtATn`!qq#P-ekZ)AK35x8`V2kyl-F4HE`0^jhptL%=h)#>{HJPx;$qTMK^QX z=j+d&V4e%Z>U{m+kvdP$j_yYY1QXwD@N@?aa8d=A1&XwO?QJiN+Q2yUmTLW{Sgiv* z?P*a0n2GjMdZ3yzE)n#v;zs3TCb~<3Ks6Oy8py^vz{~z$)mh31#4FyeBp@ovuF6V!aK#K$UwgQSy<`8rp`*a{~@29JxOw^nyeJgdr~DaXV8!a)$#Zz@4J`_uw-fGz0v;{ z=RBLNR13HpCF_PL_U>qZlt$+}_FwNKmevb%chGOJ>*-$7={V_JtV3A6es+BHVp?AW zI7@q;6}vON*R?0rrlBQOP676YQPu44W-t^CvSP)#U!w`|hSnusUZx35d2hy32SPv(|RZ4%|P= J`6@|8^%oJ1MwkEq diff --git a/scripts/run_script_3_create_tables.R b/scripts/run_script_3_create_tables.R index 4ffebbc..995c0bd 100644 --- a/scripts/run_script_3_create_tables.R +++ b/scripts/run_script_3_create_tables.R @@ -11,117 +11,117 @@ patient_data_files <- a4d::get_files(file.path(paths$output_root, "patient_data_ product_data_files <- a4d::get_files(file.path(paths$output_root, "product_data_cleaned"), pattern = "\\.parquet$") logInfo( - "Found ", - length(patient_data_files), - " patient csv files under ", - paths$tracker_root, - "." 
+ "Found ", + length(patient_data_files), + " patient csv files under ", + paths$tracker_root, + "." ) logInfo( - "Found ", - length(product_data_files), - " product csv files under ", - paths$tracker_root, - "." + "Found ", + length(product_data_files), + " product csv files under ", + paths$tracker_root, + "." ) logInfo("Start creating table csv files.") logfile <- "table_patient_data_static" with_file_logger(logfile, - { - tryCatch( - { - a4d::create_table_patient_data_static(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) - }, - error = function(e) { - logError("Could not create table csv for static patient data. Error: ", e$message) - }, - warning = function(w) { - logWarn("Could not create table csv for static patient data. Error: ", w$message) - } - ) - }, - output_root = paths$output_root + { + tryCatch( + { + a4d::create_table_patient_data_static(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) + }, + error = function(e) { + logError("Could not create table csv for static patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table csv for static patient data. Error: ", w$message) + } + ) + }, + output_root = paths$output_root ) logfile <- "table_patient_data_monthly" with_file_logger(logfile, - { - tryCatch( - { - a4d::create_table_patient_data_monthly(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) - }, - error = function(e) { - logError("Could not create table csv for monthly patient data. Error: ", e$message) - }, - warning = function(w) { - logWarn("Could not create table csv for monthly patient data. Error: ", w$message) - } - ) - }, - output_root = paths$output_root + { + tryCatch( + { + a4d::create_table_patient_data_monthly(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) + }, + error = function(e) { + logError("Could not create table csv for monthly patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table csv for monthly patient data. Error: ", w$message) + } + ) + }, + output_root = paths$output_root ) logfile <- "table_longitudinal_data_hba1c" with_file_logger(logfile, - { - tryCatch( - { - a4d::create_table_longitudinal_data( - patient_data_files, - file.path(paths$output_root, "patient_data_cleaned"), - paths$tables, - "hba1c_updated", - "hba1c" - ) - }, - error = function(e) { - logError("Could not create table csv for longitudinal patient data. Error: ", e$message) - }, - warning = function(w) { - logWarn("Could not create table csv for longitudinal patient data. Error: ", w$message) - } + { + tryCatch( + { + a4d::create_table_longitudinal_data( + patient_data_files, + file.path(paths$output_root, "patient_data_cleaned"), + paths$tables, + "hba1c_updated", + "hba1c" ) - }, - output_root = paths$output_root + }, + error = function(e) { + logError("Could not create table csv for longitudinal patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table csv for longitudinal patient data. Error: ", w$message) + } + ) + }, + output_root = paths$output_root ) - logfile <- "clinic_data_static" - with_file_logger(logfile, - { - tryCatch( - { - a4d::export_data_as_parquet(data = read.csv("reference_data/clinic_data_static.csv"),filename = "clinic_data_static",output_root = paths$tables,suffix="") - }, - error = function(e) { - logError("Could not create clinic data static table. 
Error: ", e$message) - }, - warning = function(w) { - logWarn("Could not create clinic data static table. Warning: ", w$message) - } - ) - }, - output_root = paths$output_root +logfile <- "clinic_data_static" +with_file_logger(logfile, + { + tryCatch( + { + a4d::export_data_as_parquet(data = read.csv("reference_data/clinic_data_static.csv"), filename = "clinic_data_static", output_root = paths$tables, suffix = "") + }, + error = function(e) { + logError("Could not create clinic data static table. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create clinic data static table. Warning: ", w$message) + } ) + }, + output_root = paths$output_root +) logfile <- "table_product_data" with_file_logger(logfile, - { - tryCatch( - { - a4d::create_table_product_data(file.path(paths$output_root, "product_data_cleaned"), paths$tables) - }, - error = function(e) { - logError("Could not create table for product data. Error: ", e$message) - }, - warning = function(w) { - logWarn("Could not create table for product data. Warning: ", w$message) - } - ) - }, - output_root = paths$output_root + { + tryCatch( + { + a4d::create_table_product_data(file.path(paths$output_root, "product_data_cleaned"), paths$tables) + }, + error = function(e) { + logError("Could not create table for product data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table for product data. Warning: ", w$message) + } + ) + }, + output_root = paths$output_root ) logInfo("Finish creating table files.") @@ -131,24 +131,23 @@ logInfo("Trying to link files for product and patient data.") logfile <- "link_product_patient_data" with_file_logger(logfile, - { - tryCatch( - { - a4d::link_product_patient( - file.path(paths$tables, "product_data.parquet"), - file.path(paths$tables, "patient_data_monthly.parquet") - ) - }, - error = function(e) { - logError("Could not link files for product and patient data. Error: ", e$message) - }, - warning = function(w) { - logWarn("Could not link files for product and patient data. Warning: ", w$message) - } + { + tryCatch( + { + a4d::link_product_patient( + file.path(paths$tables, "product_data.parquet"), + file.path(paths$tables, "patient_data_monthly.parquet") ) - }, - output_root = paths$output_root + }, + error = function(e) { + logError("Could not link files for product and patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not link files for product and patient data. 
Warning: ", w$message) + } + ) + }, + output_root = paths$output_root ) logInfo("Finished linking files for product and patient data.") - From 5bf846bd2c0932ce6ef9d4b1ee0a83388ead352f Mon Sep 17 00:00:00 2001 From: Luke Boelling Date: Wed, 3 Jan 2024 11:27:25 +0100 Subject: [PATCH 6/8] restyle with 4 indent --- R/helper_main.R | 200 +++++++++++++------------- scripts/run_pipeline.R | 1 - scripts/run_script_3_create_tables.R | 202 +++++++++++++-------------- 3 files changed, 201 insertions(+), 202 deletions(-) diff --git a/R/helper_main.R b/R/helper_main.R index 1bb1bc6..01c5810 100644 --- a/R/helper_main.R +++ b/R/helper_main.R @@ -14,36 +14,36 @@ #' @return A list with tracker_root_path and output_root path #' @export init_paths <- function(names, output_dir_name = "output", delete = FALSE) { - paths <- list() - tracker_root_path <- select_A4D_directory() - paths$tracker_root <- tracker_root_path - - output_root <- file.path( - tracker_root_path, - output_dir_name - ) - - paths$output_root <- output_root - - for (name in names) { - subdir <- file.path( - tracker_root_path, - output_dir_name, - name + paths <- list() + tracker_root_path <- select_A4D_directory() + paths$tracker_root <- tracker_root_path + + output_root <- file.path( + tracker_root_path, + output_dir_name ) - if (fs::dir_exists(subdir)) { - if (delete) { - fs::dir_delete(subdir) - } - } + paths$output_root <- output_root + + for (name in names) { + subdir <- file.path( + tracker_root_path, + output_dir_name, + name + ) - fs::dir_create(subdir) + if (fs::dir_exists(subdir)) { + if (delete) { + fs::dir_delete(subdir) + } + } - paths[[name]] <- subdir - } + fs::dir_create(subdir) + + paths[[name]] <- subdir + } - paths + paths } @@ -60,9 +60,9 @@ init_paths <- function(names, output_dir_name = "output", delete = FALSE) { #' @return A vector with file names. #' @export get_files <- function(tracker_root, pattern = "\\.xlsx$") { - tracker_files <- list.files(path = tracker_root, recursive = T, pattern = pattern) - tracker_files <- - tracker_files[stringr::str_detect(tracker_files, "~", negate = T)] + tracker_files <- list.files(path = tracker_root, recursive = T, pattern = pattern) + tracker_files <- + tracker_files[stringr::str_detect(tracker_files, "~", negate = T)] } @@ -74,14 +74,14 @@ get_files <- function(tracker_root, pattern = "\\.xlsx$") { #' @return A list with both patient and product data synonyms as tibble. 
 #' @export
 get_synonyms <- function() {
-  ## Extract synonyms for products and patients
-  ## If you encounter new columns, just add the synonyms to these YAML files
-  synonyms_patient <-
-    read_column_synonyms(synonym_file = "synonyms_patient.yaml")
-  synonyms_product <-
-    read_column_synonyms(synonym_file = "synonyms_product.yaml")
-
-  list(patient = synonyms_patient, product = synonyms_product)
+    ## Extract synonyms for products and patients
+    ## If you encounter new columns, just add the synonyms to these YAML files
+    synonyms_patient <-
+        read_column_synonyms(synonym_file = "synonyms_patient.yaml")
+    synonyms_product <-
+        read_column_synonyms(synonym_file = "synonyms_product.yaml")
+
+    list(patient = synonyms_patient, product = synonyms_product)
 }
 
 
@@ -104,21 +104,21 @@ get_synonyms <- function() {
 #' read_column_synonyms(synonym_file = "synonyms_product.yaml")
 #' }
 read_column_synonyms <- function(synonym_file, path_prefixes = c("reference_data", "synonyms")) {
-  path <- do.call(file.path, as.list(c(path_prefixes, synonym_file)))
-  columns_synonyms <-
-    yaml::read_yaml(path) %>%
-    unlist() %>%
-    as.data.frame() %>%
-    tibble::rownames_to_column() %>%
-    # remove digits that were created when converting to data frame
-    dplyr::mutate(
-      rowname = stringr::str_replace(rowname, pattern = "[:digit:]$", "")
-    ) %>%
-    dplyr::rename(
-      "variable_name" = "rowname",
-      "tracker_name" = "."
-    ) %>%
-    dplyr::as_tibble()
+    path <- do.call(file.path, as.list(c(path_prefixes, synonym_file)))
+    columns_synonyms <-
+        yaml::read_yaml(path) %>%
+        unlist() %>%
+        as.data.frame() %>%
+        tibble::rownames_to_column() %>%
+        # remove digits that were created when converting to data frame
+        dplyr::mutate(
+            rowname = stringr::str_replace(rowname, pattern = "[:digit:]$", "")
+        ) %>%
+        dplyr::rename(
+            "variable_name" = "rowname",
+            "tracker_name" = "."
+        ) %>%
+        dplyr::as_tibble()
 }
 
 
@@ -132,31 +132,31 @@ read_column_synonyms <- function(synonym_file, path_prefixes = c("reference_data
 #' @examples
 #' \dontrun{
 #' export_data(
-#'   data = df_raw_product,
-#'   filename = tracker_name,
-#'   output_root = output_root,
-#'   suffix = "_product_data"
+#'     data = df_raw_product,
+#'     filename = tracker_name,
+#'     output_root = output_root,
+#'     suffix = "_product_data"
 #' )
 #' }
 export_data <- function(data, filename, output_root, suffix) {
-  logDebug("Start export_data. Suffix = ", suffix, ".")
-  data %>%
-    write.csv(
-      file =
-        file.path(
-          output_root,
-          paste0(
-            filename,
-            suffix,
-            ".csv"
-          )
-        ),
-      row.names = F,
-      na = "",
-      fileEncoding = "UTF-16LE",
-      quote = T
-    )
-  logInfo("Finish export_data. Suffix = ", suffix, ".")
+    logDebug("Start export_data. Suffix = ", suffix, ".")
+    data %>%
+        write.csv(
+            file =
+                file.path(
+                    output_root,
+                    paste0(
+                        filename,
+                        suffix,
+                        ".csv"
+                    )
+                ),
+            row.names = F,
+            na = "",
+            fileEncoding = "UTF-16LE",
+            quote = T
+        )
+    logInfo("Finish export_data. Suffix = ", suffix, ".")
 }
 
 
@@ -170,20 +170,20 @@ export_data <- function(data, filename, output_root, suffix) {
 #' @examples
 #' \dontrun{
 #' export_data(
-#'   data = df_raw_product,
-#'   filename = tracker_name,
-#'   output_root = output_root,
-#'   suffix = "_product_data"
+#'     data = df_raw_product,
+#'     filename = tracker_name,
+#'     output_root = output_root,
+#'     suffix = "_product_data"
 #' )
 #' }
 #' @export
 export_data_as_parquet <- function(data, filename, output_root, suffix) {
-  logDebug("Start export_data. Suffix = ", suffix, ".")
-  data %>%
-    arrow::write_parquet(
-      sink = file.path(output_root, paste0(filename, suffix, ".parquet")),
-    )
-  logInfo("Finish export_data. Suffix = ", suffix, ".")
+    logDebug("Start export_data. Suffix = ", suffix, ".")
+    data %>%
+        arrow::write_parquet(
+            sink = file.path(output_root, paste0(filename, suffix, ".parquet")),
+        )
+    logInfo("Finish export_data. Suffix = ", suffix, ".")
 }
 
 
@@ -194,20 +194,20 @@ export_data_as_parquet <- function(data, filename, output_root, suffix) {
 #'
 #' @return tibble with patient data
 read_raw_csv <- function(file) {
-  logDebug("Start reading data with read_csv.")
-  df_patient_raw <- readr::read_csv(
-    file,
-    name_repair = "check_unique",
-    progress = FALSE,
-    show_col_types = FALSE,
-    col_types = readr::cols(.default = "c"),
-    locale = readr::locale(encoding = "UTF-16LE")
-  )
-  logDebug("Finished loading data with read_csv.")
-  logInfo("Dim: ", dim(df_patient_raw))
-  logInfo("Columns: ", spec(df_patient_raw))
-
-  df_patient_raw
+    logDebug("Start reading data with read_csv.")
+    df_patient_raw <- readr::read_csv(
+        file,
+        name_repair = "check_unique",
+        progress = FALSE,
+        show_col_types = FALSE,
+        col_types = readr::cols(.default = "c"),
+        locale = readr::locale(encoding = "UTF-16LE")
+    )
+    logDebug("Finished loading data with read_csv.")
+    logInfo("Dim: ", dim(df_patient_raw))
+    logInfo("Columns: ", spec(df_patient_raw))
+
+    df_patient_raw
 }
 
 
@@ -219,7 +219,7 @@ read_raw_csv <- function(file) {
 #' @return A named character vector with all allowed provinces.
 #' @export
 get_allowed_provinces <- function() {
-  ## Should new countries and provinces be added, update the YAML file
-  provinces <- yaml::read_yaml("reference_data/provinces/allowed_provinces.yaml") |> unlist()
-  return(provinces)
+    ## Should new countries and provinces be added, update the YAML file
+    provinces <- yaml::read_yaml("reference_data/provinces/allowed_provinces.yaml") |> unlist()
+    return(provinces)
 }
 
diff --git a/scripts/run_pipeline.R b/scripts/run_pipeline.R
index fc338f3..2f29708 100644
--- a/scripts/run_pipeline.R
+++ b/scripts/run_pipeline.R
@@ -113,4 +113,3 @@ ingest_data(
     table = "clinic_data_static",
     source = file.path(table_dir, "clinic_data_static.parquet")
 )
-
diff --git a/scripts/run_script_3_create_tables.R b/scripts/run_script_3_create_tables.R
index 995c0bd..bfeae2e 100644
--- a/scripts/run_script_3_create_tables.R
+++ b/scripts/run_script_3_create_tables.R
@@ -11,117 +11,117 @@ patient_data_files <- a4d::get_files(file.path(paths$output_root, "patient_data_
 product_data_files <- a4d::get_files(file.path(paths$output_root, "product_data_cleaned"), pattern = "\\.parquet$")
 
 logInfo(
-  "Found ",
-  length(patient_data_files),
-  " patient parquet files under ",
-  paths$tracker_root,
-  "."
+    "Found ",
+    length(patient_data_files),
+    " patient parquet files under ",
+    paths$tracker_root,
+    "."
 )
 
 logInfo(
-  "Found ",
-  length(product_data_files),
-  " product parquet files under ",
-  paths$tracker_root,
-  "."
+    "Found ",
+    length(product_data_files),
+    " product parquet files under ",
+    paths$tracker_root,
+    "."
 )
 
 logInfo("Start creating table files.")
 
 logfile <- "table_patient_data_static"
 with_file_logger(logfile,
-  {
-    tryCatch(
-      {
-        a4d::create_table_patient_data_static(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables)
-      },
-      error = function(e) {
-        logError("Could not create table csv for static patient data. Error: ", e$message)
-      },
-      warning = function(w) {
-        logWarn("Could not create table csv for static patient data. Warning: ", w$message)
-      }
-    )
-  },
-  output_root = paths$output_root
Error: ", w$message) - } - ) - }, - output_root = paths$output_root + { + tryCatch( + { + a4d::create_table_patient_data_static(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) + }, + error = function(e) { + logError("Could not create table csv for static patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table csv for static patient data. Error: ", w$message) + } + ) + }, + output_root = paths$output_root ) logfile <- "table_patient_data_monthly" with_file_logger(logfile, - { - tryCatch( - { - a4d::create_table_patient_data_monthly(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) - }, - error = function(e) { - logError("Could not create table csv for monthly patient data. Error: ", e$message) - }, - warning = function(w) { - logWarn("Could not create table csv for monthly patient data. Error: ", w$message) - } - ) - }, - output_root = paths$output_root + { + tryCatch( + { + a4d::create_table_patient_data_monthly(patient_data_files, file.path(paths$output_root, "patient_data_cleaned"), paths$tables) + }, + error = function(e) { + logError("Could not create table csv for monthly patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table csv for monthly patient data. Error: ", w$message) + } + ) + }, + output_root = paths$output_root ) logfile <- "table_longitudinal_data_hba1c" with_file_logger(logfile, - { - tryCatch( - { - a4d::create_table_longitudinal_data( - patient_data_files, - file.path(paths$output_root, "patient_data_cleaned"), - paths$tables, - "hba1c_updated", - "hba1c" + { + tryCatch( + { + a4d::create_table_longitudinal_data( + patient_data_files, + file.path(paths$output_root, "patient_data_cleaned"), + paths$tables, + "hba1c_updated", + "hba1c" + ) + }, + error = function(e) { + logError("Could not create table csv for longitudinal patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table csv for longitudinal patient data. Error: ", w$message) + } ) - }, - error = function(e) { - logError("Could not create table csv for longitudinal patient data. Error: ", e$message) - }, - warning = function(w) { - logWarn("Could not create table csv for longitudinal patient data. Error: ", w$message) - } - ) - }, - output_root = paths$output_root + }, + output_root = paths$output_root ) logfile <- "clinic_data_static" with_file_logger(logfile, - { - tryCatch( - { - a4d::export_data_as_parquet(data = read.csv("reference_data/clinic_data_static.csv"), filename = "clinic_data_static", output_root = paths$tables, suffix = "") - }, - error = function(e) { - logError("Could not create clinic data static table. Error: ", e$message) - }, - warning = function(w) { - logWarn("Could not create clinic data static table. Warning: ", w$message) - } - ) - }, - output_root = paths$output_root + { + tryCatch( + { + a4d::export_data_as_parquet(data = read.csv("reference_data/clinic_data_static.csv"), filename = "clinic_data_static", output_root = paths$tables, suffix = "") + }, + error = function(e) { + logError("Could not create clinic data static table. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create clinic data static table. 
Warning: ", w$message) + } + ) + }, + output_root = paths$output_root ) logfile <- "table_product_data" with_file_logger(logfile, - { - tryCatch( - { - a4d::create_table_product_data(file.path(paths$output_root, "product_data_cleaned"), paths$tables) - }, - error = function(e) { - logError("Could not create table for product data. Error: ", e$message) - }, - warning = function(w) { - logWarn("Could not create table for product data. Warning: ", w$message) - } - ) - }, - output_root = paths$output_root + { + tryCatch( + { + a4d::create_table_product_data(file.path(paths$output_root, "product_data_cleaned"), paths$tables) + }, + error = function(e) { + logError("Could not create table for product data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not create table for product data. Warning: ", w$message) + } + ) + }, + output_root = paths$output_root ) logInfo("Finish creating table files.") @@ -131,23 +131,23 @@ logInfo("Trying to link files for product and patient data.") logfile <- "link_product_patient_data" with_file_logger(logfile, - { - tryCatch( - { - a4d::link_product_patient( - file.path(paths$tables, "product_data.parquet"), - file.path(paths$tables, "patient_data_monthly.parquet") + { + tryCatch( + { + a4d::link_product_patient( + file.path(paths$tables, "product_data.parquet"), + file.path(paths$tables, "patient_data_monthly.parquet") + ) + }, + error = function(e) { + logError("Could not link files for product and patient data. Error: ", e$message) + }, + warning = function(w) { + logWarn("Could not link files for product and patient data. Warning: ", w$message) + } ) - }, - error = function(e) { - logError("Could not link files for product and patient data. Error: ", e$message) - }, - warning = function(w) { - logWarn("Could not link files for product and patient data. Warning: ", w$message) - } - ) - }, - output_root = paths$output_root + }, + output_root = paths$output_root ) logInfo("Finished linking files for product and patient data.") From aea0eeca7b449ae11907913e2c95b1f645a6e18a Mon Sep 17 00:00:00 2001 From: Luke Boelling Date: Thu, 4 Jan 2024 11:48:41 +0100 Subject: [PATCH 7/8] adding pipe operator with usethis readd sysdata.rda to R afet failing to use lazyload in data. 
---
 DESCRIPTION    |   3 ++-
 NAMESPACE      |   2 ++
 R/sysdata.rda  | Bin 0 -> 2763 bytes
 R/utils-pipe.R |  14 ++++++++++++++
 man/pipe.Rd    |  20 ++++++++++++++++++++
 5 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 R/sysdata.rda
 create mode 100644 R/utils-pipe.R
 create mode 100644 man/pipe.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 1ad85e1..bac3470 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -28,7 +28,8 @@ Imports:
     arrow,
     furrr,
     progressr,
-    tictoc
+    tictoc,
+    magrittr
 Suggests:
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
index c69c3f1..d7328b3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export("%>%")
 export(calculate_most_frequent)
 export(convert_to)
 export(correct_decimal_sign)
@@ -42,3 +43,4 @@ export(setup_file_logger)
 export(setup_logger)
 export(with_file_logger)
 importFrom(data.table,"%like%")
+importFrom(magrittr,"%>%")
diff --git a/R/sysdata.rda b/R/sysdata.rda
new file mode 100644
index 0000000000000000000000000000000000000000..eb303cf804b77f1badf0b2d97168f0a3f6920e66
GIT binary patch
literal 2763
[base85-encoded binary payload (2763 bytes) omitted: garbled beyond recovery in this copy]

diff --git a/R/utils-pipe.R b/R/utils-pipe.R
new file mode 100644
--- /dev/null
+++ b/R/utils-pipe.R
@@ -0,0 +1,14 @@
+#' Pipe operator
+#'
+#' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
+#'
+#' @name %>%
+#' @rdname pipe
+#' @keywords internal
+#' @export
+#' @importFrom magrittr %>%
+#' @usage lhs \%>\% rhs
+#' @param lhs A value or the magrittr placeholder.
+#' @param rhs A function call using the magrittr semantics.
+#' @return The result of calling `rhs(lhs)`.
+NULL
diff --git a/man/pipe.Rd b/man/pipe.Rd
new file mode 100644
index 0000000..a648c29
--- /dev/null
+++ b/man/pipe.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-pipe.R
+\name{\%>\%}
+\alias{\%>\%}
+\title{Pipe operator}
+\usage{
+lhs \%>\% rhs
+}
+\arguments{
+\item{lhs}{A value or the magrittr placeholder.}
+
+\item{rhs}{A function call using the magrittr semantics.}
+}
+\value{
+The result of calling \code{rhs(lhs)}.
+}
+\description{
+See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
+}
+\keyword{internal}

From 3225c6349aa3cfb3dc456916afb8098da442d392 Mon Sep 17 00:00:00 2001
From: Michael Aydinbas
Date: Thu, 25 Jan 2024 20:09:23 +0100
Subject: [PATCH 8/8] add sysdata?
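
Note: R/sysdata.rda is the serialized store for internal (non-exported)
package data. A minimal sketch of how such a file is regenerated (the object
name below is hypothetical; the actual contents of sysdata.rda are not
visible in this patch):

    # hypothetical example: writes the named object(s) to R/sysdata.rda
    synonym_lookup <- data.frame(variable_name = "id", tracker_name = "ID")
    usethis::use_data(synonym_lookup, internal = TRUE, overwrite = TRUE)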
---
 R/sysdata.rda | Bin 2763 -> 2763 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/R/sysdata.rda b/R/sysdata.rda
index eb303cf804b77f1badf0b2d97168f0a3f6920e66..be6b002cff04d1329d0fdca86a74da91bb6a0978 100644
GIT binary patch
delta 19
XcmX>tdRmlAzMF#q445}^9pnN4Eu91g

delta 19
XcmX>tdRmlAzMF#q4A?eu9pnN4Evy6y
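
A closing observation on scripts/run_script_3_create_tables.R: the script
repeats the same with_file_logger()/tryCatch() guard around every step. A
possible follow-up refactor, sketched under the assumption that
with_file_logger() lazily evaluates its second argument in the calling frame
(as the usage above suggests); run_guarded_step is a hypothetical helper, not
part of these patches:

    run_guarded_step <- function(logfile, paths, expr, what) {
        with_file_logger(logfile,
            {
                tryCatch(
                    # `expr` is a promise; it is only forced here, inside the
                    # logging context, so its errors and warnings are captured
                    expr,
                    error = function(e) {
                        logError("Could not ", what, ". Error: ", e$message)
                    },
                    warning = function(w) {
                        logWarn("Could not ", what, ". Warning: ", w$message)
                    }
                )
            },
            output_root = paths$output_root
        )
    }

    # usage, mirroring one of the steps above:
    run_guarded_step(
        "table_product_data", paths,
        a4d::create_table_product_data(file.path(paths$output_root, "product_data_cleaned"), paths$tables),
        what = "create table for product data"
    )

Each script step then shrinks to a single call, keeping the log file name,
the step description, and the guarded expression together.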