From dc2c732608f4c7715498e8cf19e1c154d7c490eb Mon Sep 17 00:00:00 2001 From: Kamil Sijko Date: Wed, 8 May 2024 17:26:11 +0200 Subject: [PATCH] WIP: working code, pending tests for baseline logic due to lack of examples --- NAMESPACE | 1 + R/derive_blfl.R | 67 ++++++++---------- inst/derive_blfl/raw_dataset.csv | 2 +- man/derive_blfl.Rd | 112 +++++++++++++++++++++++++++++++ man/dtc_datepart.Rd | 36 ++++++++++ man/dtc_timepart.Rd | 55 +++++++++++++++ 6 files changed, 233 insertions(+), 40 deletions(-) create mode 100644 man/derive_blfl.Rd create mode 100644 man/dtc_datepart.Rd create mode 100644 man/dtc_timepart.Rd diff --git a/NAMESPACE b/NAMESPACE index 6170cee4..28a465fd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(create_iso8601) export(ct_map) export(ct_spec_example) export(ct_spec_vars) +export(derive_blfl) export(derive_study_day) export(fmt_cmp) export(hardcode_ct) diff --git a/R/derive_blfl.R b/R/derive_blfl.R index 1d14693a..c185e5fd 100644 --- a/R/derive_blfl.R +++ b/R/derive_blfl.R @@ -8,7 +8,6 @@ #' partial dates should be set to NA (default is `TRUE`). #' #' @return Character vector containing ISO8601 dates. -#' @export #' #' @examples #' ## Partial or missing dates set to NA by default @@ -53,7 +52,6 @@ dtc_datepart <- function(dtc, partial_as_na = TRUE) { #' seconds should be ignored (default is `TRUE`). #' #' @return Character vector containing ISO 8601 times. -#' @export #' #' @examples #' ## Partial or missing times set to NA and seconds ignored by default @@ -162,6 +160,18 @@ dtc_timepart <- function(dtc, partial_as_na = TRUE, ignore_seconds = TRUE) { #' observations in the study. Common choices for this reference variable #' include "RFSTDTC" (the date/time of the first study treatment) or #' "RFXSTDTC" (the date/time of the first exposure to the study drug). +#' @param baseline_visits A character vector specifying the baseline visits within the study. +#' These visits are identified as critical points for data collection at the start of the study, +#' before any intervention is applied. This parameter allows the function to filter and analyze +#' data specifically from these initial assessment points. For example, baseline visits might +#' include "Cycle 1 Day 1" if this is the first visit where subjects are assessed prior to receiving treatment. +#' @param baseline_timepoints A character vector of dates in "YYYY-MM-DD" format that specifies +#' the specific days during the baseline visits when key assessments or measurements were taken. +#' These timepoints are used to refine the selection of data points to include only those +#' collected on these specific dates, ensuring that only relevant baseline data is analyzed. +#' This is particularly important in studies where the timing of measurements can significantly +#' impact the interpretation of results. An example might be "2020-09-20", indicating a specific +#' day when baseline data was collected. #' #' @return Modified input data frame with baseline flag variable `--BLFL` or #' last observation before exposure flag `--LOBXFL` added. @@ -169,9 +179,9 @@ dtc_timepart <- function(dtc, partial_as_na = TRUE, ignore_seconds = TRUE) { #' @export #' #' @examples -#' DM <- read.csv(system.file("inst/derive_blfl/DM.csv", package = "sdtmoak")) +#' DM <- read.csv(system.file("derive_blfl/DM.csv", package = "sdtm.oak")) #' DM -#' raw_dataset <- read.csv(system.file("inst/derive_blfl/raw_dataset.csv", package = "sdtmoak")) +#' raw_dataset <- read.csv(system.file("derive_blfl/raw_dataset.csv", package = "sdtm.oak")) #' raw_dataset #' observed_output <- derive_blfl(raw_dataset = raw_dataset, #' DM_dataset = DM, @@ -194,7 +204,7 @@ derive_blfl <- function(raw_dataset, # Assert that the input dataset has a "DOMAIN" column checkmate::assert_names(names(raw_dataset), - must.include = "DOMAIN", + must.include = c("DOMAIN", sdtm.oak:::oak_id_vars()), .var.name = "Columns of 'raw_dataset'", add = assertion_collection) @@ -311,10 +321,10 @@ derive_blfl <- function(raw_dataset, # Split --DTC and reference_date_variable into date and time parts # (partial or missing dates and times set to NA) - ds_mod$dom_dt <- dtc_datepart(ds_mod[[domain_prefixed_names["dtc"]]]) - ds_mod$dom_tm <- dtc_timepart(ds_mod[[domain_prefixed_names["dtc"]]]) - ds_mod$ref_dt <- dtc_datepart(ds_mod[[reference_date_variable]]) - ds_mod$ref_tm <- dtc_timepart(ds_mod[[reference_date_variable]]) + ds_mod$dom_dt <- sdtm.oak:::dtc_datepart(ds_mod[[domain_prefixed_names["dtc"]]]) + ds_mod$dom_tm <- sdtm.oak:::dtc_timepart(ds_mod[[domain_prefixed_names["dtc"]]]) + ds_mod$ref_dt <- sdtm.oak:::dtc_datepart(ds_mod[[reference_date_variable]]) + ds_mod$ref_tm <- sdtm.oak:::dtc_timepart(ds_mod[[reference_date_variable]]) # If VISIT not in data frame then assign it as "" for processing @@ -322,32 +332,11 @@ derive_blfl <- function(raw_dataset, ds_mod[["VISIT"]] <- "" } - # Get a vector of baseline visits from Baseline column (if it exists) - # in oak_pkg_env$study_visit_configuration - # TODO: How this should work? Attached file doesn't contain this info - # if ("Baseline" %in% names(oak_pkg_env$study_visit_configuration)) { - # baseline_visits <- oak_pkg_env$study_visit_configuration |> - # dplyr::filter(toupper(Baseline) == "Y") |> - # dplyr::select(TV_Visit) |> - # dplyr::distinct() |> - # unlist() - # } - # If --TPT not in data frame then assign it as "" for processing if (!domain_prefixed_names["tpt"] %in% names(ds_mod)) { ds_mod[[domain_prefixed_names["tpt"]]] <- "" } - # Get a vector of baseline timepoints from Baseline column (if it exists) - # TODO: How this part should work? I need example data - # if ("Baseline" %in% names(oak_pkg_env$study_visit_configuration)) { - # baseline_timepoints <- oak_pkg_env$timepoint_conf |> - # dplyr::filter(Domain == domain & toupper(Baseline) == "Y") |> - # dplyr::select(TPT) |> - # dplyr::distinct() |> - # unlist() - # } - # Filter on rows that have domain and reference dates not equal to NA ds_subset <- dplyr::filter(ds_mod, !is.na(dom_dt) & !is.na(ref_dt)) @@ -368,11 +357,13 @@ derive_blfl <- function(raw_dataset, # - VISIT is in baseline visits list and # - xxTPT is in baseline timepoints list # (*C) - ds_subset_eq_2 <- dplyr::filter(ds_subset, dom_dt == ref_dt, - is.na(dom_tm) | is.na(ref_tm), - (VISIT %in% baseline_visits & get(domain_prefixed_names["tpt"]) %in% baseline_timepoints) | - (VISIT %in% baseline_visits & length(baseline_timepoints) == 0) | - (get(domain_prefixed_names["tpt"]) %in% baseline_timepoints & length(baseline_visits) == 0)) + ds_subset_eq_2 <- + ds_subset |> + dplyr::filter(dom_dt == ref_dt, + is.na(dom_tm) | is.na(ref_tm), + (VISIT %in% baseline_visits & get(domain_prefixed_names["tpt"]) %in% baseline_timepoints) | + (VISIT %in% baseline_visits & length(baseline_timepoints) == 0) | + (get(domain_prefixed_names["tpt"]) %in% baseline_timepoints & length(baseline_visits) == 0)) # Combine (*A) and (*B) and (*C) ds_base <- rbind(ds_subset_lt, ds_subset_eq_1, ds_subset_eq_2) @@ -392,9 +383,7 @@ derive_blfl <- function(raw_dataset, dplyr::group_by(USUBJID, .data[[domain_prefixed_names["testcd"]]]) |> dplyr::slice_max(!!rlang::sym(domain_prefixed_names["dtc"]), na_rm = TRUE) |> dplyr::ungroup() |> - # OAK.id.vars already there - # TODO: this is not true - dplyr::select(dplyr::all_of(c(oak_id_vars(), domain_prefixed_names["testcd"])), + dplyr::select(dplyr::all_of(c(sdtm.oak:::oak_id_vars(), domain_prefixed_names["testcd"])), dplyr::any_of( c(domain_prefixed_names[c("cat", "scat", @@ -414,7 +403,7 @@ derive_blfl <- function(raw_dataset, ds_blfl[[target_sdtm_variable]] <- "Y" # Join baseline flag onto input dataset - ds_out <- dplyr::left_join(raw_dataset, ds_blfl) + ds_out <- dplyr::left_join(raw_dataset, ds_blfl, by = sdtm.oak:::oak_id_vars()) # Assert that merged data frame has same number of rows as input data frame if (nrow(ds_out) != nrow(raw_dataset)) { diff --git a/inst/derive_blfl/raw_dataset.csv b/inst/derive_blfl/raw_dataset.csv index d7c07465..c03e81e7 100644 --- a/inst/derive_blfl/raw_dataset.csv +++ b/inst/derive_blfl/raw_dataset.csv @@ -1,4 +1,4 @@ -"DOMAIN","OAK_ID","RAVE_SOURCE","PATIENT_NUM","USUBJID","VSDTC","VSTESTCD","VSORRES","VSSTAT" +"DOMAIN","oak_id","raw_source","patient_number","USUBJID","VSDTC","VSTESTCD","VSORRES","VSSTAT" "VS",1,"VTLS1","375","GA41070-375","2020-09-01T13:31","DIABP","ND",NA "VS",1,"VTLS1","375","GA41070-375","2020-09-01T13:31","PULSE","ND",NA "VS",1,"VTLS2","375","GA41070-375","2020-09-28T11:00","DIABP","ND",NA diff --git a/man/derive_blfl.Rd b/man/derive_blfl.Rd new file mode 100644 index 00000000..5e5fa3cf --- /dev/null +++ b/man/derive_blfl.Rd @@ -0,0 +1,112 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/derive_blfl.R +\name{derive_blfl} +\alias{derive_blfl} +\title{Derive Baseline Flag or Last Observation Before Exposure Flag} +\usage{ +derive_blfl( + raw_dataset, + DM_dataset, + target_sdtm_variable, + reference_date_variable, + baseline_visits = character(), + baseline_timepoints = character() +) +} +\arguments{ +\item{raw_dataset}{Input data frame.} + +\item{target_sdtm_variable}{Name of variable to be derived (\code{--BLFL} or +\code{--LOBXFL} where \verb{--} is domain).} + +\item{reference_date_variable}{vector of a date/time from the +Demographics (DM) dataset, which serves as a point of comparison for other +observations in the study. Common choices for this reference variable +include "RFSTDTC" (the date/time of the first study treatment) or +"RFXSTDTC" (the date/time of the first exposure to the study drug).} + +\item{baseline_visits}{A character vector specifying the baseline visits within the study. +These visits are identified as critical points for data collection at the start of the study, +before any intervention is applied. This parameter allows the function to filter and analyze +data specifically from these initial assessment points. For example, baseline visits might +include "Cycle 1 Day 1" if this is the first visit where subjects are assessed prior to receiving treatment.} + +\item{baseline_timepoints}{A character vector of dates in "YYYY-MM-DD" format that specifies +the specific days during the baseline visits when key assessments or measurements were taken. +These timepoints are used to refine the selection of data points to include only those +collected on these specific dates, ensuring that only relevant baseline data is analyzed. +This is particularly important in studies where the timing of measurements can significantly +impact the interpretation of results. An example might be "2020-09-20", indicating a specific +day when baseline data was collected.} +} +\value{ +Modified input data frame with baseline flag variable \code{--BLFL} or +last observation before exposure flag \code{--LOBXFL} added. +} +\description{ +Derive the baseline flag variable (\code{--BLFL}) or the last observation before +exposure flag (\code{--LOBXFL}), from the observation date/time (\code{--DTC}), and a +DM domain reference date/time. +} +\details{ +The methodology and approach implemented in this function are based on +concepts and examples found in the Roche version of the {roak} package. + +The derivation is as follows: +\itemize{ +\item Remove records where the result (\code{--ORRES}) is missing. Also, exclude records +with results labeled as "ND" (No Data) or "NOT DONE" in the \code{--ORRES} column, +which indicate that the measurement or observation was not completed. This +step is important even if a previous cleaning step (like the +'oak_clean_not_done' function) might not have been applied to the data yet. +\item Remove records where the status (\code{--STAT}) indicates the observation or test +was not performed, marked as "NOT DONE". +\item Divide the date and time column (\code{--DTC}) and the reference date/time +variable (\code{reference_date_variable}) into separate date and time components. Ignore +any seconds recorded in the time component, focusing only on hours and +minutes for further calculations. +\item Set partial or missing dates to \code{NA}. +\item Set partial or missing times to \code{NA}. +\item Get a list of baseline visits from \verb{Baseline column} +(if it exists) in \code{oak_pkg_env$study_visit_configuration}. +\item Get a list of baseline timepoints from \code{Baseline} column +(if it exists) in \code{oak_pkg_env$timepoint_conf}. +\item Filter on rows that have domain and reference dates not equal to +\code{NA}. (Ref: \strong{X}) +\item Filter \strong{X} on rows with domain date prior to (less than) +reference date. (Ref: \strong{A}) +\item Filter \strong{X} on rows with domain date equal to reference date but +domain and reference times not equal to \code{NA} and domain time prior to (less +than) reference time. (Ref: \strong{B}) +\item Filter \strong{X} on rows with domain date equal to reference date but +domain and/or reference time equal to NA and: +\itemize{ +\item VISIT is in baseline visits list (if it exists) and +\item xxTPT is in baseline timepoints list (if it exists). +} +} + +(Ref: \strong{C}) +\itemize{ +\item Combine the rows from \strong{A}, \strong{B}, and \strong{C} to get a +data frame of pre-reference date observations. Sort the rows by \code{USUBJID}, +\code{--STAT}, and \code{--ORRES}. +\item Group by \code{USUBJID} and \code{--TESTCD} and filter on the rows +that have maximum value from \code{--DTC}. Keep only the oak id variables and +\code{--TESTCD} (because these are the unique values). Remove any duplicate rows. +Assign the baseline flag variable, \code{--BLFL}, the last observation before +exposure flag (\code{--LOBXFL}) variable to these rows. +\item Join the baseline flag onto the input dataset. +} +} +\examples{ +DM <- read.csv(system.file("derive_blfl/DM.csv", package = "sdtm.oak")) +DM +raw_dataset <- read.csv(system.file("derive_blfl/raw_dataset.csv", package = "sdtm.oak")) +raw_dataset +observed_output <- derive_blfl(raw_dataset = raw_dataset, + DM_dataset = DM, + target_sdtm_variable = "VSBLFL", + reference_date_variable = "RFSTDTC") +observed_output +} diff --git a/man/dtc_datepart.Rd b/man/dtc_datepart.Rd new file mode 100644 index 00000000..543d6ff1 --- /dev/null +++ b/man/dtc_datepart.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/derive_blfl.R +\name{dtc_datepart} +\alias{dtc_datepart} +\title{Extract date part from ISO8601 date/time variable} +\usage{ +dtc_datepart(dtc, partial_as_na = TRUE) +} +\arguments{ +\item{dtc}{Character vector containing ISO8601 date/times.} + +\item{partial_as_na}{Logical \code{TRUE} or \code{FALSE} indicating whether +partial dates should be set to NA (default is \code{TRUE}).} +} +\value{ +Character vector containing ISO8601 dates. +} +\description{ +The date part is extracted from an ISO8601 date/time variable. +By default, partial or missing dates are set to NA. +} +\examples{ +## Partial or missing dates set to NA by default +dtc_datepart( + c(NA, "", "2021", "2021-12", "2021-12-25", "2021-12-25T12:00:00") +) + # |--> c(NA, NA, NA, NA, "2021-12-25", "2021-12-25") + +## Prevent partial or missing dates from being set to NA +dtc_datepart( + c(NA, "", "2021", "2021-12", "2021-12-25", "2021-12-25T12:00:00"), + partial_as_na = FALSE +) + # |--> c(NA, "", "2021", "2021-12", "2021-12-25", "2021-12-25") + +} diff --git a/man/dtc_timepart.Rd b/man/dtc_timepart.Rd new file mode 100644 index 00000000..cb821ed2 --- /dev/null +++ b/man/dtc_timepart.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/derive_blfl.R +\name{dtc_timepart} +\alias{dtc_timepart} +\title{Extract time part from ISO 8601 date/time variable} +\usage{ +dtc_timepart(dtc, partial_as_na = TRUE, ignore_seconds = TRUE) +} +\arguments{ +\item{dtc}{Character vector containing ISO 8601 date/times.} + +\item{partial_as_na}{Logical \code{TRUE} or \code{FALSE} indicating whether +partial times should be set to NA (default is \code{TRUE}).} + +\item{ignore_seconds}{Logical \code{TRUE} or \code{FALSE} indicating whether +seconds should be ignored (default is \code{TRUE}).} +} +\value{ +Character vector containing ISO 8601 times. +} +\description{ +The time part is extracted from an ISO 8601 date/time variable. +By default, partial or missing times are set to NA, and seconds are ignored +and not extracted. +} +\examples{ +## Partial or missing times set to NA and seconds ignored by default +dtc_timepart( + c(NA, "", "2021-12-25", "2021-12-25T12", "2021-12-25T12:30", "2021-12-25T12:30:59") +) + # |--> c(NA, NA, NA, NA, "12:30", "12:30") + +## Prevent partial or missing times from being set to NA +dtc_timepart( + c(NA, "", "2021-12-25", "2021-12-25T12", "2021-12-25T12:30", "2021-12-25T12:30:59"), + partial_as_na = FALSE +) + # |--> c(NA, "", "", "12", "12:30", "12:30") + +## Do not ignore seconds, partial or missing times set to NA +dtc_timepart( + c(NA, "", "2021-12-25", "2021-12-25T12", "2021-12-25T12:30", "2021-12-25T12:30:59"), + ignore_seconds = FALSE +) + # |--> c(NA, NA, NA, NA, NA, "12:30:59") + +## Do not ignore seconds and prevent partial or missing times from being set to NA +dtc_timepart( + c(NA, "", "2021-12-25", "2021-12-25T12", "2021-12-25T12:30", "2021-12-25T12:30:59"), + partial_as_na = FALSE, + ignore_seconds = FALSE +) + # |--> c(NA, "", "", "12", "12:30", "12:30:59") + +}