From dc2c732608f4c7715498e8cf19e1c154d7c490eb Mon Sep 17 00:00:00 2001
From: Kamil Sijko <kamil.sijko@ttsi.com.pl>
Date: Wed, 8 May 2024 17:26:11 +0200
Subject: [PATCH] WIP: working code, pending tests for baseline logic due to
 lack of examples

---
 NAMESPACE                        |   1 +
 R/derive_blfl.R                  |  67 ++++++++----------
 inst/derive_blfl/raw_dataset.csv |   2 +-
 man/derive_blfl.Rd               | 112 +++++++++++++++++++++++++++++++
 man/dtc_datepart.Rd              |  36 ++++++++++
 man/dtc_timepart.Rd              |  55 +++++++++++++++
 6 files changed, 233 insertions(+), 40 deletions(-)
 create mode 100644 man/derive_blfl.Rd
 create mode 100644 man/dtc_datepart.Rd
 create mode 100644 man/dtc_timepart.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 6170cee4..28a465fd 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,6 +8,7 @@ export(create_iso8601)
 export(ct_map)
 export(ct_spec_example)
 export(ct_spec_vars)
+export(derive_blfl)
 export(derive_study_day)
 export(fmt_cmp)
 export(hardcode_ct)
diff --git a/R/derive_blfl.R b/R/derive_blfl.R
index 1d14693a..c185e5fd 100644
--- a/R/derive_blfl.R
+++ b/R/derive_blfl.R
@@ -8,7 +8,6 @@
 #'   partial dates should be set to NA (default is `TRUE`).
 #'
 #' @return Character vector containing ISO8601 dates.
-#' @export
 #'
 #' @examples
 #' ## Partial or missing dates set to NA by default
@@ -53,7 +52,6 @@ dtc_datepart <- function(dtc, partial_as_na = TRUE) {
 #'   seconds should be ignored (default is `TRUE`).
 #'
 #' @return Character vector containing ISO 8601 times.
-#' @export
 #'
 #' @examples
 #' ## Partial or missing times set to NA and seconds ignored by default
@@ -162,6 +160,18 @@ dtc_timepart <- function(dtc, partial_as_na = TRUE, ignore_seconds = TRUE) {
 #'   observations in the study. Common choices for this reference variable
 #'   include "RFSTDTC" (the date/time of the first study treatment) or
 #'   "RFXSTDTC" (the date/time of the first exposure to the study drug).
+#' @param baseline_visits A character vector specifying the baseline visits within the study.
+#'   These visits are identified as critical points for data collection at the start of the study,
+#'   before any intervention is applied. This parameter allows the function to filter and analyze
+#'   data specifically from these initial assessment points. For example, baseline visits might
+#'   include "Cycle 1 Day 1" if this is the first visit where subjects are assessed prior to receiving treatment.
+#' @param baseline_timepoints A character vector of dates in "YYYY-MM-DD" format that specifies
+#'   the specific days during the baseline visits when key assessments or measurements were taken.
+#'   These timepoints are used to refine the selection of data points to include only those
+#'   collected on these specific dates, ensuring that only relevant baseline data is analyzed.
+#'   This is particularly important in studies where the timing of measurements can significantly
+#'   impact the interpretation of results. An example might be "2020-09-20", indicating a specific
+#'   day when baseline data was collected.
 #'
 #' @return Modified input data frame with baseline flag variable `--BLFL` or
 #'   last observation before exposure flag `--LOBXFL` added.
@@ -169,9 +179,9 @@ dtc_timepart <- function(dtc, partial_as_na = TRUE, ignore_seconds = TRUE) {
 #' @export
 #'
 #' @examples
-#' DM <- read.csv(system.file("inst/derive_blfl/DM.csv", package = "sdtmoak"))
+#' DM <- read.csv(system.file("derive_blfl/DM.csv", package = "sdtm.oak"))
 #' DM
-#' raw_dataset <- read.csv(system.file("inst/derive_blfl/raw_dataset.csv", package = "sdtmoak"))
+#' raw_dataset <- read.csv(system.file("derive_blfl/raw_dataset.csv", package = "sdtm.oak"))
 #' raw_dataset
 #' observed_output <- derive_blfl(raw_dataset = raw_dataset,
 #'                                DM_dataset = DM,
@@ -194,7 +204,7 @@ derive_blfl <- function(raw_dataset,
 
   # Assert that the input dataset has a "DOMAIN" column
   checkmate::assert_names(names(raw_dataset),
-                          must.include = "DOMAIN",
+                          must.include = c("DOMAIN", sdtm.oak:::oak_id_vars()),
                           .var.name = "Columns of 'raw_dataset'",
                           add = assertion_collection)
 
@@ -311,10 +321,10 @@ derive_blfl <- function(raw_dataset,
 
   # Split --DTC and reference_date_variable into date and time parts
   # (partial or missing dates and times set to NA)
-  ds_mod$dom_dt <- dtc_datepart(ds_mod[[domain_prefixed_names["dtc"]]])
-  ds_mod$dom_tm <- dtc_timepart(ds_mod[[domain_prefixed_names["dtc"]]])
-  ds_mod$ref_dt <- dtc_datepart(ds_mod[[reference_date_variable]])
-  ds_mod$ref_tm <- dtc_timepart(ds_mod[[reference_date_variable]])
+  ds_mod$dom_dt <- sdtm.oak:::dtc_datepart(ds_mod[[domain_prefixed_names["dtc"]]])
+  ds_mod$dom_tm <- sdtm.oak:::dtc_timepart(ds_mod[[domain_prefixed_names["dtc"]]])
+  ds_mod$ref_dt <- sdtm.oak:::dtc_datepart(ds_mod[[reference_date_variable]])
+  ds_mod$ref_tm <- sdtm.oak:::dtc_timepart(ds_mod[[reference_date_variable]])
 
 
   # If VISIT not in data frame then assign it as "<unspecified>" for processing
@@ -322,32 +332,11 @@ derive_blfl <- function(raw_dataset,
     ds_mod[["VISIT"]] <- "<unspecified>"
   }
 
-  # Get a vector of baseline visits from Baseline column (if it exists)
-  # in oak_pkg_env$study_visit_configuration
-  # TODO: How this should work? Attached file doesn't contain this info
-  # if ("Baseline" %in% names(oak_pkg_env$study_visit_configuration)) {
-  #   baseline_visits <- oak_pkg_env$study_visit_configuration |>
-  #     dplyr::filter(toupper(Baseline) == "Y") |>
-  #     dplyr::select(TV_Visit) |>
-  #     dplyr::distinct() |>
-  #     unlist()
-  # }
-
   # If --TPT not in data frame then assign it as "<unspecified>" for processing
   if (!domain_prefixed_names["tpt"] %in% names(ds_mod)) {
     ds_mod[[domain_prefixed_names["tpt"]]] <- "<unspecified>"
   }
 
-  # Get a vector of baseline timepoints from Baseline column (if it exists)
-  # TODO: How this part should work? I need example data
-  # if ("Baseline" %in% names(oak_pkg_env$study_visit_configuration)) {
-  #   baseline_timepoints <- oak_pkg_env$timepoint_conf |>
-  #     dplyr::filter(Domain == domain & toupper(Baseline) == "Y") |>
-  #     dplyr::select(TPT) |>
-  #     dplyr::distinct() |>
-  #     unlist()
-  # }
-
   # Filter on rows that have domain and reference dates not equal to NA
   ds_subset <- dplyr::filter(ds_mod, !is.na(dom_dt) & !is.na(ref_dt))
 
@@ -368,11 +357,13 @@ derive_blfl <- function(raw_dataset,
   # - VISIT is in baseline visits list and
   # - xxTPT is in baseline timepoints list
   # (*C)
-  ds_subset_eq_2 <- dplyr::filter(ds_subset, dom_dt == ref_dt,
-                                  is.na(dom_tm) | is.na(ref_tm),
-                                  (VISIT %in% baseline_visits & get(domain_prefixed_names["tpt"]) %in% baseline_timepoints) |
-                                    (VISIT %in% baseline_visits & length(baseline_timepoints) == 0) |
-                                    (get(domain_prefixed_names["tpt"]) %in% baseline_timepoints & length(baseline_visits) == 0))
+  ds_subset_eq_2 <-
+    ds_subset |>
+    dplyr::filter(dom_dt == ref_dt,
+                  is.na(dom_tm) | is.na(ref_tm),
+                  (VISIT %in% baseline_visits & get(domain_prefixed_names["tpt"]) %in% baseline_timepoints) |
+                  (VISIT %in% baseline_visits & length(baseline_timepoints) == 0) |
+                  (get(domain_prefixed_names["tpt"]) %in% baseline_timepoints & length(baseline_visits) == 0))
 
   # Combine (*A) and (*B) and (*C)
   ds_base <- rbind(ds_subset_lt, ds_subset_eq_1, ds_subset_eq_2)
@@ -392,9 +383,7 @@ derive_blfl <- function(raw_dataset,
     dplyr::group_by(USUBJID, .data[[domain_prefixed_names["testcd"]]]) |>
     dplyr::slice_max(!!rlang::sym(domain_prefixed_names["dtc"]), na_rm = TRUE) |>
     dplyr::ungroup() |>
-    # OAK.id.vars already there
-    # TODO: this is not true
-    dplyr::select(dplyr::all_of(c(oak_id_vars(), domain_prefixed_names["testcd"])),
+    dplyr::select(dplyr::all_of(c(sdtm.oak:::oak_id_vars(), domain_prefixed_names["testcd"])),
                   dplyr::any_of(
                     c(domain_prefixed_names[c("cat",
                                               "scat",
@@ -414,7 +403,7 @@ derive_blfl <- function(raw_dataset,
   ds_blfl[[target_sdtm_variable]] <- "Y"
 
   # Join baseline flag onto input dataset
-  ds_out <- dplyr::left_join(raw_dataset, ds_blfl)
+  ds_out <- dplyr::left_join(raw_dataset, ds_blfl, by = sdtm.oak:::oak_id_vars())
 
   # Assert that merged data frame has same number of rows as input data frame
   if (nrow(ds_out) != nrow(raw_dataset)) {
diff --git a/inst/derive_blfl/raw_dataset.csv b/inst/derive_blfl/raw_dataset.csv
index d7c07465..c03e81e7 100644
--- a/inst/derive_blfl/raw_dataset.csv
+++ b/inst/derive_blfl/raw_dataset.csv
@@ -1,4 +1,4 @@
-"DOMAIN","OAK_ID","RAVE_SOURCE","PATIENT_NUM","USUBJID","VSDTC","VSTESTCD","VSORRES","VSSTAT"
+"DOMAIN","oak_id","raw_source","patient_number","USUBJID","VSDTC","VSTESTCD","VSORRES","VSSTAT"
 "VS",1,"VTLS1","375","GA41070-375","2020-09-01T13:31","DIABP","ND",NA
 "VS",1,"VTLS1","375","GA41070-375","2020-09-01T13:31","PULSE","ND",NA
 "VS",1,"VTLS2","375","GA41070-375","2020-09-28T11:00","DIABP","ND",NA
diff --git a/man/derive_blfl.Rd b/man/derive_blfl.Rd
new file mode 100644
index 00000000..5e5fa3cf
--- /dev/null
+++ b/man/derive_blfl.Rd
@@ -0,0 +1,112 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/derive_blfl.R
+\name{derive_blfl}
+\alias{derive_blfl}
+\title{Derive Baseline Flag or Last Observation Before Exposure Flag}
+\usage{
+derive_blfl(
+  raw_dataset,
+  DM_dataset,
+  target_sdtm_variable,
+  reference_date_variable,
+  baseline_visits = character(),
+  baseline_timepoints = character()
+)
+}
+\arguments{
+\item{raw_dataset}{Input data frame.}
+
+\item{target_sdtm_variable}{Name of variable to be derived (\code{--BLFL} or
+\code{--LOBXFL} where \verb{--} is domain).}
+
+\item{reference_date_variable}{vector of a date/time from the
+Demographics (DM) dataset, which serves as a point of comparison for other
+observations in the study. Common choices for this reference variable
+include "RFSTDTC" (the date/time of the first study treatment) or
+"RFXSTDTC" (the date/time of the first exposure to the study drug).}
+
+\item{baseline_visits}{A character vector specifying the baseline visits within the study.
+These visits are identified as critical points for data collection at the start of the study,
+before any intervention is applied. This parameter allows the function to filter and analyze
+data specifically from these initial assessment points. For example, baseline visits might
+include "Cycle 1 Day 1" if this is the first visit where subjects are assessed prior to receiving treatment.}
+
+\item{baseline_timepoints}{A character vector of dates in "YYYY-MM-DD" format that specifies
+the specific days during the baseline visits when key assessments or measurements were taken.
+These timepoints are used to refine the selection of data points to include only those
+collected on these specific dates, ensuring that only relevant baseline data is analyzed.
+This is particularly important in studies where the timing of measurements can significantly
+impact the interpretation of results. An example might be "2020-09-20", indicating a specific
+day when baseline data was collected.}
+}
+\value{
+Modified input data frame with baseline flag variable \code{--BLFL} or
+last observation before exposure flag \code{--LOBXFL} added.
+}
+\description{
+Derive the baseline flag variable (\code{--BLFL}) or the last observation before
+exposure flag (\code{--LOBXFL}), from the observation date/time (\code{--DTC}), and a
+DM domain reference date/time.
+}
+\details{
+The methodology and approach implemented in this function are based on
+concepts and examples found in the Roche version of the {roak} package.
+
+The derivation is as follows:
+\itemize{
+\item Remove records where the result (\code{--ORRES}) is missing. Also, exclude records
+with results labeled as "ND" (No Data) or "NOT DONE" in the \code{--ORRES} column,
+which indicate that the measurement or observation was not completed. This
+step is important even if a previous cleaning step (like the
+'oak_clean_not_done' function) might not have been applied to the data yet.
+\item Remove records where the status (\code{--STAT}) indicates the observation or test
+was not performed, marked as "NOT DONE".
+\item Divide the date and time column (\code{--DTC}) and the reference date/time
+variable (\code{reference_date_variable}) into separate date and time components. Ignore
+any seconds recorded in the time component, focusing only on hours and
+minutes for further calculations.
+\item Set partial or missing dates to \code{NA}.
+\item Set partial or missing times to \code{NA}.
+\item Get a list of baseline visits from \verb{Baseline column}
+(if it exists) in \code{oak_pkg_env$study_visit_configuration}.
+\item Get a list of baseline timepoints from \code{Baseline} column
+(if it exists) in \code{oak_pkg_env$timepoint_conf}.
+\item Filter on rows that have domain and reference dates not equal to
+\code{NA}. (Ref: \strong{X})
+\item Filter \strong{X} on rows with domain date prior to (less than)
+reference date. (Ref: \strong{A})
+\item Filter \strong{X} on rows with domain date equal to reference date but
+domain and reference times not equal to \code{NA} and domain time prior to (less
+than) reference time. (Ref: \strong{B})
+\item Filter \strong{X} on rows with domain date equal to reference date but
+domain and/or reference time equal to NA and:
+\itemize{
+\item VISIT is in baseline visits list (if it exists) and
+\item xxTPT is in baseline timepoints list (if it exists).
+}
+}
+
+(Ref: \strong{C})
+\itemize{
+\item Combine the rows from \strong{A}, \strong{B}, and \strong{C} to get a
+data frame of pre-reference date observations. Sort the rows by \code{USUBJID},
+\code{--STAT}, and \code{--ORRES}.
+\item Group by \code{USUBJID} and \code{--TESTCD} and filter on the rows
+that have maximum value from \code{--DTC}. Keep only the oak id variables and
+\code{--TESTCD} (because these are the unique values). Remove any duplicate rows.
+Assign the baseline flag variable, \code{--BLFL}, the last observation before
+exposure flag (\code{--LOBXFL}) variable to these rows.
+\item Join the baseline flag onto the input dataset.
+}
+}
+\examples{
+DM <- read.csv(system.file("derive_blfl/DM.csv", package = "sdtm.oak"))
+DM
+raw_dataset <- read.csv(system.file("derive_blfl/raw_dataset.csv", package = "sdtm.oak"))
+raw_dataset
+observed_output <- derive_blfl(raw_dataset = raw_dataset,
+                               DM_dataset = DM,
+                               target_sdtm_variable = "VSBLFL",
+                               reference_date_variable = "RFSTDTC")
+observed_output
+}
diff --git a/man/dtc_datepart.Rd b/man/dtc_datepart.Rd
new file mode 100644
index 00000000..543d6ff1
--- /dev/null
+++ b/man/dtc_datepart.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/derive_blfl.R
+\name{dtc_datepart}
+\alias{dtc_datepart}
+\title{Extract date part from ISO8601 date/time variable}
+\usage{
+dtc_datepart(dtc, partial_as_na = TRUE)
+}
+\arguments{
+\item{dtc}{Character vector containing ISO8601 date/times.}
+
+\item{partial_as_na}{Logical \code{TRUE} or \code{FALSE} indicating whether
+partial dates should be set to NA (default is \code{TRUE}).}
+}
+\value{
+Character vector containing ISO8601 dates.
+}
+\description{
+The date part is extracted from an ISO8601 date/time variable.
+By default, partial or missing dates are set to NA.
+}
+\examples{
+## Partial or missing dates set to NA by default
+dtc_datepart(
+  c(NA, "", "2021", "2021-12", "2021-12-25", "2021-12-25T12:00:00")
+)
+  # |-->  c(NA, NA, NA, NA, "2021-12-25", "2021-12-25")
+
+## Prevent partial or missing dates from being set to NA
+dtc_datepart(
+  c(NA, "", "2021", "2021-12", "2021-12-25", "2021-12-25T12:00:00"),
+  partial_as_na = FALSE
+)
+  # |--> c(NA, "", "2021", "2021-12", "2021-12-25", "2021-12-25")
+
+}
diff --git a/man/dtc_timepart.Rd b/man/dtc_timepart.Rd
new file mode 100644
index 00000000..cb821ed2
--- /dev/null
+++ b/man/dtc_timepart.Rd
@@ -0,0 +1,55 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/derive_blfl.R
+\name{dtc_timepart}
+\alias{dtc_timepart}
+\title{Extract time part from ISO 8601 date/time variable}
+\usage{
+dtc_timepart(dtc, partial_as_na = TRUE, ignore_seconds = TRUE)
+}
+\arguments{
+\item{dtc}{Character vector containing ISO 8601 date/times.}
+
+\item{partial_as_na}{Logical \code{TRUE} or \code{FALSE} indicating whether
+partial times should be set to NA (default is \code{TRUE}).}
+
+\item{ignore_seconds}{Logical \code{TRUE} or \code{FALSE} indicating whether
+seconds should be ignored (default is \code{TRUE}).}
+}
+\value{
+Character vector containing ISO 8601 times.
+}
+\description{
+The time part is extracted from an ISO 8601 date/time variable.
+By default, partial or missing times are set to NA, and seconds are ignored
+and not extracted.
+}
+\examples{
+## Partial or missing times set to NA and seconds ignored by default
+dtc_timepart(
+  c(NA, "", "2021-12-25", "2021-12-25T12", "2021-12-25T12:30", "2021-12-25T12:30:59")
+)
+  # |--> c(NA, NA, NA, NA, "12:30", "12:30")
+
+## Prevent partial or missing times from being set to NA
+dtc_timepart(
+  c(NA, "", "2021-12-25", "2021-12-25T12", "2021-12-25T12:30", "2021-12-25T12:30:59"),
+  partial_as_na = FALSE
+)
+  # |--> c(NA, "", "", "12", "12:30", "12:30")
+
+## Do not ignore seconds, partial or missing times set to NA
+dtc_timepart(
+  c(NA, "", "2021-12-25", "2021-12-25T12", "2021-12-25T12:30", "2021-12-25T12:30:59"),
+  ignore_seconds = FALSE
+)
+  # |--> c(NA, NA, NA, NA, NA, "12:30:59")
+
+## Do not ignore seconds and prevent partial or missing times from being set to NA
+dtc_timepart(
+  c(NA, "", "2021-12-25", "2021-12-25T12", "2021-12-25T12:30", "2021-12-25T12:30:59"),
+  partial_as_na = FALSE,
+  ignore_seconds = FALSE
+)
+  # |--> c(NA, "", "", "12", "12:30", "12:30:59")
+
+}