From 398ebb272a2b8a06be4786dbfc65519c67d8d255 Mon Sep 17 00:00:00 2001 From: egillax Date: Tue, 6 Aug 2024 16:44:42 +0200 Subject: [PATCH 01/25] generate download tasks --- R/ExternalValidatePlp.R | 56 ++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index 77e0f3e48..76e4da7dd 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -359,7 +359,7 @@ createValidationSettings <- function(recalibrate = NULL, #' @param plpModelList A list of plpModels objects created by \code{runPlp} or a path to such objects #' @param recalibrate A vector of characters specifying the recalibration method to apply, #' @param runCovariateSummary whether to run the covariate summary for the validation data -#' @return A validation design object of class \code{validationDesign} +#' @return A validation design object of class \code{validationDesign} or a list of such objects #' @export createValidationDesign <- function(targetId, @@ -442,6 +442,37 @@ validateExternal <- function(validationDesignList, ParallelLogger::registerLogger(logger) on.exit(closeLog(logger)) + # create download tasks + extractUniqueCombinations <- function(validationDesignList) { + j <- 1 + restrictContentMap <- list() + uniqueCombinations <- do.call(rbind, lapply(seq_along(validationDesignList), function(i) { + design <- validationDesignList[[i]] + restrictContent <- paste0(design$restrictPlpDataSettings, collapse = "|") + if (!(restrictContent %in% names(restrictContentMap))) { + restrictContentMap[[restrictContent]] <<- j + j <<- j + 1 + } + data.frame( + targetId = design$targetId, + outcomeId = design$outcomeId, + restrictPlpIndex = restrictContentMap[[restrictContent]], + restrictPlpDataSettings = restrictContent + ) + })) + uniqueCombinations <- uniqueCombinations %>% + dplyr::group_by(.data$targetId, .data$restrictPlpDataSettings) %>% + dplyr::summarise(outcomeIds = list(unique(.data$outcomeId)), + restrictPlpIndex = dplyr::first(.data$restrictPlpIndex), .groups = "drop") %>% + dplyr::rowwise() %>% + dplyr::mutate(restrictPlpDataSettings = list(validationDesignList[[.data$restrictPlpIndex]]$restrictPlpDataSettings)) + return(uniqueCombinations) + } + downloadTasks <- extractUniqueCombinations(validationDesignList) + + + + results <- NULL for (design in validationDesignList) { for (database in databaseDetails) { @@ -449,17 +480,13 @@ validateExternal <- function(validationDesignList, ParallelLogger::logInfo(paste("Validating model on", database$cdmDatabaseName)) - database$targetId <- design$targetId - - database$outcomeIds <- design$outcomeId - modelDesigns <- extractModelDesigns(design$plpModelList) allCovSettings <- lapply(modelDesigns, function(x) x$covariateSettings) design <- fromDesignOrModel(design, modelDesigns, "restrictPlpDataSettings") checkAllSameInModels(allCovSettings, "covariateSettings") # get plpData - plpData <- getData(design, database, outputFolder, allCovSettings) + plpData <- getData(design, database, outputFolder, allCovSettings, downloadTasks) if (is.null(plpData)) { ParallelLogger::logInfo("Couldn't extract plpData for the given design and database, proceeding to the next one.") next @@ -637,12 +664,23 @@ fromDesignOrModel <- function(validationDesign, modelDesigns, settingName) { #' @param database The databaseDetails object #' @param outputFolder The directory to save the validation results to #' @param allCovSettings A list of covariateSettings from the models +#' @param downloadTasks A list of download tasks determined by unique +#' combinations of targetId and restrictPlpDataSettings #' @return The plpData object #' @keywords internal -getData <- function(design, database, outputFolder, allCovSettings) { +getData <- function(design, database, outputFolder, allCovSettings, downloadTasks) { + # find task associated with design and the index of the task in downloadTasks + task <- downloadTasks %>% + dplyr::mutate(taskId = dplyr::row_number()) %>% + dplyr::filter(.data$targetId == design$targetId, + paste0(.data$restrictPlpDataSettings, collapse = "|") == + paste0(design$restrictPlpDataSettings, collapse = "|")) + databaseName <- database$cdmDatabaseName + database$targetId <- task$targetId + database$outcomeIds <- task$outcomeIds plpDataName <- - paste0("targetId_", design$targetId, "_L", "1") + paste0("targetId_", design$targetId, "_L", task$taskId) plpDataLocation <- file.path(outputFolder, databaseName, plpDataName) if (!dir.exists(plpDataLocation)) { @@ -651,7 +689,7 @@ getData <- function(design, database, outputFolder, allCovSettings) { getPlpData, list( databaseDetails = database, - restrictPlpDataSettings = design$restrictPlpDataSettings, + restrictPlpDataSettings = task$restrictPlpDataSettings, covariateSettings = allCovSettings[[1]] ) ) From 52ddf42547074acacd6fe4cc3d0c62ec6d3c1d40 Mon Sep 17 00:00:00 2001 From: egillax Date: Tue, 6 Aug 2024 17:06:06 +0200 Subject: [PATCH 02/25] docs --- man/createValidationDesign.Rd | 2 +- man/getData.Rd | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/man/createValidationDesign.Rd b/man/createValidationDesign.Rd index 311b8b48f..5b280e922 100644 --- a/man/createValidationDesign.Rd +++ b/man/createValidationDesign.Rd @@ -34,7 +34,7 @@ this is taken from the model.} \item{runCovariateSummary}{whether to run the covariate summary for the validation data} } \value{ -A validation design object of class \code{validationDesign} +A validation design object of class \code{validationDesign} or a list of such objects } \description{ createValidationDesign - Define the validation design for external validation diff --git a/man/getData.Rd b/man/getData.Rd index 175d75831..33af888cf 100644 --- a/man/getData.Rd +++ b/man/getData.Rd @@ -4,7 +4,7 @@ \alias{getData} \title{getData - Get the plpData for the validation} \usage{ -getData(design, database, outputFolder, allCovSettings) +getData(design, database, outputFolder, allCovSettings, downloadTasks) } \arguments{ \item{design}{The validationDesign object} @@ -14,6 +14,9 @@ getData(design, database, outputFolder, allCovSettings) \item{outputFolder}{The directory to save the validation results to} \item{allCovSettings}{A list of covariateSettings from the models} + +\item{downloadTasks}{A list of download tasks determined by unique +combinations of targetId and restrictPlpDataSettings} } \value{ The plpData object From c9e7e5e9f8e47384d0689ab30afa38b48f292b18 Mon Sep 17 00:00:00 2001 From: egillax Date: Wed, 7 Aug 2024 11:50:46 +0200 Subject: [PATCH 03/25] fix filtering of task --- R/ExternalValidatePlp.R | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index 617b55388..cb68353ad 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -497,7 +497,9 @@ validateExternal <- function(validationDesignList, dplyr::summarise(outcomeIds = list(unique(.data$outcomeId)), restrictPlpIndex = dplyr::first(.data$restrictPlpIndex), .groups = "drop") %>% dplyr::rowwise() %>% - dplyr::mutate(restrictPlpDataSettings = list(validationDesignList[[.data$restrictPlpIndex]]$restrictPlpDataSettings)) + dplyr::mutate(restrictPlpDataSettings = + list(validationDesignList[[.data$restrictPlpIndex]]$restrictPlpDataSettings)) %>% + dplyr::ungroup() return(uniqueCombinations) } downloadTasks <- extractUniqueCombinations(validationDesignList) @@ -703,14 +705,15 @@ fromDesignOrModel <- function(validationDesign, modelDesigns, settingName) { getData <- function(design, database, outputFolder, allCovSettings, downloadTasks) { # find task associated with design and the index of the task in downloadTasks task <- downloadTasks %>% - dplyr::mutate(taskId = dplyr::row_number()) %>% + dplyr::mutate(taskId = dplyr::row_number(), + collapsed = sapply(.data$restrictPlpDataSettings, paste0, collapse = "|")) %>% dplyr::filter(.data$targetId == design$targetId, - paste0(.data$restrictPlpDataSettings, collapse = "|") == - paste0(design$restrictPlpDataSettings, collapse = "|")) + .data$collapsed == paste0(design$restrictPlpDataSettings, collapse = "|")) %>% + dplyr::select(-.data$collapsed) databaseName <- database$cdmDatabaseName database$targetId <- task$targetId - database$outcomeIds <- task$outcomeIds + database$outcomeIds <- task$outcomeIds[[1]] plpDataName <- paste0("targetId_", design$targetId, "_L", task$taskId) plpDataLocation <- @@ -721,7 +724,7 @@ getData <- function(design, database, outputFolder, allCovSettings, downloadTask getPlpData, list( databaseDetails = database, - restrictPlpDataSettings = task$restrictPlpDataSettings, + restrictPlpDataSettings = task$restrictPlpDataSettings[[1]], covariateSettings = allCovSettings[[1]] ) ) From f42bb72e4b27fd4a0dc861c8d76acd0c87727df9 Mon Sep 17 00:00:00 2001 From: egillax Date: Wed, 7 Aug 2024 15:09:24 +0200 Subject: [PATCH 04/25] outcome limit in validation --- R/ExternalValidatePlp.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index cb68353ad..55e6b3358 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -528,6 +528,18 @@ validateExternal <- function(validationDesignList, # create study population population <- getPopulation(design, modelDesigns, plpData) + if (sum(population$outcomeCount) < 10) { + ParallelLogger::logInfo( + paste( + "Population size is less than 10, skipping validation for design and database:", + databaseName, + "and", + paste0(design, collapse = "|") + ) + ) + next + } + results <- lapply(design$plpModelList, function(model) { analysisName <- paste0("Analysis_", analysisInfo[databaseName]) analysisDone <- file.exists( From f143e050833d4e9125da268b94421adbb2e05dfe Mon Sep 17 00:00:00 2001 From: egillax Date: Wed, 7 Aug 2024 15:09:24 +0200 Subject: [PATCH 05/25] outcome limit in validation --- R/ExternalValidatePlp.R | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index cb68353ad..733fcae9f 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -481,14 +481,14 @@ validateExternal <- function(validationDesignList, uniqueCombinations <- do.call(rbind, lapply(seq_along(validationDesignList), function(i) { design <- validationDesignList[[i]] restrictContent <- paste0(design$restrictPlpDataSettings, collapse = "|") - if (!(restrictContent %in% names(restrictContentMap))) { - restrictContentMap[[restrictContent]] <<- j + if (!(restrictContent %in% restrictContentMap)) { + restrictContentMap[[j]] <<- restrictContent j <<- j + 1 } data.frame( targetId = design$targetId, outcomeId = design$outcomeId, - restrictPlpIndex = restrictContentMap[[restrictContent]], + restrictPlpIndex = which(restrictContent == restrictContentMap), restrictPlpDataSettings = restrictContent ) })) @@ -528,6 +528,18 @@ validateExternal <- function(validationDesignList, # create study population population <- getPopulation(design, modelDesigns, plpData) + if (sum(population$outcomeCount) < 10) { + ParallelLogger::logInfo( + paste( + "Population size is less than 10, skipping validation for design and database:", + databaseName, + "and", + paste0(design, collapse = "|") + ) + ) + next + } + results <- lapply(design$plpModelList, function(model) { analysisName <- paste0("Analysis_", analysisInfo[databaseName]) analysisDone <- file.exists( @@ -687,7 +699,8 @@ fromDesignOrModel <- function(validationDesign, modelDesigns, settingName) { if (any(unlist(lapply(modelDesigns, function(x) { !identical(x[[settingName]], validationDesign[[settingName]]) })))) { - ParallelLogger::logWarn(settingName, " are not the same in models and validationDesign") + ParallelLogger::logWarn(settingName, " are not the same in models and validationDesign, + using from design") } } return(validationDesign) From feb9740bad1b7fb13486005352f2feb7768c348f Mon Sep 17 00:00:00 2001 From: egillax Date: Tue, 13 Aug 2024 13:07:22 +0200 Subject: [PATCH 06/25] better logging --- R/ExternalValidatePlp.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index 733fcae9f..d85dab7c3 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -476,6 +476,9 @@ validateExternal <- function(validationDesignList, # create download tasks extractUniqueCombinations <- function(validationDesignList) { + # todo add covariatesettings (currently only same covariateSettings for all models per design is supported) + # todo where restrictPlpDatasettings is empty, take from model and take that into account when creating tasks + ParallelLogger::logInfo("Extracting unique combinations of targetId and restrictPlpDataSettings for extracting data") j <- 1 restrictContentMap <- list() uniqueCombinations <- do.call(rbind, lapply(seq_along(validationDesignList), function(i) { @@ -512,7 +515,8 @@ validateExternal <- function(validationDesignList, for (database in databaseDetails) { databaseName <- database$cdmDatabaseName - ParallelLogger::logInfo(paste("Validating model on", database$cdmDatabaseName)) + ParallelLogger::logInfo(paste("Validating models on", database$cdmDatabaseName, + "with targetId:", design$targetId, "and outcomeId:", design$outcomeId)) modelDesigns <- extractModelDesigns(design$plpModelList) allCovSettings <- lapply(modelDesigns, function(x) x$covariateSettings) @@ -531,10 +535,10 @@ validateExternal <- function(validationDesignList, if (sum(population$outcomeCount) < 10) { ParallelLogger::logInfo( paste( - "Population size is less than 10, skipping validation for design and database:", + "Outcome size is less than 10, skipping validation for design and database:", databaseName, - "and", - paste0(design, collapse = "|") + "and targetId:", design$targetId, + "outcomeId", design$outcomeId ) ) next From 4020e06e3166329f4ea719ee22f3070b4c3db6ab Mon Sep 17 00:00:00 2001 From: egillax Date: Tue, 13 Aug 2024 13:15:07 +0200 Subject: [PATCH 07/25] better logging --- R/PopulationSettings.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/PopulationSettings.R b/R/PopulationSettings.R index 098a784f7..3134086e2 100644 --- a/R/PopulationSettings.R +++ b/R/PopulationSettings.R @@ -433,7 +433,9 @@ createStudyPopulation <- function( ParallelLogger::logWarn('No outcomes left...') return(NULL) } - + ParallelLogger::logInfo('Population created with: ', nrow(population), + ' observations, ', length(unique(population$subjectId)), + ' unique subjects and ', sum(population$outcomeCount), ' outcomes') population <- as.data.frame(population) attr(population, "metaData") <- metaData From 9036d9671ab726f2d3ac0925023aa32fe89ae1ba Mon Sep 17 00:00:00 2001 From: egillax Date: Fri, 11 Oct 2024 09:36:30 +0200 Subject: [PATCH 08/25] add test for <10 outcomes --- tests/testthat/test-validation.R | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R index 1fb3ec2e2..aca856306 100644 --- a/tests/testthat/test-validation.R +++ b/tests/testthat/test-validation.R @@ -173,7 +173,7 @@ test_that("createValidationDesigns correctly handles multiple restrictSettings", design <- createValidationDesign( targetId = targetId, - outcomeId =outcomeId, + outcomeId = outcomeId, plpModelList = plpModelList, restrictPlpDataSettings = restrictPlpDataSettings ) @@ -184,4 +184,24 @@ test_that("createValidationDesigns correctly handles multiple restrictSettings", expect_equal(design[[1]]$restrictPlpDataSettings, restrictPlpDataSettings[[1]]) expect_equal(design[[2]]$restrictPlpDataSettings, restrictPlpDataSettings[[2]]) expect_equal(length(design), length(restrictPlpDataSettings)) -}) \ No newline at end of file +}) + +test_that("createValidationSettings errors with <10 outcomes", { + tinyRestrictPlpDataSettings <- createRestrictPlpDataSettings( + sampleSize = 30, + ) + + validationDesign <- createValidationDesign( + targetId = 1, + outcomeId = 3, + plpModelList = list(modelVal), + restrictPlpDataSettings = tinyRestrictPlpDataSettings + ) + + expect_output(validateExternal(validationDesignList = validationDesign, + databaseDetails = databaseDetails, + logSettings = createLogSettings(), + outputFolder = saveLocation), + "skipping validation for design and database") + +}) From e88415f51900e0be0692df432493466f5c9aea41 Mon Sep 17 00:00:00 2001 From: egillax Date: Fri, 11 Oct 2024 09:55:58 +0200 Subject: [PATCH 09/25] refactor and docs --- R/ExternalValidatePlp.R | 69 ++++++++++++++++++-------------- man/PatientLevelPrediction.Rd | 4 +- man/extractUniqueCombinations.Rd | 24 +++++++++++ 3 files changed, 64 insertions(+), 33 deletions(-) create mode 100644 man/extractUniqueCombinations.Rd diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index dff70500e..bc232338e 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -477,37 +477,6 @@ validateExternal <- function(validationDesignList, ParallelLogger::registerLogger(logger) on.exit(closeLog(logger)) - # create download tasks - extractUniqueCombinations <- function(validationDesignList) { - # todo add covariatesettings (currently only same covariateSettings for all models per design is supported) - # todo where restrictPlpDatasettings is empty, take from model and take that into account when creating tasks - ParallelLogger::logInfo("Extracting unique combinations of targetId and restrictPlpDataSettings for extracting data") - j <- 1 - restrictContentMap <- list() - uniqueCombinations <- do.call(rbind, lapply(seq_along(validationDesignList), function(i) { - design <- validationDesignList[[i]] - restrictContent <- paste0(design$restrictPlpDataSettings, collapse = "|") - if (!(restrictContent %in% restrictContentMap)) { - restrictContentMap[[j]] <<- restrictContent - j <<- j + 1 - } - data.frame( - targetId = design$targetId, - outcomeId = design$outcomeId, - restrictPlpIndex = which(restrictContent == restrictContentMap), - restrictPlpDataSettings = restrictContent - ) - })) - uniqueCombinations <- uniqueCombinations %>% - dplyr::group_by(.data$targetId, .data$restrictPlpDataSettings) %>% - dplyr::summarise(outcomeIds = list(unique(.data$outcomeId)), - restrictPlpIndex = dplyr::first(.data$restrictPlpIndex), .groups = "drop") %>% - dplyr::rowwise() %>% - dplyr::mutate(restrictPlpDataSettings = - list(validationDesignList[[.data$restrictPlpIndex]]$restrictPlpDataSettings)) %>% - dplyr::ungroup() - return(uniqueCombinations) - } downloadTasks <- extractUniqueCombinations(validationDesignList) @@ -797,3 +766,41 @@ getPopulation <- function(validationDesign, modelDesigns, plpData) { return(population) } +#' extractUniqueCombinations +#' create download tasks based on unique combinations of targetId and +#' restrictPlpDataSettings. This is used to avoid downloading the same data +#' multiple times. +#' @param validationDesignList A list of validationDesign objects +#' @return A list of download tasks +#' @keywords internal +extractUniqueCombinations <- function(validationDesignList) { + # TODO add covariatesettings (currently only same covariateSettings for all models per design is supported) + # TODO where restrictPlpDatasettings is empty, take from model and take that into account when creating tasks + ParallelLogger::logInfo("Extracting unique combinations of targetId and restrictPlpDataSettings for extracting data") + j <- 1 + restrictContentMap <- list() + uniqueCombinations <- do.call(rbind, + lapply(seq_along(validationDesignList), function(i) { + design <- validationDesignList[[i]] + restrictContent <- paste0(design$restrictPlpDataSettings, collapse = "|") + if (!(restrictContent %in% restrictContentMap)) { + restrictContentMap[[j]] <<- restrictContent + j <<- j + 1 # increment j from parent environment + } + data.frame( + targetId = design$targetId, + outcomeId = design$outcomeId, + restrictPlpIndex = which(restrictContent == restrictContentMap), + restrictPlpDataSettings = restrictContent + ) + })) + uniqueCombinations <- uniqueCombinations %>% + dplyr::group_by(.data$targetId, .data$restrictPlpDataSettings) %>% + dplyr::summarise(outcomeIds = list(unique(.data$outcomeId)), + restrictPlpIndex = dplyr::first(.data$restrictPlpIndex), .groups = "drop") %>% + dplyr::rowwise() %>% + dplyr::mutate(restrictPlpDataSettings = + list(validationDesignList[[.data$restrictPlpIndex]]$restrictPlpDataSettings)) %>% + dplyr::ungroup() + return(uniqueCombinations) + } diff --git a/man/PatientLevelPrediction.Rd b/man/PatientLevelPrediction.Rd index 8bc15fc71..4946949d9 100644 --- a/man/PatientLevelPrediction.Rd +++ b/man/PatientLevelPrediction.Rd @@ -18,15 +18,15 @@ Useful links: } \author{ -\strong{Maintainer}: Jenna Reps \email{jreps@its.jnj.com} +\strong{Maintainer}: Egill Fridgeirsson \email{e.fridgeirsson@erasmusmc.nl} Authors: \itemize{ + \item Jenna Reps \email{jreps@its.jnj.com} \item Martijn Schuemie \item Marc Suchard \item Patrick Ryan \item Peter Rijnbeek - \item Egill Fridgeirsson } } diff --git a/man/extractUniqueCombinations.Rd b/man/extractUniqueCombinations.Rd new file mode 100644 index 000000000..d94e134f8 --- /dev/null +++ b/man/extractUniqueCombinations.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExternalValidatePlp.R +\name{extractUniqueCombinations} +\alias{extractUniqueCombinations} +\title{extractUniqueCombinations +create download tasks based on unique combinations of targetId and +restrictPlpDataSettings. This is used to avoid downloading the same data +multiple times.} +\usage{ +extractUniqueCombinations(validationDesignList) +} +\arguments{ +\item{validationDesignList}{A list of validationDesign objects} +} +\value{ +A list of download tasks +} +\description{ +extractUniqueCombinations +create download tasks based on unique combinations of targetId and +restrictPlpDataSettings. This is used to avoid downloading the same data +multiple times. +} +\keyword{internal} From 5e744e9abbe51baea21b15006187a5f9f7ab7bd8 Mon Sep 17 00:00:00 2001 From: egillax Date: Tue, 15 Oct 2024 08:14:55 +0200 Subject: [PATCH 10/25] WIP allow multiple covariateSettings --- R/ExternalValidatePlp.R | 63 +++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index bc232338e..e7399a302 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -479,9 +479,6 @@ validateExternal <- function(validationDesignList, downloadTasks <- extractUniqueCombinations(validationDesignList) - - - results <- NULL for (design in validationDesignList) { for (database in databaseDetails) { @@ -776,31 +773,53 @@ getPopulation <- function(validationDesign, modelDesigns, plpData) { extractUniqueCombinations <- function(validationDesignList) { # TODO add covariatesettings (currently only same covariateSettings for all models per design is supported) # TODO where restrictPlpDatasettings is empty, take from model and take that into account when creating tasks - ParallelLogger::logInfo("Extracting unique combinations of targetId and restrictPlpDataSettings for extracting data") + ParallelLogger::logInfo("Extracting unique combinations of targetId, \ + restrictPlpDataSettings and covariateSettings for extracting data") j <- 1 - restrictContentMap <- list() + contentMap <- list() + contentValues <- list() uniqueCombinations <- do.call(rbind, lapply(seq_along(validationDesignList), function(i) { design <- validationDesignList[[i]] - restrictContent <- paste0(design$restrictPlpDataSettings, collapse = "|") - if (!(restrictContent %in% restrictContentMap)) { - restrictContentMap[[j]] <<- restrictContent + restrictContent <- list(key = paste0(design$restrictPlpDataSettings, collapse = "|"), + value = design$restrictPlpDataSettings) + covariateSettingsContent <- lapply(design$plpModelList, function(model) { + model <- loadPlpModel(model) + # if covariateSettings is a list, concatenate them all + covariateContent <- list(key = unlist(lapply(model$modelDesign$covariateSettings, + function(settings) paste0(settings, collapse = "|"))), + value = model$modelDesign$covariateSettings) + return(covariateContent) + }) + combinedContent <- apply(expand.grid(vapply(restrictContent, + function(x) x$key, character(1)), + vapply(covariateSettingsContent, + function(x) x$key, character(1))), 1, paste, collapse = "|") + validRows <- which(!(combinedContent %in% contentMap)) + if (length(validRows) > 0) { + contentMap[[j]] <<- combinedContent[validRows] + contentValues[[j]] <<- list(covariateSettings) j <<- j + 1 # increment j from parent environment } - data.frame( - targetId = design$targetId, - outcomeId = design$outcomeId, - restrictPlpIndex = which(restrictContent == restrictContentMap), - restrictPlpDataSettings = restrictContent - ) + do.call(rbind, lapply(seq_along(validRows), function(k) { + data.frame( + targetId = design$targetId, + outcomeId = design$outcomeId, + restrictPlpIndex = which(restrictContent == contentMap), + restrictPlpDataSettings = restrictContent + ) + })) })) - uniqueCombinations <- uniqueCombinations %>% - dplyr::group_by(.data$targetId, .data$restrictPlpDataSettings) %>% - dplyr::summarise(outcomeIds = list(unique(.data$outcomeId)), - restrictPlpIndex = dplyr::first(.data$restrictPlpIndex), .groups = "drop") %>% - dplyr::rowwise() %>% - dplyr::mutate(restrictPlpDataSettings = - list(validationDesignList[[.data$restrictPlpIndex]]$restrictPlpDataSettings)) %>% - dplyr::ungroup() + browser() + # uniqueCombinations <- uniqueCombinations %>% + # dplyr::group_by(.data$targetId, .data$restrictPlpDataSettings, + # .data$covariateSettings) %>% + # dplyr::summarise(outcomeIds = list(unique(.data$outcomeId)), + # restrictPlpIndex = dplyr::first(.data$restrictPlpIndex), .groups = "drop") %>% + # dplyr::rowwise() %>% + # dplyr::mutate(restrictPlpDataSettings = + # list(validationDesignList[[.data$restrictPlpIndex]]$restrictPlpDataSettings), + # covariateSet %>% + # dplyr::ungroup() return(uniqueCombinations) } From 2eb540725f3ed2089c6800a09982db439a86075b Mon Sep 17 00:00:00 2001 From: egillax Date: Tue, 15 Oct 2024 15:43:44 +0200 Subject: [PATCH 11/25] allow multiple covariateSettings --- R/ExternalValidatePlp.R | 110 ++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index e7399a302..5ffad63da 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -490,7 +490,7 @@ validateExternal <- function(validationDesignList, modelDesigns <- extractModelDesigns(design$plpModelList) allCovSettings <- lapply(modelDesigns, function(x) x$covariateSettings) design <- fromDesignOrModel(design, modelDesigns, "restrictPlpDataSettings") - checkAllSameInModels(allCovSettings, "covariateSettings") + # checkAllSameInModels(allCovSettings, "covariateSettings") # get plpData plpData <- getData(design, database, outputFolder, allCovSettings, downloadTasks) @@ -696,10 +696,10 @@ fromDesignOrModel <- function(validationDesign, modelDesigns, settingName) { getData <- function(design, database, outputFolder, allCovSettings, downloadTasks) { # find task associated with design and the index of the task in downloadTasks task <- downloadTasks %>% + dplyr::filter(.data$targetId == design$targetId) %>% dplyr::mutate(taskId = dplyr::row_number(), collapsed = sapply(.data$restrictPlpDataSettings, paste0, collapse = "|")) %>% - dplyr::filter(.data$targetId == design$targetId, - .data$collapsed == paste0(design$restrictPlpDataSettings, collapse = "|")) %>% + dplyr::filter(.data$collapsed == paste0(design$restrictPlpDataSettings, collapse = "|")) %>% dplyr::select(-.data$collapsed) databaseName <- database$cdmDatabaseName @@ -716,7 +716,7 @@ getData <- function(design, database, outputFolder, allCovSettings, downloadTask list( databaseDetails = database, restrictPlpDataSettings = task$restrictPlpDataSettings[[1]], - covariateSettings = allCovSettings[[1]] + covariateSettings = task$covariateSettings[[1]] ) ) }, @@ -771,55 +771,57 @@ getPopulation <- function(validationDesign, modelDesigns, plpData) { #' @return A list of download tasks #' @keywords internal extractUniqueCombinations <- function(validationDesignList) { - # TODO add covariatesettings (currently only same covariateSettings for all models per design is supported) - # TODO where restrictPlpDatasettings is empty, take from model and take that into account when creating tasks - ParallelLogger::logInfo("Extracting unique combinations of targetId, \ - restrictPlpDataSettings and covariateSettings for extracting data") - j <- 1 - contentMap <- list() - contentValues <- list() - uniqueCombinations <- do.call(rbind, - lapply(seq_along(validationDesignList), function(i) { - design <- validationDesignList[[i]] - restrictContent <- list(key = paste0(design$restrictPlpDataSettings, collapse = "|"), - value = design$restrictPlpDataSettings) - covariateSettingsContent <- lapply(design$plpModelList, function(model) { - model <- loadPlpModel(model) - # if covariateSettings is a list, concatenate them all - covariateContent <- list(key = unlist(lapply(model$modelDesign$covariateSettings, - function(settings) paste0(settings, collapse = "|"))), - value = model$modelDesign$covariateSettings) - return(covariateContent) - }) - combinedContent <- apply(expand.grid(vapply(restrictContent, - function(x) x$key, character(1)), - vapply(covariateSettingsContent, - function(x) x$key, character(1))), 1, paste, collapse = "|") - validRows <- which(!(combinedContent %in% contentMap)) - if (length(validRows) > 0) { - contentMap[[j]] <<- combinedContent[validRows] - contentValues[[j]] <<- list(covariateSettings) - j <<- j + 1 # increment j from parent environment + # TODO where restrictPlpDatasettings is empty, take from model and take that into account when creating tasks + + ParallelLogger::logInfo("Extracting unique combinations of targetId, \ + restrictPlpDataSettings and covariateSettings for extracting data") + + rowsList <- list() + modelCache <- list() + for (design in validationDesignList) { + targetId <- design$targetId + outcomeId <- design$outcomeId + restrictPlpDataSettings <- design$restrictPlpDataSettings + plpModelList <- design$plpModelList + for (modelPath in plpModelList) { + if (!is.null(modelCache[[modelPath]])) { + model <- modelCache[[modelPath]] + } else { + model <- loadPlpModel(modelPath) + modelCache[[modelPath]] <- model } - do.call(rbind, lapply(seq_along(validRows), function(k) { - data.frame( - targetId = design$targetId, - outcomeId = design$outcomeId, - restrictPlpIndex = which(restrictContent == contentMap), - restrictPlpDataSettings = restrictContent - ) - })) - })) - browser() - # uniqueCombinations <- uniqueCombinations %>% - # dplyr::group_by(.data$targetId, .data$restrictPlpDataSettings, - # .data$covariateSettings) %>% - # dplyr::summarise(outcomeIds = list(unique(.data$outcomeId)), - # restrictPlpIndex = dplyr::first(.data$restrictPlpIndex), .groups = "drop") %>% - # dplyr::rowwise() %>% - # dplyr::mutate(restrictPlpDataSettings = - # list(validationDesignList[[.data$restrictPlpIndex]]$restrictPlpDataSettings), - # covariateSet %>% - # dplyr::ungroup() - return(uniqueCombinations) + covariateSettings <- model$modelDesign$covariateSettings + row <- list(targetId = targetId, + outcomeIds = outcomeId, + restrictPlpDataSettings = list(restrictPlpDataSettings), + covariateSettings = covariateSettings) + rowsList[[length(rowsList) + 1]] <- row + } } + rowsDf <- dplyr::bind_rows(rowsList) + + rowsDf <- rowsDf %>% + dplyr::rowwise() %>% + dplyr::mutate(restrictKey = digest::digest(restrictPlpDataSettings), + covariateKey = digest::digest(covariateSettings)) %>% + dplyr::ungroup() + + uniqueCovariateSettings <- function(settingsList, settingsKeys) { + uniqueKeys <- unique(settingsKeys) + indices <- match(uniqueKeys, settingsKeys) + uniqueSettings <- settingsList[indices] + return(uniqueSettings) + } + + uniqueCombinations <- rowsDf %>% + dplyr::group_by(.data$targetId, .data$restrictKey) %>% + dplyr::summarise( + outcomeIds = list(unique(.data$outcomeIds)), + restrictPlpDataSettings = .data$restrictPlpDataSettings[1], + covariateSettings = list(uniqueCovariateSettings(.data$covariateSettings, .data$covariateKey)), + .groups = "drop") %>% + dplyr::select(c("targetId", "outcomeIds", "restrictPlpDataSettings", + "covariateSettings")) + + return(uniqueCombinations) +} From e2f4b344e4bc59245c3b67e37e76330eee89ff9c Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 09:58:02 +0200 Subject: [PATCH 12/25] plpModel as object works when creating downloadTasks --- R/ExternalValidatePlp.R | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index 5ffad63da..a05a47f14 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -771,6 +771,7 @@ getPopulation <- function(validationDesign, modelDesigns, plpData) { #' @return A list of download tasks #' @keywords internal extractUniqueCombinations <- function(validationDesignList) { + # TODO currentl works for list of modelPaths, not with objects. # TODO where restrictPlpDatasettings is empty, take from model and take that into account when creating tasks ParallelLogger::logInfo("Extracting unique combinations of targetId, \ @@ -783,18 +784,27 @@ extractUniqueCombinations <- function(validationDesignList) { outcomeId <- design$outcomeId restrictPlpDataSettings <- design$restrictPlpDataSettings plpModelList <- design$plpModelList - for (modelPath in plpModelList) { - if (!is.null(modelCache[[modelPath]])) { - model <- modelCache[[modelPath]] + for (model in plpModelList) { + if (is.character(model)) { + if (!is.null(modelCache[[model]])) { + model <- modelCache[[model]] + } else { + model <- loadPlpModel(model) + modelCache[[model]] <- model + } } else { - model <- loadPlpModel(modelPath) - modelCache[[modelPath]] <- model + modelKey <- digest::digest(model) + if (!is.null(modelCache[[modelKey]])) { + model <- modelCache[[modelKey]] + } else { + modelCache[[modelKey]] <- model + } } covariateSettings <- model$modelDesign$covariateSettings row <- list(targetId = targetId, outcomeIds = outcomeId, restrictPlpDataSettings = list(restrictPlpDataSettings), - covariateSettings = covariateSettings) + covariateSettings = list(covariateSettings)) rowsList[[length(rowsList) + 1]] <- row } } From 7e8544b7c14e8733dc29677e1965007b1068621d Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 10:05:33 +0200 Subject: [PATCH 13/25] add digest dependency --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index f6f7a6b08..a45cfa471 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,6 +27,7 @@ Imports: Andromeda, Cyclops (>= 3.0.0), DatabaseConnector (>= 6.0.0), + digest, dplyr, FeatureExtraction (>= 3.0.0), ggplot2, From 16f8d73ce3b612489971988f0947776352733261 Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 14:04:18 +0200 Subject: [PATCH 14/25] downloadTasks and add tests --- R/ExternalValidatePlp.R | 36 ++++---- tests/testthat/test-validation.R | 142 +++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 17 deletions(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index a05a47f14..dc735b77a 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -477,7 +477,7 @@ validateExternal <- function(validationDesignList, ParallelLogger::registerLogger(logger) on.exit(closeLog(logger)) - downloadTasks <- extractUniqueCombinations(validationDesignList) + downloadTasks <- createDownloadTasks(validationDesignList) results <- NULL for (design in validationDesignList) { @@ -488,12 +488,10 @@ validateExternal <- function(validationDesignList, "with targetId:", design$targetId, "and outcomeId:", design$outcomeId)) modelDesigns <- extractModelDesigns(design$plpModelList) - allCovSettings <- lapply(modelDesigns, function(x) x$covariateSettings) design <- fromDesignOrModel(design, modelDesigns, "restrictPlpDataSettings") - # checkAllSameInModels(allCovSettings, "covariateSettings") # get plpData - plpData <- getData(design, database, outputFolder, allCovSettings, downloadTasks) + plpData <- getData(design, database, outputFolder, downloadTasks) if (is.null(plpData)) { ParallelLogger::logInfo("Couldn't extract plpData for the given design and database, proceeding to the next one.") next @@ -608,7 +606,7 @@ checkAllSameInModels <- function(settingsList, settingName) { identical(y, settingsList[[1]])}, settingsList[-1], init = TRUE)) { - stop(paste0(settingName, "are not the same across models which is not supported yet")) + stop(paste0(settingName, "are not the same across models which is not supported")) } } @@ -688,12 +686,11 @@ fromDesignOrModel <- function(validationDesign, modelDesigns, settingName) { #' @param design The validationDesign object #' @param database The databaseDetails object #' @param outputFolder The directory to save the validation results to -#' @param allCovSettings A list of covariateSettings from the models #' @param downloadTasks A list of download tasks determined by unique #' combinations of targetId and restrictPlpDataSettings #' @return The plpData object #' @keywords internal -getData <- function(design, database, outputFolder, allCovSettings, downloadTasks) { +getData <- function(design, database, outputFolder, downloadTasks) { # find task associated with design and the index of the task in downloadTasks task <- downloadTasks %>% dplyr::filter(.data$targetId == design$targetId) %>% @@ -763,26 +760,31 @@ getPopulation <- function(validationDesign, modelDesigns, plpData) { return(population) } -#' extractUniqueCombinations +#' createDownloadTasks #' create download tasks based on unique combinations of targetId and -#' restrictPlpDataSettings. This is used to avoid downloading the same data -#' multiple times. +#' restrictPlpDataSettings. It adds all covariateSettings and outcomes that +#' have that targetId and restrictPlpDataSettings. This is used to avoid +#' downloading the same data multiple times. #' @param validationDesignList A list of validationDesign objects -#' @return A list of download tasks +#' @return A dataframe where each row is a downloadTask #' @keywords internal -extractUniqueCombinations <- function(validationDesignList) { - # TODO currentl works for list of modelPaths, not with objects. - # TODO where restrictPlpDatasettings is empty, take from model and take that into account when creating tasks - +createDownloadTasks <- function(validationDesignList) { ParallelLogger::logInfo("Extracting unique combinations of targetId, \ restrictPlpDataSettings and covariateSettings for extracting data") rowsList <- list() modelCache <- list() + for (design in validationDesignList) { targetId <- design$targetId outcomeId <- design$outcomeId restrictPlpDataSettings <- design$restrictPlpDataSettings + if (is.null(restrictPlpDataSettings)) { + restrictList <- lapply(design$plpModelList, + function(x) x$modelDesign$restrictPlpDataSettings) + checkAllSameInModels(restrictList, "restrictPlpDataSettings") + restrictPlpDataSettings <- restrictList[[1]] + } plpModelList <- design$plpModelList for (model in plpModelList) { if (is.character(model)) { @@ -823,7 +825,7 @@ extractUniqueCombinations <- function(validationDesignList) { return(uniqueSettings) } - uniqueCombinations <- rowsDf %>% + downloadTasks <- rowsDf %>% dplyr::group_by(.data$targetId, .data$restrictKey) %>% dplyr::summarise( outcomeIds = list(unique(.data$outcomeIds)), @@ -833,5 +835,5 @@ extractUniqueCombinations <- function(validationDesignList) { dplyr::select(c("targetId", "outcomeIds", "restrictPlpDataSettings", "covariateSettings")) - return(uniqueCombinations) + return(downloadTasks) } diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R index aca856306..ec65f48b7 100644 --- a/tests/testthat/test-validation.R +++ b/tests/testthat/test-validation.R @@ -205,3 +205,145 @@ test_that("createValidationSettings errors with <10 outcomes", { "skipping validation for design and database") }) + +test_that("createDownloadTasks handles single design correctly", { + design <- createValidationDesign( + targetId = 1, + outcomeId = 2, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings() + ) + result <- createDownloadTasks(list(design)) + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 1) + expect_equal(ncol(result), 4) +}) + +test_that("createDownloadTasks handles multiple designs correctly", { + design1 <- createValidationDesign( + targetId = 1, + outcomeId = 2, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings() + ) + design2 <- createValidationDesign( + targetId = 3, + outcomeId = 4, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings() + ) + result <- createDownloadTasks(list(design1, design2)) + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 2) + expect_equal(ncol(result), 4) +}) + +test_that("createDownloadTasks handles duplicated designs correctly", { + design <- createValidationDesign( + targetId = 1, + outcomeId = 2, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings() + ) + result <- createDownloadTasks(list(design, design)) + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 1) + + design2 <- createValidationDesign( + targetId = 3, + outcomeId = 4, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings() + ) + + results <- createDownloadTasks(list(design, design2, design)) + expect_s3_class(results, "data.frame") + expect_equal(nrow(results), 2) +}) + +test_that("createDownloadTasks with different restrictSettings", { + design <- createValidationDesign( + targetId = 1, + outcomeId = 2, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings() + ) + design2 <- createValidationDesign( + targetId = 3, + outcomeId = 4, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings() + ) + design3 <- createValidationDesign( + targetId = 1, + outcomeId = 2, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings(sampleSize = 100) + ) + + result <- createDownloadTasks(list(design, design2, design3)) + + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 3) +}) + +test_that("createDownloadTasks works with multiple outcomeIds", { + design1 <- createValidationDesign( + targetId = 1, + outcomeId = 2, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings() + ) + design2 <- createValidationDesign( + targetId = 1, + outcomeId = 3, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings() + ) + result <- createDownloadTasks(list(design1, design2)) + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 1) + expect_equal(length(result[1, ]$outcomeIds[[1]]), 2) + + design3 <- createValidationDesign( + targetId = 1, + outcomeId = 3, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings(sampleSize = 100) + ) + result <- createDownloadTasks(list(design1, design2, design3)) + expect_equal(nrow(result), 2) +}) + +test_that("createDownloadTasks with multiple covSettings", { + modelVal2 <- modelVal + design1 <- createValidationDesign( + targetId = 1, + outcomeId = 2, + plpModelList = list(modelVal), + restrictPlpDataSettings = createRestrictPlpDataSettings() + ) + modelVal2$modelDesign$covariateSettings <- + FeatureExtraction::createCovariateSettings(useChads2 = TRUE) + design2 <- createValidationDesign( + targetId = 1, + outcomeId = 2, + plpModelList = list(modelVal2), + restrictPlpDataSettings = createRestrictPlpDataSettings() + ) + result <- createDownloadTasks(list(design1, design2)) + expect_equal(nrow(result), 1) + expect_equal(length(result[1, ]$covariateSettings)[[1]], 2) + +}) + +test_that("createDownloadTasks when restrictSettings come from models", { + design1 <- createValidationDesign( + targetId = 1, + outcomeId = 2, + plpModelList = list(modelVal) + ) + result <- createDownloadTasks(list(design1)) + expect_s3_class(result[1, ]$restrictPlpDataSettings[[1]], "restrictPlpDataSettings") + +}) From c4b9593c1db1572767fd98b60ceeee6b80150858 Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 14:09:22 +0200 Subject: [PATCH 15/25] docs and fix check note --- R/Glm.R | 4 ++-- man/createDownloadTasks.Rd | 26 ++++++++++++++++++++++++++ man/extractUniqueCombinations.Rd | 24 ------------------------ man/getData.Rd | 4 +--- 4 files changed, 29 insertions(+), 29 deletions(-) create mode 100644 man/createDownloadTasks.Rd delete mode 100644 man/extractUniqueCombinations.Rd diff --git a/R/Glm.R b/R/Glm.R index 2f7295139..4cbb779c0 100644 --- a/R/Glm.R +++ b/R/Glm.R @@ -25,7 +25,7 @@ #' @param data An object of type \code{plpData} - the patient level prediction #' data extracted from the CDM. #' @param cohort The population dataframe created using -#' /code{createStudyPopulation} who will have their risks predicted or a cohort +#' \code{createStudyPopulation} who will have their risks predicted or a cohort #' without the outcome known #' @export #' @return A dataframe containing the prediction for each person in the @@ -75,7 +75,7 @@ predictGlm <- function(plpModel, data, cohort) { #' PatientLevelPrediction package. #' @param coefficients A dataframe containing two columns, coefficients and #' covariateId, both of type numeric. The covariateId column must contain -#' valid covariateIds that match those used in the /code{FeatureExtraction} +#' valid covariateIds that match those used in the \code{FeatureExtraction} #' package. #' @param intercept A numeric value representing the intercept of the model. #' @param finalMapping A string representing the final mapping from the diff --git a/man/createDownloadTasks.Rd b/man/createDownloadTasks.Rd new file mode 100644 index 000000000..dc8a51a67 --- /dev/null +++ b/man/createDownloadTasks.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExternalValidatePlp.R +\name{createDownloadTasks} +\alias{createDownloadTasks} +\title{createDownloadTasks +create download tasks based on unique combinations of targetId and +restrictPlpDataSettings. It adds all covariateSettings and outcomes that +have that targetId and restrictPlpDataSettings. This is used to avoid +downloading the same data multiple times.} +\usage{ +createDownloadTasks(validationDesignList) +} +\arguments{ +\item{validationDesignList}{A list of validationDesign objects} +} +\value{ +A dataframe where each row is a downloadTask +} +\description{ +createDownloadTasks +create download tasks based on unique combinations of targetId and +restrictPlpDataSettings. It adds all covariateSettings and outcomes that +have that targetId and restrictPlpDataSettings. This is used to avoid +downloading the same data multiple times. +} +\keyword{internal} diff --git a/man/extractUniqueCombinations.Rd b/man/extractUniqueCombinations.Rd deleted file mode 100644 index d94e134f8..000000000 --- a/man/extractUniqueCombinations.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ExternalValidatePlp.R -\name{extractUniqueCombinations} -\alias{extractUniqueCombinations} -\title{extractUniqueCombinations -create download tasks based on unique combinations of targetId and -restrictPlpDataSettings. This is used to avoid downloading the same data -multiple times.} -\usage{ -extractUniqueCombinations(validationDesignList) -} -\arguments{ -\item{validationDesignList}{A list of validationDesign objects} -} -\value{ -A list of download tasks -} -\description{ -extractUniqueCombinations -create download tasks based on unique combinations of targetId and -restrictPlpDataSettings. This is used to avoid downloading the same data -multiple times. -} -\keyword{internal} diff --git a/man/getData.Rd b/man/getData.Rd index 33af888cf..075f73f3d 100644 --- a/man/getData.Rd +++ b/man/getData.Rd @@ -4,7 +4,7 @@ \alias{getData} \title{getData - Get the plpData for the validation} \usage{ -getData(design, database, outputFolder, allCovSettings, downloadTasks) +getData(design, database, outputFolder, downloadTasks) } \arguments{ \item{design}{The validationDesign object} @@ -13,8 +13,6 @@ getData(design, database, outputFolder, allCovSettings, downloadTasks) \item{outputFolder}{The directory to save the validation results to} -\item{allCovSettings}{A list of covariateSettings from the models} - \item{downloadTasks}{A list of download tasks determined by unique combinations of targetId and restrictPlpDataSettings} } From be595fc0bdd1efaaac575a9d90380c334f81c4a0 Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 14:10:55 +0200 Subject: [PATCH 16/25] fix bracket in test --- tests/testthat/test-validation.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R index ec65f48b7..955d8cec2 100644 --- a/tests/testthat/test-validation.R +++ b/tests/testthat/test-validation.R @@ -333,7 +333,7 @@ test_that("createDownloadTasks with multiple covSettings", { ) result <- createDownloadTasks(list(design1, design2)) expect_equal(nrow(result), 1) - expect_equal(length(result[1, ]$covariateSettings)[[1]], 2) + expect_equal(length(result[1, ]$covariateSettings[[1]]), 2) }) From 6f568f50a4b42fa189122674b298f410e38eb133 Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 14:18:33 +0200 Subject: [PATCH 17/25] docs --- man/createGlmModel.Rd | 2 +- man/predictGlm.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/man/createGlmModel.Rd b/man/createGlmModel.Rd index 6a6509fd7..1ba11ca09 100644 --- a/man/createGlmModel.Rd +++ b/man/createGlmModel.Rd @@ -9,7 +9,7 @@ createGlmModel(coefficients, intercept = 0, finalMapping = "logistic") \arguments{ \item{coefficients}{A dataframe containing two columns, coefficients and covariateId, both of type numeric. The covariateId column must contain -valid covariateIds that match those used in the /code{FeatureExtraction} +valid covariateIds that match those used in the \code{FeatureExtraction} package.} \item{intercept}{A numeric value representing the intercept of the model.} diff --git a/man/predictGlm.Rd b/man/predictGlm.Rd index e7e09a57b..6f9e21cc5 100644 --- a/man/predictGlm.Rd +++ b/man/predictGlm.Rd @@ -14,7 +14,7 @@ prediction model} data extracted from the CDM.} \item{cohort}{The population dataframe created using -/code{createStudyPopulation} who will have their risks predicted or a cohort +\code{createStudyPopulation} who will have their risks predicted or a cohort without the outcome known} } \value{ From 2e298f569fe3b006b78d2acd9903f78839f61d0e Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 14:35:31 +0200 Subject: [PATCH 18/25] fix tidyselect --- R/ExternalValidatePlp.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index dc735b77a..aca76de62 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -697,7 +697,7 @@ getData <- function(design, database, outputFolder, downloadTasks) { dplyr::mutate(taskId = dplyr::row_number(), collapsed = sapply(.data$restrictPlpDataSettings, paste0, collapse = "|")) %>% dplyr::filter(.data$collapsed == paste0(design$restrictPlpDataSettings, collapse = "|")) %>% - dplyr::select(-.data$collapsed) + dplyr::select("collapsed") databaseName <- database$cdmDatabaseName database$targetId <- task$targetId From 5e229c015bc15df999b8a59e87d665263d291c62 Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 14:44:00 +0200 Subject: [PATCH 19/25] negation in tidyselect --- R/ExternalValidatePlp.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index aca76de62..a8162e44f 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -697,7 +697,7 @@ getData <- function(design, database, outputFolder, downloadTasks) { dplyr::mutate(taskId = dplyr::row_number(), collapsed = sapply(.data$restrictPlpDataSettings, paste0, collapse = "|")) %>% dplyr::filter(.data$collapsed == paste0(design$restrictPlpDataSettings, collapse = "|")) %>% - dplyr::select("collapsed") + dplyr::select(-"collapsed") databaseName <- database$cdmDatabaseName database$targetId <- task$targetId From 3b540e52df237fdb5cbcafa795bc15361ebfcac3 Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 15:19:42 +0200 Subject: [PATCH 20/25] fix modelPaths in plpModel --- R/ExternalValidatePlp.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index a8162e44f..ba4a22834 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -788,11 +788,12 @@ createDownloadTasks <- function(validationDesignList) { plpModelList <- design$plpModelList for (model in plpModelList) { if (is.character(model)) { - if (!is.null(modelCache[[model]])) { - model <- modelCache[[model]] + modelKey <- model + if (!is.null(modelCache[[modelKey]])) { + model <- modelCache[[modelKey]] } else { - model <- loadPlpModel(model) - modelCache[[model]] <- model + model <- loadPlpModel(modelKey) + modelCache[[modelKey]] <- model } } else { modelKey <- digest::digest(model) From 7e413b8d335052c4c0ab415e20bdf1e6dcdb0d42 Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 16:01:47 +0200 Subject: [PATCH 21/25] remove nested list in covSettings --- R/ExternalValidatePlp.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index ba4a22834..2421a72df 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -698,6 +698,10 @@ getData <- function(design, database, outputFolder, downloadTasks) { collapsed = sapply(.data$restrictPlpDataSettings, paste0, collapse = "|")) %>% dplyr::filter(.data$collapsed == paste0(design$restrictPlpDataSettings, collapse = "|")) %>% dplyr::select(-"collapsed") + task$covariateSettings <- task$covariateSettings[[1]] + if (length(task$covariateSettings) > 1) { + task$covariateSettings <- unlist(task$covariateSettings, recursive = FALSE) + } databaseName <- database$cdmDatabaseName database$targetId <- task$targetId From f19b0119b7e94b9efca1fb7c4170d13840ce4cef Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 16:05:02 +0200 Subject: [PATCH 22/25] fix assignment --- R/ExternalValidatePlp.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index 2421a72df..5ddc099ce 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -698,9 +698,12 @@ getData <- function(design, database, outputFolder, downloadTasks) { collapsed = sapply(.data$restrictPlpDataSettings, paste0, collapse = "|")) %>% dplyr::filter(.data$collapsed == paste0(design$restrictPlpDataSettings, collapse = "|")) %>% dplyr::select(-"collapsed") - task$covariateSettings <- task$covariateSettings[[1]] - if (length(task$covariateSettings) > 1) { + covariateSettings <- task$covariateSettings[[1]] + task$covariateSettings <- NULL + if (length(covariateSettings) > 1) { task$covariateSettings <- unlist(task$covariateSettings, recursive = FALSE) + } else { + task$covariateSettings <- covariateSettings } databaseName <- database$cdmDatabaseName From 2ba89af3358aa0544af459a7f13189dbf378667b Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 16:08:06 +0200 Subject: [PATCH 23/25] fix assignment again --- R/ExternalValidatePlp.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index 5ddc099ce..c1ee3b199 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -701,7 +701,7 @@ getData <- function(design, database, outputFolder, downloadTasks) { covariateSettings <- task$covariateSettings[[1]] task$covariateSettings <- NULL if (length(covariateSettings) > 1) { - task$covariateSettings <- unlist(task$covariateSettings, recursive = FALSE) + task$covariateSettings <- unlist(covariateSettings, recursive = FALSE) } else { task$covariateSettings <- covariateSettings } From 581c10c2d630e0df2b1d0f9f1625b06f02ca1de8 Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 17 Oct 2024 16:12:47 +0200 Subject: [PATCH 24/25] fix assignment again --- R/ExternalValidatePlp.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index c1ee3b199..16022e837 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -701,7 +701,7 @@ getData <- function(design, database, outputFolder, downloadTasks) { covariateSettings <- task$covariateSettings[[1]] task$covariateSettings <- NULL if (length(covariateSettings) > 1) { - task$covariateSettings <- unlist(covariateSettings, recursive = FALSE) + task$covariateSettings <- list(unlist(covariateSettings, recursive = FALSE)) } else { task$covariateSettings <- covariateSettings } From 1ba12be0d2a655a5b145756b7189dfa18970554a Mon Sep 17 00:00:00 2001 From: egillax Date: Fri, 18 Oct 2024 09:07:10 +0200 Subject: [PATCH 25/25] deduplicate redundant analyses --- R/ExternalValidatePlp.R | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index 16022e837..74424be7d 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -732,6 +732,10 @@ getData <- function(design, database, outputFolder, downloadTasks) { if (!dir.exists(file.path(outputFolder, databaseName))) { dir.create(file.path(outputFolder, databaseName), recursive = TRUE) } + if (length(covariateSettings) > 1) { + plpData$covariateData <- + deDuplicateCovariateData(plpData$covariateData) + } savePlpData(plpData, file = plpDataLocation) } } else { @@ -845,3 +849,17 @@ createDownloadTasks <- function(validationDesignList) { return(downloadTasks) } + +#' deplucateCovariateData - Remove duplicate covariate data +#' when downloading data with multiple different covariateSettings sometimes +#' there will be duplicated analysisIds which need to be removed +#' @param covariateData The covariate data Andromeda object +#' @return The deduplicated covariate data +#' @keywords internal +deDuplicateCovariateData <- function(covariateData) { + covariateData$covariateRef <- covariateData$covariateRef %>% + dplyr::distinct() + covariateData$covariates <- covariateData$covariates %>% + dplyr::distinct() + return(covariateData) +}