Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate external improvements #485

Open
wants to merge 31 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
398ebb2
generate download tasks
egillax Aug 6, 2024
cdc5034
Merge branch 'develop' into validate_external_improvements
egillax Aug 6, 2024
52ddf42
docs
egillax Aug 6, 2024
c9e7e5e
fix filtering of task
egillax Aug 7, 2024
f42bb72
outcome limit in validation
egillax Aug 7, 2024
f143e05
outcome limit in validation
egillax Aug 7, 2024
feb9740
better logging
egillax Aug 13, 2024
b734ba7
Merge branch 'validate_external_improvements' of https://github.com/o…
egillax Aug 13, 2024
4020e06
better logging
egillax Aug 13, 2024
f8d6813
Merge branch 'develop' into validate_external_improvements
egillax Oct 11, 2024
a39c2de
Merge branch 'develop' into validate_external_improvements
egillax Oct 11, 2024
9036d96
add test for <10 outcomes
egillax Oct 11, 2024
e88415f
refactor and docs
egillax Oct 11, 2024
5e744e9
WIP allow multiple covariateSettings
egillax Oct 15, 2024
2eb5407
allow multiple covariateSettings
egillax Oct 15, 2024
e2f4b34
plpModel as object works when creating downloadTasks
egillax Oct 17, 2024
7e8544b
add digest dependency
egillax Oct 17, 2024
6137760
Merge pull request #488 from OHDSI/extractUniqueRedesign
egillax Oct 17, 2024
16f8d73
downloadTasks and add tests
egillax Oct 17, 2024
c4b9593
docs and fix check note
egillax Oct 17, 2024
be595fc
fix bracket in test
egillax Oct 17, 2024
6f568f5
docs
egillax Oct 17, 2024
2e298f5
fix tidyselect
egillax Oct 17, 2024
5e229c0
negation in tidyselect
egillax Oct 17, 2024
3b540e5
fix modelPaths in plpModel
egillax Oct 17, 2024
7e413b8
remove nested list in covSettings
egillax Oct 17, 2024
f19b011
fix assignment
egillax Oct 17, 2024
2ba89af
fix assignment again
egillax Oct 17, 2024
581c10c
fix assignment again
egillax Oct 17, 2024
1ba12be
deduplicate redundant analyses
egillax Oct 18, 2024
2ab8d7c
Merge branch 'develop' into validate_external_improvements
egillax Nov 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Imports:
Andromeda,
Cyclops (>= 3.0.0),
DatabaseConnector (>= 6.0.0),
digest,
dplyr,
FeatureExtraction (>= 3.0.0),
ggplot2,
Expand Down
156 changes: 140 additions & 16 deletions R/ExternalValidatePlp.R
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@
#' @param plpModelList A list of plpModels objects created by \code{runPlp} or a path to such objects
#' @param recalibrate A vector of characters specifying the recalibration method to apply,
#' @param runCovariateSummary whether to run the covariate summary for the validation data
#' @return A validation design object of class \code{validationDesign}
#' @return A validation design object of class \code{validationDesign} or a list of such objects
#' @export
createValidationDesign <-
function(targetId,
Expand Down Expand Up @@ -477,31 +477,40 @@
ParallelLogger::registerLogger(logger)
on.exit(closeLog(logger))

downloadTasks <- createDownloadTasks(validationDesignList)

results <- NULL
for (design in validationDesignList) {
for (database in databaseDetails) {
databaseName <- database$cdmDatabaseName

ParallelLogger::logInfo(paste("Validating model on", database$cdmDatabaseName))

database$targetId <- design$targetId

database$outcomeIds <- design$outcomeId
ParallelLogger::logInfo(paste("Validating models on", database$cdmDatabaseName,
"with targetId:", design$targetId, "and outcomeId:", design$outcomeId))

modelDesigns <- extractModelDesigns(design$plpModelList)
allCovSettings <- lapply(modelDesigns, function(x) x$covariateSettings)
design <- fromDesignOrModel(design, modelDesigns, "restrictPlpDataSettings")
checkAllSameInModels(allCovSettings, "covariateSettings")

# get plpData
plpData <- getData(design, database, outputFolder, allCovSettings)
plpData <- getData(design, database, outputFolder, downloadTasks)
if (is.null(plpData)) {
ParallelLogger::logInfo("Couldn't extract plpData for the given design and database, proceeding to the next one.")
next
}
# create study population
population <- getPopulation(design, modelDesigns, plpData)

if (sum(population$outcomeCount) < 10) {
ParallelLogger::logInfo(
paste(
"Outcome size is less than 10, skipping validation for design and database:",
databaseName,
"and targetId:", design$targetId,
"outcomeId", design$outcomeId
)
)
next
}

results <- lapply(design$plpModelList, function(model) {
analysisName <- paste0("Analysis_", analysisInfo[databaseName])
analysisDone <- file.exists(
Expand Down Expand Up @@ -597,7 +606,7 @@
identical(y, settingsList[[1]])},
settingsList[-1],
init = TRUE)) {
stop(paste0(settingName, "are not the same across models which is not supported yet"))
stop(paste0(settingName, "are not the same across models which is not supported"))

Check warning on line 609 in R/ExternalValidatePlp.R

View check run for this annotation

Codecov / codecov/patch

R/ExternalValidatePlp.R#L609

Added line #L609 was not covered by tests
}
}

Expand Down Expand Up @@ -666,7 +675,8 @@
if (any(unlist(lapply(modelDesigns, function(x) {
!identical(x[[settingName]], validationDesign[[settingName]])
})))) {
ParallelLogger::logWarn(settingName, " are not the same in models and validationDesign")
ParallelLogger::logWarn(settingName, " are not the same in models and validationDesign,
using from design")
}
}
return(validationDesign)
Expand All @@ -676,13 +686,31 @@
#' @param design The validationDesign object
#' @param database The databaseDetails object
#' @param outputFolder The directory to save the validation results to
#' @param allCovSettings A list of covariateSettings from the models
#' @param downloadTasks A list of download tasks determined by unique
#' combinations of targetId and restrictPlpDataSettings
#' @return The plpData object
#' @keywords internal
getData <- function(design, database, outputFolder, allCovSettings) {
getData <- function(design, database, outputFolder, downloadTasks) {
# find task associated with design and the index of the task in downloadTasks
task <- downloadTasks %>%
dplyr::filter(.data$targetId == design$targetId) %>%
dplyr::mutate(taskId = dplyr::row_number(),
collapsed = sapply(.data$restrictPlpDataSettings, paste0, collapse = "|")) %>%
dplyr::filter(.data$collapsed == paste0(design$restrictPlpDataSettings, collapse = "|")) %>%
dplyr::select(-"collapsed")
covariateSettings <- task$covariateSettings[[1]]
task$covariateSettings <- NULL
if (length(covariateSettings) > 1) {
task$covariateSettings <- list(unlist(covariateSettings, recursive = FALSE))

Check warning on line 704 in R/ExternalValidatePlp.R

View check run for this annotation

Codecov / codecov/patch

R/ExternalValidatePlp.R#L704

Added line #L704 was not covered by tests
} else {
task$covariateSettings <- covariateSettings
}

databaseName <- database$cdmDatabaseName
database$targetId <- task$targetId
database$outcomeIds <- task$outcomeIds[[1]]
plpDataName <-
paste0("targetId_", design$targetId, "_L", "1")
paste0("targetId_", design$targetId, "_L", task$taskId)
plpDataLocation <-
file.path(outputFolder, databaseName, plpDataName)
if (!dir.exists(plpDataLocation)) {
Expand All @@ -691,8 +719,8 @@
getPlpData,
list(
databaseDetails = database,
restrictPlpDataSettings = design$restrictPlpDataSettings,
covariateSettings = allCovSettings[[1]]
restrictPlpDataSettings = task$restrictPlpDataSettings[[1]],
covariateSettings = task$covariateSettings[[1]]
)
)
},
Expand All @@ -704,6 +732,10 @@
if (!dir.exists(file.path(outputFolder, databaseName))) {
dir.create(file.path(outputFolder, databaseName), recursive = TRUE)
}
if (length(covariateSettings) > 1) {
plpData$covariateData <-
deDuplicateCovariateData(plpData$covariateData)

Check warning on line 737 in R/ExternalValidatePlp.R

View check run for this annotation

Codecov / codecov/patch

R/ExternalValidatePlp.R#L736-L737

Added lines #L736 - L737 were not covered by tests
}
savePlpData(plpData, file = plpDataLocation)
}
} else {
Expand Down Expand Up @@ -739,3 +771,95 @@
return(population)
}

#' createDownloadTasks
#' create download tasks based on unique combinations of targetId and
#' restrictPlpDataSettings. It adds all covariateSettings and outcomes that
#' have that targetId and restrictPlpDataSettings. This is used to avoid
#' downloading the same data multiple times.
#' @param validationDesignList A list of validationDesign objects
#' @return A dataframe where each row is a downloadTask
#' @keywords internal
createDownloadTasks <- function(validationDesignList) {
ParallelLogger::logInfo("Extracting unique combinations of targetId, \
restrictPlpDataSettings and covariateSettings for extracting data")

rowsList <- list()
modelCache <- list()

for (design in validationDesignList) {
targetId <- design$targetId
outcomeId <- design$outcomeId
restrictPlpDataSettings <- design$restrictPlpDataSettings
if (is.null(restrictPlpDataSettings)) {
restrictList <- lapply(design$plpModelList,
function(x) x$modelDesign$restrictPlpDataSettings)
checkAllSameInModels(restrictList, "restrictPlpDataSettings")
restrictPlpDataSettings <- restrictList[[1]]
}
plpModelList <- design$plpModelList
for (model in plpModelList) {
if (is.character(model)) {
modelKey <- model
if (!is.null(modelCache[[modelKey]])) {
model <- modelCache[[modelKey]]

Check warning on line 804 in R/ExternalValidatePlp.R

View check run for this annotation

Codecov / codecov/patch

R/ExternalValidatePlp.R#L802-L804

Added lines #L802 - L804 were not covered by tests
} else {
model <- loadPlpModel(modelKey)
modelCache[[modelKey]] <- model

Check warning on line 807 in R/ExternalValidatePlp.R

View check run for this annotation

Codecov / codecov/patch

R/ExternalValidatePlp.R#L806-L807

Added lines #L806 - L807 were not covered by tests
}
} else {
modelKey <- digest::digest(model)
if (!is.null(modelCache[[modelKey]])) {
model <- modelCache[[modelKey]]
} else {
modelCache[[modelKey]] <- model
}
}
covariateSettings <- model$modelDesign$covariateSettings
row <- list(targetId = targetId,
outcomeIds = outcomeId,
restrictPlpDataSettings = list(restrictPlpDataSettings),
covariateSettings = list(covariateSettings))
rowsList[[length(rowsList) + 1]] <- row
}
}
rowsDf <- dplyr::bind_rows(rowsList)

rowsDf <- rowsDf %>%
dplyr::rowwise() %>%
dplyr::mutate(restrictKey = digest::digest(restrictPlpDataSettings),
covariateKey = digest::digest(covariateSettings)) %>%
dplyr::ungroup()

uniqueCovariateSettings <- function(settingsList, settingsKeys) {
uniqueKeys <- unique(settingsKeys)
indices <- match(uniqueKeys, settingsKeys)
uniqueSettings <- settingsList[indices]
return(uniqueSettings)
}

downloadTasks <- rowsDf %>%
dplyr::group_by(.data$targetId, .data$restrictKey) %>%
dplyr::summarise(
outcomeIds = list(unique(.data$outcomeIds)),
restrictPlpDataSettings = .data$restrictPlpDataSettings[1],
covariateSettings = list(uniqueCovariateSettings(.data$covariateSettings, .data$covariateKey)),
.groups = "drop") %>%
dplyr::select(c("targetId", "outcomeIds", "restrictPlpDataSettings",
"covariateSettings"))

return(downloadTasks)
}

#' deplucateCovariateData - Remove duplicate covariate data
#' when downloading data with multiple different covariateSettings sometimes
#' there will be duplicated analysisIds which need to be removed
#' @param covariateData The covariate data Andromeda object
#' @return The deduplicated covariate data
#' @keywords internal
deDuplicateCovariateData <- function(covariateData) {
covariateData$covariateRef <- covariateData$covariateRef %>%
dplyr::distinct()
covariateData$covariates <- covariateData$covariates %>%
dplyr::distinct()
return(covariateData)

Check warning on line 864 in R/ExternalValidatePlp.R

View check run for this annotation

Codecov / codecov/patch

R/ExternalValidatePlp.R#L860-L864

Added lines #L860 - L864 were not covered by tests
}
4 changes: 3 additions & 1 deletion R/PopulationSettings.R
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,9 @@ createStudyPopulation <- function(
ParallelLogger::logWarn('No outcomes left...')
return(NULL)
}

ParallelLogger::logInfo('Population created with: ', nrow(population),
' observations, ', length(unique(population$subjectId)),
' unique subjects and ', sum(population$outcomeCount), ' outcomes')
population <- as.data.frame(population)

attr(population, "metaData") <- metaData
Expand Down
26 changes: 26 additions & 0 deletions man/createDownloadTasks.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/createValidationDesign.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions man/getData.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading