diff --git a/DESCRIPTION b/DESCRIPTION index 0011aef4b..8b681e2cb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: PatientLevelPrediction Type: Package Title: Developing patient level prediction using data in the OMOP Common Data Model -Version: 6.3.9 +Version: 6.3.9.9999 Date: 2024-08-21 Authors@R: c( person("Jenna", "Reps", email = "jreps@its.jnj.com", role = c("aut", "cre")), @@ -71,8 +71,7 @@ Suggests: lightgbm Remotes: ohdsi/BigKnn, - ohdsi/FeatureExtraction, ohdsi/ShinyAppBuilder, - ohdsi/ResultModelManager, -RoxygenNote: 7.3.1 + ohdsi/ResultModelManager +RoxygenNote: 7.3.2 Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE index 38cfb743f..7119f6dd7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ export(createDefaultExecuteSettings) export(createDefaultSplitSetting) export(createExecuteSettings) export(createFeatureEngineeringSettings) +export(createGlmModel) export(createLearningCurve) export(createLogSettings) export(createModelDesign) @@ -82,6 +83,7 @@ export(plotSparseCalibration2) export(plotSparseRoc) export(plotVariableScatterplot) export(predictCyclops) +export(predictGlm) export(predictPlp) export(recalibratePlp) export(recalibratePlpRefit) diff --git a/R/AdditionalCovariates.R b/R/AdditionalCovariates.R index f19f30803..dfb8abca8 100644 --- a/R/AdditionalCovariates.R +++ b/R/AdditionalCovariates.R @@ -67,10 +67,29 @@ getCohortCovariateData <- function( group by a.@row_id_field; " ) + if (is.null(covariateSettings$cohortTable) && + (is.null(covariateSettings$cohortDatabaseSchema))) { + ParallelLogger::logInfo("cohortTable and cohortDatabaseSchema not specified in + cohort covariateSettings. Attempting to fetch from databaseDetails") + # use settings from databaseDetails which is two frames up + # in the call stack + tryCatch(databaseDetails <- get("databaseDetails", parent.frame(n = 2)), + error = function(e) { + stop("cohortTable and cohortDatabaseSchema not specified in + cohort covariateSettings. Attempt to fetch databaseDetails from parent + frame failed with error: ", e$message) + }) + cohortCovariateTable <- databaseDetails$cohortTable + cohortCovariateDatabaseSchema <- databaseDetails$cohortDatabaseSchema + } else { + cohortCovariateTable <- covariateSettings$cohortTable + cohortCovariateDatabaseSchema <- covariateSettings$cohortDatabaseSchema + } + sql <- SqlRender::render( sql, - covariate_cohort_schema = covariateSettings$cohortDatabaseSchema, - covariate_cohort_table = covariateSettings$cohortTable, + covariate_cohort_schema = cohortCovariateDatabaseSchema, + covariate_cohort_table = cohortCovariateTable, covariate_cohort_id = covariateSettings$cohortIds, cohort_temp_table = cohortTable, row_id_field = rowIdField, @@ -154,8 +173,10 @@ getCohortCovariateData <- function( #' #' @param cohortName Name for the cohort #' @param settingId A unique id for the covariate time and -#' @param cohortDatabaseSchema The schema of the database with the cohort -#' @param cohortTable the table name that contains the covariate cohort +#' @param cohortDatabaseSchema The schema of the database with the cohort. If +#' nothing is specified then the cohortDatabaseSchema from databaseDetails at runtime is used. +#' @param cohortTable the table name that contains the covariate cohort. If +#' nothing is specified then the cohortTable from databaseDetails at runtime is used. #' @param cohortId cohort id for the covariate cohort #' @param startDay The number of days prior to index to start observing the cohort #' @param endDay The number of days prior to index to stop observing the cohort @@ -173,8 +194,8 @@ getCohortCovariateData <- function( createCohortCovariateSettings <- function( cohortName, settingId, - cohortDatabaseSchema, - cohortTable, + cohortDatabaseSchema=NULL, + cohortTable=NULL, cohortId, startDay = -30, endDay = 0, @@ -205,4 +226,4 @@ createCohortCovariateSettings <- function( attr(covariateSettings, "fun") <- "PatientLevelPrediction::getCohortCovariateData" class(covariateSettings) <- "covariateSettings" return(covariateSettings) -} \ No newline at end of file +} diff --git a/R/EvaluationSummary.R b/R/EvaluationSummary.R index a318cb923..34924fd7a 100644 --- a/R/EvaluationSummary.R +++ b/R/EvaluationSummary.R @@ -263,7 +263,8 @@ getEvaluationStatistics_survival <- function(prediction, evalColumn, timepoint, eStatistic <- eStatistic90 <- -1 if (!is.null(w)) { eStatistic <- mean(abs(w$actual - w$estimatedSurvival)) - eStatistic90 <- stats::quantile(abs(w$actual - w$estimatedSurvival), probs = .9) + eStatistic90 <- stats::quantile(abs(w$actual - w$estimatedSurvival), + probs = .9, na.rm = TRUE) } result <- rbind( diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index 8e3e7a64a..7948b39bd 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -350,11 +350,16 @@ createValidationSettings <- function(recalibrate = NULL, #' #' @param targetId The targetId of the target cohort to validate on #' @param outcomeId The outcomeId of the outcome cohort to validate on -#' @param populationSettings A list of population restriction settings created by \code{createPopulationSettings} -#' @param restrictPlpDataSettings A list of plpData restriction settings created by \code{createRestrictPlpDataSettings} +#' @param populationSettings A list of population restriction settings created +#' by \code{createPopulationSettings}. Default is NULL and then this is taken +#' from the model +#' @param restrictPlpDataSettings A list of plpData restriction settings +#' created by \code{createRestrictPlpDataSettings}. Default is NULL and then +#' this is taken from the model. #' @param plpModelList A list of plpModels objects created by \code{runPlp} or a path to such objects #' @param recalibrate A vector of characters specifying the recalibration method to apply, #' @param runCovariateSummary whether to run the covariate summary for the validation data +#' @return A validation design object of class \code{validationDesign} #' @export createValidationDesign <- function(targetId, @@ -366,24 +371,63 @@ createValidationDesign <- runCovariateSummary = TRUE) { checkIsClass(targetId, c("numeric", "integer")) checkIsClass(outcomeId, c("numeric", "integer")) - checkIsClass(populationSettings, c("populationSettings")) - checkIsClass(restrictPlpDataSettings, "restrictPlpDataSettings") + + if (!is.null(populationSettings)) { + checkIsClass(populationSettings, c("populationSettings")) + } + if (!is.null(restrictPlpDataSettings)) { + if (inherits(restrictPlpDataSettings, "list")) { + lapply(restrictPlpDataSettings, function(x) { + checkIsClass(x, "restrictPlpDataSettings") + }) + } else { + checkIsClass(restrictPlpDataSettings, "restrictPlpDataSettings") + } + } checkIsClass(plpModelList, "list") - lapply(plpModelList, function(x) - checkIsClass(x, c("plpModel", "character"))) - checkIsClass(recalibrate, c("character", "NULL")) + lapply(plpModelList, function(x) { + checkIsClass(x, c("plpModel", "character")) + }) + checkIsClass(recalibrate, c('character', 'NULL')) + if (!is.null(recalibrate)) { + if (sum(recalibrate %in% c('recalibrationInTheLarge', 'weakRecalibration')) != + length(recalibrate)) { + ParallelLogger::logError( + 'Incorrect recalibrate options used. Must be recalibrationInTheLarge or weakRecalibration' + ) + } + } checkIsClass(runCovariateSummary, "logical") - design <- list( - targetId = targetId, - outcomeId = outcomeId, - populationSettings = populationSettings, - plpModelList = plpModelList, - restrictPlpDataSettings = restrictPlpDataSettings, - recalibrate = recalibrate, - runCovariateSummary = runCovariateSummary - ) - class(design) <- "validationDesign" + # if restrictPlpDataSettings is a list make a list of designs with each + # settings + if (is.list(restrictPlpDataSettings)) { + design <- lapply(restrictPlpDataSettings, function(x) { + design <- list( + targetId = targetId, + outcomeId = outcomeId, + populationSettings = populationSettings, + plpModelList = plpModelList, + restrictPlpDataSettings = x, + recalibrate = recalibrate, + runCovariateSummary = runCovariateSummary + ) + class(design) <- "validationDesign" + return(design) + }) + } else { + design <- list( + targetId = targetId, + outcomeId = outcomeId, + populationSettings = populationSettings, + plpModelList = plpModelList, + restrictPlpDataSettings = restrictPlpDataSettings, + recalibrate = recalibrate, + runCovariateSummary = runCovariateSummary + ) + class(design) <- "validationDesign" + } + return(design) } @@ -403,125 +447,95 @@ validateExternal <- function(validationDesignList, logSettings, outputFolder) { # Input checks - #======= - if (inherits(validationDesignList, 'list')) { - lapply(validationDesignList, function(x) - checkIsClass(x, 'validationDesign')) - } else { - checkIsClass(validationDesignList, 'validationDesign') - validationDesignList <- list(validationDesignList) - } - - # check the class and make a list if a single database setting - if (inherits(databaseDetails, 'list')) { - lapply(databaseDetails, function(x) - checkIsClass(x, 'databaseDetails')) - } else { - checkIsClass(databaseDetails, 'databaseDetails') - databaseDetails <- list(databaseDetails) - } - + changedInputs <- checkValidateExternalInputs(validationDesignList, + databaseDetails, + logSettings, + outputFolder) + validationDesignList <- changedInputs[["validationDesignList"]] + databaseDetails <- changedInputs[["databaseDetails"]] + # create results list with the names of the databases to validate across result <- list() length(result) <- length(databaseDetails) names(result) <- - unlist(lapply(databaseDetails, function(x) - attr(x, 'cdmDatabaseName'))) + unlist(lapply(databaseDetails, function(x) { + attr(x, "cdmDatabaseName")})) # Need to keep track of incremental analysisId's for each database - databaseNames <- unlist(lapply(databaseDetails, function(x) - x$cdmDatabaseName)) + databaseNames <- unlist(lapply(databaseDetails, function(x) { + x$cdmDatabaseName})) + analysisInfo <- list() for (name in databaseNames) { analysisInfo[name] <- 1 } + + # initiate log + logSettings$saveDirectory <- outputFolder + logSettings$logFileName <- "validationLog" + logger <- do.call(createLog, logSettings) + ParallelLogger::registerLogger(logger) + on.exit(closeLog(logger)) + + results <- NULL for (design in validationDesignList) { for (database in databaseDetails) { databaseName <- database$cdmDatabaseName - # initiate log - logSettings$saveDirectory <- file.path(outputFolder, - database$cdmDatabaseName) - logSettings$logFileName <- 'validationLog' - logger <- do.call(createLog, logSettings) - ParallelLogger::registerLogger(logger) - on.exit(logger$close()) - ParallelLogger::logInfo(paste('Validating model on', database$cdmDatabaseName)) + ParallelLogger::logInfo(paste("Validating model on", database$cdmDatabaseName)) database$targetId <- design$targetId database$outcomeIds <- design$outcomeId - allCovSettings <- - lapply(design$plpModelList, function(plpModel) - plpModel$modelDesign$covariateSettings) - # compare all to first covSettings, if not the same stop - if (!Reduce(function(x, y) - x && - identical(y, allCovSettings[[1]]), - allCovSettings[-1], - init = TRUE)) { - stop("covariateSettings are not the same across models which is not supported yet") - } - plpData <- tryCatch({ - do.call( - getPlpData, - list( - databaseDetails = database, - restrictPlpDataSettings = design$restrictPlpDataSettings, - covariateSettings = design$plpModelList[[1]]$modelDesign$covariateSettings - ) - ) - }, - error = function(e) { - ParallelLogger::logError(e) - return(NULL) - }) - plpDataName <- - paste0("targetId_", design$targetId, "_L", "1") # Is the 1 for how many targetIds in file ? - plpDataLocation <- - file.path(outputFolder, databaseName, plpDataName) - if (!dir.exists(file.path(outputFolder, databaseName))) { - dir.create(file.path(outputFolder, databaseName), recursive = TRUE) - } - savePlpData(plpData, file = plpDataLocation) + modelDesigns <- extractModelDesigns(design$plpModelList) + allCovSettings <- lapply(modelDesigns, function(x) x$covariateSettings) + design <- fromDesignOrModel(design, modelDesigns, "restrictPlpDataSettings") + checkAllSameInModels(allCovSettings, "covariateSettings") + # get plpData + plpData <- getData(design, database, outputFolder, allCovSettings) + if (is.null(plpData)) { + ParallelLogger::logInfo("Couldn't extract plpData for the given design and database, proceeding to the next one.") + next + } # create study population - population <- tryCatch({ - do.call( - createStudyPopulation, - list( - plpData = plpData, - outcomeId = design$outcomeId, - populationSettings = design$populationSettings - ) - ) - }, - error = function(e) { - ParallelLogger::logError(e) - return(NULL) - }) - + population <- getPopulation(design, modelDesigns, plpData) + results <- lapply(design$plpModelList, function(model) { analysisName <- paste0("Analysis_", analysisInfo[databaseName]) - validateModel( - plpModel = model, - plpData = plpData, - population = population, - recalibrate = design$recalibrate, - runCovariateSummary = design$runCovariateSummary, - outputFolder = outputFolder, - databaseName = databaseName, - analysisName = analysisName + analysisDone <- file.exists( + file.path( + outputFolder, + databaseName, + analysisName, + "validationResult", + "runPlp.rds" + ) ) + if (!analysisDone) { + validateModel( + plpModel = model, + plpData = plpData, + population = population, + recalibrate = design$recalibrate, + runCovariateSummary = design$runCovariateSummary, + outputFolder = outputFolder, + databaseName = databaseName, + analysisName = analysisName) + } else { + ParallelLogger::logInfo(paste0("Analysis ", analysisName, " already done", + ", Proceeding to the next one.")) + } + analysisInfo[[databaseName]] <<- analysisInfo[[databaseName]] + 1 }) } } for (database in databaseDetails) { databaseName <- database$cdmDatabaseName - sqliteLocation <- - file.path(outputFolder, 'sqlite') + sqliteLocation <- file.path(outputFolder, "sqlite") + tryCatch({ insertResultsToSqlite( resultLocation = file.path(outputFolder, databaseName), @@ -549,6 +563,12 @@ validateModel <- outputFolder, databaseName, analysisName) { + + + if (is.character(plpModel)) { + plpModel <- loadPlpModel(plpModel) + } + result <- externalValidatePlp( plpModel = plpModel, plpData = plpData, @@ -561,7 +581,161 @@ validateModel <- outputFolder, databaseName, analysisName, - 'validationResult' + + "validationResult" )) return(result) } + +#' checkAllSameInModels - Check if all settings are the same across models +#' @param settingsList A list of settings to check +#' @param settingName The name of the setting to check +#' @keywords internal +checkAllSameInModels <- function(settingsList, settingName) { + if (!Reduce(function(x, y) { + x && + identical(y, settingsList[[1]])}, + settingsList[-1], + init = TRUE)) { + stop(paste0(settingName, "are not the same across models which is not supported yet")) + } +} + +#' extractModelDesigns - Extract all modelDesigns from a list of plpModels +#' @param plpModelList A list of plpModels +#' @return A list of modelDesigns +#' @keywords internal +extractModelDesigns <- function(plpModelList) { + lapply(plpModelList, function(plpModel) { + if (is.character(plpModel)) { + modelDesign <- ParallelLogger::loadSettingsFromJson( + normalizePath(file.path(plpModel, "modelDesign.json")) + ) + return(modelDesign) + } else { + plpModel$modelDesign + } + }) +} + +#' checkValidateExternalInputs - Check the inputs for validateExternal +#' @param validationDesignList A list of validationDesign objects +#' @param databaseDetails A list of databaseDetails objects +#' @param logSettings An object of logSettings +#' @param outputFolder The directory to save the validation results to +#' @return A list of inputs that were modified +#' @keywords internal +checkValidateExternalInputs <- function(validationDesignList, + databaseDetails, + logSettings, + outputFolder) { + if (inherits(validationDesignList, "list")) { + lapply(validationDesignList, function(x) { + checkIsClass(x, "validationDesign")}) + } else { + checkIsClass(validationDesignList, "validationDesign") + validationDesignList <- list(validationDesignList) + } + + # check the class and make a list if a single database setting + if (inherits(databaseDetails, "list")) { + lapply(databaseDetails, function(x) { + checkIsClass(x, "databaseDetails")}) + } else { + checkIsClass(databaseDetails, "databaseDetails") + databaseDetails <- list(databaseDetails) + } + results <- list(validationDesignList = validationDesignList, + databaseDetails = databaseDetails) + return(results) +} + +#' fromDesignOrModel - Check if the design has the setting, if not use the model's +#' @param validationDesign The validationDesign object +#' @param modelDesigns A list of modelDesign objects +#' @param settingName The name of the setting to check +#' @return The updated design +#' @keywords internal +fromDesignOrModel <- function(validationDesign, modelDesigns, settingName) { + settingsFromModel <- lapply(modelDesigns, function(x) x[[settingName]]) + if (is.null(validationDesign[[settingName]])) { + checkAllSameInModels(settingsFromModel, settingName) + validationDesign[[settingName]] <- settingsFromModel[[1]] + ParallelLogger::logInfo(paste0(settingName, " not set in design, using model's")) + } else { + if (any(unlist(lapply(modelDesigns, function(x) { + !identical(x[[settingName]], validationDesign[[settingName]]) + })))) { + ParallelLogger::logWarn(settingName, " are not the same in models and validationDesign") + } + } + return(validationDesign) +} + +#' getData - Get the plpData for the validation +#' @param design The validationDesign object +#' @param database The databaseDetails object +#' @param outputFolder The directory to save the validation results to +#' @param allCovSettings A list of covariateSettings from the models +#' @return The plpData object +#' @keywords internal +getData <- function(design, database, outputFolder, allCovSettings) { + databaseName <- database$cdmDatabaseName + plpDataName <- + paste0("targetId_", design$targetId, "_L", "1") + plpDataLocation <- + file.path(outputFolder, databaseName, plpDataName) + if (!dir.exists(plpDataLocation)) { + plpData <- tryCatch({ + do.call( + getPlpData, + list( + databaseDetails = database, + restrictPlpDataSettings = design$restrictPlpDataSettings, + covariateSettings = allCovSettings[[1]] + ) + ) + }, + error = function(e) { + ParallelLogger::logError(e) + return(NULL) + }) + if (!is.null(plpData)) { + if (!dir.exists(file.path(outputFolder, databaseName))) { + dir.create(file.path(outputFolder, databaseName), recursive = TRUE) + } + savePlpData(plpData, file = plpDataLocation) + } + } else { + ParallelLogger::logInfo(paste0("Data already extracted for ", + plpDataName, ": Loading from disk")) + plpData <- loadPlpData(plpDataLocation) + } + return(plpData) +} + +#' getPopulation - Get the population for the validationDesign +#' @param validationDesign The validationDesign objects +#' @param modelDesigns A list of modelDesign objects +#' @param plpData The plpData object +#' @return The population dataframe +#' @keywords internal +getPopulation <- function(validationDesign, modelDesigns, plpData) { + design <- fromDesignOrModel(validationDesign, modelDesigns, "populationSettings") + population <- tryCatch({ + do.call( + createStudyPopulation, + list( + plpData = plpData, + outcomeId = design$outcomeId, + populationSettings = design$populationSettings + ) + ) + }, + error = function(e) { + ParallelLogger::logError(e) + return(NULL) + }) + return(population) +} + diff --git a/R/Glm.R b/R/Glm.R new file mode 100644 index 000000000..2f7295139 --- /dev/null +++ b/R/Glm.R @@ -0,0 +1,112 @@ +# @file Glm.R +# +# Copyright 2024 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#' predict using a logistic regression model +#' +#' @description +#' Predict risk with a given plpModel containing a generalized linear model. +#' +#' @param plpModel An object of type \code{plpModel} - a patient level +#' prediction model +#' @param data An object of type \code{plpData} - the patient level prediction +#' data extracted from the CDM. +#' @param cohort The population dataframe created using +#' /code{createStudyPopulation} who will have their risks predicted or a cohort +#' without the outcome known +#' @export +#' @return A dataframe containing the prediction for each person in the +#' population +#' @export +predictGlm <- function(plpModel, data, cohort) { + start <- Sys.time() + + ParallelLogger::logInfo("predict risk probabilities using predictGlm") + + data$covariateData$coefficients <- plpModel$model$coefficients + on.exit(data$covariateData$coefficients <- NULL) + + prediction <- data$covariateData$covariates %>% + dplyr::inner_join(data$covariateData$coefficients, by = "covariateId") %>% + dplyr::mutate(values = .data$covariateValue * .data$coefficient) %>% + dplyr::group_by(.data$rowId) %>% + dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>% + dplyr::select("rowId", "value") + + prediction <- as.data.frame(prediction) + prediction <- merge(cohort, prediction, by = "rowId", all.x = TRUE, fill = 0) + prediction$value[is.na(prediction$value)] <- 0 + prediction$value <- prediction$value + plpModel$model$intercept + + if (plpModel$model$finalMapping == "linear") { + prediction$value <- prediction$value + } else if (plpModel$model$finalMapping == "logistic") { + prediction$value <- 1 / (1 + exp(-prediction$value)) + } else if (plpModel$model$finalMapping == "square") { + prediction$value <- prediction$value^2 + } else if (plpModel$model$finalMapping == "exponential") { + prediction$value <- exp(prediction$value) + } + + attr(prediction, "metaData")$modelType <- "binary" + + delta <- Sys.time() - start + ParallelLogger::logInfo("Prediction took ", signif(delta, 3), " ", attr(delta, "units")) + return(prediction) +} + +#' createGlmModel +#' +#' @description +#' Create a generalized linear model that can be used in the +#' PatientLevelPrediction package. +#' @param coefficients A dataframe containing two columns, coefficients and +#' covariateId, both of type numeric. The covariateId column must contain +#' valid covariateIds that match those used in the /code{FeatureExtraction} +#' package. +#' @param intercept A numeric value representing the intercept of the model. +#' @param finalMapping A string representing the final mapping from the +#' linear predictors to outcome probabilities. For generalized linear models +#' this is the inverse of the link function. Supported values is only +#' "logistic" for logistic regression model at the moment. +#' @return A model object containing the model and the prediction function. +#' @export +createGlmModel <- function(coefficients, + intercept = 0, + finalMapping = "logistic") { + checkIsClass(coefficients, c("data.frame")) + if (!all(c("covariateId", "coefficient") %in% colnames(coefficients))) { + stop("coefficients must contain columns covariateId and coefficient") + } + checkIsClass(coefficients$covariateId, c("numeric")) + checkIsClass(coefficients$coefficient, c("numeric")) + checkHigherEqual(coefficients$covariateId, 0) + checkIsClass(intercept, c("numeric")) + + checkIsClass(finalMapping, c("character")) + if (finalMapping != "logistic") { + stop("finalMapping must be 'logistic'") + } + + plpModel <- list( + intercept = intercept, + coefficients = coefficients, + finalMapping = finalMapping, + predictionFunction = "PatientLevelPrediction::predictGlm" + ) + plpModel$modelType <- "GLM" + return(plpModel) +} diff --git a/R/GradientBoostingMachine.R b/R/GradientBoostingMachine.R index 7035ee44f..fc2a48151 100644 --- a/R/GradientBoostingMachine.R +++ b/R/GradientBoostingMachine.R @@ -204,9 +204,13 @@ fitXgboost <- function( hyperParameters, settings ){ - - if(!is.null(hyperParameters$earlyStopRound)){ + set.seed(settings$seed) + if (!is.null(hyperParameters$earlyStopRound)) { trainInd <- sample(nrow(dataMatrix), nrow(dataMatrix)*0.9) + if (sum(labels$outcomeCount[-trainInd]) == 0) { + stop("No outcomes in early stopping set, either increase size of training + set or turn off early stopping") + } train <- xgboost::xgb.DMatrix( data = dataMatrix[trainInd,, drop = F], label = labels$outcomeCount[trainInd] @@ -215,9 +219,9 @@ fitXgboost <- function( data = dataMatrix[-trainInd,, drop = F], label = labels$outcomeCount[-trainInd] ) - watchlist <- list(train=train, test=test) + watchlist <- list(train = train, test = test) - } else{ + } else { train <- xgboost::xgb.DMatrix( data = dataMatrix, label = labels$outcomeCount @@ -225,10 +229,9 @@ fitXgboost <- function( watchlist <- list() } - outcomes <- sum(labels$outcomeCount>0) + outcomes <- sum(labels$outcomeCount > 0) N <- nrow(labels) outcomeProportion <- outcomes/N - set.seed(settings$seed) model <- xgboost::xgb.train( data = train, params = list( @@ -240,7 +243,6 @@ fitXgboost <- function( lambda = hyperParameters$lambda, alpha = hyperParameters$alpha, objective = "binary:logistic", - #eval.metric = "logloss" base_score = outcomeProportion, eval_metric = "auc" ), diff --git a/_pkgdown.yml b/_pkgdown.yml index 763c23504..9c01644ba 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -273,5 +273,7 @@ reference: - title: "Other functions" contents: - predictCyclops + - predictGlm + - createGlmModel diff --git a/man/checkAllSameInModels.Rd b/man/checkAllSameInModels.Rd new file mode 100644 index 000000000..336de6c10 --- /dev/null +++ b/man/checkAllSameInModels.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExternalValidatePlp.R +\name{checkAllSameInModels} +\alias{checkAllSameInModels} +\title{checkAllSameInModels - Check if all settings are the same across models} +\usage{ +checkAllSameInModels(settingsList, settingName) +} +\arguments{ +\item{settingsList}{A list of settings to check} + +\item{settingName}{The name of the setting to check} +} +\description{ +checkAllSameInModels - Check if all settings are the same across models +} +\keyword{internal} diff --git a/man/checkValidateExternalInputs.Rd b/man/checkValidateExternalInputs.Rd new file mode 100644 index 000000000..4ae2f6e8a --- /dev/null +++ b/man/checkValidateExternalInputs.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExternalValidatePlp.R +\name{checkValidateExternalInputs} +\alias{checkValidateExternalInputs} +\title{checkValidateExternalInputs - Check the inputs for validateExternal} +\usage{ +checkValidateExternalInputs( + validationDesignList, + databaseDetails, + logSettings, + outputFolder +) +} +\arguments{ +\item{validationDesignList}{A list of validationDesign objects} + +\item{databaseDetails}{A list of databaseDetails objects} + +\item{logSettings}{An object of logSettings} + +\item{outputFolder}{The directory to save the validation results to} +} +\value{ +A list of inputs that were modified +} +\description{ +checkValidateExternalInputs - Check the inputs for validateExternal +} +\keyword{internal} diff --git a/man/createCohortCovariateSettings.Rd b/man/createCohortCovariateSettings.Rd index 98d3480eb..37ec35177 100644 --- a/man/createCohortCovariateSettings.Rd +++ b/man/createCohortCovariateSettings.Rd @@ -7,8 +7,8 @@ createCohortCovariateSettings( cohortName, settingId, - cohortDatabaseSchema, - cohortTable, + cohortDatabaseSchema = NULL, + cohortTable = NULL, cohortId, startDay = -30, endDay = 0, @@ -23,9 +23,11 @@ createCohortCovariateSettings( \item{settingId}{A unique id for the covariate time and} -\item{cohortDatabaseSchema}{The schema of the database with the cohort} +\item{cohortDatabaseSchema}{The schema of the database with the cohort. If +nothing is specified then the cohortDatabaseSchema from databaseDetails at runtime is used.} -\item{cohortTable}{the table name that contains the covariate cohort} +\item{cohortTable}{the table name that contains the covariate cohort. If +nothing is specified then the cohortTable from databaseDetails at runtime is used.} \item{cohortId}{cohort id for the covariate cohort} diff --git a/man/createGlmModel.Rd b/man/createGlmModel.Rd new file mode 100644 index 000000000..6a6509fd7 --- /dev/null +++ b/man/createGlmModel.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Glm.R +\name{createGlmModel} +\alias{createGlmModel} +\title{createGlmModel} +\usage{ +createGlmModel(coefficients, intercept = 0, finalMapping = "logistic") +} +\arguments{ +\item{coefficients}{A dataframe containing two columns, coefficients and +covariateId, both of type numeric. The covariateId column must contain +valid covariateIds that match those used in the /code{FeatureExtraction} +package.} + +\item{intercept}{A numeric value representing the intercept of the model.} + +\item{finalMapping}{A string representing the final mapping from the +linear predictors to outcome probabilities. For generalized linear models +this is the inverse of the link function. Supported values is only +"logistic" for logistic regression model at the moment.} +} +\value{ +A model object containing the model and the prediction function. +} +\description{ +Create a generalized linear model that can be used in the +PatientLevelPrediction package. +} diff --git a/man/createValidationDesign.Rd b/man/createValidationDesign.Rd index f54b6aa78..a3c9a55db 100644 --- a/man/createValidationDesign.Rd +++ b/man/createValidationDesign.Rd @@ -7,9 +7,9 @@ createValidationDesign( targetId, outcomeId, - populationSettings, - restrictPlpDataSettings, plpModelList, + populationSettings = NULL, + restrictPlpDataSettings = NULL, recalibrate = NULL, runCovariateSummary = TRUE ) @@ -19,16 +19,25 @@ createValidationDesign( \item{outcomeId}{The outcomeId of the outcome cohort to validate on} -\item{populationSettings}{A list of population restriction settings created by \code{createPopulationSettings}} +\item{plpModelList}{A list of plpModels objects created by \code{runPlp} or a path to such objects} -\item{restrictPlpDataSettings}{A list of plpData restriction settings created by \code{createRestrictPlpDataSettings}} +\item{populationSettings}{A list of population restriction settings created +by \code{createPopulationSettings}. Default is NULL and then this is taken +from the model} -\item{plpModelList}{A list of plpModels objects created by \code{runPlp} or a path to such objects} +\item{restrictPlpDataSettings}{A list of plpData restriction settings +created by \code{createRestrictPlpDataSettings}. Default is NULL and then +this is taken from the model.} \item{recalibrate}{A vector of characters specifying the recalibration method to apply,} \item{runCovariateSummary}{whether to run the covariate summary for the validation data} } + +\value{ +A validation design object of class \code{validationDesign} +} + \description{ createValidationDesign - Define the validation design for external validation } diff --git a/man/extractModelDesigns.Rd b/man/extractModelDesigns.Rd new file mode 100644 index 000000000..478906e79 --- /dev/null +++ b/man/extractModelDesigns.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExternalValidatePlp.R +\name{extractModelDesigns} +\alias{extractModelDesigns} +\title{extractModelDesigns - Extract all modelDesigns from a list of plpModels} +\usage{ +extractModelDesigns(plpModelList) +} +\arguments{ +\item{plpModelList}{A list of plpModels} +} +\value{ +A list of modelDesigns +} +\description{ +extractModelDesigns - Extract all modelDesigns from a list of plpModels +} +\keyword{internal} diff --git a/man/fromDesignOrModel.Rd b/man/fromDesignOrModel.Rd new file mode 100644 index 000000000..7ad7c2d1a --- /dev/null +++ b/man/fromDesignOrModel.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExternalValidatePlp.R +\name{fromDesignOrModel} +\alias{fromDesignOrModel} +\title{fromDesignOrModel - Check if the design has the setting, if not use the model's} +\usage{ +fromDesignOrModel(validationDesign, modelDesigns, settingName) +} +\arguments{ +\item{validationDesign}{The validationDesign object} + +\item{modelDesigns}{A list of modelDesign objects} + +\item{settingName}{The name of the setting to check} +} +\value{ +The updated design +} +\description{ +fromDesignOrModel - Check if the design has the setting, if not use the model's +} +\keyword{internal} diff --git a/man/getData.Rd b/man/getData.Rd new file mode 100644 index 000000000..175d75831 --- /dev/null +++ b/man/getData.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExternalValidatePlp.R +\name{getData} +\alias{getData} +\title{getData - Get the plpData for the validation} +\usage{ +getData(design, database, outputFolder, allCovSettings) +} +\arguments{ +\item{design}{The validationDesign object} + +\item{database}{The databaseDetails object} + +\item{outputFolder}{The directory to save the validation results to} + +\item{allCovSettings}{A list of covariateSettings from the models} +} +\value{ +The plpData object +} +\description{ +getData - Get the plpData for the validation +} +\keyword{internal} diff --git a/man/getPopulation.Rd b/man/getPopulation.Rd new file mode 100644 index 000000000..62cd94c59 --- /dev/null +++ b/man/getPopulation.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExternalValidatePlp.R +\name{getPopulation} +\alias{getPopulation} +\title{getPopulation - Get the population for the validationDesign} +\usage{ +getPopulation(validationDesign, modelDesigns, plpData) +} +\arguments{ +\item{validationDesign}{The validationDesign objects} + +\item{modelDesigns}{A list of modelDesign objects} + +\item{plpData}{The plpData object} +} +\value{ +The population dataframe +} +\description{ +getPopulation - Get the population for the validationDesign +} +\keyword{internal} diff --git a/man/predictGlm.Rd b/man/predictGlm.Rd new file mode 100644 index 000000000..e7e09a57b --- /dev/null +++ b/man/predictGlm.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Glm.R +\name{predictGlm} +\alias{predictGlm} +\title{predict using a logistic regression model} +\usage{ +predictGlm(plpModel, data, cohort) +} +\arguments{ +\item{plpModel}{An object of type \code{plpModel} - a patient level +prediction model} + +\item{data}{An object of type \code{plpData} - the patient level prediction +data extracted from the CDM.} + +\item{cohort}{The population dataframe created using +/code{createStudyPopulation} who will have their risks predicted or a cohort +without the outcome known} +} +\value{ +A dataframe containing the prediction for each person in the +population +} +\description{ +Predict risk with a given plpModel containing a generalized linear model. +} diff --git a/tests/testthat/helper-functions.R b/tests/testthat/helper-functions.R index 3a44091e8..7170cf2aa 100644 --- a/tests/testthat/helper-functions.R +++ b/tests/testthat/helper-functions.R @@ -31,4 +31,33 @@ createTinyPlpData <- function(plpData, plpResult, n= 20) { attributes(tinyPlpData)$metaData <- attributes(plpData)$metaData class(tinyPlpData) <- class(plpData) return(tinyPlpData) -} \ No newline at end of file +} + +createData <- function(observations, features, totalFeatures, + numCovs = FALSE, + outcomeRate = 0.5, + seed = 42) { + rowId <- rep(1:observations, each = features) + withr::with_seed(42, { + columnId <- sample(1:totalFeatures, observations * features, replace = TRUE) + }) + covariateValue <- rep(1, observations * features) + covariates <- data.frame(rowId = rowId, columnId = columnId, covariateValue = covariateValue) + if (numCovs) { + numRow <- 1:observations + numCol <- rep(totalFeatures + 1, observations) + withr::with_seed(seed, { + numVal <- runif(observations) + }) + numCovariates <- data.frame(rowId = as.integer(numRow), + columnId = as.integer(numCol), + covariateValue = numVal) + covariates <- rbind(covariates, numCovariates) + } + withr::with_seed(seed, { + labels <- as.numeric(sample(0:1, observations, replace = TRUE, prob = c(1 - outcomeRate, outcomeRate))) + }) + + data <- list(covariates = covariates, labels = labels) + return(data) +} diff --git a/tests/testthat/test-KNN.R b/tests/testthat/test-KNN.R index 23cc8486b..001e20f3b 100644 --- a/tests/testthat/test-KNN.R +++ b/tests/testthat/test-KNN.R @@ -21,7 +21,8 @@ test_that('KNN fit works', { test_that("KNN settings", { -skip_on_ci() + skip_on_ci() + model_set <- setKNN(k=5) testthat::expect_is(model_set, "modelSettings") testthat::expect_length(model_set,2) diff --git a/tests/testthat/test-rclassifier.R b/tests/testthat/test-rclassifier.R index 5a0dadc31..72f6d024d 100644 --- a/tests/testthat/test-rclassifier.R +++ b/tests/testthat/test-rclassifier.R @@ -116,3 +116,37 @@ test_that("GBM working checks", { expect_equal(sum(abs(fitModel$covariateImportance$covariateValue))>0, TRUE) }) + + +test_that("GBM without outcomes in early stopping set errors", { + hyperParameters <- list( + ntrees = 10, + earlyStopRound = 2, + maxDepth = 3, + learnRate = 0.1, + minChildWeight = 1, + scalePosWeight = 1, + lambda = 1, + alpha = 0 + ) + observations <- 100 + features <- 10 + data <- createData(observations = observations, features = features, + totalFeatures = 10, + numCovs = FALSE, outcomeRate = 0.05) + dataMatrix <- Matrix::sparseMatrix( + i = data$covariates %>% dplyr::pull("rowId"), + j = data$covariates %>% dplyr::pull("columnId"), + x = data$covariates %>% dplyr::pull("covariateValue"), + dims = c(observations,features) + ) + labels <- data.frame(outcomeCount = data$labels) + settings <- list(seed = 42, threads = 2) + expect_error(fitXgboost(dataMatrix = dataMatrix, + labels = labels, + hyperParameters = hyperParameters, + settings = settings), + regexp = "* or turn off early stopping") + +}) + diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R index 84ee78516..ba3b793c8 100644 --- a/tests/testthat/test-validation.R +++ b/tests/testthat/test-validation.R @@ -73,3 +73,98 @@ test_that("external validate", { testthat::expect_equal(class(exVal[[1]]), 'externalValidatePlp') }) + +test_that("fromDesignOrModel helper works", { + + settingName <- "restrictPlpDataSettings" + validationDesign <- list(targetId = 1, + outcomeId = 2, + restrictPlpDataSettings = list(a = 1, b = 2) + ) + modelDesigns <- list( + list(targetId = 1, + outcomeId = 2, + restrictPlpDataSettings = list(a = 3, b = 4) + ), + list(targetId = 1, + outcomeId = 2, + restrictPlpDataSettings = list(a = 3, b = 4) + ) + ) + output <- fromDesignOrModel(validationDesign, modelDesigns, settingName) + + expect_equal(output[[settingName]], list(a = 1, b = 2)) + validationDesign[[settingName]] <- NULL + output <- fromDesignOrModel(validationDesign, modelDesigns, settingName) + expect_equal(output[[settingName]], list(a = 3, b = 4)) + +}) + +test_that("createValidationDesign errors", { + + expect_error(createValidationDesign(targetId = NULL, outcomeId = 2, + plpModelList = list())) + expect_error(createValidationDesign(targetId = 1, outcomeId = NULL, + plpModelList = list())) + expect_error(createValidationDesign(targetId = "a", outcomeId = 2, + plpModelList = list())) + expect_error(createValidationDesign(targetId = 1, outcomeId = "a", + plpModelList = list())) + expect_error(createValidationDesign(targetId = 1, outcomeId = 2, + plpModelList = list(), + populationSettings = list())) + expect_error(createValidationDesign(targetId = 1, outcomeId = 2, + plpModelList = list(), + recalibrate = 1)) + expect_error(createValidationDesign(targetId = 1, outcomeId = 2, + plpModelList = list(), + runCovariateSummary = 1)) +}) + +test_that("createValidationDesign works with minimal required arguments", { + targetId <- 1 + outcomeId <- 2 + plpModelList <- list() + + design <- createValidationDesign(targetId, outcomeId, plpModelList) + expect_s3_class(design, "validationDesign") + expect_equal(design$targetId, targetId) + expect_equal(design$outcomeId, outcomeId) + expect_equal(design$plpModelList, plpModelList) +}) + +test_that("createValidationDesign works with all arguments", { + targetId <- 1 + outcomeId <- 2 + plpModelList <- list("model1", "model2") + populationSettings <- createStudyPopulationSettings() + restrictPlpDataSettings <- createRestrictPlpDataSettings() # Replace with actual restrictPlpDataSettings object + recalibrate <- c("recalibrationInTheLarge") + runCovariateSummary <- FALSE + + design <- createValidationDesign(targetId, outcomeId, plpModelList, populationSettings, restrictPlpDataSettings, recalibrate, runCovariateSummary) + expect_s3_class(design[[1]], "validationDesign") + expect_equal(design[[1]]$targetId, targetId) + expect_equal(design[[1]]$outcomeId, outcomeId) + expect_equal(design[[1]]$plpModelList, plpModelList) + expect_equal(design[[1]]$populationSettings, populationSettings) + expect_equal(design[[1]]$restrictPlpDataSettings, restrictPlpDataSettings[[1]]) + expect_equal(design[[1]]$recalibrate, recalibrate) + expect_equal(design[[1]]$runCovariateSummary, runCovariateSummary) +}) + +test_that("createValidationDesigns correctly handles multiple restrictSettings", { + targetId <- 1 + outcomeId <- 2 + plpModelList <- list() + restrictPlpDataSettings <- list(createRestrictPlpDataSettings(), createRestrictPlpDataSettings()) + + design <- createValidationDesign(targetId, outcomeId, plpModelList, restrictPlpDataSettings = restrictPlpDataSettings) + expect_s3_class(design[[1]], "validationDesign") + expect_equal(design[[1]]$targetId, targetId) + expect_equal(design[[1]]$outcomeId, outcomeId) + expect_equal(design[[1]]$plpModelList, plpModelList) + expect_equal(design[[1]]$restrictPlpDataSettings, restrictPlpDataSettings[[1]]) + expect_equal(design[[2]]$restrictPlpDataSettings, restrictPlpDataSettings[[2]]) + expect_equal(length(design), length(restrictPlpDataSettings)) +}) \ No newline at end of file diff --git a/vignettes/BestPractices.rmd b/vignettes/BestPractices.rmd index bfc1bc792..1fba21806 100644 --- a/vignettes/BestPractices.rmd +++ b/vignettes/BestPractices.rmd @@ -23,11 +23,13 @@ output: number_sections: yes toc: yes --- + +```{=html} - +``` ## Best practice publications using the OHDSI PatientLevelPrediction framework