Skip to content

Commit

Permalink
Merge pull request #395 from OHDSI/overlap_extend
Browse files Browse the repository at this point in the history
merge or extend overlapping records
  • Loading branch information
edward-burn authored Dec 8, 2024
2 parents b37a4f6 + 291cd55 commit 2436a12
Show file tree
Hide file tree
Showing 4 changed files with 263 additions and 16 deletions.
98 changes: 91 additions & 7 deletions R/conceptCohort.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,23 @@
#' condition_start_date and condition_end_date for records coming
#' from the condition_occurrence tables). So that the resulting table
#' satisfies the requirements of an OMOP CDM cohort table:
#' * Overlapping records are collapsed into a single cohort entry.
#' * If a record starts outside of an observation period it will be
#' silently ignored.
#' * If a record ends outside of an observation period it will be
#' trimmed so as to end at the preceding observation period end date.
#' * Cohort entries will not overlap. Overlapping records will be
#' combined based on the overlap argument.
#' * Cohort entries will not go out of observation. If a record starts
#' outside of an observation period it will be silently ignored. If a
#' record ends outside of an observation period it will be trimmed so
#' as to end at the preceding observation period end date.
#'
#' @inheritParams cdmDoc
#' @inheritParams conceptSetDoc
#' @inheritParams nameDoc
#' @param exit How the cohort end date is defined. Can be either
#' "event_end_date" or "event_start_date".
#' @param overlap How to deal with overlapping records. In all
#' cases cohort start will be set as the earliest start date. If
#' "merge", cohort end will be the latest end date. If "extend",
#' cohort end date will be set by adding together the total days
#' from each of the overlapping records.
#' @param useSourceFields If TRUE, the source concept_id fields will also be
#' used when identifying relevant clinical records. If FALSE, only the standard
#' concept_id fields will be used.
Expand Down Expand Up @@ -57,6 +63,7 @@ conceptCohort <- function(cdm,
conceptSet,
name,
exit = "event_end_date",
overlap = "merge",
useSourceFields = FALSE,
subsetCohort = NULL,
subsetCohortId = NULL) {
Expand All @@ -65,6 +72,7 @@ conceptCohort <- function(cdm,
cdm <- omopgenerics::validateCdmArgument(cdm)
conceptSet <- omopgenerics::validateConceptSetArgument(conceptSet, cdm)
omopgenerics::assertChoice(exit, c("event_start_date", "event_end_date"))
omopgenerics::assertChoice(overlap, c("merge", "extend"), length = 1)
omopgenerics::assertLogical(useSourceFields, length = 1)

useIndexes <- getOption("CohortConstructor.use_indexes")
Expand Down Expand Up @@ -190,10 +198,20 @@ conceptCohort <- function(cdm,
cli::cli_inform(c("i" = "Applying cohort requirements."))
cdm[[name]] <- fulfillCohortReqs(cdm = cdm, name = name)

cli::cli_inform(c("i" = "Collapsing records."))
if(overlap == "merge"){
cli::cli_inform(c("i" = "Merging overlapping records."))
cdm[[name]] <- cdm[[name]] |>
joinOverlap(name = name, gap = 0) |>
omopgenerics::recordCohortAttrition(reason = "Collapse overlapping records")
omopgenerics::recordCohortAttrition(reason = "Merge overlapping records")
}

if(overlap == "extend"){
cli::cli_inform(c("i" = "Merging overlapping records."))
cdm[[name]] <- cdm[[name]] |>
extendOverlap(name = name) |>
omopgenerics::recordCohortAttrition(reason = "Add overlapping records")
}


cdm[[name]] <- omopgenerics::newCohortTable(table = cdm[[name]])

Expand Down Expand Up @@ -462,3 +480,69 @@ getDomainCohort <- function(cdm,
) |>
dplyr::compute(temporary = FALSE, name = name)
}

extendOverlap <- function(cohort,
name){

workingTblNames <- paste0(omopgenerics::uniqueTableName(), "_", c(1:4))

cdm <- omopgenerics::cdmReference(cohort)

cohort <- cohort %>%
dplyr::mutate(record_id = dplyr::row_number()) |>
dplyr::compute(temporary = FALSE,
name = workingTblNames[1])

# keep overlapping records
cohort_overlap <- cohort %>%
dplyr::inner_join(cohort,
by = c("cohort_definition_id", "subject_id"),
suffix = c("", "_overlap")) |>
dplyr::filter(
record_id != record_id_overlap,
cohort_start_date <= cohort_end_date_overlap &
cohort_end_date >= cohort_start_date_overlap
) |>
dplyr::select("cohort_definition_id", "subject_id",
"cohort_start_date", "cohort_end_date",
"record_id") |>
dplyr::distinct() |>
dplyr::compute(temporary = FALSE,
name = workingTblNames[2])

cohort_no_overlap <- cohort |>
dplyr::anti_join(cohort_overlap |>
dplyr::select("record_id"),
by = "record_id") |>
dplyr::select(!"record_id") |>
dplyr::compute(temporary = FALSE,
name = workingTblNames[3])

cohort_overlap <- cohort_overlap %>%
dplyr::mutate(days = !!CDMConnector::datediff("cohort_start_date",
"cohort_end_date")) |>
dplyr::group_by(dplyr::pick("cohort_definition_id",
"subject_id")) |>
dplyr::summarise(cohort_start_date = min(.data$cohort_start_date, na.rm = TRUE),
days = as.integer(sum(.data$days))) %>%
dplyr:: ungroup() %>%
dplyr::mutate(cohort_end_date = as.Date(
!!CDMConnector::dateadd(
date = "cohort_start_date",
number = "days",
interval = "day"
))) |>
dplyr::select(!"days") |>
dplyr::compute(temporary = FALSE,
name = workingTblNames[4])

cohort_updated <- dplyr::union_all(cohort_overlap,
cohort_no_overlap) |>
dplyr::compute(name = name, temporary = FALSE)

CDMConnector::dropTable(cdm = cdm,
name = workingTblNames)

cohort_updated

}
18 changes: 13 additions & 5 deletions man/conceptCohort.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions man/measurementCohort.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

156 changes: 155 additions & 1 deletion tests/testthat/test-conceptCohort.R
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ test_that("simple example", {
"reason_id" = 1:4L,
"reason" = c(
"Initial qualifying events", "Record start <= record end",
"Record in observation", "Collapse overlapping records"
"Record in observation", "Merge overlapping records"
),
"excluded_records" = c(0L, 0L, 0L, 5L),
"excluded_subjects" = 0L
Expand Down Expand Up @@ -747,6 +747,160 @@ test_that("missing event end dates", {
PatientProfiles::mockDisconnect(cdm)
})

test_that("overlap option", {
skip_on_cran()
cdm <- omock::mockPerson(nPerson = 1)
cdm <- omopgenerics::insertTable(
cdm = cdm, name = "observation_period", table = dplyr::tibble(
"observation_period_id" = c(1L, 2L),
"person_id" = c(1L, 2L),
"observation_period_start_date" = as.Date(c("2020-01-01")),
"observation_period_end_date" = as.Date(c("2020-01-30")),
"period_type_concept_id" = NA_integer_
))
cdm <- omopgenerics::insertTable(
cdm = cdm, name = "drug_exposure", table = dplyr::tibble(
"drug_exposure_id" = c(1L, 2L, 3L, 4L, 5L, 6L),
"person_id" = c(1L, 1L, 1L, 1L, 2L, 1L),
"drug_concept_id" = c(1L, 1L, 1L, 1L, 1L, 2L),
# 3 overlapping records
"drug_exposure_start_date" = as.Date(c("2020-01-01",
"2020-01-04",
"2020-01-05",
"2020-01-20",
"2020-01-01",
"2020-01-02")),
"drug_exposure_end_date" = as.Date(c("2020-01-06",
"2020-01-08",
"2020-01-06",
"2020-01-21",
"2020-01-21",
"2020-01-03")),
"drug_type_concept_id" = 1L
) )
cdm <- omopgenerics::insertTable(
cdm = cdm, name = "concept", table = dplyr::tibble(
"concept_id" = c(1L, 2L),
"concept_name" = c("concept 1", "concept 2"),
"domain_id" = "drug",
"vocabulary_id" = NA,
"concept_class_id" = NA,
"concept_code" = NA,
"valid_start_date" = NA,
"valid_end_date" = NA
)
)

cdm <- cdm |> copyCdm()

expect_no_error(cdm$cohort_1 <- conceptCohort(cdm = cdm,
conceptSet = list(a = 1L,
b = 2L),
name = "cohort_1",
exit = "event_end_date",
overlap = "merge"))
expect_true(nrow(cdm$cohort_1 |>
dplyr::collect()) == 4)
expect_true(all(sort(cdm$cohort_1 |>
dplyr::pull("cohort_start_date")) ==
as.Date(c("2020-01-01",
"2020-01-01",
"2020-01-02",
"2020-01-20"))))
expect_true(all(sort(cdm$cohort_1 |>
dplyr::pull("cohort_end_date")) ==
as.Date(c("2020-01-03",
"2020-01-08",
"2020-01-21",
"2020-01-21"))))

expect_no_error(cdm$cohort_2 <- conceptCohort(cdm = cdm,
conceptSet = list(a = 1L,
b = 2L),
name = "cohort_2",
exit = "event_end_date",
overlap = "extend"))
expect_true(all(sort(cdm$cohort_2 |>
dplyr::pull("cohort_start_date")) ==
as.Date(c("2020-01-01",
"2020-01-01",
"2020-01-02",
"2020-01-20"))))
expect_true(all(sort(cdm$cohort_2 |>
dplyr::pull("cohort_end_date")) ==
as.Date(c("2020-01-03",
"2020-01-11", # now 11 instead of 8 (3 days overlap)
"2020-01-21",
"2020-01-21"))))


# only overlapping records
cdm <- omopgenerics::insertTable(
cdm = cdm, name = "drug_exposure", table = dplyr::tibble(
"drug_exposure_id" = c(1L, 2L),
"person_id" = c(1L, 1L),
"drug_concept_id" = c(1L, 1L),
"drug_exposure_start_date" = as.Date(c("2020-01-01",
"2020-01-04")),
"drug_exposure_end_date" = as.Date(c("2020-01-06",
"2020-01-08")),
"drug_type_concept_id" = 1L
))

expect_no_error(cdm$cohort_3 <- conceptCohort(cdm = cdm,
conceptSet = list(a = 1L,
b = 2L),
name = "cohort_3",
exit = "event_end_date",
overlap = "extend"))
expect_true(nrow(cdm$cohort_3 |>
dplyr::collect()) == 1)
expect_true(all(sort(cdm$cohort_3 |>
dplyr::pull("cohort_start_date")) ==
as.Date(c("2020-01-01"))))
expect_true(all(sort(cdm$cohort_3 |>
dplyr::pull("cohort_end_date")) ==
as.Date(c("2020-01-10"))))

# no overlapping records
cdm <- omopgenerics::insertTable(
cdm = cdm, name = "drug_exposure", table = dplyr::tibble(
"drug_exposure_id" = c(1L, 2L),
"person_id" = c(1L, 1L),
"drug_concept_id" = c(1L, 1L),
"drug_exposure_start_date" = as.Date(c("2020-01-01",
"2020-01-05")),
"drug_exposure_end_date" = as.Date(c("2020-01-03",
"2020-01-10")),
"drug_type_concept_id" = 1L
))

expect_no_error(cdm$cohort_4 <- conceptCohort(cdm = cdm,
conceptSet = list(a = 1L,
b = 2L),
name = "cohort_4",
exit = "event_end_date",
overlap = "extend"))
expect_true(nrow(cdm$cohort_4 |>
dplyr::collect()) == 2)
expect_true(all(sort(cdm$cohort_4 |>
dplyr::pull("cohort_start_date")) ==
as.Date(c("2020-01-01", "2020-01-05"))))
expect_true(all(sort(cdm$cohort_4 |>
dplyr::pull("cohort_end_date")) ==
as.Date(c("2020-01-03", "2020-01-10"))))

# wrong input
expect_error(cdm$cohort_5 <- conceptCohort(cdm = cdm,
conceptSet = list(a = 1L),
name = "cohort_5",
exit = "event_end_date",
overlap = "another"))


PatientProfiles::mockDisconnect(cdm)
})

test_that("test indexes - postgres", {
skip_on_cran()
skip_if(Sys.getenv("CDM5_POSTGRESQL_DBNAME") == "")
Expand Down

0 comments on commit 2436a12

Please sign in to comment.