From 368782a8b2d9c3908cb885d0d5bd453e739ce51e Mon Sep 17 00:00:00 2001 From: Raphael Winkelmann Date: Fri, 30 Oct 2020 18:29:50 +0100 Subject: [PATCH] more flat data join stuff --- R/emuR-database.flatfiledata.R | 80 +++++++++++------ .../test_emuR-database.flatfiledata.R | 88 +++++++++++++------ 2 files changed, 111 insertions(+), 57 deletions(-) diff --git a/R/emuR-database.flatfiledata.R b/R/emuR-database.flatfiledata.R index cbe8229d..c5779313 100644 --- a/R/emuR-database.flatfiledata.R +++ b/R/emuR-database.flatfiledata.R @@ -1,4 +1,44 @@ -##' List sample rates of media and annotation (_annot.json) files +read_and_join_key_value_tsv <- function(emuDBhandle, + file, + x, + bundleName, + sessionName){ + if(file.exists(file)){ + key_value_tsv = readr::read_tsv(file, col_types = readr::cols()) + if(all(names(key_value_tsv) == c("key", "value"))){ + key_value_tsv_pivoted = tidyr::pivot_wider(key_value_tsv, names_from = "key", values_from = "value") + if(missing(bundleName) && missing(sessionName)){ + res = dplyr::full_join(x, key_value_tsv_pivoted, by = character()) + } else if(missing(bundleName) && !missing(sessionName)) { + # join by session + browser() + key_value_tsv_pivoted$session = sessionName + res = dplyr::left_join(x, key_value_tsv_pivoted, by = "session") + } + } else { + stop(path2tsv, " doesn't only contain the columns 'key' and 'value'. Only these two columns are permitted!") + } + return(res) + }else{ + return(x) + } +} + +read_and_join_long_tsv <- function(emuDBhandle, file, x){ + if(file.exists(file)){ + long_tsv = readr::read_tsv(file, col_types = readr::cols()) + if(all(c("session", "bundle") %in% names(long_tsv))){ + res = dplyr::left_join(x, long_tsv, by = c("bundle", "session")) + } else { + stop(file, " doesn't only contain the columns 'session' and 'value'. Only these two columns are permitted!") + } + return(res) + } else { + return(x) + } +} + +##' Join flat file .tsv data to x ##' ##' @param emuDBhandle emuDB handle object (see \link{load_emuDB}) ##' @param sessionPattern A regular expression pattern matching session names to be searched from the database @@ -28,44 +68,26 @@ join_tsvs <- function(emuDBhandle, & grepl(pattern = bundlePattern, x = all_bundles, perl = T) ] - - ############################## - # handle session level - - for(sessio_name in unique(all_bundles$session)){ - # get keyValue tsv file on session level - path2tsv = file.path(emuDBhandle$basePath, sessio_name, paste0(sessio_name, "_keyValue.", "tsv")) - } - ############################## # handle emuDB level # get keyValue tsv file on emuDB level - #browser() path2tsv = file.path(emuDBhandle$basePath, paste0(emuDBhandle$dbName, "_keyValue.", "tsv")) - if(file.exists(path2tsv)){ - key_value_tsv = readr::read_tsv(path2tsv, col_types = readr::cols()) - if(all(names(key_value_tsv) == c("key", "value"))){ - res = dplyr::full_join(x, key_value_tsv, by = character()) - } else { - stop(path2tsv, " doesn't only contain the columns 'key' and 'value'. Only these two columns are permitted!") - } - - } + x = read_and_join_key_value_tsv(emuDBhandle, file = path2tsv, x = x) # get long tsv file on emuDB level path2tsv = file.path(emuDBhandle$basePath, paste0(emuDBhandle$dbName, "_long.", "tsv")) - if(file.exists(path2tsv)){ - long_tsv = readr::read_tsv(path2tsv, col_types = readr::cols()) - if(all(c("session", "bundle") %in% names(long_tsv))){ - res = dplyr::left_join(res, long_tsv, by = c("bundle", "session")) - } else { - stop(path2tsv, " doesn't only contain the columns 'key' and 'value'. Only these two columns are permitted!") - } - } + x = read_and_join_long_tsv(emuDBhandle, file = path2tsv, x) + ############################## + # handle session level + for(session_name in unique(all_bundles$session)){ + # get keyValue tsv file on session level + path2tsv = file.path(emuDBhandle$basePath, paste0(session_name, session.suffix), paste0(session_name, "_keyValue.", "tsv")) + x = read_and_join_key_value_tsv(emuDBhandle, file = path2tsv, x = x, sessionName = session_name) + } - return(res) + return(x) } diff --git a/tests/testthat/test_emuR-database.flatfiledata.R b/tests/testthat/test_emuR-database.flatfiledata.R index b31fd27a..8d751a43 100644 --- a/tests/testthat/test_emuR-database.flatfiledata.R +++ b/tests/testthat/test_emuR-database.flatfiledata.R @@ -16,15 +16,25 @@ unlink(path2db, recursive = T) file.copy(path2orig, path2testData, recursive = T) + +# create a second session +dir.create(file.path(path2db, "0001_ses")) +file.copy(from = list.files(file.path(path2db, "0000_ses"), full.names = T), + to = file.path(path2db, "0001_ses"), + recursive = T) + +# extract internalVars from environment .emuR_pkgEnv +internalVars = get("internalVars", + envir = .emuR_pkgEnv) + db = load_emuDB(path2db, inMemoryCache = internalVars$testingVars$inMemoryCache, verbose = F) - test_that("join_tsvs works on emuDB level", { # key value emuDB data - flat_data = tibble::tibble(key = c("location of creation", "institution"), value = c("Muenchen", "IPS")) + flat_data = tibble::tibble(key = c("location", "institution"), value = c("Muenchen", "IPS")) readr::write_tsv(x = flat_data, file = file.path(db$basePath, paste0(db$dbName, "_keyValue.", "tsv"))) @@ -32,29 +42,11 @@ test_that("join_tsvs works on emuDB level", { sl_joined = join_tsvs(db, sl) - expect_true(all(c("key", "value") %in% names(sl_joined))) - - expect_equal(sl_joined$key, c("location of creation", - "institution", - "location of creation", - "institution", - "location of creation", - "institution", - "location of creation", - "institution", - "location of creation", - "institution")) - - expect_equal(sl_joined$value, c("Muenchen", - "IPS", - "Muenchen", - "IPS", - "Muenchen", - "IPS", - "Muenchen", - "IPS", - "Muenchen", - "IPS")) + expect_true(all(c("location", "institution") %in% names(sl_joined))) + + expect_equal(sl_joined$location, rep("Muenchen", 10)) + + expect_equal(sl_joined$institution, rep("IPS", 10)) # long emuDB data # specify session and bundle @@ -65,8 +57,48 @@ test_that("join_tsvs works on emuDB level", { readr::write_tsv(x = long_data, file = file.path(db$basePath, paste0(db$dbName, "_long.", "tsv"))) sl_joined = join_tsvs(db, sl) - # 4 NAs - expect_equal(length(which(is.na(sl_joined$eyecolor))), 4) + expect_equal(length(which(is.na(sl_joined$eyecolor))), 7) +}) + +test_that("join_tsvs works on session level", { + + # key value session data + flat_data = tibble::tibble(key = c("location", "fudge", "speed"), value = c("Muenchen", "yummy", "fast")) + + readr::write_tsv(x = flat_data, file = file.path(db$basePath, "0000_ses", paste0("0000", "_keyValue.", "tsv"))) + + sl = query(db, "Phonetic == S") + + sl_joined = join_tsvs(db, sl) + + expect_true(all(c("location.x", "location.y") %in% names(sl_joined))) + expect_equal(length(which(is.na(sl_joined$location.x))), 5) + expect_equal(length(which(is.na(sl_joined$fudge))), 5) + expect_equal(length(which(is.na(sl_joined$speed))), 5) -}) \ No newline at end of file + # + # expect_equal(sl_joined$location, c("Muenchen", + # "Muenchen", + # "Muenchen", + # "Muenchen", + # "Muenchen")) + # + # expect_equal(sl_joined$institution, c("IPS", + # "IPS", + # "IPS", + # "IPS", + # "IPS")) + # + # # long emuDB data + # # specify session and bundle + # long_data = tibble::tibble(session = c("0000", "0000"), + # bundle = c("msajc003", "msajc012"), + # eyecolor = c("blue", "brown")) + # + # readr::write_tsv(x = long_data, file = file.path(db$basePath, paste0(db$dbName, "_long.", "tsv"))) + # + # sl_joined = join_tsvs(db, sl) + # # 4 NAs + # expect_equal(length(which(is.na(sl_joined$eyecolor))), 2) +})