From 01234332f6d3b55d4f88913e57fc4fd1537b9139 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Thu, 19 Sep 2024 20:16:13 -0600 Subject: [PATCH 01/10] refactor/simplify get_data_package code --- R/get_data_packages.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_data_packages.R b/R/get_data_packages.R index 10ddb1c..0f6bc17 100644 --- a/R/get_data_packages.R +++ b/R/get_data_packages.R @@ -112,7 +112,7 @@ get_data_packages <- function(reference_id, destination_dir <- paste("data/", reference_id[i], sep = "") #if the directory already exists, prompt user to overwrite: if(force == FALSE) { - if (file.exists(destination_dir) & force == FALSE){ + if (file.exists(destination_dir)){ cat("The directory ", crayon::blue$bold(destination_dir), " already exists.\n", From ffdb1f0d8cd40482088b72b580e0f1ae507704ff Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Thu, 19 Sep 2024 20:16:43 -0600 Subject: [PATCH 02/10] add @noRd to roxygen documentation for non-exported functions. --- R/load_core_metadata.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/load_core_metadata.R b/R/load_core_metadata.R index be2d518..b102711 100644 --- a/R/load_core_metadata.R +++ b/R/load_core_metadata.R @@ -132,6 +132,7 @@ load_core_metadata <- function(ds_ref, path = paste0(getwd(), "/data")){ #' #' @return dataframe #' @keywords private +#' @noRd #' #' @examples #' \dontrun{ @@ -199,6 +200,7 @@ load_core_metadata <- function(ds_ref, path = paste0(getwd(), "/data")){ #' #' @return dataframe #' @keywords private +#' @noRd #' #' @examples #' \dontrun{ From cdf04fb969a52a986234d9dd0ebc09644d04dfb2 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Thu, 19 Sep 2024 20:17:09 -0600 Subject: [PATCH 03/10] Add @keywords private and @noRd roxygen to functions that are not exported. --- R/utils.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index 84f77e1..3e88b00 100644 --- a/R/utils.R +++ b/R/utils.R @@ -29,7 +29,8 @@ assign("ds_dev_api", "https://irmadevservices.nps.gov/datastore-secure/v7/rest/" #' Prompts for, gets, and returns binary user input (1 or 2) #' #' @return Factor. 1 or 2. -#' +#' @keywords internal +#' @noRd #' @examples #' \dontrun{ #' var1 <- .get_user_input() From 32b17366d4aa9fa6e758db7caa1e0c92e0457855 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Thu, 19 Sep 2024 20:17:42 -0600 Subject: [PATCH 04/10] updated via devtools document and pkgdown::build_site_github_pages --- docs/articles/NPSutils.html | 2 +- docs/news/index.html | 2 +- docs/pkgdown.yml | 2 +- docs/reference/dot-get_authors.html | 110 ------------------------- docs/reference/dot-get_contacts.html | 109 ------------------------ docs/reference/dot-get_user_input.html | 97 ---------------------- docs/reference/index.html | 12 --- docs/sitemap.xml | 3 - man/dot-get_authors.Rd | 28 ------- man/dot-get_contacts.Rd | 26 ------ man/dot-get_user_input.Rd | 19 ----- 11 files changed, 3 insertions(+), 407 deletions(-) delete mode 100644 docs/reference/dot-get_authors.html delete mode 100644 docs/reference/dot-get_contacts.html delete mode 100644 docs/reference/dot-get_user_input.html delete mode 100644 man/dot-get_authors.Rd delete mode 100644 man/dot-get_contacts.Rd delete mode 100644 man/dot-get_user_input.Rd diff --git a/docs/articles/NPSutils.html b/docs/articles/NPSutils.html index af4a28f..7bbf9e3 100644 --- a/docs/articles/NPSutils.html +++ b/docs/articles/NPSutils.html @@ -79,7 +79,7 @@

NPSutils

+library(NPSutils)

NPS Data Store Utilities

diff --git a/docs/news/index.html b/docs/news/index.html index 7edd376..ff54202 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -61,7 +61,7 @@
-  • added private functions .get_authors() and .get_contacts() to retrieve authors and contacts (and emails) from EML
+  • added private functions .get_authors() and .get_contacts() to retrieve authors and contacts (and emails) from EML
   • added load_EML_df(), which retrieves commonly available metadata items from an EML-formatted R object and returns them as a single dataframe (for loading into Power BI)
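A hedged sketch of the load_EML_df() workflow described in the entry above (load_EML_df() itself is not part of these patches, so its argument name is an assumption):

    # read the EML metadata of an already-downloaded data package, then flatten it
    metadata <- DPchecker::load_metadata(directory = here::here("data", "2272461"))
    eml_df <- load_EML_df(metadata)  # assumed: one EML object in, one dataframe out (e.g. for Power BI)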
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 2b2f88b..53d583e 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -3,4 +3,4 @@ pkgdown: 2.1.0 pkgdown_sha: ~ articles: NPSutils: NPSutils.html -last_built: 2024-08-27T19:22Z +last_built: 2024-09-19T22:29Z diff --git a/docs/reference/dot-get_authors.html b/docs/reference/dot-get_authors.html deleted file mode 100644 index e8aedfb..0000000 --- a/docs/reference/dot-get_authors.html +++ /dev/null @@ -1,110 +0,0 @@ - -Extracts authors and contact email addresses from EML metadata — .get_authors • NPSutils - - -
-Description: `.get_authors()` extracts the "creators" element from EML metadata and returns it as a dataframe with three columns: first, a column indicating that each row is an author; second, a column with the author's name (first last); third, the author's email address.
-Usage: .get_authors(metadata)
-Arguments: metadata — an EML formatted R object
-Value: dataframe
-Details: `r lifecycle::badge('experimental')`. There are some known issues with this function; unfortunately at this time we do not have example data packages to test them. These include: authors without a givenName, authors with more than two givenNames (e.g. multiple middle names), organizations as authors where there is no individualName.
-Examples: authors <- .get_authors(metadata)
-[pkgdown page navigation and footer omitted]
-
- -
- - - - - - - - diff --git a/docs/reference/dot-get_contacts.html b/docs/reference/dot-get_contacts.html deleted file mode 100644 index 88a50e8..0000000 --- a/docs/reference/dot-get_contacts.html +++ /dev/null @@ -1,109 +0,0 @@ - -Extracts contacts and email addresses from EML metadata — .get_contacts • NPSutils - - -
-Description: `.get_contacts()` extracts the "contacts" element from EML metadata and returns it as a dataframe with three columns: first, a column indicating that each row is a contact; second, a column with the contact's name (first last); third, the contact's email address.
-Usage: .get_contacts(metadata)
-Arguments: metadata — an EML formatted R object
-Value: dataframe
-Details: `r lifecycle::badge('experimental')`
-Examples: contacts <- .get_contacts(metadata)
-[pkgdown page navigation and footer omitted]

-
- -
- - - - - - - - diff --git a/docs/reference/dot-get_user_input.html b/docs/reference/dot-get_user_input.html deleted file mode 100644 index 2c99954..0000000 --- a/docs/reference/dot-get_user_input.html +++ /dev/null @@ -1,97 +0,0 @@ - -Get Binary User Input — .get_user_input • NPSutils - - -
-Description: Prompts for, gets, and returns binary user input (1 or 2)
-Usage: .get_user_input()
-Value: Factor. 1 or 2.
-Examples: var1 <- .get_user_input()
-[pkgdown page navigation and footer omitted]
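The removed page above documented a small prompt helper; a minimal sketch of the pattern it describes (not the package's actual source, which lives in R/utils.R):

    .get_user_input <- function() {
      var1 <- readline(prompt = "1: Yes\n2: No\n")
      factor(var1, levels = c("1", "2"))  # a factor, 1 or 2, per the documented return value
    }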

-
- -
- - - - - - - - diff --git a/docs/reference/index.html b/docs/reference/index.html index 05f11a3..183fbbc 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -65,18 +65,6 @@

 All functions
 check_ref_exists() — Check whether a reference exists on DataStore
-.get_authors() — Extracts authors and contact email addresses from EML metadata
-.get_contacts() — Extracts contacts and email addresses from EML metadata
-.get_user_input() — Get Binary User Input
 get_data_packages() get_data_package()

diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 89b2506..0911b3e 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -10,9 +10,6 @@ /reference/check_is_data_package.html /reference/check_new_version.html /reference/check_ref_exists.html -/reference/dot-get_authors.html -/reference/dot-get_contacts.html -/reference/dot-get_user_input.html /reference/get_data_packages.html /reference/get_new_version_id.html /reference/get_park_code.html diff --git a/man/dot-get_authors.Rd b/man/dot-get_authors.Rd deleted file mode 100644 index 606aec3..0000000 --- a/man/dot-get_authors.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/load_core_metadata.R -\name{.get_authors} -\alias{.get_authors} -\title{Extracts authors and contact email addresses from EML metadata} -\usage{ -.get_authors(metadata) -} -\arguments{ -\item{metadata}{an EML formatted R object} -} -\value{ -dataframe -} -\description{ -`.get_authors()` extracts the "creators" element from EML metadata and returns it as a dataframe with three columsn, first a column indicating that each row is an author. Second, and column with the author's name (first last). Third, the author's email address. -} -\details{ -`r lifecycle::badge('experimental')` - -There are some known issues with this function; unfortunately at this time we do not have example data packages to test them. These include: authors without a givenName, authors with more than two givenNames (e.g. multiple middle names), organizations as authors where there is no individualName. -} -\examples{ -\dontrun{ -authors <- get_authors(metadata) -} -} -\keyword{private} diff --git a/man/dot-get_contacts.Rd b/man/dot-get_contacts.Rd deleted file mode 100644 index 401ce7e..0000000 --- a/man/dot-get_contacts.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/load_core_metadata.R -\name{.get_contacts} -\alias{.get_contacts} -\title{Extracts contacts and email addresses from EML metadata} -\usage{ -.get_contacts(metadata) -} -\arguments{ -\item{metadata}{an EML formatted R object} -} -\value{ -dataframe -} -\description{ -`.get_contacts()` extracts the "contacts" element from EML metadata and returns it as a dataframe with three columsn, first a column indicating that each row is an contact. Second, and column with the contact's name (first last). Third, the contact's email address. -} -\details{ -`r lifecycle::badge('experimental')` -} -\examples{ -\dontrun{ -contacts <- get_contacts(metadata) -} -} -\keyword{private} diff --git a/man/dot-get_user_input.Rd b/man/dot-get_user_input.Rd deleted file mode 100644 index 9560c1d..0000000 --- a/man/dot-get_user_input.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{.get_user_input} -\alias{.get_user_input} -\title{Get Binary User Input} -\usage{ -.get_user_input() -} -\value{ -Factor. 1 or 2. -} -\description{ -Prompts for, gets, and returns binary user input (1 or 2) -} -\examples{ -\dontrun{ -var1 <- .get_user_input() -} -} From bbb7cd82a841a90e8818bd53a3ed991bfaa78291 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Wed, 25 Sep 2024 15:47:47 -0600 Subject: [PATCH 05/10] first semi-working version of load_data_packages that calls column types using metadata attributes. 
--- R/load_data_packages.R | 207 +++++++++++++++++++++++++++++++++-------- 1 file changed, 167 insertions(+), 40 deletions(-) diff --git a/R/load_data_packages.R b/R/load_data_packages.R index e774334..e79f62c 100644 --- a/R/load_data_packages.R +++ b/R/load_data_packages.R @@ -21,15 +21,10 @@ #' } #' load_data_packages <- function(reference_id, - directory = here::here(), + directory = here::here("data"), assign_attributes = FALSE, simplify = TRUE){ - #capture original working directory - orig_wd <- getwd() - #set directory back to original working directory on exit. - on.exit(setwd(orig_wd)) - #set wd to path; defaults to wd. - setwd(directory) + #is user specifies "allData" get all directories from the data folder: if (reference_id == "all_data") { @@ -38,44 +33,176 @@ load_data_packages <- function(reference_id, recursive = FALSE) } + ### if only one data package is specified: ### fix how single data packages are handled later: - if(length( - seq_along( - reference_id)) == 1 - & reference_id != "all_data" - & simplify == TRUE) { - #return a tibble of data files - } - - - for(i in 1:seq_along(reference_id)){ - data_package_directory <- paste("data/", reference_id[i]) - filenames <- list.files( - path = data_package_directory, - pattern = data_format) - ## Create list of data frame names without the ".csv" part - names <- gsub(pattern = "\\.csv$", "", filenames) + if (assign_attributes == TRUE) { + tibble_list <- list() + for (h in 1:length(seq_along(reference_id))) { + + directory <- paste0(directory, "/", reference_id[h]) + #get csv file names: + filenames <- list.files(path = directory, + pattern = "*csv") + ## Create list of data frame names without the ".csv" part + names <- gsub(pattern = "\\.csv$", "", filenames) + + #load metadata: + metadata <- DPchecker::load_metadata(directory = directory) - ### Load all files into tibbles - reference_id[i] <- list() - for (j in names) { - filepath <- file.path(data_package_directory, paste(j, ".csv", sep = "")) - tibble_list[[i]] <- assign(j, - readr::read_csv(filepath, - show_col_types = FALSE)) + ### Load all files into tibbles + tibble <- list() + for (i in 1:length(seq_along(filenames))) { + file_path <- file.path(paste0(directory,"/", filenames[i])) + + #get attributes information from metadata: + # To do: specifically call dataTable by name, not position! 
######### + dataTable <- metadata[["dataset"]][["dataTable"]][[i]] + attribs <- purrr::map_dfr(dataTable[["attributeList"]][["attribute"]], + tibble::as_tibble) + + attribs <- attribs %>% dplyr::mutate(R_data_type = dplyr::case_when( + storageType == "string" ~ "collector_character", + storageType == "date" ~ "collector_date", + storageType == "float" ~ "collector_double")) + + #get column specification as R would guess: + csv_cols <- readr::spec_csv(file_path) + + #set data types based on EML, simple: + for(j in 1:nrow(attribs)) { + class(csv_cols$cols[[j]]) <- attribs$R_data_type[[j]] + } + + #set date/time col type format string: + for(j in 1:nrow(attribs)) { + if("dateTime" %in% names(attribs$measurementScale[j])) { + eml_date <- + attribs$measurementScale[j][["dateTime"]][["formatString"]] + r_date <- QCkit::convert_datetime_format(eml_date) + csv_cols$cols[[j]]$format <- r_date + } + } + #set levels for factor call types: + for (j in 1:nrow(attribs)) { + if("nominal" %in% names(attribs$measurementScale[j])) { + nom <- attribs$measurementScale[j][["nominal"]] + if ("nonNumericDomain" %in% names(nom)) { + nom2 <- nom[["nonNumericDomain"]] + if ("enumeratedDomain" %in% names(nom2)) { + nom3 <- nom2[["enumeratedDomain"]] + if ("codeDefinition" %in% names(nom3)) { + nom4 <- nom3[["codeDefinition"]] + #get factors + factors <- NULL + #handle case where there is only one code definition + if ("code" %in% names(nom4)) { + nom4 <- list(nom4) + } + for (k in 1:length(seq_along(nom4))) { + factors <- append(factors, nom4[[k]][["code"]]) + } + #set column type: + csv_cols$cols[[j]] <- readr::col_factor(factors, + include_na = FALSE, + ordered = FALSE) + } + } + } + } + } + suppressWarnings(tibble_list[[i]] <- + assign(names[i], + readr::read_csv(file_path, + col_types = csv_cols, + show_col_types = FALSE) + ) + ) + names(tibble_list)[i] <- names[i] + } } } - - data_package_filename <- paste0(data_package_directory, "/", reference_id, - ".zip") + return(tibble_list) +} - if (data_format == "csv" & metadata_format == "eml") { - filelist <- utils::unzip(data_package_filename, list = TRUE) - if (assign_attributes == TRUE) { - #assign attributes using metadata via a yet-to-be-built sub-function. 
+ + + + +get_attribute_type <- function(data_filename, + reference_id, + directory = here::here("data") + ){ + + metadata <- DPchecker::load_metadata(directory = paste0(directory, + "/", + reference_id)) + #get dataTable(s): + #if there is only one dataTable, put it in a list for consitency: + if("physical" %in% names(metadata$dataset$dataTable)) { + dataTable <- list(metadata$dataset$dataTable) + } else { + dataTable <- metadata$dataset$dataTable + } + # create a place to put attributes and information + attribute_list <- list() + #find the right dataTable: + for (i in 1:length(seq_along(dataTable))) { + if (dataTable[[i]][["physical"]][["objectName"]] == filename) { + #get attribute names: + attr_names <- unlist(dataTable[[i]])[grepl('attributeName', + names(unlist(dataTable[[i]])), + fixed=T)] + names(attr_names) <- NULL + + #get attribute storage types + attr_type <- unlist(dataTable[[i]])[grepl('storageType', + names(unlist(dataTable[[i]])), + fixed=T)] + names(attr_type) <- NULL + + #turn these into a dataframe: + filename_data <- tibble::as_tibble(data.frame(attr_names, attr_type)) + + + date_format <- unlist(dataTable[[i]])[grepl('formatString', + names(unlist(dataTable[[i]])), + fixed=T)] + names(date_format) <- NULL + + + filename_data2 <- filename_data %>% dplyr::mutate(date_format = dplyr::casewhen(attr_type == "date" ~ x)) + + + + filename_data1 <- filename_data %>% dplyr::mutate(attr_type_abbr = dplyr::case_when( + attr_type == "float" ~ "d", + attr_type == "date" ~ "T", + attr_type == "string" ~ "c" + )) + + + + + + + #add date formats to the dataframe: + #get date formats: + date_format <- unlist(dataTable[[i]])[grepl('formatString', + names(unlist(dataTable[[i]])), + fixed=T)] + names(date_format) <- NULL + + transform(filename_data, format = ifelse( (attr_type == "date"), "Y", "unk")) + + + + + + attribute_list[i] <- assign(attributeNames, + readr::read_csv(file_path, + show_col_types = FALSE)) } - return(fileList) - } else { - print("data/metadata format combination not supported") + } } + From 255def3cb86559b2eaf4f3cc7c66c392be348edf Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Wed, 2 Oct 2024 11:08:12 -0600 Subject: [PATCH 06/10] increment version, add QCkit to remotes, add QCkit, tibble, and purrr to dependencies --- DESCRIPTION | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f0d4e4a..2f09736 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: NPSutils Type: Package Title: Collection of Functions to read and manipulate information from the NPS DataStore -Version: 0.3.1 +Version: 0.3.2 Authors@R: c( person(given = "Robert", family = "Baker", email = "robert_baker@nps.gov", role = c("aut", "cre"), @@ -17,9 +17,10 @@ Description: NPSutils is a collection of functions for interacting with NPS Data License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -Remotes: +Remotes: nationalparkservice/EMLeditor, - nationalparkservice/DPchecker + nationalparkservice/DPchecker, + nationalparkservice/QCkit Imports: EML, sf, @@ -36,9 +37,12 @@ Imports: lifecycle, EMLeditor (>= 0.1.5), DPchecker (>= 0.3.4), + QCkit (>= 0.1.4), here, jsonlite, - cli + cli, + purrr, + tibble RoxygenNote: 7.3.2 Suggests: knitr, From 3508398c63e51d448d706c508aad01323f7a5919 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Wed, 2 Oct 2024 11:08:32 -0600 Subject: [PATCH 07/10] Add information about new load_data_packages functions --- NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md 
index 1f59d6a..f312b22 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,6 @@
-# NPSutils 0.3.2 (in development)
+# NPSutils 0.3.2 "Lost Coast"
+ * Add new functions, `load_data_packages()` and `load_data_package()`, which can load data packages (EML in .xml and data in .csv) similarly to the deprecated `load_data_package_deprecated()` function but also allow the data types in the tibbles loaded to be specified based on the information in the metadata.
+ * Deprecate `load_data_package()` and rename it to `load_data_package_deprecated()`.
  * Update readme to use pak for package installation instead of devtools.
  * Update _pkgdown.yml to use bootstrap 5
  * added helper functions for API requests and user input to facilitate unit testing.

From 8c80ce06a9558a895bf5bdcf82cd8546776ca010 Mon Sep 17 00:00:00 2001
From: Rob Baker
Date: Wed, 2 Oct 2024 11:09:18 -0600
Subject: [PATCH 08/10] changed name of legacy load_data_package to
 load_data_package_deprecated; add deprecation badges and warnings.

---
 R/load_data_package.R | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/R/load_data_package.R b/R/load_data_package.R
index 0f11d5d..5ae4073 100644
--- a/R/load_data_package.R
+++ b/R/load_data_package.R
@@ -1,6 +1,7 @@
 #' Read contents of data package and constructs a list of tibbles based on the data file(s)
 #'
-#' @description \code{load_data_package} reads the data file(s) from a package and loads it into a list of tibbles. Current implementation only supports .csv data files.
+#' `r lifecycle::badge("deprecated")`
+#' @description `load_data_package_deprecated()` reads the data file(s) from a package and loads it into a list of tibbles. Current implementation only supports .csv data files.
 #'
 #' @param reference_id is a 6-7 digit number corresponding to the reference ID of the data package.
 #'
@@ -12,9 +13,13 @@
 #' \dontrun{
 #' load_data_package(2272461)
 #' }
-load_data_package <- function(reference_id) {
+load_data_package_deprecated <- function(reference_id) {
   data_package_directory <- paste("data/", reference_id, sep = "")
   data_package_filename <- paste(data_package_directory, ".zip", sep = "")
+
+  lifecycle::deprecate_warn("0.3.2",
+                            "load_data_package_deprecated()",
+                            "load_data_packages()")
   # Look for the zipped data package and attempt to unzip it. If the zipped file exists but cannot be unzipped, give the user a warning. If neither the unzipped nor zipped data packages exist, suggest the user check their working directory or use getDataPackage() to get the data package.
   if (!file.exists(data_package_directory)) {

From fb7d4b488b748f691981e74bc6042fff2ba21509 Mon Sep 17 00:00:00 2001
From: Rob Baker
Date: Wed, 2 Oct 2024 11:09:42 -0600
Subject: [PATCH 09/10] finish first draft of load_data_packages that can use
 metadata to call data types.

---
 R/load_data_packages.R | 304 +++++++++++++++++++----------------
 1 file changed, 143 insertions(+), 161 deletions(-)

diff --git a/R/load_data_packages.R b/R/load_data_packages.R
index e79f62c..4069c5c 100644
--- a/R/load_data_packages.R
+++ b/R/load_data_packages.R
@@ -1,15 +1,13 @@
-#' Read contents of data package(s) and return a tibble with a tibble for each data file.
-#'
-#' `r lifecycle::badge("experimental")`
+#' Read contents of data package(s) and return a list of tibbles based on the data file(s). Can use metadata to specify data types.
 #'
-#' @description `load_data_packages()` loads one to may data packages and returns a tibble of tibbles where each data package is a tibble and within that each data file is it's own tibble. `load_data_packages()` will only work with .csv data files and EML metadata. `load_data_packages()` can also utilize the metadata to assign attributes to each data column.
+#' @description `load_data_packages()` loads one to many data packages and returns a list. If only one data package is loaded, the list will be a list of tibbles where each tibble is a data (.csv) file from the data package. If multiple data packages are loaded, the list will be a list of lists where each nested list contains a list of tibbles, and each tibble is a data file (.csv). See `simplify` below for details on handling these lists.
 #'
-#' @details currently `load_data_packages()` only supports EML metadata and .csv files. To take advantage of the default settings in load_data_packages, use the default settings in `get_data_package()` or `get_data_packages()`. Archived (.zip) files must be extracted before `load_data_packages()` will work properly. Again, `get_data_package()` or `get_data_packages()` will accomplish this for you.
+#' @details currently `load_data_packages()` only supports EML metadata and .csv files.
 #'
-#' @param reference_id is a list of 6-7 digit numbers corresponding to the DataStore reference ID of the datapackage(s) to load. Alternatively, you can set `reference_id` to "load_all", which will load all the data packages in your /data folder.
-#' @param directory is the location of a folder, 'data' (created during `get_data_packages()`) which contains sub-directories where each sub-directory is the DataStore referenceId of the data package. Again, this file structure is all set up using `get_data_packages()`. Defaults to the current working directory (which is the default location for `get_data_packages()`).
-#' @param assign_attributes Logical. Defaults to FALSE. Data will be loaded using `readr::read_csv()` guessing algorithm for calling column types. If set to TRUE, column types will be set using metadata attributes via the yet-to-be written `load_metadata()` function. `r lifecycle::badge('experimental')`
-#' @param simplify Logical. Defaults to TRUE. If there is only a single data package loaded, the function will return a simple list of tibbles (where each tibble reflects a data file from within the data package). If set to FALSE, the function will return a list that contains a list of tibbles. This structure mirrors the object structure returned if multiple data packages are simultaneously loaded (a list of data packages with each data package containing a list of tibbles where each tibble corresponds to a data file in the given data package).
+#' @param reference_id the immediate directory/directories where your data packages reside. For data packages downloaded from DataStore using `get_data_package()` or `get_data_packages()` default settings, this is the DataStore reference ID for your data package(s). Alternatively, you can set `reference_id` to "`load_all`", which will load all the data packages in the directory specified via `directory` (typically ./data).
+#' @param directory is the location of a folder that contains all of the data packages (where data packages are a folder containing .csv data files and a single .xml EML metadata file).
If these data packages were downloaded from DataStore using the default settings for `get_data_packages`, this folder is "./data" and you can use the default settings for `directory`. +#' @param assign_attributes Logical. Defaults to FALSE. Data will be loaded using `readr::read_csv()` guessing algorithm for calling column types. If you set to `assign_attributes = TRUE`, column types will be set using the data types specified in the metadata. Currently supported data types include string, dateTime, float, double, integer, and categorical (factor in R). This assignment is very stringent: for instance if you did not specify date-time formats using ISO-8601 notation (i.e. "YYYY", not "yyyy"), your data will import as NAs. If you have undefined missing values or blank cells, your data will not import at all. If you run into problems consider using the default settings and letting `read_csv` guess the column types. +#' @param simplify Logical. Defaults to TRUE. If `simplify = TRUE`, the function will return a list of tibbles where each tibble is a data file from the data package(s) specified. The tibbles are named using the following format: "pkg_% dplyr::mutate(R_data_type = dplyr::case_when( - storageType == "string" ~ "collector_character", - storageType == "date" ~ "collector_date", - storageType == "float" ~ "collector_double")) - - #get column specification as R would guess: - csv_cols <- readr::spec_csv(file_path) - - #set data types based on EML, simple: - for(j in 1:nrow(attribs)) { - class(csv_cols$cols[[j]]) <- attribs$R_data_type[[j]] - } + #To do: handle case when only one data file in the data package! + if (assign_attributes == TRUE) { + #load metadata: + metadata <- DPchecker::load_metadata(directory = directory1) + # when there is only one dataTable: + if ("physical" %in% names(metadata$dataset$dataTable)) { + dataTable <- metadata[["dataset"]][["dataTable"]] + } else { + for (j in 1:length(seq_along(metadata$dataset$dataTable))) { + if (filenames[i] %in% + metadata$dataset$dataTable[[j]]$physical$objectName) { + dataTable <- metadata[["dataset"]][["dataTable"]][[j]] + } + } + } + #turn the metadata into a useable tibble + attribs <- purrr::map_dfr(dataTable[["attributeList"]][["attribute"]], + tibble::as_tibble) + #map_dfr started double counting rows; fix it if it happens: + attribs <- attribs %>% dplyr::distinct(attributeName, + .keep_all = TRUE) + + attribs <- attribs %>% dplyr::mutate(R_data_type = dplyr::case_when( + storageType == "string" ~ "collector_character", + storageType == "date" ~ "collector_date", + storageType == "float" ~ "collector_double", + storageType == "double" ~ "collector_double", + storageType == "integer" ~ "collector_integer")) + + #get column specification as R would guess: + csv_cols <- readr::spec_csv(file_path) + + #set data types based on EML, simple: + for(j in 1:nrow(attribs)) { + class(csv_cols$cols[[j]]) <- attribs$R_data_type[[j]] + } - #set date/time col type format string: - for(j in 1:nrow(attribs)) { - if("dateTime" %in% names(attribs$measurementScale[j])) { - eml_date <- - attribs$measurementScale[j][["dateTime"]][["formatString"]] - r_date <- QCkit::convert_datetime_format(eml_date) - csv_cols$cols[[j]]$format <- r_date + #set date/time col type format string: + for(j in 1:nrow(attribs)) { + if("dateTime" %in% names(attribs$measurementScale[j])) { + eml_date <- + attribs$measurementScale[j][["dateTime"]][["formatString"]] + r_date <- QCkit::convert_datetime_format(eml_date) + csv_cols$cols[[j]]$format <- r_date + } } - } - #set 
levels for factor call types: - for (j in 1:nrow(attribs)) { - if("nominal" %in% names(attribs$measurementScale[j])) { - nom <- attribs$measurementScale[j][["nominal"]] - if ("nonNumericDomain" %in% names(nom)) { - nom2 <- nom[["nonNumericDomain"]] - if ("enumeratedDomain" %in% names(nom2)) { - nom3 <- nom2[["enumeratedDomain"]] - if ("codeDefinition" %in% names(nom3)) { - nom4 <- nom3[["codeDefinition"]] - #get factors - factors <- NULL - #handle case where there is only one code definition - if ("code" %in% names(nom4)) { - nom4 <- list(nom4) - } - for (k in 1:length(seq_along(nom4))) { - factors <- append(factors, nom4[[k]][["code"]]) + #set levels for factor call types: + for (j in 1:nrow(attribs)) { + if("nominal" %in% names(attribs$measurementScale[j])) { + nom <- attribs$measurementScale[j][["nominal"]] + if ("nonNumericDomain" %in% names(nom)) { + nom2 <- nom[["nonNumericDomain"]] + if ("enumeratedDomain" %in% names(nom2)) { + nom3 <- nom2[["enumeratedDomain"]] + if ("codeDefinition" %in% names(nom3)) { + nom4 <- nom3[["codeDefinition"]] + #get factors + factors <- NULL + #handle case where there is only one code definition + if ("code" %in% names(nom4)) { + nom4 <- list(nom4) + } + for (k in 1:length(seq_along(nom4))) { + factors <- append(factors, nom4[[k]][["code"]]) + } + #set column type: + csv_cols$cols[[j]] <- readr::col_factor(factors, + include_na = FALSE, + ordered = FALSE) } - #set column type: - csv_cols$cols[[j]] <- readr::col_factor(factors, - include_na = FALSE, - ordered = FALSE) } } } } - } - suppressWarnings(tibble_list[[i]] <- + suppressWarnings(package_data[[i]] <- + assign(names[i], + readr::read_csv(file_path, + col_types = csv_cols, + show_col_types = FALSE) + ) + ) + names(package_data)[i] <- names[i] + } else { + # Do not call attributes: + suppressWarnings(package_data[[i]] <- assign(names[i], readr::read_csv(file_path, - col_types = csv_cols, show_col_types = FALSE) - ) + ) ) - names(tibble_list)[i] <- names[i] + names(package_data)[i] <- names[i] } } + tibble_list[[h]] <- package_data + names(tibble_list)[[h]] <- paste0("pkg_", reference_id[h]) + } + #put all the tibbles in a single list that is not nested + #(simplifies subsequent extraction) + if (simplify == TRUE) { + tibble_list <- extract_tbl(tibble_list) } return(tibble_list) -} - - - - +} -get_attribute_type <- function(data_filename, - reference_id, - directory = here::here("data") - ){ +#' @export +#' @rdname load_data_packages +load_data_package <- function(reference_id, + directory = here::here("data"), + assign_attributes = FALSE, + simplify = TRUE) { - metadata <- DPchecker::load_metadata(directory = paste0(directory, - "/", - reference_id)) - #get dataTable(s): - #if there is only one dataTable, put it in a list for consitency: - if("physical" %in% names(metadata$dataset$dataTable)) { - dataTable <- list(metadata$dataset$dataTable) - } else { - dataTable <- metadata$dataset$dataTable - } - # create a place to put attributes and information - attribute_list <- list() - #find the right dataTable: - for (i in 1:length(seq_along(dataTable))) { - if (dataTable[[i]][["physical"]][["objectName"]] == filename) { - #get attribute names: - attr_names <- unlist(dataTable[[i]])[grepl('attributeName', - names(unlist(dataTable[[i]])), - fixed=T)] - names(attr_names) <- NULL - - #get attribute storage types - attr_type <- unlist(dataTable[[i]])[grepl('storageType', - names(unlist(dataTable[[i]])), - fixed=T)] - names(attr_type) <- NULL - - #turn these into a dataframe: - filename_data <- 
tibble::as_tibble(data.frame(attr_names, attr_type)) - - - date_format <- unlist(dataTable[[i]])[grepl('formatString', - names(unlist(dataTable[[i]])), - fixed=T)] - names(date_format) <- NULL - - - filename_data2 <- filename_data %>% dplyr::mutate(date_format = dplyr::casewhen(attr_type == "date" ~ x)) - - - - filename_data1 <- filename_data %>% dplyr::mutate(attr_type_abbr = dplyr::case_when( - attr_type == "float" ~ "d", - attr_type == "date" ~ "T", - attr_type == "string" ~ "c" - )) - - - - - - - #add date formats to the dataframe: - #get date formats: - date_format <- unlist(dataTable[[i]])[grepl('formatString', - names(unlist(dataTable[[i]])), - fixed=T)] - names(date_format) <- NULL - - transform(filename_data, format = ifelse( (attr_type == "date"), "Y", "unk")) - - - - - - attribute_list[i] <- assign(attributeNames, - readr::read_csv(file_path, - show_col_types = FALSE)) - } - - } + x <- load_data_packages(reference_id, + directory = here::here("data"), + assign_attributes = FALSE, + simplify = TRUE) + return(x) } +#' extract nested tibbles +#' +#' Adapted from stack overflow find_df function found at: +#' https://stackoverflow.com/questions/70512869/extract-data-frames-from-nested-list +#' And accessed on 2024-10-02 +#' +#' @param x a (potentially deeply) nested list containing at least one tibble +#' +#' @return a list where each item in the list is a tibble found in the nested list `x` +#' @keywords Internal +#' @noRd +#' +#' @examples +#' \dontrun{ +#' z <- .extract_tbl(x) +#' } +extract_tbl <- function(x) { + if (is_tibble(x)) + return(list(x)) + if (!is.list(x)) + return(NULL) + unlist(lapply(x, extract_tbl), FALSE) +} \ No newline at end of file From 77ac48dffe80dfddf4788f1e4c1f9181d660e019 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Wed, 2 Oct 2024 11:10:13 -0600 Subject: [PATCH 10/10] updated via devtools::document and pkgdown::build_site_github_pages --- NAMESPACE | 1 + docs/404.html | 2 +- docs/LICENSE-text.html | 2 +- docs/LICENSE.html | 2 +- docs/articles/NPSutils.html | 2 +- docs/articles/index.html | 2 +- docs/authors.html | 6 +- docs/index.html | 2 +- docs/news/index.html | 8 +- docs/pkgdown.yml | 2 +- docs/reference/NPSutils-package.html | 2 +- docs/reference/check_is_data_package.html | 2 +- docs/reference/check_new_version.html | 2 +- docs/reference/check_ref_exists.html | 2 +- docs/reference/get_data_packages.html | 2 +- docs/reference/get_new_version_id.html | 2 +- docs/reference/get_park_code.html | 2 +- docs/reference/get_park_taxon_citations.html | 2 +- docs/reference/get_park_taxon_refs.html | 2 +- docs/reference/get_park_taxon_url.html | 2 +- docs/reference/get_ref_info.html | 2 +- docs/reference/get_unit_code.html | 2 +- docs/reference/get_unit_code_info.html | 2 +- docs/reference/get_unit_info.html | 2 +- docs/reference/index.html | 10 +- docs/reference/load_data_package.html | 105 ------------------ docs/reference/load_data_packages.html | 28 +++-- docs/reference/load_domains.html | 2 +- docs/reference/load_pkg_metadata.html | 2 +- docs/reference/map_wkt.html | 2 +- docs/reference/rm_local_packages.html | 2 +- docs/reference/validate_data_package.html | 2 +- docs/sitemap.xml | 2 +- ...age.Rd => load_data_package_deprecated.Rd} | 11 +- man/load_data_packages.Rd | 26 +++-- 35 files changed, 81 insertions(+), 168 deletions(-) delete mode 100644 docs/reference/load_data_package.html rename man/{load_data_package.Rd => load_data_package_deprecated.Rd} (59%) diff --git a/NAMESPACE b/NAMESPACE index a2e51f2..683736c 100644 --- a/NAMESPACE +++ b/NAMESPACE 
@@ -16,6 +16,7 @@ export(get_unit_code_info) export(get_unit_info) export(load_core_metadata) export(load_data_package) +export(load_data_package_deprecated) export(load_data_packages) export(load_domains) export(load_pkg_metadata) diff --git a/docs/404.html b/docs/404.html index dc40cd6..3949753 100644 --- a/docs/404.html +++ b/docs/404.html @@ -32,7 +32,7 @@ NPSutils - 0.3.1 + 0.3.2

diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index e0c578d..3db8c14 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/LICENSE.html b/docs/LICENSE.html index 4e75229..1ad2900 100644 --- a/docs/LICENSE.html +++ b/docs/LICENSE.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/articles/NPSutils.html b/docs/articles/NPSutils.html index 7bbf9e3..6d63125 100644 --- a/docs/articles/NPSutils.html +++ b/docs/articles/NPSutils.html @@ -32,7 +32,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/articles/index.html b/docs/articles/index.html index 7587471..42a4eb9 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/authors.html b/docs/authors.html index 2402ff8..b3dad51 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 @@ -74,13 +74,13 @@

Citation

Baker R, DeVivo J, Patterson J (2024). NPSutils: Collection of Functions to read and manipulate information from the NPS DataStore. -R package version 0.3.1, https://nationalparkservice.github.io/NPSutils/. +R package version 0.3.2, https://nationalparkservice.github.io/NPSutils/.

@Manual{,
   title = {NPSutils: Collection of Functions to read and manipulate information from the NPS DataStore},
   author = {Robert Baker and Joe DeVivo and Judd Patterson},
   year = {2024},
-  note = {R package version 0.3.1},
+  note = {R package version 0.3.2},
   url = {https://nationalparkservice.github.io/NPSutils/},
 }
diff --git a/docs/index.html b/docs/index.html index 4d69894..35b3854 100644 --- a/docs/index.html +++ b/docs/index.html @@ -33,7 +33,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/news/index.html b/docs/news/index.html index ff54202..3d53506 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 @@ -51,8 +51,10 @@

Changelog

- -
  • Update readme to use pak for package installation instead of devtools.
  • + +
    • Add new functions, load_data_packages() and load_data_package(), which can load data packages (EML in .xml and data in .csv) similarly to the deprecated load_data_package_deprecated() function but also allow the data types in the tibbles loaded to be specified based on the information in the metadata.
    • +
    • Deprecate load_data_package() and rename it to load_data_package_deprecated().
    • +
    • Update readme to use pak for package installation instead of devtools.
    • Update _pkgdown.yml to use bootstrap 5
    • added helper functions for API requests and user input to facilitate unit testing.
    • refactored get_data_packages() to take advantage of new helper functions.
    • diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 53d583e..a8e436a 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -3,4 +3,4 @@ pkgdown: 2.1.0 pkgdown_sha: ~ articles: NPSutils: NPSutils.html -last_built: 2024-09-19T22:29Z +last_built: 2024-10-02T17:03Z diff --git a/docs/reference/NPSutils-package.html b/docs/reference/NPSutils-package.html index 480e93e..da4d40b 100644 --- a/docs/reference/NPSutils-package.html +++ b/docs/reference/NPSutils-package.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2
diff --git a/docs/reference/check_is_data_package.html b/docs/reference/check_is_data_package.html index 1bc4bf8..e3c199e 100644 --- a/docs/reference/check_is_data_package.html +++ b/docs/reference/check_is_data_package.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/check_new_version.html b/docs/reference/check_new_version.html index b27c1b0..610ff5a 100644 --- a/docs/reference/check_new_version.html +++ b/docs/reference/check_new_version.html @@ -18,7 +18,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/check_ref_exists.html b/docs/reference/check_ref_exists.html index e3f8daf..7d0d038 100644 --- a/docs/reference/check_ref_exists.html +++ b/docs/reference/check_ref_exists.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/get_data_packages.html b/docs/reference/get_data_packages.html index a2f6384..f02241d 100644 --- a/docs/reference/get_data_packages.html +++ b/docs/reference/get_data_packages.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/get_new_version_id.html b/docs/reference/get_new_version_id.html index e4c44eb..7c70310 100644 --- a/docs/reference/get_new_version_id.html +++ b/docs/reference/get_new_version_id.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/get_park_code.html b/docs/reference/get_park_code.html index 12f797f..048af29 100644 --- a/docs/reference/get_park_code.html +++ b/docs/reference/get_park_code.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/get_park_taxon_citations.html b/docs/reference/get_park_taxon_citations.html index 3a7a1e8..c29de14 100644 --- a/docs/reference/get_park_taxon_citations.html +++ b/docs/reference/get_park_taxon_citations.html @@ -19,7 +19,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/get_park_taxon_refs.html b/docs/reference/get_park_taxon_refs.html index ea0426e..63575cd 100644 --- a/docs/reference/get_park_taxon_refs.html +++ b/docs/reference/get_park_taxon_refs.html @@ -21,7 +21,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/get_park_taxon_url.html b/docs/reference/get_park_taxon_url.html index 5db45c9..a714250 100644 --- a/docs/reference/get_park_taxon_url.html +++ b/docs/reference/get_park_taxon_url.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/get_ref_info.html b/docs/reference/get_ref_info.html index af6ad7b..4fcdb79 100644 --- a/docs/reference/get_ref_info.html +++ b/docs/reference/get_ref_info.html @@ -18,7 +18,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/get_unit_code.html b/docs/reference/get_unit_code.html index 38b2279..7f00270 100644 --- a/docs/reference/get_unit_code.html +++ b/docs/reference/get_unit_code.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/get_unit_code_info.html b/docs/reference/get_unit_code_info.html index dc325e7..2d75bfe 100644 --- a/docs/reference/get_unit_code_info.html +++ b/docs/reference/get_unit_code_info.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/get_unit_info.html b/docs/reference/get_unit_info.html index 71f2333..424e864 100644 --- a/docs/reference/get_unit_info.html +++ b/docs/reference/get_unit_info.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/index.html b/docs/reference/index.html index 183fbbc..a7e1335 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 @@ -110,13 +110,13 @@

All functions

Gets common EML metadata elements and puts them in a dataframe

-

load_data_package()

+

load_data_packages() load_data_package()

-

Read contents of data package and constructs a list of tibbles based on the data file(s)

+

Read contents of data package(s) and return a list of tibbles based on the data file(s). Can use metadata to specify data types.

-

load_data_packages()

+

load_data_package_deprecated()

-

Read contents of data package(s) and return a tibble with a tibble for each data file.

+

Read contents of data package and constructs a list of tibbles based on the data file(s)

load_domains()

diff --git a/docs/reference/load_data_package.html b/docs/reference/load_data_package.html deleted file mode 100644 index 923a297..0000000 --- a/docs/reference/load_data_package.html +++ /dev/null @@ -1,105 +0,0 @@ - -Read contents of data package and constructs a list of tibbles based on the data file(s) — load_data_package • NPSutils - - -
-Description: load_data_package reads the data file(s) from a package and loads it into a list of tibbles. Current implementation only supports .csv data files.
-Usage: load_data_package(reference_id)
-Arguments: reference_id — is a 6-7 digit number corresponding to the reference ID of the data package.
-Value: a list of one or more tibbles contained within the data package to the global environment.
-Examples: load_data_package(2272461)
-[pkgdown page navigation and footer omitted]

-
- -
diff --git a/docs/reference/load_data_packages.html b/docs/reference/load_data_packages.html index 1e7afae..d9e69d5 100644 --- a/docs/reference/load_data_packages.html +++ b/docs/reference/load_data_packages.html @@ -1,5 +1,5 @@
-Read contents of data package(s) and return a tibble with a tibble for each data file. — load_data_packages • NPSutils
+Read contents of data package(s) and return a list of tibbles based on the data file(s). Can use metadata to specify data types. — load_data_packages • NPSutils
-

`load_data_packages()` loads one to may data packages and returns a tibble of tibbles where each data package is a tibble and within that each data file is it's own tibble. `load_data_packages()` will only work with .csv data files and EML metadata. `load_data_packages()` can also utilize the metadata to assign attributes to each data column.

+

`load_data_packages()` loads one to many data packages and returns a list. If only one data package is loaded, the list will be a list of tibbles where each tibble is a data (.csv) file from the data package. If multiple data packages are loaded, the list will be a list of lists where each nested list contains a list of tibbles, and each tibble is a data file (.csv). See `simplify` below for details on handling these lists.

load_data_packages(
   reference_id,
-  directory = here::here(),
+  directory = here::here("data"),
+  assign_attributes = FALSE,
+  simplify = TRUE
+)
+
+load_data_package(
+  reference_id,
+  directory = here::here("data"),
   assign_attributes = FALSE,
   simplify = TRUE
 )
@@ -69,19 +76,19 @@

Arguments

reference_id
-

is a list of 6-7 digit numbers corresponding to the DataStore reference ID of the datapackage(s) to load. Alternatively, you can set `reference_id` to "load_all", which will load all the data packages in your /data folder.

+

the immediate directory/directories where your data packages reside. For data packages downloaded from DataStore using `get_data_package()` or `get_data_packages()` default settings, this is the DataStore reference ID for your data package(s). Alternatively, you can set `reference_id` to "`load_all`", which will load all the data packages in the directory specified via `directory` (typically ./data).

directory
-

is the location of a folder, 'data' (created during `get_data_packages()`) which contains sub-directories where each sub-directory is the DataStore referenceId of the data package. Again, this file structure is all set up using `get_data_packages()`. Defaults to the current working directory (which is the default location for `get_data_packages()`).

+

is the location of a folder that contains all of the data packages (where data packages are a folder containing .csv data files and a single .xml EML metadata file). If these data packages were downloaded from DataStore using the default settings for `get_data_packages`, this folder is "./data" and you can use the default settings for `directory`.

assign_attributes
-

Logical. Defaults to FALSE. Data will be loaded using `readr::read_csv()` guessing algorithm for calling column types. If set to TRUE, column types will be set using metadata attributes via the yet-to-be written `load_metadata()` function. `r lifecycle::badge('experimental')`

+

Logical. Defaults to FALSE. Data will be loaded using the `readr::read_csv()` guessing algorithm for calling column types. If you set `assign_attributes = TRUE`, column types will be set using the data types specified in the metadata. Currently supported data types include string, dateTime, float, double, integer, and categorical (factor in R). This assignment is very stringent: for instance, if you did not specify date-time formats using ISO-8601 notation (i.e. "YYYY", not "yyyy"), your data will import as NAs. If you have undefined missing values or blank cells, your data will not import at all. If you run into problems, consider using the default settings and letting `read_csv` guess the column types.

simplify
-

Logical. Defaults to TRUE. If there is only a single data package loaded, the function will return a simple list of tibbles (where each tibble reflects a data file from within the data package). If set to FALSE, the function will return a list that contains a list of tibbles. This structure mirrors the object structure returned if multiple data packages are simultaneously loaded (a list of data packages with each data package containing a list of tibbles where each tibble corresponds to a data file in the given data package).

+

Logical. Defaults to TRUE. If `simplify = TRUE`, the function will return a list of tibbles where each tibble is a data file from the data package(s) specified. The tibbles are named using the following format: "pkg_<reference_id>.<filename>" (without the filename extension). If you want to load each individual data file into R for further processing, use `simplify = TRUE` and then run `list2env(x, envir = .GlobalEnv)`. If you set `simplify = FALSE`, the object returned will either be a list of tibbles identical to that returned by `simplify = TRUE` (if only one data package is loaded) or will be a list of lists where each nested list contains one tibble for each data file in each data package. Setting `simplify = FALSE` may make it easier to do post-processing on a package-by-package level rather than a tibble-by-tibble level.
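The `assign_attributes` behavior described above boils down to overriding readr's guessed column specification with EML-derived types, as the R/load_data_packages.R diff in PATCH 09 does. A minimal standalone sketch of that mechanism (the file name, date format, and factor levels are hypothetical):

    spec <- readr::spec_csv("mydata.csv")                          # readr's guessed column spec
    class(spec$cols[[1]]) <- "collector_character"                 # EML storageType "string"
    spec$cols[[2]] <- readr::col_date(format = "%Y-%m-%d")         # EML dateTime formatString "YYYY-MM-DD"
    spec$cols[[3]] <- readr::col_factor(levels = c("low", "high")) # EML enumeratedDomain codes
    df <- readr::read_csv("mydata.csv", col_types = spec)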

@@ -90,8 +97,7 @@

Value

Details

-

`r lifecycle::badge("experimental")`

-

currently `load_data_packages()` only supports EML metadata and .csv files. To take advantage of the default settings in load_data_packages, use the default settings in `get_data_package()` or `get_data_packages()`. Archived (.zip) files must be extracted before `load_data_packages()` will work properly. Again, `get_data_package()` or `get_data_packages()` will accomplish this for you. +

currently `load_data_packages()` only supports EML metadata and .csv files.
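A hedged end-to-end sketch tying the arguments above together (2272461 is the reference ID used elsewhere in these docs; output names follow the documented "pkg_<reference_id>.<filename>" convention):

    get_data_packages(2272461)                 # download and unzip into ./data/2272461
    pkgs <- load_data_packages(2272461,
                               assign_attributes = TRUE,
                               simplify = TRUE)
    names(pkgs)                                # e.g. "pkg_2272461.mydatafile"
    list2env(pkgs, envir = .GlobalEnv)         # optional: one object per tibble, as suggested above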

diff --git a/docs/reference/load_domains.html b/docs/reference/load_domains.html index 91b6a08..ef92238 100644 --- a/docs/reference/load_domains.html +++ b/docs/reference/load_domains.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2
diff --git a/docs/reference/load_pkg_metadata.html b/docs/reference/load_pkg_metadata.html index 421cc7a..d199c11 100644 --- a/docs/reference/load_pkg_metadata.html +++ b/docs/reference/load_pkg_metadata.html @@ -19,7 +19,7 @@ NPSutils - 0.3.1 + 0.3.2
diff --git a/docs/reference/map_wkt.html b/docs/reference/map_wkt.html index 22a6292..70771c7 100644 --- a/docs/reference/map_wkt.html +++ b/docs/reference/map_wkt.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/rm_local_packages.html b/docs/reference/rm_local_packages.html index db5ee0a..ed0de13 100644 --- a/docs/reference/rm_local_packages.html +++ b/docs/reference/rm_local_packages.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/reference/validate_data_package.html b/docs/reference/validate_data_package.html index 314bd5b..84fe751 100644 --- a/docs/reference/validate_data_package.html +++ b/docs/reference/validate_data_package.html @@ -17,7 +17,7 @@ NPSutils - 0.3.1 + 0.3.2 diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 0911b3e..80cfae3 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -22,8 +22,8 @@ /reference/get_unit_info.html /reference/index.html /reference/load_core_metadata.html -/reference/load_data_package.html /reference/load_data_packages.html +/reference/load_data_package_deprecated.html /reference/load_domains.html /reference/load_pkg_metadata.html /reference/map_wkt.html diff --git a/man/load_data_package.Rd b/man/load_data_package_deprecated.Rd similarity index 59% rename from man/load_data_package.Rd rename to man/load_data_package_deprecated.Rd index ff5ba7b..55fefad 100644 --- a/man/load_data_package.Rd +++ b/man/load_data_package_deprecated.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/load_data_package.R -\name{load_data_package} -\alias{load_data_package} +\name{load_data_package_deprecated} +\alias{load_data_package_deprecated} \title{Read contents of data package and constructs a list of tibbles based on the data file(s)} \usage{ -load_data_package(reference_id) +load_data_package_deprecated(reference_id) } \arguments{ \item{reference_id}{is a 6-7 digit number corresponding to the reference ID of the data package.} @@ -13,7 +13,10 @@ load_data_package(reference_id) a list of one or more tibbles contained within the data package to the global environment. } \description{ -\code{load_data_package} reads the data file(s) from a package and loads it into a list of tibbles. Current implementation only supports .csv data files. +`load_data_package_deprecated()` reads the data file(s) from a package and loads it into a list of tibbles. Current implementation only supports .csv data files. +} +\details{ +`r lifecycle::badge("deprecated")` } \examples{ \dontrun{ diff --git a/man/load_data_packages.Rd b/man/load_data_packages.Rd index 5e2e58e..6fd978c 100644 --- a/man/load_data_packages.Rd +++ b/man/load_data_packages.Rd @@ -2,34 +2,40 @@ % Please edit documentation in R/load_data_packages.R \name{load_data_packages} \alias{load_data_packages} -\title{Read contents of data package(s) and return a tibble with a tibble for each data file.} +\alias{load_data_package} +\title{Read contents of data package(s) and return a list of tibbles list of tibbles based on the data file(s). Can use metadata to specify data types.} \usage{ load_data_packages( reference_id, - directory = here::here(), + directory = here::here("data"), + assign_attributes = FALSE, + simplify = TRUE +) + +load_data_package( + reference_id, + directory = here::here("data"), assign_attributes = FALSE, simplify = TRUE ) } \arguments{ -\item{reference_id}{is a list of 6-7 digit numbers corresponding to the DataStore reference ID of the datapackage(s) to load. 
Alternatively, you can set `reference_id` to "load_all", which will load all the data packages in your /data folder.} +\item{reference_id}{the immediate directory/directories where your data packages reside. For data packages downloaded from DataStore using `get_data_package()` or `get_data_packages()` default settings, this is the DataStore reference ID for your data package(s). Alternatively, you can set `reference_id` to "`load_all`", which will load all the data packages in the directory specified in via `directory` (typically ./data).} -\item{directory}{is the location of a folder, 'data' (created during `get_data_packages()`) which contains sub-directories where each sub-directory is the DataStore referenceId of the data package. Again, this file structure is all set up using `get_data_packages()`. Defaults to the current working directory (which is the default location for `get_data_packages()`).} +\item{directory}{is the location of a folder that contains all of the data packages (where data packages are a folder containing .csv data files and a single .xml EML metadata file). If these data packages were downloaded from DataStore using the default settings for `get_data_packages`, this folder is "./data" and you can use the default settings for `directory`.} -\item{assign_attributes}{Logical. Defaults to FALSE. Data will be loaded using `readr::read_csv()` guessing algorithm for calling column types. If set to TRUE, column types will be set using metadata attributes via the yet-to-be written `load_metadata()` function. `r lifecycle::badge('experimental')`} +\item{assign_attributes}{Logical. Defaults to FALSE. Data will be loaded using `readr::read_csv()` guessing algorithm for calling column types. If you set to `assign_attributes = TRUE`, column types will be set using the data types specified in the metadata. Currently supported data types include string, dateTime, float, double, integer, and categorical (factor in R). This assignment is very stringent: for instance if you did not specify date-time formats using ISO-8601 notation (i.e. "YYYY", not "yyyy"), your data will import as NAs. If you have undefined missing values or blank cells, your data will not import at all. If you run into problems consider using the default settings and letting `read_csv` guess the column types.} -\item{simplify}{Logical. Defaults to TRUE. If there is only a single data package loaded, the function will return a simple list of tibbles (where each tibble reflects a data file from within the data package). If set to FALSE, the function will return a list that contains a list of tibbles. This structure mirrors the object structure returned if multiple data packages are simultaneously loaded (a list of data packages with each data package containing a list of tibbles where each tibble corresponds to a data file in the given data package).} +\item{simplify}{Logical. Defaults to TRUE. If `simplify = TRUE`, the function will return a list of tibbles where each tibble is a data file from the data package(s) specified. The tibbles are named using the following format: "pkg_