calculate_segregation_metrics updates

UI-Research · Jan 31, 2024 · 7084de6 · 7084de6
1 parent c930784
commit 7084de6
Show file tree

Hide file tree

Showing 3 changed files with 78 additions and 51 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: urbnindicators
 Type: Package
 Title: Out of the Box Social Science Indicators from the American Community Survey (ACS)
-Version: 0.0.001
+Version: 0.0.0.9001
 Authors@R: person("Will", "Curran-Groome", email = "[email protected]",
                   role = c("aut", "cre"))
 Description: There are many packages available that facilitate queries to the Census

diff --git a/R/calculate_segregation_metrics.R b/R/calculate_segregation_metrics.R
@@ -3,19 +3,23 @@
 #' @details Given data at a smaller geography (e.g., tract), `calculate_segregation_metrics()`
 #'    returns Mutual Information Index (M) values and associated p-values for a perfectly
 #'    nested larger geography (e.g., a county or state) as well as decomposed values for the
-#'    smaller geography (e.g., tract). Notethat all segregation calculations rely on `segregation`
+#'    smaller geography (e.g., tract). Note that all segregation calculations rely on `segregation`
 #'    and users should refer to that package at https://github.com/elbersb/segregation for
 #'    further implementation details.
 #' @param data A dataframe containing a `GEOID` column and the required input measures,
-#'    e.g., of race or income, at a single geography (e.g., tract), formatted wide.
-#'    the GEOID column must be a character column, and each GEOID must be unique.
-#'    inputted data cannot contain other measures. for example: tibble::tribble(
+#'    e.g., of race or income, at a single geography (e.g., tract). The `GEOID` column must
+#'    be a character column, and each `GEOID` must be unique. If data are formatted wide, there must be
+#'    at least two columns in addition to `GEOID`. If data are formatted long, there must be a single
+#'    column in addition to `GEOID`. Inputted data cannot contain other measures. For example:
+#'    tibble::tribble(
 #'        ~GEOID, ~race_nonhispanic_white_alone, ~race_nonhispanic_black_alone,
 #'        "37001020100", 2835, 1035,
 #'        "37001020200", 1205, 1321)
-#' @param nesting_geography_geoid_length The length of the GEOID that identifies nesting
+#' @param data_format Describes the structure of the inputted data. One of "wide" or "long". Data are
+#'    returned in the same format in which they are passed to the function.
+#' @param nesting_geography_geoid_length The length of the `GEOID` that identifies nesting
 #'    geographies. For example, if `data` is defined at the tract
-#'    level (with a GEOID of length 11), then nesting_geography_geoid_length = 5 would
+#'    level (with a GEOID of length 11), then `nesting_geography_geoid_length = 5` would
 #'    return segregation metrics for counties (which have a GEOID of length 5) and
 #'    for tracts (relative to other tracts within the same county).
 #' @seealso Functions used for underlying segregation calculations are from the `segregation` package.
@@ -28,48 +32,46 @@
 #'   race_nonhispanic_native_alone_ = "B03002_005",
 #'   race_nonhispanic_asian_alone_ = "B03002_006",
 #'   race_nonhispanic_nhpi_alone_ = "B03002_007")
-#'
-#' test_data = tidycensus::get_acs(
+#' df_long = tidycensus::get_acs(
 #'   geography = "tract",
 #'   state = "SC",
 #'   variables = variables,
-#'   output = "wide") %>%
+#'   output = "tidy") %>%
 #'   # can only include a GEOID column and segregation-related measures
 #'   dplyr::select(-c(NAME, matches("_M$")))
-#'
-#' calculate_segregation_metrics(data = test_data, nesting_geography_geoid_length = 5)
+#' calculate_segregation_metrics(
+#'   data = df_long,
+#'   data_format = "long",
+#'   nesting_geography_geoid_length = 5)
 #' @export
 #' @importFrom magrittr %>%
-calculate_segregation_metrics = function(data, nesting_geography_geoid_length) {
+calculate_segregation_metrics = function(data, data_format, nesting_geography_geoid_length) {
+
+  ## There are only two formal options for this parameter; "tidy" is also accepted as a synonym for "long"
+  stopifnot(data_format %in% c("wide", "long", "tidy"))
+
+  if (data_format == "tidy") {
+    warning("data_format == `tidy` is not a formal argument option for the data_format parameter and has been translated to data_format == `long`.")
+    data_format = "long" }
 
   ## Provided data must contain a GEOID column.
   stopifnot("GEOID" %in% colnames(data))
 
-  ## All GEOIDs must be the same length
-  stopifnot(
-    data %>%
-      dplyr::mutate(geoid_length = nchar(GEOID)) %>%
-      dplyr::pull(geoid_length) %>%
-      unique %>% length == 1)
-
   geoid_length = data %>% dplyr::pull(GEOID) %>% .[1] %>% nchar
 
-  ## All GEOIDs must be unique
-  stopifnot(
-    (data %>%
-      dplyr::pull(GEOID) %>%
-      unique %>%
-      length) == data %>% nrow)
-
   ## The nesting geography GEOID must be shorter than the provided GEOID
   stopifnot(nesting_geography_geoid_length < geoid_length)
 
-  df_segregation = data %>%
-    tidyr::pivot_longer(
-      cols = -GEOID,
-      names_to = "variable",
-      values_to = "estimate") %>%
-    dplyr::mutate(nesting_geography_geoid = stringr::str_sub(GEOID, 1, nesting_geography_geoid_length))
+  if (data_format == "wide") {
+    df_segregation = data %>%
+      tidyr::pivot_longer(
+        cols = -GEOID,
+        names_to = "variable",
+        values_to = "estimate") %>%
+      dplyr::mutate(nesting_geography_geoid = stringr::str_sub(GEOID, 1, nesting_geography_geoid_length)) }
+  if (data_format == "long") {
+    df_segregation = data %>%
+      dplyr::mutate(nesting_geography_geoid = stringr::str_sub(GEOID, 1, nesting_geography_geoid_length)) }
 
   segregation_larger = segregation::mutual_within(
       data = df_segregation,
@@ -102,11 +104,12 @@ calculate_segregation_metrics = function(data, nesting_geography_geoid_length) {
 
   small_segregation_results = data %>%
     dplyr::select(GEOID) %>%
+    dplyr::distinct() %>%
     dplyr::left_join(segregation_smaller)
 
   number_error_geographies = small_segregation_results %>%
     dplyr::filter(is.na(segregation_small_geography)) %>%
-    nrow
+    nrow()
 
   if (number_error_geographies > 0) {
     input_data_missingness = data %>%
@@ -119,13 +122,27 @@ The remaining ", number_error_geographies - input_data_missingness, " observatio
 a single smaller geography within the larger geography (e.g., a
 county comprising a single tract).")) }
 
-  segregation_results = small_segregation_results %>%
-    dplyr::mutate(GEOID_larger = stringr::str_sub(GEOID, 1, nesting_geography_geoid_length)) %>%
-    dplyr::left_join(segregation_larger, by = c("GEOID_larger" = "nesting_geography_geoid")) %>%
-    dplyr::select(-GEOID_larger)
+  if (data_format == "long") {
+    segregation_results = dplyr::bind_rows(
+      segregation_larger %>%
+        dplyr::rename(
+          GEOID = nesting_geography_geoid,
+          segregation = segregation_large_geography,
+          segregation_p = p_large_geography),
+      small_segregation_results %>%
+        dplyr::rename(
+          segregation = segregation_small_geography,
+          segregation_p = p_small_geography)) }
+
+  if (data_format == "wide") {
+    segregation_results = small_segregation_results %>%
+      dplyr::mutate(GEOID_larger = stringr::str_sub(GEOID, 1, nesting_geography_geoid_length)) %>%
+      dplyr::left_join(segregation_larger, by = c("GEOID_larger" = "nesting_geography_geoid")) %>%
+      dplyr::select(-GEOID_larger) }
 
   return(segregation_results)
 }
 
 utils::globalVariables(c(
-  "nesting_geography_geoid", "H", "p", "segregation_small_geography", "GEOID_larger"))
+  "nesting_geography_geoid", "H", "p", "segregation_small_geography", "GEOID_larger",
+  "segregation_large_geography", "p_large_geography", "p_small_geography"))
diff --git a/man/calculate_segregation_metrics.Rd b/man/calculate_segregation_metrics.Rd