Skip to content

Commit

Permalink
calculate_segregation_metrics updates
Browse files Browse the repository at this point in the history
  • Loading branch information
wcurrangroome committed Jan 31, 2024
1 parent c930784 commit 7084de6
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 51 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: urbnindicators
Type: Package
Title: Out of the Box Social Science Indicators from the American Community Survey (ACS)
Version: 0.0.001
Version: 0.0.0.9001
Authors@R: person("Will", "Curran-Groome", email = "[email protected]",
role = c("aut", "cre"))
Description: There are many packages available that facilitate queries to the Census
Expand Down
93 changes: 55 additions & 38 deletions R/calculate_segregation_metrics.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,23 @@
#' @details Given data at a smaller geography (e.g., tract), `calculate_segregation_metrics()`
#' returns Mutual Information Index (M) values and associated p-values for a perfectly
#' nested larger geography (e.g., a county or state) as well as decomposed values for the
#' smaller geography (e.g., tract). Notethat all segregation calculations rely on `segregation`
#' smaller geography (e.g., tract). Note that all segregation calculations rely on `segregation`
#' and users should refer to that package at https://github.com/elbersb/segregation for
#' further implementation details.
#' @param data A dataframe containing a `GEOID` column and the required input measures,
#' e.g., of race or income, at a single geography (e.g., tract), formatted wide.
#' the GEOID column must be a character column, and each GEOID must be unique.
#' inputted data cannot contain other measures. for example: tibble::tribble(
#' e.g., of race or income, at a single geography (e.g., tract). The `GEOID` column must
#' be a character column, and each `GEOID` must be unique. If data are formatted wide, there must be
#' at least two columns in addition to `GEOID`. If data are formatted long, there must be a single
#' column in addition to `GEOID`. Inputted data cannot contain other measures. For example:
#' tibble::tribble(
#' ~GEOID, ~race_nonhispanic_white_alone, ~race_nonhispanic_black_alone,
#' "37001020100", 2835, 1035,
#' "37001020200", 1205, 1321)
#' @param nesting_geography_geoid_length The length of the GEOID that identifies nesting
#' @param data_format The structure of the inputted data. One of "wide" or "long" ("tidy" is
#' accepted as a synonym for "long", with a warning). Data are returned in the same format in
#' which they are passed to the function.
#' @param nesting_geography_geoid_length The length of the `GEOID` that identifies nesting
#' geographies. For example, if `data` is defined at the tract
#' level (with a GEOID of length 11), then nesting_geography_geoid_length = 5 would
#' level (with a GEOID of length 11), then `nesting_geography_geoid_length = 5` would
#' return segregation metrics for counties (which have a GEOID of length 5) and
#' for tracts (relative to other tracts within the same county).
#' @seealso Functions used for underlying segregation calculations are from the `segregation` package.
Expand All @@ -28,48 +32,46 @@
#' race_nonhispanic_native_alone_ = "B03002_005",
#' race_nonhispanic_asian_alone_ = "B03002_006",
#' race_nonhispanic_nhpi_alone_ = "B03002_007")
#'
#' test_data = tidycensus::get_acs(
#' df_long = tidycensus::get_acs(
#' geography = "tract",
#' state = "SC",
#' variables = variables,
#' output = "wide") %>%
#' output = "tidy") %>%
#' # can only include a GEOID column and segregation-related measures
#' dplyr::select(-c(NAME, matches("_M$")))
#'
#' calculate_segregation_metrics(data = test_data, nesting_geography_geoid_length = 5)
#' calculate_segregation_metrics(
#' data = df_long,
#' data_format = "long",
#' nesting_geography_geoid_length = 5)
#' @export
#' @importFrom magrittr %>%
calculate_segregation_metrics = function(data, nesting_geography_geoid_length) {
calculate_segregation_metrics = function(data, data_format, nesting_geography_geoid_length) {

## There are only two formal options for this parameter; "tidy" is also accepted as a synonym for "long"
stopifnot(data_format %in% c("wide", "long", "tidy"))

if (data_format == "tidy") {
warning("data_format == `tidy` is not a formal argument option for the data_format parameter and has been translated to data_format == `long`.")
data_format = "long" }

## Provided data must contain a GEOID column.
stopifnot("GEOID" %in% colnames(data))

## All GEOIDs must be the same length
stopifnot(
data %>%
dplyr::mutate(geoid_length = nchar(GEOID)) %>%
dplyr::pull(geoid_length) %>%
unique %>% length == 1)

geoid_length = data %>% dplyr::pull(GEOID) %>% .[1] %>% nchar

## All GEOIDs must be unique
stopifnot(
(data %>%
dplyr::pull(GEOID) %>%
unique %>%
length) == data %>% nrow)

## The nesting geography GEOID must be shorter than the provided GEOID
stopifnot(nesting_geography_geoid_length < geoid_length)

df_segregation = data %>%
tidyr::pivot_longer(
cols = -GEOID,
names_to = "variable",
values_to = "estimate") %>%
dplyr::mutate(nesting_geography_geoid = stringr::str_sub(GEOID, 1, nesting_geography_geoid_length))
if (data_format == "wide") {
df_segregation = data %>%
tidyr::pivot_longer(
cols = -GEOID,
names_to = "variable",
values_to = "estimate") %>%
dplyr::mutate(nesting_geography_geoid = stringr::str_sub(GEOID, 1, nesting_geography_geoid_length)) }
if (data_format == "long") {
df_segregation = data %>%
dplyr::mutate(nesting_geography_geoid = stringr::str_sub(GEOID, 1, nesting_geography_geoid_length)) }

segregation_larger = segregation::mutual_within(
data = df_segregation,
Expand Down Expand Up @@ -102,11 +104,12 @@ calculate_segregation_metrics = function(data, nesting_geography_geoid_length) {

small_segregation_results = data %>%
dplyr::select(GEOID) %>%
dplyr::distinct() %>%
dplyr::left_join(segregation_smaller)

number_error_geographies = small_segregation_results %>%
dplyr::filter(is.na(segregation_small_geography)) %>%
nrow
nrow()

if (number_error_geographies > 0) {
input_data_missingness = data %>%
Expand All @@ -119,13 +122,27 @@ The remaining ", number_error_geographies - input_data_missingness, " observatio
a single smaller geography within the larger geography (e.g., a
county comprising a single tract).")) }

segregation_results = small_segregation_results %>%
dplyr::mutate(GEOID_larger = stringr::str_sub(GEOID, 1, nesting_geography_geoid_length)) %>%
dplyr::left_join(segregation_larger, by = c("GEOID_larger" = "nesting_geography_geoid")) %>%
dplyr::select(-GEOID_larger)
if (data_format == "long") {
segregation_results = dplyr::bind_rows(
segregation_larger %>%
dplyr::rename(
GEOID = nesting_geography_geoid,
segregation = segregation_large_geography,
segregation_p = p_large_geography),
small_segregation_results %>%
dplyr::rename(
segregation = segregation_small_geography,
segregation_p = p_small_geography)) }

if (data_format == "wide") {
segregation_results = small_segregation_results %>%
dplyr::mutate(GEOID_larger = stringr::str_sub(GEOID, 1, nesting_geography_geoid_length)) %>%
dplyr::left_join(segregation_larger, by = c("GEOID_larger" = "nesting_geography_geoid")) %>%
dplyr::select(-GEOID_larger) }

return(segregation_results)
}

utils::globalVariables(c(
"nesting_geography_geoid", "H", "p", "segregation_small_geography", "GEOID_larger"))
"nesting_geography_geoid", "H", "p", "segregation_small_geography", "GEOID_larger",
"segregation_large_geography", "p_large_geography", "p_small_geography"))
34 changes: 22 additions & 12 deletions man/calculate_segregation_metrics.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 7084de6

Please sign in to comment.