Initial PR for preschool update, adding historical years #429

Draft · wants to merge 9 commits into base: version2025
6,469 changes: 6,469 additions & 0 deletions 08_education/calculate-prek-2014.html

6,469 changes: 6,469 additions & 0 deletions 08_education/calculate-prek-2016.html

7,789 changes: 7,789 additions & 0 deletions 08_education/calculate-prek-2018.html

6,459 changes: 6,459 additions & 0 deletions 08_education/calculate-prek-county-2014.html

6,708 changes: 6,708 additions & 0 deletions 08_education/calculate-prek-county-2016.html

6,700 changes: 6,700 additions & 0 deletions 08_education/calculate-prek-county-2018.html

11,426 changes: 8,856 additions & 2,570 deletions 08_education/data/final/metrics_preschool_county_all_longitudinal.csv

37,717 changes: 37,717 additions & 0 deletions 08_education/data/final/metrics_preschool_county_income_longitudinal.csv

42,264 changes: 36,847 additions & 5,417 deletions 08_education/data/final/metrics_preschool_county_race-ethnicity_longitudinal.csv

1,945 changes: 1,945 additions & 0 deletions 08_education/data/final/metrics_preschool_place_all_longitudinal_all.csv

5,833 changes: 5,833 additions & 0 deletions 08_education/data/final/metrics_preschool_place_income_longitudinal.csv

9,752 changes: 7,306 additions & 2,446 deletions 08_education/data/final/metrics_preschool_place_race-ethnicity_longitudinal.csv

6,976 changes: 6,976 additions & 0 deletions 08_education/preschool_county_calculate.html

1,198 changes: 1,198 additions & 0 deletions 08_education/preschool_county_calculate.qmd

1,841 changes: 1,139 additions & 702 deletions 08_education/preschool_place.html

648 changes: 24 additions & 624 deletions 08_education/preschool_place.qmd

6,936 changes: 6,936 additions & 0 deletions 08_education/preschool_place_calculate.html

1,187 changes: 1,187 additions & 0 deletions 08_education/preschool_place_calculate.qmd

88 changes: 88 additions & 0 deletions 08_education/run_preschool_years.qmd
@@ -0,0 +1,88 @@
---
title: "Run preschool"
date: today
format: html
execute:
warning: false
editor_options:
chunk_output_type: console
---

## Housekeeping

Import necessary libraries.

```{r}
library(quarto)
library(here)
# tidyverse supplies tibble() and the purrr map()/pwalk() helpers used below
library(tidyverse)
```


## Render QMD Years

Select which years of new data you want to run the preschool calculate files for.

```{r}
years <- c("2014", "2016", "2018")
```

Wrap the quarto render function so it writes the rendered output to the correct folder.

```{r}
quarto_render_move <- function(
  input,
  output_file = NULL,
  output_dir = NULL,
  ...
) {

  # Get all the input/output file names and paths
  x <- quarto::quarto_inspect(input)
  output_format <- names(x$formats)
  output <- x$formats[[output_format]]$pandoc$`output-file`
  if (is.null(output_file)) { output_file <- output }
  input_dir <- dirname(input)
  if (is.null(output_dir)) { output_dir <- input_dir }
  output_path_from <- file.path(input_dir, output)
  output_path_to <- file.path(output_dir, output_file)

  # Render the qmd file into input_dir, forwarding any extra arguments
  # (e.g., execute_params) to quarto_render()
  quarto::quarto_render(input = input, ...)

  if (input_dir != output_dir) {
    # If output_dir differs from input_dir, copy the rendered output
    # there and delete the original file
    if (!dir.exists(output_dir)) { dir.create(output_dir, recursive = TRUE) }
    file.copy(
      from = output_path_from,
      to = output_path_to,
      overwrite = TRUE
    )
    file.remove(output_path_from)
  } else if (output_file != output) {
    # If output_dir matches input_dir but output_file has a different name
    # from the default output, just rename it
    file.rename(from = output_path_from, to = output_path_to)
  }
}
```
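
As a quick check, a single render of one year might look like this (a minimal sketch; the file names follow the patterns used elsewhere in this PR):

```{r}
#| eval: false
quarto_render_move(
  input = here::here("08_education", "preschool_county_calculate.qmd"),
  output_file = "calculate-prek-county-2014.html",
  execute_params = list(year = "2014")
)
```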

Render the files: `pwalk()` calls `quarto_render_move()` once per row of `reports`, forwarding `execute_params` through `...` to `quarto::quarto_render()`.
```{r}
reports <-
  tibble(
    input = here::here("08_education", "preschool_county_calculate.qmd"),
    output_file = glue::glue("calculate-prek-county-{years}.html"),
    execute_params = map(years, ~ list(year = .))
  )

pwalk(reports, quarto_render_move)
```
132 changes: 132 additions & 0 deletions functions/API/extract_ipums_aws.R
@@ -0,0 +1,132 @@
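# API Pull Function
#
# Using the IPUMS API, read in the IPUMS USA microdata needed for the
# preschool metrics, caching the result as an .rds object in S3 so repeat
# runs can skip the API. (Header added for parity with
# ipums_repwt_pre-k_aws.R; summary inferred from the function body below.)
#
# Function call: extract_ipums_aws
# Inputs:
#   extract_name (str): the name of the extract that will be saved in the data/temp/raw folder
#   extract_date (str): the date stamp appended to the cached S3 object name
#   extract_description (str): the metadata that will be attached to this extract
#   survey (str or list of str): the survey sample(s) to pull
# Returns:
#   acs_imported (tibble) containing the extract required for analysis
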
# Load libraries (here is used for building file paths)
library(here)
library(ipumsr)
library(aws.s3)
library(tidyverse)

extract_ipums_aws <- function(extract_name, extract_date, extract_description, survey){

# Set folder path, .gz, and .xml variables
folder_path <- here("data", "temp", "raw")
extract_gz_filename <- paste0(extract_name, "_umf.dat.gz")
extract_xml_filename <- paste0(extract_name, "_umf.xml")

# S3 bucket that caches the extracts; defined before its first use below.
# s3_dir is assumed to be defined in the calling environment.
my_bucket <- "mobility-from-poverty-test"

# Check if the file already exists in AWS; if so, read in the cached copy
if (aws.s3::object_exists(paste0(s3_dir, "/", extract_name, "_", extract_date, ".rds"), bucket = my_bucket)){

acs_imported <- s3read_using(FUN=readRDS,
bucket = my_bucket,
object=paste0(s3_dir, "/", extract_name, "_", extract_date, ".rds"))
} else{

# Create the folder path if it doesn't exist
if (!dir.exists(folder_path)) {
dir.create(folder_path, recursive = TRUE)
}

# The cached extract was not found in S3, so create a new extract using
# the IPUMS API
usa_ext_umf <-
define_extract_usa(
description = extract_description,
samples = c(survey),
variables = c(
"ADJUST",
"STATEFIP",
"PUMA",
"GQ",
"HHINCOME",
"AGE",
"EMPSTAT",
"VACANCY",
"PERNUM",
"RACE",
"HISPAN",
"EDUCD",
"GRADEATT",
"SEX",
"DIFFCARE",
"DIFFSENS",
"DIFFMOB",
"DIFFPHYS",
"DIFFREM",
"CBPERNUM"
)
)

#Submit the extract.
usa_ext_umf_submitted <- submit_extract(usa_ext_umf)

usa_ext_complete <- wait_for_extract(usa_ext_umf_submitted)

# The extract downloads into the "raw" data folder inside the universal
# data/temp directory.
filepath <-
download_extract(
usa_ext_umf_submitted,
download_dir = here(folder_path),
progress = TRUE
)

#Rename extract file
ipums_files <-
list.files(paste0(here(folder_path)), full.names = TRUE) %>%
as_tibble() %>%
filter(str_detect(value, "dat.gz|xml"), !str_detect(value, "umf")) %>%
pull()

file.rename(ipums_files, c(
here(folder_path, extract_gz_filename),
here(folder_path, extract_xml_filename)
))


# Read the extract file. The DDI is a codebook that ipumsr uses to format
# the downloaded microdata.
ddi <-
read_ipums_ddi(here(folder_path, extract_xml_filename))

micro_data <-
read_ipums_micro(
ddi,
data_file = here(folder_path, extract_gz_filename)
)

# Lowercase variable names and drop unnecessary variables
acs_imported <- micro_data %>%
rename_with(tolower) %>%
select(-serial, -raced, -strata, -cluster, -hispand, -empstatd)

rm(micro_data)

# Zap labels, convert categorical variables to factors, and reformat the
# state and PUMA identifiers
acs_imported <- acs_imported %>%
mutate(
across(c(sample, gq, race, hispan, sex, diffcare, diffsens, diffmob, diffphys, diffrem), ~as_factor(.x)),
across(c(statefip, puma, hhincome, vacancy, age, empstat), ~zap_labels(.x)),
statefip = sprintf("%02d", as.numeric(statefip)),
puma = sprintf("%05d", as.numeric(puma)),
unique_person_id = paste0(sample, cbserial, cbpernum)
)


# write file to S3
tmp <- tempfile()
on.exit(unlink(tmp))
saveRDS(acs_imported, file = tmp, compress = TRUE)

# put object with an upload progress bar
put_object(tmp, object = paste0(s3_dir, "/", extract_name, "_", extract_date, ".rds"), bucket = my_bucket,
show_progress = TRUE, multipart = TRUE)

}
# Return the ACS data set
return(acs_imported)

}
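
# Example call (a sketch; argument values are hypothetical, and s3_dir must
# be defined in the calling environment):
# acs_2021 <- extract_ipums_aws(
#   extract_name = "prek_2021",
#   extract_date = "2024-01-01",
#   extract_description = "ACS microdata for preschool metrics",
#   survey = "us2021a"
# )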

123 changes: 123 additions & 0 deletions functions/API/ipums_repwt_pre-k_aws.R
@@ -0,0 +1,123 @@
# API Pull Function
#
# Using the IPUMS API, read in the IPUMS microdata replicate weights for
# children who are 3 and 4 years old (the pre-K age group selected below).
# To check on available surveys you can use the function get_sample_info('usa').
# This function allows the user to choose the survey year and type (for
# example, 2021a is the 1-year ACS data).
#
# Function call: ipums_repwt_pre_k_aws
# Inputs:
#   extract_name (str): the name of the extract that will be saved in the data/temp/raw folder
#   extract_date (str): the date stamp appended to the cached S3 object name
#   extract_description (str): the metadata that will be attached to this extract
#   survey (str or list of str): the survey sample(s) to pull
# Outputs:
#   extract_name_umf.dat.gz in folder data/temp/raw
#   extract_name_umf.xml in folder data/temp/raw
#   folder data/temp/raw if it does not exist already
#   a cached .rds copy of the extract in the S3 bucket
# Returns:
#   acs_imported (tibble) containing the extract required for analysis
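#
# Example call (a sketch; argument values are hypothetical, and s3_dir must
# be defined in the calling environment):
# repwt_2021 <- ipums_repwt_pre_k_aws(
#   extract_name = "prek_repwt_2021",
#   extract_date = "2024-01-01",
#   extract_description = "Pre-K replicate weights",
#   survey = "us2021a"
# )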

# Add library here for filepath
library(here)
library(ipumsr)
library(aws.s3)
library(tidyverse)

ipums_repwt_pre_k_aws <- function(extract_name, extract_date, extract_description, survey){

# Set folder path, .gz, and .xml variables
folder_path <- here("data", "temp", "raw")
extract_gz_filename <- paste0(extract_name, "_umf.dat.gz")
extract_xml_filename <- paste0(extract_name, "_umf.xml")

# S3 bucket that caches the extracts; defined before its first use below.
# s3_dir is assumed to be defined in the calling environment.
my_bucket <- "mobility-from-poverty-test"

# Check if the file already exists in AWS; if so, read in the cached copy
if (aws.s3::object_exists(paste0(s3_dir, "/", extract_name, "_", extract_date, ".rds"), bucket = my_bucket)){

acs_imported <- s3read_using(FUN=readRDS,
bucket = my_bucket,
object=paste0(s3_dir, "/", extract_name, "_", extract_date, ".rds"))
} else{

# Create the folder path if it doesn't exist
if (!dir.exists(folder_path)) {
dir.create(folder_path, recursive = TRUE)
}

# Check if the extract already exists in the local directory. If it does,
# skip straight to reading in the existing data.
if(!file.exists(here(folder_path, extract_gz_filename))){

# If the extract does not exist, create it using the IPUMS API. Note that
# for preschool readiness we only need 3 and 4 year olds.
usa_ext_umf <-
define_extract_usa(
description = extract_description,
samples = c(survey),
variables = list(
var_spec("AGE",
case_selections = c("3", "4")),
"REPWTP",
"CBPERNUM"
)
)

#Submit the extract.
usa_ext_umf_submitted <- submit_extract(usa_ext_umf)

usa_ext_complete <- wait_for_extract(usa_ext_umf_submitted)

#The directory is set to download into the "raw" data folder inside of the universal data/temp. If the data already exists this step will be skipped.
filepath <-
download_extract(
usa_ext_umf_submitted,
download_dir = here(folder_path),
progress = FALSE
)

#Rename extract file
ipums_files <-
list.files(paste0(here(folder_path)), full.names = TRUE) %>%
as_tibble() %>%
filter(str_detect(value, "dat.gz|xml"), !str_detect(value, "umf")) %>%
pull()

file.rename(ipums_files, c(
here(folder_path, extract_gz_filename),
here(folder_path, extract_xml_filename)
))

}

# Read extract file
ddi <-
read_ipums_ddi(here(folder_path, extract_xml_filename))

micro_data <-
read_ipums_micro(
ddi,
data_file = here(folder_path, extract_gz_filename)
)

# Lowercase variable names and drop unnecessary variables
acs_imported <- micro_data %>%
rename_with(tolower) %>%
select(-serial, -strata, -cluster, -year,
-pernum, -perwt, -hhwt, -gq, -age) %>%
mutate(sample = as_factor(sample),
unique_person_id = paste0(sample, cbserial, cbpernum))


# write file to S3
tmp <- tempfile()
on.exit(unlink(tmp))
saveRDS(acs_imported, file = tmp, compress = TRUE)

# put object with an upload progress bar
put_object(tmp, object = paste0(s3_dir, "/", extract_name, "_", extract_date, ".rds"), bucket = my_bucket,
show_progress = TRUE, multipart = TRUE)

}

# Return the ACS data set
return(acs_imported)

}