Skip to content

Commit

Permalink
update data processing
Browse files (browse the repository at this point in the history)
  • Loading branch information
emprestige committed Sep 24, 2024
1 parent f9207b7 commit bcd30f4
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 45 deletions.
60 changes: 30 additions & 30 deletions analysis/data_processing.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,29 +92,29 @@ df_input <- df_input %>%
)
}

df_input$age_band <- as.factor(df_input$age_band)
df_input$age_band <- factor(df_input$age_band)

#data manipulation
df_input <- df_input %>%
mutate(
#assign ethnicity group
latest_ethnicity_group = as.factor(case_when(
latest_ethnicity_group = factor(case_when(
latest_ethnicity_code == "1" ~ "White",
latest_ethnicity_code == "2" ~ "Mixed",
latest_ethnicity_code == "3" ~ "Asian or Asian British",
latest_ethnicity_code == "4" ~ "Black or Black British",
latest_ethnicity_code == "5" ~ "Other Ethnic Groups",
TRUE ~ "Unknown")),
TRUE ~ "Unknown"), ordered = TRUE),
#calculate IMD quintile
imd_quintile = as.factor(case_when(
imd_quintile = factor(case_when(
imd_rounded >= 0 & imd_rounded < as.integer(32800 * 1 / 5) ~ "1 (most deprived)",
imd_rounded < as.integer(32800 * 2 / 5) ~ "2",
imd_rounded < as.integer(32800 * 3 / 5) ~ "3",
imd_rounded < as.integer(32800 * 4 / 5) ~ "4",
imd_rounded < as.integer(32800 * 5 / 5) ~ "5 (least deprived)",
TRUE ~ NA_character_)),
#format sex
sex = as.factor(case_when(
sex = factor(case_when(
sex == "female" ~ "Female",
sex == "male" ~ "Male",
sex == "intersex" ~ "Intersex",
Expand All @@ -129,7 +129,7 @@ logical_cols <- which(sapply(df_input, is.logical) & !grepl("primary|secondary|m
df_input <- df_input %>%
mutate(across(
.cols = all_of(logical_cols),
.fns = ~as.factor(case_when(
.fns = ~factor(case_when(
. == FALSE ~ "No",
. == TRUE ~ "Yes",
TRUE ~ NA_character_
Expand All @@ -148,19 +148,19 @@ df_input <- df_input %>%
"3" = "3", "4" = "3", "5" = "4", "6" = "4",
"7" = "5", "8" = "5", .missing = "Unknown"),
#define household size categories
household_size_cat = as.factor(case_when(
household_size_cat = factor(case_when(
household_size >= 1 & household_size <= 2 ~ "1",
household_size >= 3 & household_size <= 5 ~ "2",
household_size >= 6 ~ "3",
TRUE ~ "Unknown")),
#assign rurality classification
rurality_classification = as.factor(case_when(
rurality_classification = factor(case_when(
rurality_code == "1" ~ "Urban Major Conurbation",
rurality_code == "2" ~ "Urban Minor Conurbation",
rurality_code == "3" ~ "Urban City and Town",
rurality_code == "4" ~ "Rural Town and Fringe",
rurality_code == "5" ~ "Rural Village and Dispersed",
TRUE ~ "Unknown"))
TRUE ~ "Unknown"), ordered = TRUE)
)

#flu vaccination
Expand All @@ -170,7 +170,7 @@ if (cohort != "infants" & cohort != "infants_subgroup") {
#assign flu vaccination status
flu_vaccination_immunity_date = flu_vaccination_date + days(10),
#current flu vaccination status including a lag time
flu_vaccination = as.factor(if_else(
flu_vaccination = factor(if_else(
is.na(flu_vaccination_immunity_date), "No", "Yes"
))
)
Expand All @@ -180,7 +180,7 @@ if (cohort != "infants" & cohort != "infants_subgroup") {
if (study_start_date >= covid_prior_vacc_min & cohort != "infants" & cohort != "infants_subgroup") {
df_input <- df_input %>%
mutate(
time_since_last_covid_vaccination = as.factor(case_when(
time_since_last_covid_vaccination = factor(case_when(
time_length(difftime(study_start_date, last_covid_vaccination_date,
units = "days"), "months") >= 0 &
time_length(difftime(study_start_date, last_covid_vaccination_date,
Expand All @@ -199,31 +199,31 @@ if (study_start_date >= covid_current_vacc_min & cohort != "infants" & cohort !=
mutate(
covid_vaccination_immunity_date = covid_vaccination_date + days(10),
#current covid vaccination status including a lag time
covid_vaccination = as.factor(if_else(
covid_vaccination = factor(if_else(
is.na(covid_vaccination_immunity_date), "No", "Yes"
))
)
}

#re-level factors so they have reference categories for the regression models
df_input <- df_input %>%
mutate(
latest_ethnicity_group = fct_relevel(latest_ethnicity_group,
c("White", "Mixed", "Asian or Asian British",
"Black or Black British", "Other Ethnic Groups"))
) %>% arrange(latest_ethnicity_group)
df_input <- df_input %>%
mutate(
rurality_classification = fct_relevel(rurality_classification,
c("Urban Major Conurbation", "Urban Minor Conurbation",
"Urban City and Town", "Rural Town and Fringe",
"Rural Village and Dispersed", "Unknown"))
) %>% arrange(rurality_classification)
df_input <- df_input %>%
# #re-level factors so they have reference categories for the regression models
# df_input <- df_input %>%
# mutate(
# latest_ethnicity_group = fct_relevel(latest_ethnicity_group,
# c("White", "Mixed", "Asian or Asian British",
# "Black or Black British", "Other Ethnic Groups"))
# ) %>% arrange(latest_ethnicity_group)
# df_input <- df_input %>%
# mutate(
# rurality_classification = fct_relevel(rurality_classification,
# c("Urban Major Conurbation", "Urban Minor Conurbation",
# "Urban City and Town", "Rural Town and Fringe",
# "Rural Village and Dispersed", "Unknown"))
# ) %>% arrange(rurality_classification)
df_input <- df_input %>%
mutate(
composition_category = fct_relevel(composition_category,
c("Multiple of the Same Generation", "Living Alone",
"One Other Generation", "Two Other Generations",
composition_category = fct_relevel(composition_category,
c("Multiple of the Same Generation", "Living Alone",
"One Other Generation", "Two Other Generations",
"Three Other Generations"))
) %>% arrange(composition_category)

Expand Down
30 changes: 15 additions & 15 deletions analysis/sensitivity_analyses/data_processing_sens.R
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ df_input_filt <- df_input_filt %>%
latest_ethnicity_code == "3" ~ "Asian or Asian British",
latest_ethnicity_code == "4" ~ "Black or Black British",
latest_ethnicity_code == "5" ~ "Other Ethnic Groups",
TRUE ~ "Unknown")),
TRUE ~ "Unknown"), ordered = TRUE),
#calculate IMD quintile
imd_quintile = as.factor(case_when(
imd_rounded >= 0 & imd_rounded < as.integer(32800 * 1 / 5) ~ "1 (most deprived)",
Expand Down Expand Up @@ -173,7 +173,7 @@ df_input_filt <- df_input_filt %>%
rurality_code == "3" ~ "Urban City and Town",
rurality_code == "4" ~ "Rural Town and Fringe",
rurality_code == "5" ~ "Rural Village and Dispersed",
TRUE ~ "Unknown"))
TRUE ~ "Unknown"), ordered = TRUE)
)

#flu vaccination
Expand Down Expand Up @@ -219,19 +219,19 @@ if (study_start_date >= covid_current_vacc_min & cohort != "infants" & cohort !=
}

#re-level factors so they have reference categories for the regression models
df_input_filt <- df_input_filt %>%
mutate(
latest_ethnicity_group = fct_relevel(latest_ethnicity_group,
c("White", "Mixed", "Asian or Asian British",
"Black or Black British", "Other Ethnic Groups"))
) %>% arrange(latest_ethnicity_group)
df_input_filt <- df_input_filt %>%
mutate(
rurality_classification = fct_relevel(rurality_classification,
c("Urban Major Conurbation", "Urban Minor Conurbation",
"Urban City and Town", "Rural Town and Fringe",
"Rural Village and Dispersed", "Unknown"))
) %>% arrange(rurality_classification)
# df_input_filt <- df_input_filt %>%
# mutate(
# latest_ethnicity_group = fct_relevel(latest_ethnicity_group,
# c("White", "Mixed", "Asian or Asian British",
# "Black or Black British", "Other Ethnic Groups"))
# ) %>% arrange(latest_ethnicity_group)
# df_input_filt <- df_input_filt %>%
# mutate(
# rurality_classification = fct_relevel(rurality_classification,
# c("Urban Major Conurbation", "Urban Minor Conurbation",
# "Urban City and Town", "Rural Town and Fringe",
# "Rural Village and Dispersed", "Unknown"))
# ) %>% arrange(rurality_classification)
df_input_filt <- df_input_filt %>%
mutate(
composition_category = fct_relevel(composition_category,
Expand Down

0 comments on commit bcd30f4

Please sign in to comment.