Update data processing: use factor() instead of as.factor() and disable ethnicity/rurality re-levelling

opensafely · Sep 24, 2024 · bcd30f4
1 parent f9207b7
commit bcd30f4
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 45 deletions.
diff --git a/analysis/data_processing.R b/analysis/data_processing.R
@@ -92,29 +92,29 @@ df_input <- df_input %>%
     )
 }
 
-df_input$age_band <- as.factor(df_input$age_band)
+df_input$age_band <- factor(df_input$age_band)
 
 #data manipulation
 df_input <- df_input %>%
   mutate(
     #assign ethnicity group
-    latest_ethnicity_group = as.factor(case_when(
+    latest_ethnicity_group = factor(case_when(
       latest_ethnicity_code == "1" ~ "White",
       latest_ethnicity_code == "2" ~ "Mixed",
       latest_ethnicity_code == "3" ~ "Asian or Asian British",
       latest_ethnicity_code == "4" ~ "Black or Black British",
       latest_ethnicity_code == "5" ~ "Other Ethnic Groups",
-      TRUE ~ "Unknown")),
+      TRUE ~ "Unknown"), ordered = TRUE),
     #calculate IMD quintile
-    imd_quintile = as.factor(case_when(
+    imd_quintile = factor(case_when(
       imd_rounded >= 0 & imd_rounded < as.integer(32800 * 1 / 5) ~ "1 (most deprived)",
       imd_rounded < as.integer(32800 * 2 / 5) ~ "2",
       imd_rounded < as.integer(32800 * 3 / 5) ~ "3",
       imd_rounded < as.integer(32800 * 4 / 5) ~ "4",
       imd_rounded < as.integer(32800 * 5 / 5) ~ "5 (least deprived)",
       TRUE ~ NA_character_)),
     #format sex
-    sex = as.factor(case_when(
+    sex = factor(case_when(
       sex == "female" ~ "Female",
       sex == "male" ~ "Male",
       sex == "intersex" ~ "Intersex",
@@ -129,7 +129,7 @@ logical_cols <- which(sapply(df_input, is.logical) & !grepl("primary|secondary|m
 df_input <- df_input %>%
   mutate(across(
     .cols = all_of(logical_cols), 
-    .fns = ~as.factor(case_when(
+    .fns = ~factor(case_when(
       . == FALSE ~ "No",
       . == TRUE ~ "Yes",
       TRUE ~ NA_character_
@@ -148,19 +148,19 @@ df_input <- df_input %>%
                            "3" = "3", "4" = "3", "5" = "4", "6" = "4", 
                            "7" = "5", "8" = "5", .missing = "Unknown"),
     #define household size categories
-    household_size_cat = as.factor(case_when(
+    household_size_cat = factor(case_when(
       household_size >= 1 & household_size <= 2 ~ "1",
       household_size >= 3 & household_size <= 5 ~ "2",
       household_size >= 6 ~ "3",
       TRUE ~ "Unknown")),
     #assign rurality classification
-    rurality_classification = as.factor(case_when(
+    rurality_classification = factor(case_when(
       rurality_code == "1" ~ "Urban Major Conurbation",
       rurality_code == "2" ~ "Urban Minor Conurbation",
       rurality_code == "3" ~ "Urban City and Town",
       rurality_code == "4" ~ "Rural Town and Fringe",
       rurality_code == "5" ~ "Rural Village and Dispersed",
-      TRUE ~ "Unknown"))
+      TRUE ~ "Unknown"), ordered = TRUE)
   )
 
 #flu vaccination
@@ -170,7 +170,7 @@ if (cohort != "infants" & cohort != "infants_subgroup") {
     #assign flu vaccination status
     flu_vaccination_immunity_date = flu_vaccination_date + days(10),
     #current flu vaccination status including a lag time
-    flu_vaccination = as.factor(if_else(
+    flu_vaccination = factor(if_else(
       is.na(flu_vaccination_immunity_date), "No", "Yes"
     ))
   )
@@ -180,7 +180,7 @@ if (cohort != "infants" & cohort != "infants_subgroup") {
 if (study_start_date >= covid_prior_vacc_min & cohort != "infants" & cohort != "infants_subgroup") {
   df_input <- df_input %>%
     mutate(
-      time_since_last_covid_vaccination = as.factor(case_when(
+      time_since_last_covid_vaccination = factor(case_when(
       time_length(difftime(study_start_date, last_covid_vaccination_date, 
                              units = "days"), "months") >= 0 &
         time_length(difftime(study_start_date, last_covid_vaccination_date,
@@ -199,31 +199,31 @@ if (study_start_date >= covid_current_vacc_min & cohort != "infants" & cohort !=
     mutate(
       covid_vaccination_immunity_date = covid_vaccination_date + days(10),
       #current covid vaccination status including a lag time
-      covid_vaccination = as.factor(if_else(
+      covid_vaccination = factor(if_else(
         is.na(covid_vaccination_immunity_date), "No", "Yes"
       ))
     )
 }
 
-#re-level factors so they have reference categories for the regression models
-df_input <- df_input %>% 
-  mutate(
-    latest_ethnicity_group = fct_relevel(latest_ethnicity_group, 
-                             c("White", "Mixed", "Asian or Asian British", 
-                               "Black or Black British", "Other Ethnic Groups"))
-  ) %>% arrange(latest_ethnicity_group)
-df_input <- df_input %>% 
-  mutate(
-    rurality_classification = fct_relevel(rurality_classification, 
-                              c("Urban Major Conurbation", "Urban Minor Conurbation", 
-                              "Urban City and Town", "Rural Town and Fringe", 
-                              "Rural Village and Dispersed", "Unknown"))
-  ) %>% arrange(rurality_classification)
-df_input <- df_input %>% 
+# #re-level factors so they have reference categories for the regression models
+# df_input <- df_input %>% 
+#   mutate(
+#     latest_ethnicity_group = fct_relevel(latest_ethnicity_group, 
+#                              c("White", "Mixed", "Asian or Asian British", 
+#                                "Black or Black British", "Other Ethnic Groups"))
+#   ) %>% arrange(latest_ethnicity_group)
+# df_input <- df_input %>% 
+#   mutate(
+#     rurality_classification = fct_relevel(rurality_classification, 
+#                               c("Urban Major Conurbation", "Urban Minor Conurbation", 
+#                               "Urban City and Town", "Rural Town and Fringe", 
+#                               "Rural Village and Dispersed", "Unknown"))
+#   ) %>% arrange(rurality_classification)
+df_input <- df_input %>%
   mutate(
-    composition_category = fct_relevel(composition_category, 
-                                       c("Multiple of the Same Generation", "Living Alone", 
-                                         "One Other Generation", "Two Other Generations", 
+    composition_category = fct_relevel(composition_category,
+                                       c("Multiple of the Same Generation", "Living Alone",
+                                         "One Other Generation", "Two Other Generations",
                                          "Three Other Generations"))
   ) %>% arrange(composition_category)
 

diff --git a/analysis/sensitivity_analyses/data_processing_sens.R b/analysis/sensitivity_analyses/data_processing_sens.R
@@ -117,7 +117,7 @@ df_input_filt <- df_input_filt %>%
       latest_ethnicity_code == "3" ~ "Asian or Asian British",
       latest_ethnicity_code == "4" ~ "Black or Black British",
       latest_ethnicity_code == "5" ~ "Other Ethnic Groups",
-      TRUE ~ "Unknown")),
+      TRUE ~ "Unknown"), ordered = TRUE),
     #calculate IMD quintile
     imd_quintile = as.factor(case_when(
       imd_rounded >= 0 & imd_rounded < as.integer(32800 * 1 / 5) ~ "1 (most deprived)",
@@ -173,7 +173,7 @@ df_input_filt <- df_input_filt %>%
       rurality_code == "3" ~ "Urban City and Town",
       rurality_code == "4" ~ "Rural Town and Fringe",
       rurality_code == "5" ~ "Rural Village and Dispersed",
-      TRUE ~ "Unknown"))
+      TRUE ~ "Unknown"), ordered = TRUE)
   )
 
 #flu vaccination
@@ -219,19 +219,19 @@ if (study_start_date >= covid_current_vacc_min & cohort != "infants" & cohort !=
 }
 
 #re-level factors so they have reference categories for the regression models
-df_input_filt <- df_input_filt %>% 
-  mutate(
-    latest_ethnicity_group = fct_relevel(latest_ethnicity_group, 
-                             c("White", "Mixed", "Asian or Asian British", 
-                               "Black or Black British", "Other Ethnic Groups"))
-  ) %>% arrange(latest_ethnicity_group)
-df_input_filt <- df_input_filt %>% 
-  mutate(
-    rurality_classification = fct_relevel(rurality_classification, 
-                              c("Urban Major Conurbation", "Urban Minor Conurbation", 
-                              "Urban City and Town", "Rural Town and Fringe", 
-                              "Rural Village and Dispersed", "Unknown"))
-  ) %>% arrange(rurality_classification)
+# df_input_filt <- df_input_filt %>% 
+#   mutate(
+#     latest_ethnicity_group = fct_relevel(latest_ethnicity_group, 
+#                              c("White", "Mixed", "Asian or Asian British", 
+#                                "Black or Black British", "Other Ethnic Groups"))
+#   ) %>% arrange(latest_ethnicity_group)
+# df_input_filt <- df_input_filt %>% 
+#   mutate(
+#     rurality_classification = fct_relevel(rurality_classification, 
+#                               c("Urban Major Conurbation", "Urban Minor Conurbation", 
+#                               "Urban City and Town", "Rural Town and Fringe", 
+#                               "Rural Village and Dispersed", "Unknown"))
+#   ) %>% arrange(rurality_classification)
 df_input_filt <- df_input_filt %>% 
   mutate(
     composition_category = fct_relevel(composition_category,