new ds + generalise

markanewman · Apr 14, 2021 · 6a9e42c · 6a9e42c
1 parent a982f51
commit 6a9e42c
Show file tree

Hide file tree

Showing 4 changed files with 128 additions and 54 deletions.
diff --git a/datasets/diabetes.rmd b/datasets/diabetes.rmd
@@ -0,0 +1,49 @@
+---
+title: 'Diabetes Data from Kaggle'
+author: 'Mark Newman'
+date: '2021-04-13'
+editor_options: 
+  chunk_output_type: console
+---
+
+<!--
+This RMD can and must be runnable in a stand alone manner.
+However, it is intended to be called from other RMDs using the r chunk's `child` parameter.
+-->
+
+Download the data from [Kaggle](https://www.kaggle.com/kandij/diabetes-dataset).
+It needs to be downloaded first because it is behind Kaggle's Sign In page.
+
+```{r label = 'load data'}
+data <- read.csv('../data/diabetes2.csv')
+```
+
+`factor()` the data as necessary based on the data dictionary.
+
+```{r label = 'data dictionary conversion', echo = F}
+data$Outcome <- factor(data$Outcome, levels = 0:1)
+levels(data$Outcome) <- c('no', 'yes')
+```
+
+QA check the conversion by testing for `NA`s in the factor variables.
+
+```{r label = 'qa factor'}
+for(col in colnames(data)) {
+  t1 <- data[,col]
+  if(is.factor(t1) & any(is.na(t1))) {
+    warning(paste0("QA: Column '", col, "' has NAs"))
+  }
+  rm(t1)
+}
+rm(col)
+```
+
+Identify the _classification_ variable and name it `CLASS`.
+This allows for a consistent processing.
+
+```{r label = 'formula'}
+if('CLASS' %in% colnames(data)) { stop('QA: column name clash') }
+data$CLASS <- data$Outcome
+data$Outcome <- NULL
+class_name <- 'Outcome'
+```
diff --git a/datasets/heartattack.rmd b/datasets/heartattack.rmd
@@ -76,9 +76,12 @@ for(col in colnames(data)) {
 rm(col)
 ```
 
-Mark the `class` variable in both string and formula form.
+Identify the _classification_ variable and name it `CLASS`.
+This allows for a consistent processing.
 
 ```{r label = 'formula'}
-class <- 'target'
-formula <- target ~ .
+if('CLASS' %in% colnames(data)) { stop('QA: column name clash') }
+data$CLASS <- data$target
+data$target <- NULL
+class_name <- 'target'
 ```
diff --git a/recipes/caret-classification.rmd b/recipes/caret-classification.rmd
@@ -19,6 +19,12 @@ Choosing a dataset that has both continuous and categorical predictors is a good
 
 # What?
 
+<!--
+Works with the following `../datasets`:
+* heartattack.rmd
+* diabetes.rmd
+-->
+
 ```{r child = '../datasets/heartattack.rmd'}
 ```
 
@@ -38,7 +44,7 @@ set.seed(0)
 k <- 10
 sr <- .85
 
-indx <- createDataPartition(data$target, p = sr, list = F)
+indx <- createDataPartition(data$CLASS, p = sr, list = F)
 train <- data[indx,]
 test <- data[-indx,]
 tCtrl <-
@@ -58,16 +64,16 @@ rm(k, sr, data, indx)
 ```
 
 Setup the results `data.frame()` and associated helper functions.
-**NOTE**: If the target variable ever becomes something other than `$target`, `capture_results()` will need to change.
+Setup `formula` to prevent rewriting it.
 
 ```{r label = 'results helper functions'}
 capture_results <- function(fit, test) {
   pre <- predict(fit, test)
   cm <-
     confusionMatrix(
-      reference = test$target,
+      reference = test$CLASS,
       data = pre,
-      positive = levels(test$target)[2])
+      positive = levels(test$CLASS)[2])
   list(
     train_acc = max(fit$results$Accuracy),
     test_acc = unname(cm$overall['Accuracy']),
@@ -93,6 +99,8 @@ results <-
     test_cil = character(),
     test_ciu = character(),
     stringsAsFactors = F)
+
+formula = CLASS ~.
 ```
 
 The list of `caret` models can be found [here](https://topepo.github.io/caret/available-models.html).
@@ -300,5 +308,5 @@ rm(t1)
 ```
 
 ```{r label = 'final cleanup', echo = F}
-rm(results, tCtrl, test, train, capture_results, update_results, class, formula)
+rm(results, tCtrl, test, train, capture_results, update_results, class_name, formula)
 ```
diff --git a/recipes/eda-classification.rmd b/recipes/eda-classification.rmd
@@ -17,14 +17,20 @@ Having a copy-paste process for EDA is useful in getting any research off the gr
 
 # What?
 
+<!--
+Works with the following `../datasets`:
+* heartattack.rmd
+* diabetes.rmd
+-->
+
 ```{r child = '../datasets/heartattack.rmd'}
 ```
 
 Separate out the categorical and continuous predictors.
 
 ```{r label = 'separate'}
 cn <- colnames(data)
-cn <- cn[!(cn %in% class)]
+cn <- cn[!(cn %in% 'CLASS')]
 indx <- sapply(cn, function(x){is.factor(data[,x])})
 colnames_cat <- cn[indx]
 colnames_cont <- cn[!indx]
@@ -61,13 +67,15 @@ In this case the ordering can be seen below.
 library(knitr)
 library(kableExtra)
 
-test_cont <- function(data, class, predictor) {
-  t1 <- data[,c(class, predictor)]
-  colnames(t1) <- c('Class', 'Value')
-  t.test(Value ~ Class, data = t1)
+test_cont <- function(data, predictor) {
+  t1 <- data[,c('CLASS', predictor)]
+  colnames(t1)[2] <- 'Value'
+  t.test(Value ~ CLASS, data = t1)
 }
-test_cat <- function(data, class, predictor) {
-  t1 <- table(data[,class], data[,predictor])
+test_cat <- function(data, predictor) {
+  t1 <- data[,c('CLASS', predictor)]
+  colnames(t1)[2] <- 'Value'
+  t1 <- xtabs(~., data = t1)
   chisq.test(t1)
 }
 format_pvalue <- function(pv) {
@@ -79,40 +87,46 @@ format_pvalue <- function(pv) {
     paste0('= ', round(pv, 3))
   }
 }
-univariate_test_all <- function(data, test, class, predictors) {
-  df1 <- data.frame(pre = predictors, stat = as.numeric(NA), df = NA, pv = NA)
-  rownames(df1) <- predictors
-  for(predictor in predictors) {
-    t1 <- test(data, class, predictor)
-    df1[predictor, 'stat'] <- t1$statistic
-    df1[predictor, 'df'] <- t1$parameter
-    df1[predictor, 'pv'] <- format_pvalue(t1$p.value)
-    rm(t1)
+univariate_test_all <- function(data, test, predictors) {
+  if(length(predictors) > 0) {
+    df1 <- data.frame(pre = predictors, stat = as.numeric(NA), df = NA, pv = NA)
+    rownames(df1) <- predictors
+    for(predictor in predictors) {
+      t1 <- test(data, predictor)
+      df1[predictor, 'stat'] <- t1$statistic
+      df1[predictor, 'df'] <- t1$parameter
+      df1[predictor, 'pv'] <- format_pvalue(t1$p.value)
+      rm(t1)
+    }
+    rm(predictor)
+    df1 <- df1[order(abs(df1$stat), decreasing  = T),]
+    rownames(df1) <- NULL
+    df1
+  } else {
+    NULL
   }
-  rm(predictor)
-  df1 <- df1[order(abs(df1$stat), decreasing  = T),]
-  rownames(df1) <- NULL
-  df1
 }
-render_test_all <- function(table, test_stat) {
-  t1 <-
-    kable(
-      table,
-      caption = 'Univariate Results',
-      col.names = c('Predictor', test_stat, 'DF', 'P Value'),
-      digits = c(0, 2, 2, 0))
-  kable_styling(t1)
+render_test_all <- function(table, type, test_stat) {
+  if(!is.null(table)) {
+    t1 <-
+      kable(
+        table,
+        caption = paste0('Univariate Results (', type, ')'),
+        col.names = c('Predictor', test_stat, 'DF', 'P Value'),
+        digits = c(0, 2, 2, 0))
+    kable_styling(t1)
+  }
 }
 ```
 
-Using the univariate tests of `t.test()` and `chisq.test()` we can see the relationship between the individual `predictor`s and the class ``r class``.
+Using the univariate tests of `t.test()` and `chisq.test()` we can see the relationship between the individual `predictor`s and the class ``r class_name``.
 **NOTE**: When combining this work with other work, remember to adjust the p-values to prevent an inflated alpha error.
 
 ```{r label = 'results test', echo = F, message = F, warning = F}
-t1 <- univariate_test_all(data, test_cont, class, colnames_cont)
-t2 <- univariate_test_all(data, test_cat, class, colnames_cat)
-render_test_all(t1, '$t$')
-render_test_all(t2, '$\\chi^2$')
+t1 <- univariate_test_all(data, test_cont, colnames_cont)
+t2 <- univariate_test_all(data, test_cat, colnames_cat)
+render_test_all(t1, 'Continuous', '$t$')
+render_test_all(t2, 'Categorical', '$\\chi^2$')
 rm(test_cont, test_cat, format_pvalue, univariate_test_all, render_test_all)
 rm(colnames_cont, colnames_cat)
 ```
@@ -121,29 +135,29 @@ rm(colnames_cont, colnames_cat)
 library(vcd)
 library(ggplot2)
 
-univariate_figure_cont <- function(data, class, predictors) {
+univariate_figure_cont <- function(data, predictors) {
   for(predictor in predictors) {
-    t1 <- data[,c(class, predictor)]
-    colnames(t1) <- c('Class', 'Value')
+    t1 <- data[,c('CLASS', predictor)]
+    colnames(t1)[2] <- 'Value'
     p1 <-
-      ggplot(t1, aes(x = Value, color = Class)) + 
+      ggplot(t1, aes(x = Value, color = CLASS)) + 
       scale_color_brewer(palette = 'Dark2') + 
       geom_line(stat = 'ecdf') + 
       theme_bw() +
       labs(
         x = predictor,
         y = 'Cumulative Density',
-        color = class)
+        color = class_name)
     plot(p1)
   }
 }
-univariate_figure_cat <- function(data, class, predictors) {
+univariate_figure_cat <- function(data, predictors) {
   for(predictor in predictors) {
-    t1 <- data[,c(class, predictor)]
-    colnames(t1) <- c('Class', 'Value')
-    t1 <- xtabs(~ Class + Value, data = t1)
+    t1 <- data[,c('CLASS', predictor)]
+    colnames(t1)[2] <- 'Value'
+    t1 <- xtabs(~ CLASS + Value, data = t1)
     t2 <- dimnames(t1)
-    names(t2) <- c(class, predictor)
+    names(t2) <- c(class_name, predictor)
     dimnames(t1) <- t2
     mosaic(
       t1,
@@ -156,13 +170,13 @@ univariate_figure_cat <- function(data, class, predictors) {
 Visualizations of the above tests can be seen below.
 
 ```{r label = 'results figure', echo = F}
-univariate_figure_cont(data, class, t1$pre)
-univariate_figure_cat(data, class, t2$pre)
+univariate_figure_cont(data, t1$pre)
+univariate_figure_cat(data, t2$pre)
 rm(univariate_figure_cont, univariate_figure_cat)
 ```
 
 ```{r label = 'final cleanup', echo = F}
-rm(data, t1, t2, class, formula)
+rm(data, t1, t2, class_name)
 ```