diff --git a/datasets/diabetes.rmd b/datasets/diabetes.rmd new file mode 100644 index 0000000..5d2ef9e --- /dev/null +++ b/datasets/diabetes.rmd @@ -0,0 +1,49 @@ +--- +title: 'Diabetes Data from Kaggle' +author: 'Mark Newman' +date: '2021-04-13' +editor_options: + chunk_output_type: console +--- + + + +Download the data from [Kaggle](https://www.kaggle.com/kandij/diabetes-dataset). +It needs to be downloaded first because it is behind Kaggle's Sign In page. + +```{r label = 'load data'} +data <- read.csv('../data/diabetes2.csv') +``` + +`factor()` the data as necessary based on the data dictionary. + +```{r label = 'data dictionary conversion', echo = F} +data$Outcome <- factor(data$Outcome, levels = 0:1) +levels(data$Outcome) <- c('no', 'yes') +``` + +QA check the conversion by testing for `NA`s in the factor variables. + +```{r label = 'qa factor'} +for(col in colnames(data)) { + t1 <- data[,col] + if(is.factor(t1) & any(is.na(t1))) { + warning(paste0("QA: Column '", col, "' has NAs")) + } + rm(t1) +} +rm(col) +``` + +Identify the _classification_ variable and name it `CLASS`. +This allows for consistent processing. + +```{r label = 'formula'} +if('CLASS' %in% colnames(data)) { stop('QA: column name clash') } +data$CLASS <- data$Outcome +data$Outcome <- NULL +class_name <- 'Outcome' +``` \ No newline at end of file diff --git a/datasets/heartattack.rmd b/datasets/heartattack.rmd index 33c1ffc..d986f47 100644 --- a/datasets/heartattack.rmd +++ b/datasets/heartattack.rmd @@ -76,9 +76,12 @@ for(col in colnames(data)) { rm(col) ``` -Mark the `class` variable in both string and formula form. +Identify the _classification_ variable and name it `CLASS`. +This allows for consistent processing. ```{r label = 'formula'} -class <- 'target' -formula <- target ~ . 
+if('CLASS' %in% colnames(data)) { stop('QA: column name clash') } +data$CLASS <- data$target +data$target <- NULL +class_name <- 'target' ``` diff --git a/recipes/caret-classification.rmd b/recipes/caret-classification.rmd index f15b8c7..de2fd0b 100644 --- a/recipes/caret-classification.rmd +++ b/recipes/caret-classification.rmd @@ -19,6 +19,12 @@ Choosing a dataset that has both continuous and categorical predictors is a good # What? + + ```{r child = '../datasets/heartattack.rmd'} ``` @@ -38,7 +44,7 @@ set.seed(0) k <- 10 sr <- .85 -indx <- createDataPartition(data$target, p = sr, list = F) +indx <- createDataPartition(data$CLASS, p = sr, list = F) train <- data[indx,] test <- data[-indx,] tCtrl <- @@ -58,16 +64,16 @@ rm(k, sr, data, indx) ``` Setup the results `data.frame()` and associated helper functions. -**NOTE**: If the target variable ever becomes something other than `$target`, `capture_results()` will need to change. +Set up `formula` to prevent rewriting it. ```{r label = 'results helper functions'} capture_results <- function(fit, test) { pre <- predict(fit, test) cm <- confusionMatrix( - reference = test$target, + reference = test$CLASS, data = pre, - positive = levels(test$target)[2]) + positive = levels(test$CLASS)[2]) list( train_acc = max(fit$results$Accuracy), test_acc = unname(cm$overall['Accuracy']), @@ -93,6 +99,8 @@ results <- test_cil = character(), test_ciu = character(), stringsAsFactors = F) + +formula = CLASS ~. ``` The list of `caret` models can be found [here](https://topepo.github.io/caret/available-models.html). 
@@ -300,5 +308,5 @@ rm(t1) ``` ```{r label = 'final cleanup', echo = F} -rm(results, tCtrl, test, train, capture_results, update_results, class, formula) +rm(results, tCtrl, test, train, capture_results, update_results, class_name, formula) ``` diff --git a/recipes/eda-classification.rmd b/recipes/eda-classification.rmd index 5255679..c21daad 100644 --- a/recipes/eda-classification.rmd +++ b/recipes/eda-classification.rmd @@ -17,6 +17,12 @@ Having a copy-paste process for EDA is useful in getting any research off the gr # What? + + ```{r child = '../datasets/heartattack.rmd'} ``` @@ -24,7 +30,7 @@ Separate out the categorical and continuous predictors. ```{r label = 'separate'} cn <- colnames(data) -cn <- cn[!(cn %in% class)] +cn <- cn[!(cn %in% 'CLASS')] indx <- sapply(cn, function(x){is.factor(data[,x])}) colnames_cat <- cn[indx] colnames_cont <- cn[!indx] @@ -61,13 +67,15 @@ In this case the ordering can be seen below. library(knitr) library(kableExtra) -test_cont <- function(data, class, predictor) { - t1 <- data[,c(class, predictor)] - colnames(t1) <- c('Class', 'Value') - t.test(Value ~ Class, data = t1) +test_cont <- function(data, predictor) { + t1 <- data[,c('CLASS', predictor)] + colnames(t1)[2] <- 'Value' + t.test(Value ~ CLASS, data = t1) } -test_cat <- function(data, class, predictor) { - t1 <- table(data[,class], data[,predictor]) +test_cat <- function(data, predictor) { + t1 <- data[,c('CLASS', predictor)] + colnames(t1)[2] <- 'Value' + t1 <- xtabs(~., data = t1) chisq.test(t1) } format_pvalue <- function(pv) { @@ -79,40 +87,46 @@ format_pvalue <- function(pv) { paste0('= ', round(pv, 3)) } } -univariate_test_all <- function(data, test, class, predictors) { - df1 <- data.frame(pre = predictors, stat = as.numeric(NA), df = NA, pv = NA) - rownames(df1) <- predictors - for(predictor in predictors) { - t1 <- test(data, class, predictor) - df1[predictor, 'stat'] <- t1$statistic - df1[predictor, 'df'] <- t1$parameter - df1[predictor, 'pv'] <- 
format_pvalue(t1$p.value) - rm(t1) +univariate_test_all <- function(data, test, predictors) { + if(length(predictors) > 0) { + df1 <- data.frame(pre = predictors, stat = as.numeric(NA), df = NA, pv = NA) + rownames(df1) <- predictors + for(predictor in predictors) { + t1 <- test(data, predictor) + df1[predictor, 'stat'] <- t1$statistic + df1[predictor, 'df'] <- t1$parameter + df1[predictor, 'pv'] <- format_pvalue(t1$p.value) + rm(t1) + } + rm(predictor) + df1 <- df1[order(abs(df1$stat), decreasing = T),] + rownames(df1) <- NULL + df1 + } else { + NULL } - rm(predictor) - df1 <- df1[order(abs(df1$stat), decreasing = T),] - rownames(df1) <- NULL - df1 } -render_test_all <- function(table, test_stat) { - t1 <- - kable( - table, - caption = 'Univariate Results', - col.names = c('Predictor', test_stat, 'DF', 'P Value'), - digits = c(0, 2, 2, 0)) - kable_styling(t1) +render_test_all <- function(table, type, test_stat) { + if(!is.null(table)) { + t1 <- + kable( + table, + caption = paste0('Univariate Results (', type, ')'), + col.names = c('Predictor', test_stat, 'DF', 'P Value'), + digits = c(0, 2, 2, 0)) + kable_styling(t1) + } } ``` -Using the univariate tests of `t.test()` and `chisq.test()` we can see the relationship between the individual `predictor`s and the class ``r class``. +Using the univariate tests of `t.test()` and `chisq.test()` we can see the relationship between the individual `predictor`s and the class ``r class_name``. **NOTE**: When combining this work with other work, remember to adjust the p-values to prevent an inflated alpha error. 
```{r label = 'results test', echo = F, message = F, warning = F} -t1 <- univariate_test_all(data, test_cont, class, colnames_cont) -t2 <- univariate_test_all(data, test_cat, class, colnames_cat) -render_test_all(t1, '$t$') -render_test_all(t2, '$\\chi^2$') +t1 <- univariate_test_all(data, test_cont, colnames_cont) +t2 <- univariate_test_all(data, test_cat, colnames_cat) +render_test_all(t1, 'Continuous', '$t$') +render_test_all(t2, 'Categorical', '$\\chi^2$') rm(test_cont, test_cat, format_pvalue, univariate_test_all, render_test_all) rm(colnames_cont, colnames_cat) ``` @@ -121,29 +135,29 @@ rm(colnames_cont, colnames_cat) library(vcd) library(ggplot2) -univariate_figure_cont <- function(data, class, predictors) { +univariate_figure_cont <- function(data, predictors) { for(predictor in predictors) { - t1 <- data[,c(class, predictor)] - colnames(t1) <- c('Class', 'Value') + t1 <- data[,c('CLASS', predictor)] + colnames(t1)[2] <- 'Value' p1 <- - ggplot(t1, aes(x = Value, color = Class)) + + ggplot(t1, aes(x = Value, color = CLASS)) + scale_color_brewer(palette = 'Dark2') + geom_line(stat = 'ecdf') + theme_bw() + labs( x = predictor, y = 'Cumulative Density', - color = class) + color = class_name) plot(p1) } } -univariate_figure_cat <- function(data, class, predictors) { +univariate_figure_cat <- function(data, predictors) { for(predictor in predictors) { - t1 <- data[,c(class, predictor)] - colnames(t1) <- c('Class', 'Value') - t1 <- xtabs(~ Class + Value, data = t1) + t1 <- data[,c('CLASS', predictor)] + colnames(t1)[2] <- 'Value' + t1 <- xtabs(~ CLASS + Value, data = t1) t2 <- dimnames(t1) - names(t2) <- c(class, predictor) + names(t2) <- c(class_name, predictor) dimnames(t1) <- t2 mosaic( t1, @@ -156,13 +170,13 @@ univariate_figure_cat <- function(data, class, predictors) { Visualizations of the above tests can be seen below. 
```{r label = 'results figure', echo = F} -univariate_figure_cont(data, class, t1$pre) -univariate_figure_cat(data, class, t2$pre) +univariate_figure_cont(data, t1$pre) +univariate_figure_cat(data, t2$pre) rm(univariate_figure_cont, univariate_figure_cat) ``` ```{r label = 'final cleanup', echo = F} -rm(data, t1, t2, class, formula) +rm(data, t1, t2, class_name) ```