From dc346fb1f258259da0dd2feaace5ed487774a39b Mon Sep 17 00:00:00 2001 From: "C. Li" <47674556+cxli233@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:16:00 -0500 Subject: [PATCH] Add files via upload new scripts for don't use boxplot for binomial data and histogram for small n. --- Scripts/BoxPlot_for_Binomial.Rmd | 97 +++++++++++++++++++++++ Scripts/Histogram_for_small_n.Rmd | 124 ++++++++++++++++++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 Scripts/BoxPlot_for_Binomial.Rmd create mode 100644 Scripts/Histogram_for_small_n.Rmd diff --git a/Scripts/BoxPlot_for_Binomial.Rmd b/Scripts/BoxPlot_for_Binomial.Rmd new file mode 100644 index 0000000..d9b4659 --- /dev/null +++ b/Scripts/BoxPlot_for_Binomial.Rmd @@ -0,0 +1,97 @@ +--- +title: "BoxPlot_for_Binomial" +author: "Chenxin Li" +date: "2024-12-10" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +# Friends don't let friends use boxplot for bimodal data + +# Packages +```{r} +library(tidyverse) +library(RColorBrewer) +library(ggbeeswarm) + +library(patchwork) +``` + +## Data +```{r} +set.seed(666) +``` + +```{r} +data1 <- data.frame( + response = rnorm(n = 100, mean = 5, sd = 2) +) %>% + mutate(group = "group1") + +data2 <- data.frame( + response = c( + rnorm(n = 50, mean = 2.5, sd = 1), + rnorm(n = 50, mean = 7.5, sd = 1) + )) %>% + mutate(group = "group2") + +data3 <- data.frame( + response = c( + rnorm(n = 33, mean = 2, sd = 0.5), + rnorm(n = 33, mean = 5, sd = 0.5), + rnorm(n = 33, mean = 8, sd = 0.5) + )) %>% + mutate(group = "group3") +``` + +## Bad example +```{r} +Box <- rbind( + data1, + data2, + data3 +) %>% + ggplot(aes(x = group, y = response)) + + geom_boxplot(aes(fill = group), alpha = 0.8, width = 0.7) + + scale_fill_manual(values = brewer.pal(8, "Set2")) + + labs(title = "Very similar!") + + theme_classic() + +Box +``` +## Good example +```{r} +Dots <- rbind( + data1, + data2, + data3 +) %>% + ggplot(aes(x = group, y = 
response)) + + geom_quasirandom(aes(color = group), alpha = 0.8) + + scale_color_manual(values = brewer.pal(8, "Set2")) + + labs(title = "I guess not!") + + theme_classic() + +Dots +``` +# Wrap them together +```{r} +wrap_plots( + Box, Dots, + nrow = 1 +) & + theme(legend.position = "none") + +ggsave("../Results/BoxPlots_for_binomial.svg", width = 5, height = 2.5) +ggsave("../Results/BoxPlots_for_binomial.png", width = 5, height = 2.5) +``` +Before making a box plot, one should check the distribution of their data. +Since box plots focus on the median and quartiles, +they cannot handle bimodal data (and by extension data with multiple modes). +Plotting all the data points using `geom_quasirandom()` from the [ggbeeswarm package](https://github.com/eclarke/ggbeeswarm) is the best practice for small to moderate (less than tens of thousands) sample sizes, +as dots are robust to small sample sizes, +whereas distribution-based graphics such as violin plots and histograms are not. +See [this section](https://github.com/cxli233/FriendsDontLetFriends#2-friends-dont-let-friends-make-violin-plots-for-small-sample-sizes) and [this section](https://github.com/cxli233/FriendsDontLetFriends/tree/main?tab=readme-ov-file#friends-dont-let-friends-use-histogram-for-small-sample-sizes) for details. 
\ No newline at end of file diff --git a/Scripts/Histogram_for_small_n.Rmd b/Scripts/Histogram_for_small_n.Rmd new file mode 100644 index 0000000..fa242af --- /dev/null +++ b/Scripts/Histogram_for_small_n.Rmd @@ -0,0 +1,124 @@ +--- +title: "Histogram_for_small_n" +author: "Chenxin Li" +date: "2024-12-10" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +# Friends don't let friends use histogram for small sample sizes + +# Packages +```{r} +library(tidyverse) +library(RColorBrewer) +library(ggbeeswarm) +library(viridis) +library(patchwork) +``` + +# Data +```{r} +set.seed(666) +``` + +```{r} +n10 <- data.frame( + response = rnorm(n = 10) +) %>% + mutate(group = "n = 10") + +n100 <- data.frame( + response = rnorm(n = 100) +) %>% + mutate(group = "n = 100") + +n1000 <- data.frame( + response = rnorm(n = 1000) +) %>% + mutate(group = "n = 1000") +``` + +# Graphs +```{r} +bins10 <- rbind( + n10, n100, n1000 +) %>% + ggplot(aes(x = response)) + + facet_wrap(~group, scales = "free", ncol = 1) + + geom_histogram(bins = 10, width = 0.7, color = "white", alpha = 0.8, + fill = viridis(n = 8, begin = 0.1, end = 0.8)[1]) + + labs(title = "10 bins") + + theme_classic() + + theme(panel.spacing = unit(1, "lines"), + strip.placement = "outside", + strip.background = element_blank(), + strip.text = element_text(hjust = 0)) + +bins10 +``` + +```{r} +bins30 <- rbind( + n10, n100, n1000 +) %>% + ggplot(aes(x = response)) + + facet_wrap(~group, scales = "free", ncol = 1) + + geom_histogram(bins = 30, width = 0.7, color = "white", + fill = viridis(n = 8, begin = 0.1, end = 0.8)[4]) + + labs(title = "30 bins") + + theme_classic() + + theme(panel.spacing = unit(1, "lines"), + strip.placement = "outside", + strip.background = element_blank(), + strip.text = element_text(hjust = 0)) + +bins30 +``` + +```{r} +bins50 <- rbind( + n10, n100, n1000 +) %>% + ggplot(aes(x = response)) + + facet_wrap(~group, scales = "free", ncol = 1) + + 
geom_histogram(bins = 50, width = 0.7, color = "white", + fill = viridis(n = 8, begin = 0.1, end = 0.8)[7]) + + labs(title = "50 bins") + + theme_classic() + + theme(panel.spacing = unit(1, "lines"), + strip.placement = "outside", + strip.background = element_blank(), + strip.text = element_text(hjust = 0)) + +bins50 +``` + +# wrap them +```{r} +wrap_plots( + bins10, bins30, bins50 + + labs(caption = "\nWow, the appearance does change with different bin numbers."), + ncol = 3 +) & + theme(plot.caption = element_text(size = 10)) + +ggsave("../Results/Histogram_for_small_n.svg", height = 6, width = 8) +ggsave("../Results/Histogram_for_small_n.png", height = 6, width = 8) +``` +I've seen histograms proposed as a replacement for bar plots. +However, a serious caveat is that histograms are not robust to the number of bins for small (and even moderate) sample sizes. +What is a histogram anyway? In a histogram, we first bin the data into a defined number of bins. +Then we count how many observations fall into each bin and graph the counts. + +In this example, I sampled _the same_ normal distribution 3 times with different sample sizes (n = 10, 100, and 1000). +Even though they came from _the same_ normal distribution, the histograms look quite different based on the number of bins. +To showcase this, I plotted histograms with 10, 30, and 50 bins. + +First of all, histograms make no sense for small sample sizes. With small sample sizes (n < 30), the better practice is to graph all data points. +Second of all, you can see that the shape of the histogram is only robust to changes in the number of bins when the sample size is fairly large (like 1000). +Even at n = 100, the appearance of the histogram can change drastically as the number of bins changes. +